├── README.md
└── mine_next
    ├── functions
    │   ├── __pycache__
    │   │   ├── dataset.cpython-37.pyc
    │   │   ├── main_function2.cpython-37.pyc
    │   │   ├── sent_to_graph.cpython-37.pyc
    │   │   └── stance_main_func.cpython-37.pyc
    │   ├── dataset.py
    │   ├── dev_error.json
    │   ├── gcn_test2.py
    │   ├── heterograph.py
    │   ├── homograph.py
    │   ├── main_function.py
    │   ├── main_function2.py
    │   ├── make_graph.py
    │   ├── pos_analy.py
    │   ├── save_graph.py
    │   ├── sent2_to_graph.py
    │   ├── sent_to_graph.py
    │   ├── stance_main_func.py
    │   ├── test.py
    │   ├── test_error.json
    │   ├── textrank.py
    │   ├── txt2json.py
    │   ├── use_bertopic.py
    │   ├── use_bertopic2.py
    │   ├── use_bertopic3.py
    │   └── use_firstsent.py
    ├── model
    │   ├── __pycache__
    │   │   └── modeling.cpython-37.pyc
    │   └── modeling.py
    ├── run_base.py
    ├── run_debug.py
    ├── run_grad1.py
    ├── run_grand.py
    ├── run_grand2
    ├── run_grand2.py
    ├── run_grand3_test.py
    ├── run_one.py
    └── run_stance.py
/README.md:
--------------------------------------------------------------------------------
 1 | How to run training
 2 | 
 3 | python run.py --train_file TRAIN_FILE_PATH --save_dir SAVE_DIRECTORY_NAME --do_train True --init_weight True
 4 | 
 5 | How to run testing
 6 | 
 7 | python run.py --predict_file PREDICT_FILE_PATH --output_dir MODEL_DIRECTORY_NAME --checkpoint MODEL_CHECKPOINT --do_eval True
 8 | 
 9 | Concrete example
10 | 
11 | python run.py --predict_file extractive_summary_mrc_test_4.0.json --output_dir ./ --checkpoint 16000 --do_eval True
12 | 
13 | --output_dir : directory from which the saved model is loaded. Used together with --checkpoint.
14 | 
15 | ex)
16 | --output_dir : ./
17 | --checkpoint : 16000
18 | loads the model stored in ./checkpoint-16000
19 | 
20 | 
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/dataset.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/dataset.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/main_function2.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/main_function2.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/sent_to_graph.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/sent_to_graph.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/stance_main_func.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/stance_main_func.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/dataset.py:
--------------------------------------------------------------------------------
 1 | import json, ast
 2 | import benepar
 3 | import dgl.frame
 4 | from torch.utils.data import TensorDataset, Dataset
 5 | import torch
 6 | from transformers import AutoTokenizer
 7 | import pandas as pd
 8 | import argparse
 9 | from tqdm import tqdm
10 | import spacy
11 | from mine_next.functions.sent_to_graph import constituent_to_tree, get_cons_tag_vocab, final_graph, all_process_graph
12 | import os, string
13 | 
14 | nlp = spacy.load('en_core_web_sm')
15 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'})
16 | 
17 | 
18 | def convert_only_sentence2tensordataset(dataset, pseudo, tokenizer, max_length, mode):
19 |     printable = set(string.printable)
20 |     total_idx = []
21 |     total_input_ids = []
22 |     total_attention_mask = []
23 |     total_label = []
24 |     total_token_type_ids = []
25 |     total_sim_label = []
26 |     claim_sentences = dataset['claim_sentence'].tolist()
27 |     claim_labels = dataset['claim_label'].tolist()
28 |     claim_article_id = dataset['article_id'].tolist()
29 |     gold_topic_sentences = dataset['topic_sentence'].tolist()
30 | 
31 |     claim_labels = [0 if label == 'O' else 1 for label in claim_labels]
32 |     # This block is the usual path for loading precomputed graphs (single-graph version, kept for reference).
33 |     # total_graph = {}
34 |     # max_constituent_length = 600
35 |     # total_constituent_labels = []
36 |     # with open('../data/IAM/claims/graphs/{}_constituent.txt'.format(mode), 'r', encoding='utf-8') as f:
37 |     #     constituents = f.readlines()
38 |     # # test
39 |     # for constituent in constituents:
40 |     #     constituent = ast.literal_eval(constituent.replace('\n', ''))
41 |     #     total_constituent_labels.append(constituent+[-1]*(max_constituent_length-len(constituent)))
42 |     # graphs = os.listdir('../data/IAM/claims/graphs')
43 |     # graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file)]  # only the train or dev dgl files
44 |     # for graph in graphs_list:
45 |     #     (g,), _ = dgl.load_graphs(graph)
46 |     #     idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
47 |     #     total_graph[int(idx)] = g
48 | 
49 |     # Usual path: load the precomputed first/second-order graphs from disk.
50 |     total_graph_first = {}
51 |     total_graph_second = {}
52 |     max_constituent_length = 600
53 |     total_constituent_label_first = []
54 |     total_constituent_label_second = []
55 |     with open('../data/IAM/claims/graphs/{}_constituent_first_second.txt'.format(mode), 'r', encoding='utf-8') as f:
56 |         constituents = f.readlines()
57 |     for constituent in constituents:
58 |         constituent = ast.literal_eval(constituent.replace('\n', ''))
59 |         total_constituent_label_first.append(constituent[0]+[-1]*(max_constituent_length-len(constituent[0])))
60 |         total_constituent_label_second.append(constituent[1]+[-1]*(max_constituent_length-len(constituent[1])))
61 |     graphs = os.listdir('../data/IAM/claims/graphs')
62 |     graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'first' in file)]  # only the train or dev dgl files
63 |     for graph in graphs_list:
64 |         (g,), _ = dgl.load_graphs(graph)
65 |         idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
66 |         total_graph_first[int(idx)] = g
67 |     graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'second' in file)]  # only the train or dev dgl files
68 |     for graph in graphs_list:
69 |         (g,), _ = dgl.load_graphs(graph)
70 |         idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
71 |         total_graph_second[int(idx)] = g
72 | 
73 |     for idx, (topic, claim_sentence, claim_label, article_id) in tqdm(enumerate(zip(gold_topic_sentences, claim_sentences, claim_labels, claim_article_id)), desc='convert to data to tensordataset', total=len(claim_labels)):
74 |         claim_sentence = claim_sentence.lower().replace('“', '"').replace('”', '"')
75 |         claim_sentence = "".join(filter(lambda x : x in printable, claim_sentence))
76 | 
77 |         # claim_graph_first, claim_graph_second,
constituent_label_first, constituent_label_second = all_process_graph(nlp, tokenizer, claim_sentence) 78 | # total_graph_first[idx] = claim_graph_first 79 | # total_graph_second[idx] = claim_graph_second 80 | # constituent_label_first = constituent_label_first.tolist() + [-1]*(max_constituent_length-len(constituent_label_first.tolist())) 81 | # constituent_label_second = constituent_label_second.tolist() + [-1]*(max_constituent_length-len(constituent_label_second.tolist())) 82 | # total_constituent_label_first.append(constituent_label_first) 83 | # total_constituent_label_second.append(constituent_label_second) 84 | 85 | # 슈도 토픽 할때 86 | #process_sentence = tokenizer(pseudo[article_id], claim_sentence, max_length=max_length, padding='max_length', truncation=True) 87 | #process_sentence = tokenizer(claim_sentence, max_length=max_length, padding='max_length', truncation=True) 88 | process_sentence = tokenizer(topic, claim_sentence, max_length=max_length, padding='max_length', truncation=True) 89 | input_ids = process_sentence['input_ids'] 90 | attention_mask = process_sentence['attention_mask'] 91 | # 주제에 대한부분만 1로 하고 나머진 0 으로 하는 식 92 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2] 93 | try: 94 | second_sep_index = sep_index[1] 95 | token_type_ids = [0] * second_sep_index 96 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids)) 97 | except IndexError: 98 | token_type_ids = [0] * max_length 99 | # 주장일 때 100 | if claim_label == 1: 101 | sim_label = 1 102 | # 주장이 아닐때 103 | elif claim_label == 0: 104 | sim_label = -1 105 | 106 | total_idx.append(idx) 107 | total_input_ids.append(input_ids) 108 | total_attention_mask.append(attention_mask) 109 | total_token_type_ids.append(token_type_ids) 110 | total_label.append(claim_label) 111 | total_sim_label.append(sim_label) 112 | #total_graph[idx] = claim_graph 113 | #total_constituent_labels.append(constituent_label_list) 114 | if idx < 3: 115 | print() 116 | print("****EXAMPLE****") 117 | print("topic sentence : {}".format(topic)) 118 | print("claim sentence : {}".format(claim_sentence)) 119 | print("claim sentence input ids : {}".format(input_ids)) 120 | print("claim sentence attention mask : {}".format(attention_mask)) 121 | print("claim sentence token type ids : {}".format(token_type_ids)) 122 | print("label : {}".format(claim_label)) 123 | print("sim label : {}".format(sim_label)) 124 | 125 | 126 | total_idx = torch.tensor(total_idx, dtype=torch.long) 127 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long) 128 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long) 129 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long) 130 | total_label = torch.tensor(total_label, dtype=torch.long) 131 | total_sim_label = torch.tensor(total_sim_label, dtype=torch.long) 132 | total_constituent_label_first = torch.tensor(total_constituent_label_first, dtype=torch.long) 133 | total_constituent_label_second = torch.tensor(total_constituent_label_second, dtype=torch.long) 134 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_label, total_sim_label, 135 | total_constituent_label_first, total_constituent_label_second) 136 | 137 | return dataset, total_graph_first, total_graph_second 138 | 139 | 140 | def convert_only_sentence2tensordataset(dataset, pseudo, tokenizer, max_length, mode): 141 | printable = set(string.printable) 142 | total_idx = [] 143 | total_input_ids = [] 144 | total_attention_mask = [] 145 | total_label = [] 
146 | total_token_type_ids = [] 147 | total_sim_label = [] 148 | claim_sentences = dataset['claim_sentence'].tolist() 149 | claim_labels = dataset['claim_label'].tolist() 150 | claim_article_id = dataset['article_id'].tolist() 151 | gold_topic_sentences = dataset['topic_sentence'].tolist() 152 | 153 | claim_labels = [0 if label is 'O' else 1 for label in claim_labels] 154 | # 여기부분은 평소에 불러다 쓸때 사용하는 부분 155 | # total_graph = {} 156 | # max_constituent_length = 600 157 | # total_constituent_labels = [] 158 | # with open('../data/IAM/claims/graphs/{}_constituent.txt'.format(mode), 'r', encoding='utf-8') as f: 159 | # constituents = f.readlines() 160 | # #테스트 161 | # for constituent in constituents: 162 | # constituent = ast.literal_eval(constituent.replace('\n', '')) 163 | # total_constituent_labels.append(constituent+[-1]*(max_constituent_length-len(constituent))) 164 | # graphs = os.listdir('../data/IAM/claims/graphs') 165 | # graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file)] #train이나 dev 의 dgl만 166 | # for graph in graphs_list: 167 | # (g,), _ = dgl.load_graphs(graph) 168 | # idx = graph.split('/')[-1].split('_')[-1].split('.')[0] 169 | # total_graph[int(idx)] = g 170 | 171 | #평소 불러쓸때 172 | total_graph_first = {} 173 | total_graph_second = {} 174 | max_constituent_length = 600 175 | total_constituent_label_first = [] 176 | total_constituent_label_second = [] 177 | with open('../data/IAM/claims/graphs/{}_constituent_first_second.txt'.format(mode), 'r', encoding='utf-8') as f: 178 | constituents = f.readlines() 179 | for constituent in constituents: 180 | constituent = ast.literal_eval(constituent.replace('\n', '')) 181 | total_constituent_label_first.append(constituent[0]+[-1]*(max_constituent_length-len(constituent[0]))) 182 | total_constituent_label_second.append(constituent[1]+[-1]*(max_constituent_length-len(constituent[1]))) 183 | graphs = os.listdir('../data/IAM/claims/graphs') 184 | graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'first' in file)] #train이나 dev 의 dgl만 185 | for graph in graphs_list: 186 | (g,), _ = dgl.load_graphs(graph) 187 | idx = graph.split('/')[-1].split('_')[-1].split('.')[0] 188 | total_graph_first[int(idx)] = g 189 | graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'second' in file)] #train이나 dev 의 dgl만 190 | for graph in graphs_list: 191 | (g,), _ = dgl.load_graphs(graph) 192 | idx = graph.split('/')[-1].split('_')[-1].split('.')[0] 193 | total_graph_second[int(idx)] = g 194 | 195 | for idx, (topic, claim_sentence, claim_label, article_id) in tqdm(enumerate(zip(gold_topic_sentences, claim_sentences, claim_labels, claim_article_id)), desc='convert to data to tensordataset', total=len(claim_labels)): 196 | claim_sentence = claim_sentence.lower().replace('“', '"').replace('”', '"') 197 | claim_sentence = "".join(filter(lambda x : x in printable, claim_sentence)) 198 | 199 | # claim_graph_first, claim_graph_second, constituent_label_first, constituent_label_second = all_process_graph(nlp, tokenizer, claim_sentence) 200 | # total_graph_first[idx] = claim_graph_first 201 | # total_graph_second[idx] = claim_graph_second 202 | # constituent_label_first = constituent_label_first.tolist() + [-1]*(max_constituent_length-len(constituent_label_first.tolist())) 203 | # constituent_label_second = constituent_label_second.tolist() + 
[-1]*(max_constituent_length-len(constituent_label_second.tolist())) 204 | # total_constituent_label_first.append(constituent_label_first) 205 | # total_constituent_label_second.append(constituent_label_second) 206 | 207 | # 슈도 토픽 할때 208 | process_sentence = tokenizer(pseudo[article_id], claim_sentence, max_length=max_length, padding='max_length', truncation=True) 209 | # 그냥 문장 하나만 할 때 210 | # process_sentence = tokenizer(claim_sentence, max_length=max_length, padding='max_length', truncation=True) 211 | # 골든 토픽과 문장 하나 212 | # process_sentence = tokenizer(topic, claim_sentence, max_length=max_length, padding='max_length', truncation=True) 213 | 214 | input_ids = process_sentence['input_ids'] 215 | attention_mask = process_sentence['attention_mask'] 216 | # 주제에 대한부분만 1로 하고 나머진 0 으로 하는 식 217 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2] 218 | try: 219 | second_sep_index = sep_index[1] 220 | token_type_ids = [0] * second_sep_index 221 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids)) 222 | except IndexError: 223 | token_type_ids = [0] * max_length 224 | # 주장일 때 225 | if claim_label == 1: 226 | sim_label = 1 227 | # 주장이 아닐때 228 | elif claim_label == 0: 229 | sim_label = -1 230 | 231 | total_idx.append(idx) 232 | total_input_ids.append(input_ids) 233 | total_attention_mask.append(attention_mask) 234 | total_token_type_ids.append(token_type_ids) 235 | total_label.append(claim_label) 236 | total_sim_label.append(sim_label) 237 | #total_graph[idx] = claim_graph 238 | #total_constituent_labels.append(constituent_label_list) 239 | if idx < 3: 240 | print() 241 | print("****EXAMPLE****") 242 | print("topic sentence : {}".format(topic)) 243 | print("pseudo topic sentence : {}".format(pseudo[article_id])) 244 | print("claim sentence : {}".format(claim_sentence)) 245 | print("claim sentence input ids : {}".format(input_ids)) 246 | print("claim sentence attention mask : {}".format(attention_mask)) 247 | print("claim sentence token type ids : {}".format(token_type_ids)) 248 | print("label : {}".format(claim_label)) 249 | print("sim label : {}".format(sim_label)) 250 | 251 | 252 | total_idx = torch.tensor(total_idx, dtype=torch.long) 253 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long) 254 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long) 255 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long) 256 | total_label = torch.tensor(total_label, dtype=torch.long) 257 | total_sim_label = torch.tensor(total_sim_label, dtype=torch.long) 258 | total_constituent_label_first = torch.tensor(total_constituent_label_first, dtype=torch.long) 259 | total_constituent_label_second = torch.tensor(total_constituent_label_second, dtype=torch.long) 260 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_label, total_sim_label, 261 | total_constituent_label_first, total_constituent_label_second) 262 | 263 | return dataset, total_graph_first, total_graph_second 264 | 265 | 266 | def convert_data2tensordataset(dataset, tokenizer, max_length, mode): 267 | total_input_ids = [] 268 | total_attention_mask = [] 269 | total_label = [] 270 | total_token_type_ids = [] 271 | total_sim_label = [] 272 | total_idx = [] 273 | claim_sentences = dataset['claim_sentence'].tolist() 274 | claim_labels = dataset['claim_label'].tolist() 275 | claim_labels = [0 if label is 'O' else 1 for label in claim_labels] 276 | topic_sentences = dataset['topic_sentence'].tolist() 277 | for idx, 
(topic_sentence, claim_sentence, claim_label) in tqdm(enumerate(zip(topic_sentences, claim_sentences, claim_labels)), desc='convert to data to tensordataset', total=len(claim_labels)): 278 | process_sentence = tokenizer(topic_sentence, claim_sentence, max_length=max_length, padding='max_length', truncation=True) 279 | input_ids = process_sentence['input_ids'] 280 | attention_mask = process_sentence['attention_mask'] 281 | # 주제에 대한부분만 1로 하고 나머진 0 으로 하는 식 282 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2] 283 | second_sep_index = sep_index[1] 284 | token_type_ids = [0] * second_sep_index 285 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids)) 286 | # 주장일 때 287 | if claim_label == 1: 288 | sim_label = 1 289 | # 주장이 아닐때 290 | elif claim_label == 0: 291 | sim_label = -1 292 | total_idx.append(idx) 293 | total_input_ids.append(input_ids) 294 | total_attention_mask.append(attention_mask) 295 | total_token_type_ids.append(token_type_ids) 296 | total_label.append(claim_label) 297 | total_sim_label.append(sim_label) 298 | if idx < 3: 299 | print() 300 | print("****EXAMPLE****") 301 | print("topic sentence : {}".format(topic_sentence)) 302 | print("claim sentence : {}".format(claim_sentence)) 303 | print("topic, claim sentence input ids : {}".format(input_ids)) 304 | print("topic, claim sentence attention mask : {}".format(attention_mask)) 305 | print("topic, claim sentence token type ids : {}".format(token_type_ids)) 306 | print("label : {}".format(claim_label)) 307 | print("sim label : {}".format(sim_label)) 308 | total_idx = torch.tensor(total_idx, dtype=torch.long) 309 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long) 310 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long) 311 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long) 312 | total_label = torch.tensor(total_label, dtype=torch.long) 313 | total_sim_label = torch.tensor(total_sim_label, dtype=torch.long) 314 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_label, total_sim_label) 315 | return dataset 316 | 317 | 318 | def convert_stance_data2tensordataset(dataset, tokenizer, max_length, mode=None): 319 | total_idx = [] 320 | total_input_ids = [] 321 | total_attention_mask = [] 322 | total_label = [] 323 | total_token_type_ids = [] 324 | total_sim_label = [] 325 | total_stance_label = [] 326 | #dataset = dataset[dataset['claim_label'] == 'C'] 327 | 328 | claim_sentences = dataset['claim_sentence'].tolist() 329 | topic_sentences = dataset['topic_sentence'].tolist() 330 | stance_labels = dataset['stance_labels'].tolist() 331 | for idx, (topic_sentence, claim_sentence, stance_label) in tqdm(enumerate(zip(topic_sentences, claim_sentences, stance_labels)), desc='convert to data to tensordataset', total=len(stance_labels)): 332 | process_sentence = tokenizer(topic_sentence, claim_sentence, max_length=max_length, padding='max_length', truncation=True) 333 | input_ids = process_sentence['input_ids'] 334 | attention_mask = process_sentence['attention_mask'] 335 | # 주제에 대한부분만 1로 하고 나머진 0 으로 하는 식 336 | try: 337 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2] 338 | second_sep_index = sep_index[1] 339 | token_type_ids = [0] * second_sep_index 340 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids)) 341 | except IndexError: 342 | token_type_ids = [0] * max_length 343 | #sent_attention_mask = (1-token_type_ids) * attention_mask 344 | total_idx.append(idx) 345 | 
total_input_ids.append(input_ids) 346 | total_attention_mask.append(attention_mask) 347 | total_token_type_ids.append(token_type_ids) 348 | if stance_label == -1: 349 | total_stance_label.append(0) 350 | else: 351 | total_stance_label.append(1) 352 | #total_stance_label.append(stance_label) 353 | if idx < 3: 354 | print() 355 | print("****EXAMPLE****") 356 | print("topic sentence : {}".format(topic_sentence)) 357 | print("claim sentence : {}".format(claim_sentence)) 358 | print("topic, claim sentence input ids : {}".format(input_ids)) 359 | print("topic, claim sentence attention mask : {}".format(attention_mask)) 360 | print("topic, claim sentence token type ids : {}".format(token_type_ids)) 361 | print("stance label : {}".format(stance_label)) 362 | 363 | total_idx = torch.tensor(total_idx, dtype=torch.long) 364 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long) 365 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long) 366 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long) 367 | total_stance_label = torch.tensor(total_stance_label, dtype=torch.long) 368 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_stance_label) 369 | return dataset 370 | 371 | # with open('../../../data/train_claim.json', 'r', encoding='utf-8') as reader: 372 | # dataset = json.load(reader)['data'] 373 | # 374 | # total_title = [] 375 | # total_input_ids = [] 376 | # total_attention_mask = [] 377 | # total_label = [] 378 | # for data in dataset: 379 | # title = data['title'] 380 | # total_title.append(title) 381 | # paragraphs = data['paragraphs'] 382 | # for para in paragraphs: 383 | # answers = para['qas'][0]['answers'] 384 | # context = para['context'] 385 | # result = tokenizer(context, padding='max_length', max_length=4096, truncation=True) 386 | # # cls idx 2 / sep idx 3 387 | # total_input_ids.append(result['input_ids']) 388 | # total_attention_mask.append(result['attention_mask']) 389 | # context_list = context.split('[SEP]') 390 | # each_label = [0] * len(context_list) 391 | # # 첫 sep는 첫번째 문장에 대한 표현. 문장의 오른쪽에 있는 sep를 기준으로 한다. 392 | # for answer in answers: 393 | # text = answer['text'] 394 | # for idx, ctx in enumerate(context_list): 395 | # if text in ctx: 396 | # print(idx+1) 397 | 398 | 399 | 400 | # if __name__ == '__main__': 401 | # parser = argparse.ArgumentParser(description='dataset creating') 402 | # parser.add_argument('--train_data', type=str, default='../../../data/train_claim.json') 403 | -------------------------------------------------------------------------------- /mine_next/functions/gcn_test2.py: -------------------------------------------------------------------------------- 1 | import dgl.nn.pytorch as dglnn 2 | import torch.nn as nn 3 | import dgl.data 4 | import torch.nn.functional as F 5 | from dgl.dataloading import GraphDataLoader 6 | import torch 7 | 8 | 9 | ### 10 | # 이건 그래프 통쨰로 분류하는 코드 11 | ### 12 | 13 | 14 | dataset = dgl.data.GINDataset('MUTAG', False) 15 | 16 | dataloader = GraphDataLoader( 17 | dataset, 18 | batch_size=1024, 19 | drop_last=False, 20 | shuffle=True) 21 | 22 | 23 | class Classifier(nn.Module): 24 | def __init__(self, in_dim, hidden_dim, n_classes): 25 | super(Classifier, self).__init__() 26 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim) 27 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim) 28 | self.classify = nn.Linear(hidden_dim, n_classes) 29 | 30 | def forward(self, g, h): 31 | # Apply graph convolution and activation. 
32 | h = F.relu(self.conv1(g, h)) 33 | h = F.relu(self.conv2(g, h)) 34 | with g.local_scope(): 35 | g.ndata['h'] = h 36 | # Calculate graph representation by average readout. 37 | # (batch size, 20(아마 히든사이즈)) 38 | hg = dgl.mean_nodes(g, 'h') 39 | return self.classify(hg) 40 | 41 | 42 | model = Classifier(7, 20, 5) 43 | opt = torch.optim.Adam(model.parameters()) 44 | for epoch in range(20): 45 | for batched_graph, labels in dataloader: 46 | # (num nodes, 7) 아마 라벨 개수가 7개인듯 47 | feats = batched_graph.ndata['attr'] 48 | logits = model(batched_graph, feats) 49 | loss = F.cross_entropy(logits, labels) 50 | opt.zero_grad() 51 | loss.backward() 52 | opt.step() -------------------------------------------------------------------------------- /mine_next/functions/heterograph.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. currentmodule:: dgl 3 | 4 | Working with Heterogeneous Graphs 5 | ================================= 6 | 7 | **Author**: Quan Gan, `Minjie Wang `_, Mufei Li, 8 | George Karypis, Zheng Zhang 9 | 10 | In this tutorial, you learn about: 11 | 12 | * Examples of heterogenous graph data and typical applications. 13 | 14 | * Creating and manipulating a heterogenous graph in DGL. 15 | 16 | * Implementing `Relational-GCN `_, a popular GNN model, 17 | for heterogenous graph input. 18 | 19 | * Training a model to solve a node classification task. 20 | 21 | Heterogeneous graphs, or *heterographs* for short, are graphs that contain 22 | different types of nodes and edges. The different types of nodes and edges tend 23 | to have different types of attributes that are designed to capture the 24 | characteristics of each node and edge type. Within the context of 25 | graph neural networks, depending on their complexity, certain node and edge types 26 | might need to be modeled with representations that have a different number of dimensions. 27 | 28 | DGL supports graph neural network computations on such heterogeneous graphs, by 29 | using the heterograph class and its associated API. 30 | 31 | """ 32 | 33 | ############################################################################### 34 | # Examples of heterographs 35 | # ----------------------- 36 | # Many graph datasets represent relationships among various types of entities. 37 | # This section provides an overview for several graph use-cases that show such relationships 38 | # and can have their data represented as heterographs. 39 | # 40 | # Citation graph 41 | # ~~~~~~~~~~~~~~~ 42 | # The Association for Computing Machinery publishes an `ACM dataset `_ that contains two 43 | # million papers, their authors, publication venues, and the other papers 44 | # that were cited. This information can be represented as a heterogeneous graph. 45 | # 46 | # The following diagram shows several entities in the ACM dataset and the relationships among them 47 | # (taken from `Shi et al., 2015 `_). 48 | # 49 | # .. figure:: https://data.dgl.ai/tutorial/hetero/acm-example.png# 50 | # 51 | # This graph has three types of entities that correspond to papers, authors, and publication venues. 
52 | # It also contains three types of edges that connect the following: 53 | # 54 | # * Authors with papers corresponding to *written-by* relationships 55 | # 56 | # * Papers with publication venues corresponding to *published-in* relationships 57 | # 58 | # * Papers with other papers corresponding to *cited-by* relationships 59 | # 60 | # 61 | # Recommender systems 62 | # ~~~~~~~~~~~~~~~~~~~~ 63 | # The datasets used in recommender systems often contain 64 | # interactions between users and items. For example, the data could include the 65 | # ratings that users have provided to movies. Such interactions can be modeled 66 | # as heterographs. 67 | # 68 | # The nodes in these heterographs will have two types, *users* and *movies*. The edges 69 | # will correspond to the user-movie interactions. Furthermore, if an interaction is 70 | # marked with a rating, then each rating value could correspond to a different edge type. 71 | # The following diagram shows an example of user-item interactions as a heterograph. 72 | # 73 | # .. figure:: https://data.dgl.ai/tutorial/hetero/recsys-example.png 74 | # 75 | # 76 | # Knowledge graph 77 | # ~~~~~~~~~~~~~~~~ 78 | # Knowledge graphs are inherently heterogenous. For example, in 79 | # Wikidata, Barack Obama (item Q76) is an instance of a human, which could be viewed as 80 | # the entity class, whose spouse (item P26) is Michelle Obama (item Q13133) and 81 | # occupation (item P106) is politician (item Q82955). The relationships are shown in the following. 82 | # diagram. 83 | # 84 | # .. figure:: https://data.dgl.ai/tutorial/hetero/kg-example.png 85 | # 86 | 87 | ############################################################################### 88 | # Creating a heterograph in DGL 89 | # ----------------------------- 90 | # You can create a heterograph in DGL using the :func:`dgl.heterograph` API. 91 | # The argument to :func:`dgl.heterograph` is a dictionary. The keys are tuples 92 | # in the form of ``(srctype, edgetype, dsttype)`` specifying the relation name 93 | # and the two entity types it connects. Such tuples are called *canonical edge types* 94 | # The values are data to initialize the graph structures, that is, which 95 | # nodes the edges actually connect. 96 | # 97 | # For instance, the following code creates the user-item interactions heterograph shown earlier. 98 | 99 | # Each value of the dictionary is a pair of source and destination arrays. 100 | # Nodes are integer IDs starting from zero. Nodes IDs of different types have 101 | # separate countings. 102 | import dgl 103 | import numpy as np 104 | 105 | ratings = dgl.heterograph( 106 | {('user', '+1', 'movie') : (np.array([0, 0, 1]), np.array([0, 1, 0])), 107 | ('user', '-1', 'movie') : (np.array([2]), np.array([1]))}) 108 | 109 | ############################################################################### 110 | # Manipulating heterograph 111 | # ------------------------ 112 | # You can create a more realistic heterograph using the ACM dataset. 
To do this, first 113 | # download the dataset as follows: 114 | 115 | import scipy.io 116 | import urllib.request 117 | 118 | data_url = 'https://data.dgl.ai/dataset/ACM.mat' 119 | data_file_path = '/tmp/ACM.mat' 120 | 121 | urllib.request.urlretrieve(data_url, data_file_path) 122 | data = scipy.io.loadmat(data_file_path) 123 | print(list(data.keys())) 124 | 125 | ############################################################################### 126 | # The dataset stores node information by their types: ``P`` for paper, ``A`` 127 | # for author, ``C`` for conference, ``L`` for subject code, and so on. The relationships 128 | # are stored as SciPy sparse matrix under key ``XvsY``, where ``X`` and ``Y`` 129 | # could be any of the node type code. 130 | # 131 | # The following code prints out some statistics about the paper-author relationships. 132 | 133 | print(type(data['PvsA'])) 134 | print('#Papers:', data['PvsA'].shape[0]) 135 | print('#Authors:', data['PvsA'].shape[1]) 136 | print('#Links:', data['PvsA'].nnz) 137 | 138 | ############################################################################### 139 | # Converting this SciPy matrix to a heterograph in DGL is straightforward. 140 | 141 | pa_g = dgl.heterograph({('paper', 'written-by', 'author') : data['PvsA'].nonzero()}) 142 | 143 | ############################################################################### 144 | # You can easily print out the type names and other structural information. 145 | 146 | print('Node types:', pa_g.ntypes) 147 | print('Edge types:', pa_g.etypes) 148 | print('Canonical edge types:', pa_g.canonical_etypes) 149 | 150 | # Nodes and edges are assigned integer IDs starting from zero and each type has its own counting. 151 | # To distinguish the nodes and edges of different types, specify the type name as the argument. 152 | print(pa_g.number_of_nodes('paper')) 153 | # Canonical edge type name can be shortened to only one edge type name if it is 154 | # uniquely distinguishable. 155 | print(pa_g.number_of_edges(('paper', 'written-by', 'author'))) 156 | print(pa_g.number_of_edges('written-by')) 157 | print(pa_g.successors(1, etype='written-by')) # get the authors that write paper #1 158 | 159 | # Type name argument could be omitted whenever the behavior is unambiguous. 160 | print(pa_g.number_of_edges()) # Only one edge type, the edge type argument could be omitted 161 | 162 | ############################################################################### 163 | # A homogeneous graph is just a special case of a heterograph with only one type 164 | # of node and edge. 165 | 166 | # Paper-citing-paper graph is a homogeneous graph 167 | pp_g = dgl.heterograph({('paper', 'citing', 'paper') : data['PvsP'].nonzero()}) 168 | # equivalent (shorter) API for creating homogeneous graph 169 | pp_g = dgl.from_scipy(data['PvsP']) 170 | 171 | # All the ntype and etype arguments could be omitted because the behavior is unambiguous. 172 | print(pp_g.number_of_nodes()) 173 | print(pp_g.number_of_edges()) 174 | print(pp_g.successors(3)) 175 | 176 | ############################################################################### 177 | # Create a subset of the ACM graph using the paper-author, paper-paper, 178 | # and paper-subject relationships. Meanwhile, also add the reverse 179 | # relationship to prepare for the later sections. 
180 | 181 | G = dgl.heterograph({ 182 | ('paper', 'written-by', 'author') : data['PvsA'].nonzero(), 183 | ('author', 'writing', 'paper') : data['PvsA'].transpose().nonzero(), 184 | ('paper', 'citing', 'paper') : data['PvsP'].nonzero(), 185 | ('paper', 'cited', 'paper') : data['PvsP'].transpose().nonzero(), 186 | ('paper', 'is-about', 'subject') : data['PvsL'].nonzero(), 187 | ('subject', 'has', 'paper') : data['PvsL'].transpose().nonzero(), 188 | }) 189 | 190 | print(G) 191 | 192 | ############################################################################### 193 | # **Metagraph** (or network schema) is a useful summary of a heterograph. 194 | # Serving as a template for a heterograph, it tells how many types of objects 195 | # exist in the network and where the possible links exist. 196 | # 197 | # DGL provides easy access to the metagraph, which could be visualized using 198 | # external tools. 199 | 200 | # Draw the metagraph using graphviz. 201 | # import pygraphviz as pgv 202 | # def plot_graph(nxg): 203 | # ag = pgv.AGraph(strict=False, directed=True) 204 | # for u, v, k in nxg.edges(keys=True): 205 | # ag.add_edge(u, v, label=k) 206 | # ag.layout('dot') 207 | # ag.draw('graph.png') 208 | # 209 | # plot_graph(G.metagraph()) 210 | 211 | ############################################################################### 212 | # Learning tasks associated with heterographs 213 | # ------------------------------------------- 214 | # Some of the typical learning tasks that involve heterographs include: 215 | # 216 | # * *Node classification and regression* to predict the class of each node or 217 | # estimate a value associated with it. 218 | # 219 | # * *Link prediction* to predict if there is an edge of a certain 220 | # type between a pair of nodes, or predict which other nodes a particular 221 | # node is connected with (and optionally the edge types of such connections). 222 | # 223 | # * *Graph classification/regression* to assign an entire 224 | # heterograph into one of the target classes or to estimate a numerical 225 | # value associated with it. 226 | # 227 | # In this tutorial, we designed a simple example for the first task. 228 | # 229 | # A semi-supervised node classification example 230 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 231 | # Our goal is to predict the publishing conference of a paper using the ACM 232 | # academic graph we just created. To further simplify the task, we only focus 233 | # on papers published in three conferences: *KDD*, *ICML*, and *VLDB*. All 234 | # the other papers are not labeled, making it a semi-supervised setting. 235 | # 236 | # The following code extracts those papers from the raw dataset and prepares 237 | # the training, validation, testing split. 
238 | 239 | import numpy as np 240 | import torch 241 | import torch.nn as nn 242 | import torch.nn.functional as F 243 | 244 | pvc = data['PvsC'].tocsr() 245 | # find all papers published in KDD, ICML, VLDB 246 | c_selected = [0, 11, 13] # KDD, ICML, VLDB 247 | p_selected = pvc[:, c_selected].tocoo() 248 | # generate labels 249 | labels = pvc.indices 250 | labels[labels == 11] = 1 251 | labels[labels == 13] = 2 252 | labels = torch.tensor(labels).long() 253 | 254 | # generate train/val/test split 255 | pid = p_selected.row 256 | shuffle = np.random.permutation(pid) 257 | train_idx = torch.tensor(shuffle[0:800]).long() 258 | val_idx = torch.tensor(shuffle[800:900]).long() 259 | test_idx = torch.tensor(shuffle[900:]).long() 260 | 261 | ############################################################################### 262 | # Relational-GCN on heterograph 263 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 264 | # We use `Relational-GCN `_ to learn the 265 | # representation of nodes in the graph. Its message-passing equation is as 266 | # follows: 267 | # 268 | # .. math:: 269 | # 270 | # h_i^{(l+1)} = \sigma\left(\sum_{r\in \mathcal{R}} 271 | # \sum_{j\in\mathcal{N}_r(i)}W_r^{(l)}h_j^{(l)}\right) 272 | # 273 | # Breaking down the equation, you see that there are two parts in the 274 | # computation. 275 | # 276 | # (i) Message computation and aggregation within each relation :math:`r` 277 | # 278 | # (ii) Reduction that merges the results from multiple relationships 279 | # 280 | # Following this intuition, perform message passing on a heterograph in 281 | # two steps. 282 | # 283 | # (i) Per-edge-type message passing 284 | # 285 | # (ii) Type wise reduction 286 | 287 | import dgl.function as fn 288 | 289 | class HeteroRGCNLayer(nn.Module): 290 | def __init__(self, in_size, out_size, etypes): 291 | super(HeteroRGCNLayer, self).__init__() 292 | # W_r for each relation 293 | self.weight = nn.ModuleDict({ 294 | name : nn.Linear(in_size, out_size) for name in etypes 295 | }) 296 | 297 | def forward(self, G, feat_dict): 298 | # The input is a dictionary of node features for each type 299 | funcs = {} 300 | for srctype, etype, dsttype in G.canonical_etypes: 301 | # Compute W_r * h 302 | Wh = self.weight[etype](feat_dict[srctype]) 303 | # Save it in graph for message passing 304 | G.nodes[srctype].data['Wh_%s' % etype] = Wh 305 | # Specify per-relation message passing functions: (message_func, reduce_func). 306 | # Note that the results are saved to the same destination feature 'h', which 307 | # hints the type wise reducer for aggregation. 308 | funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h')) 309 | # Trigger message passing of multiple types. 310 | # The first argument is the message passing functions for each relation. 311 | # The second one is the type wise reducer, could be "sum", "max", 312 | # "min", "mean", "stack" 313 | G.multi_update_all(funcs, 'sum') 314 | # return the updated node feature dictionary 315 | return {ntype : G.nodes[ntype].data['h'] for ntype in G.ntypes} 316 | 317 | ############################################################################### 318 | # Create a simple GNN by stacking two ``HeteroRGCNLayer``. Since the 319 | # nodes do not have input features, make their embeddings trainable. 320 | 321 | class HeteroRGCN(nn.Module): 322 | def __init__(self, G, in_size, hidden_size, out_size): 323 | super(HeteroRGCN, self).__init__() 324 | # Use trainable node embeddings as featureless inputs. 
325 | embed_dict = {ntype : nn.Parameter(torch.Tensor(G.number_of_nodes(ntype), in_size)) 326 | for ntype in G.ntypes} 327 | for key, embed in embed_dict.items(): 328 | nn.init.xavier_uniform_(embed) 329 | self.embed = nn.ParameterDict(embed_dict) 330 | # create layers 331 | self.layer1 = HeteroRGCNLayer(in_size, hidden_size, G.etypes) 332 | self.layer2 = HeteroRGCNLayer(hidden_size, out_size, G.etypes) 333 | 334 | def forward(self, G): 335 | h_dict = self.layer1(G, self.embed) 336 | h_dict = {k : F.leaky_relu(h) for k, h in h_dict.items()} 337 | h_dict = self.layer2(G, h_dict) 338 | # get paper logits 339 | return h_dict['paper'] 340 | 341 | ############################################################################### 342 | # Train and evaluate 343 | # ~~~~~~~~~~~~~~~~~~ 344 | # Train and evaluate this network. 345 | 346 | # Create the model. The output has three logits for three classes. 347 | model = HeteroRGCN(G, 10, 10, 3) 348 | 349 | opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4) 350 | 351 | best_val_acc = 0 352 | best_test_acc = 0 353 | 354 | for epoch in range(100): 355 | logits = model(G) 356 | # The loss is computed only for labeled nodes. 357 | loss = F.cross_entropy(logits[train_idx], labels[train_idx]) 358 | 359 | pred = logits.argmax(1) 360 | train_acc = (pred[train_idx] == labels[train_idx]).float().mean() 361 | val_acc = (pred[val_idx] == labels[val_idx]).float().mean() 362 | test_acc = (pred[test_idx] == labels[test_idx]).float().mean() 363 | 364 | if best_val_acc < val_acc: 365 | best_val_acc = val_acc 366 | best_test_acc = test_acc 367 | 368 | opt.zero_grad() 369 | loss.backward() 370 | opt.step() 371 | 372 | if epoch % 5 == 0: 373 | print('Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % ( 374 | loss.item(), 375 | train_acc.item(), 376 | val_acc.item(), 377 | best_val_acc.item(), 378 | test_acc.item(), 379 | best_test_acc.item(), 380 | )) 381 | 382 | ############################################################################### 383 | # What's next? 384 | # ------------ 385 | # * Check out our full implementation in PyTorch 386 | # `here `_. 387 | # 388 | # * We also provide the following model examples: 389 | # 390 | # * `Graph Convolutional Matrix Completion _`, 391 | # which we implement in MXNet 392 | # `here `_. 393 | # 394 | # * `Heterogeneous Graph Attention Network `_ 395 | # requires transforming a heterograph into a homogeneous graph according to 396 | # a given metapath (i.e. a path template consisting of edge types). We 397 | # provide :func:`dgl.transform.metapath_reachable_graph` to do this. See full 398 | # implementation 399 | # `here `_. 400 | # 401 | # * `Metapath2vec `_ requires 402 | # generating random walk paths according to a given metapath. Please 403 | # refer to the full metapath2vec implementation 404 | # `here `_. 405 | # 406 | # * :doc:`Full heterograph API reference <../../api/python/heterograph>`. 
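###############################################################################
# A minimal, self-contained sketch (not part of the original DGL tutorial) of
# how a ``HeteroRGCNLayer`` like the one defined above can be exercised on a
# toy graph, without downloading ACM.mat. The toy relations, node counts, and
# feature sizes here are illustrative assumptions only, not values used
# elsewhere in this repository.

# Two node types (user, movie) with edges in both directions, so every node
# type has incoming messages and receives an updated representation.
toy_g = dgl.heterograph({
    ('user', 'rates', 'movie'): (torch.tensor([0, 0, 1]), torch.tensor([0, 1, 1])),
    ('movie', 'rated-by', 'user'): (torch.tensor([0, 1, 1]), torch.tensor([0, 0, 1])),
})
# Random 8-dimensional input features per node type.
toy_feats = {'user': torch.randn(2, 8), 'movie': torch.randn(2, 8)}
# One relational layer mapping 8 -> 4 dimensions, one weight matrix per etype.
toy_layer = HeteroRGCNLayer(8, 4, toy_g.etypes)
toy_out = toy_layer(toy_g, toy_feats)
print({ntype: h.shape for ntype, h in toy_out.items()})  # user/movie -> (2, 4)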
407 | -------------------------------------------------------------------------------- /mine_next/functions/homograph.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import dgl 3 | import dgl.frame 4 | import torch 5 | import os, csv 6 | import pandas as pd 7 | from tqdm import tqdm 8 | import benepar 9 | from transformers import AutoTokenizer 10 | import string 11 | import dgl.nn.pytorch as dglnn 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | class Classifier(nn.Module): 17 | def __init__(self, in_dim, hidden_dim, n_classes): 18 | super(Classifier, self).__init__() 19 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim) 20 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim) 21 | self.classify = nn.Linear(hidden_dim, n_classes) 22 | #self.cons_type_embeddings = nn.Embedding(82, 300) 23 | 24 | def forward(self, g, h, edge_type=None): 25 | # Apply graph convolution and activation. 26 | # cons_node_ids = g.filter(lambda nodes:nodes.data['dtype'] == 1 ) 27 | # cc_edge_id = g.filter(lambda edges : edges.data['dtype'] == edge_type) 28 | # self_edge_id = g.filter(lambda edges : edges.data['dtype'] == 4) 29 | # cc_edge_id = torch.cat([cc_edge_id, self_edge_id], dim=0) 30 | 31 | h = F.relu(self.conv1(g, h)) 32 | h = F.relu(self.conv2(g, h)) 33 | with g.local_scope(): 34 | g.ndata['h'] = h 35 | # Calculate graph representation by average readout. 36 | # (batch size, 20(아마 히든사이즈)) 37 | hg = dgl.mean_nodes(g, 'h') 38 | return self.classify(hg) 39 | 40 | 41 | class RGCN(nn.Module): 42 | def __init__(self, in_feats, hid_feats, out_feats, rel_names): 43 | super().__init__() 44 | 45 | self.conv1 = dglnn.HeteroGraphConv({ 46 | rel: dglnn.GraphConv(in_feats, hid_feats) 47 | for rel in rel_names}, aggregate='sum') 48 | self.conv2 = dglnn.HeteroGraphConv({ 49 | rel: dglnn.GraphConv(hid_feats, out_feats) 50 | for rel in rel_names}, aggregate='sum') 51 | 52 | def forward(self, graph, inputs): 53 | # inputs is features of nodes 54 | h = self.conv1(graph, inputs) 55 | h = {k: F.relu(v) for k, v in h.items()} 56 | h = self.conv2(graph, h) 57 | return h 58 | 59 | 60 | class HeteroClassifier(nn.Module): 61 | def __init__(self, in_dim, hidden_dim, n_classes, rel_names): 62 | super().__init__() 63 | 64 | self.rgcn = RGCN(in_dim, hidden_dim, hidden_dim, rel_names) 65 | self.classify = nn.Linear(hidden_dim, n_classes) 66 | 67 | def forward(self, g, h): 68 | #h = g.ndata['feat'] 69 | h = self.rgcn(g, h) 70 | with g.local_scope(): 71 | g.ndata['h'] = h 72 | # Calculate graph representation by average readout. 73 | hg = 0 74 | for ntype in g.ntypes: 75 | hg = hg + dgl.mean_nodes(g, 'h', ntype=ntype) 76 | return self.classify(hg) 77 | 78 | 79 | class Tree(object): 80 | def __init__(self, type): 81 | # self.start, self.end 단어 기준으로 세는것. one 이 0번째, . 
이 27번쨰라 root 노드가 start=0, end=27을 가지고 있는거임 82 | self.parent = None 83 | self.num_children = 0 84 | self.children = list() 85 | self.type = type 86 | self.is_leaf = False 87 | self.start = -1 88 | self.end = -1 89 | self.idx = -1 90 | 91 | def add_child(self, child): 92 | child.parent = self 93 | self.num_children += 1 94 | self.children.append(child) 95 | 96 | def size(self): 97 | count = 1 98 | for i in range(self.num_children): 99 | count += self.children[i].size() 100 | return count 101 | 102 | def __str__(self): 103 | return self.type 104 | 105 | def __iter__(self): 106 | yield self 107 | for c in self.children: 108 | for x in c: 109 | yield x 110 | 111 | def get_cons_tag_vocab(data_path): 112 | tag2id = {} 113 | with open(data_path) as f: 114 | for line in f.readlines(): 115 | tag, idx = line.strip().split('\t') 116 | tag2id[tag] = int(idx) 117 | return tag2id 118 | 119 | def span_starts_ends(node: Tree): 120 | if len(node.children) == 0: 121 | return 122 | for child in node.children: 123 | span_starts_ends(child) 124 | 125 | node.start = node.children[0].start 126 | node.end = node.children[-1].end 127 | 128 | def constituent_to_tree(tokenizer, constituent_string, sentence, word_offset, node_offset, num_orders=2): 129 | constituents = [] 130 | temp_str = "" 131 | # 괄호 ['(', 'S', '(', 'NP', '(', 'NP', '(', 'CD', 'one', ')', '(', 'ADJP', '(', 'RB', 'long', ')' ... ] 이런식으로 (,)나 단어, constituent 단위로 분리 132 | for i, char in enumerate(constituent_string): 133 | if char == "(" or char == ")" or char == " ": 134 | if len(temp_str) != 0: 135 | constituents.append(temp_str) 136 | temp_str = "" 137 | if char != " ": 138 | constituents.append(char) 139 | else: 140 | temp_str += char 141 | # NP, PP등 노드 단위로 stack 142 | stack = [] 143 | for cons in constituents: 144 | if cons != ")": 145 | stack.append(cons) 146 | else: 147 | tail = stack.pop() 148 | temp_constituents = [] 149 | while tail != "(": 150 | temp_constituents.append(tail) 151 | tail = stack.pop() 152 | 153 | parent = Tree(temp_constituents[-1]) 154 | for i in range(len(temp_constituents) - 2, -1, -1): 155 | if isinstance(temp_constituents[i], Tree): 156 | parent.add_child(temp_constituents[i]) 157 | else: 158 | child = Tree(temp_constituents[i]) 159 | parent.add_child(child) 160 | stack.append(parent) 161 | root = stack[-1] 162 | for node in root: 163 | if len(node.children) == 0: 164 | node.is_leaf = True 165 | 166 | for node in root: 167 | if node.is_leaf: 168 | node.start = word_offset 169 | node.end = word_offset 170 | word_offset += 1 171 | span_starts_ends(root) 172 | 173 | node_sequence = [] 174 | # internal nodes 는 S, NP VP, PP 와같은 노드만. 
one, lone과같은 노드는 없음 175 | internal_nodes = [] 176 | for node in root: 177 | if not node.is_leaf: 178 | internal_nodes.append(node) 179 | node_sequence.append(node) 180 | 181 | node_offset_original = node_offset 182 | for node in root: 183 | if node.is_leaf: 184 | continue 185 | node.idx = node_offset 186 | node_offset += 1 187 | 188 | constituent_sequence = [] # [(node idx, node start, node end, node type, parent idx)] 189 | num_internal_nodes = len(internal_nodes) 190 | # constituent_edge 191 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)] 192 | for i, node in enumerate(internal_nodes): 193 | parent_idx = node.parent.idx if node.parent else -1 194 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx)) 195 | if parent_idx != -1: 196 | constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1 # 바로 아래 코드랑 보면 양방향 엣지 포함하는거임 197 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1 198 | # 이부분은 한계층 건너 뛰어서 엣지 이어 주는 식임. 원래 S랑 PP는 안이어져있는데 여기서 이어줌 199 | high_order_sequence = [constituent_sequence] 200 | for i in range(1, num_orders): 201 | new_constituent_sequence = [] 202 | for idx, start, end, type, parent_idx in high_order_sequence[-1]: 203 | if parent_idx == -1: 204 | continue 205 | parent_node = constituent_sequence[parent_idx - node_offset_original] 206 | if parent_node[-1] == -1: 207 | continue 208 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1])) 209 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1 210 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1 211 | high_order_sequence.append(new_constituent_sequence) 212 | return high_order_sequence, word_offset, node_offset 213 | 214 | def final_graph(constituent_list, graph): 215 | cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt') 216 | forward_edge_type, backward_edge_type = 0, 2 217 | 218 | prev_root_node_id = None 219 | constituent_labels = [] 220 | for high_order_sent_cons in constituent_list: 221 | for i, sent_cons in enumerate(high_order_sent_cons): 222 | for idx, start, end, label, parent_idx in sent_cons: 223 | idx_nodeid = idx # 원래는 constituent_start_idx = 0, node_id_offset = 406(token id까지였음. 1063중 406이 토큰이고 이후가 node 였었음.) 
224 | # parent 없는 노드 225 | if parent_idx == -1: 226 | if prev_root_node_id is not None: 227 | # graph.add_edges(prev_root_node_id, idx_nodeid, 228 | # data={'cc_link': torch.tensor([forward_edge_type + i]), 229 | # 'dtype': torch.tensor([forward_edge_type + i])}) 230 | # # dual GAT 231 | # graph.add_edges(idx_nodeid, prev_root_node_id, 232 | # data={'cc_link': torch.tensor([backward_edge_type + i]), 233 | # 'dtype': torch.tensor([backward_edge_type + i])}) 234 | graph.add_edges(prev_root_node_id, idx_nodeid, 235 | data={'cc_link': torch.tensor([1]), 236 | 'dtype': torch.tensor([1])}) 237 | # dual GAT 238 | graph.add_edges(idx_nodeid, prev_root_node_id, 239 | data={'cc_link': torch.tensor([1]), 240 | 'dtype': torch.tensor([1])}) 241 | prev_root_node_id = idx_nodeid 242 | # parent 있는 노드들 243 | if parent_idx != -1: 244 | parent_idx_nodeid = parent_idx 245 | # graph.add_edges(parent_idx_nodeid, idx_nodeid, 246 | # data={'cc_link': torch.tensor([forward_edge_type + i]), 247 | # 'dtype': torch.tensor([forward_edge_type + i])}) 248 | # graph.add_edges(idx_nodeid, parent_idx_nodeid, 249 | # data={'cc_link': torch.tensor([backward_edge_type + i]), 250 | # 'dtype': torch.tensor([backward_edge_type + i])}) 251 | graph.add_edges(parent_idx_nodeid, idx_nodeid, 252 | data={'cc_link': torch.tensor([1]), 253 | 'dtype': torch.tensor([1])}) 254 | graph.add_edges(idx_nodeid, parent_idx_nodeid, 255 | data={'cc_link': torch.tensor([1]), 256 | 'dtype': torch.tensor([1])}) 257 | if i == 0: 258 | # self-loop edge 259 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([4]), 260 | # 'dtype': torch.tensor([4])}) 261 | graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]), 262 | 'dtype': torch.tensor([1])}) 263 | constituent_labels.append(cons_tag2id[label]) 264 | 265 | constituent_labels = torch.tensor(constituent_labels,dtype=torch.long) 266 | return graph, constituent_labels 267 | 268 | def all_process_graph(nlp, tokenizer, sentence): 269 | sentence_doc = nlp(sentence) 270 | sentence_sent = list(sentence_doc.sents)[0] 271 | parse_string = sentence_sent._.parse_string 272 | word_offset, node_offset = 0, 0 273 | constituent = [] 274 | constituent_sequence, word_offset, node_offset = \ 275 | constituent_to_tree(tokenizer, parse_string, sentence, word_offset, node_offset) 276 | constituent.append(constituent_sequence) 277 | 278 | graph = dgl.graph([]) 279 | graph.set_n_initializer(dgl.frame.zero_initializer) 280 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) 281 | 282 | graph.add_nodes(num_cons) 283 | graph.ndata['unit'] = torch.ones(num_cons) 284 | graph.ndata['dtype'] = torch.ones(num_cons) 285 | 286 | claim_graph, constituent_labels = \ 287 | final_graph(constituent, graph) 288 | return claim_graph, constituent_labels 289 | 290 | 291 | if __name__ == "__main__": 292 | nlp = spacy.load('en_core_web_sm') 293 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'}) 294 | tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=False, use_fast=False) 295 | 296 | dev_data = pd.read_csv('../../data/IAM/claims/dev.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 297 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 298 | dev_data = dev_data.dropna(axis=0) 299 | 300 | dev_sentences = dev_data['claim_sentence'].tolist()[:10] 301 | total_dev_constituent_label = [] 302 | printable = set(string.printable) 303 | total_graph = {} 304 | cons_type_embeddings = nn.Embedding(82, 300) 305 | model = 
Classifier(300, 300, 2) # homo graph 테스트용 306 | 307 | for idx, dev in tqdm(enumerate(dev_sentences), total=len(dev_sentences)): 308 | dev = dev.lower().replace('“', '"').replace('”', '"') 309 | dev = "".join(filter(lambda x: x in printable, dev)) 310 | 311 | dev_graph, dev_constituent_label = all_process_graph(nlp, tokenizer, dev) 312 | total_dev_constituent_label.append([dev_constituent_label]) 313 | total_graph[idx] = dev_graph 314 | cons_node_feat = cons_type_embeddings(dev_constituent_label) 315 | #etypes = ['0', '1', '2', '3', '4'] 316 | #model = HeteroClassifier(300, 300, 2, etypes) 317 | #print(dev_graph.edges(form='all')) 318 | logits = model(dev_graph, cons_node_feat) # homo 319 | #logits = model(dev_graph, cons_node_feat) # hetero 320 | print(logits) 321 | 322 | 323 | -------------------------------------------------------------------------------- /mine_next/functions/main_function.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import os 3 | from tqdm import tqdm 4 | import torch 5 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 6 | from transformers.optimization import AdamW, get_linear_schedule_with_warmup 7 | from transformers import AutoConfig, AutoTokenizer 8 | from sklearn.metrics import classification_report, accuracy_score 9 | from sklearn.utils import resample 10 | 11 | import csv 12 | import numpy as np 13 | import pandas as pd 14 | import json 15 | 16 | # from source.claim_classification.model.modeling import KoElectraForClaimClassification 17 | # from source.claim_classification.func.dataset import convert_data2tensordataset 18 | 19 | from mine_next.model.modeling import RobertaForClassification 20 | from mine_next.functions.dataset import ( 21 | convert_data2tensordataset, 22 | convert_stance_data2tensordataset, 23 | convert_only_sentence2tensordataset 24 | ) 25 | 26 | 27 | def random_downsampling(dataset): 28 | major = dataset[dataset['claim_label'] == 'O'] 29 | minor = dataset[dataset['claim_label'] == 'C'] 30 | sampling_data = resample(major, replace=True, n_samples=len(minor)*5, random_state=42) 31 | train_data = pd.concat([sampling_data, minor]) 32 | return train_data 33 | 34 | 35 | def random_upsampling(dataset): 36 | major = dataset[dataset['claim_label'] == 'O'] 37 | minor = dataset[dataset['claim_label'] == 'C'] 38 | sampling_data = resample(minor, replace=True, n_samples=len(major), random_state=42) 39 | train_data = pd.concat([sampling_data, major]) 40 | return train_data 41 | 42 | def do_train(config, model, optimizer, scheduler, train_dataloader, epoch, global_step, total_graph): 43 | losses = [] 44 | total_predicts, total_corrects = [], [] 45 | for step, batch in tqdm(enumerate(train_dataloader), desc='do_train(epoch_{})'.format(epoch), total=len(train_dataloader)): 46 | batch = tuple(t.cuda() for t in batch) 47 | # graph 같이 학습할 경우 48 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 49 | # constituent_labels = batch[6] 50 | # loss, predicts = model( 51 | # idx=idx, 52 | # input_ids=input_ids, 53 | # attention_mask=attention_mask, 54 | # token_type_ids=token_type_ids, 55 | # labels=labels, 56 | # sim_labels=sim_labels, 57 | # all_graph=total_graph, 58 | # constituent_labels=constituent_labels 59 | # ) 60 | # base 61 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \ 62 | batch[4], batch[5] 63 | loss, predicts = model( 64 | 
idx=idx, 65 | input_ids=input_ids, 66 | attention_mask=attention_mask, 67 | token_type_ids=token_type_ids, 68 | labels=labels, 69 | sim_labels=sim_labels, 70 | ) 71 | predicts = predicts.argmax(dim=-1) 72 | predicts = predicts.cpu().detach().numpy().tolist() 73 | labels = labels.cpu().detach().numpy().tolist() 74 | 75 | total_predicts.extend(predicts) 76 | total_corrects.extend(labels) 77 | 78 | if config.gradient_accumulation_steps > 1: 79 | loss = loss / config.gradient_accumulation_steps 80 | # 원래는 tensor(0.7255)이런식 81 | loss.backward() 82 | losses.append(loss.data.item()) 83 | if (step + 1) % config.gradient_accumulation_steps == 0 or \ 84 | (len(train_dataloader) <= config.gradient_accumulation_steps and (step + 1) == len( 85 | train_dataloader)): 86 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) 87 | optimizer.step() 88 | scheduler.step() 89 | 90 | model.zero_grad() 91 | global_step += 1 92 | target_names = ['class 0', 'class 1'] 93 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 94 | accuracy = accuracy_score(total_corrects, total_predicts) 95 | return accuracy, np.mean(losses), global_step 96 | 97 | 98 | def do_evaluate(model, dev_dataloader, total_graph): 99 | total_predicts, total_corrects = [], [] 100 | for step, batch in tqdm(enumerate(dev_dataloader), desc="do_evaluate", total=len(dev_dataloader)): 101 | batch = tuple(t.cuda() for t in batch) 102 | # graph 학습할 경우 103 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 104 | # constituent_labels = batch[6] 105 | # predicts = model( 106 | # idx=idx, 107 | # input_ids=input_ids, 108 | # attention_mask=attention_mask, 109 | # token_type_ids=token_type_ids, 110 | # all_graph=total_graph, 111 | # constituent_labels=constituent_labels 112 | # ) 113 | # base 114 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \ 115 | batch[4], batch[5] 116 | predicts = model( 117 | idx=idx, 118 | input_ids=input_ids, 119 | attention_mask=attention_mask, 120 | token_type_ids=token_type_ids, 121 | ) 122 | predicts = predicts.argmax(dim=-1) 123 | predicts = predicts.detach().cpu().tolist() 124 | labels = labels.detach().cpu().tolist() 125 | total_predicts.extend(predicts) 126 | total_corrects.extend(labels) 127 | target_names = ['class 0', 'class 1'] 128 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 129 | accuracy = accuracy_score(total_corrects, total_predicts) 130 | return accuracy, total_predicts 131 | 132 | 133 | def train(config, model, tokenizer): 134 | 135 | # 데이터셋 로드 136 | train_data = pd.read_csv(config.claim_train, sep='\t', header=None, quoting=csv.QUOTE_NONE) 137 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 138 | train_data = train_data.dropna(axis=0) 139 | # train_data = train_data[:100] 140 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 141 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 142 | dev_data = dev_data.dropna(axis=0) 143 | # dev_data = dev_data[:100] 144 | 145 | #train_data = random_upsampling(train_data) 146 | train_dataset, train_total_graph = convert_only_sentence2tensordataset(train_data, tokenizer, config.max_length, 'train') 147 | dev_dataset, dev_total_graph = 
convert_only_sentence2tensordataset(dev_data, tokenizer, config.max_length, 'dev') 148 | 149 | 150 | train_sampler = RandomSampler(train_dataset) 151 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.batch_size) 152 | dev_sampler = SequentialSampler(dev_dataset) 153 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 154 | 155 | t_total = len(train_dataloader) // config.gradient_accumulation_steps * config.epoch 156 | optimizer = AdamW(model.parameters(), lr=config.learning_rate) 157 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total) 158 | 159 | global_step = 0 160 | max_test_accuracy = 0 161 | model.zero_grad() 162 | for epoch in range(config.epoch): 163 | model.train() 164 | train_accuracy, average_loss, global_step = do_train( 165 | config=config, model=model, 166 | optimizer=optimizer, scheduler=scheduler, 167 | train_dataloader=train_dataloader, epoch=epoch+1, global_step=global_step, total_graph=train_total_graph) 168 | print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4))) 169 | 170 | model.eval() 171 | test_accuracy, _ = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=dev_total_graph) 172 | print("test_accuracy : {}\n".format(round(test_accuracy, 4))) 173 | output_dir = os.path.join(config.save_dir, "checkpoint-{}".format(epoch)) 174 | if not os.path.exists(output_dir): 175 | os.makedirs(output_dir) 176 | model_to_save = model.module if hasattr(model, "module") else model 177 | model_to_save.save_pretrained(output_dir) 178 | tokenizer.save_pretrained(output_dir) 179 | torch.save(config, os.path.join(output_dir, "training_args.bin")) 180 | 181 | 182 | def evaluate(config, model, tokenizer): 183 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 184 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 185 | dev_data = dev_data.dropna(axis=0) 186 | # dev_data = dev_data[:10] 187 | # dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length) 188 | dev_dataset, total_graph = convert_only_sentence2tensordataset(dev_data, tokenizer, config.max_length, 'dev') 189 | 190 | dev_sampler = SequentialSampler(dev_dataset) 191 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 192 | 193 | test_accuracy, total_predicts = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=total_graph) 194 | print("test accuracy : {}".format(round(test_accuracy,4))) 195 | total_corrects = dev_data['claim_label'].tolist() 196 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects] 197 | totaL_claim_sentence = dev_data['claim_sentence'].tolist() 198 | error_list = [] 199 | for predict, correct, claim in zip(total_predicts, total_corrects, totaL_claim_sentence): 200 | if predict != correct: 201 | error = {} 202 | error['predict'] = predict 203 | error['correct'] = correct 204 | error['claim_sentence'] = claim 205 | error_list.append(error) 206 | 207 | with open('../mine/functions/dev_error.json', 'w', encoding='utf-8') as f: 208 | json.dump(error_list, f, indent=4) 209 | -------------------------------------------------------------------------------- /mine_next/functions/main_function2.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import os 3 
| from tqdm import tqdm 4 | import torch 5 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 6 | from transformers.optimization import AdamW, get_linear_schedule_with_warmup 7 | from transformers import AutoConfig, AutoTokenizer 8 | from sklearn.metrics import classification_report, accuracy_score 9 | from sklearn.utils import resample 10 | 11 | import csv 12 | import numpy as np 13 | import pandas as pd 14 | import json 15 | 16 | # from source.claim_classification.model.modeling import KoElectraForClaimClassification 17 | # from source.claim_classification.func.dataset import convert_data2tensordataset 18 | 19 | from mine_next.model.modeling import RobertaForClassification 20 | from mine_next.functions.dataset import ( 21 | convert_data2tensordataset, 22 | convert_stance_data2tensordataset, 23 | convert_only_sentence2tensordataset 24 | ) 25 | 26 | 27 | def random_downsampling(dataset): 28 | major = dataset[dataset['claim_label'] == 'O'] 29 | minor = dataset[dataset['claim_label'] == 'C'] 30 | sampling_data = resample(major, replace=True, n_samples=len(minor)*5, random_state=42) 31 | train_data = pd.concat([sampling_data, minor]) 32 | return train_data 33 | 34 | def random_upsampling(dataset): 35 | major = dataset[dataset['claim_label'] == 'O'] 36 | minor = dataset[dataset['claim_label'] == 'C'] 37 | sampling_data = resample(minor, replace=True, n_samples=len(major), random_state=42) 38 | train_data = pd.concat([sampling_data, major]) 39 | return train_data 40 | 41 | def do_train(config, model, optimizer, scheduler, train_dataloader, epoch, global_step, total_graph): 42 | losses = [] 43 | total_predicts, total_corrects = [], [] 44 | for step, batch in tqdm(enumerate(train_dataloader), desc='do_train(epoch_{})'.format(epoch), total=len(train_dataloader)): 45 | batch = tuple(t.cuda() for t in batch) 46 | # graph 같이 학습할 경우 47 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 48 | constituent_labels_first, constituent_labels_second = batch[6], batch[7] 49 | loss, predicts = model( 50 | idx=idx, 51 | input_ids=input_ids, 52 | attention_mask=attention_mask, 53 | token_type_ids=token_type_ids, 54 | labels=labels, 55 | sim_labels=sim_labels, 56 | all_graph=total_graph, 57 | constituent_labels_first=constituent_labels_first, 58 | constituent_labels_second=constituent_labels_second 59 | ) 60 | # base 61 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \ 62 | # batch[4], batch[5] 63 | # loss, predicts = model( 64 | # idx=idx, 65 | # input_ids=input_ids, 66 | # attention_mask=attention_mask, 67 | # token_type_ids=token_type_ids, 68 | # labels=labels, 69 | # sim_labels=sim_labels, 70 | # ) 71 | predicts = predicts.argmax(dim=-1) 72 | predicts = predicts.cpu().detach().numpy().tolist() 73 | labels = labels.cpu().detach().numpy().tolist() 74 | 75 | total_predicts.extend(predicts) 76 | total_corrects.extend(labels) 77 | 78 | if config.gradient_accumulation_steps > 1: 79 | loss = loss / config.gradient_accumulation_steps 80 | # 원래는 tensor(0.7255)이런식 81 | loss.backward() 82 | losses.append(loss.data.item()) 83 | if (step + 1) % config.gradient_accumulation_steps == 0 or \ 84 | (len(train_dataloader) <= config.gradient_accumulation_steps and (step + 1) == len( 85 | train_dataloader)): 86 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) 87 | optimizer.step() 88 | scheduler.step() 89 | 90 | model.zero_grad() 91 | 
global_step += 1 92 | target_names = ['class 0', 'class 1'] 93 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 94 | accuracy = accuracy_score(total_corrects, total_predicts) 95 | return accuracy, np.mean(losses), global_step 96 | 97 | def do_evaluate(model, dev_dataloader, total_graph): 98 | total_predicts, total_corrects = [], [] 99 | for step, batch in tqdm(enumerate(dev_dataloader), desc="do_evaluate", total=len(dev_dataloader)): 100 | batch = tuple(t.cuda() for t in batch) 101 | # graph 학습할 경우 102 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 103 | constituent_labels_first, constituent_labels_second = batch[6], batch[7] 104 | predicts = model( 105 | idx=idx, 106 | input_ids=input_ids, 107 | attention_mask=attention_mask, 108 | token_type_ids=token_type_ids, 109 | all_graph=total_graph, 110 | constituent_labels_first=constituent_labels_first, 111 | constituent_labels_second= constituent_labels_second 112 | ) 113 | # base 114 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \ 115 | # batch[4], batch[5] 116 | # predicts = model( 117 | # idx=idx, 118 | # input_ids=input_ids, 119 | # attention_mask=attention_mask, 120 | # token_type_ids=token_type_ids, 121 | # ) 122 | predicts = predicts.argmax(dim=-1) 123 | predicts = predicts.detach().cpu().tolist() 124 | labels = labels.detach().cpu().tolist() 125 | total_predicts.extend(predicts) 126 | total_corrects.extend(labels) 127 | target_names = ['class 0', 'class 1'] 128 | result = classification_report(total_corrects, total_predicts, target_names=target_names, digits=4, output_dict=True) 129 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 130 | accuracy = accuracy_score(total_corrects, total_predicts) 131 | return accuracy, total_predicts, result['class 1']['f1-score'] 132 | 133 | def train(config, model, tokenizer): 134 | 135 | # 데이터셋 로드 136 | train_data = pd.read_csv(config.claim_train, sep='\t', header=None, quoting=csv.QUOTE_NONE) 137 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 138 | train_data = train_data.dropna(axis=0) 139 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 140 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 141 | dev_data = dev_data.dropna(axis=0) 142 | 143 | pseudo_train = json.load(open(config.train_pseudo_topic, encoding='utf-8')) 144 | pseudo_dev = json.load(open(config.dev_pseudo_topic, encoding='utf-8')) 145 | #train_data = random_upsampling(train_data) 146 | 147 | train_dataset, train_total_graph_first, train_total_graph_second = convert_only_sentence2tensordataset( 148 | train_data, pseudo_train, tokenizer, config.max_length, 'train') 149 | dev_dataset, dev_total_graph_first, dev_total_graph_second = convert_only_sentence2tensordataset(dev_data, pseudo_dev, tokenizer, config.max_length, 'dev') 150 | 151 | train_sampler = RandomSampler(train_dataset) 152 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.batch_size) 153 | dev_sampler = SequentialSampler(dev_dataset) 154 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 155 | 156 | t_total = len(train_dataloader) // config.gradient_accumulation_steps * config.epoch 157 | optimizer = 
AdamW(model.parameters(), lr=config.learning_rate) 158 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total) 159 | 160 | global_step = 0 161 | max_test_accuracy = 0 162 | max_claim_f1 = 0 163 | model.zero_grad() 164 | for epoch in range(config.epoch): 165 | model.train() 166 | train_accuracy, average_loss, global_step = do_train( 167 | config=config, model=model, 168 | optimizer=optimizer, scheduler=scheduler, 169 | train_dataloader=train_dataloader, epoch=epoch, global_step=global_step, total_graph=[train_total_graph_first, train_total_graph_second]) 170 | print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4))) 171 | 172 | model.eval() 173 | test_accuracy, _, claim_f1 = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=[dev_total_graph_first, dev_total_graph_second]) 174 | print("test_accuracy : {}\n".format(round(test_accuracy, 4))) 175 | if max_claim_f1 < claim_f1: 176 | output_dir = os.path.join(config.save_dir, "checkpoint-{}".format(epoch)) 177 | if not os.path.exists(output_dir): 178 | os.makedirs(output_dir) 179 | model_to_save = model.module if hasattr(model, "module") else model 180 | model_to_save.save_pretrained(output_dir) 181 | tokenizer.save_pretrained(output_dir) 182 | torch.save(config, os.path.join(output_dir, "training_args.bin")) 183 | max_claim_f1 = claim_f1 184 | 185 | def evaluate(config, model, tokenizer): 186 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 187 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 188 | dev_data = dev_data.dropna(axis=0) 189 | # dev_data = dev_data[:10] 190 | # dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length) 191 | pseudo_dev = json.load(open(config.dev_pseudo_topic, encoding='utf-8')) 192 | dev_dataset, dev_total_graph_first, dev_total_graph_second = convert_only_sentence2tensordataset(dev_data, pseudo_dev, tokenizer, config.max_length, 'dev') 193 | 194 | dev_sampler = SequentialSampler(dev_dataset) 195 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 196 | 197 | test_accuracy, total_predicts, claim_f1 = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=[dev_total_graph_first, dev_total_graph_second]) 198 | print("test accuracy : {}".format(round(test_accuracy,4))) 199 | total_corrects = dev_data['claim_label'].tolist() 200 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects] 201 | assert len(total_corrects) == len(total_predicts) 202 | totaL_claim_sentence = dev_data['claim_sentence'].tolist() 203 | error_list = [] 204 | for predict, correct, claim in zip(total_predicts, total_corrects, totaL_claim_sentence): 205 | error = {} 206 | error['predict'] = predict 207 | error['correct'] = correct 208 | error['claim_sentence'] = claim 209 | error_list.append(error) 210 | 211 | with open('../mine_next/functions/dev_error.json', 'w', encoding='utf-8') as f: 212 | json.dump(error_list, f, indent=4) 213 | 214 | 215 | def test(config, model, tokenizer): 216 | test_data = pd.read_csv(config.claim_test, sep='\t', header=None, quoting=csv.QUOTE_NONE) 217 | test_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 218 | test_data = test_data.dropna(axis=0) 219 | pseudo_test = json.load(open(config.test_pseudo_topic, encoding='utf-8')) 220 | test_dataset, 
test_total_graph_first, test_total_graph_second = convert_only_sentence2tensordataset(test_data, pseudo_test, tokenizer, config.max_length, 'test') 221 | 222 | test_sampler = SequentialSampler(test_dataset) 223 | test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=config.batch_size) 224 | 225 | test_accuracy, total_predicts, claim_f1 = do_evaluate(model=model, dev_dataloader=test_dataloader, total_graph=[test_total_graph_first, test_total_graph_second]) 226 | print("test accuracy : {}".format(round(test_accuracy,4))) 227 | total_corrects = test_data['claim_label'].tolist() 228 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects] 229 | assert len(total_corrects) == len(total_predicts) 230 | totaL_claim_sentence = test_data['claim_sentence'].tolist() 231 | error_list = [] 232 | for predict, correct, claim in zip(total_predicts, total_corrects, totaL_claim_sentence): 233 | error = {} 234 | error['predict'] = predict 235 | error['correct'] = correct 236 | error['claim_sentence'] = claim 237 | error_list.append(error) 238 | 239 | with open('../mine_next/functions/test_error.json', 'w', encoding='utf-8') as f: 240 | json.dump(error_list, f, indent=4) 241 | -------------------------------------------------------------------------------- /mine_next/functions/make_graph.py: -------------------------------------------------------------------------------- 1 | import benepar, spacy 2 | from nltk.tree import Tree as nltk_tree 3 | from nltk.treeprettyprinter import TreePrettyPrinter 4 | from nltk.draw.tree import TreeView 5 | import os, csv 6 | import pandas as pd 7 | from tqdm import tqdm 8 | import dgl 9 | from dgl import save_graphs, load_graphs 10 | from dgl.data.utils import makedirs, save_info, load_info 11 | 12 | import torch 13 | from transformers import BertTokenizer 14 | 15 | tokenizer = BertTokenizer.from_pretrained('bert-base-cased') 16 | data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 17 | data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 18 | data = data.dropna(axis=0) 19 | data = data[data['claim_label'] == 'C'] 20 | claims = data['claim_sentence'].tolist() 21 | 22 | def get_cons_tag_vocab(data_path): 23 | tag2id = {} 24 | with open(data_path) as f: 25 | for line in f.readlines(): 26 | tag, idx = line.strip().split('\t') 27 | tag2id[tag] = int(idx) 28 | return tag2id 29 | 30 | 31 | 32 | class Tree(object): 33 | def __init__(self, type): 34 | # self.start, self.end 단어 기준으로 세는것. one 이 0번째, . 
이 27번쨰라 root 노드가 start=0, end=27을 가지고 있는거임 35 | self.parent = None 36 | self.num_children = 0 37 | self.children = list() 38 | self.type = type 39 | self.is_leaf = False 40 | self.start = -1 41 | self.end = -1 42 | self.idx = -1 43 | 44 | def add_child(self, child): 45 | child.parent = self 46 | self.num_children += 1 47 | self.children.append(child) 48 | 49 | def size(self): 50 | count = 1 51 | for i in range(self.num_children): 52 | count += self.children[i].size() 53 | 54 | return count 55 | 56 | def __str__(self): 57 | return self.type 58 | 59 | def __iter__(self): 60 | yield self 61 | for c in self.children: 62 | for x in c: 63 | yield x 64 | 65 | def span_starts_ends(node: Tree): 66 | if len(node.children) == 0: 67 | return 68 | for child in node.children: 69 | span_starts_ends(child) 70 | 71 | node.start = node.children[0].start 72 | node.end = node.children[-1].end 73 | 74 | def constituent_to_tree(constituent_string, word_offset, node_offset, num_orders=2): 75 | constituents = [] 76 | temp_str = "" 77 | words = [] 78 | subtokens = [] 79 | subtoken_map = [] 80 | # 괄호 ['(', 'S', '(', 'NP', '(', 'NP', '(', 'CD', 'one', ')', '(', 'ADJP', '(', 'RB', 'long', ')' ... ] 이런식으로 (,)나 단어, constituent 단위로 분리 81 | for i, char in enumerate(constituent_string): 82 | if char == "(" or char == ")" or char == " ": 83 | if len(temp_str) != 0: 84 | constituents.append(temp_str) 85 | temp_str = "" 86 | if char != " ": 87 | constituents.append(char) 88 | else: 89 | temp_str += char 90 | # NP, PP등 노드 단위로 stack 91 | stack = [] 92 | for cons in constituents: 93 | if cons != ")": 94 | stack.append(cons) 95 | else: 96 | tail = stack.pop() 97 | temp_constituents = [] 98 | while tail != "(": 99 | temp_constituents.append(tail) 100 | tail = stack.pop() 101 | 102 | parent = Tree(temp_constituents[-1]) 103 | for i in range(len(temp_constituents) - 2, -1, -1): 104 | if isinstance(temp_constituents[i], Tree): 105 | parent.add_child(temp_constituents[i]) 106 | else: 107 | # parent에 붙일때 parent의 leaf를 true로 바꿔주는 형식으로 해줄것 108 | child = Tree(temp_constituents[i]) 109 | parent.add_child(child) 110 | stack.append(parent) 111 | root = stack[-1] 112 | # 노드 방문하면서 잎인지 체크해야함 113 | map_count = 0 114 | for node in root: 115 | if len(node.children) == 0: 116 | node.is_leaf = True 117 | words.append(str(node)) 118 | node_token = tokenizer.tokenize(str(node)) 119 | subtokens.extend(node_token) 120 | subtoken_map.extend([map_count]*len(node_token)) 121 | map_count += 1 122 | 123 | word_offset_original = word_offset 124 | for node in root: 125 | if node.is_leaf: 126 | node.start = word_offset 127 | node.end = word_offset 128 | word_offset += 1 129 | span_starts_ends(root) 130 | 131 | node_sequence = [] 132 | # internal nodes 는 S, NP VP, PP 와같은 노드만. 
one, lone과같은 노드는 없음 133 | internal_nodes = [] 134 | for node in root: 135 | if not node.is_leaf: 136 | internal_nodes.append(node) 137 | node_sequence.append(node) 138 | node_offset_original = node_offset 139 | for node in root: 140 | if node.is_leaf: 141 | # or node.type in [":", "``", ".", ",", "XX", "X", "-LRB-", "-RRB-", "''", "HYPH"] 142 | continue 143 | node.idx = node_offset 144 | node_offset += 1 145 | constituent_sequence = [] # [(idx, start, end, type, parent idx)] 146 | num_internal_nodes = len(internal_nodes) 147 | # constituent_edge 148 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)] 149 | for i, node in enumerate(internal_nodes): 150 | # if node.type in [":", "``", ".", ",", "XX", "X", "-LRB-", "-RRB-", "''", "HYPH"]: 151 | # continue 152 | parent_idx = node.parent.idx if node.parent else -1 153 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx)) 154 | if parent_idx != -1: 155 | constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1 #바로 아래 코드랑 보면 양방향 엣지 포함하는거임 156 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1 157 | # 이부분은 한계층 건너 뛰어서 엣지 ㅇ이어 주는 식임. 원래 S랑 PP는 안이어져있는데 여기서 이어줌 158 | high_order_sequence = [constituent_sequence] 159 | for i in range(1, num_orders): 160 | new_constituent_sequence = [] 161 | for idx, start, end, type, parent_idx in high_order_sequence[-1]: 162 | if parent_idx == -1: 163 | continue 164 | parent_node = constituent_sequence[parent_idx - node_offset_original] 165 | if parent_node[-1] == -1: 166 | continue 167 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1])) 168 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1 169 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1 170 | high_order_sequence.append(new_constituent_sequence) 171 | return high_order_sequence, word_offset, node_offset, subtoken_map, subtokens 172 | 173 | 174 | def print_parse_string(claim_list): 175 | for claim in claim_list: 176 | input_string = claim.lower() 177 | doc = nlp(input_string) 178 | sent = list(doc.sents)[0] 179 | print(sent) 180 | parse_string = sent._.parse_string 181 | print(parse_string) 182 | 183 | 184 | def save(self): 185 | # save graphs and labels 186 | self.save_path = '.' 187 | self.mode = 'test' 188 | graph_path = os.path.join(self.save_path, self.mode + '_dgl_graph.bin') 189 | save_graphs(graph_path, self.graphs, {'labels': self.labels}) 190 | # save other information in python dict 191 | info_path = os.path.join(self.save_path, self.mode + '_info.pkl') 192 | save_info(info_path, {'num_classes': self.num_classes}) 193 | 194 | def load(self): 195 | # load processed data from directory `self.save_path` 196 | self.save_path = '.' 197 | self.mode = 'test' 198 | graph_path = os.path.join(self.save_path, self.mode + '_dgl_graph.bin') 199 | self.graphs, label_dict = load_graphs(graph_path) 200 | self.labels = label_dict['labels'] 201 | info_path = os.path.join(self.save_path, self.mode + '_info.pkl') 202 | self.num_classes = load_info(info_path)['num_classes'] 203 | 204 | def has_cache(self): 205 | # check whether there are processed data in `self.save_path` 206 | self.save_path = '.' 
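# (added note) save(), load(), and this has_cache() follow DGL's standard dataset caching pattern: save_graphs/load_graphs for the graph binaries and save_info/load_info for the metadata dict. They take `self` because they are written as methods of a dgl.data.DGLDataset subclass; here they sit at module level only as a scratch copy of that pattern. A minimal sketch of the intended wrapper (the class name is illustrative, not part of this repo):
# class IAMClaimGraphDataset(dgl.data.DGLDataset):
#     def process(self):
#         # build self.graphs, self.labels and self.num_classes from the parsed claim sentences here
#         ...
#     # save(), load(), and has_cache() would then be exactly the three functions above, as methods.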
207 | self.mode = 'test' 208 | graph_path = os.path.join(self.save_path, self.mode + '_dgl_graph.bin') 209 | info_path = os.path.join(self.save_path, self.mode + '_info.pkl') 210 | return os.path.exists(graph_path) and os.path.exists(info_path) 211 | 212 | 213 | #print_parse_string(claims) 214 | 215 | 216 | 217 | 218 | # doc = nlp(input_string) 219 | # 220 | # sent = list(doc.sents)[0] 221 | # print(sent) 222 | # parse_string = sent._.parse_string 223 | # print(parse_string) 224 | # 225 | # # 원랜 read_constituents 파트. tree.py 226 | # constituents = [] 227 | # word_offset, node_offset = 0, 0 228 | # constituent = [] 229 | # constituent_sequence, word_offset, node_offset, subtoken_map, subtokens = constituent_to_tree(parse_string, word_offset, node_offset) 230 | # subtoken_map = torch.tensor(subtoken_map, dtype=torch.int64) 231 | # print('constitutuent sequence : ', constituent_sequence) # constituent sequence 0번째가 원래 노드, 1번째가 grand parent와 grand child 관련 232 | # print('word offset , node offset', word_offset, node_offset) 233 | # constituent.append(constituent_sequence) 234 | # constituents.append(constituent) 235 | # 236 | # # 그래프 만들기 237 | # num_tokens = subtoken_map.size()[0] # 문장 토크나이즈 해서 나온 토큰 개수 238 | # num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) # cons 노드 개수 239 | # graph = dgl.graph([]) 240 | # graph.set_n_initializer(dgl.frame.zero_initializer) 241 | # print(graph) 242 | # 243 | # # 그래프에 토큰 관련 추가 244 | # graph.add_nodes(num_tokens) 245 | # graph.ndata['unit'] = torch.zeros(num_tokens) 246 | # graph.ndata['dtype'] = torch.zeros(num_tokens) 247 | # 248 | # # constituent tree 그래프 249 | # graph.add_nodes(num_cons) 250 | # graph.ndata['unit'][num_tokens:] = torch.ones(num_cons) 251 | # graph.ndata['dtype'][num_tokens:] = torch.ones(num_cons) 252 | # 253 | # 254 | # constituent_starts = [] 255 | # constituent_ends = [] 256 | # constituent_labels = [] 257 | # prev_root_node_id = None 258 | # forward_edge_type, backward_edge_type = 0, 2 259 | # constituent_start_idx = 0 260 | # node_id_offset = 0 261 | # num_tokens = len(subtoken_map) 262 | # token_range = torch.arange(0, num_tokens, dtype=torch.int64) 263 | # cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt') 264 | # 265 | # 266 | # for high_order_sent_cons in constituent: 267 | # for i, sent_cons in enumerate(high_order_sent_cons): 268 | # for idx, start, end, label, parent_idx in sent_cons: 269 | # idx_nodeid = idx - constituent_start_idx + node_id_offset # 원래는 constituent_start_idx = 0, node_id_offset = 406(token id까지였음. 1063중 406이 토큰이고 이후가 node 였었음.) 
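# (added note) In the commented walkthrough above, token nodes occupy graph ids [0, num_tokens) and constituent nodes are appended after them, so a constituent whose tree index is `idx` is mapped to graph node id `idx - constituent_start_idx + node_id_offset`. With the numbers mentioned in the comment (node_id_offset = 406 token nodes, constituent_start_idx = 0), constituent 0 becomes graph node 406, constituent 1 becomes 407, and so on. The per-sentence graphs built in sent_to_graph.py / sent2_to_graph.py contain no token nodes, which is why idx_nodeid is simply idx there.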
270 | # # parent 없는 노드 271 | # if parent_idx == -1: 272 | # if prev_root_node_id is not None: 273 | # graph.add_edges(prev_root_node_id, idx_nodeid, 274 | # data={'cc_link': torch.tensor([forward_edge_type + i]), 275 | # 'dtype': torch.tensor([forward_edge_type + i])}) 276 | # # dual GAT 277 | # graph.add_edges(idx_nodeid, prev_root_node_id, 278 | # data={'cc_link': torch.tensor([backward_edge_type + i]), 279 | # 'dtype': torch.tensor([backward_edge_type + i])}) 280 | # prev_root_node_id = idx_nodeid 281 | # # parent 없는 노드들 282 | # if parent_idx != -1: 283 | # parent_idx_nodeid = parent_idx - constituent_start_idx + node_id_offset 284 | # graph.add_edges(parent_idx_nodeid, idx_nodeid, 285 | # data={'cc_link': torch.tensor([forward_edge_type + i]), 286 | # 'dtype': torch.tensor([forward_edge_type + i])}) 287 | # graph.add_edges(idx_nodeid, parent_idx_nodeid, 288 | # data={'cc_link': torch.tensor([backward_edge_type + i]), 289 | # 'dtype': torch.tensor([backward_edge_type + i])}) 290 | # 291 | # if i == 0: 292 | # # self-loop edge 293 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([4]), 294 | # 'dtype': torch.tensor([4])}) 295 | # # constituent -> token 296 | # token_start = token_range[subtoken_map == start][0] 297 | # token_end = token_range[subtoken_map == end][-1] 298 | # graph.add_edges(idx_nodeid, token_start, data={'ct_link': torch.tensor([5]), 299 | # 'dtype': torch.tensor([5])}) 300 | # graph.add_edges(idx_nodeid, token_end, data={'ct_link': torch.tensor([5]), 301 | # 'dtype': torch.tensor([5])}) 302 | # constituent_starts.append(token_start) 303 | # constituent_ends.append(token_end) 304 | # constituent_labels.append(cons_tag2id[label]) 305 | # 306 | # print(graph) 307 | # # ndata 308 | # # unit 0이 token 노드, 1이 cons 노드 309 | # #print(graph.ndata) 310 | # print('graph ndata unit',graph.ndata['unit']) 311 | # print('graph ndata dtype', graph.ndata['dtype']) 312 | # 313 | # # edata 314 | # # cc link : 4(self loop edge), node-token : 5(constituent token edge) -> 이건 grand 이런거 아니고 그냥 일반적인 parent, child 트리 315 | # # forward edge type(cc link) : 0, backward edge type(cc link) : 2 -> 일반적인 parent child 트리 316 | # # forward edge type(cc link) : 1, backward edge type(cc link) : 3 -> grand parent child 트리 317 | # #print(graph.edata) 318 | # print('graph edata cc link', graph.edata['cc_link']) 319 | # print('graph edata ct link', graph.edata['ct_link']) 320 | # print('graph edata dtype', graph.edata['dtype']) 321 | # 322 | # dgl.save_graphs('graph.dgl', graph) 323 | # (g,), _ = dgl.load_graphs('graph.dgl') 324 | 325 | 326 | 327 | nlp = spacy.load('en_core_web_sm') 328 | nlp.add_pipe('benepar', config={'model':'benepar_en3'}) 329 | input_string = 'Effects in the classroom' 330 | input_string = input_string.lower() 331 | doc = nlp(input_string) 332 | 333 | sent = list(doc.sents)[0] 334 | # print(sent) 335 | # parse_string = sent._.parse_string 336 | # print(parse_string) 337 | # # 338 | # # for tok in doc: 339 | # # 340 | # # print() 341 | # t = nltk_tree.fromstring(sent._.parse_string) 342 | # TreeView(t)._cframe.print_to_file('output1.ps') 343 | # os.system('convert output1.ps output1.png') 344 | # 345 | # t = nltk_tree.fromstring(sent._.parse_string) 346 | # print(TreePrettyPrinter(t).text()) 347 | 348 | from nltk import Tree 349 | from nltk.draw.util import CanvasFrame 350 | from nltk.draw import TreeWidget 351 | 352 | cf = CanvasFrame() 353 | t = Tree.fromstring(sent._.parse_string) 354 | tc = TreeWidget(cf.canvas(),t) 355 | tc['node_font'] = 'arial 14 bold' 356 | 
tc['leaf_font'] = 'arial 14' 357 | tc['node_color'] = '#005990' 358 | tc['leaf_color'] = '#3F8F57' 359 | tc['line_color'] = '#175252' 360 | cf.add_widget(tc,10,10) # (10,10) offsets 361 | cf.print_to_file('tree1.ps') 362 | cf.destroy() 363 | os.system('convert tree1.ps tree1.png') 364 | -------------------------------------------------------------------------------- /mine_next/functions/pos_analy.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | from spacy import displacy 3 | from collections import Counter 4 | import pandas as pd 5 | import os 6 | import csv 7 | from pathlib import Path 8 | from nltk.tree import Tree 9 | from nltk.parse.corenlp import CoreNLPParser 10 | from nltk.parse.stanford import StanfordParser 11 | nlp = spacy.load("en_core_web_sm") 12 | data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 13 | data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 14 | data = data.dropna(axis=0) 15 | data = data[data['claim_label'] == 'C'] 16 | topics = data['topic_sentence'].tolist() 17 | claims = data['claim_sentence'].tolist() 18 | # with open('../../data/IAM/all_claim_sentence.txt', 'r', encoding='utf-8') as txt_file: 19 | # all_claims = txt_file.readlines() 20 | 21 | counter = Counter() 22 | claim_dep = [] 23 | claim_pos = [] 24 | 25 | # doc = nlp(claims[0]) 26 | # 27 | # 28 | # def token_format(token): 29 | # return "_".join([token.orth_, token.tag_, token.dep_]) 30 | # 31 | # def to_nltk_tree(node): 32 | # if node.n_lefts + node.n_rights > 0: 33 | # return Tree(token_format(node), 34 | # [to_nltk_tree(child) 35 | # for child in node.children] 36 | # ) 37 | # else: 38 | # return token_format(node) 39 | # tree = [to_nltk_tree(sent.root) for sent in doc.sents] 40 | # # The first item in the list is the full tree 41 | # tree[0].draw() 42 | 43 | # # os.environ['CLASSPATH'] = '../../stanford/*' 44 | parser = CoreNLPParser(url='http://localhost:9000') 45 | #parser = StanfordParser(model_path="../../stanford/edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz") 46 | def nltk_spacy_tree(sent): 47 | doc = nlp(sent) 48 | def token_format(token): 49 | return "_".join([token.orth_, token.tag_, token.dep_]) 50 | def to_nltk_tree(node): 51 | if node.n_lefts + node.n_rights > 0: 52 | return Tree(token_format(node), [to_nltk_tree(child) for child in node.children]) 53 | else: 54 | return token_format(node) 55 | tree = [to_nltk_tree(sent.root) for sent in doc.sents] 56 | print(tree[0]) 57 | nltk_spacy_tree(claims[0]) 58 | 59 | def nltk_stanford_tree(sent): 60 | parse = parser.raw_parse(sent) 61 | tree = list(parse) 62 | print(tree[0].draw()) 63 | 64 | #nltk_stanford_tree(claims[0]) 65 | 66 | # nlp = stanfordnlp.Pipeline(processors='tokenize,pos') 67 | # doc = nlp(claims[0]) 68 | # print(doc) 69 | 70 | ''' 71 | 디펜던스 파서 트리 그려주는 코드 72 | ''' 73 | # for idx, claim in enumerate(claims[:1]): 74 | # doc = nlp(claim) 75 | # sentence_spans = list(doc.sents) 76 | # #displacy.serve(doc, style='dep') 77 | # 78 | # svg = displacy.render(sentence_spans, style='dep') 79 | # output_path = Path('../../data/IAM/dep_claim_img/sentence_{}.svg'.format(idx)) 80 | # output_path.open('w', encoding='utf-8').write(svg) 81 | # for tok in doc: 82 | 83 | 84 | 85 | 86 | # sentence_dep = [] 87 | # sentence_pos = [] 88 | # lemma = [] 89 | # for tok in doc: 90 | # sentence_dep.append(tok.dep_) 91 | # sentence_pos.append(tok.pos_) 92 | # if tok.pos_ == 'VERB': 93 | # 
lemma.append(tok.lemma_) 94 | # claim_dep.append(sentence_dep) 95 | # claim_pos.append(sentence_pos) 96 | # counter.update(lemma) 97 | # print(counter) 98 | 99 | # with open('../../data/IAM/train_claim_pos.txt', 'w', encoding='utf-8') as pos_file: 100 | # for pos in claim_pos: 101 | # pos_file.write(' '.join(pos)) 102 | # pos_file.write('\n') 103 | # with open('../../data/IAM/train_claim_dep.txt', 'w', encoding='utf-8') as dep_file: 104 | # for dep in claim_dep: 105 | # dep_file.write(' '.join(dep)) 106 | # dep_file.write('\n') 107 | 108 | # for tok in doc: 109 | # print(tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_) 110 | # print() -------------------------------------------------------------------------------- /mine_next/functions/save_graph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/save_graph.py -------------------------------------------------------------------------------- /mine_next/functions/sent2_to_graph.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import dgl 3 | import dgl.frame 4 | import torch 5 | import os, csv 6 | import pandas as pd 7 | from tqdm import tqdm 8 | import benepar 9 | from transformers import AutoTokenizer 10 | import string 11 | 12 | 13 | class Tree(object): 14 | def __init__(self, type): 15 | self.parent = None 16 | self.num_children = 0 17 | self.children = list() 18 | self.type = type 19 | self.is_leaf = False 20 | self.start = -1 21 | self.end = -1 22 | self.idx = -1 23 | 24 | def add_child(self, child): 25 | child.parent = self 26 | self.num_children += 1 27 | self.children.append(child) 28 | 29 | def size(self): 30 | count = 1 31 | for i in range(self.num_children): 32 | count += self.children[i].size() 33 | return count 34 | 35 | def __str__(self): 36 | return self.type 37 | 38 | def __iter__(self): 39 | yield self 40 | for c in self.children: 41 | for x in c: 42 | yield x 43 | 44 | def get_cons_tag_vocab(data_path): 45 | tag2id = {} 46 | with open(data_path) as f: 47 | for line in f.readlines(): 48 | tag, idx = line.strip().split('\t') 49 | tag2id[tag] = int(idx) 50 | return tag2id 51 | 52 | 53 | def span_starts_ends(node: Tree): 54 | if len(node.children) == 0: 55 | return 56 | for child in node.children: 57 | span_starts_ends(child) 58 | 59 | node.start = node.children[0].start 60 | node.end = node.children[-1].end 61 | 62 | 63 | def constituent_to_tree(tokenizer, constituent_string, sentence, word_offset, node_offset, num_orders=2): 64 | constituents = [] 65 | temp_str = "" 66 | for i, char in enumerate(constituent_string): 67 | if char == "(" or char == ")" or char == " ": 68 | if len(temp_str) != 0: 69 | constituents.append(temp_str) 70 | temp_str = "" 71 | if char != " ": 72 | constituents.append(char) 73 | else: 74 | temp_str += char 75 | # NP, PP등 노드 단위로 stack 76 | stack = [] 77 | for cons in constituents: 78 | if cons != ")": 79 | stack.append(cons) 80 | else: 81 | tail = stack.pop() 82 | temp_constituents = [] 83 | while tail != "(": 84 | temp_constituents.append(tail) 85 | tail = stack.pop() 86 | 87 | parent = Tree(temp_constituents[-1]) 88 | for i in range(len(temp_constituents) - 2, -1, -1): 89 | if isinstance(temp_constituents[i], Tree): 90 | parent.add_child(temp_constituents[i]) 91 | else: 92 | child = Tree(temp_constituents[i]) 93 | parent.add_child(child) 94 | stack.append(parent) 95 | root = stack[-1] 96 | 
map_count = 0 97 | words = [] 98 | subtokens = [] 99 | subtoken_map = [] 100 | for node in root: 101 | if len(node.children) == 0: 102 | node.is_leaf = True 103 | words.append(str(node)) 104 | node_token = tokenizer.tokenize(str(node)) 105 | if len(node_token) == 0: 106 | continue 107 | subtokens.extend(node_token) 108 | subtoken_map.extend([map_count]*len(node_token)) 109 | map_count += 1 110 | 111 | for node in root: 112 | if node.is_leaf: 113 | node.start = word_offset 114 | node.end = word_offset 115 | word_offset += 1 116 | span_starts_ends(root) 117 | 118 | node_sequence = [] 119 | # internal nodes are constituent nodes such as S, NP, VP, PP only; leaf word nodes such as 'one' or 'long' are not included 120 | internal_nodes = [] 121 | for node in root: 122 | if not node.is_leaf: 123 | internal_nodes.append(node) 124 | node_sequence.append(node) 125 | 126 | node_offset_original = node_offset 127 | for node in root: 128 | if node.is_leaf: 129 | continue 130 | node.idx = node_offset 131 | node_offset += 1 132 | 133 | constituent_sequence = [] # [(node idx, node start, node end, node type, parent idx)] 134 | num_internal_nodes = len(internal_nodes) 135 | # constituent_edge 136 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)] 137 | for i, node in enumerate(internal_nodes): 138 | parent_idx = node.parent.idx if node.parent else -1 139 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx)) 140 | if parent_idx != -1: 141 | constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1 # together with the line right below, this stores the edge in both directions 142 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1 143 | # this part links edges that skip one level: e.g. S and PP are not directly connected in the original tree, but they are connected here 144 | high_order_sequence = [constituent_sequence] 145 | for i in range(1, num_orders): 146 | new_constituent_sequence = [] 147 | for idx, start, end, type, parent_idx in high_order_sequence[-1]: 148 | if parent_idx == -1: 149 | continue 150 | parent_node = constituent_sequence[parent_idx - node_offset_original] 151 | if parent_node[-1] == -1: 152 | continue 153 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1])) 154 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1 155 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1 156 | high_order_sequence.append(new_constituent_sequence) 157 | return high_order_sequence, word_offset, node_offset 158 | 159 | 160 | def final_graph(constituent_list, first_graph, second_graph): 161 | cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt') 162 | forward_edge_type, backward_edge_type = 0, 2 163 | # it might be better to split the parent-child and grandparent-child graphs here 164 | constituent_labels_first = [] 165 | constituent_labels_second = [] 166 | prev_root_node_id = None 167 | print('first graph', first_graph.edges()) 168 | print('second graph', second_graph.edges()) 169 | one_order_sent_cons = constituent_list[0][0] 170 | two_order_sent_cons = constituent_list[0][1] 171 | for idx, start, end, label, parent_idx in one_order_sent_cons: 172 | idx_nodeid = idx 173 | # node without a parent (root) 174 | if parent_idx == -1: 175 | if prev_root_node_id is not None: 176 | first_graph.add_edges(prev_root_node_id, idx_nodeid, 177 | data={'cc_link': torch.tensor([1]), 178 | 'dtype': torch.tensor([1])}) 179 | # dual GAT 180 | first_graph.add_edges(idx_nodeid, prev_root_node_id, 181 | data={'cc_link': torch.tensor([1]), 182 | 'dtype': torch.tensor([1])}) 183 | prev_root_node_id = idx_nodeid 184 | # nodes with a parent
185 | if parent_idx != -1: 186 | parent_idx_nodeid = parent_idx 187 | first_graph.add_edges(parent_idx_nodeid, idx_nodeid, 188 | data={'cc_link': torch.tensor([1]), 189 | 'dtype': torch.tensor([1])}) 190 | first_graph.add_edges(idx_nodeid, parent_idx_nodeid, 191 | data={'cc_link': torch.tensor([1]), 192 | 'dtype': torch.tensor([1])}) 193 | 194 | # self-loop edge 195 | first_graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]), 196 | 'dtype': torch.tensor([1])}) 197 | constituent_labels_first.append(cons_tag2id[label]) 198 | # print('first graph', first_graph.edges()) 199 | 200 | for idx, start, end, label, parent_idx in two_order_sent_cons: 201 | idx_nodeid = idx 202 | # parent 없는 노드 203 | if parent_idx == -1: 204 | if prev_root_node_id is not None: 205 | second_graph.add_edges(prev_root_node_id, idx_nodeid, 206 | data={'cc_link': torch.tensor([1]), 207 | 'dtype': torch.tensor([1])}) 208 | # dual GAT 209 | second_graph.add_edges(idx_nodeid, prev_root_node_id, 210 | data={'cc_link': torch.tensor([1]), 211 | 'dtype': torch.tensor([1])}) 212 | prev_root_node_id = idx_nodeid 213 | # parent 없는 노드들 214 | if parent_idx != -1: 215 | parent_idx_nodeid = parent_idx 216 | second_graph.add_edges(parent_idx_nodeid, idx_nodeid, 217 | data={'cc_link': torch.tensor([1]), 218 | 'dtype': torch.tensor([1])}) 219 | second_graph.add_edges(idx_nodeid, parent_idx_nodeid, 220 | data={'cc_link': torch.tensor([1]), 221 | 'dtype': torch.tensor([1])}) 222 | constituent_labels_second.append(cons_tag2id[label]) 223 | print('second graph', second_graph.edges()) 224 | # for high_order_sent_cons in constituent_list: 225 | # # i = 0: parent - child/ i = 1: grand parent - grand child 226 | # for i, sent_cons in enumerate(high_order_sent_cons): 227 | # for idx, start, end, label, parent_idx in sent_cons: 228 | # idx_nodeid = idx 229 | # # parent 없는 노드 230 | # if parent_idx == -1: 231 | # if prev_root_node_id is not None: 232 | # graph.add_edges(prev_root_node_id, idx_nodeid, 233 | # data={'cc_link': torch.tensor([1]), 234 | # 'dtype': torch.tensor([1])}) 235 | # # dual GAT 236 | # graph.add_edges(idx_nodeid, prev_root_node_id, 237 | # data={'cc_link': torch.tensor([1]), 238 | # 'dtype': torch.tensor([1])}) 239 | # prev_root_node_id = idx_nodeid 240 | # # parent 없는 노드들 241 | # if parent_idx != -1: 242 | # parent_idx_nodeid = parent_idx 243 | # graph.add_edges(parent_idx_nodeid, idx_nodeid, 244 | # data={'cc_link': torch.tensor([1]), 245 | # 'dtype': torch.tensor([1])}) 246 | # graph.add_edges(idx_nodeid, parent_idx_nodeid, 247 | # data={'cc_link': torch.tensor([1]), 248 | # 'dtype': torch.tensor([1])}) 249 | # 250 | # if i == 0: 251 | # # self-loop edge 252 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]), 253 | # 'dtype': torch.tensor([1])}) 254 | # constituent_labels.append(cons_tag2id[label]) 255 | # print(graph.edges(form='all')) 256 | 257 | constituent_labels_first = torch.tensor(constituent_labels_first, dtype=torch.long) 258 | constituent_labels_second = torch.tensor(constituent_labels_second, dtype=torch.long) 259 | return first_graph, second_graph, constituent_labels_first, constituent_labels_second 260 | 261 | 262 | def all_process_graph(nlp, tokenizer, sentence): 263 | sentence_doc = nlp(sentence) 264 | sentence_sent = list(sentence_doc.sents)[0] 265 | parse_string = sentence_sent._.parse_string 266 | word_offset, node_offset = 0, 0 267 | constituent = [] 268 | constituent_sequence, word_offset, node_offset = \ 269 | constituent_to_tree(tokenizer, parse_string, 
sentence, word_offset, node_offset) 270 | constituent.append(constituent_sequence) 271 | 272 | first_graph = dgl.graph([]) 273 | first_graph.set_n_initializer(dgl.frame.zero_initializer) 274 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) 275 | first_graph.add_nodes(num_cons) 276 | first_graph.ndata['unit'] = torch.ones(num_cons) 277 | first_graph.ndata['dtype'] = torch.ones(num_cons) 278 | second_graph = dgl.graph([]) 279 | second_graph.set_n_initializer(dgl.frame.zero_initializer) 280 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) 281 | second_graph.add_nodes(num_cons) 282 | second_graph.ndata['unit'] = torch.ones(num_cons) 283 | second_graph.ndata['dtype'] = torch.ones(num_cons) 284 | 285 | claim_first_graph, claim_second_graph, constituent_labels_frist, constituent_labels_second = \ 286 | final_graph(constituent, first_graph, second_graph) 287 | return claim_first_graph, claim_second_graph, constituent_labels_frist, constituent_labels_second 288 | 289 | 290 | if __name__ == "__main__": 291 | 292 | nlp = spacy.load('en_core_web_sm') 293 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'}) 294 | printable = set(string.printable) 295 | 296 | tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=False, use_fast=False) 297 | 298 | train_data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 299 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 300 | train_data = train_data.dropna(axis=0) 301 | dev_data = pd.read_csv('../../data/IAM/claims/dev.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 302 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 303 | dev_data = dev_data.dropna(axis=0) 304 | 305 | train_sentences = train_data['claim_sentence'].tolist()[:10] 306 | dev_sentences = dev_data['claim_sentence'].tolist()[:10] 307 | total_train = [] 308 | total_dev = [] 309 | for idx, train in tqdm(enumerate(train_sentences), total=len(train_sentences)): 310 | train = train.lower().replace('“', '"').replace('”', '"') 311 | train = "".join(filter(lambda x : x in printable, train)) 312 | 313 | train_first_graph, train_second_graph, train_constituent_labels_first, train_constituent_labels_second \ 314 | = all_process_graph(nlp, tokenizer, train) 315 | dgl.save_graphs('../../data/IAM/claims/graphs/train_first_graph_{}.dgl'.format(idx), train_first_graph) 316 | dgl.save_graphs('../../data/IAM/claims/graphs/train_second_graph_{}.dgl'.format(idx), train_second_graph) 317 | total_train.append([train_constituent_labels_first.tolist(), train_constituent_labels_second.tolist()]) 318 | 319 | for idx, dev in tqdm(enumerate(dev_sentences), total=len(dev_sentences)): 320 | dev = dev.lower().replace('“', '"').replace('”', '"') 321 | dev = "".join(filter(lambda x : x in printable, dev)) 322 | dev_first_graph, dev_second_graph, dev_constituent_label_first, dev_constituent_label_second \ 323 | = all_process_graph(nlp, tokenizer, dev) 324 | dgl.save_graphs('../../data/IAM/claims/graphs/dev_first_graph_{}.dgl'.format(idx), dev_first_graph) 325 | dgl.save_graphs('../../data/IAM/claims/graphs/dev_second_graph_{}.dgl'.format(idx), dev_second_graph) 326 | total_dev.append([dev_constituent_label_first.tolist(), dev_constituent_label_second.tolist()]) 327 | 328 | with open('../../data/IAM/claims/graphs/train_constituent_test.txt', 'w', encoding='utf-8') as f: 329 | for line in total_train: 330 | 
f.write(str(line)+'\n') 331 | 332 | with open('../../data/IAM/claims/graphs/dev_constituent_test.txt', 'w', encoding='utf-8') as f: 333 | for line in total_dev: 334 | f.write(str(line)+'\n') 335 | -------------------------------------------------------------------------------- /mine_next/functions/sent_to_graph.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import dgl 3 | import dgl.frame 4 | import torch 5 | import os, csv 6 | import pandas as pd 7 | from tqdm import tqdm 8 | import benepar 9 | from transformers import AutoTokenizer 10 | import string 11 | 12 | 13 | class Tree(object): 14 | def __init__(self, type): 15 | # self.start, self.end 단어 기준으로 세는것. one 이 0번째, . 이 27번쨰라 root 노드가 start=0, end=27을 가지고 있는거임 16 | self.parent = None 17 | self.num_children = 0 18 | self.children = list() 19 | self.type = type 20 | self.is_leaf = False 21 | self.start = -1 22 | self.end = -1 23 | self.idx = -1 24 | 25 | def add_child(self, child): 26 | child.parent = self 27 | self.num_children += 1 28 | self.children.append(child) 29 | 30 | def size(self): 31 | count = 1 32 | for i in range(self.num_children): 33 | count += self.children[i].size() 34 | return count 35 | 36 | def __str__(self): 37 | return self.type 38 | 39 | def __iter__(self): 40 | yield self 41 | for c in self.children: 42 | for x in c: 43 | yield x 44 | 45 | 46 | def get_cons_tag_vocab(data_path): 47 | tag2id = {} 48 | with open(data_path) as f: 49 | for line in f.readlines(): 50 | tag, idx = line.strip().split('\t') 51 | tag2id[tag] = int(idx) 52 | return tag2id 53 | 54 | 55 | def span_starts_ends(node: Tree): 56 | if len(node.children) == 0: 57 | return 58 | for child in node.children: 59 | span_starts_ends(child) 60 | 61 | node.start = node.children[0].start 62 | node.end = node.children[-1].end 63 | 64 | 65 | def constituent_to_tree(tokenizer, constituent_string, sentence, word_offset, node_offset, num_orders=2): 66 | constituents = [] 67 | temp_str = "" 68 | # 괄호 ['(', 'S', '(', 'NP', '(', 'NP', '(', 'CD', 'one', ')', '(', 'ADJP', '(', 'RB', 'long', ')' ... 
] 이런식으로 (,)나 단어, constituent 단위로 분리 69 | for i, char in enumerate(constituent_string): 70 | if char == "(" or char == ")" or char == " ": 71 | if len(temp_str) != 0: 72 | constituents.append(temp_str) 73 | temp_str = "" 74 | if char != " ": 75 | constituents.append(char) 76 | else: 77 | temp_str += char 78 | # NP, PP등 노드 단위로 stack 79 | stack = [] 80 | for cons in constituents: 81 | if cons != ")": 82 | stack.append(cons) 83 | else: 84 | tail = stack.pop() 85 | temp_constituents = [] 86 | while tail != "(": 87 | temp_constituents.append(tail) 88 | tail = stack.pop() 89 | 90 | parent = Tree(temp_constituents[-1]) 91 | for i in range(len(temp_constituents) - 2, -1, -1): 92 | if isinstance(temp_constituents[i], Tree): 93 | parent.add_child(temp_constituents[i]) 94 | else: 95 | child = Tree(temp_constituents[i]) 96 | parent.add_child(child) 97 | stack.append(parent) 98 | root = stack[-1] 99 | map_count = 0 100 | words = [] 101 | subtokens = [] 102 | subtoken_map = [] 103 | for node in root: 104 | if len(node.children) == 0: 105 | node.is_leaf = True 106 | words.append(str(node)) 107 | node_token = tokenizer.tokenize(str(node)) 108 | if len(node_token) == 0: 109 | continue 110 | subtokens.extend(node_token) 111 | subtoken_map.extend([map_count]*len(node_token)) 112 | map_count += 1 113 | 114 | for node in root: 115 | if node.is_leaf: 116 | node.start = word_offset 117 | node.end = word_offset 118 | word_offset += 1 119 | span_starts_ends(root) 120 | 121 | node_sequence = [] 122 | # internal nodes 는 S, NP VP, PP 와같은 노드만. one, lone과같은 노드는 없음 123 | internal_nodes = [] 124 | for node in root: 125 | if not node.is_leaf: 126 | internal_nodes.append(node) 127 | node_sequence.append(node) 128 | 129 | node_offset_original = node_offset 130 | for node in root: 131 | if node.is_leaf: 132 | continue 133 | node.idx = node_offset 134 | node_offset += 1 135 | 136 | constituent_sequence = [] # [(node idx, node start, node end, node type, parent idx)] 137 | num_internal_nodes = len(internal_nodes) 138 | # constituent_edge 139 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)] 140 | for i, node in enumerate(internal_nodes): 141 | parent_idx = node.parent.idx if node.parent else -1 142 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx)) 143 | if parent_idx != -1: 144 | constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1 #바로 아래 코드랑 보면 양방향 엣지 포함하는거임 145 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1 146 | # 이부분은 한계층 건너 뛰어서 엣지 이어 주는 식임. 
원래 S랑 PP는 안이어져있는데 여기서 이어줌 147 | high_order_sequence = [constituent_sequence] 148 | for i in range(1, num_orders): 149 | new_constituent_sequence = [] 150 | for idx, start, end, type, parent_idx in high_order_sequence[-1]: 151 | if parent_idx == -1: 152 | new_constituent_sequence.append((idx, start, end, type, parent_idx)) 153 | continue 154 | parent_node = constituent_sequence[parent_idx - node_offset_original] 155 | if parent_node[-1] == -1: 156 | continue 157 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1])) 158 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1 159 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1 160 | high_order_sequence.append(new_constituent_sequence) 161 | 162 | return high_order_sequence, word_offset, node_offset 163 | 164 | 165 | def final_graph(constituent_list, first_graph, second_graph): 166 | cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt') 167 | constituent_labels_first = [] 168 | constituent_labels_second = [] 169 | prev_root_node_id = None 170 | one_order_sent_cons = constituent_list[0][0] 171 | two_order_sent_cons = constituent_list[0][1] 172 | two_order_sent_cons_idx = [idx[0] for idx in two_order_sent_cons] 173 | 174 | for idx, start, end, label, parent_idx in one_order_sent_cons: 175 | idx_nodeid = idx 176 | if parent_idx == -1: 177 | if prev_root_node_id is not None: 178 | first_graph.add_edges(prev_root_node_id, idx_nodeid, 179 | data={'cc_link': torch.tensor([1]), 180 | 'dtype': torch.tensor([1])}) 181 | first_graph.add_edges(idx_nodeid, prev_root_node_id, 182 | data={'cc_link': torch.tensor([1]), 183 | 'dtype': torch.tensor([1])}) 184 | prev_root_node_id = idx_nodeid 185 | if parent_idx != -1: 186 | parent_idx_nodeid = parent_idx 187 | first_graph.add_edges(parent_idx_nodeid, idx_nodeid, 188 | data={'cc_link': torch.tensor([1]), 189 | 'dtype': torch.tensor([1])}) 190 | first_graph.add_edges(idx_nodeid, parent_idx_nodeid, 191 | data={'cc_link': torch.tensor([1]), 192 | 'dtype': torch.tensor([1])}) 193 | first_graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]), 194 | 'dtype': torch.tensor([1])}) 195 | constituent_labels_first.append(cons_tag2id[label]) 196 | 197 | prev_root_node_id = None 198 | for idx, start, end, label, parent_idx in two_order_sent_cons: 199 | idx_nodeid = idx 200 | if parent_idx == -1: 201 | if prev_root_node_id is not None: 202 | second_graph.add_edges(prev_root_node_id, idx_nodeid, 203 | data={'cc_link': torch.tensor([1]), 204 | 'dtype': torch.tensor([1])}) 205 | second_graph.add_edges(idx_nodeid, prev_root_node_id, 206 | data={'cc_link': torch.tensor([1]), 207 | 'dtype': torch.tensor([1])}) 208 | prev_root_node_id = idx_nodeid 209 | if parent_idx != -1: 210 | parent_idx_nodeid = parent_idx 211 | second_graph.add_edges(parent_idx_nodeid, idx_nodeid, 212 | data={'cc_link': torch.tensor([1]), 213 | 'dtype': torch.tensor([1])}) 214 | second_graph.add_edges(idx_nodeid, parent_idx_nodeid, 215 | data={'cc_link': torch.tensor([1]), 216 | 'dtype': torch.tensor([1])}) 217 | second_graph = dgl.add_self_loop(second_graph) 218 | # for high_order_sent_cons in constituent_list: 219 | # for i, sent_cons in enumerate(high_order_sent_cons): 220 | # for idx, start, end, label, parent_idx in sent_cons: 221 | # idx_nodeid = idx # 원래는 constituent_start_idx = 0, node_id_offset = 406(token id까지였음. 1063중 406이 토큰이고 이후가 node 였었음.) 
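# (added note) The commented-out block below is the earlier single-graph version, in which both the parent-child edges (i == 0) and the grandparent-grandchild edges (i == 1) were added to one shared graph. The active code above splits them instead: first_graph holds the parent-child edges and gets an explicit self-loop per node, while second_graph holds the grandparent-grandchild edges and gets its self-loops through dgl.add_self_loop(second_graph).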
222 | # # parent 없는 노드 223 | # if parent_idx == -1: 224 | # if prev_root_node_id is not None: 225 | # graph.add_edges(prev_root_node_id, idx_nodeid, 226 | # data={'cc_link': torch.tensor([1]), 227 | # 'dtype': torch.tensor([1])}) 228 | # # dual GAT 229 | # graph.add_edges(idx_nodeid, prev_root_node_id, 230 | # data={'cc_link': torch.tensor([1]), 231 | # 'dtype': torch.tensor([1])}) 232 | # prev_root_node_id = idx_nodeid 233 | # # parent 없는 노드들 234 | # if parent_idx != -1: 235 | # parent_idx_nodeid = parent_idx 236 | # graph.add_edges(parent_idx_nodeid, idx_nodeid, 237 | # data={'cc_link': torch.tensor([1]), 238 | # 'dtype': torch.tensor([1])}) 239 | # graph.add_edges(idx_nodeid, parent_idx_nodeid, 240 | # data={'cc_link': torch.tensor([1]), 241 | # 'dtype': torch.tensor([1])}) 242 | # 243 | # if i == 0: 244 | # # self-loop edge 245 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]), 246 | # 'dtype': torch.tensor([1])}) 247 | # constituent_labels.append(cons_tag2id[label]) 248 | # constituent_labels = torch.tensor(constituent_labels,dtype=torch.long) 249 | # return graph, constituent_labels 250 | constituent_labels_first = torch.tensor(constituent_labels_first, dtype=torch.long) 251 | constituent_labels_second = torch.tensor(constituent_labels_first, dtype=torch.long) # 라벨 개수 동일하게 252 | return first_graph, second_graph, constituent_labels_first, constituent_labels_second 253 | 254 | 255 | def all_process_graph(nlp, tokenizer, sentence): 256 | sentence_doc = nlp(sentence) 257 | sentence_sent = list(sentence_doc.sents)[0] 258 | parse_string = sentence_sent._.parse_string 259 | word_offset, node_offset = 0, 0 260 | constituent = [] 261 | constituent_sequence, word_offset, node_offset = \ 262 | constituent_to_tree(tokenizer, parse_string, sentence, word_offset, node_offset) 263 | constituent.append(constituent_sequence) 264 | 265 | first_graph = dgl.graph([]) 266 | first_graph.set_n_initializer(dgl.frame.zero_initializer) 267 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) 268 | first_graph.add_nodes(num_cons) 269 | first_graph.ndata['unit'] = torch.ones(num_cons) 270 | first_graph.ndata['dtype'] = torch.ones(num_cons) 271 | 272 | second_graph = dgl.graph([]) 273 | second_graph.set_n_initializer(dgl.frame.zero_initializer) 274 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) 275 | second_graph.add_nodes(num_cons) 276 | second_graph.ndata['unit'] = torch.ones(num_cons) 277 | second_graph.ndata['dtype'] = torch.ones(num_cons) 278 | 279 | claim_first_graph, claim_second_graph, constituent_labels_first, constituent_labels_second = \ 280 | final_graph(constituent, first_graph, second_graph) 281 | return claim_first_graph, claim_second_graph, constituent_labels_first, constituent_labels_second 282 | 283 | 284 | 285 | if __name__ == "__main__": 286 | nlp = spacy.load('en_core_web_sm') 287 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'}) 288 | tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=False, use_fast=False) 289 | 290 | # train_data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 291 | # train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 292 | # train_data = train_data.dropna(axis=0) 293 | # dev_data = pd.read_csv('../../data/IAM/claims/dev.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 294 | # dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 
295 | # dev_data = dev_data.dropna(axis=0) 296 | test_data = pd.read_csv('../../data/IAM/claims/test.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 297 | test_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 298 | test_data = test_data.dropna(axis=0) 299 | 300 | # train_sentences = train_data['claim_sentence'].tolist() 301 | # dev_sentences = dev_data['claim_sentence'].tolist() 302 | test_sentences = test_data['claim_sentence'].tolist() 303 | 304 | total_train = [] 305 | total_dev = [] 306 | total_test = [] 307 | printable = set(string.printable) 308 | 309 | for idx, test in tqdm(enumerate(test_sentences), total=len(test_sentences)): 310 | test = test.lower().replace('“', '"').replace('”', '"') 311 | test = "".join(filter(lambda x : x in printable, test)) 312 | test_first_graph, test_second_graph, test_constituent_labels_first, test_constituent_labels_second = \ 313 | all_process_graph(nlp, tokenizer, test) 314 | #dgl.save_graphs('../../data/IAM/claims/graphs/test_first_graph_{}.dgl'.format(idx), test_first_graph) 315 | #dgl.save_graphs('../../data/IAM/claims/graphs/test_second_graph_{}.dgl'.format(idx), test_second_graph) 316 | total_test.append([test_constituent_labels_first.tolist(), test_constituent_labels_second.tolist()]) 317 | 318 | # for idx, train in tqdm(enumerate(train_sentences), total=len(train_sentences)): 319 | # # train = train.lower().replace("\xa0", '').replace('...', '').replace('—', ' ').replace('“', '').replace('’', "'").strip() 320 | # train = train.lower().replace('“', '"').replace('”', '"') 321 | # train = "".join(filter(lambda x : x in printable, train)) 322 | # 323 | # train_first_graph, train_second_graph, train_constituent_labels_first, train_constituent_labels_second = \ 324 | # all_process_graph(nlp, tokenizer, train) 325 | # dgl.save_graphs('../../data/IAM/claims/graphs/train_first_graph_{}.dgl'.format(idx), train_first_graph) 326 | # dgl.save_graphs('../../data/IAM/claims/graphs/train_second_graph_{}.dgl'.format(idx), train_second_graph) 327 | # total_train.append([train_constituent_labels_first.tolist(), train_constituent_labels_second.tolist()]) 328 | # 329 | # for idx, dev in tqdm(enumerate(dev_sentences), total=len(dev_sentences)): 330 | # dev = dev.lower().replace('“', '"').replace('”', '"') 331 | # dev = "".join(filter(lambda x: x in printable, dev)) 332 | # dev_first_graph, dev_second_graph, dev_constituent_label_first, dev_constituent_label_second \ 333 | # = all_process_graph(nlp, tokenizer, dev) 334 | # dgl.save_graphs('../../data/IAM/claims/graphs/dev_first_graph_{}.dgl'.format(idx), dev_first_graph) 335 | # dgl.save_graphs('../../data/IAM/claims/graphs/dev_second_graph_{}.dgl'.format(idx), dev_second_graph) 336 | # total_dev.append([dev_constituent_label_first.tolist(), dev_constituent_label_second.tolist()]) 337 | 338 | # with open('../../data/IAM/claims/graphs/train_constituent_first_second.txt', 'w', encoding='utf-8') as f: 339 | # for line in total_train: 340 | # f.write(str(line)+'\n') 341 | # 342 | # with open('../../data/IAM/claims/graphs/dev_constituent_first_second.txt', 'w', encoding='utf-8') as f: 343 | # for line in total_dev: 344 | # f.write(str(line)+'\n') 345 | 346 | # with open('../../data/IAM/claims/graphs/test_constituent_first_second.txt', 'w', encoding='utf-8') as f: 347 | # for line in total_test: 348 | # f.write(str(line) + '\n') 349 | -------------------------------------------------------------------------------- /mine_next/functions/stance_main_func.py: 
-------------------------------------------------------------------------------- 1 | import os.path 2 | import os 3 | from tqdm import tqdm 4 | import torch 5 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 6 | from transformers.optimization import AdamW, get_linear_schedule_with_warmup 7 | from transformers import AutoConfig, AutoTokenizer 8 | from sklearn.metrics import classification_report, accuracy_score 9 | from sklearn.utils import resample 10 | 11 | import csv 12 | import numpy as np 13 | import pandas as pd 14 | import json 15 | 16 | # from source.claim_classification.model.modeling import KoElectraForClaimClassification 17 | # from source.claim_classification.func.dataset import convert_data2tensordataset 18 | 19 | from mine_next.model.modeling import RobertaForClassification 20 | from mine_next.functions.dataset import ( 21 | convert_data2tensordataset, 22 | convert_stance_data2tensordataset, 23 | convert_only_sentence2tensordataset, 24 | ) 25 | 26 | 27 | def random_downsampling(dataset): 28 | major = dataset[dataset['claim_label'] == 'O'] 29 | minor = dataset[dataset['claim_label'] == 'C'] 30 | sampling_data = resample(major, replace=True, n_samples=len(minor)*5, random_state=42) 31 | train_data = pd.concat([sampling_data, minor]) 32 | return train_data 33 | 34 | 35 | def random_upsampling(dataset): 36 | major = dataset[dataset['claim_label'] == 'O'] 37 | minor = dataset[dataset['claim_label'] == 'C'] 38 | sampling_data = resample(minor, replace=True, n_samples=len(major), random_state=42) 39 | train_data = pd.concat([sampling_data, major]) 40 | return train_data 41 | 42 | def do_train(config, model, optimizer, scheduler, train_dataloader, epoch, global_step): 43 | losses = [] 44 | total_predicts, total_corrects = [], [] 45 | for step, batch in tqdm(enumerate(train_dataloader), desc='do_train(epoch_{})'.format(epoch), total=len(train_dataloader)): 46 | batch = tuple(t.cuda() for t in batch) 47 | # graph 같이 학습할 경우 48 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 49 | # constituent_labels = batch[6] 50 | # loss, predicts = model( 51 | # idx=idx, 52 | # input_ids=input_ids, 53 | # attention_mask=attention_mask, 54 | # token_type_ids=token_type_ids, 55 | # labels=labels, 56 | # sim_labels=sim_labels, 57 | # all_graph=total_graph, 58 | # constituent_labels=constituent_labels 59 | # ) 60 | # base 61 | idx, input_ids, attention_mask, token_type_ids, stance_labels = batch[0], batch[1], batch[2], batch[3], \ 62 | batch[4] 63 | loss, predicts = model( 64 | idx=idx, 65 | input_ids=input_ids, 66 | attention_mask=attention_mask, 67 | token_type_ids=token_type_ids, 68 | labels=stance_labels, 69 | sim_labels=None, 70 | ) 71 | predicts = predicts.argmax(dim=-1) 72 | predicts = predicts.cpu().detach().numpy().tolist() 73 | labels = stance_labels.cpu().detach().numpy().tolist() 74 | 75 | total_predicts.extend(predicts) 76 | total_corrects.extend(labels) 77 | 78 | if config.gradient_accumulation_steps > 1: 79 | loss = loss / config.gradient_accumulation_steps 80 | # 원래는 tensor(0.7255)이런식 81 | loss.backward() 82 | losses.append(loss.data.item()) 83 | if (step + 1) % config.gradient_accumulation_steps == 0 or \ 84 | (len(train_dataloader) <= config.gradient_accumulation_steps and (step + 1) == len( 85 | train_dataloader)): 86 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) 87 | optimizer.step() 88 | scheduler.step() 89 | 90 | model.zero_grad() 91 | global_step 
+= 1 92 | target_names = ['class 0', 'class 1'] 93 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 94 | accuracy = accuracy_score(total_corrects, total_predicts) 95 | return accuracy, np.mean(losses), global_step 96 | 97 | 98 | def do_evaluate(model, dev_dataloader): 99 | total_predicts, total_corrects = [], [] 100 | for step, batch in tqdm(enumerate(dev_dataloader), desc="do_evaluate", total=len(dev_dataloader)): 101 | batch = tuple(t.cuda() for t in batch) 102 | # graph 학습할 경우 103 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 104 | # constituent_labels = batch[6] 105 | # predicts = model( 106 | # idx=idx, 107 | # input_ids=input_ids, 108 | # attention_mask=attention_mask, 109 | # token_type_ids=token_type_ids, 110 | # all_graph=total_graph, 111 | # constituent_labels=constituent_labels 112 | # ) 113 | # base 114 | idx, input_ids, attention_mask, token_type_ids, stance_labels = batch[0], batch[1], batch[2], batch[3], \ 115 | batch[4] 116 | predicts = model( 117 | idx=idx, 118 | input_ids=input_ids, 119 | attention_mask=attention_mask, 120 | token_type_ids=token_type_ids, 121 | ) 122 | predicts = predicts.argmax(dim=-1) 123 | predicts = predicts.detach().cpu().tolist() 124 | labels = stance_labels.detach().cpu().tolist() 125 | total_predicts.extend(predicts) 126 | total_corrects.extend(labels) 127 | target_names = ['class 0', 'class 1'] 128 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4)) 129 | accuracy = accuracy_score(total_corrects, total_predicts) 130 | return accuracy, total_predicts 131 | 132 | 133 | def train(config, model, tokenizer): 134 | 135 | # 데이터셋 로드 136 | train_data = pd.read_csv(config.stance_train, sep='\t', header=None, quoting=csv.QUOTE_NONE) 137 | #train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 138 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'id', 'stance_labels'] 139 | train_data = train_data.dropna(axis=0) 140 | # train_data = train_data[:100] 141 | dev_data = pd.read_csv(config.stance_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 142 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'id', 'stance_labels'] 143 | #dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 144 | dev_data = dev_data.dropna(axis=0) 145 | # dev_data = dev_data[:100] 146 | 147 | #train_data = random_upsampling(train_data) 148 | train_dataset = convert_stance_data2tensordataset(train_data, tokenizer, config.max_length, 'train') 149 | dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length, 'dev') 150 | 151 | 152 | train_sampler = RandomSampler(train_dataset) 153 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.batch_size) 154 | dev_sampler = SequentialSampler(dev_dataset) 155 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 156 | 157 | t_total = len(train_dataloader) // config.gradient_accumulation_steps * config.epoch 158 | optimizer = AdamW(model.parameters(), lr=config.learning_rate) 159 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total) 160 | 161 | global_step = 0 162 | max_test_accuracy = 0 163 | model.zero_grad() 164 | for epoch in range(config.epoch): 165 | 
model.train() 166 | train_accuracy, average_loss, global_step = do_train( 167 | config=config, model=model, 168 | optimizer=optimizer, scheduler=scheduler, 169 | train_dataloader=train_dataloader, epoch=epoch+1, global_step=global_step) 170 | print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4))) 171 | 172 | model.eval() 173 | test_accuracy, _ = do_evaluate(model=model, dev_dataloader=dev_dataloader) 174 | print("test_accuracy : {}\n".format(round(test_accuracy, 4))) 175 | output_dir = os.path.join(config.save_dir, "checkpoint-{}".format(epoch)) 176 | if not os.path.exists(output_dir): 177 | os.makedirs(output_dir) 178 | model_to_save = model.module if hasattr(model, "module") else model 179 | model_to_save.save_pretrained(output_dir) 180 | tokenizer.save_pretrained(output_dir) 181 | torch.save(config, os.path.join(output_dir, "training_args.bin")) 182 | 183 | 184 | def evaluate(config, model, tokenizer): 185 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE) 186 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 187 | dev_data = dev_data.dropna(axis=0) 188 | # dev_data = dev_data[:10] 189 | # dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length) 190 | dev_dataset, total_graph = convert_only_sentence2tensordataset(dev_data, tokenizer, config.max_length, 'dev') 191 | 192 | dev_sampler = SequentialSampler(dev_dataset) 193 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size) 194 | 195 | test_accuracy, total_predicts = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=total_graph) 196 | print("test accuracy : {}".format(round(test_accuracy,4))) 197 | total_corrects = dev_data['claim_label'].tolist() 198 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects] 199 | totaL_claim_sentence = dev_data['claim_sentence'].tolist() 200 | error_list = [] 201 | for predict, correct, claim in zip(total_predicts, total_corrects, totaL_claim_sentence): 202 | if predict != correct: 203 | error = {} 204 | error['predict'] = predict 205 | error['correct'] = correct 206 | error['claim_sentence'] = claim 207 | error_list.append(error) 208 | 209 | with open('../mine/functions/dev_error.json', 'w', encoding='utf-8') as f: 210 | json.dump(error_list, f, indent=4) 211 | -------------------------------------------------------------------------------- /mine_next/functions/test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import csv 3 | from transformers import RobertaTokenizer 4 | import string 5 | 6 | s = "“Let’s say there’s a government-run test." 
7 | printable = set(string.printable) 8 | print(printable) 9 | print("".join(filter(lambda x: x in printable, s))) 10 | 11 | 12 | all_claim = [] 13 | 14 | def extract_claim(data_file): 15 | data = pd.read_csv(data_file, sep='\t', header=None, quoting=csv.QUOTE_NONE) 16 | data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 17 | data = data.dropna(axis=0) 18 | data = data[data['claim_label'] == 'C'] 19 | claim_data = data['claim_sentence'] 20 | return claim_data.tolist() 21 | # tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 22 | # input_str = 'a significant number of republicans assert that hereditary monarchy is unfair and elitist' 23 | #print(tokenizer.tokenize(input_str)) 24 | 25 | # train_claim = extract_claim('../../data/IAM/claims/train.txt') 26 | # print(train_claim) 27 | # dev_claim = extract_claim('../../data/IAM/claims/dev.txt') 28 | # test_claim = extract_claim('../../data/IAM/claims/test.txt') 29 | # 30 | # all_claim.extend(train_claim) 31 | # all_claim.extend(dev_claim) 32 | # all_claim.extend(test_claim) 33 | # 34 | # with open('../../data/IAM/all_claim_sentence.txt', 'w', encoding='utf-8') as txt_file: 35 | # for claim in all_claim: 36 | # txt_file.write(claim) 37 | # txt_file.write('\n') 38 | 39 | -------------------------------------------------------------------------------- /mine_next/functions/textrank.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import pytextrank 3 | import os 4 | import pandas as pd 5 | import csv 6 | import json, string 7 | from tqdm import tqdm 8 | 9 | 10 | def make_article_dict(): 11 | topic_dir_list = os.listdir('../../data/IAM/origin/test') 12 | topic_dir_list = [os.path.join('../../data/IAM/origin/test', topic) for topic in topic_dir_list] 13 | article_dict = {} 14 | for topic_dir in topic_dir_list: 15 | file_list = os.listdir(topic_dir) # [21_3.txt, 21_7.txt, ... 
] 16 | file_list_open = [os.path.join(topic_dir, file) for file in file_list] # ../../data/IAM/origin/train/Should_commercial_advertisements_be_allowed_to_be_fictitious/21_3.txt 17 | 18 | for idx, file in zip(file_list, file_list_open): 19 | article_id = idx.split('.')[0] 20 | 21 | num = file.split('/')[-1] 22 | assert idx == num 23 | sentences = [] 24 | with open(file, 'r', encoding='utf-8') as f: 25 | article = f.readlines() 26 | for line in article: 27 | article_sentence = line.split('\t')[0] 28 | sentences.append(article_sentence) 29 | article_dict[article_id] = sentences 30 | 31 | with open('../../data/IAM/origin/test_article_dict.json', 'w', encoding='utf-8') as outfile: 32 | json.dump(article_dict, outfile, indent='\t', ensure_ascii=False) 33 | 34 | def make_pseudo_topic_with_textrank(): 35 | printable = set(string.printable) 36 | nlp = spacy.load("en_core_web_sm") 37 | nlp.add_pipe("textrank") 38 | 39 | datas = json.load(open('../../data/IAM/origin/dev_article_dict.json', encoding='utf-8')) 40 | pseudo_topic = {} 41 | for key, value in tqdm(datas.items(), total=len(datas)): 42 | article_text = " ".join(value) 43 | article_text = article_text.lower().replace('“', '"').replace('”', '"') 44 | article_text = "".join(filter(lambda x : x in printable, article_text)) 45 | doc = nlp(article_text) 46 | topic = [] 47 | for phrase in doc._.phrases[:10]: 48 | topic.append(phrase.text) 49 | # pseudo_topic[key] = " ".join(topic) 50 | pseudo_topic[key] = topic 51 | with open('../../data/IAM/origin/dev_pseudo_topic_with_textrank_list.json', 'w', encoding='utf-8') as file: 52 | json.dump(pseudo_topic, file, indent='\t', ensure_ascii=False) 53 | 54 | #make_article_dict() 55 | # make_pseudo_topic_with_textrank() 56 | # data = json.load(open('../../data/IAM/origin/dev_pseudo_topic_with_textrank_list.json', encoding='utf-8')) 57 | # print(data) 58 | 59 | 60 | # doc = nlp(article.lower()) 61 | # 62 | # # examine the top-ranked phrases in the document 63 | # pseudo_topic = [] 64 | # for phrase in doc._.phrases[:10]: 65 | # #print(phrase) 66 | # print(phrase.text) 67 | # # print(phrase.rank, phrase.count) 68 | # # print(phrase.chunks) 69 | # print() 70 | # 71 | 72 | 73 | total_char_count = 0 74 | total_word_count = 0 75 | topic_dir_list = os.listdir('../../data/IAM/origin/test') 76 | topic_dir_list = [os.path.join('../../data/IAM/origin/test', topic) for topic in topic_dir_list] 77 | article_dict = {} 78 | for topic_dir in topic_dir_list: 79 | file_list = os.listdir(topic_dir) # [21_3.txt, 21_7.txt, ... 
] 80 | file_list_open = [os.path.join(topic_dir, file) for file in file_list] # ../../data/IAM/origin/train/Should_commercial_advertisements_be_allowed_to_be_fictitious/21_3.txt 81 | 82 | for idx, file in zip(file_list, file_list_open): 83 | article_id = idx.split('.')[0] 84 | 85 | num = file.split('/')[-1] 86 | assert idx == num 87 | sentences = [] 88 | with open(file, 'r', encoding='utf-8') as f: 89 | article = f.readlines() 90 | for line in article: 91 | article_sentence = line.split('\t')[0] 92 | word_of_sentence = article_sentence.split(' ') 93 | total_char_count += len(article_sentence) 94 | total_word_count += len(word_of_sentence) 95 | print(total_char_count) 96 | print(total_word_count) 97 | -------------------------------------------------------------------------------- /mine_next/functions/txt2json.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | 4 | import pandas as pd 5 | 6 | dataset = pd.read_csv('../../data/IAM/stance/test.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE) 7 | dataset.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label'] 8 | dataset = dataset.dropna(axis=0) 9 | 10 | # claim_sentences = dataset['claim_sentence'].tolist() 11 | # claim_labels = ['non claim' if label is 'O' else 'claim' for label in dataset['claim_label'].tolist()] 12 | stance_sentences = dataset['claim_sentence'] 13 | stance_label = dataset['stance_label'] 14 | 15 | label_dict = {} 16 | label_dict['-1'] = 'contest' 17 | label_dict['1'] = 'support' 18 | label_dict['0'] = 'non-claim' 19 | data_json = [] 20 | for sentence, label in zip(stance_sentences, stance_label): 21 | content = {} 22 | content['text'] = sentence 23 | content['label'] = label_dict[str(label)] 24 | data_json.append(content) 25 | 26 | with open('../../data/IAM/stance/IAM_stance_test.json', 'w', encoding='utf-8') as outfile: 27 | json.dump(data_json, outfile, indent='\t', ensure_ascii=False) -------------------------------------------------------------------------------- /mine_next/functions/use_bertopic.py: -------------------------------------------------------------------------------- 1 | from bertopic import BERTopic 2 | from sklearn.datasets import fetch_20newsgroups 3 | from hdbscan import HDBSCAN 4 | from transformers import BertModel 5 | from sentence_transformers import SentenceTransformer 6 | import pandas as pd 7 | import os, json 8 | from os import listdir 9 | from os.path import isfile, join 10 | from sklearn.manifold import TSNE 11 | from tqdm import tqdm 12 | 13 | 14 | def topic_sentences(mode): 15 | sentences = [] 16 | article_ids = [] 17 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode)) 18 | topic_dir_list = sorted([os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in topic_dir_list]) 19 | 20 | for topic_dir in topic_dir_list: 21 | file_list = os.listdir(topic_dir) 22 | file_list_open = sorted([os.path.join(topic_dir, file) for file in file_list]) 23 | 24 | for idx, file in zip(file_list, file_list_open): 25 | article_id = idx.split('.')[0] 26 | sentence = [] 27 | with open(file, 'r', encoding='utf-8') as f: 28 | article = f.readlines() 29 | for line in article: 30 | article_sentence = line.split('\t')[0] 31 | #sentences.append(article_sentence) 32 | sentence.append(article_sentence) 33 | sentences.append(' '.join(sent for sent in sentence)) 34 | article_ids.append(article_id) 35 | return article_ids, sentences 36 | 37 | train_ids, train_sentences = 
topic_sentences('train') 38 | dev_ids, dev_sentences = topic_sentences('dev') 39 | test_ids, test_sentences = topic_sentences('test') 40 | 41 | def topic_modeling(): 42 | ''' 43 | 1. extract embeddings 44 | 2. reduce dimensionality 45 | 3. cluster reduced embeddings 46 | 4. tokenize topics 47 | 5. create topic representatioin 48 | ''' 49 | embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 50 | hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 51 | topic_model = BERTopic( 52 | embedding_model=embedding_model, 53 | hdbscan_model=hdbscan_model, 54 | # diversity=0.2 55 | ) 56 | topic_model.save('../../data/IAM/origin/topic_modeling_div0.0') 57 | 58 | topic_modeling() 59 | topic_model = BERTopic.load('../../data/IAM/origin/topic_modeling_div0.0') 60 | topics, probs = topic_model.fit_transform(train_sentences) 61 | print(topic_model.get_topic_info()) 62 | # topic_model.visualize_topics().write_html("../../data/IAM/origin/intertopic_dist_map_div0.2.html") 63 | # topic_model.visualize_documents(train_sentences).write_html("../../data/IAM/origin/projections_div0.2.html") 64 | 65 | 66 | def make_pseudo_topic_with_bertopic(ids, sentences, topic_model, mode): 67 | pseudo_topic_dict = {} 68 | for idx, sentence in tqdm(zip(ids, sentences), total=len(ids), desc='{} processing ...'.format(mode)): 69 | # 여기서 sentence 는 기사 하나라고 생각하면 된다 70 | pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0]) 71 | pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic]) 72 | pseudo_topic_dict[idx] = pseudo_topic 73 | with open('../../data/IAM/origin/{}_pseudo_topic_with_bertopic_div0.0.json'.format(mode), 'w', encoding='utf-8') as file: 74 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 75 | 76 | make_pseudo_topic_with_bertopic(train_ids, train_sentences, topic_model, 'train') 77 | make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, topic_model, 'dev') 78 | make_pseudo_topic_with_bertopic(test_ids, test_sentences, topic_model, 'test') 79 | 80 | 81 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file: 82 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 83 | 84 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 85 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2") 86 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 87 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model) 88 | #topic_model.save('../../data/IAM/origin/topic_model') 89 | -------------------------------------------------------------------------------- /mine_next/functions/use_bertopic2.py: -------------------------------------------------------------------------------- 1 | from bertopic import BERTopic 2 | from sklearn.datasets import fetch_20newsgroups 3 | from hdbscan import HDBSCAN 4 | from transformers import BertModel 5 | from sentence_transformers import SentenceTransformer 6 | import pandas as pd 7 | import os, json 8 | from os import listdir 9 | from os.path import isfile, join 10 | from sklearn.manifold import TSNE 11 | from tqdm import tqdm 12 | 13 | 14 | def topic_sentences(mode): 15 | sentences = [] 16 | article_ids = [] 17 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode)) 18 | topic_dir_list = sorted([os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in 
topic_dir_list]) 19 | 20 | for topic_dir in topic_dir_list: 21 | file_list = os.listdir(topic_dir) 22 | file_list_open = sorted([os.path.join(topic_dir, file) for file in file_list]) 23 | 24 | for idx, file in zip(file_list, file_list_open): 25 | article_id = idx.split('.')[0] 26 | sentence = [] 27 | with open(file, 'r', encoding='utf-8') as f: 28 | article = f.readlines() 29 | for line in article: 30 | article_sentence = line.split('\t')[0] 31 | #sentences.append(article_sentence) 32 | sentence.append(article_sentence) 33 | sentences.append(' '.join(sent for sent in sentence)) 34 | article_ids.append(article_id) 35 | return article_ids, sentences 36 | 37 | train_ids, train_sentences = topic_sentences('train') 38 | dev_ids, dev_sentences = topic_sentences('dev') 39 | test_ids, test_sentences = topic_sentences('test') 40 | 41 | def topic_modeling(): 42 | ''' 43 | 1. extract embeddings 44 | 2. reduce dimensionality 45 | 3. cluster reduced embeddings 46 | 4. tokenize topics 47 | 5. create topic representatioin 48 | ''' 49 | embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 50 | hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 51 | topic_model = BERTopic( 52 | embedding_model=embedding_model, 53 | hdbscan_model=hdbscan_model, 54 | diversity=0.2 55 | ) 56 | topic_model.save('../../data/IAM/origin/topic_modeling_div0.2') 57 | 58 | topic_modeling() 59 | topic_model = BERTopic.load('../../data/IAM/origin/topic_modeling_div0.2') 60 | topics, probs = topic_model.fit_transform(train_sentences) 61 | print(topic_model.get_topic_info()) 62 | # topic_model.visualize_topics().write_html("../../data/IAM/origin/intertopic_dist_map_div0.2.html") 63 | # topic_model.visualize_documents(train_sentences).write_html("../../data/IAM/origin/projections_div0.2.html") 64 | 65 | 66 | def make_pseudo_topic_with_bertopic(ids, sentences, topic_model, mode): 67 | pseudo_topic_dict = {} 68 | for idx, sentence in tqdm(zip(ids, sentences), total=len(ids), desc='{} processing ...'.format(mode)): 69 | # 여기서 sentence 는 기사 하나라고 생각하면 된다 70 | pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0]) 71 | pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic]) 72 | pseudo_topic_dict[idx] = pseudo_topic 73 | with open('../../data/IAM/origin/{}_pseudo_topic_with_bertopic_div0.2.json'.format(mode), 'w', encoding='utf-8') as file: 74 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 75 | 76 | make_pseudo_topic_with_bertopic(train_ids, train_sentences, topic_model, 'train') 77 | make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, topic_model, 'dev') 78 | make_pseudo_topic_with_bertopic(test_ids, test_sentences, topic_model, 'test') 79 | 80 | 81 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file: 82 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 83 | 84 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 85 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2") 86 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 87 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model) 88 | #topic_model.save('../../data/IAM/origin/topic_model') 89 | -------------------------------------------------------------------------------- /mine_next/functions/use_bertopic3.py: 
-------------------------------------------------------------------------------- 1 | from bertopic import BERTopic 2 | from sklearn.datasets import fetch_20newsgroups 3 | from hdbscan import HDBSCAN 4 | from transformers import BertModel 5 | from sentence_transformers import SentenceTransformer 6 | import pandas as pd 7 | import os, json 8 | from os import listdir 9 | from os.path import isfile, join 10 | from sklearn.manifold import TSNE 11 | from tqdm import tqdm 12 | 13 | 14 | def topic_sentences(mode): 15 | sentences = [] 16 | article_ids = [] 17 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode)) 18 | topic_dir_list = sorted([os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in topic_dir_list]) 19 | 20 | for topic_dir in topic_dir_list: 21 | file_list = os.listdir(topic_dir) 22 | file_list_open = sorted([os.path.join(topic_dir, file) for file in file_list]) 23 | 24 | for idx, file in zip(file_list, file_list_open): 25 | article_id = idx.split('.')[0] 26 | sentence = [] 27 | with open(file, 'r', encoding='utf-8') as f: 28 | article = f.readlines() 29 | for line in article: 30 | article_sentence = line.split('\t')[0] 31 | #sentences.append(article_sentence) 32 | sentence.append(article_sentence) 33 | sentences.append(' '.join(sent for sent in sentence)) 34 | article_ids.append(article_id) 35 | return article_ids, sentences 36 | 37 | train_ids, train_sentences = topic_sentences('train') 38 | dev_ids, dev_sentences = topic_sentences('dev') 39 | test_ids, test_sentences = topic_sentences('test') 40 | temp_ids = [ids.split('_')[0] for ids in train_ids] 41 | 42 | def topic_modeling(): 43 | ''' 44 | 1. extract embeddings 45 | 2. reduce dimensionality 46 | 3. cluster reduced embeddings 47 | 4. tokenize topics 48 | 5. 
create topic representatioin 49 | ''' 50 | embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 51 | hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 52 | topic_model = BERTopic( 53 | embedding_model=embedding_model, 54 | hdbscan_model=hdbscan_model, 55 | diversity=0.1 56 | ) 57 | topic_model.save('../../data/IAM/origin/topic_modeling_div0.1') 58 | 59 | #topic_modeling() 60 | topic_model = BERTopic.load('../../data/IAM/origin/topic_modeling_div0.1') 61 | topics, probs = topic_model.fit_transform(train_sentences) 62 | print(topic_model.get_topic_info()) 63 | #print(topic_model.topic_embeddings_) 64 | # 88개의 topic_embeddings_ 65 | #topic_model.visualize_topics(width=1500, height=1500).write_html("../../data/IAM/origin/intertopic_dist_map_div0.1.html") 66 | #topic_model.visualize_documents(train_sentences, width=2000, height=2000).write_html("../../data/IAM/origin/projections_div0.1.html") 67 | topic_model.visualize_topics_per_class(topic_model.topics_per_class(train_sentences, temp_ids), width=1500, height=1500, top_n_topics=20).write_html('../../data/IAM/origin/topic_per_class_div.0.1.html') 68 | # for i in range(10): 69 | # print(topic_model.transform(train_sentences[i])) 70 | 71 | def make_pseudo_topic_with_bertopic(ids, sentences, topic_model, mode): 72 | pseudo_topic_dict = {} 73 | for idx, sentence in tqdm(zip(ids, sentences), total=len(ids), desc='{} processing ...'.format(mode)): 74 | # 여기서 sentence 는 기사 하나라고 생각하면 된다 75 | pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0]) 76 | pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic]) 77 | pseudo_topic_dict[idx] = pseudo_topic 78 | with open('../../data/IAM/origin/{}_pseudo_topic_with_bertopic_div0.1.json'.format(mode), 'w', encoding='utf-8') as file: 79 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 80 | 81 | # make_pseudo_topic_with_bertopic(train_ids, train_sentences, topic_model, 'train') 82 | # make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, topic_model, 'dev') 83 | # make_pseudo_topic_with_bertopic(test_ids, test_sentences, topic_model, 'test') 84 | 85 | 86 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file: 87 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 88 | 89 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 90 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2") 91 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 92 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model) 93 | #topic_model.save('../../data/IAM/origin/topic_model') 94 | -------------------------------------------------------------------------------- /mine_next/functions/use_firstsent.py: -------------------------------------------------------------------------------- 1 | from bertopic import BERTopic 2 | from sklearn.datasets import fetch_20newsgroups 3 | from hdbscan import HDBSCAN 4 | from transformers import BertModel 5 | from sentence_transformers import SentenceTransformer 6 | import pandas as pd 7 | import os, json 8 | from os import listdir 9 | from os.path import isfile, join 10 | 11 | def topic_sentences(mode): 12 | sentences = [] 13 | article_ids = [] 14 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode)) 15 | topic_dir_list = 
[os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in topic_dir_list] 16 | 17 | for topic_dir in topic_dir_list: 18 | file_list = os.listdir(topic_dir) 19 | file_list_open = [os.path.join(topic_dir, file) for file in file_list] 20 | 21 | for idx, file in zip(file_list, file_list_open): 22 | article_id = idx.split('.')[0] 23 | sentence = [] 24 | with open(file, 'r', encoding='utf-8') as f: 25 | article = f.readlines() 26 | for line in article: 27 | article_sentence = line.split('\t')[0] 28 | #sentences.append(article_sentence) 29 | sentence.append(article_sentence) 30 | sentences.append(sentence[0]) 31 | #sentences.append(' '.join(sent for sent in sentence)) 32 | article_ids.append(article_id) 33 | return article_ids, sentences 34 | 35 | train_ids, train_sentences = topic_sentences('train') 36 | dev_ids, dev_sentences = topic_sentences('dev') 37 | fit_data = train_sentences + dev_sentences 38 | test_ids, test_sentences = topic_sentences('test') 39 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2") 40 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2") 41 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True) 42 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model) 43 | #topic_model.save('../../data/IAM/origin/topic_model') 44 | # 45 | 46 | 47 | #topic_model = BERTopic.load('../../data/IAM/origin/topic_model') 48 | #topics, probs = topic_model.fit_transform(fit_data) 49 | 50 | #print(topic_model.get_topic_info()) 51 | 52 | def make_pseudo_topic_with_bertopic(ids, sentences, mode): 53 | pseudo_topic_dict = {} 54 | for idx, sentence in zip(ids, sentences): 55 | #pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0]) 56 | #pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic]) 57 | pseudo_topic_dict[idx] = sentence 58 | with open('../../data/IAM/origin/{}_pseudo_topic_with_first_sent.json'.format(mode), 'w', encoding='utf-8') as file: 59 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) 60 | 61 | make_pseudo_topic_with_bertopic(train_ids, train_sentences, 'train') 62 | make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, 'dev') 63 | make_pseudo_topic_with_bertopic(test_ids, test_sentences, 'test') 64 | 65 | 66 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file: 67 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False) -------------------------------------------------------------------------------- /mine_next/model/__pycache__/modeling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/model/__pycache__/modeling.cpython-37.pyc -------------------------------------------------------------------------------- /mine_next/model/modeling.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModel, AutoConfig, \ 2 | RobertaPreTrainedModel, RobertaModel 3 | import torch 4 | import torch.nn as nn 5 | from abc import ABC 6 | import dgl 7 | import dgl.function as fn 8 | import dgl.nn.pytorch as dglnn 9 | import torch.nn.functional as F 10 | from dgl import DGLGraph 11 | 12 | 13 | class CGATLayer(nn.Module, ABC): 14 | """ Constituent-Constituent GATLayer """ 15 | 16 | def __init__(self, 
in_dim, feat_embed_size, out_dim, num_heads): 17 | super(CGATLayer, self).__init__() 18 | self.fc = nn.Linear(in_dim, out_dim * num_heads, bias=False) 19 | self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False) 20 | self.num_heads = num_heads 21 | self.reset_parameters() 22 | 23 | def reset_parameters(self): 24 | gain = nn.init.calculate_gain('relu') 25 | nn.init.xavier_normal_(self.fc.weight, gain=gain) 26 | nn.init.xavier_normal_(self.attn_fc.weight, gain=gain) 27 | 28 | def edge_attention(self, edges): 29 | z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=2) 30 | a = self.attn_fc(z2) 31 | return {'e': F.leaky_relu(a)} 32 | 33 | def message_func(self, edges): 34 | return {'z': edges.src['z'], 'e': edges.data['e']} 35 | 36 | def reduce_func(self, nodes): 37 | alpha = F.softmax(nodes.mailbox['e'], dim=1) 38 | h = torch.sum(alpha * nodes.mailbox['z'], dim=1) 39 | return {'h': h} 40 | 41 | def forward(self, g, h, edge_type=None): 42 | z = self.fc(h) 43 | num_tokens, emb_size = z.size() 44 | z = z.reshape([num_tokens, self.num_heads, emb_size // self.num_heads]) 45 | cons_node_ids = g.filter_nodes(lambda nodes: nodes.data['dtype'] == 1) 46 | cc_edge_id = g.filter_edges(lambda edges: edges.data["dtype"] == edge_type) 47 | self_edge_id = g.filter_edges(lambda edges: edges.data["dtype"] == 4) 48 | cc_edge_id = torch.cat([cc_edge_id, self_edge_id], dim=0) 49 | g.nodes[cons_node_ids].data['z'] = z 50 | g.apply_edges(self.edge_attention, edges=cc_edge_id) 51 | g.pull(cons_node_ids, self.message_func, self.reduce_func) 52 | g.ndata.pop('z') 53 | h = g.ndata.pop('h') 54 | return h[cons_node_ids] 55 | 56 | 57 | class CTGATLayer(nn.Module, ABC): 58 | """ Constituent-Token GATLayer """ 59 | 60 | def __init__(self, in_dim, feat_embed_size, out_dim, num_heads): 61 | super(CTGATLayer, self).__init__() 62 | self.fc = nn.Linear(in_dim, out_dim * num_heads, bias=False) 63 | self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False) 64 | self.num_heads = num_heads 65 | self.reset_parameters() 66 | 67 | def reset_parameters(self): 68 | gain = nn.init.calculate_gain('relu') 69 | nn.init.xavier_normal_(self.fc.weight, gain=gain) 70 | nn.init.xavier_normal_(self.attn_fc.weight, gain=gain) 71 | 72 | def edge_attention(self, edges): 73 | z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=2) 74 | a = self.attn_fc(z2) 75 | return {'e': F.leaky_relu(a)} 76 | 77 | def message_func(self, edges): 78 | return {'z': edges.src['z'], 'e': edges.data['e']} 79 | 80 | def reduce_func(self, nodes): 81 | alpha = F.softmax(nodes.mailbox['e'], dim=1) 82 | h = torch.sum(alpha * nodes.mailbox['z'], dim=1) 83 | return {'h': h} 84 | 85 | def forward(self, g, h, edge_type=None): 86 | z = self.fc(h) 87 | num_tokens, emb_size = z.size() 88 | z = z.reshape([num_tokens, self.num_heads, emb_size // self.num_heads]) 89 | token_node_ids = g.filter_nodes(lambda nodes: nodes.data['dtype'] == 0) 90 | cons_node_ids = g.filter_nodes(lambda nodes: nodes.data['dtype'] == 1) 91 | ct_edge_id = g.filter_edges(lambda edges: edges.data["dtype"] == 5) 92 | g.nodes[cons_node_ids].data['z'] = z 93 | g.apply_edges(self.edge_attention, edges=ct_edge_id) 94 | g.pull(token_node_ids, self.message_func, self.reduce_func) 95 | g.ndata.pop('z') 96 | h = g.ndata.pop('h') 97 | return h[token_node_ids] 98 | 99 | 100 | class MultiHeadGATLayer(nn.Module, ABC): 101 | def __init__(self, layer, in_size, out_size, feat_embed_size, num_heads, config, merge='cat', layer_norm_eps=1e-12): 102 | super(MultiHeadGATLayer, self).__init__() 103 | self.heads = nn.ModuleList() 104 | 
out_dim = out_size // num_heads 105 | self.layer = layer(in_size, feat_embed_size, out_dim, num_heads) 106 | self.merge = merge 107 | self.dropout = nn.Dropout(p=0.2) 108 | self.LayerNorm = nn.LayerNorm(out_size, eps=layer_norm_eps) 109 | 110 | def forward(self, g, o, h, edge_type=None): 111 | head_outs = self.layer(g, self.dropout(h), edge_type) 112 | num_tokens = head_outs.size()[0] 113 | if self.merge == 'cat': 114 | out = head_outs.reshape([num_tokens, -1]) 115 | else: 116 | out = torch.mean(head_outs, dim=1) 117 | out = o + F.elu(out) 118 | out = self.LayerNorm(out) 119 | return out 120 | 121 | 122 | class GCNLayer(nn.Module): 123 | def __init__(self, in_feats, out_feats): 124 | super(GCNLayer, self).__init__() 125 | self.linear = nn.Linear(in_feats, out_feats) 126 | self.gcn_msg = fn.copy_u(u='h', out='m') 127 | self.gcn_reduce = fn.sum(msg='m', out='h') 128 | def forward(self, g, feature): 129 | with g.local_scope(): 130 | g.ndata['h'] = feature 131 | g.update_all(self.gcn_msg, self.gcn_reduce) 132 | h = g.ndata['h'] 133 | return self.linear(h) 134 | 135 | 136 | class MultiCGNLayer(nn.Module): 137 | def __init__(self): 138 | super(MultiCGNLayer, self).__init__() 139 | self.hidden_size * 2 + self.cons_hidden_size 140 | self.layer1 = GCNLayer() 141 | 142 | 143 | class GraphEmbedding(nn.Module): 144 | def __init__(self, in_dim, hidden_dim): 145 | super(GraphEmbedding, self).__init__() 146 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim) 147 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim) 148 | self.conv3 = dglnn.GraphConv(hidden_dim, hidden_dim) 149 | nn.init.xavier_normal_(self.conv1.weight) 150 | nn.init.xavier_normal_(self.conv2.weight) 151 | nn.init.xavier_normal_(self.conv3.weight) 152 | def forward(self, g, h): 153 | h = F.relu(self.conv1(g, h)) 154 | h = F.relu(self.conv2(g, h)) 155 | h = F.relu(self.conv3(g, h)) 156 | #with g.local_scope(): 157 | g.ndata['h'] = h 158 | hg = dgl.mean_nodes(g, 'h') 159 | return hg 160 | 161 | 162 | class GraphEmbedding2(nn.Module): 163 | def __init__(self, in_dim, hidden_dim, out_dim): 164 | super(GraphEmbedding2, self).__init__() 165 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim, allow_zero_in_degree=True) 166 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim, allow_zero_in_degree=True) 167 | self.conv3 = dglnn.GraphConv(hidden_dim, out_dim, allow_zero_in_degree=True) 168 | # nn.init.xavier_normal_(self.conv1.weight) 169 | # nn.init.xavier_normal_(self.conv2.weight) 170 | # nn.init.xavier_normal_(self.conv3.weight) 171 | def forward(self, g, h): 172 | h = F.relu(self.conv1(g, h)) 173 | h = F.relu(self.conv2(g, h)) 174 | h = F.relu(self.conv3(g, h)) 175 | with g.local_scope(): 176 | g.ndata['h'] = h 177 | hg = dgl.mean_nodes(g, 'h') 178 | return hg 179 | 180 | 181 | class RobertaReflectGraphWithGrandEdgeClassification(RobertaPreTrainedModel): 182 | def __init__(self, config): 183 | super().__init__(config) 184 | self.num_labels = config.num_labels 185 | self.hidden_size = config.hidden_size 186 | self.cons_hidden_size = config.cons_hidden_size 187 | self.roberta = RobertaModel(config) 188 | self.feature_size = config.feature_size 189 | # when using both graphs 190 | # self.claim_layer = nn.Sequential( 191 | # nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.hidden_size+2*self.feature_size), 192 | # nn.ReLU(), 193 | # nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.num_labels), 194 | # ) 195 | # when using only one graph 196 | self.claim_layer = nn.Sequential( 197 |
nn.Linear(in_features=self.hidden_size + self.feature_size, 198 | out_features=self.hidden_size + self.feature_size), 199 | nn.ReLU(), 200 | nn.Linear(in_features=self.hidden_size + self.feature_size, out_features=self.num_labels), 201 | ) 202 | # 원래는 self.hidden_size + self.cons_hidden_size 203 | self.cons_type_embeddings = nn.Embedding(len(config.cons_tag2id), self.cons_hidden_size) 204 | # nn.init.uniform_(self.cons_type_embeddings.weight, -1.0, 1.0) 205 | self.softmax = nn.Softmax(dim=-1) 206 | self.graph_embedding = GraphEmbedding2(self.cons_hidden_size, self.cons_hidden_size, self.feature_size) 207 | 208 | def forward(self, idx=None, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None, all_graph=None, 209 | constituent_labels_first=None, constituent_labels_second=None): 210 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask) 211 | output = output.last_hidden_state 212 | 213 | graph_conv_first = [] 214 | graph_conv_second = [] 215 | for (graph_id, first, second) in zip(idx, constituent_labels_first, constituent_labels_second): 216 | curr_first_g = all_graph[0][int(graph_id.item())].to("cuda") 217 | first_mask = first != -1 218 | first_label = first[first_mask] 219 | first_cons_node_feature = self.cons_type_embeddings(first_label) 220 | curr_first_g_conv = self.graph_embedding(curr_first_g, first_cons_node_feature) 221 | graph_conv_first.append(curr_first_g_conv) 222 | 223 | curr_second_g = all_graph[1][int(graph_id.item())].to("cuda") 224 | second_mask = second != -1 225 | second_label = second[second_mask] 226 | second_cons_node_feature = self.cons_type_embeddings(second_label) 227 | curr_second_g_conv = self.graph_embedding(curr_second_g, second_cons_node_feature) 228 | graph_conv_second.append(curr_second_g_conv) 229 | 230 | # graph_conv_reult = torch.stack(graph_conv_reult, dim=0) 231 | # cls = output[:, 0, : ] -> (4, 768) 232 | graph_conv_first = torch.stack(graph_conv_first, dim=0) 233 | graph_conv_second = torch.stack(graph_conv_second, dim=0) 234 | 235 | cls_token = output[:, 0, :].unsqueeze(dim=1) 236 | #cls_graph_concat = torch.cat([cls_token, graph_conv_first, graph_conv_second], dim=-1) 237 | cls_graph_concat = torch.cat([cls_token, graph_conv_second], dim=-1) 238 | #cls_graph_concat = torch.cat([cls_token, graph_conv_second], dim=-1) 239 | 240 | logit = self.claim_layer(cls_graph_concat) 241 | logit = logit.squeeze(dim=1) 242 | if labels is not None: 243 | loss_func = nn.CrossEntropyLoss() 244 | loss = loss_func(logit, labels) 245 | return loss, self.softmax(logit) 246 | else: 247 | return logit 248 | 249 | 250 | class RobertaReflectGraphClassification(RobertaPreTrainedModel): 251 | def __init__(self, config): 252 | super().__init__(config) 253 | self.num_labels = config.num_labels 254 | self.hidden_size = config.hidden_size 255 | self.cons_hidden_size = config.cons_hidden_size 256 | self.roberta = RobertaModel(config) 257 | self.feature_size = config.feature_size 258 | self.claim_layer = nn.Sequential( 259 | nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.hidden_size+2*self.feature_size), 260 | nn.ReLU(), 261 | nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.num_labels), 262 | )# 원래는 self.hidden_size + self.cons_hidden_size 263 | self.cons_type_embeddings = nn.Embedding(len(config.cons_tag2id), self.cons_hidden_size) 264 | # nn.init.uniform_(self.cons_type_embeddings.weight, -1.0, 1.0) 265 | self.softmax = nn.Softmax(dim=-1) 266 | self.graph_embedding = 
GraphEmbedding2(self.cons_hidden_size, self.cons_hidden_size, self.feature_size) 267 | 268 | def forward(self, idx=None, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None, all_graph=None, 269 | constituent_labels_first=None, constituent_labels_second=None): 270 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask) 271 | output = output.last_hidden_state 272 | 273 | graph_conv_first = [] 274 | graph_conv_second = [] 275 | for (graph_id, first, second) in zip(idx, constituent_labels_first, constituent_labels_second): 276 | curr_first_g = all_graph[0][int(graph_id.item())].to("cuda") 277 | first_mask = first != -1 278 | first_label = first[first_mask] 279 | first_cons_node_feature = self.cons_type_embeddings(first_label) 280 | curr_first_g_conv = self.graph_embedding(curr_first_g, first_cons_node_feature) 281 | graph_conv_first.append(curr_first_g_conv) 282 | 283 | curr_second_g = all_graph[1][int(graph_id.item())].to("cuda") 284 | second_mask = second != -1 285 | second_label = second[second_mask] 286 | second_cons_node_feature = self.cons_type_embeddings(second_label) 287 | curr_second_g_conv = self.graph_embedding(curr_second_g, second_cons_node_feature) 288 | graph_conv_second.append(curr_second_g_conv) 289 | 290 | # graph_conv_reult = torch.stack(graph_conv_reult, dim=0) 291 | # cls = output[:, 0, : ] -> (4, 768) 292 | graph_conv_first = torch.stack(graph_conv_first, dim=0) 293 | graph_conv_second = torch.stack(graph_conv_second, dim=0) 294 | 295 | cls_token = output[:, 0, :].unsqueeze(dim=1) 296 | # cls_graph_concat = torch.cat([cls_token, graph_conv_first], dim=-1) 297 | #cls_graph_concat = torch.cat([graph_conv_first,cls_token], dim=-1) 298 | cls_graph_concat = torch.cat([cls_token, graph_conv_first, graph_conv_second], dim=-1) 299 | logit = self.claim_layer(cls_graph_concat) 300 | logit = logit.squeeze(dim=1) 301 | if labels is not None: 302 | loss_func = nn.CrossEntropyLoss() 303 | loss = loss_func(logit, labels) 304 | return loss, self.softmax(logit) 305 | else: 306 | return logit 307 | 308 | 309 | class RobertaForClassification(RobertaPreTrainedModel): 310 | def __init__(self, config): 311 | super().__init__(config) 312 | self.num_labels = config.num_labels 313 | self.hidden_size = config.hidden_size 314 | self.roberta = RobertaModel(config) 315 | self.claim_layer = nn.Sequential( 316 | nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size), 317 | nn.ReLU(), 318 | nn.Linear(in_features=self.hidden_size, out_features=self.num_labels), 319 | ) 320 | self.softmax = nn.Softmax(dim=-1) 321 | 322 | def forward(self, idx=None, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None): 323 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask) 324 | output = output.last_hidden_state 325 | 326 | logit = self.claim_layer(output[:, 0, :]) 327 | if labels is not None: 328 | loss_func = nn.CrossEntropyLoss() 329 | loss = loss_func(logit, labels) 330 | return loss, self.softmax(logit) 331 | else: 332 | return logit 333 | 334 | 335 | class RobertaForStanceClassification(RobertaPreTrainedModel): 336 | def __init__(self, config): 337 | super().__init__(config) 338 | self.num_labels = config.num_labels 339 | self.hidden_size = config.hidden_size 340 | self.roberta = RobertaModel(config) 341 | self.claim_layer = nn.Sequential( 342 | nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size), 343 | nn.ReLU(), 344 | nn.Linear(in_features=self.hidden_size, 
out_features=self.num_labels), 345 | ) 346 | self.softmax = nn.Softmax(dim=-1) 347 | 348 | def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None): 349 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask) 350 | output = output.last_hidden_state 351 | 352 | logit = self.claim_layer(output[:, 0, :]) 353 | if labels is not None: 354 | loss_func = nn.CrossEntropyLoss() 355 | loss = loss_func(logit, labels) 356 | return loss, self.softmax(logit) 357 | else: 358 | return logit 359 | 360 | 361 | class RobertaForSTANCY(RobertaPreTrainedModel): 362 | def __init__(self, config): 363 | super().__init__(config) 364 | self.num_labels = config.num_labels 365 | self.hidden_size = config.hidden_size 366 | self.roberta = RobertaModel(config) 367 | self.claim_layer = nn.Sequential( 368 | nn.Linear(in_features=self.hidden_size+1, out_features=self.hidden_size+1), 369 | nn.ReLU(), 370 | nn.Linear(in_features=self.hidden_size+1, out_features=self.num_labels), 371 | ) 372 | self.softmax = nn.Softmax(dim=-1) 373 | self.cosine = nn.CosineSimilarity() 374 | 375 | def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None): 376 | 377 | output_combine = self.roberta(input_ids=input_ids, attention_mask=attention_mask) 378 | #output_combine = output_combine.last_hidden_state 379 | output_combine = output_combine.pooler_output 380 | sent_attention_mask = (1-token_type_ids) * attention_mask 381 | output_sent = self.roberta(input_ids=input_ids, attention_mask=sent_attention_mask) 382 | #output_sent = output_sent.last_hidden_state 383 | output_sent = output_sent.pooler_output 384 | cos_sim = self.cosine(output_combine, output_sent).unsqueeze(1) 385 | combined = torch.cat([output_combine, cos_sim], dim=1) 386 | 387 | logit = self.claim_layer(combined) 388 | 389 | if labels is not None: 390 | loss_func = nn.CrossEntropyLoss() 391 | loss_bert = loss_func(logit, labels) 392 | 393 | loss_cosine = nn.CosineEmbeddingLoss() 394 | loss_claim = loss_cosine(output_combine, output_sent, sim_labels) 395 | loss = loss_bert + loss_claim 396 | return loss, self.softmax(logit) 397 | else: 398 | return logit 399 | -------------------------------------------------------------------------------- /mine_next/run_base.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function import train, evaluate 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | def create_model(args): 12 | config = AutoConfig.from_pretrained( 13 | args.language_model, 14 | num_labels=args.num_labels, 15 | max_length=args.max_length, 16 | # local_files_only=True 17 | ) 18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 20 | setattr(config, 'cons_tag2id', args.cons_tag2id) 21 | model = RobertaForClassification.from_pretrained( 22 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 23 | config=config, 24 | # local_files_only=True 25 | ) 26 | return config, tokenizer, model 27 | 
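# Illustrative sketch (assumption, not part of the original run_base.py): train() in this repo writes each epoch with
# save_pretrained() into <save_dir>/checkpoint-<epoch> (see stance_main_func.train), and create_model() above reloads
# such a directory when init_weight is False. A minimal standalone version of that reload step, with a hypothetical
# checkpoint path, would look like this:
#   from transformers import AutoConfig, AutoTokenizer
#   from mine_next.model.modeling import RobertaForClassification
#   ckpt_dir = 'only_sentence_base_3e_5/checkpoint-3'   # hypothetical example path
#   config = AutoConfig.from_pretrained(ckpt_dir)
#   tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, use_fast=False)
#   model = RobertaForClassification.from_pretrained(ckpt_dir, config=config)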
28 | 29 | def set_seed(args): 30 | random.seed(args.seed) 31 | np.random.seed(args.seed) 32 | torch.manual_seed(args.seed) 33 | if torch.cuda.is_available(): 34 | torch.cuda.manual_seed_all(args.seed) 35 | 36 | 37 | def main(args): 38 | set_seed(args) 39 | config, tokenizer, model = create_model(args) 40 | model.to(args.device) 41 | 42 | if args.mode == 'train': 43 | train(args, model, tokenizer) 44 | elif args.mode == 'dev': 45 | evaluate(args, model, tokenizer) 46 | 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser(description='main') 50 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 51 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 52 | parser.add_argument('--save_dir', type=str, default='only_sentence_base_3e_5') # 모델 불러올 dir 53 | parser.add_argument('--output_dir', type=str, default='only_sentence_base_3e_5') # 모델 저장할 dir 54 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 55 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. True면 쌩 로버타 56 | parser.add_argument('--device', type=str, default="cuda") 57 | #model 58 | parser.add_argument('--num_labels', type=int, default=2) 59 | parser.add_argument('--max_length', type=int, default=512) 60 | parser.add_argument('--batch_size', type=int, default=16) 61 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 62 | parser.add_argument('--cons_hidden_size', type=int, default=128) 63 | 64 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 65 | parser.add_argument("--learning_rate", type=float, default=3e-5) 66 | parser.add_argument("--warmup_steps", type=int, default=0) 67 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 68 | parser.add_argument('--mode', type=str, default='dev') 69 | parser.add_argument('--seed', type=int, default=42) 70 | parser.add_argument('--checkpoint', type=int, default=1) 71 | parser.add_argument('--language_model', type=str, default='roberta-base') 72 | parser.add_argument("--epoch", type=int, default=15) 73 | 74 | args = parser.parse_args() 75 | main(args) 76 | -------------------------------------------------------------------------------- /mine_next/run_debug.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | 12 | def create_model(args): 13 | config = AutoConfig.from_pretrained( 14 | args.language_model, 15 | num_labels=args.num_labels, 16 | max_length=args.max_length, 17 | # local_files_only=True 18 | ) 19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 21 | setattr(config, 'cons_tag2id', args.cons_tag2id) 22 | model = RobertaReflectGraphClassification.from_pretrained( 23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 24 | 
config=config, 25 | # local_files_only=True 26 | ) 27 | return config, tokenizer, model 28 | 29 | def set_seed(args): 30 | random.seed(args.seed) 31 | np.random.seed(args.seed) 32 | torch.manual_seed(args.seed) 33 | if torch.cuda.is_available(): 34 | torch.cuda.manual_seed_all(args.seed) 35 | 36 | def main(args): 37 | set_seed(args) 38 | config, tokenizer, model = create_model(args) 39 | model.to(args.device) 40 | 41 | if args.mode == 'train': 42 | train(args, model, tokenizer) 43 | elif args.mode == 'dev': 44 | evaluate(args, model, tokenizer) 45 | 46 | if __name__ == '__main__': 47 | parser = argparse.ArgumentParser(description='main') 48 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 49 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 50 | parser.add_argument('--save_dir', type=str, default='only_sentence_base_graph_cons_256') # 모델 불러올 dir 51 | parser.add_argument('--output_dir', type=str, default='only_sentence_base_graph_cons_256') # 모델 저장할 dir 52 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 53 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. True면 쌩 로버타 54 | parser.add_argument('--device', type=str, default="cuda") 55 | #model 56 | parser.add_argument('--num_labels', type=int, default=2) 57 | parser.add_argument('--max_length', type=int, default=512) 58 | parser.add_argument('--batch_size', type=int, default=16) 59 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 60 | parser.add_argument('--cons_hidden_size', type=int, default=256) 61 | 62 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 63 | parser.add_argument("--learning_rate", type=float, default=3e-5) 64 | parser.add_argument("--warmup_steps", type=int, default=0) 65 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 66 | parser.add_argument('--mode', type=str, default='train') 67 | parser.add_argument('--seed', type=int, default=42) 68 | parser.add_argument('--checkpoint', type=int, default=5) 69 | parser.add_argument('--language_model', type=str, default='roberta-base') 70 | parser.add_argument("--epoch", type=int, default=30) 71 | 72 | args = parser.parse_args() 73 | main(args) 74 | -------------------------------------------------------------------------------- /mine_next/run_grad1.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate, test 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | def create_model(args): 12 | config = AutoConfig.from_pretrained( 13 | args.language_model, 14 | num_labels=args.num_labels, 15 | max_length=args.max_length, 16 | # local_files_only=True 17 | ) 18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 20 | setattr(config, 'feature_size', args.feature_size) 21 | setattr(config, 'cons_tag2id', 
args.cons_tag2id) 22 | model = RobertaReflectGraphClassification.from_pretrained( 23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 24 | config=config, 25 | # local_files_only=True 26 | ) 27 | return config, tokenizer, model 28 | 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | 38 | def main(args): 39 | set_seed(args) 40 | config, tokenizer, model = create_model(args) 41 | model.to(args.device) 42 | 43 | if args.mode == 'train': 44 | train(args, model, tokenizer) 45 | elif args.mode == 'dev': 46 | evaluate(args, model, tokenizer) 47 | elif args.mode == 'test': 48 | test(args, model, tokenizer) 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser(description='main') 53 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 54 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 55 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt') 56 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 57 | # parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_first_sent.json') 58 | # parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_first_sent.json') 59 | # parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_first_sent.json') 60 | parser.add_argument('--train_pseudo_topic', type=str, 61 | default='../data/IAM/origin/train_pseudo_topic_with_bertopic_div0.2.json') 62 | parser.add_argument('--dev_pseudo_topic', type=str, 63 | default='../data/IAM/origin/dev_pseudo_topic_with_bertopic_div0.2.json') 64 | parser.add_argument('--test_pseudo_topic', type=str, 65 | default='../data/IAM/origin/test_pseudo_topic_with_bertopic_div0.2.json') 66 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. 
True면 쌩 로버타 67 | parser.add_argument('--device', type=str, default="cuda") 68 | #model 69 | parser.add_argument('--num_labels', type=int, default=2) 70 | parser.add_argument('--max_length', type=int, default=256) 71 | parser.add_argument('--batch_size', type=int, default=32) 72 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 73 | parser.add_argument('--cons_hidden_size', type=int, default=768) 74 | parser.add_argument('--feature_size', type=int, default=384) 75 | 76 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 77 | parser.add_argument("--learning_rate", type=float, default=3e-5) 78 | parser.add_argument("--warmup_steps", type=int, default=0) 79 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 80 | parser.add_argument('--mode', type=str, default='train') 81 | parser.add_argument('--seed', type=int, default=42) 82 | parser.add_argument('--checkpoint', type=int, default=1) 83 | parser.add_argument('--language_model', type=str, default='roberta-base') 84 | parser.add_argument("--epoch", type=int, default=40) 85 | parser.add_argument('--save_dir', type=str, default='pseudo_topic_with_bertopic_div02_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 저장할 dir 86 | parser.add_argument('--output_dir', type=str, default='pseudo_topic_with_bertopic_div02_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 불러올 dir 87 | 88 | args = parser.parse_args() 89 | main(args) 90 | -------------------------------------------------------------------------------- /mine_next/run_grand.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate, test 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | def create_model(args): 12 | config = AutoConfig.from_pretrained( 13 | args.language_model, 14 | num_labels=args.num_labels, 15 | max_length=args.max_length, 16 | # local_files_only=True 17 | ) 18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 20 | setattr(config, 'feature_size', args.feature_size) 21 | setattr(config, 'cons_tag2id', args.cons_tag2id) 22 | model = RobertaReflectGraphClassification.from_pretrained( 23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 24 | config=config, 25 | # local_files_only=True 26 | ) 27 | return config, tokenizer, model 28 | 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | 38 | def main(args): 39 | set_seed(args) 40 | config, tokenizer, model = create_model(args) 41 | model.to(args.device) 42 | 43 | if args.mode == 'train': 44 | train(args, model, tokenizer) 45 | elif args.mode == 'dev': 46 | evaluate(args, model, tokenizer) 47 | elif args.mode == 'test': 48 | test(args, model, tokenizer) 49 | 
50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser(description='main') 53 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 54 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 55 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt') 56 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 57 | # parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_first_sent.json') 58 | # parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_first_sent.json') 59 | # parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_first_sent.json') 60 | parser.add_argument('--train_pseudo_topic', type=str, 61 | default='../data/IAM/origin/train_pseudo_topic_with_bertopic_div0.1.json') 62 | parser.add_argument('--dev_pseudo_topic', type=str, 63 | default='../data/IAM/origin/dev_pseudo_topic_with_bertopic_div0.1.json') 64 | parser.add_argument('--test_pseudo_topic', type=str, 65 | default='../data/IAM/origin/test_pseudo_topic_with_bertopic_div0.1.json') 66 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. True면 쌩 로버타 67 | parser.add_argument('--device', type=str, default="cuda") 68 | #model 69 | parser.add_argument('--num_labels', type=int, default=2) 70 | parser.add_argument('--max_length', type=int, default=256) 71 | parser.add_argument('--batch_size', type=int, default=32) 72 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 73 | parser.add_argument('--cons_hidden_size', type=int, default=768) 74 | parser.add_argument('--feature_size', type=int, default=384) 75 | 76 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 77 | parser.add_argument("--learning_rate", type=float, default=3e-5) 78 | parser.add_argument("--warmup_steps", type=int, default=0) 79 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 80 | parser.add_argument('--mode', type=str, default='train') 81 | parser.add_argument('--seed', type=int, default=42) 82 | parser.add_argument('--checkpoint', type=int, default=1) 83 | parser.add_argument('--language_model', type=str, default='roberta-base') 84 | parser.add_argument("--epoch", type=int, default=40) 85 | parser.add_argument('--save_dir', type=str, default='pseudo_topic_with_bertopic_div01_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 저장할 dir 86 | parser.add_argument('--output_dir', type=str, default='pseudo_topic_with_bertopic_div01_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 불러올 dir 87 | 88 | args = parser.parse_args() 89 | main(args) 90 | -------------------------------------------------------------------------------- /mine_next/run_grand2: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate, test 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from 
mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | 12 | def create_model(args): 13 | config = AutoConfig.from_pretrained( 14 | args.language_model, 15 | num_labels=args.num_labels, 16 | max_length=args.max_length, 17 | # local_files_only=True 18 | ) 19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 21 | setattr(config, 'feature_size', args.feature_size) 22 | setattr(config, 'cons_tag2id', args.cons_tag2id) 23 | model = RobertaReflectGraphWithGrandEdgeClassification.from_pretrained( 24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 25 | config=config, 26 | # local_files_only=True 27 | ) 28 | return config, tokenizer, model 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | def main(args): 38 | set_seed(args) 39 | config, tokenizer, model = create_model(args) 40 | model.to(args.device) 41 | 42 | if args.mode == 'train': 43 | train(args, model, tokenizer) 44 | elif args.mode == 'dev': 45 | evaluate(args, model, tokenizer) 46 | elif args.mode == 'test': 47 | test(args, model, tokenizer) 48 | 49 | if __name__ == '__main__': 50 | parser = argparse.ArgumentParser(description='main') 51 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 52 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 53 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt') 54 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 55 | parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_bertopic.json') 56 | parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_bertopic.json') 57 | parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_bertopic.json') 58 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. 
True면 쌩 로버타 59 | parser.add_argument('--device', type=str, default="cuda") 60 | #model 61 | parser.add_argument('--num_labels', type=int, default=2) 62 | parser.add_argument('--max_length', type=int, default=256) 63 | parser.add_argument('--batch_size', type=int, default=32) 64 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 65 | parser.add_argument('--cons_hidden_size', type=int, default=768) 66 | parser.add_argument('--feature_size', type=int, default=384) 67 | 68 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 69 | parser.add_argument("--learning_rate", type=float, default=2e-5) 70 | parser.add_argument("--warmup_steps", type=int, default=0) 71 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 72 | parser.add_argument('--mode', type=str, default='train') 73 | parser.add_argument('--seed', type=int, default=42) 74 | parser.add_argument('--checkpoint', type=int, default=5) 75 | parser.add_argument('--language_model', type=str, default='roberta-base') 76 | parser.add_argument("--epoch", type=int, default=40) 77 | parser.add_argument('--save_dir', type=str, default='pseudo_topic_with_bertopic_sentence_base_two_graph_cons_768_feat_384_max_length_256_lr_2e5') # 모델 저장할 dir 78 | parser.add_argument('--output_dir', type=str, default='pseudo_topic_with_bertopic_sentence_base_two_granh_cons_768_feat_384_max_length_256_lr_2e5') # 모델 불러올 dir 79 | 80 | args = parser.parse_args() 81 | main(args) 82 | -------------------------------------------------------------------------------- /mine_next/run_grand2.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate, test 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | def create_model(args): 12 | config = AutoConfig.from_pretrained( 13 | args.language_model, 14 | num_labels=args.num_labels, 15 | max_length=args.max_length, 16 | # local_files_only=True 17 | ) 18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 20 | setattr(config, 'feature_size', args.feature_size) 21 | setattr(config, 'cons_tag2id', args.cons_tag2id) 22 | model = RobertaReflectGraphClassification.from_pretrained( 23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 24 | config=config, 25 | # local_files_only=True 26 | ) 27 | return config, tokenizer, model 28 | 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | 38 | def main(args): 39 | set_seed(args) 40 | config, tokenizer, model = create_model(args) 41 | model.to(args.device) 42 | 43 | if args.mode == 'train': 44 | train(args, model, tokenizer) 45 | elif args.mode == 'dev': 46 | evaluate(args, model, tokenizer) 47 | elif args.mode == 'test': 48 | test(args, model, tokenizer) 49 | 50 | 51 | if 
__name__ == '__main__': 52 | parser = argparse.ArgumentParser(description='main') 53 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 54 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 55 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt') 56 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 57 | parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_bertopic.json') 58 | parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_bertopic.json') 59 | parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_bertopic.json') 60 | parser.add_argument('--init_weight', type=bool, default=False) # False면 학습된거. True면 쌩 로버타 61 | parser.add_argument('--device', type=str, default="cuda") 62 | #model 63 | parser.add_argument('--num_labels', type=int, default=2) 64 | parser.add_argument('--max_length', type=int, default=256) 65 | parser.add_argument('--batch_size', type=int, default=32) 66 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 67 | parser.add_argument('--cons_hidden_size', type=int, default=768) 68 | parser.add_argument('--feature_size', type=int, default=384) 69 | 70 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 71 | parser.add_argument("--learning_rate", type=float, default=3e-5) 72 | parser.add_argument("--warmup_steps", type=int, default=0) 73 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 74 | parser.add_argument('--mode', type=str, default='test') 75 | parser.add_argument('--seed', type=int, default=42) 76 | parser.add_argument('--checkpoint', type=int, default=24) 77 | parser.add_argument('--language_model', type=str, default='roberta-base') 78 | parser.add_argument("--epoch", type=int, default=40) 79 | parser.add_argument('--save_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 저장할 dir 80 | parser.add_argument('--output_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # 모델 불러올 dir 81 | 82 | args = parser.parse_args() 83 | main(args) 84 | -------------------------------------------------------------------------------- /mine_next/run_grand3_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate, test 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | 12 | def create_model(args): 13 | config = AutoConfig.from_pretrained( 14 | args.language_model, 15 | num_labels=args.num_labels, 16 | max_length=args.max_length, 17 | # local_files_only=True 18 | ) 19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 21 | setattr(config, 'feature_size', 
args.feature_size) 22 | setattr(config, 'cons_tag2id', args.cons_tag2id) 23 | model = RobertaReflectGraphWithGrandEdgeClassification.from_pretrained( 24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 25 | config=config, 26 | # local_files_only=True 27 | ) 28 | return config, tokenizer, model 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | def main(args): 38 | set_seed(args) 39 | config, tokenizer, model = create_model(args) 40 | model.to(args.device) 41 | 42 | if args.mode == 'train': 43 | train(args, model, tokenizer) 44 | elif args.mode == 'dev': 45 | evaluate(args, model, tokenizer) 46 | elif args.mode == 'test': 47 | test(args, model, tokenizer) 48 | 49 | if __name__ == '__main__': 50 | parser = argparse.ArgumentParser(description='main') 51 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 52 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 53 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt') 54 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 55 | parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_bertopic.json') 56 | parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_bertopic.json') 57 | parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_bertopic.json') 58 | parser.add_argument('--init_weight', type=bool, default=False) # False면 학습된거. 
True면 쌩 로버타 59 | parser.add_argument('--device', type=str, default="cuda") 60 | #model 61 | parser.add_argument('--num_labels', type=int, default=2) 62 | parser.add_argument('--max_length', type=int, default=256) 63 | parser.add_argument('--batch_size', type=int, default=32) 64 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 65 | parser.add_argument('--cons_hidden_size', type=int, default=768) 66 | parser.add_argument('--feature_size', type=int, default=384) 67 | 68 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 69 | parser.add_argument("--learning_rate", type=float, default=3e-5) 70 | parser.add_argument("--warmup_steps", type=int, default=0) 71 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 72 | parser.add_argument('--mode', type=str, default='test') 73 | parser.add_argument('--seed', type=int, default=42) 74 | parser.add_argument('--checkpoint', type=int, default=7) 75 | parser.add_argument('--language_model', type=str, default='roberta-base') 76 | parser.add_argument("--epoch", type=int, default=40) 77 | parser.add_argument('--save_dir', type=str, default='only_sentence_base_only_grand_cons_768_feat_384_max_length_256_lr_3e5') # 모델 저장할 dir 78 | parser.add_argument('--output_dir', type=str, default='only_sentence_base_only_grand_cons_768_feat_384_max_length_256_lr_3e5') # 모델 불러올 dir 79 | 80 | args = parser.parse_args() 81 | main(args) 82 | -------------------------------------------------------------------------------- /mine_next/run_one.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from mine_next.functions.main_function2 import train, evaluate 3 | import random, os 4 | import numpy as np 5 | import torch 6 | from transformers import AutoConfig, AutoTokenizer 7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification 9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 10 | 11 | 12 | def create_model(args): 13 | config = AutoConfig.from_pretrained( 14 | args.language_model, 15 | num_labels=args.num_labels, 16 | max_length=args.max_length, 17 | # local_files_only=True 18 | ) 19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 21 | setattr(config, 'feature_size', args.feature_size) 22 | setattr(config, 'cons_tag2id', args.cons_tag2id) 23 | model = RobertaReflectGraphClassification.from_pretrained( 24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 25 | config=config, 26 | # local_files_only=True 27 | ) 28 | return config, tokenizer, model 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | def main(args): 38 | set_seed(args) 39 | config, tokenizer, model = create_model(args) 40 | model.to(args.device) 41 | 42 | if args.mode == 'train': 43 | train(args, model, tokenizer) 44 | elif args.mode == 'dev': 45 | evaluate(args, model, tokenizer) 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser(description='main') 49 | parser.add_argument('--claim_train', 
type=str, default='../data/IAM/claims/train.txt') 50 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 51 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 52 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. True면 쌩 로버타 53 | parser.add_argument('--device', type=str, default="cuda") 54 | #model 55 | parser.add_argument('--num_labels', type=int, default=2) 56 | parser.add_argument('--max_length', type=int, default=256) 57 | parser.add_argument('--batch_size', type=int, default=32) 58 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 59 | parser.add_argument('--cons_hidden_size', type=int, default=768) 60 | parser.add_argument('--feature_size', type=int, default=384) 61 | 62 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 63 | parser.add_argument("--learning_rate", type=float, default=3e-5) 64 | parser.add_argument("--warmup_steps", type=int, default=0) 65 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 66 | parser.add_argument('--mode', type=str, default='train') 67 | parser.add_argument('--seed', type=int, default=42) 68 | parser.add_argument('--checkpoint', type=int, default=8) 69 | parser.add_argument('--language_model', type=str, default='roberta-base') 70 | parser.add_argument("--epoch", type=int, default=30) 71 | parser.add_argument('--output_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5_with_pseudo_topic') # 모델 불러올 dir 72 | parser.add_argument('--save_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5_with_pseudo_topic') # 모델 저장할 dir 73 | 74 | 75 | args = parser.parse_args() 76 | main(args) 77 | -------------------------------------------------------------------------------- /mine_next/run_stance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | #from mine_next.functions.main_function import train, evaluate 3 | from mine_next.functions.stance_main_func import train, evaluate 4 | import random, os 5 | import numpy as np 6 | import torch 7 | from transformers import AutoConfig, AutoTokenizer 8 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification 9 | from mine_next.model.modeling import RobertaReflectGraphClassification 10 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab 11 | 12 | 13 | def create_model(args): 14 | config = AutoConfig.from_pretrained( 15 | args.language_model, 16 | num_labels=args.num_labels, 17 | max_length=args.max_length, 18 | # local_files_only=True 19 | ) 20 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False) 21 | setattr(config, 'cons_hidden_size', args.cons_hidden_size) 22 | setattr(config, 'cons_tag2id', args.cons_tag2id) 23 | model = RobertaForClassification.from_pretrained( 24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)), 25 | config=config, 26 | # local_files_only=True 27 | ) 28 | return config, tokenizer, model 29 | 30 | def set_seed(args): 31 | random.seed(args.seed) 32 | np.random.seed(args.seed) 33 | torch.manual_seed(args.seed) 34 | if torch.cuda.is_available(): 35 | torch.cuda.manual_seed_all(args.seed) 36 | 37 | def main(args): 38 | set_seed(args) 39 
| config, tokenizer, model = create_model(args) 40 | model.to(args.device) 41 | 42 | if args.mode == 'train': 43 | train(args, model, tokenizer) 44 | elif args.mode == 'dev': 45 | evaluate(args, model, tokenizer) 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser(description='main') 49 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt') 50 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt') 51 | parser.add_argument('--stance_train', type=str, default='../data/IAM/stance/train.txt') 52 | parser.add_argument('--stance_dev', type=str, default='../data/IAM/stance/dev.txt') 53 | parser.add_argument('--save_dir', type=str, default='stance_test') # 모델 불러올 dir 54 | parser.add_argument('--output_dir', type=str, default='stance_test') # 모델 저장할 dir 55 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt') 56 | parser.add_argument('--init_weight', type=bool, default=True) # False면 학습된거. True면 쌩 로버타 57 | parser.add_argument('--device', type=str, default="cuda") 58 | #model 59 | parser.add_argument('--num_labels', type=int, default=2) 60 | parser.add_argument('--max_length', type=int, default=256) 61 | parser.add_argument('--batch_size', type=int, default=32) 62 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt')) 63 | parser.add_argument('--cons_hidden_size', type=int, default=128) 64 | 65 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 66 | parser.add_argument("--learning_rate", type=float, default=3e-5) 67 | parser.add_argument("--warmup_steps", type=int, default=0) 68 | parser.add_argument("--max_grad_norm", type=float, default=5.0) 69 | parser.add_argument('--mode', type=str, default='train') 70 | parser.add_argument('--seed', type=int, default=42) 71 | parser.add_argument('--checkpoint', type=int, default=1) 72 | parser.add_argument('--language_model', type=str, default='roberta-base') 73 | parser.add_argument("--epoch", type=int, default=15) 74 | 75 | args = parser.parse_args() 76 | main(args) 77 | --------------------------------------------------------------------------------