├── README.md
└── mine_next
    ├── functions
    │   ├── __pycache__
    │   │   ├── dataset.cpython-37.pyc
    │   │   ├── main_function2.cpython-37.pyc
    │   │   ├── sent_to_graph.cpython-37.pyc
    │   │   └── stance_main_func.cpython-37.pyc
    │   ├── dataset.py
    │   ├── dev_error.json
    │   ├── gcn_test2.py
    │   ├── heterograph.py
    │   ├── homograph.py
    │   ├── main_function.py
    │   ├── main_function2.py
    │   ├── make_graph.py
    │   ├── pos_analy.py
    │   ├── save_graph.py
    │   ├── sent2_to_graph.py
    │   ├── sent_to_graph.py
    │   ├── stance_main_func.py
    │   ├── test.py
    │   ├── test_error.json
    │   ├── textrank.py
    │   ├── txt2json.py
    │   ├── use_bertopic.py
    │   ├── use_bertopic2.py
    │   ├── use_bertopic3.py
    │   └── use_firstsent.py
    ├── model
    │   ├── __pycache__
    │   │   └── modeling.cpython-37.pyc
    │   └── modeling.py
    ├── run_base.py
    ├── run_debug.py
    ├── run_grad1.py
    ├── run_grand.py
    ├── run_grand2
    ├── run_grand2.py
    ├── run_grand3_test.py
    ├── run_one.py
    └── run_stance.py
/README.md:
--------------------------------------------------------------------------------
1 | How to run training
2 |
3 | python run.py --train_file TRAIN_FILE_PATH --save_dir SAVE_DIRECTORY_NAME --do_train True --init_weight True
4 |
5 | How to run testing
6 |
7 | python run.py --predict_file PREDICT_FILE_PATH --output_dir MODEL_DIRECTORY_NAME --checkpoint MODEL_CHECKPOINT --do_eval True
8 |
9 | Actual example
10 |
11 | python run.py --predict_file extractive_summary_mrc_test_4.0.json --output_dir ./ --checkpoint 16000 --do_eval True
12 |
13 | --output_dir : directory from which the saved model is loaded. Used together with --checkpoint.
14 |
15 | ex)
16 | --output_dir : ./
17 | --checkpoint : 16000
18 | loads the model stored in ./checkpoint-16000
19 |
20 |
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/dataset.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/dataset.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/main_function2.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/main_function2.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/sent_to_graph.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/sent_to_graph.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/__pycache__/stance_main_func.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/__pycache__/stance_main_func.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/functions/dataset.py:
--------------------------------------------------------------------------------
1 | import json, ast
2 | import benepar
3 | import dgl.frame
4 | from torch.utils.data import TensorDataset, Dataset
5 | import torch
6 | from transformers import AutoTokenizer
7 | import pandas as pd
8 | import argparse
9 | from tqdm import tqdm
10 | import spacy
11 | from mine_next.functions.sent_to_graph import constituent_to_tree, get_cons_tag_vocab, final_graph, all_process_graph
12 | import os, string
13 |
14 | nlp = spacy.load('en_core_web_sm')
15 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'})
16 |
17 |
18 | def convert_only_sentence2tensordataset(dataset, pseudo, tokenizer, max_length, mode):
19 | printable = set(string.printable)
20 | total_idx = []
21 | total_input_ids = []
22 | total_attention_mask = []
23 | total_label = []
24 | total_token_type_ids = []
25 | total_sim_label = []
26 | claim_sentences = dataset['claim_sentence'].tolist()
27 | claim_labels = dataset['claim_label'].tolist()
28 | claim_article_id = dataset['article_id'].tolist()
29 | gold_topic_sentences = dataset['topic_sentence'].tolist()
30 |
31 | claim_labels = [0 if label == 'O' else 1 for label in claim_labels]
32 | # This part is used when loading pre-built graphs for normal runs
33 | # total_graph = {}
34 | # max_constituent_length = 600
35 | # total_constituent_labels = []
36 | # with open('../data/IAM/claims/graphs/{}_constituent.txt'.format(mode), 'r', encoding='utf-8') as f:
37 | # constituents = f.readlines()
38 | # # test
39 | # for constituent in constituents:
40 | # constituent = ast.literal_eval(constituent.replace('\n', ''))
41 | # total_constituent_labels.append(constituent+[-1]*(max_constituent_length-len(constituent)))
42 | # graphs = os.listdir('../data/IAM/claims/graphs')
43 | # graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file)] # only the dgl graphs for train or dev
44 | # for graph in graphs_list:
45 | # (g,), _ = dgl.load_graphs(graph)
46 | # idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
47 | # total_graph[int(idx)] = g
48 |
49 | # Normal case: load pre-built graphs
50 | total_graph_first = {}
51 | total_graph_second = {}
52 | max_constituent_length = 600
53 | total_constituent_label_first = []
54 | total_constituent_label_second = []
55 | with open('../data/IAM/claims/graphs/{}_constituent_first_second.txt'.format(mode), 'r', encoding='utf-8') as f:
56 | constituents = f.readlines()
57 | for constituent in constituents:
58 | constituent = ast.literal_eval(constituent.replace('\n', ''))
59 | total_constituent_label_first.append(constituent[0]+[-1]*(max_constituent_length-len(constituent[0])))
60 | total_constituent_label_second.append(constituent[1]+[-1]*(max_constituent_length-len(constituent[1])))
61 | graphs = os.listdir('../data/IAM/claims/graphs')
62 | graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'first' in file)] # only the dgl graphs for train or dev (first)
63 | for graph in graphs_list:
64 | (g,), _ = dgl.load_graphs(graph)
65 | idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
66 | total_graph_first[int(idx)] = g
67 | graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'second' in file)] # only the dgl graphs for train or dev (second)
68 | for graph in graphs_list:
69 | (g,), _ = dgl.load_graphs(graph)
70 | idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
71 | total_graph_second[int(idx)] = g
72 |
73 | for idx, (topic, claim_sentence, claim_label, article_id) in tqdm(enumerate(zip(gold_topic_sentences, claim_sentences, claim_labels, claim_article_id)), desc='convert data to tensordataset', total=len(claim_labels)):
74 | claim_sentence = claim_sentence.lower().replace('“', '"').replace('”', '"')
75 | claim_sentence = "".join(filter(lambda x : x in printable, claim_sentence))
76 |
77 | # claim_graph_first, claim_graph_second, constituent_label_first, constituent_label_second = all_process_graph(nlp, tokenizer, claim_sentence)
78 | # total_graph_first[idx] = claim_graph_first
79 | # total_graph_second[idx] = claim_graph_second
80 | # constituent_label_first = constituent_label_first.tolist() + [-1]*(max_constituent_length-len(constituent_label_first.tolist()))
81 | # constituent_label_second = constituent_label_second.tolist() + [-1]*(max_constituent_length-len(constituent_label_second.tolist()))
82 | # total_constituent_label_first.append(constituent_label_first)
83 | # total_constituent_label_second.append(constituent_label_second)
84 |
85 | # when using a pseudo topic
86 | #process_sentence = tokenizer(pseudo[article_id], claim_sentence, max_length=max_length, padding='max_length', truncation=True)
87 | #process_sentence = tokenizer(claim_sentence, max_length=max_length, padding='max_length', truncation=True)
88 | process_sentence = tokenizer(topic, claim_sentence, max_length=max_length, padding='max_length', truncation=True)
89 | input_ids = process_sentence['input_ids']
90 | attention_mask = process_sentence['attention_mask']
91 | # token_type_ids: 0 for the topic segment (up to the second [SEP]), 1 for the rest
92 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2]
93 | try:
94 | second_sep_index = sep_index[1]
95 | token_type_ids = [0] * second_sep_index
96 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids))
97 | except IndexError:
98 | token_type_ids = [0] * max_length
99 | # when the sentence is a claim
100 | if claim_label == 1:
101 | sim_label = 1
102 | # when it is not a claim
103 | elif claim_label == 0:
104 | sim_label = -1
105 |
106 | total_idx.append(idx)
107 | total_input_ids.append(input_ids)
108 | total_attention_mask.append(attention_mask)
109 | total_token_type_ids.append(token_type_ids)
110 | total_label.append(claim_label)
111 | total_sim_label.append(sim_label)
112 | #total_graph[idx] = claim_graph
113 | #total_constituent_labels.append(constituent_label_list)
114 | if idx < 3:
115 | print()
116 | print("****EXAMPLE****")
117 | print("topic sentence : {}".format(topic))
118 | print("claim sentence : {}".format(claim_sentence))
119 | print("claim sentence input ids : {}".format(input_ids))
120 | print("claim sentence attention mask : {}".format(attention_mask))
121 | print("claim sentence token type ids : {}".format(token_type_ids))
122 | print("label : {}".format(claim_label))
123 | print("sim label : {}".format(sim_label))
124 |
125 |
126 | total_idx = torch.tensor(total_idx, dtype=torch.long)
127 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long)
128 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long)
129 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long)
130 | total_label = torch.tensor(total_label, dtype=torch.long)
131 | total_sim_label = torch.tensor(total_sim_label, dtype=torch.long)
132 | total_constituent_label_first = torch.tensor(total_constituent_label_first, dtype=torch.long)
133 | total_constituent_label_second = torch.tensor(total_constituent_label_second, dtype=torch.long)
134 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_label, total_sim_label,
135 | total_constituent_label_first, total_constituent_label_second)
136 |
137 | return dataset, total_graph_first, total_graph_second
138 |
139 |
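# NOTE: the function below re-uses the name convert_only_sentence2tensordataset, so at import time
# it shadows the gold-topic variant defined above; the only difference is that it pairs each claim
# sentence with a pseudo topic (pseudo[article_id]) instead of the gold topic.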
140 | def convert_only_sentence2tensordataset(dataset, pseudo, tokenizer, max_length, mode):
141 | printable = set(string.printable)
142 | total_idx = []
143 | total_input_ids = []
144 | total_attention_mask = []
145 | total_label = []
146 | total_token_type_ids = []
147 | total_sim_label = []
148 | claim_sentences = dataset['claim_sentence'].tolist()
149 | claim_labels = dataset['claim_label'].tolist()
150 | claim_article_id = dataset['article_id'].tolist()
151 | gold_topic_sentences = dataset['topic_sentence'].tolist()
152 |
153 | claim_labels = [0 if label == 'O' else 1 for label in claim_labels]
154 | # This part is used when loading pre-built graphs for normal runs
155 | # total_graph = {}
156 | # max_constituent_length = 600
157 | # total_constituent_labels = []
158 | # with open('../data/IAM/claims/graphs/{}_constituent.txt'.format(mode), 'r', encoding='utf-8') as f:
159 | # constituents = f.readlines()
160 | # # test
161 | # for constituent in constituents:
162 | # constituent = ast.literal_eval(constituent.replace('\n', ''))
163 | # total_constituent_labels.append(constituent+[-1]*(max_constituent_length-len(constituent)))
164 | # graphs = os.listdir('../data/IAM/claims/graphs')
165 | # graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file)] # only the dgl graphs for train or dev
166 | # for graph in graphs_list:
167 | # (g,), _ = dgl.load_graphs(graph)
168 | # idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
169 | # total_graph[int(idx)] = g
170 |
171 | # Normal case: load pre-built graphs
172 | total_graph_first = {}
173 | total_graph_second = {}
174 | max_constituent_length = 600
175 | total_constituent_label_first = []
176 | total_constituent_label_second = []
177 | with open('../data/IAM/claims/graphs/{}_constituent_first_second.txt'.format(mode), 'r', encoding='utf-8') as f:
178 | constituents = f.readlines()
179 | for constituent in constituents:
180 | constituent = ast.literal_eval(constituent.replace('\n', ''))
181 | total_constituent_label_first.append(constituent[0]+[-1]*(max_constituent_length-len(constituent[0])))
182 | total_constituent_label_second.append(constituent[1]+[-1]*(max_constituent_length-len(constituent[1])))
183 | graphs = os.listdir('../data/IAM/claims/graphs')
184 | graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'first' in file)] # only the dgl graphs for train or dev (first)
185 | for graph in graphs_list:
186 | (g,), _ = dgl.load_graphs(graph)
187 | idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
188 | total_graph_first[int(idx)] = g
189 | graphs_list = [os.path.join('../data/IAM/claims/graphs', file) for file in graphs if ('dgl' in file and mode in file and 'second' in file)] # only the dgl graphs for train or dev (second)
190 | for graph in graphs_list:
191 | (g,), _ = dgl.load_graphs(graph)
192 | idx = graph.split('/')[-1].split('_')[-1].split('.')[0]
193 | total_graph_second[int(idx)] = g
194 |
195 | for idx, (topic, claim_sentence, claim_label, article_id) in tqdm(enumerate(zip(gold_topic_sentences, claim_sentences, claim_labels, claim_article_id)), desc='convert data to tensordataset', total=len(claim_labels)):
196 | claim_sentence = claim_sentence.lower().replace('“', '"').replace('”', '"')
197 | claim_sentence = "".join(filter(lambda x : x in printable, claim_sentence))
198 |
199 | # claim_graph_first, claim_graph_second, constituent_label_first, constituent_label_second = all_process_graph(nlp, tokenizer, claim_sentence)
200 | # total_graph_first[idx] = claim_graph_first
201 | # total_graph_second[idx] = claim_graph_second
202 | # constituent_label_first = constituent_label_first.tolist() + [-1]*(max_constituent_length-len(constituent_label_first.tolist()))
203 | # constituent_label_second = constituent_label_second.tolist() + [-1]*(max_constituent_length-len(constituent_label_second.tolist()))
204 | # total_constituent_label_first.append(constituent_label_first)
205 | # total_constituent_label_second.append(constituent_label_second)
206 |
207 | # pseudo topic + claim sentence
208 | process_sentence = tokenizer(pseudo[article_id], claim_sentence, max_length=max_length, padding='max_length', truncation=True)
209 | # claim sentence only
210 | # process_sentence = tokenizer(claim_sentence, max_length=max_length, padding='max_length', truncation=True)
211 | # gold topic + claim sentence
212 | # process_sentence = tokenizer(topic, claim_sentence, max_length=max_length, padding='max_length', truncation=True)
213 |
214 | input_ids = process_sentence['input_ids']
215 | attention_mask = process_sentence['attention_mask']
216 | # token_type_ids: 0 for the topic segment (up to the second [SEP]), 1 for the rest
217 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2]
218 | try:
219 | second_sep_index = sep_index[1]
220 | token_type_ids = [0] * second_sep_index
221 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids))
222 | except IndexError:
223 | token_type_ids = [0] * max_length
224 | # when the sentence is a claim
225 | if claim_label == 1:
226 | sim_label = 1
227 | # when it is not a claim
228 | elif claim_label == 0:
229 | sim_label = -1
230 |
231 | total_idx.append(idx)
232 | total_input_ids.append(input_ids)
233 | total_attention_mask.append(attention_mask)
234 | total_token_type_ids.append(token_type_ids)
235 | total_label.append(claim_label)
236 | total_sim_label.append(sim_label)
237 | #total_graph[idx] = claim_graph
238 | #total_constituent_labels.append(constituent_label_list)
239 | if idx < 3:
240 | print()
241 | print("****EXAMPLE****")
242 | print("topic sentence : {}".format(topic))
243 | print("pseudo topic sentence : {}".format(pseudo[article_id]))
244 | print("claim sentence : {}".format(claim_sentence))
245 | print("claim sentence input ids : {}".format(input_ids))
246 | print("claim sentence attention mask : {}".format(attention_mask))
247 | print("claim sentence token type ids : {}".format(token_type_ids))
248 | print("label : {}".format(claim_label))
249 | print("sim label : {}".format(sim_label))
250 |
251 |
252 | total_idx = torch.tensor(total_idx, dtype=torch.long)
253 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long)
254 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long)
255 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long)
256 | total_label = torch.tensor(total_label, dtype=torch.long)
257 | total_sim_label = torch.tensor(total_sim_label, dtype=torch.long)
258 | total_constituent_label_first = torch.tensor(total_constituent_label_first, dtype=torch.long)
259 | total_constituent_label_second = torch.tensor(total_constituent_label_second, dtype=torch.long)
260 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_label, total_sim_label,
261 | total_constituent_label_first, total_constituent_label_second)
262 |
263 | return dataset, total_graph_first, total_graph_second
264 |
265 |
266 | def convert_data2tensordataset(dataset, tokenizer, max_length, mode):
267 | total_input_ids = []
268 | total_attention_mask = []
269 | total_label = []
270 | total_token_type_ids = []
271 | total_sim_label = []
272 | total_idx = []
273 | claim_sentences = dataset['claim_sentence'].tolist()
274 | claim_labels = dataset['claim_label'].tolist()
275 | claim_labels = [0 if label == 'O' else 1 for label in claim_labels]
276 | topic_sentences = dataset['topic_sentence'].tolist()
277 | for idx, (topic_sentence, claim_sentence, claim_label) in tqdm(enumerate(zip(topic_sentences, claim_sentences, claim_labels)), desc='convert data to tensordataset', total=len(claim_labels)):
278 | process_sentence = tokenizer(topic_sentence, claim_sentence, max_length=max_length, padding='max_length', truncation=True)
279 | input_ids = process_sentence['input_ids']
280 | attention_mask = process_sentence['attention_mask']
281 | # token_type_ids: 0 for the topic segment (up to the second [SEP]), 1 for the rest
282 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2]
283 | second_sep_index = sep_index[1]
284 | token_type_ids = [0] * second_sep_index
285 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids))
286 | # when the sentence is a claim
287 | if claim_label == 1:
288 | sim_label = 1
289 | # when it is not a claim
290 | elif claim_label == 0:
291 | sim_label = -1
292 | total_idx.append(idx)
293 | total_input_ids.append(input_ids)
294 | total_attention_mask.append(attention_mask)
295 | total_token_type_ids.append(token_type_ids)
296 | total_label.append(claim_label)
297 | total_sim_label.append(sim_label)
298 | if idx < 3:
299 | print()
300 | print("****EXAMPLE****")
301 | print("topic sentence : {}".format(topic_sentence))
302 | print("claim sentence : {}".format(claim_sentence))
303 | print("topic, claim sentence input ids : {}".format(input_ids))
304 | print("topic, claim sentence attention mask : {}".format(attention_mask))
305 | print("topic, claim sentence token type ids : {}".format(token_type_ids))
306 | print("label : {}".format(claim_label))
307 | print("sim label : {}".format(sim_label))
308 | total_idx = torch.tensor(total_idx, dtype=torch.long)
309 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long)
310 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long)
311 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long)
312 | total_label = torch.tensor(total_label, dtype=torch.long)
313 | total_sim_label = torch.tensor(total_sim_label, dtype=torch.long)
314 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_label, total_sim_label)
315 | return dataset
316 |
317 |
318 | def convert_stance_data2tensordataset(dataset, tokenizer, max_length, mode=None):
319 | total_idx = []
320 | total_input_ids = []
321 | total_attention_mask = []
322 | total_label = []
323 | total_token_type_ids = []
324 | total_sim_label = []
325 | total_stance_label = []
326 | #dataset = dataset[dataset['claim_label'] == 'C']
327 |
328 | claim_sentences = dataset['claim_sentence'].tolist()
329 | topic_sentences = dataset['topic_sentence'].tolist()
330 | stance_labels = dataset['stance_labels'].tolist()
331 | for idx, (topic_sentence, claim_sentence, stance_label) in tqdm(enumerate(zip(topic_sentences, claim_sentences, stance_labels)), desc='convert data to tensordataset', total=len(stance_labels)):
332 | process_sentence = tokenizer(topic_sentence, claim_sentence, max_length=max_length, padding='max_length', truncation=True)
333 | input_ids = process_sentence['input_ids']
334 | attention_mask = process_sentence['attention_mask']
335 | # token_type_ids: 0 for the topic segment (up to the second [SEP]), 1 for the rest
336 | try:
337 | sep_index = [idx for idx, ids in enumerate(input_ids) if ids == 2]
338 | second_sep_index = sep_index[1]
339 | token_type_ids = [0] * second_sep_index
340 | token_type_ids += [1] * (len(input_ids)-len(token_type_ids))
341 | except IndexError:
342 | token_type_ids = [0] * max_length
343 | #sent_attention_mask = (1-token_type_ids) * attention_mask
344 | total_idx.append(idx)
345 | total_input_ids.append(input_ids)
346 | total_attention_mask.append(attention_mask)
347 | total_token_type_ids.append(token_type_ids)
348 | if stance_label == -1:
349 | total_stance_label.append(0)
350 | else:
351 | total_stance_label.append(1)
352 | #total_stance_label.append(stance_label)
353 | if idx < 3:
354 | print()
355 | print("****EXAMPLE****")
356 | print("topic sentence : {}".format(topic_sentence))
357 | print("claim sentence : {}".format(claim_sentence))
358 | print("topic, claim sentence input ids : {}".format(input_ids))
359 | print("topic, claim sentence attention mask : {}".format(attention_mask))
360 | print("topic, claim sentence token type ids : {}".format(token_type_ids))
361 | print("stance label : {}".format(stance_label))
362 |
363 | total_idx = torch.tensor(total_idx, dtype=torch.long)
364 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long)
365 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long)
366 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long)
367 | total_stance_label = torch.tensor(total_stance_label, dtype=torch.long)
368 | dataset = TensorDataset(total_idx, total_input_ids, total_attention_mask, total_token_type_ids, total_stance_label)
369 | return dataset
370 |
371 | # with open('../../../data/train_claim.json', 'r', encoding='utf-8') as reader:
372 | # dataset = json.load(reader)['data']
373 | #
374 | # total_title = []
375 | # total_input_ids = []
376 | # total_attention_mask = []
377 | # total_label = []
378 | # for data in dataset:
379 | # title = data['title']
380 | # total_title.append(title)
381 | # paragraphs = data['paragraphs']
382 | # for para in paragraphs:
383 | # answers = para['qas'][0]['answers']
384 | # context = para['context']
385 | # result = tokenizer(context, padding='max_length', max_length=4096, truncation=True)
386 | # # cls idx 2 / sep idx 3
387 | # total_input_ids.append(result['input_ids'])
388 | # total_attention_mask.append(result['attention_mask'])
389 | # context_list = context.split('[SEP]')
390 | # each_label = [0] * len(context_list)
391 | # # the first [SEP] represents the first sentence; sentences are delimited by the [SEP] to their right
392 | # for answer in answers:
393 | # text = answer['text']
394 | # for idx, ctx in enumerate(context_list):
395 | # if text in ctx:
396 | # print(idx+1)
397 |
398 |
399 |
400 | # if __name__ == '__main__':
401 | # parser = argparse.ArgumentParser(description='dataset creating')
402 | # parser.add_argument('--train_data', type=str, default='../../../data/train_claim.json')
403 |
--------------------------------------------------------------------------------
/mine_next/functions/gcn_test2.py:
--------------------------------------------------------------------------------
1 | import dgl.nn.pytorch as dglnn
2 | import torch.nn as nn
3 | import dgl.data
4 | import torch.nn.functional as F
5 | from dgl.dataloading import GraphDataLoader
6 | import torch
7 |
8 |
9 | ###
10 | # This is code for classifying whole graphs (graph-level classification)
11 | ###
12 |
13 |
14 | dataset = dgl.data.GINDataset('MUTAG', False)
15 |
16 | dataloader = GraphDataLoader(
17 | dataset,
18 | batch_size=1024,
19 | drop_last=False,
20 | shuffle=True)
21 |
22 |
23 | class Classifier(nn.Module):
24 | def __init__(self, in_dim, hidden_dim, n_classes):
25 | super(Classifier, self).__init__()
26 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim)
27 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim)
28 | self.classify = nn.Linear(hidden_dim, n_classes)
29 |
30 | def forward(self, g, h):
31 | # Apply graph convolution and activation.
32 | h = F.relu(self.conv1(g, h))
33 | h = F.relu(self.conv2(g, h))
34 | with g.local_scope():
35 | g.ndata['h'] = h
36 | # Calculate graph representation by average readout.
37 | # (batch_size, 20); 20 is presumably the hidden size
38 | hg = dgl.mean_nodes(g, 'h')
39 | return self.classify(hg)
40 |
41 |
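# GINDataset('MUTAG', False) provides 7-dimensional node features under ndata['attr'] (hence in_dim=7);
# MUTAG graph labels are binary, so n_classes=2 would match the dataset (5 also runs, with unused logits).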
42 | model = Classifier(7, 20, 5)
43 | opt = torch.optim.Adam(model.parameters())
44 | for epoch in range(20):
45 | for batched_graph, labels in dataloader:
46 | # (num_nodes, 7); 7 is presumably the number of node label types
47 | feats = batched_graph.ndata['attr']
48 | logits = model(batched_graph, feats)
49 | loss = F.cross_entropy(logits, labels)
50 | opt.zero_grad()
51 | loss.backward()
52 | opt.step()
--------------------------------------------------------------------------------
/mine_next/functions/heterograph.py:
--------------------------------------------------------------------------------
1 | """
2 | .. currentmodule:: dgl
3 |
4 | Working with Heterogeneous Graphs
5 | =================================
6 |
7 | **Author**: Quan Gan, `Minjie Wang `_, Mufei Li,
8 | George Karypis, Zheng Zhang
9 |
10 | In this tutorial, you learn about:
11 |
12 | * Examples of heterogeneous graph data and typical applications.
13 |
14 | * Creating and manipulating a heterogeneous graph in DGL.
15 |
16 | * Implementing `Relational-GCN `_, a popular GNN model,
17 |   for heterogeneous graph input.
18 |
19 | * Training a model to solve a node classification task.
20 |
21 | Heterogeneous graphs, or *heterographs* for short, are graphs that contain
22 | different types of nodes and edges. The different types of nodes and edges tend
23 | to have different types of attributes that are designed to capture the
24 | characteristics of each node and edge type. Within the context of
25 | graph neural networks, depending on their complexity, certain node and edge types
26 | might need to be modeled with representations that have a different number of dimensions.
27 |
28 | DGL supports graph neural network computations on such heterogeneous graphs, by
29 | using the heterograph class and its associated API.
30 |
31 | """
32 |
33 | ###############################################################################
34 | # Examples of heterographs
35 | # -----------------------
36 | # Many graph datasets represent relationships among various types of entities.
37 | # This section provides an overview of several graph use-cases that show such relationships
38 | # and can have their data represented as heterographs.
39 | #
40 | # Citation graph
41 | # ~~~~~~~~~~~~~~~
42 | # The Association for Computing Machinery publishes an `ACM dataset `_ that contains two
43 | # million papers, their authors, publication venues, and the other papers
44 | # that were cited. This information can be represented as a heterogeneous graph.
45 | #
46 | # The following diagram shows several entities in the ACM dataset and the relationships among them
47 | # (taken from `Shi et al., 2015 `_).
48 | #
49 | # .. figure:: https://data.dgl.ai/tutorial/hetero/acm-example.png#
50 | #
51 | # This graph has three types of entities that correspond to papers, authors, and publication venues.
52 | # It also contains three types of edges that connect the following:
53 | #
54 | # * Authors with papers corresponding to *written-by* relationships
55 | #
56 | # * Papers with publication venues corresponding to *published-in* relationships
57 | #
58 | # * Papers with other papers corresponding to *cited-by* relationships
59 | #
60 | #
61 | # Recommender systems
62 | # ~~~~~~~~~~~~~~~~~~~~
63 | # The datasets used in recommender systems often contain
64 | # interactions between users and items. For example, the data could include the
65 | # ratings that users have provided to movies. Such interactions can be modeled
66 | # as heterographs.
67 | #
68 | # The nodes in these heterographs will have two types, *users* and *movies*. The edges
69 | # will correspond to the user-movie interactions. Furthermore, if an interaction is
70 | # marked with a rating, then each rating value could correspond to a different edge type.
71 | # The following diagram shows an example of user-item interactions as a heterograph.
72 | #
73 | # .. figure:: https://data.dgl.ai/tutorial/hetero/recsys-example.png
74 | #
75 | #
76 | # Knowledge graph
77 | # ~~~~~~~~~~~~~~~~
78 | # Knowledge graphs are inherently heterogeneous. For example, in
79 | # Wikidata, Barack Obama (item Q76) is an instance of a human, which could be viewed as
80 | # the entity class, whose spouse (item P26) is Michelle Obama (item Q13133) and
81 | # occupation (item P106) is politician (item Q82955). The relationships are shown in the following
82 | # diagram.
83 | #
84 | # .. figure:: https://data.dgl.ai/tutorial/hetero/kg-example.png
85 | #
86 |
87 | ###############################################################################
88 | # Creating a heterograph in DGL
89 | # -----------------------------
90 | # You can create a heterograph in DGL using the :func:`dgl.heterograph` API.
91 | # The argument to :func:`dgl.heterograph` is a dictionary. The keys are tuples
92 | # in the form of ``(srctype, edgetype, dsttype)`` specifying the relation name
93 | # and the two entity types it connects. Such tuples are called *canonical edge types*.
94 | # The values are data to initialize the graph structures, that is, which
95 | # nodes the edges actually connect.
96 | #
97 | # For instance, the following code creates the user-item interactions heterograph shown earlier.
98 |
99 | # Each value of the dictionary is a pair of source and destination arrays.
100 | # Nodes are integer IDs starting from zero. Node IDs of different types are
101 | # counted separately.
102 | import dgl
103 | import numpy as np
104 |
105 | ratings = dgl.heterograph(
106 | {('user', '+1', 'movie') : (np.array([0, 0, 1]), np.array([0, 1, 0])),
107 | ('user', '-1', 'movie') : (np.array([2]), np.array([1]))})
108 |
109 | ###############################################################################
110 | # Manipulating heterograph
111 | # ------------------------
112 | # You can create a more realistic heterograph using the ACM dataset. To do this, first
113 | # download the dataset as follows:
114 |
115 | import scipy.io
116 | import urllib.request
117 |
118 | data_url = 'https://data.dgl.ai/dataset/ACM.mat'
119 | data_file_path = '/tmp/ACM.mat'
120 |
121 | urllib.request.urlretrieve(data_url, data_file_path)
122 | data = scipy.io.loadmat(data_file_path)
123 | print(list(data.keys()))
124 |
125 | ###############################################################################
126 | # The dataset stores node information by their types: ``P`` for paper, ``A``
127 | # for author, ``C`` for conference, ``L`` for subject code, and so on. The relationships
128 | # are stored as SciPy sparse matrix under key ``XvsY``, where ``X`` and ``Y``
129 | # could be any of the node type code.
130 | #
131 | # The following code prints out some statistics about the paper-author relationships.
132 |
133 | print(type(data['PvsA']))
134 | print('#Papers:', data['PvsA'].shape[0])
135 | print('#Authors:', data['PvsA'].shape[1])
136 | print('#Links:', data['PvsA'].nnz)
137 |
138 | ###############################################################################
139 | # Converting this SciPy matrix to a heterograph in DGL is straightforward.
140 |
141 | pa_g = dgl.heterograph({('paper', 'written-by', 'author') : data['PvsA'].nonzero()})
142 |
143 | ###############################################################################
144 | # You can easily print out the type names and other structural information.
145 |
146 | print('Node types:', pa_g.ntypes)
147 | print('Edge types:', pa_g.etypes)
148 | print('Canonical edge types:', pa_g.canonical_etypes)
149 |
150 | # Nodes and edges are assigned integer IDs starting from zero and each type is counted separately.
151 | # To distinguish the nodes and edges of different types, specify the type name as the argument.
152 | print(pa_g.number_of_nodes('paper'))
153 | # Canonical edge type name can be shortened to only one edge type name if it is
154 | # uniquely distinguishable.
155 | print(pa_g.number_of_edges(('paper', 'written-by', 'author')))
156 | print(pa_g.number_of_edges('written-by'))
157 | print(pa_g.successors(1, etype='written-by')) # get the authors that write paper #1
158 |
159 | # Type name argument could be omitted whenever the behavior is unambiguous.
160 | print(pa_g.number_of_edges()) # Only one edge type, the edge type argument could be omitted
161 |
162 | ###############################################################################
163 | # A homogeneous graph is just a special case of a heterograph with only one type
164 | # of node and edge.
165 |
166 | # Paper-citing-paper graph is a homogeneous graph
167 | pp_g = dgl.heterograph({('paper', 'citing', 'paper') : data['PvsP'].nonzero()})
168 | # equivalent (shorter) API for creating homogeneous graph
169 | pp_g = dgl.from_scipy(data['PvsP'])
170 |
171 | # All the ntype and etype arguments could be omitted because the behavior is unambiguous.
172 | print(pp_g.number_of_nodes())
173 | print(pp_g.number_of_edges())
174 | print(pp_g.successors(3))
175 |
176 | ###############################################################################
177 | # Create a subset of the ACM graph using the paper-author, paper-paper,
178 | # and paper-subject relationships. Meanwhile, also add the reverse
179 | # relationship to prepare for the later sections.
180 |
181 | G = dgl.heterograph({
182 | ('paper', 'written-by', 'author') : data['PvsA'].nonzero(),
183 | ('author', 'writing', 'paper') : data['PvsA'].transpose().nonzero(),
184 | ('paper', 'citing', 'paper') : data['PvsP'].nonzero(),
185 | ('paper', 'cited', 'paper') : data['PvsP'].transpose().nonzero(),
186 | ('paper', 'is-about', 'subject') : data['PvsL'].nonzero(),
187 | ('subject', 'has', 'paper') : data['PvsL'].transpose().nonzero(),
188 | })
189 |
190 | print(G)
191 |
192 | ###############################################################################
193 | # **Metagraph** (or network schema) is a useful summary of a heterograph.
194 | # Serving as a template for a heterograph, it tells how many types of objects
195 | # exist in the network and where the possible links exist.
196 | #
197 | # DGL provides easy access to the metagraph, which could be visualized using
198 | # external tools.
199 |
200 | # Draw the metagraph using graphviz.
201 | # import pygraphviz as pgv
202 | # def plot_graph(nxg):
203 | # ag = pgv.AGraph(strict=False, directed=True)
204 | # for u, v, k in nxg.edges(keys=True):
205 | # ag.add_edge(u, v, label=k)
206 | # ag.layout('dot')
207 | # ag.draw('graph.png')
208 | #
209 | # plot_graph(G.metagraph())
210 |
211 | ###############################################################################
212 | # Learning tasks associated with heterographs
213 | # -------------------------------------------
214 | # Some of the typical learning tasks that involve heterographs include:
215 | #
216 | # * *Node classification and regression* to predict the class of each node or
217 | # estimate a value associated with it.
218 | #
219 | # * *Link prediction* to predict if there is an edge of a certain
220 | # type between a pair of nodes, or predict which other nodes a particular
221 | # node is connected with (and optionally the edge types of such connections).
222 | #
223 | # * *Graph classification/regression* to assign an entire
224 | # heterograph into one of the target classes or to estimate a numerical
225 | # value associated with it.
226 | #
227 | # In this tutorial, we designed a simple example for the first task.
228 | #
229 | # A semi-supervised node classification example
230 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
231 | # Our goal is to predict the publishing conference of a paper using the ACM
232 | # academic graph we just created. To further simplify the task, we only focus
233 | # on papers published in three conferences: *KDD*, *ICML*, and *VLDB*. All
234 | # the other papers are not labeled, making it a semi-supervised setting.
235 | #
236 | # The following code extracts those papers from the raw dataset and prepares
237 | # the training, validation, testing split.
238 |
239 | import numpy as np
240 | import torch
241 | import torch.nn as nn
242 | import torch.nn.functional as F
243 |
244 | pvc = data['PvsC'].tocsr()
245 | # find all papers published in KDD, ICML, VLDB
246 | c_selected = [0, 11, 13] # KDD, ICML, VLDB
247 | p_selected = pvc[:, c_selected].tocoo()
248 | # generate labels
249 | labels = pvc.indices
250 | labels[labels == 11] = 1
251 | labels[labels == 13] = 2
252 | labels = torch.tensor(labels).long()
253 |
254 | # generate train/val/test split
255 | pid = p_selected.row
256 | shuffle = np.random.permutation(pid)
257 | train_idx = torch.tensor(shuffle[0:800]).long()
258 | val_idx = torch.tensor(shuffle[800:900]).long()
259 | test_idx = torch.tensor(shuffle[900:]).long()
260 |
261 | ###############################################################################
262 | # Relational-GCN on heterograph
263 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
264 | # We use `Relational-GCN `_ to learn the
265 | # representation of nodes in the graph. Its message-passing equation is as
266 | # follows:
267 | #
268 | # .. math::
269 | #
270 | # h_i^{(l+1)} = \sigma\left(\sum_{r\in \mathcal{R}}
271 | # \sum_{j\in\mathcal{N}_r(i)}W_r^{(l)}h_j^{(l)}\right)
272 | #
273 | # Breaking down the equation, you see that there are two parts in the
274 | # computation.
275 | #
276 | # (i) Message computation and aggregation within each relation :math:`r`
277 | #
278 | # (ii) Reduction that merges the results from multiple relationships
279 | #
280 | # Following this intuition, perform message passing on a heterograph in
281 | # two steps.
282 | #
283 | # (i) Per-edge-type message passing
284 | #
285 | # (ii) Type wise reduction
286 |
287 | import dgl.function as fn
288 |
289 | class HeteroRGCNLayer(nn.Module):
290 | def __init__(self, in_size, out_size, etypes):
291 | super(HeteroRGCNLayer, self).__init__()
292 | # W_r for each relation
293 | self.weight = nn.ModuleDict({
294 | name : nn.Linear(in_size, out_size) for name in etypes
295 | })
296 |
297 | def forward(self, G, feat_dict):
298 | # The input is a dictionary of node features for each type
299 | funcs = {}
300 | for srctype, etype, dsttype in G.canonical_etypes:
301 | # Compute W_r * h
302 | Wh = self.weight[etype](feat_dict[srctype])
303 | # Save it in graph for message passing
304 | G.nodes[srctype].data['Wh_%s' % etype] = Wh
305 | # Specify per-relation message passing functions: (message_func, reduce_func).
306 | # Note that the results are saved to the same destination feature 'h', which
307 | # hints the type wise reducer for aggregation.
308 | funcs[etype] = (fn.copy_u('Wh_%s' % etype, 'm'), fn.mean('m', 'h'))
309 | # Trigger message passing of multiple types.
310 | # The first argument is the message passing functions for each relation.
311 | # The second one is the type-wise reducer, which can be "sum", "max",
312 | # "min", "mean", or "stack".
313 | G.multi_update_all(funcs, 'sum')
314 | # return the updated node feature dictionary
315 | return {ntype : G.nodes[ntype].data['h'] for ntype in G.ntypes}
316 |
317 | ###############################################################################
318 | # Create a simple GNN by stacking two ``HeteroRGCNLayer``. Since the
319 | # nodes do not have input features, make their embeddings trainable.
320 |
321 | class HeteroRGCN(nn.Module):
322 | def __init__(self, G, in_size, hidden_size, out_size):
323 | super(HeteroRGCN, self).__init__()
324 | # Use trainable node embeddings as featureless inputs.
325 | embed_dict = {ntype : nn.Parameter(torch.Tensor(G.number_of_nodes(ntype), in_size))
326 | for ntype in G.ntypes}
327 | for key, embed in embed_dict.items():
328 | nn.init.xavier_uniform_(embed)
329 | self.embed = nn.ParameterDict(embed_dict)
330 | # create layers
331 | self.layer1 = HeteroRGCNLayer(in_size, hidden_size, G.etypes)
332 | self.layer2 = HeteroRGCNLayer(hidden_size, out_size, G.etypes)
333 |
334 | def forward(self, G):
335 | h_dict = self.layer1(G, self.embed)
336 | h_dict = {k : F.leaky_relu(h) for k, h in h_dict.items()}
337 | h_dict = self.layer2(G, h_dict)
338 | # get paper logits
339 | return h_dict['paper']
340 |
341 | ###############################################################################
342 | # Train and evaluate
343 | # ~~~~~~~~~~~~~~~~~~
344 | # Train and evaluate this network.
345 |
346 | # Create the model. The output has three logits for three classes.
347 | model = HeteroRGCN(G, 10, 10, 3)
348 |
349 | opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
350 |
351 | best_val_acc = 0
352 | best_test_acc = 0
353 |
354 | for epoch in range(100):
355 | logits = model(G)
356 | # The loss is computed only for labeled nodes.
357 | loss = F.cross_entropy(logits[train_idx], labels[train_idx])
358 |
359 | pred = logits.argmax(1)
360 | train_acc = (pred[train_idx] == labels[train_idx]).float().mean()
361 | val_acc = (pred[val_idx] == labels[val_idx]).float().mean()
362 | test_acc = (pred[test_idx] == labels[test_idx]).float().mean()
363 |
364 | if best_val_acc < val_acc:
365 | best_val_acc = val_acc
366 | best_test_acc = test_acc
367 |
368 | opt.zero_grad()
369 | loss.backward()
370 | opt.step()
371 |
372 | if epoch % 5 == 0:
373 | print('Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
374 | loss.item(),
375 | train_acc.item(),
376 | val_acc.item(),
377 | best_val_acc.item(),
378 | test_acc.item(),
379 | best_test_acc.item(),
380 | ))
381 |
382 | ###############################################################################
383 | # What's next?
384 | # ------------
385 | # * Check out our full implementation in PyTorch
386 | # `here `_.
387 | #
388 | # * We also provide the following model examples:
389 | #
390 | # * `Graph Convolutional Matrix Completion _`,
391 | # which we implement in MXNet
392 | # `here `_.
393 | #
394 | # * `Heterogeneous Graph Attention Network `_
395 | # requires transforming a heterograph into a homogeneous graph according to
396 | # a given metapath (i.e. a path template consisting of edge types). We
397 | # provide :func:`dgl.transform.metapath_reachable_graph` to do this. See full
398 | # implementation
399 | # `here `_.
400 | #
401 | # * `Metapath2vec `_ requires
402 | # generating random walk paths according to a given metapath. Please
403 | # refer to the full metapath2vec implementation
404 | # `here `_.
405 | #
406 | # * :doc:`Full heterograph API reference <../../api/python/heterograph>`.
407 |
--------------------------------------------------------------------------------
/mine_next/functions/homograph.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | import dgl
3 | import dgl.frame
4 | import torch
5 | import os, csv
6 | import pandas as pd
7 | from tqdm import tqdm
8 | import benepar
9 | from transformers import AutoTokenizer
10 | import string
11 | import dgl.nn.pytorch as dglnn
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 |
15 |
16 | class Classifier(nn.Module):
17 | def __init__(self, in_dim, hidden_dim, n_classes):
18 | super(Classifier, self).__init__()
19 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim)
20 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim)
21 | self.classify = nn.Linear(hidden_dim, n_classes)
22 | #self.cons_type_embeddings = nn.Embedding(82, 300)
23 |
24 | def forward(self, g, h, edge_type=None):
25 | # Apply graph convolution and activation.
26 | # cons_node_ids = g.filter(lambda nodes:nodes.data['dtype'] == 1 )
27 | # cc_edge_id = g.filter(lambda edges : edges.data['dtype'] == edge_type)
28 | # self_edge_id = g.filter(lambda edges : edges.data['dtype'] == 4)
29 | # cc_edge_id = torch.cat([cc_edge_id, self_edge_id], dim=0)
30 |
31 | h = F.relu(self.conv1(g, h))
32 | h = F.relu(self.conv2(g, h))
33 | with g.local_scope():
34 | g.ndata['h'] = h
35 | # Calculate graph representation by average readout.
36 | # (batch_size, hidden_dim)
37 | hg = dgl.mean_nodes(g, 'h')
38 | return self.classify(hg)
39 |
40 |
41 | class RGCN(nn.Module):
42 | def __init__(self, in_feats, hid_feats, out_feats, rel_names):
43 | super().__init__()
44 |
45 | self.conv1 = dglnn.HeteroGraphConv({
46 | rel: dglnn.GraphConv(in_feats, hid_feats)
47 | for rel in rel_names}, aggregate='sum')
48 | self.conv2 = dglnn.HeteroGraphConv({
49 | rel: dglnn.GraphConv(hid_feats, out_feats)
50 | for rel in rel_names}, aggregate='sum')
51 |
52 | def forward(self, graph, inputs):
53 | # inputs is features of nodes
54 | h = self.conv1(graph, inputs)
55 | h = {k: F.relu(v) for k, v in h.items()}
56 | h = self.conv2(graph, h)
57 | return h
58 |
59 |
60 | class HeteroClassifier(nn.Module):
61 | def __init__(self, in_dim, hidden_dim, n_classes, rel_names):
62 | super().__init__()
63 |
64 | self.rgcn = RGCN(in_dim, hidden_dim, hidden_dim, rel_names)
65 | self.classify = nn.Linear(hidden_dim, n_classes)
66 |
67 | def forward(self, g, h):
68 | #h = g.ndata['feat']
69 | h = self.rgcn(g, h)
70 | with g.local_scope():
71 | g.ndata['h'] = h
72 | # Calculate graph representation by average readout.
73 | hg = 0
74 | for ntype in g.ntypes:
75 | hg = hg + dgl.mean_nodes(g, 'h', ntype=ntype)
76 | return self.classify(hg)
77 |
78 |
79 | class Tree(object):
80 | def __init__(self, type):
81 | # self.start / self.end are counted in words: e.g. 'one' is word 0 and '.' is word 27, so the root node has start=0 and end=27
82 | self.parent = None
83 | self.num_children = 0
84 | self.children = list()
85 | self.type = type
86 | self.is_leaf = False
87 | self.start = -1
88 | self.end = -1
89 | self.idx = -1
90 |
91 | def add_child(self, child):
92 | child.parent = self
93 | self.num_children += 1
94 | self.children.append(child)
95 |
96 | def size(self):
97 | count = 1
98 | for i in range(self.num_children):
99 | count += self.children[i].size()
100 | return count
101 |
102 | def __str__(self):
103 | return self.type
104 |
105 | def __iter__(self):
106 | yield self
107 | for c in self.children:
108 | for x in c:
109 | yield x
110 |
111 | def get_cons_tag_vocab(data_path):
112 | tag2id = {}
113 | with open(data_path) as f:
114 | for line in f.readlines():
115 | tag, idx = line.strip().split('\t')
116 | tag2id[tag] = int(idx)
117 | return tag2id
118 |
119 | def span_starts_ends(node: Tree):
120 | if len(node.children) == 0:
121 | return
122 | for child in node.children:
123 | span_starts_ends(child)
124 |
125 | node.start = node.children[0].start
126 | node.end = node.children[-1].end
127 |
128 | def constituent_to_tree(tokenizer, constituent_string, sentence, word_offset, node_offset, num_orders=2):
129 | constituents = []
130 | temp_str = ""
131 | # split the parse string into parentheses, words, and constituent tags, e.g. ['(', 'S', '(', 'NP', '(', 'NP', '(', 'CD', 'one', ')', '(', 'ADJP', '(', 'RB', 'long', ')' ... ]
132 | for i, char in enumerate(constituent_string):
133 | if char == "(" or char == ")" or char == " ":
134 | if len(temp_str) != 0:
135 | constituents.append(temp_str)
136 | temp_str = ""
137 | if char != " ":
138 | constituents.append(char)
139 | else:
140 | temp_str += char
141 | # push node by node (NP, PP, etc.) onto a stack
142 | stack = []
143 | for cons in constituents:
144 | if cons != ")":
145 | stack.append(cons)
146 | else:
147 | tail = stack.pop()
148 | temp_constituents = []
149 | while tail != "(":
150 | temp_constituents.append(tail)
151 | tail = stack.pop()
152 |
153 | parent = Tree(temp_constituents[-1])
154 | for i in range(len(temp_constituents) - 2, -1, -1):
155 | if isinstance(temp_constituents[i], Tree):
156 | parent.add_child(temp_constituents[i])
157 | else:
158 | child = Tree(temp_constituents[i])
159 | parent.add_child(child)
160 | stack.append(parent)
161 | root = stack[-1]
162 | for node in root:
163 | if len(node.children) == 0:
164 | node.is_leaf = True
165 |
166 | for node in root:
167 | if node.is_leaf:
168 | node.start = word_offset
169 | node.end = word_offset
170 | word_offset += 1
171 | span_starts_ends(root)
172 |
173 | node_sequence = []
174 | # internal nodes are only nodes such as S, NP, VP, PP; leaf words like 'one' or 'long' are not included
175 | internal_nodes = []
176 | for node in root:
177 | if not node.is_leaf:
178 | internal_nodes.append(node)
179 | node_sequence.append(node)
180 |
181 | node_offset_original = node_offset
182 | for node in root:
183 | if node.is_leaf:
184 | continue
185 | node.idx = node_offset
186 | node_offset += 1
187 |
188 | constituent_sequence = [] # [(node idx, node start, node end, node type, parent idx)]
189 | num_internal_nodes = len(internal_nodes)
190 | # constituent_edge
191 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)]
192 | for i, node in enumerate(internal_nodes):
193 | parent_idx = node.parent.idx if node.parent else -1
194 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx))
195 | if parent_idx != -1:
196 | constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1 # together with the next line, this adds the edge in both directions
197 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1
198 | # this part adds skip-level (grandparent) edges: e.g. S and PP are not directly connected in the tree but get an edge here
199 | high_order_sequence = [constituent_sequence]
200 | for i in range(1, num_orders):
201 | new_constituent_sequence = []
202 | for idx, start, end, type, parent_idx in high_order_sequence[-1]:
203 | if parent_idx == -1:
204 | continue
205 | parent_node = constituent_sequence[parent_idx - node_offset_original]
206 | if parent_node[-1] == -1:
207 | continue
208 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1]))
209 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1
210 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1
211 | high_order_sequence.append(new_constituent_sequence)
212 | return high_order_sequence, word_offset, node_offset
213 |
214 | def final_graph(constituent_list, graph):
215 | cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt')
216 | forward_edge_type, backward_edge_type = 0, 2
217 |
218 | prev_root_node_id = None
219 | constituent_labels = []
220 | for high_order_sent_cons in constituent_list:
221 | for i, sent_cons in enumerate(high_order_sent_cons):
222 | for idx, start, end, label, parent_idx in sent_cons:
223 | idx_nodeid = idx # originally constituent_start_idx = 0 and node_id_offset = 406 (token ids came first: of 1063 ids, the first 406 were tokens and the rest were constituent nodes)
224 | # node with no parent (sentence root)
225 | if parent_idx == -1:
226 | if prev_root_node_id is not None:
227 | # graph.add_edges(prev_root_node_id, idx_nodeid,
228 | # data={'cc_link': torch.tensor([forward_edge_type + i]),
229 | # 'dtype': torch.tensor([forward_edge_type + i])})
230 | # # dual GAT
231 | # graph.add_edges(idx_nodeid, prev_root_node_id,
232 | # data={'cc_link': torch.tensor([backward_edge_type + i]),
233 | # 'dtype': torch.tensor([backward_edge_type + i])})
234 | graph.add_edges(prev_root_node_id, idx_nodeid,
235 | data={'cc_link': torch.tensor([1]),
236 | 'dtype': torch.tensor([1])})
237 | # dual GAT
238 | graph.add_edges(idx_nodeid, prev_root_node_id,
239 | data={'cc_link': torch.tensor([1]),
240 | 'dtype': torch.tensor([1])})
241 | prev_root_node_id = idx_nodeid
242 | # nodes that have a parent
243 | if parent_idx != -1:
244 | parent_idx_nodeid = parent_idx
245 | # graph.add_edges(parent_idx_nodeid, idx_nodeid,
246 | # data={'cc_link': torch.tensor([forward_edge_type + i]),
247 | # 'dtype': torch.tensor([forward_edge_type + i])})
248 | # graph.add_edges(idx_nodeid, parent_idx_nodeid,
249 | # data={'cc_link': torch.tensor([backward_edge_type + i]),
250 | # 'dtype': torch.tensor([backward_edge_type + i])})
251 | graph.add_edges(parent_idx_nodeid, idx_nodeid,
252 | data={'cc_link': torch.tensor([1]),
253 | 'dtype': torch.tensor([1])})
254 | graph.add_edges(idx_nodeid, parent_idx_nodeid,
255 | data={'cc_link': torch.tensor([1]),
256 | 'dtype': torch.tensor([1])})
257 | if i == 0:
258 | # self-loop edge
259 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([4]),
260 | # 'dtype': torch.tensor([4])})
261 | graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]),
262 | 'dtype': torch.tensor([1])})
263 | constituent_labels.append(cons_tag2id[label])
264 |
265 | constituent_labels = torch.tensor(constituent_labels,dtype=torch.long)
266 | return graph, constituent_labels
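# In this homogeneous version every edge carries 'cc_link' = 'dtype' = 1; the commented-out calls
# above kept the earlier typed-edge scheme (forward/backward parent-child edges 0-3, self-loop type 4).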
267 |
268 | def all_process_graph(nlp, tokenizer, sentence):
269 | sentence_doc = nlp(sentence)
270 | sentence_sent = list(sentence_doc.sents)[0]
271 | parse_string = sentence_sent._.parse_string
272 | word_offset, node_offset = 0, 0
273 | constituent = []
274 | constituent_sequence, word_offset, node_offset = \
275 | constituent_to_tree(tokenizer, parse_string, sentence, word_offset, node_offset)
276 | constituent.append(constituent_sequence)
277 |
278 | graph = dgl.graph([])
279 | graph.set_n_initializer(dgl.frame.zero_initializer)
280 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent])
281 |
282 | graph.add_nodes(num_cons)
283 | graph.ndata['unit'] = torch.ones(num_cons)
284 | graph.ndata['dtype'] = torch.ones(num_cons)
285 |
286 | claim_graph, constituent_labels = \
287 | final_graph(constituent, graph)
288 | return claim_graph, constituent_labels
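# For a single sentence, all_process_graph returns:
#   claim_graph        - a dgl.DGLGraph with one node per internal constituent (S, NP, VP, ...) and
#                        bidirectional parent-child edges, skip-level edges, and self-loops
#   constituent_labels - a LongTensor of constituent-tag ids taken from constituent_gold_vocab.txt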
289 |
290 |
291 | if __name__ == "__main__":
292 | nlp = spacy.load('en_core_web_sm')
293 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'})
294 | tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=False, use_fast=False)
295 |
296 | dev_data = pd.read_csv('../../data/IAM/claims/dev.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
297 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
298 | dev_data = dev_data.dropna(axis=0)
299 |
300 | dev_sentences = dev_data['claim_sentence'].tolist()[:10]
301 | total_dev_constituent_label = []
302 | printable = set(string.printable)
303 | total_graph = {}
304 | cons_type_embeddings = nn.Embedding(82, 300)
305 | model = Classifier(300, 300, 2) # for testing the homogeneous-graph classifier
306 |
307 | for idx, dev in tqdm(enumerate(dev_sentences), total=len(dev_sentences)):
308 | dev = dev.lower().replace('“', '"').replace('”', '"')
309 | dev = "".join(filter(lambda x: x in printable, dev))
310 |
311 | dev_graph, dev_constituent_label = all_process_graph(nlp, tokenizer, dev)
312 | total_dev_constituent_label.append([dev_constituent_label])
313 | total_graph[idx] = dev_graph
314 | cons_node_feat = cons_type_embeddings(dev_constituent_label)
315 | #etypes = ['0', '1', '2', '3', '4']
316 | #model = HeteroClassifier(300, 300, 2, etypes)
317 | #print(dev_graph.edges(form='all'))
318 | logits = model(dev_graph, cons_node_feat) # homo
319 | #logits = model(dev_graph, cons_node_feat) # hetero
320 | print(logits)
321 |
322 |
323 |
--------------------------------------------------------------------------------
/mine_next/functions/main_function.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | import os
3 | from tqdm import tqdm
4 | import torch
5 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
6 | from transformers.optimization import AdamW, get_linear_schedule_with_warmup
7 | from transformers import AutoConfig, AutoTokenizer
8 | from sklearn.metrics import classification_report, accuracy_score
9 | from sklearn.utils import resample
10 |
11 | import csv
12 | import numpy as np
13 | import pandas as pd
14 | import json
15 |
16 | # from source.claim_classification.model.modeling import KoElectraForClaimClassification
17 | # from source.claim_classification.func.dataset import convert_data2tensordataset
18 |
19 | from mine_next.model.modeling import RobertaForClassification
20 | from mine_next.functions.dataset import (
21 | convert_data2tensordataset,
22 | convert_stance_data2tensordataset,
23 | convert_only_sentence2tensordataset
24 | )
25 |
26 |
27 | def random_downsampling(dataset):
28 | major = dataset[dataset['claim_label'] == 'O']
29 | minor = dataset[dataset['claim_label'] == 'C']
30 | sampling_data = resample(major, replace=True, n_samples=len(minor)*5, random_state=42)
31 | train_data = pd.concat([sampling_data, minor])
32 | return train_data
33 |
34 |
35 | def random_upsampling(dataset):
36 | major = dataset[dataset['claim_label'] == 'O']
37 | minor = dataset[dataset['claim_label'] == 'C']
38 | sampling_data = resample(minor, replace=True, n_samples=len(major), random_state=42)
39 | train_data = pd.concat([sampling_data, major])
40 | return train_data
41 |
42 | def do_train(config, model, optimizer, scheduler, train_dataloader, epoch, global_step, total_graph):
43 | losses = []
44 | total_predicts, total_corrects = [], []
45 | for step, batch in tqdm(enumerate(train_dataloader), desc='do_train(epoch_{})'.format(epoch), total=len(train_dataloader)):
46 | batch = tuple(t.cuda() for t in batch)
47 |         # when training jointly with the constituent graph
48 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]
49 | # constituent_labels = batch[6]
50 | # loss, predicts = model(
51 | # idx=idx,
52 | # input_ids=input_ids,
53 | # attention_mask=attention_mask,
54 | # token_type_ids=token_type_ids,
55 | # labels=labels,
56 | # sim_labels=sim_labels,
57 | # all_graph=total_graph,
58 | # constituent_labels=constituent_labels
59 | # )
60 | # base
61 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \
62 | batch[4], batch[5]
63 | loss, predicts = model(
64 | idx=idx,
65 | input_ids=input_ids,
66 | attention_mask=attention_mask,
67 | token_type_ids=token_type_ids,
68 | labels=labels,
69 | sim_labels=sim_labels,
70 | )
71 | predicts = predicts.argmax(dim=-1)
72 | predicts = predicts.cpu().detach().numpy().tolist()
73 | labels = labels.cpu().detach().numpy().tolist()
74 |
75 | total_predicts.extend(predicts)
76 | total_corrects.extend(labels)
77 |
78 | if config.gradient_accumulation_steps > 1:
79 | loss = loss / config.gradient_accumulation_steps
80 |         # loss starts out as a raw tensor, e.g. tensor(0.7255)
81 | loss.backward()
82 | losses.append(loss.data.item())
83 | if (step + 1) % config.gradient_accumulation_steps == 0 or \
84 | (len(train_dataloader) <= config.gradient_accumulation_steps and (step + 1) == len(
85 | train_dataloader)):
86 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
87 | optimizer.step()
88 | scheduler.step()
89 |
90 | model.zero_grad()
91 | global_step += 1
92 | target_names = ['class 0', 'class 1']
93 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4))
94 | accuracy = accuracy_score(total_corrects, total_predicts)
95 | return accuracy, np.mean(losses), global_step
96 |
97 |
98 | def do_evaluate(model, dev_dataloader, total_graph):
99 | total_predicts, total_corrects = [], []
100 | for step, batch in tqdm(enumerate(dev_dataloader), desc="do_evaluate", total=len(dev_dataloader)):
101 | batch = tuple(t.cuda() for t in batch)
102 |         # when using the graph
103 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]
104 | # constituent_labels = batch[6]
105 | # predicts = model(
106 | # idx=idx,
107 | # input_ids=input_ids,
108 | # attention_mask=attention_mask,
109 | # token_type_ids=token_type_ids,
110 | # all_graph=total_graph,
111 | # constituent_labels=constituent_labels
112 | # )
113 | # base
114 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \
115 | batch[4], batch[5]
116 | predicts = model(
117 | idx=idx,
118 | input_ids=input_ids,
119 | attention_mask=attention_mask,
120 | token_type_ids=token_type_ids,
121 | )
122 | predicts = predicts.argmax(dim=-1)
123 | predicts = predicts.detach().cpu().tolist()
124 | labels = labels.detach().cpu().tolist()
125 | total_predicts.extend(predicts)
126 | total_corrects.extend(labels)
127 | target_names = ['class 0', 'class 1']
128 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4))
129 | accuracy = accuracy_score(total_corrects, total_predicts)
130 | return accuracy, total_predicts
131 |
132 |
133 | def train(config, model, tokenizer):
134 |
135 |     # load the datasets
136 | train_data = pd.read_csv(config.claim_train, sep='\t', header=None, quoting=csv.QUOTE_NONE)
137 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
138 | train_data = train_data.dropna(axis=0)
139 | # train_data = train_data[:100]
140 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE)
141 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
142 | dev_data = dev_data.dropna(axis=0)
143 | # dev_data = dev_data[:100]
144 |
145 | #train_data = random_upsampling(train_data)
146 | train_dataset, train_total_graph = convert_only_sentence2tensordataset(train_data, tokenizer, config.max_length, 'train')
147 | dev_dataset, dev_total_graph = convert_only_sentence2tensordataset(dev_data, tokenizer, config.max_length, 'dev')
148 |
149 |
150 | train_sampler = RandomSampler(train_dataset)
151 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.batch_size)
152 | dev_sampler = SequentialSampler(dev_dataset)
153 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size)
154 |
155 | t_total = len(train_dataloader) // config.gradient_accumulation_steps * config.epoch
156 | optimizer = AdamW(model.parameters(), lr=config.learning_rate)
157 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total)
158 |
159 | global_step = 0
160 | max_test_accuracy = 0
161 | model.zero_grad()
162 | for epoch in range(config.epoch):
163 | model.train()
164 | train_accuracy, average_loss, global_step = do_train(
165 | config=config, model=model,
166 | optimizer=optimizer, scheduler=scheduler,
167 | train_dataloader=train_dataloader, epoch=epoch+1, global_step=global_step, total_graph=train_total_graph)
168 | print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4)))
169 |
170 | model.eval()
171 | test_accuracy, _ = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=dev_total_graph)
172 | print("test_accuracy : {}\n".format(round(test_accuracy, 4)))
173 | output_dir = os.path.join(config.save_dir, "checkpoint-{}".format(epoch))
174 | if not os.path.exists(output_dir):
175 | os.makedirs(output_dir)
176 | model_to_save = model.module if hasattr(model, "module") else model
177 | model_to_save.save_pretrained(output_dir)
178 | tokenizer.save_pretrained(output_dir)
179 | torch.save(config, os.path.join(output_dir, "training_args.bin"))
180 |
181 |
182 | def evaluate(config, model, tokenizer):
183 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE)
184 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
185 | dev_data = dev_data.dropna(axis=0)
186 | # dev_data = dev_data[:10]
187 | # dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length)
188 | dev_dataset, total_graph = convert_only_sentence2tensordataset(dev_data, tokenizer, config.max_length, 'dev')
189 |
190 | dev_sampler = SequentialSampler(dev_dataset)
191 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size)
192 |
193 | test_accuracy, total_predicts = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=total_graph)
194 | print("test accuracy : {}".format(round(test_accuracy,4)))
195 | total_corrects = dev_data['claim_label'].tolist()
196 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects]
197 |     total_claim_sentence = dev_data['claim_sentence'].tolist()
198 |     error_list = []
199 |     for predict, correct, claim in zip(total_predicts, total_corrects, total_claim_sentence):
200 | if predict != correct:
201 | error = {}
202 | error['predict'] = predict
203 | error['correct'] = correct
204 | error['claim_sentence'] = claim
205 | error_list.append(error)
206 |
207 | with open('../mine/functions/dev_error.json', 'w', encoding='utf-8') as f:
208 | json.dump(error_list, f, indent=4)
209 |
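210 | # Checkpoints are written above with save_pretrained(), so they can be reloaded the same way
211 | # before calling evaluate(). A minimal sketch, assuming RobertaForClassification subclasses a
212 | # HuggingFace PreTrainedModel and "checkpoint-3" is an existing directory under config.save_dir
213 | # (the epoch number here is hypothetical):
214 | #
215 | # checkpoint_dir = os.path.join(config.save_dir, "checkpoint-3")
216 | # tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
217 | # model = RobertaForClassification.from_pretrained(checkpoint_dir)
218 | # model.cuda()
219 | # model.eval()
220 | # evaluate(config, model, tokenizer)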
--------------------------------------------------------------------------------
/mine_next/functions/main_function2.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | import os
3 | from tqdm import tqdm
4 | import torch
5 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
6 | from transformers.optimization import AdamW, get_linear_schedule_with_warmup
7 | from transformers import AutoConfig, AutoTokenizer
8 | from sklearn.metrics import classification_report, accuracy_score
9 | from sklearn.utils import resample
10 |
11 | import csv
12 | import numpy as np
13 | import pandas as pd
14 | import json
15 |
16 | # from source.claim_classification.model.modeling import KoElectraForClaimClassification
17 | # from source.claim_classification.func.dataset import convert_data2tensordataset
18 |
19 | from mine_next.model.modeling import RobertaForClassification
20 | from mine_next.functions.dataset import (
21 | convert_data2tensordataset,
22 | convert_stance_data2tensordataset,
23 | convert_only_sentence2tensordataset
24 | )
25 |
26 |
27 | def random_downsampling(dataset):
28 | major = dataset[dataset['claim_label'] == 'O']
29 | minor = dataset[dataset['claim_label'] == 'C']
30 | sampling_data = resample(major, replace=True, n_samples=len(minor)*5, random_state=42)
31 | train_data = pd.concat([sampling_data, minor])
32 | return train_data
33 |
34 | def random_upsampling(dataset):
35 | major = dataset[dataset['claim_label'] == 'O']
36 | minor = dataset[dataset['claim_label'] == 'C']
37 | sampling_data = resample(minor, replace=True, n_samples=len(major), random_state=42)
38 | train_data = pd.concat([sampling_data, major])
39 | return train_data
40 |
41 | def do_train(config, model, optimizer, scheduler, train_dataloader, epoch, global_step, total_graph):
42 | losses = []
43 | total_predicts, total_corrects = [], []
44 | for step, batch in tqdm(enumerate(train_dataloader), desc='do_train(epoch_{})'.format(epoch), total=len(train_dataloader)):
45 | batch = tuple(t.cuda() for t in batch)
46 |         # when training jointly with the constituent graph
47 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]
48 | constituent_labels_first, constituent_labels_second = batch[6], batch[7]
49 | loss, predicts = model(
50 | idx=idx,
51 | input_ids=input_ids,
52 | attention_mask=attention_mask,
53 | token_type_ids=token_type_ids,
54 | labels=labels,
55 | sim_labels=sim_labels,
56 | all_graph=total_graph,
57 | constituent_labels_first=constituent_labels_first,
58 | constituent_labels_second=constituent_labels_second
59 | )
60 | # base
61 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \
62 | # batch[4], batch[5]
63 | # loss, predicts = model(
64 | # idx=idx,
65 | # input_ids=input_ids,
66 | # attention_mask=attention_mask,
67 | # token_type_ids=token_type_ids,
68 | # labels=labels,
69 | # sim_labels=sim_labels,
70 | # )
71 | predicts = predicts.argmax(dim=-1)
72 | predicts = predicts.cpu().detach().numpy().tolist()
73 | labels = labels.cpu().detach().numpy().tolist()
74 |
75 | total_predicts.extend(predicts)
76 | total_corrects.extend(labels)
77 |
78 | if config.gradient_accumulation_steps > 1:
79 | loss = loss / config.gradient_accumulation_steps
80 |         # loss starts out as a raw tensor, e.g. tensor(0.7255)
81 | loss.backward()
82 | losses.append(loss.data.item())
83 | if (step + 1) % config.gradient_accumulation_steps == 0 or \
84 | (len(train_dataloader) <= config.gradient_accumulation_steps and (step + 1) == len(
85 | train_dataloader)):
86 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
87 | optimizer.step()
88 | scheduler.step()
89 |
90 | model.zero_grad()
91 | global_step += 1
92 | target_names = ['class 0', 'class 1']
93 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4))
94 | accuracy = accuracy_score(total_corrects, total_predicts)
95 | return accuracy, np.mean(losses), global_step
96 |
97 | def do_evaluate(model, dev_dataloader, total_graph):
98 | total_predicts, total_corrects = [], []
99 | for step, batch in tqdm(enumerate(dev_dataloader), desc="do_evaluate", total=len(dev_dataloader)):
100 | batch = tuple(t.cuda() for t in batch)
101 |         # when using the graph
102 | idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]
103 | constituent_labels_first, constituent_labels_second = batch[6], batch[7]
104 | predicts = model(
105 | idx=idx,
106 | input_ids=input_ids,
107 | attention_mask=attention_mask,
108 | token_type_ids=token_type_ids,
109 | all_graph=total_graph,
110 | constituent_labels_first=constituent_labels_first,
111 |             constituent_labels_second=constituent_labels_second
112 | )
113 | # base
114 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], \
115 | # batch[4], batch[5]
116 | # predicts = model(
117 | # idx=idx,
118 | # input_ids=input_ids,
119 | # attention_mask=attention_mask,
120 | # token_type_ids=token_type_ids,
121 | # )
122 | predicts = predicts.argmax(dim=-1)
123 | predicts = predicts.detach().cpu().tolist()
124 | labels = labels.detach().cpu().tolist()
125 | total_predicts.extend(predicts)
126 | total_corrects.extend(labels)
127 | target_names = ['class 0', 'class 1']
128 | result = classification_report(total_corrects, total_predicts, target_names=target_names, digits=4, output_dict=True)
129 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4))
130 | accuracy = accuracy_score(total_corrects, total_predicts)
131 | return accuracy, total_predicts, result['class 1']['f1-score']
132 |
133 | def train(config, model, tokenizer):
134 |
135 |     # load the datasets
136 | train_data = pd.read_csv(config.claim_train, sep='\t', header=None, quoting=csv.QUOTE_NONE)
137 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
138 | train_data = train_data.dropna(axis=0)
139 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE)
140 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
141 | dev_data = dev_data.dropna(axis=0)
142 |
143 | pseudo_train = json.load(open(config.train_pseudo_topic, encoding='utf-8'))
144 | pseudo_dev = json.load(open(config.dev_pseudo_topic, encoding='utf-8'))
145 | #train_data = random_upsampling(train_data)
146 |
147 | train_dataset, train_total_graph_first, train_total_graph_second = convert_only_sentence2tensordataset(
148 | train_data, pseudo_train, tokenizer, config.max_length, 'train')
149 | dev_dataset, dev_total_graph_first, dev_total_graph_second = convert_only_sentence2tensordataset(dev_data, pseudo_dev, tokenizer, config.max_length, 'dev')
150 |
151 | train_sampler = RandomSampler(train_dataset)
152 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.batch_size)
153 | dev_sampler = SequentialSampler(dev_dataset)
154 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size)
155 |
156 | t_total = len(train_dataloader) // config.gradient_accumulation_steps * config.epoch
157 | optimizer = AdamW(model.parameters(), lr=config.learning_rate)
158 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total)
159 |
160 | global_step = 0
161 | max_test_accuracy = 0
162 | max_claim_f1 = 0
163 | model.zero_grad()
164 | for epoch in range(config.epoch):
165 | model.train()
166 | train_accuracy, average_loss, global_step = do_train(
167 | config=config, model=model,
168 | optimizer=optimizer, scheduler=scheduler,
169 | train_dataloader=train_dataloader, epoch=epoch, global_step=global_step, total_graph=[train_total_graph_first, train_total_graph_second])
170 | print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4)))
171 |
172 | model.eval()
173 | test_accuracy, _, claim_f1 = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=[dev_total_graph_first, dev_total_graph_second])
174 | print("test_accuracy : {}\n".format(round(test_accuracy, 4)))
175 | if max_claim_f1 < claim_f1:
176 | output_dir = os.path.join(config.save_dir, "checkpoint-{}".format(epoch))
177 | if not os.path.exists(output_dir):
178 | os.makedirs(output_dir)
179 | model_to_save = model.module if hasattr(model, "module") else model
180 | model_to_save.save_pretrained(output_dir)
181 | tokenizer.save_pretrained(output_dir)
182 | torch.save(config, os.path.join(output_dir, "training_args.bin"))
183 | max_claim_f1 = claim_f1
184 |
185 | def evaluate(config, model, tokenizer):
186 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE)
187 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
188 | dev_data = dev_data.dropna(axis=0)
189 | # dev_data = dev_data[:10]
190 | # dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length)
191 | pseudo_dev = json.load(open(config.dev_pseudo_topic, encoding='utf-8'))
192 | dev_dataset, dev_total_graph_first, dev_total_graph_second = convert_only_sentence2tensordataset(dev_data, pseudo_dev, tokenizer, config.max_length, 'dev')
193 |
194 | dev_sampler = SequentialSampler(dev_dataset)
195 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size)
196 |
197 | test_accuracy, total_predicts, claim_f1 = do_evaluate(model=model, dev_dataloader=dev_dataloader, total_graph=[dev_total_graph_first, dev_total_graph_second])
198 | print("test accuracy : {}".format(round(test_accuracy,4)))
199 | total_corrects = dev_data['claim_label'].tolist()
200 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects]
201 | assert len(total_corrects) == len(total_predicts)
202 |     total_claim_sentence = dev_data['claim_sentence'].tolist()
203 |     error_list = []
204 |     for predict, correct, claim in zip(total_predicts, total_corrects, total_claim_sentence):
205 | error = {}
206 | error['predict'] = predict
207 | error['correct'] = correct
208 | error['claim_sentence'] = claim
209 | error_list.append(error)
210 |
211 | with open('../mine_next/functions/dev_error.json', 'w', encoding='utf-8') as f:
212 | json.dump(error_list, f, indent=4)
213 |
214 |
215 | def test(config, model, tokenizer):
216 | test_data = pd.read_csv(config.claim_test, sep='\t', header=None, quoting=csv.QUOTE_NONE)
217 | test_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
218 | test_data = test_data.dropna(axis=0)
219 | pseudo_test = json.load(open(config.test_pseudo_topic, encoding='utf-8'))
220 | test_dataset, test_total_graph_first, test_total_graph_second = convert_only_sentence2tensordataset(test_data, pseudo_test, tokenizer, config.max_length, 'test')
221 |
222 | test_sampler = SequentialSampler(test_dataset)
223 | test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=config.batch_size)
224 |
225 | test_accuracy, total_predicts, claim_f1 = do_evaluate(model=model, dev_dataloader=test_dataloader, total_graph=[test_total_graph_first, test_total_graph_second])
226 | print("test accuracy : {}".format(round(test_accuracy,4)))
227 | total_corrects = test_data['claim_label'].tolist()
228 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects]
229 | assert len(total_corrects) == len(total_predicts)
230 |     total_claim_sentence = test_data['claim_sentence'].tolist()
231 |     error_list = []
232 |     for predict, correct, claim in zip(total_predicts, total_corrects, total_claim_sentence):
233 | error = {}
234 | error['predict'] = predict
235 | error['correct'] = correct
236 | error['claim_sentence'] = claim
237 | error_list.append(error)
238 |
239 | with open('../mine_next/functions/test_error.json', 'w', encoding='utf-8') as f:
240 | json.dump(error_list, f, indent=4)
241 |
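242 | # do_evaluate() above keeps the best checkpoint by the claim-class F1 taken from
243 | # classification_report(..., output_dict=True)['class 1']['f1-score']. A tiny
244 | # self-contained example of that lookup (toy labels, not project data):
245 | #
246 | # from sklearn.metrics import classification_report
247 | # y_true = [0, 0, 1, 1, 1, 0]
248 | # y_pred = [0, 1, 1, 1, 0, 0]
249 | # report = classification_report(y_true, y_pred, target_names=['class 0', 'class 1'],
250 | #                                digits=4, output_dict=True)
251 | # claim_f1 = report['class 1']['f1-score']  # harmonic mean of class-1 precision and recall
252 | # print(round(claim_f1, 4))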
--------------------------------------------------------------------------------
/mine_next/functions/make_graph.py:
--------------------------------------------------------------------------------
1 | import benepar, spacy
2 | from nltk.tree import Tree as nltk_tree
3 | from nltk.treeprettyprinter import TreePrettyPrinter
4 | from nltk.draw.tree import TreeView
5 | import os, csv
6 | import pandas as pd
7 | from tqdm import tqdm
8 | import dgl
9 | from dgl import save_graphs, load_graphs
10 | from dgl.data.utils import makedirs, save_info, load_info
11 |
12 | import torch
13 | from transformers import BertTokenizer
14 |
15 | tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
16 | data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
17 | data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
18 | data = data.dropna(axis=0)
19 | data = data[data['claim_label'] == 'C']
20 | claims = data['claim_sentence'].tolist()
21 |
22 | def get_cons_tag_vocab(data_path):
23 | tag2id = {}
24 | with open(data_path) as f:
25 | for line in f.readlines():
26 | tag, idx = line.strip().split('\t')
27 | tag2id[tag] = int(idx)
28 | return tag2id
29 |
30 |
31 |
32 | class Tree(object):
33 | def __init__(self, type):
34 |         # self.start and self.end are counted in words: e.g. if 'one' is word 0 and '.' is word 27, the root node has start=0, end=27
35 | self.parent = None
36 | self.num_children = 0
37 | self.children = list()
38 | self.type = type
39 | self.is_leaf = False
40 | self.start = -1
41 | self.end = -1
42 | self.idx = -1
43 |
44 | def add_child(self, child):
45 | child.parent = self
46 | self.num_children += 1
47 | self.children.append(child)
48 |
49 | def size(self):
50 | count = 1
51 | for i in range(self.num_children):
52 | count += self.children[i].size()
53 |
54 | return count
55 |
56 | def __str__(self):
57 | return self.type
58 |
59 | def __iter__(self):
60 | yield self
61 | for c in self.children:
62 | for x in c:
63 | yield x
64 |
65 | def span_starts_ends(node: Tree):
66 | if len(node.children) == 0:
67 | return
68 | for child in node.children:
69 | span_starts_ends(child)
70 |
71 | node.start = node.children[0].start
72 | node.end = node.children[-1].end
73 |
74 | def constituent_to_tree(constituent_string, word_offset, node_offset, num_orders=2):
75 | constituents = []
76 | temp_str = ""
77 | words = []
78 | subtokens = []
79 | subtoken_map = []
80 |     # split the parse string into '(' / ')', words, and constituent labels, e.g. ['(', 'S', '(', 'NP', '(', 'NP', '(', 'CD', 'one', ')', '(', 'ADJP', '(', 'RB', 'long', ')' ... ]
81 | for i, char in enumerate(constituent_string):
82 | if char == "(" or char == ")" or char == " ":
83 | if len(temp_str) != 0:
84 | constituents.append(temp_str)
85 | temp_str = ""
86 | if char != " ":
87 | constituents.append(char)
88 | else:
89 | temp_str += char
90 |     # push nodes such as NP, PP onto a stack
91 | stack = []
92 | for cons in constituents:
93 | if cons != ")":
94 | stack.append(cons)
95 | else:
96 | tail = stack.pop()
97 | temp_constituents = []
98 | while tail != "(":
99 | temp_constituents.append(tail)
100 | tail = stack.pop()
101 |
102 | parent = Tree(temp_constituents[-1])
103 | for i in range(len(temp_constituents) - 2, -1, -1):
104 | if isinstance(temp_constituents[i], Tree):
105 | parent.add_child(temp_constituents[i])
106 | else:
107 |                     # leaf flags are set to True later, while visiting the finished tree, after children are attached to their parent
108 | child = Tree(temp_constituents[i])
109 | parent.add_child(child)
110 | stack.append(parent)
111 | root = stack[-1]
112 |     # visit every node and check whether it is a leaf
113 | map_count = 0
114 | for node in root:
115 | if len(node.children) == 0:
116 | node.is_leaf = True
117 | words.append(str(node))
118 | node_token = tokenizer.tokenize(str(node))
119 | subtokens.extend(node_token)
120 | subtoken_map.extend([map_count]*len(node_token))
121 | map_count += 1
122 |
123 | word_offset_original = word_offset
124 | for node in root:
125 | if node.is_leaf:
126 | node.start = word_offset
127 | node.end = word_offset
128 | word_offset += 1
129 | span_starts_ends(root)
130 |
131 | node_sequence = []
132 |     # internal_nodes holds only nodes such as S, NP, VP, PP; word nodes like 'one' or 'long' are excluded
133 | internal_nodes = []
134 | for node in root:
135 | if not node.is_leaf:
136 | internal_nodes.append(node)
137 | node_sequence.append(node)
138 | node_offset_original = node_offset
139 | for node in root:
140 | if node.is_leaf:
141 | # or node.type in [":", "``", ".", ",", "XX", "X", "-LRB-", "-RRB-", "''", "HYPH"]
142 | continue
143 | node.idx = node_offset
144 | node_offset += 1
145 | constituent_sequence = [] # [(idx, start, end, type, parent idx)]
146 | num_internal_nodes = len(internal_nodes)
147 | # constituent_edge
148 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)]
149 | for i, node in enumerate(internal_nodes):
150 | # if node.type in [":", "``", ".", ",", "XX", "X", "-LRB-", "-RRB-", "''", "HYPH"]:
151 | # continue
152 | parent_idx = node.parent.idx if node.parent else -1
153 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx))
154 | if parent_idx != -1:
155 |             constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1  # together with the next line this makes the edge bidirectional
156 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1
157 |     # this part adds edges that skip one level (grandparent-grandchild); e.g. S and PP are not directly connected in the original tree but become connected here
158 | high_order_sequence = [constituent_sequence]
159 | for i in range(1, num_orders):
160 | new_constituent_sequence = []
161 | for idx, start, end, type, parent_idx in high_order_sequence[-1]:
162 | if parent_idx == -1:
163 | continue
164 | parent_node = constituent_sequence[parent_idx - node_offset_original]
165 | if parent_node[-1] == -1:
166 | continue
167 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1]))
168 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1
169 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1
170 | high_order_sequence.append(new_constituent_sequence)
171 | return high_order_sequence, word_offset, node_offset, subtoken_map, subtokens
172 |
173 |
174 | def print_parse_string(claim_list):
175 | for claim in claim_list:
176 | input_string = claim.lower()
177 | doc = nlp(input_string)
178 | sent = list(doc.sents)[0]
179 | print(sent)
180 | parse_string = sent._.parse_string
181 | print(parse_string)
182 |
183 |
184 | def save(self):
185 | # save graphs and labels
186 | self.save_path = '.'
187 | self.mode = 'test'
188 | graph_path = os.path.join(self.save_path, self.mode + '_dgl_graph.bin')
189 | save_graphs(graph_path, self.graphs, {'labels': self.labels})
190 | # save other information in python dict
191 | info_path = os.path.join(self.save_path, self.mode + '_info.pkl')
192 | save_info(info_path, {'num_classes': self.num_classes})
193 |
194 | def load(self):
195 | # load processed data from directory `self.save_path`
196 | self.save_path = '.'
197 | self.mode = 'test'
198 | graph_path = os.path.join(self.save_path, self.mode + '_dgl_graph.bin')
199 | self.graphs, label_dict = load_graphs(graph_path)
200 | self.labels = label_dict['labels']
201 | info_path = os.path.join(self.save_path, self.mode + '_info.pkl')
202 | self.num_classes = load_info(info_path)['num_classes']
203 |
204 | def has_cache(self):
205 | # check whether there are processed data in `self.save_path`
206 | self.save_path = '.'
207 | self.mode = 'test'
208 | graph_path = os.path.join(self.save_path, self.mode + '_dgl_graph.bin')
209 | info_path = os.path.join(self.save_path, self.mode + '_info.pkl')
210 | return os.path.exists(graph_path) and os.path.exists(info_path)
211 |
212 |
213 | #print_parse_string(claims)
214 |
215 |
216 |
217 |
218 | # doc = nlp(input_string)
219 | #
220 | # sent = list(doc.sents)[0]
221 | # print(sent)
222 | # parse_string = sent._.parse_string
223 | # print(parse_string)
224 | #
225 | # # originally the read_constituents part of tree.py
226 | # constituents = []
227 | # word_offset, node_offset = 0, 0
228 | # constituent = []
229 | # constituent_sequence, word_offset, node_offset, subtoken_map, subtokens = constituent_to_tree(parse_string, word_offset, node_offset)
230 | # subtoken_map = torch.tensor(subtoken_map, dtype=torch.int64)
231 | # print('constituent sequence : ', constituent_sequence) # index 0 of the constituent sequence holds the original parent-child nodes, index 1 the grandparent/grandchild links
232 | # print('word offset , node offset', word_offset, node_offset)
233 | # constituent.append(constituent_sequence)
234 | # constituents.append(constituent)
235 | #
236 | # # build the graph
237 | # num_tokens = subtoken_map.size()[0] # number of subtokens after tokenizing the sentence
238 | # num_cons = sum([len(sent_cons[0]) for sent_cons in constituent]) # number of constituent nodes
239 | # graph = dgl.graph([])
240 | # graph.set_n_initializer(dgl.frame.zero_initializer)
241 | # print(graph)
242 | #
243 | # # add the token nodes to the graph
244 | # graph.add_nodes(num_tokens)
245 | # graph.ndata['unit'] = torch.zeros(num_tokens)
246 | # graph.ndata['dtype'] = torch.zeros(num_tokens)
247 | #
248 | # # constituent-tree part of the graph
249 | # graph.add_nodes(num_cons)
250 | # graph.ndata['unit'][num_tokens:] = torch.ones(num_cons)
251 | # graph.ndata['dtype'][num_tokens:] = torch.ones(num_cons)
252 | #
253 | #
254 | # constituent_starts = []
255 | # constituent_ends = []
256 | # constituent_labels = []
257 | # prev_root_node_id = None
258 | # forward_edge_type, backward_edge_type = 0, 2
259 | # constituent_start_idx = 0
260 | # node_id_offset = 0
261 | # num_tokens = len(subtoken_map)
262 | # token_range = torch.arange(0, num_tokens, dtype=torch.int64)
263 | # cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt')
264 | #
265 | #
266 | # for high_order_sent_cons in constituent:
267 | # for i, sent_cons in enumerate(high_order_sent_cons):
268 | # for idx, start, end, label, parent_idx in sent_cons:
269 | #             idx_nodeid = idx - constituent_start_idx + node_id_offset # originally constituent_start_idx = 0 and node_id_offset = 406 (token ids came first: of 1063 nodes, the first 406 were token nodes and the rest constituent nodes)
270 | #             # node without a parent (root)
271 | # if parent_idx == -1:
272 | # if prev_root_node_id is not None:
273 | # graph.add_edges(prev_root_node_id, idx_nodeid,
274 | # data={'cc_link': torch.tensor([forward_edge_type + i]),
275 | # 'dtype': torch.tensor([forward_edge_type + i])})
276 | # # dual GAT
277 | # graph.add_edges(idx_nodeid, prev_root_node_id,
278 | # data={'cc_link': torch.tensor([backward_edge_type + i]),
279 | # 'dtype': torch.tensor([backward_edge_type + i])})
280 | # prev_root_node_id = idx_nodeid
281 | #             # nodes with a parent
282 | # if parent_idx != -1:
283 | # parent_idx_nodeid = parent_idx - constituent_start_idx + node_id_offset
284 | # graph.add_edges(parent_idx_nodeid, idx_nodeid,
285 | # data={'cc_link': torch.tensor([forward_edge_type + i]),
286 | # 'dtype': torch.tensor([forward_edge_type + i])})
287 | # graph.add_edges(idx_nodeid, parent_idx_nodeid,
288 | # data={'cc_link': torch.tensor([backward_edge_type + i]),
289 | # 'dtype': torch.tensor([backward_edge_type + i])})
290 | #
291 | # if i == 0:
292 | # # self-loop edge
293 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([4]),
294 | # 'dtype': torch.tensor([4])})
295 | # # constituent -> token
296 | # token_start = token_range[subtoken_map == start][0]
297 | # token_end = token_range[subtoken_map == end][-1]
298 | # graph.add_edges(idx_nodeid, token_start, data={'ct_link': torch.tensor([5]),
299 | # 'dtype': torch.tensor([5])})
300 | # graph.add_edges(idx_nodeid, token_end, data={'ct_link': torch.tensor([5]),
301 | # 'dtype': torch.tensor([5])})
302 | # constituent_starts.append(token_start)
303 | # constituent_ends.append(token_end)
304 | # constituent_labels.append(cons_tag2id[label])
305 | #
306 | # print(graph)
307 | # # ndata
308 | # # unit 0 is a token node, 1 is a constituent node
309 | # #print(graph.ndata)
310 | # print('graph ndata unit',graph.ndata['unit'])
311 | # print('graph ndata dtype', graph.ndata['dtype'])
312 | #
313 | # # edata
314 | # # cc link : 4 (self-loop edge), node-token : 5 (constituent-token edge) -> this is the ordinary parent-child tree, nothing to do with the grand(parent) links
315 | # # forward edge type(cc link) : 0, backward edge type(cc link) : 2 -> ordinary parent-child tree
316 | # # forward edge type(cc link) : 1, backward edge type(cc link) : 3 -> grandparent-grandchild tree
317 | # #print(graph.edata)
318 | # print('graph edata cc link', graph.edata['cc_link'])
319 | # print('graph edata ct link', graph.edata['ct_link'])
320 | # print('graph edata dtype', graph.edata['dtype'])
321 | #
322 | # dgl.save_graphs('graph.dgl', graph)
323 | # (g,), _ = dgl.load_graphs('graph.dgl')
324 |
325 |
326 |
327 | nlp = spacy.load('en_core_web_sm')
328 | nlp.add_pipe('benepar', config={'model':'benepar_en3'})
329 | input_string = 'Effects in the classroom'
330 | input_string = input_string.lower()
331 | doc = nlp(input_string)
332 |
333 | sent = list(doc.sents)[0]
334 | # print(sent)
335 | # parse_string = sent._.parse_string
336 | # print(parse_string)
337 | # #
338 | # # for tok in doc:
339 | # #
340 | # # print()
341 | # t = nltk_tree.fromstring(sent._.parse_string)
342 | # TreeView(t)._cframe.print_to_file('output1.ps')
343 | # os.system('convert output1.ps output1.png')
344 | #
345 | # t = nltk_tree.fromstring(sent._.parse_string)
346 | # print(TreePrettyPrinter(t).text())
347 |
348 | from nltk import Tree
349 | from nltk.draw.util import CanvasFrame
350 | from nltk.draw import TreeWidget
351 |
352 | cf = CanvasFrame()
353 | t = Tree.fromstring(sent._.parse_string)
354 | tc = TreeWidget(cf.canvas(),t)
355 | tc['node_font'] = 'arial 14 bold'
356 | tc['leaf_font'] = 'arial 14'
357 | tc['node_color'] = '#005990'
358 | tc['leaf_color'] = '#3F8F57'
359 | tc['line_color'] = '#175252'
360 | cf.add_widget(tc,10,10) # (10,10) offsets
361 | cf.print_to_file('tree1.ps')
362 | cf.destroy()
363 | os.system('convert tree1.ps tree1.png')
364 |
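365 | # Worked example of what constituent_to_tree() returns for a toy parse, assuming
366 | # word_offset = node_offset = 0 (hand-traced from the code above, not an actual run):
367 | #
368 | #   parse string : (S (NP (DT the) (NN cat)) (VP (VBZ sleeps)))
369 | #   internal nodes in pre-order : S=0, NP=1, DT=2, NN=3, VP=4, VBZ=5
370 | #
371 | #   first-order sequence [(idx, start, end, type, parent_idx)]:
372 | #     (0, 0, 2, 'S', -1), (1, 0, 1, 'NP', 0), (2, 0, 0, 'DT', 1),
373 | #     (3, 1, 1, 'NN', 1), (4, 2, 2, 'VP', 0), (5, 2, 2, 'VBZ', 4)
374 | #   second-order sequence (grandparent links; nodes whose parent is the root are skipped):
375 | #     (2, 0, 0, 'DT', 0), (3, 1, 1, 'NN', 0), (5, 2, 2, 'VBZ', 0)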
--------------------------------------------------------------------------------
/mine_next/functions/pos_analy.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | from spacy import displacy
3 | from collections import Counter
4 | import pandas as pd
5 | import os
6 | import csv
7 | from pathlib import Path
8 | from nltk.tree import Tree
9 | from nltk.parse.corenlp import CoreNLPParser
10 | from nltk.parse.stanford import StanfordParser
11 | nlp = spacy.load("en_core_web_sm")
12 | data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
13 | data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
14 | data = data.dropna(axis=0)
15 | data = data[data['claim_label'] == 'C']
16 | topics = data['topic_sentence'].tolist()
17 | claims = data['claim_sentence'].tolist()
18 | # with open('../../data/IAM/all_claim_sentence.txt', 'r', encoding='utf-8') as txt_file:
19 | # all_claims = txt_file.readlines()
20 |
21 | counter = Counter()
22 | claim_dep = []
23 | claim_pos = []
24 |
25 | # doc = nlp(claims[0])
26 | #
27 | #
28 | # def token_format(token):
29 | # return "_".join([token.orth_, token.tag_, token.dep_])
30 | #
31 | # def to_nltk_tree(node):
32 | # if node.n_lefts + node.n_rights > 0:
33 | # return Tree(token_format(node),
34 | # [to_nltk_tree(child)
35 | # for child in node.children]
36 | # )
37 | # else:
38 | # return token_format(node)
39 | # tree = [to_nltk_tree(sent.root) for sent in doc.sents]
40 | # # The first item in the list is the full tree
41 | # tree[0].draw()
42 |
43 | # # os.environ['CLASSPATH'] = '../../stanford/*'
44 | parser = CoreNLPParser(url='http://localhost:9000')
45 | #parser = StanfordParser(model_path="../../stanford/edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz")
46 | def nltk_spacy_tree(sent):
47 | doc = nlp(sent)
48 | def token_format(token):
49 | return "_".join([token.orth_, token.tag_, token.dep_])
50 | def to_nltk_tree(node):
51 | if node.n_lefts + node.n_rights > 0:
52 | return Tree(token_format(node), [to_nltk_tree(child) for child in node.children])
53 | else:
54 | return token_format(node)
55 | tree = [to_nltk_tree(sent.root) for sent in doc.sents]
56 | print(tree[0])
57 | nltk_spacy_tree(claims[0])
58 |
59 | def nltk_stanford_tree(sent):
60 | parse = parser.raw_parse(sent)
61 | tree = list(parse)
62 | print(tree[0].draw())
63 |
64 | #nltk_stanford_tree(claims[0])
65 |
66 | # nlp = stanfordnlp.Pipeline(processors='tokenize,pos')
67 | # doc = nlp(claims[0])
68 | # print(doc)
69 |
70 | '''
71 | Code for drawing dependency parse trees
72 | '''
73 | # for idx, claim in enumerate(claims[:1]):
74 | # doc = nlp(claim)
75 | # sentence_spans = list(doc.sents)
76 | # #displacy.serve(doc, style='dep')
77 | #
78 | # svg = displacy.render(sentence_spans, style='dep')
79 | # output_path = Path('../../data/IAM/dep_claim_img/sentence_{}.svg'.format(idx))
80 | # output_path.open('w', encoding='utf-8').write(svg)
81 | # for tok in doc:
82 |
83 |
84 |
85 |
86 | # sentence_dep = []
87 | # sentence_pos = []
88 | # lemma = []
89 | # for tok in doc:
90 | # sentence_dep.append(tok.dep_)
91 | # sentence_pos.append(tok.pos_)
92 | # if tok.pos_ == 'VERB':
93 | # lemma.append(tok.lemma_)
94 | # claim_dep.append(sentence_dep)
95 | # claim_pos.append(sentence_pos)
96 | # counter.update(lemma)
97 | # print(counter)
98 |
99 | # with open('../../data/IAM/train_claim_pos.txt', 'w', encoding='utf-8') as pos_file:
100 | # for pos in claim_pos:
101 | # pos_file.write(' '.join(pos))
102 | # pos_file.write('\n')
103 | # with open('../../data/IAM/train_claim_dep.txt', 'w', encoding='utf-8') as dep_file:
104 | # for dep in claim_dep:
105 | # dep_file.write(' '.join(dep))
106 | # dep_file.write('\n')
107 |
108 | # for tok in doc:
109 | # print(tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_)
110 | # print()
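111 | # Note: nltk_stanford_tree() talks to a CoreNLP server at http://localhost:9000, so a server
112 | # has to be running first. One way to start it, assuming the Stanford CoreNLP distribution has
113 | # been downloaded and its jars are in the current directory:
114 | #
115 | #   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000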
--------------------------------------------------------------------------------
/mine_next/functions/save_graph.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/functions/save_graph.py
--------------------------------------------------------------------------------
/mine_next/functions/sent2_to_graph.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | import dgl
3 | import dgl.frame
4 | import torch
5 | import os, csv
6 | import pandas as pd
7 | from tqdm import tqdm
8 | import benepar
9 | from transformers import AutoTokenizer
10 | import string
11 |
12 |
13 | class Tree(object):
14 | def __init__(self, type):
15 | self.parent = None
16 | self.num_children = 0
17 | self.children = list()
18 | self.type = type
19 | self.is_leaf = False
20 | self.start = -1
21 | self.end = -1
22 | self.idx = -1
23 |
24 | def add_child(self, child):
25 | child.parent = self
26 | self.num_children += 1
27 | self.children.append(child)
28 |
29 | def size(self):
30 | count = 1
31 | for i in range(self.num_children):
32 | count += self.children[i].size()
33 | return count
34 |
35 | def __str__(self):
36 | return self.type
37 |
38 | def __iter__(self):
39 | yield self
40 | for c in self.children:
41 | for x in c:
42 | yield x
43 |
44 | def get_cons_tag_vocab(data_path):
45 | tag2id = {}
46 | with open(data_path) as f:
47 | for line in f.readlines():
48 | tag, idx = line.strip().split('\t')
49 | tag2id[tag] = int(idx)
50 | return tag2id
51 |
52 |
53 | def span_starts_ends(node: Tree):
54 | if len(node.children) == 0:
55 | return
56 | for child in node.children:
57 | span_starts_ends(child)
58 |
59 | node.start = node.children[0].start
60 | node.end = node.children[-1].end
61 |
62 |
63 | def constituent_to_tree(tokenizer, constituent_string, sentence, word_offset, node_offset, num_orders=2):
64 | constituents = []
65 | temp_str = ""
66 | for i, char in enumerate(constituent_string):
67 | if char == "(" or char == ")" or char == " ":
68 | if len(temp_str) != 0:
69 | constituents.append(temp_str)
70 | temp_str = ""
71 | if char != " ":
72 | constituents.append(char)
73 | else:
74 | temp_str += char
75 |     # push nodes such as NP, PP onto a stack
76 | stack = []
77 | for cons in constituents:
78 | if cons != ")":
79 | stack.append(cons)
80 | else:
81 | tail = stack.pop()
82 | temp_constituents = []
83 | while tail != "(":
84 | temp_constituents.append(tail)
85 | tail = stack.pop()
86 |
87 | parent = Tree(temp_constituents[-1])
88 | for i in range(len(temp_constituents) - 2, -1, -1):
89 | if isinstance(temp_constituents[i], Tree):
90 | parent.add_child(temp_constituents[i])
91 | else:
92 | child = Tree(temp_constituents[i])
93 | parent.add_child(child)
94 | stack.append(parent)
95 | root = stack[-1]
96 | map_count = 0
97 | words = []
98 | subtokens = []
99 | subtoken_map = []
100 | for node in root:
101 | if len(node.children) == 0:
102 | node.is_leaf = True
103 | words.append(str(node))
104 | node_token = tokenizer.tokenize(str(node))
105 | if len(node_token) == 0:
106 | continue
107 | subtokens.extend(node_token)
108 | subtoken_map.extend([map_count]*len(node_token))
109 | map_count += 1
110 |
111 | for node in root:
112 | if node.is_leaf:
113 | node.start = word_offset
114 | node.end = word_offset
115 | word_offset += 1
116 | span_starts_ends(root)
117 |
118 | node_sequence = []
119 |     # internal_nodes holds only nodes such as S, NP, VP, PP; word nodes like 'one' or 'long' are excluded
120 | internal_nodes = []
121 | for node in root:
122 | if not node.is_leaf:
123 | internal_nodes.append(node)
124 | node_sequence.append(node)
125 |
126 | node_offset_original = node_offset
127 | for node in root:
128 | if node.is_leaf:
129 | continue
130 | node.idx = node_offset
131 | node_offset += 1
132 |
133 | constituent_sequence = [] # [(node idx, node start, node end, node type, parent idx)]
134 | num_internal_nodes = len(internal_nodes)
135 | # constituent_edge
136 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)]
137 | for i, node in enumerate(internal_nodes):
138 | parent_idx = node.parent.idx if node.parent else -1
139 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx))
140 | if parent_idx != -1:
141 |             constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1  # together with the next line this makes the edge bidirectional
142 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1
143 |     # this part adds edges that skip one level (grandparent-grandchild); e.g. S and PP are not directly connected in the original tree but become connected here
144 | high_order_sequence = [constituent_sequence]
145 | for i in range(1, num_orders):
146 | new_constituent_sequence = []
147 | for idx, start, end, type, parent_idx in high_order_sequence[-1]:
148 | if parent_idx == -1:
149 | continue
150 | parent_node = constituent_sequence[parent_idx - node_offset_original]
151 | if parent_node[-1] == -1:
152 | continue
153 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1]))
154 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1
155 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1
156 | high_order_sequence.append(new_constituent_sequence)
157 | return high_order_sequence, word_offset, node_offset
158 |
159 |
160 | def final_graph(constituent_list, first_graph, second_graph):
161 | cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt')
162 | forward_edge_type, backward_edge_type = 0, 2
163 |     # it might be better to split into separate parent-child and grandparent-child graphs here
164 | constituent_labels_first = []
165 | constituent_labels_second = []
166 | prev_root_node_id = None
167 |     print('first graph', first_graph.edges())
168 | print('second graph', second_graph.edges())
169 | one_order_sent_cons = constituent_list[0][0]
170 | two_order_sent_cons = constituent_list[0][1]
171 | for idx, start, end, label, parent_idx in one_order_sent_cons:
172 | idx_nodeid = idx
173 |         # node without a parent (root)
174 | if parent_idx == -1:
175 | if prev_root_node_id is not None:
176 | first_graph.add_edges(prev_root_node_id, idx_nodeid,
177 | data={'cc_link': torch.tensor([1]),
178 | 'dtype': torch.tensor([1])})
179 | # dual GAT
180 | first_graph.add_edges(idx_nodeid, prev_root_node_id,
181 | data={'cc_link': torch.tensor([1]),
182 | 'dtype': torch.tensor([1])})
183 | prev_root_node_id = idx_nodeid
184 |         # nodes with a parent
185 | if parent_idx != -1:
186 | parent_idx_nodeid = parent_idx
187 | first_graph.add_edges(parent_idx_nodeid, idx_nodeid,
188 | data={'cc_link': torch.tensor([1]),
189 | 'dtype': torch.tensor([1])})
190 | first_graph.add_edges(idx_nodeid, parent_idx_nodeid,
191 | data={'cc_link': torch.tensor([1]),
192 | 'dtype': torch.tensor([1])})
193 |
194 | # self-loop edge
195 | first_graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]),
196 | 'dtype': torch.tensor([1])})
197 | constituent_labels_first.append(cons_tag2id[label])
198 | # print('first graph', first_graph.edges())
199 |
200 | for idx, start, end, label, parent_idx in two_order_sent_cons:
201 | idx_nodeid = idx
202 |         # node without a parent (root)
203 | if parent_idx == -1:
204 | if prev_root_node_id is not None:
205 | second_graph.add_edges(prev_root_node_id, idx_nodeid,
206 | data={'cc_link': torch.tensor([1]),
207 | 'dtype': torch.tensor([1])})
208 | # dual GAT
209 | second_graph.add_edges(idx_nodeid, prev_root_node_id,
210 | data={'cc_link': torch.tensor([1]),
211 | 'dtype': torch.tensor([1])})
212 | prev_root_node_id = idx_nodeid
213 |         # nodes with a parent
214 | if parent_idx != -1:
215 | parent_idx_nodeid = parent_idx
216 | second_graph.add_edges(parent_idx_nodeid, idx_nodeid,
217 | data={'cc_link': torch.tensor([1]),
218 | 'dtype': torch.tensor([1])})
219 | second_graph.add_edges(idx_nodeid, parent_idx_nodeid,
220 | data={'cc_link': torch.tensor([1]),
221 | 'dtype': torch.tensor([1])})
222 | constituent_labels_second.append(cons_tag2id[label])
223 | print('second graph', second_graph.edges())
224 | # for high_order_sent_cons in constituent_list:
225 | # # i = 0: parent - child/ i = 1: grand parent - grand child
226 | # for i, sent_cons in enumerate(high_order_sent_cons):
227 | # for idx, start, end, label, parent_idx in sent_cons:
228 | # idx_nodeid = idx
229 | #             # node without a parent (root)
230 | # if parent_idx == -1:
231 | # if prev_root_node_id is not None:
232 | # graph.add_edges(prev_root_node_id, idx_nodeid,
233 | # data={'cc_link': torch.tensor([1]),
234 | # 'dtype': torch.tensor([1])})
235 | # # dual GAT
236 | # graph.add_edges(idx_nodeid, prev_root_node_id,
237 | # data={'cc_link': torch.tensor([1]),
238 | # 'dtype': torch.tensor([1])})
239 | # prev_root_node_id = idx_nodeid
240 | #             # nodes with a parent
241 | # if parent_idx != -1:
242 | # parent_idx_nodeid = parent_idx
243 | # graph.add_edges(parent_idx_nodeid, idx_nodeid,
244 | # data={'cc_link': torch.tensor([1]),
245 | # 'dtype': torch.tensor([1])})
246 | # graph.add_edges(idx_nodeid, parent_idx_nodeid,
247 | # data={'cc_link': torch.tensor([1]),
248 | # 'dtype': torch.tensor([1])})
249 | #
250 | # if i == 0:
251 | # # self-loop edge
252 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]),
253 | # 'dtype': torch.tensor([1])})
254 | # constituent_labels.append(cons_tag2id[label])
255 | # print(graph.edges(form='all'))
256 |
257 | constituent_labels_first = torch.tensor(constituent_labels_first, dtype=torch.long)
258 | constituent_labels_second = torch.tensor(constituent_labels_second, dtype=torch.long)
259 | return first_graph, second_graph, constituent_labels_first, constituent_labels_second
260 |
261 |
262 | def all_process_graph(nlp, tokenizer, sentence):
263 | sentence_doc = nlp(sentence)
264 | sentence_sent = list(sentence_doc.sents)[0]
265 | parse_string = sentence_sent._.parse_string
266 | word_offset, node_offset = 0, 0
267 | constituent = []
268 | constituent_sequence, word_offset, node_offset = \
269 | constituent_to_tree(tokenizer, parse_string, sentence, word_offset, node_offset)
270 | constituent.append(constituent_sequence)
271 |
272 | first_graph = dgl.graph([])
273 | first_graph.set_n_initializer(dgl.frame.zero_initializer)
274 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent])
275 | first_graph.add_nodes(num_cons)
276 | first_graph.ndata['unit'] = torch.ones(num_cons)
277 | first_graph.ndata['dtype'] = torch.ones(num_cons)
278 | second_graph = dgl.graph([])
279 | second_graph.set_n_initializer(dgl.frame.zero_initializer)
280 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent])
281 | second_graph.add_nodes(num_cons)
282 | second_graph.ndata['unit'] = torch.ones(num_cons)
283 | second_graph.ndata['dtype'] = torch.ones(num_cons)
284 |
285 |     claim_first_graph, claim_second_graph, constituent_labels_first, constituent_labels_second = \
286 |         final_graph(constituent, first_graph, second_graph)
287 |     return claim_first_graph, claim_second_graph, constituent_labels_first, constituent_labels_second
288 |
289 |
290 | if __name__ == "__main__":
291 |
292 | nlp = spacy.load('en_core_web_sm')
293 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'})
294 | printable = set(string.printable)
295 |
296 | tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=False, use_fast=False)
297 |
298 | train_data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
299 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
300 | train_data = train_data.dropna(axis=0)
301 | dev_data = pd.read_csv('../../data/IAM/claims/dev.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
302 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
303 | dev_data = dev_data.dropna(axis=0)
304 |
305 | train_sentences = train_data['claim_sentence'].tolist()[:10]
306 | dev_sentences = dev_data['claim_sentence'].tolist()[:10]
307 | total_train = []
308 | total_dev = []
309 | for idx, train in tqdm(enumerate(train_sentences), total=len(train_sentences)):
310 | train = train.lower().replace('“', '"').replace('”', '"')
311 | train = "".join(filter(lambda x : x in printable, train))
312 |
313 | train_first_graph, train_second_graph, train_constituent_labels_first, train_constituent_labels_second \
314 | = all_process_graph(nlp, tokenizer, train)
315 | dgl.save_graphs('../../data/IAM/claims/graphs/train_first_graph_{}.dgl'.format(idx), train_first_graph)
316 | dgl.save_graphs('../../data/IAM/claims/graphs/train_second_graph_{}.dgl'.format(idx), train_second_graph)
317 | total_train.append([train_constituent_labels_first.tolist(), train_constituent_labels_second.tolist()])
318 |
319 | for idx, dev in tqdm(enumerate(dev_sentences), total=len(dev_sentences)):
320 | dev = dev.lower().replace('“', '"').replace('”', '"')
321 | dev = "".join(filter(lambda x : x in printable, dev))
322 | dev_first_graph, dev_second_graph, dev_constituent_label_first, dev_constituent_label_second \
323 | = all_process_graph(nlp, tokenizer, dev)
324 | dgl.save_graphs('../../data/IAM/claims/graphs/dev_first_graph_{}.dgl'.format(idx), dev_first_graph)
325 | dgl.save_graphs('../../data/IAM/claims/graphs/dev_second_graph_{}.dgl'.format(idx), dev_second_graph)
326 | total_dev.append([dev_constituent_label_first.tolist(), dev_constituent_label_second.tolist()])
327 |
328 | with open('../../data/IAM/claims/graphs/train_constituent_test.txt', 'w', encoding='utf-8') as f:
329 | for line in total_train:
330 | f.write(str(line)+'\n')
331 |
332 | with open('../../data/IAM/claims/graphs/dev_constituent_test.txt', 'w', encoding='utf-8') as f:
333 | for line in total_dev:
334 | f.write(str(line)+'\n')
335 |
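336 | # The graphs and label lists saved above can be read back later (e.g. when building the tensor
337 | # datasets). A minimal sketch, mirroring the save paths used in this script; the index 0 is just
338 | # an example:
339 | #
340 | # import ast
341 | #
342 | # (first_graph,), _ = dgl.load_graphs('../../data/IAM/claims/graphs/train_first_graph_0.dgl')
343 | # (second_graph,), _ = dgl.load_graphs('../../data/IAM/claims/graphs/train_second_graph_0.dgl')
344 | # with open('../../data/IAM/claims/graphs/train_constituent_test.txt', encoding='utf-8') as f:
345 | #     labels = [ast.literal_eval(line) for line in f]  # [[first_labels, second_labels], ...]
346 | # print(first_graph.num_nodes(), second_graph.num_nodes(), len(labels))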
--------------------------------------------------------------------------------
/mine_next/functions/sent_to_graph.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | import dgl
3 | import dgl.frame
4 | import torch
5 | import os, csv
6 | import pandas as pd
7 | from tqdm import tqdm
8 | import benepar
9 | from transformers import AutoTokenizer
10 | import string
11 |
12 |
13 | class Tree(object):
14 | def __init__(self, type):
15 |         # self.start and self.end are counted in words: e.g. if 'one' is word 0 and '.' is word 27, the root node has start=0, end=27
16 | self.parent = None
17 | self.num_children = 0
18 | self.children = list()
19 | self.type = type
20 | self.is_leaf = False
21 | self.start = -1
22 | self.end = -1
23 | self.idx = -1
24 |
25 | def add_child(self, child):
26 | child.parent = self
27 | self.num_children += 1
28 | self.children.append(child)
29 |
30 | def size(self):
31 | count = 1
32 | for i in range(self.num_children):
33 | count += self.children[i].size()
34 | return count
35 |
36 | def __str__(self):
37 | return self.type
38 |
39 | def __iter__(self):
40 | yield self
41 | for c in self.children:
42 | for x in c:
43 | yield x
44 |
45 |
46 | def get_cons_tag_vocab(data_path):
47 | tag2id = {}
48 | with open(data_path) as f:
49 | for line in f.readlines():
50 | tag, idx = line.strip().split('\t')
51 | tag2id[tag] = int(idx)
52 | return tag2id
53 |
54 |
55 | def span_starts_ends(node: Tree):
56 | if len(node.children) == 0:
57 | return
58 | for child in node.children:
59 | span_starts_ends(child)
60 |
61 | node.start = node.children[0].start
62 | node.end = node.children[-1].end
63 |
64 |
65 | def constituent_to_tree(tokenizer, constituent_string, sentence, word_offset, node_offset, num_orders=2):
66 | constituents = []
67 | temp_str = ""
68 |     # split the parse string into '(' / ')', words, and constituent labels, e.g. ['(', 'S', '(', 'NP', '(', 'NP', '(', 'CD', 'one', ')', '(', 'ADJP', '(', 'RB', 'long', ')' ... ]
69 | for i, char in enumerate(constituent_string):
70 | if char == "(" or char == ")" or char == " ":
71 | if len(temp_str) != 0:
72 | constituents.append(temp_str)
73 | temp_str = ""
74 | if char != " ":
75 | constituents.append(char)
76 | else:
77 | temp_str += char
78 |     # push nodes such as NP, PP onto a stack
79 | stack = []
80 | for cons in constituents:
81 | if cons != ")":
82 | stack.append(cons)
83 | else:
84 | tail = stack.pop()
85 | temp_constituents = []
86 | while tail != "(":
87 | temp_constituents.append(tail)
88 | tail = stack.pop()
89 |
90 | parent = Tree(temp_constituents[-1])
91 | for i in range(len(temp_constituents) - 2, -1, -1):
92 | if isinstance(temp_constituents[i], Tree):
93 | parent.add_child(temp_constituents[i])
94 | else:
95 | child = Tree(temp_constituents[i])
96 | parent.add_child(child)
97 | stack.append(parent)
98 | root = stack[-1]
99 | map_count = 0
100 | words = []
101 | subtokens = []
102 | subtoken_map = []
103 | for node in root:
104 | if len(node.children) == 0:
105 | node.is_leaf = True
106 | words.append(str(node))
107 | node_token = tokenizer.tokenize(str(node))
108 | if len(node_token) == 0:
109 | continue
110 | subtokens.extend(node_token)
111 | subtoken_map.extend([map_count]*len(node_token))
112 | map_count += 1
113 |
114 | for node in root:
115 | if node.is_leaf:
116 | node.start = word_offset
117 | node.end = word_offset
118 | word_offset += 1
119 | span_starts_ends(root)
120 |
121 | node_sequence = []
122 |     # internal_nodes holds only nodes such as S, NP, VP, PP; word nodes like 'one' or 'long' are excluded
123 | internal_nodes = []
124 | for node in root:
125 | if not node.is_leaf:
126 | internal_nodes.append(node)
127 | node_sequence.append(node)
128 |
129 | node_offset_original = node_offset
130 | for node in root:
131 | if node.is_leaf:
132 | continue
133 | node.idx = node_offset
134 | node_offset += 1
135 |
136 | constituent_sequence = [] # [(node idx, node start, node end, node type, parent idx)]
137 | num_internal_nodes = len(internal_nodes)
138 | # constituent_edge
139 | constituent_edge = [[0] * num_internal_nodes for _ in range(num_internal_nodes)]
140 | for i, node in enumerate(internal_nodes):
141 | parent_idx = node.parent.idx if node.parent else -1
142 | constituent_sequence.append((node.idx, node.start, node.end, node.type, parent_idx))
143 | if parent_idx != -1:
144 |             constituent_edge[node.idx - node_offset_original][parent_idx - node_offset_original] = 1  # together with the next line this makes the edge bidirectional
145 | constituent_edge[parent_idx - node_offset_original][node.idx - node_offset_original] = 1
146 |     # this part adds edges that skip one level (grandparent-grandchild); e.g. S and PP are not directly connected in the original tree but become connected here
147 | high_order_sequence = [constituent_sequence]
148 | for i in range(1, num_orders):
149 | new_constituent_sequence = []
150 | for idx, start, end, type, parent_idx in high_order_sequence[-1]:
151 | if parent_idx == -1:
152 | new_constituent_sequence.append((idx, start, end, type, parent_idx))
153 | continue
154 | parent_node = constituent_sequence[parent_idx - node_offset_original]
155 | if parent_node[-1] == -1:
156 | continue
157 | new_constituent_sequence.append((idx, start, end, type, parent_node[-1]))
158 | constituent_edge[idx - node_offset_original][parent_node[-1] - node_offset_original] = 1
159 | constituent_edge[parent_node[-1] - node_offset_original][idx - node_offset_original] = 1
160 | high_order_sequence.append(new_constituent_sequence)
161 |
162 | return high_order_sequence, word_offset, node_offset
163 |
164 |
165 | def final_graph(constituent_list, first_graph, second_graph):
166 | cons_tag2id = get_cons_tag_vocab('../../data/IAM/constituent_gold_vocab.txt')
167 | constituent_labels_first = []
168 | constituent_labels_second = []
169 | prev_root_node_id = None
170 | one_order_sent_cons = constituent_list[0][0]
171 | two_order_sent_cons = constituent_list[0][1]
172 | two_order_sent_cons_idx = [idx[0] for idx in two_order_sent_cons]
173 |
174 | for idx, start, end, label, parent_idx in one_order_sent_cons:
175 | idx_nodeid = idx
176 | if parent_idx == -1:
177 | if prev_root_node_id is not None:
178 | first_graph.add_edges(prev_root_node_id, idx_nodeid,
179 | data={'cc_link': torch.tensor([1]),
180 | 'dtype': torch.tensor([1])})
181 | first_graph.add_edges(idx_nodeid, prev_root_node_id,
182 | data={'cc_link': torch.tensor([1]),
183 | 'dtype': torch.tensor([1])})
184 | prev_root_node_id = idx_nodeid
185 | if parent_idx != -1:
186 | parent_idx_nodeid = parent_idx
187 | first_graph.add_edges(parent_idx_nodeid, idx_nodeid,
188 | data={'cc_link': torch.tensor([1]),
189 | 'dtype': torch.tensor([1])})
190 | first_graph.add_edges(idx_nodeid, parent_idx_nodeid,
191 | data={'cc_link': torch.tensor([1]),
192 | 'dtype': torch.tensor([1])})
193 | first_graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]),
194 | 'dtype': torch.tensor([1])})
195 | constituent_labels_first.append(cons_tag2id[label])
196 |
197 | prev_root_node_id = None
198 | for idx, start, end, label, parent_idx in two_order_sent_cons:
199 | idx_nodeid = idx
200 | if parent_idx == -1:
201 | if prev_root_node_id is not None:
202 | second_graph.add_edges(prev_root_node_id, idx_nodeid,
203 | data={'cc_link': torch.tensor([1]),
204 | 'dtype': torch.tensor([1])})
205 | second_graph.add_edges(idx_nodeid, prev_root_node_id,
206 | data={'cc_link': torch.tensor([1]),
207 | 'dtype': torch.tensor([1])})
208 | prev_root_node_id = idx_nodeid
209 | if parent_idx != -1:
210 | parent_idx_nodeid = parent_idx
211 | second_graph.add_edges(parent_idx_nodeid, idx_nodeid,
212 | data={'cc_link': torch.tensor([1]),
213 | 'dtype': torch.tensor([1])})
214 | second_graph.add_edges(idx_nodeid, parent_idx_nodeid,
215 | data={'cc_link': torch.tensor([1]),
216 | 'dtype': torch.tensor([1])})
217 | second_graph = dgl.add_self_loop(second_graph)
218 | # for high_order_sent_cons in constituent_list:
219 | # for i, sent_cons in enumerate(high_order_sent_cons):
220 | # for idx, start, end, label, parent_idx in sent_cons:
221 |     #             idx_nodeid = idx  # originally constituent_start_idx = 0 and node_id_offset = 406 (of 1063 ids, the first 406 were token ids and the rest were node ids)
222 |     #             # node without a parent
223 | # if parent_idx == -1:
224 | # if prev_root_node_id is not None:
225 | # graph.add_edges(prev_root_node_id, idx_nodeid,
226 | # data={'cc_link': torch.tensor([1]),
227 | # 'dtype': torch.tensor([1])})
228 | # # dual GAT
229 | # graph.add_edges(idx_nodeid, prev_root_node_id,
230 | # data={'cc_link': torch.tensor([1]),
231 | # 'dtype': torch.tensor([1])})
232 | # prev_root_node_id = idx_nodeid
233 |     #             # nodes that have a parent
234 | # if parent_idx != -1:
235 | # parent_idx_nodeid = parent_idx
236 | # graph.add_edges(parent_idx_nodeid, idx_nodeid,
237 | # data={'cc_link': torch.tensor([1]),
238 | # 'dtype': torch.tensor([1])})
239 | # graph.add_edges(idx_nodeid, parent_idx_nodeid,
240 | # data={'cc_link': torch.tensor([1]),
241 | # 'dtype': torch.tensor([1])})
242 | #
243 | # if i == 0:
244 | # # self-loop edge
245 | # graph.add_edges(idx_nodeid, idx_nodeid, data={'cc_link': torch.tensor([1]),
246 | # 'dtype': torch.tensor([1])})
247 | # constituent_labels.append(cons_tag2id[label])
248 | # constituent_labels = torch.tensor(constituent_labels,dtype=torch.long)
249 | # return graph, constituent_labels
250 | constituent_labels_first = torch.tensor(constituent_labels_first, dtype=torch.long)
251 |     constituent_labels_second = torch.tensor(constituent_labels_first, dtype=torch.long) # intentionally reuses the first-order labels so the label counts match
252 | return first_graph, second_graph, constituent_labels_first, constituent_labels_second
253 |
254 |
255 | def all_process_graph(nlp, tokenizer, sentence):
256 | sentence_doc = nlp(sentence)
257 | sentence_sent = list(sentence_doc.sents)[0]
258 | parse_string = sentence_sent._.parse_string
259 | word_offset, node_offset = 0, 0
260 | constituent = []
261 | constituent_sequence, word_offset, node_offset = \
262 | constituent_to_tree(tokenizer, parse_string, sentence, word_offset, node_offset)
263 | constituent.append(constituent_sequence)
264 |
265 | first_graph = dgl.graph([])
266 | first_graph.set_n_initializer(dgl.frame.zero_initializer)
267 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent])
268 | first_graph.add_nodes(num_cons)
269 | first_graph.ndata['unit'] = torch.ones(num_cons)
270 | first_graph.ndata['dtype'] = torch.ones(num_cons)
271 |
272 | second_graph = dgl.graph([])
273 | second_graph.set_n_initializer(dgl.frame.zero_initializer)
274 | num_cons = sum([len(sent_cons[0]) for sent_cons in constituent])
275 | second_graph.add_nodes(num_cons)
276 | second_graph.ndata['unit'] = torch.ones(num_cons)
277 | second_graph.ndata['dtype'] = torch.ones(num_cons)
278 |
279 | claim_first_graph, claim_second_graph, constituent_labels_first, constituent_labels_second = \
280 | final_graph(constituent, first_graph, second_graph)
281 | return claim_first_graph, claim_second_graph, constituent_labels_first, constituent_labels_second
282 |
283 |
284 |
285 | if __name__ == "__main__":
286 | nlp = spacy.load('en_core_web_sm')
287 | nlp.add_pipe('benepar', config={'model': 'benepar_en3'})
288 | tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=False, use_fast=False)
289 |
290 | # train_data = pd.read_csv('../../data/IAM/claims/train.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
291 | # train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
292 | # train_data = train_data.dropna(axis=0)
293 | # dev_data = pd.read_csv('../../data/IAM/claims/dev.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
294 | # dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
295 | # dev_data = dev_data.dropna(axis=0)
296 | test_data = pd.read_csv('../../data/IAM/claims/test.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
297 | test_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
298 | test_data = test_data.dropna(axis=0)
299 |
300 | # train_sentences = train_data['claim_sentence'].tolist()
301 | # dev_sentences = dev_data['claim_sentence'].tolist()
302 | test_sentences = test_data['claim_sentence'].tolist()
303 |
304 | total_train = []
305 | total_dev = []
306 | total_test = []
307 | printable = set(string.printable)
308 |
309 | for idx, test in tqdm(enumerate(test_sentences), total=len(test_sentences)):
310 | test = test.lower().replace('“', '"').replace('”', '"')
311 | test = "".join(filter(lambda x : x in printable, test))
312 | test_first_graph, test_second_graph, test_constituent_labels_first, test_constituent_labels_second = \
313 | all_process_graph(nlp, tokenizer, test)
314 | #dgl.save_graphs('../../data/IAM/claims/graphs/test_first_graph_{}.dgl'.format(idx), test_first_graph)
315 | #dgl.save_graphs('../../data/IAM/claims/graphs/test_second_graph_{}.dgl'.format(idx), test_second_graph)
316 | total_test.append([test_constituent_labels_first.tolist(), test_constituent_labels_second.tolist()])
317 |
318 | # for idx, train in tqdm(enumerate(train_sentences), total=len(train_sentences)):
319 | # # train = train.lower().replace("\xa0", '').replace('...', '').replace('—', ' ').replace('“', '').replace('’', "'").strip()
320 | # train = train.lower().replace('“', '"').replace('”', '"')
321 | # train = "".join(filter(lambda x : x in printable, train))
322 | #
323 | # train_first_graph, train_second_graph, train_constituent_labels_first, train_constituent_labels_second = \
324 | # all_process_graph(nlp, tokenizer, train)
325 | # dgl.save_graphs('../../data/IAM/claims/graphs/train_first_graph_{}.dgl'.format(idx), train_first_graph)
326 | # dgl.save_graphs('../../data/IAM/claims/graphs/train_second_graph_{}.dgl'.format(idx), train_second_graph)
327 | # total_train.append([train_constituent_labels_first.tolist(), train_constituent_labels_second.tolist()])
328 | #
329 | # for idx, dev in tqdm(enumerate(dev_sentences), total=len(dev_sentences)):
330 | # dev = dev.lower().replace('“', '"').replace('”', '"')
331 | # dev = "".join(filter(lambda x: x in printable, dev))
332 | # dev_first_graph, dev_second_graph, dev_constituent_label_first, dev_constituent_label_second \
333 | # = all_process_graph(nlp, tokenizer, dev)
334 | # dgl.save_graphs('../../data/IAM/claims/graphs/dev_first_graph_{}.dgl'.format(idx), dev_first_graph)
335 | # dgl.save_graphs('../../data/IAM/claims/graphs/dev_second_graph_{}.dgl'.format(idx), dev_second_graph)
336 | # total_dev.append([dev_constituent_label_first.tolist(), dev_constituent_label_second.tolist()])
337 |
338 | # with open('../../data/IAM/claims/graphs/train_constituent_first_second.txt', 'w', encoding='utf-8') as f:
339 | # for line in total_train:
340 | # f.write(str(line)+'\n')
341 | #
342 | # with open('../../data/IAM/claims/graphs/dev_constituent_first_second.txt', 'w', encoding='utf-8') as f:
343 | # for line in total_dev:
344 | # f.write(str(line)+'\n')
345 |
346 | # with open('../../data/IAM/claims/graphs/test_constituent_first_second.txt', 'w', encoding='utf-8') as f:
347 | # for line in total_test:
348 | # f.write(str(line) + '\n')
349 |
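350 |     # Rough sketch of how the saved graphs could be reloaded later, assuming the
351 |     # commented-out dgl.save_graphs calls above were run for each sentence index:
352 |     # first_graphs, _ = dgl.load_graphs('../../data/IAM/claims/graphs/test_first_graph_0.dgl')
353 |     # second_graphs, _ = dgl.load_graphs('../../data/IAM/claims/graphs/test_second_graph_0.dgl')
354 |     # test_first_graph_0, test_second_graph_0 = first_graphs[0], second_graphs[0]
355 |     # labels_first_0, labels_second_0 = total_test[0]  # per-node constituent tag ids for sentence 0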
--------------------------------------------------------------------------------
/mine_next/functions/stance_main_func.py:
--------------------------------------------------------------------------------
1 | import os.path
2 | import os
3 | from tqdm import tqdm
4 | import torch
5 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
6 | from transformers.optimization import AdamW, get_linear_schedule_with_warmup
7 | from transformers import AutoConfig, AutoTokenizer
8 | from sklearn.metrics import classification_report, accuracy_score
9 | from sklearn.utils import resample
10 |
11 | import csv
12 | import numpy as np
13 | import pandas as pd
14 | import json
15 |
16 | # from source.claim_classification.model.modeling import KoElectraForClaimClassification
17 | # from source.claim_classification.func.dataset import convert_data2tensordataset
18 |
19 | from mine_next.model.modeling import RobertaForClassification
20 | from mine_next.functions.dataset import (
21 | convert_data2tensordataset,
22 | convert_stance_data2tensordataset,
23 | convert_only_sentence2tensordataset,
24 | )
25 |
26 |
27 | def random_downsampling(dataset):
28 | major = dataset[dataset['claim_label'] == 'O']
29 | minor = dataset[dataset['claim_label'] == 'C']
30 | sampling_data = resample(major, replace=True, n_samples=len(minor)*5, random_state=42)
31 | train_data = pd.concat([sampling_data, minor])
32 | return train_data
33 |
34 |
35 | def random_upsampling(dataset):
36 | major = dataset[dataset['claim_label'] == 'O']
37 | minor = dataset[dataset['claim_label'] == 'C']
38 | sampling_data = resample(minor, replace=True, n_samples=len(major), random_state=42)
39 | train_data = pd.concat([sampling_data, major])
40 | return train_data
41 |
42 | def do_train(config, model, optimizer, scheduler, train_dataloader, epoch, global_step):
43 | losses = []
44 | total_predicts, total_corrects = [], []
45 | for step, batch in tqdm(enumerate(train_dataloader), desc='do_train(epoch_{})'.format(epoch), total=len(train_dataloader)):
46 | batch = tuple(t.cuda() for t in batch)
47 |         # when training together with the graph
48 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]
49 | # constituent_labels = batch[6]
50 | # loss, predicts = model(
51 | # idx=idx,
52 | # input_ids=input_ids,
53 | # attention_mask=attention_mask,
54 | # token_type_ids=token_type_ids,
55 | # labels=labels,
56 | # sim_labels=sim_labels,
57 | # all_graph=total_graph,
58 | # constituent_labels=constituent_labels
59 | # )
60 | # base
61 | idx, input_ids, attention_mask, token_type_ids, stance_labels = batch[0], batch[1], batch[2], batch[3], \
62 | batch[4]
63 | loss, predicts = model(
64 | idx=idx,
65 | input_ids=input_ids,
66 | attention_mask=attention_mask,
67 | token_type_ids=token_type_ids,
68 | labels=stance_labels,
69 | sim_labels=None,
70 | )
71 | predicts = predicts.argmax(dim=-1)
72 | predicts = predicts.cpu().detach().numpy().tolist()
73 | labels = stance_labels.cpu().detach().numpy().tolist()
74 |
75 | total_predicts.extend(predicts)
76 | total_corrects.extend(labels)
77 |
78 | if config.gradient_accumulation_steps > 1:
79 | loss = loss / config.gradient_accumulation_steps
80 |         # the loss is originally a scalar tensor, e.g. tensor(0.7255)
81 | loss.backward()
82 | losses.append(loss.data.item())
83 | if (step + 1) % config.gradient_accumulation_steps == 0 or \
84 | (len(train_dataloader) <= config.gradient_accumulation_steps and (step + 1) == len(
85 | train_dataloader)):
86 | torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
87 | optimizer.step()
88 | scheduler.step()
89 |
90 | model.zero_grad()
91 | global_step += 1
92 | target_names = ['class 0', 'class 1']
93 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4))
94 | accuracy = accuracy_score(total_corrects, total_predicts)
95 | return accuracy, np.mean(losses), global_step
96 |
97 |
98 | def do_evaluate(model, dev_dataloader):
99 | total_predicts, total_corrects = [], []
100 | for step, batch in tqdm(enumerate(dev_dataloader), desc="do_evaluate", total=len(dev_dataloader)):
101 | batch = tuple(t.cuda() for t in batch)
102 |         # when evaluating with the graph
103 | # idx, input_ids, attention_mask, token_type_ids, labels, sim_labels = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5]
104 | # constituent_labels = batch[6]
105 | # predicts = model(
106 | # idx=idx,
107 | # input_ids=input_ids,
108 | # attention_mask=attention_mask,
109 | # token_type_ids=token_type_ids,
110 | # all_graph=total_graph,
111 | # constituent_labels=constituent_labels
112 | # )
113 | # base
114 | idx, input_ids, attention_mask, token_type_ids, stance_labels = batch[0], batch[1], batch[2], batch[3], \
115 | batch[4]
116 | predicts = model(
117 | idx=idx,
118 | input_ids=input_ids,
119 | attention_mask=attention_mask,
120 | token_type_ids=token_type_ids,
121 | )
122 | predicts = predicts.argmax(dim=-1)
123 | predicts = predicts.detach().cpu().tolist()
124 | labels = stance_labels.detach().cpu().tolist()
125 | total_predicts.extend(predicts)
126 | total_corrects.extend(labels)
127 | target_names = ['class 0', 'class 1']
128 | print(classification_report(total_corrects, total_predicts, target_names=target_names, digits=4))
129 | accuracy = accuracy_score(total_corrects, total_predicts)
130 | return accuracy, total_predicts
131 |
132 |
133 | def train(config, model, tokenizer):
134 |
135 |     # load the train / dev datasets
136 | train_data = pd.read_csv(config.stance_train, sep='\t', header=None, quoting=csv.QUOTE_NONE)
137 | #train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
138 | train_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'id', 'stance_labels']
139 | train_data = train_data.dropna(axis=0)
140 | # train_data = train_data[:100]
141 | dev_data = pd.read_csv(config.stance_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE)
142 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'id', 'stance_labels']
143 | #dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
144 | dev_data = dev_data.dropna(axis=0)
145 | # dev_data = dev_data[:100]
146 |
147 | #train_data = random_upsampling(train_data)
148 | train_dataset = convert_stance_data2tensordataset(train_data, tokenizer, config.max_length, 'train')
149 | dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length, 'dev')
150 |
151 |
152 | train_sampler = RandomSampler(train_dataset)
153 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.batch_size)
154 | dev_sampler = SequentialSampler(dev_dataset)
155 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size)
156 |
157 | t_total = len(train_dataloader) // config.gradient_accumulation_steps * config.epoch
158 | optimizer = AdamW(model.parameters(), lr=config.learning_rate)
159 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=t_total)
160 |
161 | global_step = 0
162 | max_test_accuracy = 0
163 | model.zero_grad()
164 | for epoch in range(config.epoch):
165 | model.train()
166 | train_accuracy, average_loss, global_step = do_train(
167 | config=config, model=model,
168 | optimizer=optimizer, scheduler=scheduler,
169 | train_dataloader=train_dataloader, epoch=epoch+1, global_step=global_step)
170 | print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4)))
171 |
172 | model.eval()
173 | test_accuracy, _ = do_evaluate(model=model, dev_dataloader=dev_dataloader)
174 | print("test_accuracy : {}\n".format(round(test_accuracy, 4)))
175 | output_dir = os.path.join(config.save_dir, "checkpoint-{}".format(epoch))
176 | if not os.path.exists(output_dir):
177 | os.makedirs(output_dir)
178 | model_to_save = model.module if hasattr(model, "module") else model
179 | model_to_save.save_pretrained(output_dir)
180 | tokenizer.save_pretrained(output_dir)
181 | torch.save(config, os.path.join(output_dir, "training_args.bin"))
182 |
183 |
184 | def evaluate(config, model, tokenizer):
185 | dev_data = pd.read_csv(config.claim_dev, sep='\t', header=None, quoting=csv.QUOTE_NONE)
186 | dev_data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
187 | dev_data = dev_data.dropna(axis=0)
188 | # dev_data = dev_data[:10]
189 | # dev_dataset = convert_stance_data2tensordataset(dev_data, tokenizer, config.max_length)
190 | dev_dataset, total_graph = convert_only_sentence2tensordataset(dev_data, tokenizer, config.max_length, 'dev')
191 |
192 | dev_sampler = SequentialSampler(dev_dataset)
193 | dev_dataloader = DataLoader(dev_dataset, sampler=dev_sampler, batch_size=config.batch_size)
194 |
195 |     test_accuracy, total_predicts = do_evaluate(model=model, dev_dataloader=dev_dataloader)  # total_graph is unused in the base (text-only) setting
196 | print("test accuracy : {}".format(round(test_accuracy,4)))
197 | total_corrects = dev_data['claim_label'].tolist()
198 | total_corrects = [1 if correct == 'C' else 0 for correct in total_corrects]
199 |     total_claim_sentence = dev_data['claim_sentence'].tolist()
200 |     error_list = []
201 |     for predict, correct, claim in zip(total_predicts, total_corrects, total_claim_sentence):
202 | if predict != correct:
203 | error = {}
204 | error['predict'] = predict
205 | error['correct'] = correct
206 | error['claim_sentence'] = claim
207 | error_list.append(error)
208 |
209 | with open('../mine/functions/dev_error.json', 'w', encoding='utf-8') as f:
210 | json.dump(error_list, f, indent=4)
211 |
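212 | # Rough usage sketch; the actual entry points are the run_*.py scripts. The field names
213 | # below mirror the config attributes read in this file, all values are only illustrative:
214 | # from argparse import Namespace
215 | # config = Namespace(stance_train='train.txt', stance_dev='dev.txt', max_length=256,
216 | #                    batch_size=16, epoch=3, learning_rate=2e-5, warmup_steps=0,
217 | #                    gradient_accumulation_steps=1, max_grad_norm=1.0, save_dir='./output')
218 | # tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=False, use_fast=False)
219 | # model = RobertaForClassification.from_pretrained('roberta-base').cuda()
220 | # train(config, model, tokenizer)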
--------------------------------------------------------------------------------
/mine_next/functions/test.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import csv
3 | from transformers import RobertaTokenizer
4 | import string
5 |
6 | s = "“Let’s say there’s a government-run test."
7 | printable = set(string.printable)
8 | print(printable)
9 | print("".join(filter(lambda x: x in printable, s)))
10 |
11 |
12 | all_claim = []
13 |
14 | def extract_claim(data_file):
15 | data = pd.read_csv(data_file, sep='\t', header=None, quoting=csv.QUOTE_NONE)
16 | data.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
17 | data = data.dropna(axis=0)
18 | data = data[data['claim_label'] == 'C']
19 | claim_data = data['claim_sentence']
20 | return claim_data.tolist()
21 | # tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
22 | # input_str = 'a significant number of republicans assert that hereditary monarchy is unfair and elitist'
23 | #print(tokenizer.tokenize(input_str))
24 |
25 | # train_claim = extract_claim('../../data/IAM/claims/train.txt')
26 | # print(train_claim)
27 | # dev_claim = extract_claim('../../data/IAM/claims/dev.txt')
28 | # test_claim = extract_claim('../../data/IAM/claims/test.txt')
29 | #
30 | # all_claim.extend(train_claim)
31 | # all_claim.extend(dev_claim)
32 | # all_claim.extend(test_claim)
33 | #
34 | # with open('../../data/IAM/all_claim_sentence.txt', 'w', encoding='utf-8') as txt_file:
35 | # for claim in all_claim:
36 | # txt_file.write(claim)
37 | # txt_file.write('\n')
38 |
39 |
--------------------------------------------------------------------------------
/mine_next/functions/textrank.py:
--------------------------------------------------------------------------------
1 | import spacy
2 | import pytextrank
3 | import os
4 | import pandas as pd
5 | import csv
6 | import json, string
7 | from tqdm import tqdm
8 |
9 |
10 | def make_article_dict():
11 | topic_dir_list = os.listdir('../../data/IAM/origin/test')
12 | topic_dir_list = [os.path.join('../../data/IAM/origin/test', topic) for topic in topic_dir_list]
13 | article_dict = {}
14 | for topic_dir in topic_dir_list:
15 | file_list = os.listdir(topic_dir) # [21_3.txt, 21_7.txt, ... ]
16 | file_list_open = [os.path.join(topic_dir, file) for file in file_list] # ../../data/IAM/origin/train/Should_commercial_advertisements_be_allowed_to_be_fictitious/21_3.txt
17 |
18 | for idx, file in zip(file_list, file_list_open):
19 | article_id = idx.split('.')[0]
20 |
21 | num = file.split('/')[-1]
22 | assert idx == num
23 | sentences = []
24 | with open(file, 'r', encoding='utf-8') as f:
25 | article = f.readlines()
26 | for line in article:
27 | article_sentence = line.split('\t')[0]
28 | sentences.append(article_sentence)
29 | article_dict[article_id] = sentences
30 |
31 | with open('../../data/IAM/origin/test_article_dict.json', 'w', encoding='utf-8') as outfile:
32 | json.dump(article_dict, outfile, indent='\t', ensure_ascii=False)
33 |
34 | def make_pseudo_topic_with_textrank():
35 | printable = set(string.printable)
36 | nlp = spacy.load("en_core_web_sm")
37 | nlp.add_pipe("textrank")
38 |
39 | datas = json.load(open('../../data/IAM/origin/dev_article_dict.json', encoding='utf-8'))
40 | pseudo_topic = {}
41 | for key, value in tqdm(datas.items(), total=len(datas)):
42 | article_text = " ".join(value)
43 | article_text = article_text.lower().replace('“', '"').replace('”', '"')
44 | article_text = "".join(filter(lambda x : x in printable, article_text))
45 | doc = nlp(article_text)
46 | topic = []
47 | for phrase in doc._.phrases[:10]:
48 | topic.append(phrase.text)
49 | # pseudo_topic[key] = " ".join(topic)
50 | pseudo_topic[key] = topic
51 | with open('../../data/IAM/origin/dev_pseudo_topic_with_textrank_list.json', 'w', encoding='utf-8') as file:
52 | json.dump(pseudo_topic, file, indent='\t', ensure_ascii=False)
53 |
54 | #make_article_dict()
55 | # make_pseudo_topic_with_textrank()
56 | # data = json.load(open('../../data/IAM/origin/dev_pseudo_topic_with_textrank_list.json', encoding='utf-8'))
57 | # print(data)
58 |
59 |
60 | # doc = nlp(article.lower())
61 | #
62 | # # examine the top-ranked phrases in the document
63 | # pseudo_topic = []
64 | # for phrase in doc._.phrases[:10]:
65 | # #print(phrase)
66 | # print(phrase.text)
67 | # # print(phrase.rank, phrase.count)
68 | # # print(phrase.chunks)
69 | # print()
70 | #
71 |
72 |
73 | total_char_count = 0
74 | total_word_count = 0
75 | topic_dir_list = os.listdir('../../data/IAM/origin/test')
76 | topic_dir_list = [os.path.join('../../data/IAM/origin/test', topic) for topic in topic_dir_list]
77 | article_dict = {}
78 | for topic_dir in topic_dir_list:
79 | file_list = os.listdir(topic_dir) # [21_3.txt, 21_7.txt, ... ]
80 | file_list_open = [os.path.join(topic_dir, file) for file in file_list] # ../../data/IAM/origin/train/Should_commercial_advertisements_be_allowed_to_be_fictitious/21_3.txt
81 |
82 | for idx, file in zip(file_list, file_list_open):
83 | article_id = idx.split('.')[0]
84 |
85 | num = file.split('/')[-1]
86 | assert idx == num
87 | sentences = []
88 | with open(file, 'r', encoding='utf-8') as f:
89 | article = f.readlines()
90 | for line in article:
91 | article_sentence = line.split('\t')[0]
92 | word_of_sentence = article_sentence.split(' ')
93 | total_char_count += len(article_sentence)
94 | total_word_count += len(word_of_sentence)
95 | print(total_char_count)
96 | print(total_word_count)
97 |
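98 | # The two helpers above are meant to be run in order (their calls are kept commented out above);
99 | # paths and file names follow the ones hard-coded inside the functions:
100 | # make_article_dict()                  # writes test_article_dict.json: {article_id: [sentences]}
101 | # make_pseudo_topic_with_textrank()    # writes dev_pseudo_topic_with_textrank_list.json: {article_id: top-10 phrases}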
--------------------------------------------------------------------------------
/mine_next/functions/txt2json.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import json
3 |
4 | import pandas as pd
5 |
6 | dataset = pd.read_csv('../../data/IAM/stance/test.txt', sep='\t', header=None, quoting=csv.QUOTE_NONE)
7 | dataset.columns = ['claim_label', 'topic_sentence', 'claim_sentence', 'article_id', 'stance_label']
8 | dataset = dataset.dropna(axis=0)
9 |
10 | # claim_sentences = dataset['claim_sentence'].tolist()
11 | # claim_labels = ['non claim' if label is 'O' else 'claim' for label in dataset['claim_label'].tolist()]
12 | stance_sentences = dataset['claim_sentence']
13 | stance_label = dataset['stance_label']
14 |
15 | label_dict = {}
16 | label_dict['-1'] = 'contest'
17 | label_dict['1'] = 'support'
18 | label_dict['0'] = 'non-claim'
19 | data_json = []
20 | for sentence, label in zip(stance_sentences, stance_label):
21 | content = {}
22 | content['text'] = sentence
23 | content['label'] = label_dict[str(label)]
24 | data_json.append(content)
25 |
26 | with open('../../data/IAM/stance/IAM_stance_test.json', 'w', encoding='utf-8') as outfile:
27 | json.dump(data_json, outfile, indent='\t', ensure_ascii=False)
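28 | # Each entry in IAM_stance_test.json ends up shaped like the following (sentence text is illustrative):
29 | # {"text": "a significant number of republicans assert ...", "label": "support"}  # label is one of: contest / support / non-claim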
--------------------------------------------------------------------------------
/mine_next/functions/use_bertopic.py:
--------------------------------------------------------------------------------
1 | from bertopic import BERTopic
2 | from sklearn.datasets import fetch_20newsgroups
3 | from hdbscan import HDBSCAN
4 | from transformers import BertModel
5 | from sentence_transformers import SentenceTransformer
6 | import pandas as pd
7 | import os, json
8 | from os import listdir
9 | from os.path import isfile, join
10 | from sklearn.manifold import TSNE
11 | from tqdm import tqdm
12 |
13 |
14 | def topic_sentences(mode):
15 | sentences = []
16 | article_ids = []
17 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode))
18 | topic_dir_list = sorted([os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in topic_dir_list])
19 |
20 | for topic_dir in topic_dir_list:
21 | file_list = os.listdir(topic_dir)
22 | file_list_open = sorted([os.path.join(topic_dir, file) for file in file_list])
23 |
24 | for idx, file in zip(file_list, file_list_open):
25 | article_id = idx.split('.')[0]
26 | sentence = []
27 | with open(file, 'r', encoding='utf-8') as f:
28 | article = f.readlines()
29 | for line in article:
30 | article_sentence = line.split('\t')[0]
31 | #sentences.append(article_sentence)
32 | sentence.append(article_sentence)
33 | sentences.append(' '.join(sent for sent in sentence))
34 | article_ids.append(article_id)
35 | return article_ids, sentences
36 |
37 | train_ids, train_sentences = topic_sentences('train')
38 | dev_ids, dev_sentences = topic_sentences('dev')
39 | test_ids, test_sentences = topic_sentences('test')
40 |
41 | def topic_modeling():
42 | '''
43 | 1. extract embeddings
44 | 2. reduce dimensionality
45 | 3. cluster reduced embeddings
46 | 4. tokenize topics
47 |     5. create topic representation
48 | '''
49 | embedding_model = SentenceTransformer("all-MiniLM-L12-v2")
50 | hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
51 | topic_model = BERTopic(
52 | embedding_model=embedding_model,
53 | hdbscan_model=hdbscan_model,
54 | # diversity=0.2
55 | )
56 | topic_model.save('../../data/IAM/origin/topic_modeling_div0.0')
57 |
58 | topic_modeling()
59 | topic_model = BERTopic.load('../../data/IAM/origin/topic_modeling_div0.0')
60 | topics, probs = topic_model.fit_transform(train_sentences)
61 | print(topic_model.get_topic_info())
62 | # topic_model.visualize_topics().write_html("../../data/IAM/origin/intertopic_dist_map_div0.2.html")
63 | # topic_model.visualize_documents(train_sentences).write_html("../../data/IAM/origin/projections_div0.2.html")
64 |
65 |
66 | def make_pseudo_topic_with_bertopic(ids, sentences, topic_model, mode):
67 | pseudo_topic_dict = {}
68 | for idx, sentence in tqdm(zip(ids, sentences), total=len(ids), desc='{} processing ...'.format(mode)):
69 |         # here each 'sentence' is actually one whole article
70 | pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0])
71 | pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic])
72 | pseudo_topic_dict[idx] = pseudo_topic
73 | with open('../../data/IAM/origin/{}_pseudo_topic_with_bertopic_div0.0.json'.format(mode), 'w', encoding='utf-8') as file:
74 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False)
75 |
76 | make_pseudo_topic_with_bertopic(train_ids, train_sentences, topic_model, 'train')
77 | make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, topic_model, 'dev')
78 | make_pseudo_topic_with_bertopic(test_ids, test_sentences, topic_model, 'test')
79 |
80 |
81 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file:
82 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False)
83 |
84 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2")
85 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2")
86 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
87 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model)
88 | #topic_model.save('../../data/IAM/origin/topic_model')
89 |
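90 | # Rough sketch for inspecting a single article's assigned topic after fit_transform;
91 | # it only reuses BERTopic calls already exercised above (index 0 is just an example):
92 | # topic_ids, probs = topic_model.transform(train_sentences[0])
93 | # print(topic_ids[0], probs[0])
94 | # print(topic_model.get_topic(topic=topic_ids[0]))  # list of (word, c-TF-IDF weight) pairs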
--------------------------------------------------------------------------------
/mine_next/functions/use_bertopic2.py:
--------------------------------------------------------------------------------
1 | from bertopic import BERTopic
2 | from sklearn.datasets import fetch_20newsgroups
3 | from hdbscan import HDBSCAN
4 | from transformers import BertModel
5 | from sentence_transformers import SentenceTransformer
6 | import pandas as pd
7 | import os, json
8 | from os import listdir
9 | from os.path import isfile, join
10 | from sklearn.manifold import TSNE
11 | from tqdm import tqdm
12 |
13 |
14 | def topic_sentences(mode):
15 | sentences = []
16 | article_ids = []
17 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode))
18 | topic_dir_list = sorted([os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in topic_dir_list])
19 |
20 | for topic_dir in topic_dir_list:
21 | file_list = os.listdir(topic_dir)
22 | file_list_open = sorted([os.path.join(topic_dir, file) for file in file_list])
23 |
24 | for idx, file in zip(file_list, file_list_open):
25 | article_id = idx.split('.')[0]
26 | sentence = []
27 | with open(file, 'r', encoding='utf-8') as f:
28 | article = f.readlines()
29 | for line in article:
30 | article_sentence = line.split('\t')[0]
31 | #sentences.append(article_sentence)
32 | sentence.append(article_sentence)
33 | sentences.append(' '.join(sent for sent in sentence))
34 | article_ids.append(article_id)
35 | return article_ids, sentences
36 |
37 | train_ids, train_sentences = topic_sentences('train')
38 | dev_ids, dev_sentences = topic_sentences('dev')
39 | test_ids, test_sentences = topic_sentences('test')
40 |
41 | def topic_modeling():
42 | '''
43 | 1. extract embeddings
44 | 2. reduce dimensionality
45 | 3. cluster reduced embeddings
46 | 4. tokenize topics
47 |     5. create topic representation
48 | '''
49 | embedding_model = SentenceTransformer("all-MiniLM-L12-v2")
50 | hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
51 | topic_model = BERTopic(
52 | embedding_model=embedding_model,
53 | hdbscan_model=hdbscan_model,
54 | diversity=0.2
55 | )
56 | topic_model.save('../../data/IAM/origin/topic_modeling_div0.2')
57 |
58 | topic_modeling()
59 | topic_model = BERTopic.load('../../data/IAM/origin/topic_modeling_div0.2')
60 | topics, probs = topic_model.fit_transform(train_sentences)
61 | print(topic_model.get_topic_info())
62 | # topic_model.visualize_topics().write_html("../../data/IAM/origin/intertopic_dist_map_div0.2.html")
63 | # topic_model.visualize_documents(train_sentences).write_html("../../data/IAM/origin/projections_div0.2.html")
64 |
65 |
66 | def make_pseudo_topic_with_bertopic(ids, sentences, topic_model, mode):
67 | pseudo_topic_dict = {}
68 | for idx, sentence in tqdm(zip(ids, sentences), total=len(ids), desc='{} processing ...'.format(mode)):
69 |         # here each 'sentence' is actually one whole article
70 | pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0])
71 | pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic])
72 | pseudo_topic_dict[idx] = pseudo_topic
73 | with open('../../data/IAM/origin/{}_pseudo_topic_with_bertopic_div0.2.json'.format(mode), 'w', encoding='utf-8') as file:
74 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False)
75 |
76 | make_pseudo_topic_with_bertopic(train_ids, train_sentences, topic_model, 'train')
77 | make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, topic_model, 'dev')
78 | make_pseudo_topic_with_bertopic(test_ids, test_sentences, topic_model, 'test')
79 |
80 |
81 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file:
82 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False)
83 |
84 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2")
85 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2")
86 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
87 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model)
88 | #topic_model.save('../../data/IAM/origin/topic_model')
89 |
--------------------------------------------------------------------------------
/mine_next/functions/use_bertopic3.py:
--------------------------------------------------------------------------------
1 | from bertopic import BERTopic
2 | from sklearn.datasets import fetch_20newsgroups
3 | from hdbscan import HDBSCAN
4 | from transformers import BertModel
5 | from sentence_transformers import SentenceTransformer
6 | import pandas as pd
7 | import os, json
8 | from os import listdir
9 | from os.path import isfile, join
10 | from sklearn.manifold import TSNE
11 | from tqdm import tqdm
12 |
13 |
14 | def topic_sentences(mode):
15 | sentences = []
16 | article_ids = []
17 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode))
18 | topic_dir_list = sorted([os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in topic_dir_list])
19 |
20 | for topic_dir in topic_dir_list:
21 | file_list = os.listdir(topic_dir)
22 | file_list_open = sorted([os.path.join(topic_dir, file) for file in file_list])
23 |
24 | for idx, file in zip(file_list, file_list_open):
25 | article_id = idx.split('.')[0]
26 | sentence = []
27 | with open(file, 'r', encoding='utf-8') as f:
28 | article = f.readlines()
29 | for line in article:
30 | article_sentence = line.split('\t')[0]
31 | #sentences.append(article_sentence)
32 | sentence.append(article_sentence)
33 | sentences.append(' '.join(sent for sent in sentence))
34 | article_ids.append(article_id)
35 | return article_ids, sentences
36 |
37 | train_ids, train_sentences = topic_sentences('train')
38 | dev_ids, dev_sentences = topic_sentences('dev')
39 | test_ids, test_sentences = topic_sentences('test')
40 | temp_ids = [ids.split('_')[0] for ids in train_ids]
41 |
42 | def topic_modeling():
43 | '''
44 | 1. extract embeddings
45 | 2. reduce dimensionality
46 | 3. cluster reduced embeddings
47 | 4. tokenize topics
48 |     5. create topic representation
49 | '''
50 | embedding_model = SentenceTransformer("all-MiniLM-L12-v2")
51 | hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
52 | topic_model = BERTopic(
53 | embedding_model=embedding_model,
54 | hdbscan_model=hdbscan_model,
55 | diversity=0.1
56 | )
57 | topic_model.save('../../data/IAM/origin/topic_modeling_div0.1')
58 |
59 | #topic_modeling()
60 | topic_model = BERTopic.load('../../data/IAM/origin/topic_modeling_div0.1')
61 | topics, probs = topic_model.fit_transform(train_sentences)
62 | print(topic_model.get_topic_info())
63 | #print(topic_model.topic_embeddings_)
64 | # 88 topic embeddings in topic_embeddings_
65 | #topic_model.visualize_topics(width=1500, height=1500).write_html("../../data/IAM/origin/intertopic_dist_map_div0.1.html")
66 | #topic_model.visualize_documents(train_sentences, width=2000, height=2000).write_html("../../data/IAM/origin/projections_div0.1.html")
67 | topic_model.visualize_topics_per_class(topic_model.topics_per_class(train_sentences, temp_ids), width=1500, height=1500, top_n_topics=20).write_html('../../data/IAM/origin/topic_per_class_div.0.1.html')
68 | # for i in range(10):
69 | # print(topic_model.transform(train_sentences[i]))
70 |
71 | def make_pseudo_topic_with_bertopic(ids, sentences, topic_model, mode):
72 | pseudo_topic_dict = {}
73 | for idx, sentence in tqdm(zip(ids, sentences), total=len(ids), desc='{} processing ...'.format(mode)):
74 |         # here each 'sentence' is actually one whole article
75 | pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0])
76 | pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic])
77 | pseudo_topic_dict[idx] = pseudo_topic
78 | with open('../../data/IAM/origin/{}_pseudo_topic_with_bertopic_div0.1.json'.format(mode), 'w', encoding='utf-8') as file:
79 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False)
80 |
81 | # make_pseudo_topic_with_bertopic(train_ids, train_sentences, topic_model, 'train')
82 | # make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, topic_model, 'dev')
83 | # make_pseudo_topic_with_bertopic(test_ids, test_sentences, topic_model, 'test')
84 |
85 |
86 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file:
87 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False)
88 |
89 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2")
90 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2")
91 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
92 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model)
93 | #topic_model.save('../../data/IAM/origin/topic_model')
94 |
--------------------------------------------------------------------------------
/mine_next/functions/use_firstsent.py:
--------------------------------------------------------------------------------
1 | from bertopic import BERTopic
2 | from sklearn.datasets import fetch_20newsgroups
3 | from hdbscan import HDBSCAN
4 | from transformers import BertModel
5 | from sentence_transformers import SentenceTransformer
6 | import pandas as pd
7 | import os, json
8 | from os import listdir
9 | from os.path import isfile, join
10 |
11 | def topic_sentences(mode):
12 | sentences = []
13 | article_ids = []
14 | topic_dir_list = os.listdir('../../data/IAM/origin/{}'.format(mode))
15 | topic_dir_list = [os.path.join('../../data/IAM/origin/{}'.format(mode), topic) for topic in topic_dir_list]
16 |
17 | for topic_dir in topic_dir_list:
18 | file_list = os.listdir(topic_dir)
19 | file_list_open = [os.path.join(topic_dir, file) for file in file_list]
20 |
21 | for idx, file in zip(file_list, file_list_open):
22 | article_id = idx.split('.')[0]
23 | sentence = []
24 | with open(file, 'r', encoding='utf-8') as f:
25 | article = f.readlines()
26 | for line in article:
27 | article_sentence = line.split('\t')[0]
28 | #sentences.append(article_sentence)
29 | sentence.append(article_sentence)
30 | sentences.append(sentence[0])
31 | #sentences.append(' '.join(sent for sent in sentence))
32 | article_ids.append(article_id)
33 | return article_ids, sentences
34 |
35 | train_ids, train_sentences = topic_sentences('train')
36 | dev_ids, dev_sentences = topic_sentences('dev')
37 | fit_data = train_sentences + dev_sentences
38 | test_ids, test_sentences = topic_sentences('test')
39 | # embedding_model = SentenceTransformer("all-MiniLM-L12-v2")
40 | # # embedding_model = SentenceTransformer("all-mpnet-base-v2")
41 | # cluster_model = HDBSCAN(min_cluster_size=3, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
42 | # topic_model = BERTopic(embedding_model=embedding_model, hdbscan_model=cluster_model)
43 | #topic_model.save('../../data/IAM/origin/topic_model')
44 | #
45 |
46 |
47 | #topic_model = BERTopic.load('../../data/IAM/origin/topic_model')
48 | #topics, probs = topic_model.fit_transform(fit_data)
49 |
50 | #print(topic_model.get_topic_info())
51 |
52 | def make_pseudo_topic_with_bertopic(ids, sentences, mode):
53 | pseudo_topic_dict = {}
54 | for idx, sentence in zip(ids, sentences):
55 | #pseudo_topic = topic_model.get_topic(topic=topic_model.transform(sentence)[0][0])
56 | #pseudo_topic = ' '.join([topic_word[0] for topic_word in pseudo_topic])
57 | pseudo_topic_dict[idx] = sentence
58 | with open('../../data/IAM/origin/{}_pseudo_topic_with_first_sent.json'.format(mode), 'w', encoding='utf-8') as file:
59 | json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False)
60 |
61 | make_pseudo_topic_with_bertopic(train_ids, train_sentences, 'train')
62 | make_pseudo_topic_with_bertopic(dev_ids, dev_sentences, 'dev')
63 | make_pseudo_topic_with_bertopic(test_ids, test_sentences, 'test')
64 |
65 |
66 | # with open('../../data/IAM/origin/test_pseudo_topic_with_bertopic.json', 'w', encoding='utf-8') as file:
67 | # json.dump(pseudo_topic_dict, file, indent='\t', ensure_ascii=False)
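68 | # Resulting file shape: {mode}_pseudo_topic_with_first_sent.json maps each article_id to the
69 | # first sentence of that article, used as a pseudo topic (e.g. {"21_3": "first sentence ..."}).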
--------------------------------------------------------------------------------
/mine_next/model/__pycache__/modeling.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KUNLP/Argument_Classifier/a44f3764516e0a3450e545fbc9e8caa12f8ee466/mine_next/model/__pycache__/modeling.cpython-37.pyc
--------------------------------------------------------------------------------
/mine_next/model/modeling.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoTokenizer, AutoModel, AutoConfig, \
2 | RobertaPreTrainedModel, RobertaModel
3 | import torch
4 | import torch.nn as nn
5 | from abc import ABC
6 | import dgl
7 | import dgl.function as fn
8 | import dgl.nn.pytorch as dglnn
9 | import torch.nn.functional as F
10 | from dgl import DGLGraph
11 |
12 |
13 | class CGATLayer(nn.Module, ABC):
14 | """ Constituent-Constituent GATLayer """
15 |
16 | def __init__(self, in_dim, feat_embed_size, out_dim, num_heads):
17 | super(CGATLayer, self).__init__()
18 | self.fc = nn.Linear(in_dim, out_dim * num_heads, bias=False)
19 | self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False)
20 | self.num_heads = num_heads
21 | self.reset_parameters()
22 |
23 | def reset_parameters(self):
24 | gain = nn.init.calculate_gain('relu')
25 | nn.init.xavier_normal_(self.fc.weight, gain=gain)
26 | nn.init.xavier_normal_(self.attn_fc.weight, gain=gain)
27 |
28 | def edge_attention(self, edges):
29 | z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=2)
30 | a = self.attn_fc(z2)
31 | return {'e': F.leaky_relu(a)}
32 |
33 | def message_func(self, edges):
34 | return {'z': edges.src['z'], 'e': edges.data['e']}
35 |
36 | def reduce_func(self, nodes):
37 | alpha = F.softmax(nodes.mailbox['e'], dim=1)
38 | h = torch.sum(alpha * nodes.mailbox['z'], dim=1)
39 | return {'h': h}
40 |
41 | def forward(self, g, h, edge_type=None):
42 | z = self.fc(h)
43 | num_tokens, emb_size = z.size()
44 | z = z.reshape([num_tokens, self.num_heads, emb_size // self.num_heads])
45 | cons_node_ids = g.filter_nodes(lambda nodes: nodes.data['dtype'] == 1)
46 | cc_edge_id = g.filter_edges(lambda edges: edges.data["dtype"] == edge_type)
47 | self_edge_id = g.filter_edges(lambda edges: edges.data["dtype"] == 4)
48 | cc_edge_id = torch.cat([cc_edge_id, self_edge_id], dim=0)
49 | g.nodes[cons_node_ids].data['z'] = z
50 | g.apply_edges(self.edge_attention, edges=cc_edge_id)
51 | g.pull(cons_node_ids, self.message_func, self.reduce_func)
52 | g.ndata.pop('z')
53 | h = g.ndata.pop('h')
54 | return h[cons_node_ids]
55 |
56 |
57 | class CTGATLayer(nn.Module, ABC):
58 | """ Constituent-Token GATLayer """
59 |
60 | def __init__(self, in_dim, feat_embed_size, out_dim, num_heads):
61 | super(CTGATLayer, self).__init__()
62 | self.fc = nn.Linear(in_dim, out_dim * num_heads, bias=False)
63 | self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False)
64 | self.num_heads = num_heads
65 | self.reset_parameters()
66 |
67 | def reset_parameters(self):
68 | gain = nn.init.calculate_gain('relu')
69 | nn.init.xavier_normal_(self.fc.weight, gain=gain)
70 | nn.init.xavier_normal_(self.attn_fc.weight, gain=gain)
71 |
72 | def edge_attention(self, edges):
73 | z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=2)
74 | a = self.attn_fc(z2)
75 | return {'e': F.leaky_relu(a)}
76 |
77 | def message_func(self, edges):
78 | return {'z': edges.src['z'], 'e': edges.data['e']}
79 |
80 | def reduce_func(self, nodes):
81 | alpha = F.softmax(nodes.mailbox['e'], dim=1)
82 | h = torch.sum(alpha * nodes.mailbox['z'], dim=1)
83 | return {'h': h}
84 |
85 | def forward(self, g, h, edge_type=None):
86 | z = self.fc(h)
87 | num_tokens, emb_size = z.size()
88 | z = z.reshape([num_tokens, self.num_heads, emb_size // self.num_heads])
89 | token_node_ids = g.filter_nodes(lambda nodes: nodes.data['dtype'] == 0)
90 | cons_node_ids = g.filter_nodes(lambda nodes: nodes.data['dtype'] == 1)
91 | ct_edge_id = g.filter_edges(lambda edges: edges.data["dtype"] == 5)
92 | g.nodes[cons_node_ids].data['z'] = z
93 | g.apply_edges(self.edge_attention, edges=ct_edge_id)
94 | g.pull(token_node_ids, self.message_func, self.reduce_func)
95 | g.ndata.pop('z')
96 | h = g.ndata.pop('h')
97 | return h[token_node_ids]
98 |
99 |
100 | class MultiHeadGATLayer(nn.Module, ABC):
101 | def __init__(self, layer, in_size, out_size, feat_embed_size, num_heads, config, merge='cat', layer_norm_eps=1e-12):
102 | super(MultiHeadGATLayer, self).__init__()
103 | self.heads = nn.ModuleList()
104 | out_dim = out_size // num_heads
105 | self.layer = layer(in_size, feat_embed_size, out_dim, num_heads)
106 | self.merge = merge
107 | self.dropout = nn.Dropout(p=0.2)
108 | self.LayerNorm = nn.LayerNorm(out_size, eps=layer_norm_eps)
109 |
110 | def forward(self, g, o, h, edge_type=None):
111 | head_outs = self.layer(g, self.dropout(h), edge_type)
112 | num_tokens = head_outs.size()[0]
113 | if self.merge == 'cat':
114 | out = head_outs.reshape([num_tokens, -1])
115 | else:
116 | out = torch.mean(head_outs, dim=1)
117 | out = o + F.elu(out)
118 | out = self.LayerNorm(out)
119 | return out
120 |
121 |
122 | class GCNLayer(nn.Module):
123 | def __init__(self, in_feats, out_feats):
124 | super(GCNLayer, self).__init__()
125 | self.linear = nn.Linear(in_feats, out_feats)
126 | self.gcn_msg = fn.copy_u(u='h', out='m')
127 | self.gcn_reduce = fn.sum(msg='m', out='h')
128 | def forward(self, g, feature):
129 | with g.local_scope():
130 | g.ndata['h'] = feature
131 |             g.update_all(self.gcn_msg, self.gcn_reduce)
132 | h = g.ndata['h']
133 | return self.linear(h)
134 |
135 |
136 | class MultiCGNLayer(nn.Module):
137 |     def __init__(self, in_feats, out_feats):
138 |         super(MultiCGNLayer, self).__init__()
139 |         # the intended input size was hidden_size * 2 + cons_hidden_size
140 |         self.layer1 = GCNLayer(in_feats, out_feats)
141 |
142 |
143 | class GraphEmbedding(nn.Module):
144 | def __init__(self, in_dim, hidden_dim):
145 | super(GraphEmbedding, self).__init__()
146 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim)
147 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim)
148 | self.conv3 = dglnn.GraphConv(hidden_dim, hidden_dim)
149 | nn.init.xavier_normal_(self.conv1.weight)
150 | nn.init.xavier_normal_(self.conv2.weight)
151 | nn.init.xavier_normal_(self.conv3.weight)
152 | def forward(self, g, h):
153 | h = F.relu(self.conv1(g, h))
154 | h = F.relu(self.conv2(g, h))
155 | h = F.relu(self.conv3(g, h))
156 | #with g.local_scope():
157 | g.ndata['h'] = h
158 | hg = dgl.mean_nodes(g, 'h')
159 | return hg
160 |
161 |
162 | class GraphEmbedding2(nn.Module):
163 | def __init__(self, in_dim, hidden_dim, out_dim):
164 | super(GraphEmbedding2, self).__init__()
165 | self.conv1 = dglnn.GraphConv(in_dim, hidden_dim, allow_zero_in_degree=True)
166 | self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim, allow_zero_in_degree=True)
167 | self.conv3 = dglnn.GraphConv(hidden_dim, out_dim, allow_zero_in_degree=True)
168 | # nn.init.xavier_normal_(self.conv1.weight)
169 | # nn.init.xavier_normal_(self.conv2.weight)
170 | # nn.init.xavier_normal_(self.conv3.weight)
171 | def forward(self, g, h):
172 | h = F.relu(self.conv1(g, h))
173 | h = F.relu(self.conv2(g, h))
174 | h = F.relu(self.conv3(g, h))
175 | with g.local_scope():
176 | g.ndata['h'] = h
177 | hg = dgl.mean_nodes(g, 'h')
178 | return hg
179 |
180 |
181 | class RobertaReflectGraphWithGrandEdgeClassification(RobertaPreTrainedModel):
182 | def __init__(self, config):
183 | super().__init__(config)
184 | self.num_labels = config.num_labels
185 | self.hidden_size = config.hidden_size
186 | self.cons_hidden_size = config.cons_hidden_size
187 | self.roberta = RobertaModel(config)
188 | self.feature_size = config.feature_size
189 |         # when using both graphs
190 | # self.claim_layer = nn.Sequential(
191 | # nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.hidden_size+2*self.feature_size),
192 | # nn.ReLU(),
193 | # nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.num_labels),
194 | # )
195 |         # when using only one graph
196 | self.claim_layer = nn.Sequential(
197 | nn.Linear(in_features=self.hidden_size + self.feature_size,
198 | out_features=self.hidden_size + self.feature_size),
199 | nn.ReLU(),
200 | nn.Linear(in_features=self.hidden_size + self.feature_size, out_features=self.num_labels),
201 | )
202 |         # originally self.hidden_size + self.cons_hidden_size
203 | self.cons_type_embeddings = nn.Embedding(len(config.cons_tag2id), self.cons_hidden_size)
204 | # nn.init.uniform_(self.cons_type_embeddings.weight, -1.0, 1.0)
205 | self.softmax = nn.Softmax(dim=-1)
206 | self.graph_embedding = GraphEmbedding2(self.cons_hidden_size, self.cons_hidden_size, self.feature_size)
207 |
208 | def forward(self, idx=None, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None, all_graph=None,
209 | constituent_labels_first=None, constituent_labels_second=None):
210 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
211 | output = output.last_hidden_state
212 |
213 | graph_conv_first = []
214 | graph_conv_second = []
215 | for (graph_id, first, second) in zip(idx, constituent_labels_first, constituent_labels_second):
216 | curr_first_g = all_graph[0][int(graph_id.item())].to("cuda")
217 | first_mask = first != -1
218 | first_label = first[first_mask]
219 | first_cons_node_feature = self.cons_type_embeddings(first_label)
220 | curr_first_g_conv = self.graph_embedding(curr_first_g, first_cons_node_feature)
221 | graph_conv_first.append(curr_first_g_conv)
222 |
223 | curr_second_g = all_graph[1][int(graph_id.item())].to("cuda")
224 | second_mask = second != -1
225 | second_label = second[second_mask]
226 | second_cons_node_feature = self.cons_type_embeddings(second_label)
227 | curr_second_g_conv = self.graph_embedding(curr_second_g, second_cons_node_feature)
228 | graph_conv_second.append(curr_second_g_conv)
229 |
230 | # graph_conv_reult = torch.stack(graph_conv_reult, dim=0)
231 | # cls = output[:, 0, : ] -> (4, 768)
232 | graph_conv_first = torch.stack(graph_conv_first, dim=0)
233 | graph_conv_second = torch.stack(graph_conv_second, dim=0)
234 |
235 | cls_token = output[:, 0, :].unsqueeze(dim=1)
236 | #cls_graph_concat = torch.cat([cls_token, graph_conv_first, graph_conv_second], dim=-1)
237 | cls_graph_concat = torch.cat([cls_token, graph_conv_second], dim=-1)
238 | #cls_graph_concat = torch.cat([cls_token, graph_conv_second], dim=-1)
239 |
240 | logit = self.claim_layer(cls_graph_concat)
241 | logit = logit.squeeze(dim=1)
242 | if labels is not None:
243 | loss_func = nn.CrossEntropyLoss()
244 | loss = loss_func(logit, labels)
245 | return loss, self.softmax(logit)
246 | else:
247 | return logit
248 |
249 |
250 | class RobertaReflectGraphClassification(RobertaPreTrainedModel):
251 | def __init__(self, config):
252 | super().__init__(config)
253 | self.num_labels = config.num_labels
254 | self.hidden_size = config.hidden_size
255 | self.cons_hidden_size = config.cons_hidden_size
256 | self.roberta = RobertaModel(config)
257 | self.feature_size = config.feature_size
258 | self.claim_layer = nn.Sequential(
259 | nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.hidden_size+2*self.feature_size),
260 | nn.ReLU(),
261 | nn.Linear(in_features=self.hidden_size+2*self.feature_size, out_features=self.num_labels),
262 |         )  # originally self.hidden_size + self.cons_hidden_size
263 | self.cons_type_embeddings = nn.Embedding(len(config.cons_tag2id), self.cons_hidden_size)
264 | # nn.init.uniform_(self.cons_type_embeddings.weight, -1.0, 1.0)
265 | self.softmax = nn.Softmax(dim=-1)
266 | self.graph_embedding = GraphEmbedding2(self.cons_hidden_size, self.cons_hidden_size, self.feature_size)
267 |
268 | def forward(self, idx=None, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None, all_graph=None,
269 | constituent_labels_first=None, constituent_labels_second=None):
270 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
271 | output = output.last_hidden_state
272 |
273 | graph_conv_first = []
274 | graph_conv_second = []
275 | for (graph_id, first, second) in zip(idx, constituent_labels_first, constituent_labels_second):
276 | curr_first_g = all_graph[0][int(graph_id.item())].to("cuda")
277 | first_mask = first != -1
278 | first_label = first[first_mask]
279 | first_cons_node_feature = self.cons_type_embeddings(first_label)
280 | curr_first_g_conv = self.graph_embedding(curr_first_g, first_cons_node_feature)
281 | graph_conv_first.append(curr_first_g_conv)
282 |
283 | curr_second_g = all_graph[1][int(graph_id.item())].to("cuda")
284 | second_mask = second != -1
285 | second_label = second[second_mask]
286 | second_cons_node_feature = self.cons_type_embeddings(second_label)
287 | curr_second_g_conv = self.graph_embedding(curr_second_g, second_cons_node_feature)
288 | graph_conv_second.append(curr_second_g_conv)
289 |
290 | # graph_conv_reult = torch.stack(graph_conv_reult, dim=0)
291 | # cls = output[:, 0, : ] -> (4, 768)
292 | graph_conv_first = torch.stack(graph_conv_first, dim=0)
293 | graph_conv_second = torch.stack(graph_conv_second, dim=0)
294 |
295 | cls_token = output[:, 0, :].unsqueeze(dim=1)
296 | # cls_graph_concat = torch.cat([cls_token, graph_conv_first], dim=-1)
297 | #cls_graph_concat = torch.cat([graph_conv_first,cls_token], dim=-1)
298 | cls_graph_concat = torch.cat([cls_token, graph_conv_first, graph_conv_second], dim=-1)
299 | logit = self.claim_layer(cls_graph_concat)
300 | logit = logit.squeeze(dim=1)
301 | if labels is not None:
302 | loss_func = nn.CrossEntropyLoss()
303 | loss = loss_func(logit, labels)
304 | return loss, self.softmax(logit)
305 | else:
306 | return logit
307 |
308 |
309 | class RobertaForClassification(RobertaPreTrainedModel):
310 | def __init__(self, config):
311 | super().__init__(config)
312 | self.num_labels = config.num_labels
313 | self.hidden_size = config.hidden_size
314 | self.roberta = RobertaModel(config)
315 | self.claim_layer = nn.Sequential(
316 | nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size),
317 | nn.ReLU(),
318 | nn.Linear(in_features=self.hidden_size, out_features=self.num_labels),
319 | )
320 | self.softmax = nn.Softmax(dim=-1)
321 |
322 | def forward(self, idx=None, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None):
323 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
324 | output = output.last_hidden_state
325 |
326 | logit = self.claim_layer(output[:, 0, :])
327 | if labels is not None:
328 | loss_func = nn.CrossEntropyLoss()
329 | loss = loss_func(logit, labels)
330 | return loss, self.softmax(logit)
331 | else:
332 | return logit
333 |
334 |
335 | class RobertaForStanceClassification(RobertaPreTrainedModel):
336 | def __init__(self, config):
337 | super().__init__(config)
338 | self.num_labels = config.num_labels
339 | self.hidden_size = config.hidden_size
340 | self.roberta = RobertaModel(config)
341 | self.claim_layer = nn.Sequential(
342 | nn.Linear(in_features=self.hidden_size, out_features=self.hidden_size),
343 | nn.ReLU(),
344 | nn.Linear(in_features=self.hidden_size, out_features=self.num_labels),
345 | )
346 | self.softmax = nn.Softmax(dim=-1)
347 |
348 | def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
349 | output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
350 | output = output.last_hidden_state
351 |
352 | logit = self.claim_layer(output[:, 0, :])
353 | if labels is not None:
354 | loss_func = nn.CrossEntropyLoss()
355 | loss = loss_func(logit, labels)
356 | return loss, self.softmax(logit)
357 | else:
358 | return logit
359 |
360 |
361 | class RobertaForSTANCY(RobertaPreTrainedModel):
362 | def __init__(self, config):
363 | super().__init__(config)
364 | self.num_labels = config.num_labels
365 | self.hidden_size = config.hidden_size
366 | self.roberta = RobertaModel(config)
367 | self.claim_layer = nn.Sequential(
368 | nn.Linear(in_features=self.hidden_size+1, out_features=self.hidden_size+1),
369 | nn.ReLU(),
370 | nn.Linear(in_features=self.hidden_size+1, out_features=self.num_labels),
371 | )
372 | self.softmax = nn.Softmax(dim=-1)
373 | self.cosine = nn.CosineSimilarity()
374 |
375 | def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, sim_labels=None):
376 |
377 | output_combine = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
378 | #output_combine = output_combine.last_hidden_state
379 | output_combine = output_combine.pooler_output
380 | sent_attention_mask = (1-token_type_ids) * attention_mask
381 | output_sent = self.roberta(input_ids=input_ids, attention_mask=sent_attention_mask)
382 | #output_sent = output_sent.last_hidden_state
383 | output_sent = output_sent.pooler_output
384 | cos_sim = self.cosine(output_combine, output_sent).unsqueeze(1)
385 | combined = torch.cat([output_combine, cos_sim], dim=1)
386 |
387 | logit = self.claim_layer(combined)
388 |
389 | if labels is not None:
390 | loss_func = nn.CrossEntropyLoss()
391 | loss_bert = loss_func(logit, labels)
392 |
393 | loss_cosine = nn.CosineEmbeddingLoss()
394 | loss_claim = loss_cosine(output_combine, output_sent, sim_labels)
395 | loss = loss_bert + loss_claim
396 | return loss, self.softmax(logit)
397 | else:
398 | return logit
399 |
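400 | # A minimal usage sketch of the plain RobertaForClassification head, assuming
401 | # roberta-base weights and a binary label space (the defaults used by the run_*
402 | # scripts). The claim_layer head is freshly initialised here, so the printed
403 | # probabilities are illustrative only.
404 | if __name__ == "__main__":
405 |     from transformers import AutoConfig, AutoTokenizer
406 |
407 |     config = AutoConfig.from_pretrained("roberta-base", num_labels=2)
408 |     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
409 |     model = RobertaForClassification.from_pretrained("roberta-base", config=config)
410 |
411 |     # Tokenise one candidate claim sentence and run a labelled forward pass.
412 |     batch = tokenizer(["Cloning research should be supported."], return_tensors="pt")
413 |     loss, probs = model(input_ids=batch["input_ids"],
414 |                         attention_mask=batch["attention_mask"],
415 |                         labels=torch.tensor([1]))
416 |     print(loss.item(), probs)  # scalar cross-entropy loss, probabilities of shape (1, 2)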
--------------------------------------------------------------------------------
/mine_next/run_base.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from mine_next.functions.main_function import train, evaluate
3 | import random, os
4 | import numpy as np
5 | import torch
6 | from transformers import AutoConfig, AutoTokenizer
7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification
8 | from mine_next.model.modeling import RobertaReflectGraphClassification
9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab
10 |
11 | def create_model(args):
12 | config = AutoConfig.from_pretrained(
13 | args.language_model,
14 | num_labels=args.num_labels,
15 | max_length=args.max_length,
16 | # local_files_only=True
17 | )
18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False)
19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size)
20 | setattr(config, 'cons_tag2id', args.cons_tag2id)
21 | model = RobertaForClassification.from_pretrained(
22 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)),
23 | config=config,
24 | # local_files_only=True
25 | )
26 | return config, tokenizer, model
27 |
28 |
29 | def set_seed(args):
30 | random.seed(args.seed)
31 | np.random.seed(args.seed)
32 | torch.manual_seed(args.seed)
33 | if torch.cuda.is_available():
34 | torch.cuda.manual_seed_all(args.seed)
35 |
36 |
37 | def main(args):
38 | set_seed(args)
39 | config, tokenizer, model = create_model(args)
40 | model.to(args.device)
41 |
42 | if args.mode == 'train':
43 | train(args, model, tokenizer)
44 | elif args.mode == 'dev':
45 | evaluate(args, model, tokenizer)
46 |
47 |
48 | if __name__ == '__main__':
49 | parser = argparse.ArgumentParser(description='main')
50 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt')
51 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt')
52 |     parser.add_argument('--save_dir', type=str, default='only_sentence_base_3e_5') # dir to save checkpoints to
53 |     parser.add_argument('--output_dir', type=str, default='only_sentence_base_3e_5') # dir to load the model from (paired with --checkpoint)
54 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt')
55 |     parser.add_argument('--init_weight', type=bool, default=True) # False: load a trained checkpoint; True: plain pretrained RoBERTa
56 | parser.add_argument('--device', type=str, default="cuda")
57 | #model
58 | parser.add_argument('--num_labels', type=int, default=2)
59 | parser.add_argument('--max_length', type=int, default=512)
60 | parser.add_argument('--batch_size', type=int, default=16)
61 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt'))
62 | parser.add_argument('--cons_hidden_size', type=int, default=128)
63 |
64 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
65 | parser.add_argument("--learning_rate", type=float, default=3e-5)
66 | parser.add_argument("--warmup_steps", type=int, default=0)
67 | parser.add_argument("--max_grad_norm", type=float, default=5.0)
68 | parser.add_argument('--mode', type=str, default='dev')
69 | parser.add_argument('--seed', type=int, default=42)
70 | parser.add_argument('--checkpoint', type=int, default=1)
71 | parser.add_argument('--language_model', type=str, default='roberta-base')
72 | parser.add_argument("--epoch", type=int, default=15)
73 |
74 | args = parser.parse_args()
75 | main(args)
76 |
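77 | # Illustrative invocations, assuming the IAM data layout used by the defaults above
78 | # (claim train/dev files under ../data/IAM/claims/):
79 | #   train:     python run_base.py --mode train --save_dir only_sentence_base_3e_5
80 | #   evaluate:  python run_base.py --mode dev --output_dir only_sentence_base_3e_5 --checkpoint 1
81 | # Caveat: --init_weight is declared with type=bool, so any non-empty command-line value
82 | # (including "False") parses as True; to evaluate a saved checkpoint, change the default
83 | # in this script rather than passing the flag.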
--------------------------------------------------------------------------------
/mine_next/run_debug.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from mine_next.functions.main_function2 import train, evaluate
3 | import random, os
4 | import numpy as np
5 | import torch
6 | from transformers import AutoConfig, AutoTokenizer
7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification
8 | from mine_next.model.modeling import RobertaReflectGraphClassification
9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab
10 |
11 |
12 | def create_model(args):
13 | config = AutoConfig.from_pretrained(
14 | args.language_model,
15 | num_labels=args.num_labels,
16 | max_length=args.max_length,
17 | # local_files_only=True
18 | )
19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False)
20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size)
21 | setattr(config, 'cons_tag2id', args.cons_tag2id)
22 | model = RobertaReflectGraphClassification.from_pretrained(
23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)),
24 | config=config,
25 | # local_files_only=True
26 | )
27 | return config, tokenizer, model
28 |
29 | def set_seed(args):
30 | random.seed(args.seed)
31 | np.random.seed(args.seed)
32 | torch.manual_seed(args.seed)
33 | if torch.cuda.is_available():
34 | torch.cuda.manual_seed_all(args.seed)
35 |
36 | def main(args):
37 | set_seed(args)
38 | config, tokenizer, model = create_model(args)
39 | model.to(args.device)
40 |
41 | if args.mode == 'train':
42 | train(args, model, tokenizer)
43 | elif args.mode == 'dev':
44 | evaluate(args, model, tokenizer)
45 |
46 | if __name__ == '__main__':
47 | parser = argparse.ArgumentParser(description='main')
48 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt')
49 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt')
50 |     parser.add_argument('--save_dir', type=str, default='only_sentence_base_graph_cons_256') # dir to save checkpoints to
51 |     parser.add_argument('--output_dir', type=str, default='only_sentence_base_graph_cons_256') # dir to load the model from (paired with --checkpoint)
52 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt')
53 |     parser.add_argument('--init_weight', type=bool, default=True) # False: load a trained checkpoint; True: plain pretrained RoBERTa
54 | parser.add_argument('--device', type=str, default="cuda")
55 | #model
56 | parser.add_argument('--num_labels', type=int, default=2)
57 | parser.add_argument('--max_length', type=int, default=512)
58 | parser.add_argument('--batch_size', type=int, default=16)
59 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt'))
60 | parser.add_argument('--cons_hidden_size', type=int, default=256)
61 |
62 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
63 | parser.add_argument("--learning_rate", type=float, default=3e-5)
64 | parser.add_argument("--warmup_steps", type=int, default=0)
65 | parser.add_argument("--max_grad_norm", type=float, default=5.0)
66 | parser.add_argument('--mode', type=str, default='train')
67 | parser.add_argument('--seed', type=int, default=42)
68 | parser.add_argument('--checkpoint', type=int, default=5)
69 | parser.add_argument('--language_model', type=str, default='roberta-base')
70 | parser.add_argument("--epoch", type=int, default=30)
71 |
72 | args = parser.parse_args()
73 | main(args)
74 |
--------------------------------------------------------------------------------
/mine_next/run_grad1.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from mine_next.functions.main_function2 import train, evaluate, test
3 | import random, os
4 | import numpy as np
5 | import torch
6 | from transformers import AutoConfig, AutoTokenizer
7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification
8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification
9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab
10 |
11 | def create_model(args):
12 | config = AutoConfig.from_pretrained(
13 | args.language_model,
14 | num_labels=args.num_labels,
15 | max_length=args.max_length,
16 | # local_files_only=True
17 | )
18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False)
19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size)
20 | setattr(config, 'feature_size', args.feature_size)
21 | setattr(config, 'cons_tag2id', args.cons_tag2id)
22 | model = RobertaReflectGraphClassification.from_pretrained(
23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)),
24 | config=config,
25 | # local_files_only=True
26 | )
27 | return config, tokenizer, model
28 |
29 |
30 | def set_seed(args):
31 | random.seed(args.seed)
32 | np.random.seed(args.seed)
33 | torch.manual_seed(args.seed)
34 | if torch.cuda.is_available():
35 | torch.cuda.manual_seed_all(args.seed)
36 |
37 |
38 | def main(args):
39 | set_seed(args)
40 | config, tokenizer, model = create_model(args)
41 | model.to(args.device)
42 |
43 | if args.mode == 'train':
44 | train(args, model, tokenizer)
45 | elif args.mode == 'dev':
46 | evaluate(args, model, tokenizer)
47 | elif args.mode == 'test':
48 | test(args, model, tokenizer)
49 |
50 |
51 | if __name__ == '__main__':
52 | parser = argparse.ArgumentParser(description='main')
53 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt')
54 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt')
55 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt')
56 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt')
57 | # parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_first_sent.json')
58 | # parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_first_sent.json')
59 | # parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_first_sent.json')
60 | parser.add_argument('--train_pseudo_topic', type=str,
61 | default='../data/IAM/origin/train_pseudo_topic_with_bertopic_div0.2.json')
62 | parser.add_argument('--dev_pseudo_topic', type=str,
63 | default='../data/IAM/origin/dev_pseudo_topic_with_bertopic_div0.2.json')
64 | parser.add_argument('--test_pseudo_topic', type=str,
65 | default='../data/IAM/origin/test_pseudo_topic_with_bertopic_div0.2.json')
66 |     parser.add_argument('--init_weight', type=bool, default=True) # False: load a trained checkpoint; True: plain pretrained RoBERTa
67 | parser.add_argument('--device', type=str, default="cuda")
68 | #model
69 | parser.add_argument('--num_labels', type=int, default=2)
70 | parser.add_argument('--max_length', type=int, default=256)
71 | parser.add_argument('--batch_size', type=int, default=32)
72 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt'))
73 | parser.add_argument('--cons_hidden_size', type=int, default=768)
74 | parser.add_argument('--feature_size', type=int, default=384)
75 |
76 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
77 | parser.add_argument("--learning_rate", type=float, default=3e-5)
78 | parser.add_argument("--warmup_steps", type=int, default=0)
79 | parser.add_argument("--max_grad_norm", type=float, default=5.0)
80 | parser.add_argument('--mode', type=str, default='train')
81 | parser.add_argument('--seed', type=int, default=42)
82 | parser.add_argument('--checkpoint', type=int, default=1)
83 | parser.add_argument('--language_model', type=str, default='roberta-base')
84 | parser.add_argument("--epoch", type=int, default=40)
85 |     parser.add_argument('--save_dir', type=str, default='pseudo_topic_with_bertopic_div02_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # dir to save checkpoints to
86 |     parser.add_argument('--output_dir', type=str, default='pseudo_topic_with_bertopic_div02_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # dir to load the model from
87 |
88 | args = parser.parse_args()
89 | main(args)
90 |
--------------------------------------------------------------------------------
/mine_next/run_grand.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from mine_next.functions.main_function2 import train, evaluate, test
3 | import random, os
4 | import numpy as np
5 | import torch
6 | from transformers import AutoConfig, AutoTokenizer
7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification
8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification
9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab
10 |
11 | def create_model(args):
12 | config = AutoConfig.from_pretrained(
13 | args.language_model,
14 | num_labels=args.num_labels,
15 | max_length=args.max_length,
16 | # local_files_only=True
17 | )
18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False)
19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size)
20 | setattr(config, 'feature_size', args.feature_size)
21 | setattr(config, 'cons_tag2id', args.cons_tag2id)
22 | model = RobertaReflectGraphClassification.from_pretrained(
23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)),
24 | config=config,
25 | # local_files_only=True
26 | )
27 | return config, tokenizer, model
28 |
29 |
30 | def set_seed(args):
31 | random.seed(args.seed)
32 | np.random.seed(args.seed)
33 | torch.manual_seed(args.seed)
34 | if torch.cuda.is_available():
35 | torch.cuda.manual_seed_all(args.seed)
36 |
37 |
38 | def main(args):
39 | set_seed(args)
40 | config, tokenizer, model = create_model(args)
41 | model.to(args.device)
42 |
43 | if args.mode == 'train':
44 | train(args, model, tokenizer)
45 | elif args.mode == 'dev':
46 | evaluate(args, model, tokenizer)
47 | elif args.mode == 'test':
48 | test(args, model, tokenizer)
49 |
50 |
51 | if __name__ == '__main__':
52 | parser = argparse.ArgumentParser(description='main')
53 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt')
54 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt')
55 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt')
56 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt')
57 | # parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_first_sent.json')
58 | # parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_first_sent.json')
59 | # parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_first_sent.json')
60 | parser.add_argument('--train_pseudo_topic', type=str,
61 | default='../data/IAM/origin/train_pseudo_topic_with_bertopic_div0.1.json')
62 | parser.add_argument('--dev_pseudo_topic', type=str,
63 | default='../data/IAM/origin/dev_pseudo_topic_with_bertopic_div0.1.json')
64 | parser.add_argument('--test_pseudo_topic', type=str,
65 | default='../data/IAM/origin/test_pseudo_topic_with_bertopic_div0.1.json')
66 |     parser.add_argument('--init_weight', type=bool, default=True) # False: load a trained checkpoint; True: plain pretrained RoBERTa
67 | parser.add_argument('--device', type=str, default="cuda")
68 | #model
69 | parser.add_argument('--num_labels', type=int, default=2)
70 | parser.add_argument('--max_length', type=int, default=256)
71 | parser.add_argument('--batch_size', type=int, default=32)
72 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt'))
73 | parser.add_argument('--cons_hidden_size', type=int, default=768)
74 | parser.add_argument('--feature_size', type=int, default=384)
75 |
76 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
77 | parser.add_argument("--learning_rate", type=float, default=3e-5)
78 | parser.add_argument("--warmup_steps", type=int, default=0)
79 | parser.add_argument("--max_grad_norm", type=float, default=5.0)
80 | parser.add_argument('--mode', type=str, default='train')
81 | parser.add_argument('--seed', type=int, default=42)
82 | parser.add_argument('--checkpoint', type=int, default=1)
83 | parser.add_argument('--language_model', type=str, default='roberta-base')
84 | parser.add_argument("--epoch", type=int, default=40)
85 |     parser.add_argument('--save_dir', type=str, default='pseudo_topic_with_bertopic_div01_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # dir to save checkpoints to
86 |     parser.add_argument('--output_dir', type=str, default='pseudo_topic_with_bertopic_div01_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # dir to load the model from
87 |
88 | args = parser.parse_args()
89 | main(args)
90 |
--------------------------------------------------------------------------------
/mine_next/run_grand2:
--------------------------------------------------------------------------------
1 | import argparse
2 | from mine_next.functions.main_function2 import train, evaluate, test
3 | import random, os
4 | import numpy as np
5 | import torch
6 | from transformers import AutoConfig, AutoTokenizer
7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification
8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification
9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab
10 |
11 |
12 | def create_model(args):
13 | config = AutoConfig.from_pretrained(
14 | args.language_model,
15 | num_labels=args.num_labels,
16 | max_length=args.max_length,
17 | # local_files_only=True
18 | )
19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False)
20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size)
21 | setattr(config, 'feature_size', args.feature_size)
22 | setattr(config, 'cons_tag2id', args.cons_tag2id)
23 | model = RobertaReflectGraphWithGrandEdgeClassification.from_pretrained(
24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)),
25 | config=config,
26 | # local_files_only=True
27 | )
28 | return config, tokenizer, model
29 |
30 | def set_seed(args):
31 | random.seed(args.seed)
32 | np.random.seed(args.seed)
33 | torch.manual_seed(args.seed)
34 | if torch.cuda.is_available():
35 | torch.cuda.manual_seed_all(args.seed)
36 |
37 | def main(args):
38 | set_seed(args)
39 | config, tokenizer, model = create_model(args)
40 | model.to(args.device)
41 |
42 | if args.mode == 'train':
43 | train(args, model, tokenizer)
44 | elif args.mode == 'dev':
45 | evaluate(args, model, tokenizer)
46 | elif args.mode == 'test':
47 | test(args, model, tokenizer)
48 |
49 | if __name__ == '__main__':
50 | parser = argparse.ArgumentParser(description='main')
51 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt')
52 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt')
53 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt')
54 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt')
55 | parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_bertopic.json')
56 | parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_bertopic.json')
57 | parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_bertopic.json')
58 |     parser.add_argument('--init_weight', type=bool, default=True) # False: load a trained checkpoint; True: plain pretrained RoBERTa
59 | parser.add_argument('--device', type=str, default="cuda")
60 | #model
61 | parser.add_argument('--num_labels', type=int, default=2)
62 | parser.add_argument('--max_length', type=int, default=256)
63 | parser.add_argument('--batch_size', type=int, default=32)
64 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt'))
65 | parser.add_argument('--cons_hidden_size', type=int, default=768)
66 | parser.add_argument('--feature_size', type=int, default=384)
67 |
68 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
69 | parser.add_argument("--learning_rate", type=float, default=2e-5)
70 | parser.add_argument("--warmup_steps", type=int, default=0)
71 | parser.add_argument("--max_grad_norm", type=float, default=5.0)
72 | parser.add_argument('--mode', type=str, default='train')
73 | parser.add_argument('--seed', type=int, default=42)
74 | parser.add_argument('--checkpoint', type=int, default=5)
75 | parser.add_argument('--language_model', type=str, default='roberta-base')
76 | parser.add_argument("--epoch", type=int, default=40)
77 |     parser.add_argument('--save_dir', type=str, default='pseudo_topic_with_bertopic_sentence_base_two_graph_cons_768_feat_384_max_length_256_lr_2e5') # dir to save checkpoints to
78 |     parser.add_argument('--output_dir', type=str, default='pseudo_topic_with_bertopic_sentence_base_two_graph_cons_768_feat_384_max_length_256_lr_2e5') # dir to load the model from
79 |
80 | args = parser.parse_args()
81 | main(args)
82 |
--------------------------------------------------------------------------------
/mine_next/run_grand2.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from mine_next.functions.main_function2 import train, evaluate, test
3 | import random, os
4 | import numpy as np
5 | import torch
6 | from transformers import AutoConfig, AutoTokenizer
7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification
8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification
9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab
10 |
11 | def create_model(args):
12 | config = AutoConfig.from_pretrained(
13 | args.language_model,
14 | num_labels=args.num_labels,
15 | max_length=args.max_length,
16 | # local_files_only=True
17 | )
18 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False)
19 | setattr(config, 'cons_hidden_size', args.cons_hidden_size)
20 | setattr(config, 'feature_size', args.feature_size)
21 | setattr(config, 'cons_tag2id', args.cons_tag2id)
22 | model = RobertaReflectGraphClassification.from_pretrained(
23 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)),
24 | config=config,
25 | # local_files_only=True
26 | )
27 | return config, tokenizer, model
28 |
29 |
30 | def set_seed(args):
31 | random.seed(args.seed)
32 | np.random.seed(args.seed)
33 | torch.manual_seed(args.seed)
34 | if torch.cuda.is_available():
35 | torch.cuda.manual_seed_all(args.seed)
36 |
37 |
38 | def main(args):
39 | set_seed(args)
40 | config, tokenizer, model = create_model(args)
41 | model.to(args.device)
42 |
43 | if args.mode == 'train':
44 | train(args, model, tokenizer)
45 | elif args.mode == 'dev':
46 | evaluate(args, model, tokenizer)
47 | elif args.mode == 'test':
48 | test(args, model, tokenizer)
49 |
50 |
51 | if __name__ == '__main__':
52 | parser = argparse.ArgumentParser(description='main')
53 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt')
54 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt')
55 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt')
56 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt')
57 | parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_bertopic.json')
58 | parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_bertopic.json')
59 | parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_bertopic.json')
60 |     parser.add_argument('--init_weight', type=bool, default=False) # False: load a trained checkpoint; True: plain pretrained RoBERTa
61 | parser.add_argument('--device', type=str, default="cuda")
62 | #model
63 | parser.add_argument('--num_labels', type=int, default=2)
64 | parser.add_argument('--max_length', type=int, default=256)
65 | parser.add_argument('--batch_size', type=int, default=32)
66 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt'))
67 | parser.add_argument('--cons_hidden_size', type=int, default=768)
68 | parser.add_argument('--feature_size', type=int, default=384)
69 |
70 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
71 | parser.add_argument("--learning_rate", type=float, default=3e-5)
72 | parser.add_argument("--warmup_steps", type=int, default=0)
73 | parser.add_argument("--max_grad_norm", type=float, default=5.0)
74 | parser.add_argument('--mode', type=str, default='test')
75 | parser.add_argument('--seed', type=int, default=42)
76 | parser.add_argument('--checkpoint', type=int, default=24)
77 | parser.add_argument('--language_model', type=str, default='roberta-base')
78 | parser.add_argument("--epoch", type=int, default=40)
79 |     parser.add_argument('--save_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # dir to save checkpoints to
80 |     parser.add_argument('--output_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5') # dir to load the model from
81 |
82 | args = parser.parse_args()
83 | main(args)
84 |
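85 | # With --init_weight defaulting to False, create_model() restores the graph classifier
86 | # from <output_dir>/checkpoint-<checkpoint>, i.e. with the defaults above:
87 | #   only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5/checkpoint-24
88 | # Illustrative test-set run, assuming that checkpoint directory exists:
89 | #   python run_grand2.py --mode test --checkpoint 24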
--------------------------------------------------------------------------------
/mine_next/run_grand3_test.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from mine_next.functions.main_function2 import train, evaluate, test
3 | import random, os
4 | import numpy as np
5 | import torch
6 | from transformers import AutoConfig, AutoTokenizer
7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification
8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification
9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab
10 |
11 |
12 | def create_model(args):
13 | config = AutoConfig.from_pretrained(
14 | args.language_model,
15 | num_labels=args.num_labels,
16 | max_length=args.max_length,
17 | # local_files_only=True
18 | )
19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False)
20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size)
21 | setattr(config, 'feature_size', args.feature_size)
22 | setattr(config, 'cons_tag2id', args.cons_tag2id)
23 | model = RobertaReflectGraphWithGrandEdgeClassification.from_pretrained(
24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)),
25 | config=config,
26 | # local_files_only=True
27 | )
28 | return config, tokenizer, model
29 |
30 | def set_seed(args):
31 | random.seed(args.seed)
32 | np.random.seed(args.seed)
33 | torch.manual_seed(args.seed)
34 | if torch.cuda.is_available():
35 | torch.cuda.manual_seed_all(args.seed)
36 |
37 | def main(args):
38 | set_seed(args)
39 | config, tokenizer, model = create_model(args)
40 | model.to(args.device)
41 |
42 | if args.mode == 'train':
43 | train(args, model, tokenizer)
44 | elif args.mode == 'dev':
45 | evaluate(args, model, tokenizer)
46 | elif args.mode == 'test':
47 | test(args, model, tokenizer)
48 |
49 | if __name__ == '__main__':
50 | parser = argparse.ArgumentParser(description='main')
51 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt')
52 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt')
53 | parser.add_argument('--claim_test', type=str, default='../data/IAM/claims/test.txt')
54 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt')
55 | parser.add_argument('--train_pseudo_topic', type=str, default='../data/IAM/origin/train_pseudo_topic_with_bertopic.json')
56 | parser.add_argument('--dev_pseudo_topic', type=str, default='../data/IAM/origin/dev_pseudo_topic_with_bertopic.json')
57 | parser.add_argument('--test_pseudo_topic', type=str, default='../data/IAM/origin/test_pseudo_topic_with_bertopic.json')
58 |     parser.add_argument('--init_weight', type=bool, default=False) # False: load a trained checkpoint; True: plain pretrained RoBERTa
59 | parser.add_argument('--device', type=str, default="cuda")
60 | #model
61 | parser.add_argument('--num_labels', type=int, default=2)
62 | parser.add_argument('--max_length', type=int, default=256)
63 | parser.add_argument('--batch_size', type=int, default=32)
64 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt'))
65 | parser.add_argument('--cons_hidden_size', type=int, default=768)
66 | parser.add_argument('--feature_size', type=int, default=384)
67 |
68 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
69 | parser.add_argument("--learning_rate", type=float, default=3e-5)
70 | parser.add_argument("--warmup_steps", type=int, default=0)
71 | parser.add_argument("--max_grad_norm", type=float, default=5.0)
72 | parser.add_argument('--mode', type=str, default='test')
73 | parser.add_argument('--seed', type=int, default=42)
74 | parser.add_argument('--checkpoint', type=int, default=7)
75 | parser.add_argument('--language_model', type=str, default='roberta-base')
76 | parser.add_argument("--epoch", type=int, default=40)
77 |     parser.add_argument('--save_dir', type=str, default='only_sentence_base_only_grand_cons_768_feat_384_max_length_256_lr_3e5') # dir to save checkpoints to
78 |     parser.add_argument('--output_dir', type=str, default='only_sentence_base_only_grand_cons_768_feat_384_max_length_256_lr_3e5') # dir to load the model from
79 |
80 | args = parser.parse_args()
81 | main(args)
82 |
--------------------------------------------------------------------------------
/mine_next/run_one.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from mine_next.functions.main_function2 import train, evaluate
3 | import random, os
4 | import numpy as np
5 | import torch
6 | from transformers import AutoConfig, AutoTokenizer
7 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification
8 | from mine_next.model.modeling import RobertaReflectGraphClassification, RobertaReflectGraphWithGrandEdgeClassification
9 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab
10 |
11 |
12 | def create_model(args):
13 | config = AutoConfig.from_pretrained(
14 | args.language_model,
15 | num_labels=args.num_labels,
16 | max_length=args.max_length,
17 | # local_files_only=True
18 | )
19 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False)
20 | setattr(config, 'cons_hidden_size', args.cons_hidden_size)
21 | setattr(config, 'feature_size', args.feature_size)
22 | setattr(config, 'cons_tag2id', args.cons_tag2id)
23 | model = RobertaReflectGraphClassification.from_pretrained(
24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)),
25 | config=config,
26 | # local_files_only=True
27 | )
28 | return config, tokenizer, model
29 |
30 | def set_seed(args):
31 | random.seed(args.seed)
32 | np.random.seed(args.seed)
33 | torch.manual_seed(args.seed)
34 | if torch.cuda.is_available():
35 | torch.cuda.manual_seed_all(args.seed)
36 |
37 | def main(args):
38 | set_seed(args)
39 | config, tokenizer, model = create_model(args)
40 | model.to(args.device)
41 |
42 | if args.mode == 'train':
43 | train(args, model, tokenizer)
44 | elif args.mode == 'dev':
45 | evaluate(args, model, tokenizer)
46 |
47 | if __name__ == '__main__':
48 | parser = argparse.ArgumentParser(description='main')
49 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt')
50 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt')
51 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt')
52 |     parser.add_argument('--init_weight', type=bool, default=True) # False: load a trained checkpoint; True: plain pretrained RoBERTa
53 | parser.add_argument('--device', type=str, default="cuda")
54 | #model
55 | parser.add_argument('--num_labels', type=int, default=2)
56 | parser.add_argument('--max_length', type=int, default=256)
57 | parser.add_argument('--batch_size', type=int, default=32)
58 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt'))
59 | parser.add_argument('--cons_hidden_size', type=int, default=768)
60 | parser.add_argument('--feature_size', type=int, default=384)
61 |
62 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
63 | parser.add_argument("--learning_rate", type=float, default=3e-5)
64 | parser.add_argument("--warmup_steps", type=int, default=0)
65 | parser.add_argument("--max_grad_norm", type=float, default=5.0)
66 | parser.add_argument('--mode', type=str, default='train')
67 | parser.add_argument('--seed', type=int, default=42)
68 | parser.add_argument('--checkpoint', type=int, default=8)
69 | parser.add_argument('--language_model', type=str, default='roberta-base')
70 | parser.add_argument("--epoch", type=int, default=30)
71 |     parser.add_argument('--output_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5_with_pseudo_topic') # dir to load the model from
72 |     parser.add_argument('--save_dir', type=str, default='only_sentence_base_graph_normal_only_pc_cons_768_feat_384_max_length_256_lr_3e5_with_pseudo_topic') # dir to save checkpoints to
73 |
74 |
75 | args = parser.parse_args()
76 | main(args)
77 |
--------------------------------------------------------------------------------
/mine_next/run_stance.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | #from mine_next.functions.main_function import train, evaluate
3 | from mine_next.functions.stance_main_func import train, evaluate
4 | import random, os
5 | import numpy as np
6 | import torch
7 | from transformers import AutoConfig, AutoTokenizer
8 | from mine_next.model.modeling import RobertaForClassification, RobertaForSTANCY, RobertaForStanceClassification
9 | from mine_next.model.modeling import RobertaReflectGraphClassification
10 | from mine_next.functions.sent_to_graph import get_cons_tag_vocab
11 |
12 |
13 | def create_model(args):
14 | config = AutoConfig.from_pretrained(
15 | args.language_model,
16 | num_labels=args.num_labels,
17 | max_length=args.max_length,
18 | # local_files_only=True
19 | )
20 | tokenizer = AutoTokenizer.from_pretrained(args.language_model, do_lower_case=False, use_fast=False)
21 | setattr(config, 'cons_hidden_size', args.cons_hidden_size)
22 | setattr(config, 'cons_tag2id', args.cons_tag2id)
23 | model = RobertaForClassification.from_pretrained(
24 | args.language_model if args.init_weight else os.path.join(args.output_dir, "checkpoint-{}".format(args.checkpoint)),
25 | config=config,
26 | # local_files_only=True
27 | )
28 | return config, tokenizer, model
29 |
30 | def set_seed(args):
31 | random.seed(args.seed)
32 | np.random.seed(args.seed)
33 | torch.manual_seed(args.seed)
34 | if torch.cuda.is_available():
35 | torch.cuda.manual_seed_all(args.seed)
36 |
37 | def main(args):
38 | set_seed(args)
39 | config, tokenizer, model = create_model(args)
40 | model.to(args.device)
41 |
42 | if args.mode == 'train':
43 | train(args, model, tokenizer)
44 | elif args.mode == 'dev':
45 | evaluate(args, model, tokenizer)
46 |
47 | if __name__ == '__main__':
48 | parser = argparse.ArgumentParser(description='main')
49 | parser.add_argument('--claim_train', type=str, default='../data/IAM/claims/train.txt')
50 | parser.add_argument('--claim_dev', type=str, default='../data/IAM/claims/dev.txt')
51 | parser.add_argument('--stance_train', type=str, default='../data/IAM/stance/train.txt')
52 | parser.add_argument('--stance_dev', type=str, default='../data/IAM/stance/dev.txt')
53 |     parser.add_argument('--save_dir', type=str, default='stance_test') # dir to save checkpoints to
54 |     parser.add_argument('--output_dir', type=str, default='stance_test') # dir to load the model from (paired with --checkpoint)
55 | parser.add_argument('--constituent_gold_vocab', type=str, default='../data/IAM/constituent_gold_vocab.txt')
56 |     parser.add_argument('--init_weight', type=bool, default=True) # False: load a trained checkpoint; True: plain pretrained RoBERTa
57 | parser.add_argument('--device', type=str, default="cuda")
58 | #model
59 | parser.add_argument('--num_labels', type=int, default=2)
60 | parser.add_argument('--max_length', type=int, default=256)
61 | parser.add_argument('--batch_size', type=int, default=32)
62 | parser.add_argument('--cons_tag2id', type=dict, default=get_cons_tag_vocab('../data/IAM/constituent_gold_vocab.txt'))
63 | parser.add_argument('--cons_hidden_size', type=int, default=128)
64 |
65 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
66 | parser.add_argument("--learning_rate", type=float, default=3e-5)
67 | parser.add_argument("--warmup_steps", type=int, default=0)
68 | parser.add_argument("--max_grad_norm", type=float, default=5.0)
69 | parser.add_argument('--mode', type=str, default='train')
70 | parser.add_argument('--seed', type=int, default=42)
71 | parser.add_argument('--checkpoint', type=int, default=1)
72 | parser.add_argument('--language_model', type=str, default='roberta-base')
73 | parser.add_argument("--epoch", type=int, default=15)
74 |
75 | args = parser.parse_args()
76 | main(args)
77 |
--------------------------------------------------------------------------------