├── model.png ├── config.py ├── README.md └── src ├── model ├── main.py ├── model.py └── main_functions.py └── functions └── preprocessing.py /model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KUNLP/KU_Sentiment-Analysis/HEAD/model.png -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | root_dir = os.path.dirname(os.path.abspath( __file__ )) 4 | cache_dir = os.path.join(root_dir, "cache") 5 | output_dir = os.path.join(root_dir, "KuELECTRA_base_no_curri") 6 | data_dir = os.path.join(root_dir, "data") 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sentiment analysis model for Korean movie review 2 | Code for HCLT 2020 paper: *[Movie Revies Sentiment Analysis Considering the Order in which Sentiment Words Appear](http://koreascience.or.kr/article/CFKO202030060610812.page?&lang=ko)* 3 | 4 | ## Dependencies 5 | - python 3.7 6 | - PyTorch 1.5.0 7 | - Transformers 2.11.0 8 | - Seqeval 1.2.2 9 | 10 | 11 | All code only supports running on Linux. 12 | 13 | # Model Structure 14 | 15 | 16 | 17 | 18 | 19 | ## Data 20 | 21 | Naver sentiment movie corpus v1.0: *[NSMC](https://github.com/e9t/nsmc)* 22 | 23 | ## Train & Test 24 | 25 | ``` 26 | python main.py 27 | ``` 28 | 29 | ## Results on NSMC 30 | 31 | | Model | Acc | 32 | |---|--------- | 33 | | KUSA | 90.81% | 34 | -------------------------------------------------------------------------------- /src/model/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import config 3 | from src.model.main_functions import Helper 4 | 5 | 6 | if __name__ == "__main__": 7 | 8 | if not os.path.exists(config.cache_dir): 9 | os.makedirs(config.cache_dir) 10 | if not os.path.exists(config.output_dir): 11 | os.makedirs(config.output_dir) 12 | 13 | config = {"mode": "demo", 14 | "train_data_path": os.path.join(config.data_dir, "stage_last.txt"), 15 | "test_data_path": os.path.join(config.data_dir, "ratings_test_add_sentiScore.txt"), 16 | "analyze_data_path": os.path.join(config.data_dir, "sampling_data_5.txt"), 17 | "cache_dir_path": config.cache_dir, 18 | "model_dir_path": config.output_dir, 19 | "checkpoint": 75675, 20 | "epoch": 5, 21 | "learning_rate": 0.0001, 22 | "dropout_rate": 0.1, 23 | "warmup_steps": 0, 24 | "max_grad_norm": 1.0, 25 | "batch_size": 256, 26 | "max_length": 50, 27 | "lstm_hidden": 256, 28 | "lstm_num_layer": 1, 29 | "bidirectional_flag": True, 30 | "senti_labels": 2, 31 | "score_labels": 7, 32 | "gradient_accumulation_steps": 1, 33 | "weight_decay": 0.0, 34 | "adam_epsilon": 1e-8 35 | } 36 | 37 | helper = Helper(config) 38 | 39 | if config["mode"] == "train": 40 | helper.train() 41 | elif config["mode"] == "test": 42 | helper.test() 43 | elif config["mode"] == "analyze": 44 | helper.analyze() 45 | elif config["mode"] == "demo": 46 | helper.demo() 47 | -------------------------------------------------------------------------------- /src/functions/preprocessing.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | from torch.utils.data import TensorDataset 4 | 5 | 6 | # 학습 or 평가 데이터를 읽어 리스트에 저장 7 | def read_data(file_path, mode): 8 | with open(file_path, "r", encoding="utf8") as inFile: 9 | lines = inFile.readlines() 10 | 11 | datas = [] 12 | for index, line in enumerate(tqdm(lines, desc="read_data")): 13 | # 입력 문장을 \t으로 분리 14 | pieces = line.strip().split("\t") 15 | 16 | if mode == "train" or mode == "test": 17 | # 데이터의 형태가 올바른지 체크 18 | assert len(pieces) == 3 19 | assert len(pieces[0].split(" ")) == len(pieces[2].split(" ")) 20 | sentence, senti_label, score_label = pieces[0], int(pieces[1]), [int(score) + 1 for score in pieces[2].split(" ")] 21 | datas.append((sentence, senti_label, score_label)) 22 | elif mode == "analyze": 23 | sentence, senti_label = pieces[0], int(pieces[1]) 24 | datas.append((sentence, senti_label)) 25 | return datas 26 | 27 | 28 | def convert_data2dataset(datas, tokenizer, max_length, labels, score_labels, mode): 29 | total_input_ids, total_attention_mask, total_token_type_ids, total_senti_labels, total_score_labels, total_senti_seq, total_score_seq, total_word_seq = [], [], [], [], [], [], [], [] 30 | 31 | if mode == "analyze": 32 | total_score_labels = None 33 | for index, data in enumerate(tqdm(datas, desc="convert_data2dataset")): 34 | sentence = "" 35 | senti_label = 0 36 | score_label = [] 37 | if mode == "train" or mode == "test": 38 | sentence, senti_label, score_label = data 39 | elif mode == "analyze": 40 | sentence, senti_label = data 41 | 42 | # tokens = tokenizer.tokenize(sentence) 43 | 44 | # tokens = ["[CLS]"] + tokens 45 | # tokens = tokens[:max_length-1] 46 | # tokens.append("[SEP]") 47 | input_ids = [tokenizer._convert_token_to_id(token) for token in sentence.split()] 48 | assert len(input_ids) <= max_length 49 | 50 | attention_mask = [1] * len(input_ids) 51 | token_type_ids = [0] * len(input_ids) 52 | 53 | padding = [0] * (max_length - len(input_ids)) 54 | 55 | total_word_seq.append(len(input_ids)) 56 | 57 | input_ids += padding 58 | attention_mask += padding 59 | token_type_ids += padding 60 | 61 | total_input_ids.append(input_ids) 62 | total_attention_mask.append(attention_mask) 63 | total_token_type_ids.append(token_type_ids) 64 | total_senti_labels.append(senti_label) 65 | 66 | total_senti_seq.append([i for i in range(labels)]) 67 | total_score_seq.append([i for i in range(score_labels)]) 68 | 69 | if mode == "train" or mode == "test": 70 | score_label += padding 71 | total_score_labels.append(score_label) 72 | 73 | if index < 2: 74 | print("*** Example ***") 75 | print("sequence : {}".format(sentence)) 76 | print("input_ids: {}".format(" ".join([str(x) for x in total_input_ids[-1]]))) 77 | print("attention_mask: {}".format(" ".join([str(x) for x in total_attention_mask[-1]]))) 78 | print("token_type_ids: {}".format(" ".join([str(x) for x in total_token_type_ids[-1]]))) 79 | print("label: {}".format(total_senti_labels[-1])) 80 | print("senti_seq: {}".format(total_senti_seq[-1])) 81 | print("score_seq: {}".format(total_score_seq[-1])) 82 | print("word_seq: {}".format(total_word_seq[-1])) 83 | print() 84 | if mode == "train" or mode == "test": 85 | print("score label: {}".format(total_score_labels[-1])) 86 | 87 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long) 88 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long) 89 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long) 90 | total_senti_labels = torch.tensor(total_senti_labels, dtype=torch.long) 91 | total_senti_seq = torch.tensor(total_senti_seq, dtype=torch.long) 92 | total_score_seq = torch.tensor(total_score_seq, dtype=torch.long) 93 | total_word_seq = torch.tensor(total_word_seq, dtype=torch.long) 94 | if mode == "train" or mode == "test": 95 | total_score_labels = torch.tensor(total_score_labels, dtype=torch.long) 96 | dataset = TensorDataset(total_input_ids, total_attention_mask, total_token_type_ids, total_senti_labels, 97 | total_score_labels, total_senti_seq, total_score_seq, total_word_seq) 98 | elif mode == "analyze": 99 | dataset = TensorDataset(total_input_ids, total_attention_mask, total_token_type_ids, total_senti_labels, 100 | total_senti_seq, total_score_seq, total_word_seq) 101 | 102 | return dataset 103 | -------------------------------------------------------------------------------- /src/model/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch.nn as nn 3 | from transformers.activations import get_activation 4 | from transformers.modeling_electra import ElectraPreTrainedModel, ElectraModel 5 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 6 | import torch 7 | import torch.nn.functional as F 8 | from torch.autograd import * 9 | 10 | 11 | class ElectraForSequenceClassification(ElectraPreTrainedModel): 12 | def __init__(self, config, lstm_hidden, label_emb_size, score_emb_size, score_size, num_layer, bilstm_flag): 13 | super().__init__(config) 14 | 15 | assert score_emb_size == lstm_hidden * 2, "Please set score-embedding-size to twice the lstm-hidden-size" 16 | 17 | # 분류할 라벨의 개수 18 | self.num_labels = config.num_labels 19 | # ELECTRA 모델 20 | self.electra = ElectraModel(config) 21 | 22 | self.n_hidden = lstm_hidden 23 | 24 | self.score_emb = nn.Embedding(score_size, score_emb_size, scale_grad_by_freq=True) 25 | self.num_layers = num_layer 26 | self.bidirectional = 2 if bilstm_flag else 1 27 | 28 | self.lstm_first = nn.LSTM(config.hidden_size, lstm_hidden, bidirectional=True, batch_first=True) 29 | self.lstm_last = nn.LSTM(lstm_hidden * 4, lstm_hidden, num_layers=self.num_layers, 30 | batch_first=True, bidirectional=bilstm_flag) 31 | self.lstm_score_sequence = nn.LSTM(lstm_hidden * 2 + score_size, lstm_hidden, num_layers=self.num_layers, 32 | batch_first=True, bidirectional=bilstm_flag) 33 | 34 | self.label_attn = multihead_attention(lstm_hidden * 2, num_heads=1, dropout_rate=config.hidden_dropout_prob) 35 | self.label_attn_last = multihead_attention(lstm_hidden * 2, num_heads=1, dropout_rate=0) 36 | 37 | self.lstm_hidden2senti = nn.Linear(lstm_hidden * 2, config.num_labels) 38 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 39 | 40 | self.init_weights() 41 | 42 | def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None, 43 | input_label_seq_tensor=None, input_senti_seq_tensor=None, word_seq_lengths=None): 44 | discriminator_hidden_states = self.electra(input_ids, attention_mask, token_type_ids) 45 | 46 | # (batch_size, max_length, hidden_size) 47 | discriminator_hidden_states = discriminator_hidden_states[0] 48 | 49 | self.batch_size = discriminator_hidden_states.shape[0] 50 | label_embs = self.score_emb(input_label_seq_tensor) 51 | 52 | hidden = None 53 | 54 | lstm_outputs, hidden = self.lstm_first(discriminator_hidden_states, hidden) 55 | lstm_outputs = self.dropout(lstm_outputs) 56 | 57 | word2score_attention_output = self.label_attn(lstm_outputs, label_embs, label_embs, False) 58 | 59 | # [batch, seq_length, lstm_hiddn * 2 + score_label_emb] 60 | lstm_outputs = torch.cat([lstm_outputs, word2score_attention_output], dim=-1) 61 | 62 | """ 63 | Last Layer 64 | """ 65 | lstm_outputs, hidden = self.lstm_last(lstm_outputs, hidden) 66 | lstm_outputs = self.dropout(lstm_outputs) 67 | 68 | word2score_attention_output = self.label_attn_last(lstm_outputs, label_embs, label_embs, True) 69 | 70 | # [batch, seq_length, lstm_hidden * 2 + score_size] 71 | lstm_outputs = torch.cat([lstm_outputs, word2score_attention_output], dim=-1) 72 | 73 | lstm_outputs, hidden = self.lstm_score_sequence(lstm_outputs, hidden) 74 | 75 | lstm_last_hidden = hidden[0].transpose(0, 1).contiguous().view(self.batch_size, -1) 76 | 77 | 78 | return word2score_attention_output.permute(0, 2, 1), self.lstm_hidden2senti(lstm_last_hidden) 79 | 80 | 81 | class multihead_attention(nn.Module): 82 | 83 | def __init__(self, num_units, num_heads=1, dropout_rate=0, gpu=True, causality=False): 84 | '''Applies multihead attention. 85 | Args: 86 | num_units: A scalar. Attention size. 87 | dropout_rate: A floating point number. 88 | causality: Boolean. If true, units that reference the future are masked. 89 | num_heads: An int. Number of heads. 90 | ''' 91 | super(multihead_attention, self).__init__() 92 | self.gpu = gpu 93 | self.num_units = num_units 94 | self.num_heads = num_heads 95 | self.dropout_rate = dropout_rate 96 | self.causality = causality 97 | self.Q_proj = nn.Sequential(nn.Linear(self.num_units, self.num_units), nn.ReLU()) 98 | self.K_proj = nn.Sequential(nn.Linear(self.num_units, self.num_units), nn.ReLU()) 99 | self.V_proj = nn.Sequential(nn.Linear(self.num_units, self.num_units), nn.ReLU()) 100 | 101 | self.output_dropout = nn.Dropout(p=self.dropout_rate) 102 | 103 | def forward(self, queries, keys, values, last_layer=False): 104 | # keys, values: same shape of [N, T_k, C_k] 105 | # queries: A 3d Variable with shape of [N, T_q, C_q] 106 | # Linear projections 107 | Q = self.Q_proj(queries) # (N, T_q, C) 108 | K = self.K_proj(keys) # (N, T_q, C) 109 | V = self.V_proj(values) # (N, T_q, C) 110 | 111 | # get dim to concat 112 | concat_dim = len(Q.shape) - 1 113 | 114 | if concat_dim == 1: 115 | Q = Q.unsqueeze(dim=1) 116 | queries = queries.unsqueeze(dim=1) 117 | concat_dim = 2 118 | 119 | # Split and concat 120 | Q_ = torch.cat(torch.chunk(Q, self.num_heads, dim=concat_dim), dim=0) # (h*N, T_q, C/h) 121 | K_ = torch.cat(torch.chunk(K, self.num_heads, dim=concat_dim), dim=0) # (h*N, T_q, C/h) 122 | V_ = torch.cat(torch.chunk(V, self.num_heads, dim=concat_dim), dim=0) # (h*N, T_q, C/h) 123 | 124 | # Multiplication 125 | outputs = torch.bmm(Q_, K_.permute(0, 2, 1)) # (h*N, T_q, T_k) 126 | 127 | # Scale 128 | outputs = outputs / (K_.size()[-1] ** 0.5) 129 | 130 | # Activation 131 | if not last_layer: 132 | outputs = F.softmax(outputs, dim=-1) # (h*N, T_q, T_k) 133 | 134 | # Query Masking 135 | query_masks = torch.sign(torch.abs(torch.sum(queries, dim=-1))) # (N, T_q) 136 | query_masks = query_masks.repeat(self.num_heads, 1) # (h*N, T_q) 137 | query_masks = torch.unsqueeze(query_masks, 2).repeat(1, 1, keys.size()[1]) # (h*N, T_q, T_k) 138 | query_masks = query_masks.reshape([outputs.shape[0], outputs.shape[1], outputs.shape[2]]) 139 | 140 | outputs = outputs * query_masks 141 | 142 | # Dropouts 143 | outputs = self.output_dropout(outputs) # (h*N, T_q, T_k) 144 | 145 | if last_layer: 146 | return outputs 147 | 148 | # Weighted sum 149 | outputs = torch.bmm(outputs, V_) # (h*N, T_q, C/h) 150 | 151 | # Restore shape 152 | outputs = torch.cat(torch.chunk(outputs, self.num_heads, dim=0), dim=concat_dim) # (N, T_q, C) 153 | 154 | # Residual connection 155 | # outputs += queries 156 | 157 | return outputs -------------------------------------------------------------------------------- /src/model/main_functions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from tqdm import tqdm 5 | from sklearn.metrics import accuracy_score 6 | import numpy as np 7 | from transformers.configuration_electra import ElectraConfig 8 | from transformers.tokenization_electra import ElectraTokenizer 9 | from transformers.optimization import AdamW, get_linear_schedule_with_warmup 10 | from src.model.model import ElectraForSequenceClassification 11 | from torch.utils.data import DataLoader, RandomSampler, SequentialSampler 12 | from src.functions import preprocessing 13 | from torch.utils.data import TensorDataset 14 | import os 15 | 16 | 17 | class Helper(): 18 | def __init__(self, config): 19 | self.config = config 20 | 21 | def do_train(self, electra_model, optimizer, scheduler, train_dataloader, epoch, global_step): 22 | score_criterion = nn.CrossEntropyLoss(ignore_index=0) 23 | senti_criterion = nn.CrossEntropyLoss() 24 | 25 | # batch 단위 별 loss를 담을 리스트 26 | losses = [] 27 | # 모델의 출력 결과와 실제 정답값을 담을 리스트 28 | total_predicts, total_corrects = [], [] 29 | total_pred_scores, total_score_corrects = 0, 0 30 | for step, batch in enumerate(tqdm(train_dataloader, desc="do_train(epoch_{})".format(epoch))): 31 | 32 | batch = tuple(t.cuda() for t in batch) 33 | input_ids, attention_mask, token_type_ids, senti_labels, score_labels, senti_seq, score_seq, word_len_seq = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5], batch[6], batch[7] 34 | 35 | # 입력 데이터에 대한 출력과 loss 생성 36 | score_logits, senti_logit = electra_model(input_ids, attention_mask, token_type_ids, 37 | senti_labels, score_seq, senti_seq, 38 | word_len_seq) 39 | 40 | score_loss = score_criterion(score_logits, score_labels) 41 | 42 | score_pred = F.softmax(score_logits, dim=1) 43 | score_pred = score_pred.argmax(dim=1) 44 | 45 | for pred, gold, length in zip(score_pred, score_labels, word_len_seq): 46 | pred = pred[:length] 47 | gold = gold[:length] 48 | for pred_, gold_ in zip(pred, gold): 49 | if gold_ != 0: 50 | if pred_ == gold_: 51 | total_score_corrects += 1 52 | total_pred_scores += 1 53 | 54 | senti_logit = senti_logit.squeeze() 55 | senti_loss = senti_criterion(senti_logit, senti_labels) 56 | 57 | total_loss = score_loss * 0.1 + senti_loss * 0.9 58 | # total_loss = senti_loss 59 | 60 | predicts = F.softmax(senti_logit, dim=1) 61 | predicts = predicts.argmax(dim=-1) 62 | predicts = predicts.cpu().detach().numpy().tolist() 63 | labels = senti_labels.cpu().detach().numpy().tolist() 64 | 65 | total_predicts += predicts 66 | total_corrects += labels 67 | 68 | if self.config["gradient_accumulation_steps"] > 1: 69 | total_loss = total_loss / self.config["gradient_accumulation_steps"] 70 | if step % 100 == 0: 71 | print("loss : ", '{:.6f}'.format(total_loss)) 72 | 73 | # loss 값으로부터 모델 내부 각 매개변수에 대하여 gradient 계산 74 | total_loss.backward() 75 | losses.append(total_loss.data.item()) 76 | 77 | if (step + 1) % self.config["gradient_accumulation_steps"] == 0 or \ 78 | (len(train_dataloader) <= self.config["gradient_accumulation_steps"] and (step + 1) == len( 79 | train_dataloader)): 80 | torch.nn.utils.clip_grad_norm_(electra_model.parameters(), self.config["max_grad_norm"]) 81 | 82 | # 모델 내부 각 매개변수 가중치 갱신 83 | optimizer.step() 84 | scheduler.step() 85 | 86 | # 변화도를 0으로 변경 87 | electra_model.zero_grad() 88 | global_step += 1 89 | 90 | # 정확도 계산 91 | accuracy = accuracy_score(total_corrects, total_predicts) 92 | 93 | score_acc = total_score_corrects / total_pred_scores 94 | 95 | return accuracy, np.mean(losses), global_step, score_acc 96 | 97 | def do_evaluate(self, electra_model, test_dataloader, mode): 98 | # 모델의 입력, 출력, 실제 정답값을 담을 리스트 99 | total_input_ids, total_predicts, total_corrects = [], [], [] 100 | total_pred_scores, total_score_corrects = 0, 0 101 | for step, batch in enumerate(tqdm(test_dataloader, desc="do_evaluate")): 102 | batch = tuple(t.cuda() for t in batch) 103 | input_ids, attention_mask, token_type_ids, senti_labels, score_labels, senti_seq, score_seq, word_len_seq = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5], batch[6], batch[7] 104 | 105 | # 입력 데이터에 대한 출력 결과 생성 106 | score_logits, senti_logits = electra_model(input_ids, attention_mask, token_type_ids, 107 | senti_labels, score_seq, senti_seq, 108 | word_len_seq) 109 | 110 | score_pred = F.softmax(score_logits, dim=1) 111 | score_pred = score_pred.argmax(dim=1) 112 | 113 | for pred, gold, length in zip(score_pred, score_labels, word_len_seq): 114 | pred = pred[:length] 115 | gold = gold[:length] 116 | for pred_, gold_ in zip(pred, gold): 117 | if gold_ != 0: 118 | if pred_ == gold_: 119 | total_score_corrects += 1 120 | total_pred_scores += 1 121 | 122 | senti_logits = senti_logits.squeeze() 123 | 124 | predicts = F.softmax(senti_logits, dim=1) 125 | predicts = predicts.argmax(dim=-1) 126 | predicts = predicts.cpu().detach().numpy().tolist() 127 | labels = senti_labels.cpu().detach().numpy().tolist() 128 | input_ids = input_ids.cpu().detach().numpy().tolist() 129 | 130 | total_predicts += predicts 131 | total_corrects += labels 132 | total_input_ids += input_ids 133 | 134 | # 정확도 계산 135 | accuracy = accuracy_score(total_corrects, total_predicts) 136 | score_acc = total_score_corrects / total_pred_scores 137 | 138 | if (mode == "train"): 139 | return accuracy, score_acc 140 | else: 141 | return accuracy, total_input_ids, total_predicts, total_corrects 142 | 143 | def do_analyze(self, electra_model, test_dataloader, mode): 144 | # 모델의 입력, 출력, 실제 정답값을 담을 리스트 145 | total_input_ids, total_predicts, total_corrects = [], [], [] 146 | 147 | for step, batch in enumerate(tqdm(test_dataloader, desc="do_analyze")): 148 | batch = tuple(t.cuda() for t in batch) 149 | input_ids, attention_mask, token_type_ids, senti_labels, senti_seq, score_seq, word_len_seq = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5], batch[6] 150 | 151 | # 입력 데이터에 대한 출력 결과 생성 152 | score_logits, senti_logits = electra_model(input_ids, attention_mask, token_type_ids, 153 | senti_labels, score_seq, senti_seq, 154 | word_len_seq) 155 | 156 | senti_logits = senti_logits.squeeze() 157 | 158 | predicts = F.softmax(senti_logits, dim=1) 159 | predicts = predicts.argmax(dim=-1) 160 | predicts = predicts.cpu().detach().numpy().tolist() 161 | labels = senti_labels.cpu().detach().numpy().tolist() 162 | input_ids = input_ids.cpu().detach().numpy().tolist() 163 | 164 | total_predicts += predicts 165 | total_corrects += labels 166 | total_input_ids += input_ids 167 | 168 | # 정확도 계산 169 | accuracy = accuracy_score(total_corrects, total_predicts) 170 | return accuracy, total_input_ids, total_predicts, total_corrects 171 | 172 | def train(self): 173 | ######################################################################################################################################### 174 | # electra config 객체 생성 175 | electra_config = ElectraConfig.from_pretrained(os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 176 | num_labels=self.config["senti_labels"], cache_dir=None) 177 | 178 | # electra tokenizer 객체 생성 179 | electra_tokenizer = ElectraTokenizer.from_pretrained(os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 180 | do_lower_case=False, 181 | cache_dir=None) 182 | 183 | # electra model 객체 생성 184 | electra_model = ElectraForSequenceClassification.from_pretrained(os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 185 | config=electra_config, 186 | lstm_hidden=self.config['lstm_hidden'], 187 | label_emb_size=self.config['lstm_hidden'] * 2, 188 | score_emb_size=self.config['lstm_hidden'] * 2, 189 | score_size=self.config['score_labels'], 190 | num_layer=self.config['lstm_num_layer'], 191 | bilstm_flag=self.config['bidirectional_flag'], 192 | cache_dir=self.config["cache_dir_path"] 193 | # from_tf=True 194 | ) 195 | ######################################################################################################################################### 196 | 197 | electra_model.cuda() 198 | 199 | # 학습 데이터 읽기 200 | train_datas = preprocessing.read_data(file_path=self.config["train_data_path"], mode=self.config["mode"]) 201 | 202 | # 학습 데이터 전처리 203 | train_dataset = preprocessing.convert_data2dataset(datas=train_datas, tokenizer=electra_tokenizer, 204 | max_length=self.config["max_length"], 205 | labels=self.config["senti_labels"], 206 | score_labels=self.config["score_labels"], 207 | mode=self.config["mode"]) 208 | 209 | # 학습 데이터를 batch 단위로 추출하기 위한 DataLoader 객체 생성 210 | train_sampler = RandomSampler(train_dataset) 211 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.config["batch_size"]) 212 | 213 | # 평가 데이터 읽기 214 | test_datas = preprocessing.read_data(file_path=self.config["test_data_path"], mode=self.config["mode"]) 215 | 216 | # 평가 데이터 전처리 217 | test_dataset = preprocessing.convert_data2dataset(datas=test_datas, tokenizer=electra_tokenizer, 218 | max_length=self.config["max_length"], 219 | labels=self.config["senti_labels"], 220 | score_labels=self.config["score_labels"], 221 | mode=self.config["mode"]) 222 | 223 | # 평가 데이터를 batch 단위로 추출하기 위한 DataLoader 객체 생성 224 | test_sampler = SequentialSampler(test_dataset) 225 | test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=100) 226 | 227 | # 전체 학습 횟수(batch 단위) 228 | t_total = len(train_dataloader) // self.config["gradient_accumulation_steps"] * self.config["epoch"] 229 | 230 | # 모델 학습을 위한 optimizer 231 | no_decay = ['bias', 'LayerNorm.weight'] 232 | optimizer = AdamW([{'params': [p for n, p in electra_model.named_parameters() if not any(nd in n for nd in no_decay)], 233 | 'lr': 5e-5, 'weight_decay': self.config['weight_decay']}, 234 | {'params': [p for n, p in electra_model.named_parameters() if any(nd in n for nd in no_decay)], 235 | 'lr': 5e-5, 'weight_decay': 0.0}]) 236 | # optimizer = AdamW(lan.parameters(), lr=self.config['learning_rate'], eps=self.config['adam_epsilon']) 237 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.config["warmup_steps"], 238 | num_training_steps=t_total) 239 | 240 | if os.path.isfile(os.path.join(self.config["model_dir_path"], "optimizer.pt")) and os.path.isfile( 241 | os.path.join(self.config["model_dir_path"], "scheduler.pt")): 242 | # 기존에 학습했던 optimizer와 scheduler의 정보 불러옴 243 | optimizer.load_state_dict(torch.load(os.path.join(self.config["model_dir_path"], "optimizer.pt"))) 244 | scheduler.load_state_dict(torch.load(os.path.join(self.config["model_dir_path"], "scheduler.pt"))) 245 | print("####################### Success Load Model ###########################") 246 | 247 | global_step = 0 248 | electra_model.zero_grad() 249 | max_test_accuracy = 0 250 | for epoch in range(self.config["epoch"]): 251 | electra_model.train() 252 | 253 | # 학습 데이터에 대한 정확도와 평균 loss 254 | train_accuracy, average_loss, global_step, score_acc = self.do_train(electra_model=electra_model, 255 | optimizer=optimizer, scheduler=scheduler, 256 | train_dataloader=train_dataloader, 257 | epoch=epoch + 1, global_step=global_step) 258 | 259 | print("train_accuracy : {}\taverage_loss : {}\n".format(round(train_accuracy, 4), round(average_loss, 4))) 260 | print("train_score_accuracy :", "{:.6f}".format(score_acc)) 261 | 262 | electra_model.eval() 263 | 264 | # 평가 데이터에 대한 정확도 265 | test_accuracy, score_acc = self.do_evaluate(electra_model=electra_model, test_dataloader=test_dataloader, 266 | mode=self.config["mode"]) 267 | 268 | print("test_accuracy : {}\n".format(round(test_accuracy, 4))) 269 | print("test_score_accuracy :", "{:.6f}".format(score_acc)) 270 | 271 | 272 | # 현재의 정확도가 기존 정확도보다 높은 경우 모델 파일 저장 273 | if (max_test_accuracy < test_accuracy): 274 | max_test_accuracy = test_accuracy 275 | 276 | output_dir = os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(global_step)) 277 | if not os.path.exists(output_dir): 278 | os.makedirs(output_dir) 279 | 280 | electra_config.save_pretrained(output_dir) 281 | electra_tokenizer.save_pretrained(output_dir) 282 | electra_model.save_pretrained(output_dir) 283 | # torch.save(lan.state_dict(), os.path.join(output_dir, "lan.pt")) 284 | torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) 285 | torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) 286 | 287 | print("max_test_accuracy :", "{:.6f}".format(round(max_test_accuracy, 4))) 288 | 289 | def show_result(self, total_input_ids, total_predicts, total_corrects, tokenizer): 290 | for index, input_ids in enumerate(total_input_ids): 291 | tokens = [tokenizer._convert_id_to_token(input_id) for input_id in input_ids] 292 | 293 | # [CLS] 토큰 제거 294 | tokens = tokens[1:] 295 | 296 | # [SEP] 토큰 제거 297 | tokens = tokens[:tokens.index("[SEP]")] 298 | 299 | # 입력 sequence 복원 300 | sequence = tokenizer.convert_tokens_to_string(tokens) 301 | 302 | predict, correct = total_predicts[index], total_corrects[index] 303 | if (predict == 0): 304 | predict = "negative" 305 | else: 306 | predict = "positive" 307 | 308 | if (correct == 0): 309 | correct = "negative" 310 | else: 311 | correct = "positive" 312 | 313 | print("sequence : {}".format(sequence)) 314 | print("predict : {}".format(predict)) 315 | print("correct : {}".format(correct)) 316 | print() 317 | 318 | def test(self): 319 | # electra config 객체 생성 320 | electra_config = ElectraConfig.from_pretrained( 321 | os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 322 | num_labels=self.config["senti_labels"], 323 | cache_dir=None) 324 | 325 | # electra tokenizer 객체 생성 326 | electra_tokenizer = ElectraTokenizer.from_pretrained( 327 | os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 328 | do_lower_case=False, 329 | cache_dir=None) 330 | 331 | # electra model 객체 생성 332 | electra_model = ElectraForSequenceClassification.from_pretrained( 333 | os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 334 | config=electra_config, 335 | lstm_hidden=self.config['lstm_hidden'], 336 | label_emb_size=self.config['lstm_hidden'] * 2, 337 | score_emb_size=self.config['lstm_hidden'] * 2, 338 | score_size=self.config['score_labels'], 339 | num_layer=self.config['lstm_num_layer'], 340 | bilstm_flag=self.config['bidirectional_flag'], 341 | cache_dir=self.config["cache_dir_path"] 342 | ) 343 | 344 | electra_model.cuda() 345 | 346 | # 평가 데이터 읽기 347 | test_datas = preprocessing.read_data(file_path=self.config["test_data_path"], mode=self.config["mode"]) 348 | 349 | # 평가 데이터 전처리 350 | test_dataset = preprocessing.convert_data2dataset(datas=test_datas, tokenizer=electra_tokenizer, 351 | max_length=self.config["max_length"], 352 | labels=self.config["senti_labels"], 353 | score_labels=self.config["score_labels"], 354 | mode=self.config["mode"]) 355 | 356 | # 평가 데이터를 batch 단위로 추출하기 위한 DataLoader 객체 생성 357 | test_sampler = SequentialSampler(test_dataset) 358 | test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=100) 359 | 360 | electra_model.eval() 361 | 362 | # 평가 데이터에 대한 정확도와 모델의 입력, 출력, 정답 363 | test_accuracy, total_input_ids, total_predicts, total_corrects = self.do_evaluate(electra_model=electra_model, 364 | test_dataloader=test_dataloader, 365 | mode=self.config["mode"]) 366 | 367 | print("test_accuracy : {}\n".format(round(test_accuracy, 4))) 368 | 369 | # 10개의 평가 케이스에 대하여 모델 출력과 정답 비교 370 | self.show_result(total_input_ids=total_input_ids[:10], total_predicts=total_predicts[:10], 371 | total_corrects=total_corrects[:10], tokenizer=electra_tokenizer) 372 | 373 | def analyze(self): 374 | # electra config 객체 생성 375 | electra_config = ElectraConfig.from_pretrained( 376 | os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 377 | num_labels=self.config["senti_labels"], 378 | cache_dir=None) 379 | 380 | # electra tokenizer 객체 생성 381 | electra_tokenizer = ElectraTokenizer.from_pretrained( 382 | os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 383 | do_lower_case=False, 384 | cache_dir=None) 385 | 386 | # electra model 객체 생성 387 | electra_model = ElectraForSequenceClassification.from_pretrained( 388 | os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 389 | config=electra_config, 390 | lstm_hidden=self.config['lstm_hidden'], 391 | label_emb_size=self.config['lstm_hidden'] * 2, 392 | score_emb_size=self.config['lstm_hidden'] * 2, 393 | score_size=self.config['score_labels'], 394 | num_layer=self.config['lstm_num_layer'], 395 | bilstm_flag=self.config['bidirectional_flag'], 396 | cache_dir=self.config["cache_dir_path"] 397 | ) 398 | 399 | electra_model.cuda() 400 | 401 | # 평가 데이터 읽기 402 | test_datas = preprocessing.read_data(file_path=self.config["analyze_data_path"], mode=self.config["mode"]) 403 | 404 | # 평가 데이터 전처리 405 | test_dataset = preprocessing.convert_data2dataset(datas=test_datas, tokenizer=electra_tokenizer, 406 | max_length=self.config["max_length"], 407 | labels=self.config["senti_labels"], 408 | score_labels=self.config["score_labels"], 409 | mode=self.config["mode"]) 410 | 411 | # 평가 데이터를 batch 단위로 추출하기 위한 DataLoader 객체 생성 412 | test_sampler = SequentialSampler(test_dataset) 413 | test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=100) 414 | 415 | electra_model.eval() 416 | 417 | # 평가 데이터에 대한 정확도와 모델의 입력, 출력, 정답 418 | test_accuracy, total_input_ids, total_predicts, total_corrects = self.do_analyze(electra_model=electra_model, 419 | test_dataloader=test_dataloader, 420 | mode=self.config["mode"]) 421 | 422 | print("test_accuracy : {}\n".format(round(test_accuracy, 4))) 423 | 424 | print("테스트 데이터 10개에 대하여 모델 출력과 정답을 비교") 425 | # 10개의 평가 케이스에 대하여 모델 출력과 정답 비교 426 | self.show_result(total_input_ids=total_input_ids[:10], total_predicts=total_predicts[:10], 427 | total_corrects=total_corrects[:10], tokenizer=electra_tokenizer) 428 | 429 | def demo(self): 430 | # electra config 객체 생성 431 | electra_config = ElectraConfig.from_pretrained( 432 | os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 433 | num_labels=self.config["senti_labels"], 434 | cache_dir=None) 435 | 436 | # electra tokenizer 객체 생성 437 | electra_tokenizer = ElectraTokenizer.from_pretrained( 438 | os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 439 | do_lower_case=False, 440 | cache_dir=None) 441 | 442 | # electra model 객체 생성 443 | electra_model = ElectraForSequenceClassification.from_pretrained( 444 | os.path.join(self.config["model_dir_path"], "checkpoint-{}".format(self.config["checkpoint"])), 445 | config=electra_config, 446 | lstm_hidden=self.config['lstm_hidden'], 447 | label_emb_size=self.config['lstm_hidden'] * 2, 448 | score_emb_size=self.config['lstm_hidden'] * 2, 449 | score_size=self.config['score_labels'], 450 | num_layer=self.config['lstm_num_layer'], 451 | bilstm_flag=self.config['bidirectional_flag'], 452 | cache_dir=self.config["cache_dir_path"] 453 | ) 454 | 455 | electra_model.cuda() 456 | 457 | is_demo = True 458 | 459 | while (is_demo): 460 | total_input_ids, total_attention_mask, total_token_type_ids, total_senti_seq, total_score_seq, total_word_seq = [], [], [], [], [], [] 461 | score_labels = None 462 | senti_labels = None 463 | datas = input().strip() 464 | if datas == "-1": 465 | break 466 | tokens = electra_tokenizer.tokenize(datas) 467 | tokens = ["[CLS]"] + tokens 468 | tokens = tokens[:self.config['max_length']-1] 469 | tokens.append("[SEP]") 470 | 471 | input_ids = [electra_tokenizer._convert_token_to_id(token) for token in tokens] 472 | assert len(input_ids) <= self.config['max_length'] 473 | 474 | attention_mask = [1] * len(input_ids) 475 | token_type_ids = [0] * len(input_ids) 476 | 477 | padding = [0] * (self.config['max_length'] - len(input_ids)) 478 | 479 | total_word_seq.append(len(input_ids)) 480 | 481 | input_ids += padding 482 | attention_mask += padding 483 | token_type_ids += padding 484 | 485 | total_input_ids.append(input_ids) 486 | total_attention_mask.append(attention_mask) 487 | total_token_type_ids.append(token_type_ids) 488 | 489 | total_senti_seq.append([i for i in range(self.config['senti_labels'])]) 490 | total_score_seq.append([i for i in range(self.config['score_labels'])]) 491 | 492 | total_input_ids = torch.tensor(total_input_ids, dtype=torch.long) 493 | total_attention_mask = torch.tensor(total_attention_mask, dtype=torch.long) 494 | total_token_type_ids = torch.tensor(total_token_type_ids, dtype=torch.long) 495 | total_senti_seq = torch.tensor(total_senti_seq, dtype=torch.long) 496 | total_score_seq = torch.tensor(total_score_seq, dtype=torch.long) 497 | total_word_seq = torch.tensor(total_word_seq, dtype=torch.long) 498 | 499 | 500 | dataset = TensorDataset(total_input_ids, total_attention_mask, total_token_type_ids, total_senti_seq, 501 | total_score_seq, total_word_seq) 502 | 503 | test_sampler = SequentialSampler(dataset) 504 | test_dataloader = DataLoader(dataset, sampler=test_sampler, batch_size=1) 505 | 506 | electra_model.eval() 507 | 508 | 509 | for step, batch in enumerate(test_dataloader): 510 | batch = tuple(t.cuda() for t in batch) 511 | input_ids, attention_mask, token_type_ids, senti_seq, score_seq, word_len_seq = batch[0], batch[1], batch[2], batch[3], batch[4], batch[5] 512 | 513 | # 입력 데이터에 대한 출력 결과 생성 514 | score_logits, senti_logits = electra_model(input_ids, attention_mask, token_type_ids, 515 | None, score_seq, senti_seq, word_len_seq) 516 | 517 | senti_logits = senti_logits.squeeze() 518 | predict = F.softmax(senti_logits, dim=0) 519 | predict = predict.argmax(dim=-1) 520 | predict = predict.cpu().detach().numpy().tolist() 521 | input_ids = input_ids.cpu().detach().numpy().tolist() 522 | 523 | score_pred = F.softmax(score_logits, dim=1) 524 | score_pred = score_pred.argmax(dim=1) 525 | labels = score_pred.cpu().detach().numpy().tolist() 526 | 527 | tokens = [electra_tokenizer._convert_id_to_token(input_id) for input_id in input_ids[0]] 528 | sep_idx = tokens.index("[SEP]") 529 | # [CLS] 토큰 제거 530 | tokens = tokens[1:] 531 | 532 | # [SEP] 토큰 제거 533 | labels = labels[0][1:sep_idx] 534 | tokens = tokens[:tokens.index("[SEP]")] 535 | 536 | # 입력 sequence 복원 537 | # sequence = electra_tokenizer.convert_tokens_to_string(tokens) 538 | 539 | if (predict == 0): 540 | predict = "부정" 541 | else: 542 | predict = "긍정" 543 | 544 | print() 545 | for token, label in zip(tokens, labels): 546 | if label == 2: 547 | print(token.replace("##", "") + "{매우 부정}", end=" ") 548 | elif label == 3: 549 | print(token.replace("##", "") + "{부정}", end=" ") 550 | elif label == 5: 551 | print(token.replace("##", "") + "{긍정}", end=" ") 552 | elif label == 6: 553 | print(token.replace("##", "") + "{매우 긍정}", end=" ") 554 | else: 555 | print(token.replace("##", ""), end=" ") 556 | print("\n") 557 | print("감성 분석 결과 : {}".format(predict)) 558 | print() 559 | 560 | 561 | 562 | --------------------------------------------------------------------------------