├── simple_ntc ├── __init__.py ├── bert_dataset.py ├── utils.py ├── models │ ├── rnn.py │ └── cnn.py ├── data_loader.py ├── bert_trainer.py └── trainer.py ├── .gitignore ├── models └── get_accuracy.py ├── get_confusion_matrix.py ├── classify_plm.py ├── train.py ├── classify.py ├── finetune_plm_hftrainer.py ├── finetune_plm_native.py └── README.md /simple_ntc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | *.pyc 3 | data 4 | data/* 5 | .DS_Store 6 | ._.DS_Store 7 | models 8 | .checkpoints 9 | wandb/ 10 | -------------------------------------------------------------------------------- /models/get_accuracy.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def read_text(fn): 4 | with open(fn, 'r') as f: 5 | lines = f.readlines() 6 | 7 | return lines 8 | 9 | def main(ref_fn, hyp_fn): 10 | refs = read_text(ref_fn) 11 | hyps = read_text(hyp_fn) 12 | 13 | correcnt_cnt = 0 14 | for ref, hyp in zip(refs, hyps): 15 | if ref == hyp: 16 | correcnt_cnt += 1 17 | 18 | print('%d / %d = %.4f' % (correcnt_cnt, len(refs), float(correcnt_cnt) / len(refs))) 19 | 20 | if __name__ == '__main__': 21 | ref_fn = sys.argv[1] 22 | hyp_fn = sys.argv[2] 23 | 24 | main(ref_fn, hyp_fn) 25 | -------------------------------------------------------------------------------- /get_confusion_matrix.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | 5 | def read_stdin(): 6 | lines = [] 7 | 8 | for line in sys.stdin: 9 | if line.strip() != '': 10 | lines += [line.strip().split('\t')[0]] 11 | 12 | return lines 13 | 14 | 15 | def read_text(fn): 16 | lines = [] 17 | f = open(fn, 'r') 18 | 19 | for line in f: 20 | if line.strip() != '': 21 | lines += [line.strip().split('\t')[0]] 22 | 23 | f.close() 24 | 25 | return lines 26 | 27 | 28 | def get_confusion_matrix(classes, y, y_hat): 29 | confusion_matrix = np.zeros((len(classes), len(classes))) 30 | mapping_table = {} 31 | 32 | for idx, c in enumerate(classes): 33 | mapping_table[c] = idx 34 | 35 | for y_i, y_hat_i in zip(y, y_hat): 36 | confusion_matrix[mapping_table[y_hat_i], mapping_table[y_i]] += 1 37 | 38 | print('\t'.join(c for c in classes)) 39 | for i in range(len(classes)): 40 | print('\t'.join(['%4d' % confusion_matrix[i, j] for j in range(len(classes))])) 41 | 42 | 43 | if __name__ == '__main__': 44 | ref_fn = sys.argv[1] 45 | 46 | ref_lines = read_text(ref_fn) 47 | lines = read_stdin() 48 | 49 | min_length = min(len(ref_lines), len(lines)) 50 | ref_lines = ref_lines[:min_length] 51 | lines = lines[:min_length] 52 | 53 | classes = list(set(ref_lines + lines)) 54 | 55 | get_confusion_matrix(classes, ref_lines, lines) 56 | -------------------------------------------------------------------------------- /simple_ntc/bert_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class TextClassificationCollator(): 6 | 7 | def __init__(self, tokenizer, max_length, with_text=True): 8 | self.tokenizer = tokenizer 9 | self.max_length = max_length 10 | self.with_text = with_text 11 | 12 | def __call__(self, samples): 13 | texts, labels = [], [] 14 | for text, label in samples: 15 | texts += [text] 16 | labels 
+= [label] 17 | 18 | encoding = self.tokenizer( 19 | texts, 20 | padding=True, 21 | truncation=True, 22 | return_tensors="pt", 23 | max_length=self.max_length 24 | ) 25 | 26 | return_value = { 27 | 'input_ids': encoding['input_ids'], 28 | 'attention_mask': encoding['attention_mask'], 29 | 'labels': torch.tensor(labels, dtype=torch.long), 30 | } 31 | if self.with_text: 32 | return_value['text'] = texts 33 | 34 | return return_value 35 | 36 | 37 | class TextClassificationDataset(Dataset): 38 | 39 | def __init__(self, texts, labels): 40 | self.texts = texts 41 | self.labels = labels 42 | 43 | def __len__(self): 44 | return len(self.texts) 45 | 46 | def __getitem__(self, item): 47 | text = str(self.texts[item]) 48 | label = self.labels[item] 49 | 50 | return text, label 51 | -------------------------------------------------------------------------------- /simple_ntc/utils.py: -------------------------------------------------------------------------------- 1 | def read_text(fn): 2 | with open(fn, 'r') as f: 3 | lines = f.readlines() 4 | 5 | labels, texts = [], [] 6 | for line in lines: 7 | if line.strip() != '': 8 | # The file should have tab delimited two columns. 9 | # First column indicates label field, 10 | # and second column indicates text field. 11 | try: 12 | label, text = line.strip().split('\t') 13 | labels += [label] 14 | texts += [text] 15 | except Exception as e: 16 | print(e) 17 | print(line) 18 | continue 19 | 20 | return labels, texts 21 | 22 | 23 | def get_grad_norm(parameters, norm_type=2): 24 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 25 | 26 | total_norm = 0 27 | 28 | try: 29 | for p in parameters: 30 | total_norm += (p.grad.data**norm_type).sum() 31 | total_norm = total_norm ** (1. / norm_type) 32 | except Exception as e: 33 | print(e) 34 | 35 | return total_norm 36 | 37 | 38 | def get_parameter_norm(parameters, norm_type=2): 39 | total_norm = 0 40 | 41 | try: 42 | for p in parameters: 43 | total_norm += (p.data**norm_type).sum() 44 | total_norm = total_norm ** (1. 
/ norm_type) 45 | except Exception as e: 46 | print(e) 47 | 48 | return total_norm 49 | -------------------------------------------------------------------------------- /simple_ntc/models/rnn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class RNNClassifier(nn.Module): 5 | 6 | def __init__( 7 | self, 8 | input_size, 9 | word_vec_size, 10 | hidden_size, 11 | n_classes, 12 | n_layers=4, 13 | dropout_p=.3, 14 | ): 15 | self.input_size = input_size # vocabulary_size 16 | self.word_vec_size = word_vec_size 17 | self.hidden_size = hidden_size 18 | self.n_classes = n_classes 19 | self.n_layers = n_layers 20 | self.dropout_p = dropout_p 21 | 22 | super().__init__() 23 | 24 | self.emb = nn.Embedding(input_size, word_vec_size) 25 | self.rnn = nn.LSTM( 26 | input_size=word_vec_size, 27 | hidden_size=hidden_size, 28 | num_layers=n_layers, 29 | dropout=dropout_p, 30 | batch_first=True, 31 | bidirectional=True, 32 | ) 33 | self.generator = nn.Linear(hidden_size * 2, n_classes) 34 | # We use LogSoftmax + NLLLoss instead of Softmax + CrossEntropy 35 | self.activation = nn.LogSoftmax(dim=-1) 36 | 37 | def forward(self, x): 38 | # |x| = (batch_size, length) 39 | x = self.emb(x) 40 | # |x| = (batch_size, length, word_vec_size) 41 | x, _ = self.rnn(x) 42 | # |x| = (batch_size, length, hidden_size * 2) 43 | y = self.activation(self.generator(x[:, -1])) 44 | # |y| = (batch_size, n_classes) 45 | 46 | return y 47 | -------------------------------------------------------------------------------- /simple_ntc/data_loader.py: -------------------------------------------------------------------------------- 1 | import torchtext 2 | version = list(map(int, torchtext.__version__.split('.'))) 3 | if version[0] <= 0 and version[1] < 9: 4 | from torchtext import data 5 | else: 6 | from torchtext.legacy import data 7 | 8 | 9 | class DataLoader(object): 10 | ''' 11 | Data loader class to load text file using torchtext library. 12 | ''' 13 | 14 | def __init__( 15 | self, train_fn, 16 | batch_size=64, 17 | valid_ratio=.2, 18 | device=-1, 19 | max_vocab=999999, 20 | min_freq=1, 21 | use_eos=False, 22 | shuffle=True, 23 | ): 24 | ''' 25 | DataLoader initialization. 26 | :param train_fn: Train-set filename 27 | :param batch_size: Batchify data fot certain batch size. 28 | :param device: Device-id to load data (-1 for CPU) 29 | :param max_vocab: Maximum vocabulary size 30 | :param min_freq: Minimum frequency for loaded word. 31 | :param use_eos: If it is True, put after every end of sentence. 32 | :param shuffle: If it is True, random shuffle the input data. 33 | ''' 34 | super().__init__() 35 | 36 | # Define field of the input file. 37 | # The input file consists of two fields. 38 | self.label = data.Field( 39 | sequential=False, 40 | use_vocab=True, 41 | unk_token=None 42 | ) 43 | self.text = data.Field( 44 | use_vocab=True, 45 | batch_first=True, 46 | include_lengths=False, 47 | eos_token='' if use_eos else None, 48 | ) 49 | 50 | # Those defined two columns will be delimited by TAB. 51 | # Thus, we use TabularDataset to load two columns in the input file. 52 | # We would have two separate input file: train_fn, valid_fn 53 | # Files consist of two columns: label field and text field. 
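        # split() below divides the loaded examples by the given ratio, so the first
        # returned dataset keeps (1 - valid_ratio) of the corpus and the second keeps the rest.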
54 | train, valid = data.TabularDataset( 55 | path=train_fn, 56 | format='tsv', 57 | fields=[ 58 | ('label', self.label), 59 | ('text', self.text), 60 | ], 61 | ).split(split_ratio=(1 - valid_ratio)) 62 | 63 | # Those loaded dataset would be feeded into each iterator: 64 | # train iterator and valid iterator. 65 | # We sort input sentences by length, to group similar lengths. 66 | self.train_loader, self.valid_loader = data.BucketIterator.splits( 67 | (train, valid), 68 | batch_size=batch_size, 69 | device='cuda:%d' % device if device >= 0 else 'cpu', 70 | shuffle=shuffle, 71 | sort_key=lambda x: len(x.text), 72 | sort_within_batch=True, 73 | ) 74 | 75 | # At last, we make a vocabulary for label and text field. 76 | # It is making mapping table between words and indice. 77 | self.label.build_vocab(train) 78 | self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq) 79 | -------------------------------------------------------------------------------- /classify_plm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from transformers import BertTokenizerFast 9 | from transformers import BertForSequenceClassification, AlbertForSequenceClassification 10 | 11 | 12 | def define_argparser(): 13 | ''' 14 | Define argument parser to take inference using pre-trained model. 15 | ''' 16 | p = argparse.ArgumentParser() 17 | 18 | p.add_argument('--model_fn', required=True) 19 | p.add_argument('--gpu_id', type=int, default=-1) 20 | p.add_argument('--batch_size', type=int, default=256) 21 | p.add_argument('--top_k', type=int, default=1) 22 | 23 | config = p.parse_args() 24 | 25 | return config 26 | 27 | 28 | def read_text(): 29 | ''' 30 | Read text from standard input for inference. 31 | ''' 32 | lines = [] 33 | 34 | for line in sys.stdin: 35 | if line.strip() != '': 36 | lines += [line.strip()] 37 | 38 | return lines 39 | 40 | 41 | def main(config): 42 | saved_data = torch.load( 43 | config.model_fn, 44 | map_location='cpu' if config.gpu_id < 0 else 'cuda:%d' % config.gpu_id 45 | ) 46 | 47 | train_config = saved_data['config'] 48 | bert_best = saved_data['bert'] 49 | index_to_label = saved_data['classes'] 50 | 51 | lines = read_text() 52 | 53 | with torch.no_grad(): 54 | # Declare model and load pre-trained weights. 55 | tokenizer = BertTokenizerFast.from_pretrained(train_config.pretrained_model_name) 56 | model_loader = AlbertForSequenceClassification if train_config.use_albert else BertForSequenceClassification 57 | model = model_loader.from_pretrained( 58 | train_config.pretrained_model_name, 59 | num_labels=len(index_to_label) 60 | ) 61 | model.load_state_dict(bert_best) 62 | 63 | if config.gpu_id >= 0: 64 | model.cuda(config.gpu_id) 65 | device = next(model.parameters()).device 66 | 67 | # Don't forget turn-on evaluation mode. 
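        # eval() turns off dropout, and the surrounding torch.no_grad() disables gradient tracking,
        # so memory usage stays low during inference.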
68 | model.eval() 69 | 70 | y_hats = [] 71 | for idx in range(0, len(lines), config.batch_size): 72 | mini_batch = tokenizer( 73 | lines[idx:idx + config.batch_size], 74 | padding=True, 75 | truncation=True, 76 | return_tensors="pt", 77 | ) 78 | 79 | x = mini_batch['input_ids'] 80 | x = x.to(device) 81 | mask = mini_batch['attention_mask'] 82 | mask = mask.to(device) 83 | 84 | # Take feed-forward 85 | y_hat = F.softmax(model(x, attention_mask=mask).logits, dim=-1) 86 | 87 | y_hats += [y_hat] 88 | # Concatenate the mini-batch wise result 89 | y_hats = torch.cat(y_hats, dim=0) 90 | # |y_hats| = (len(lines), n_classes) 91 | 92 | probs, indice = y_hats.cpu().topk(config.top_k) 93 | # |indice| = (len(lines), top_k) 94 | 95 | for i in range(len(lines)): 96 | sys.stdout.write('%s\t%s\n' % ( 97 | ' '.join([index_to_label[int(indice[i][j])] for j in range(config.top_k)]), 98 | lines[i] 99 | )) 100 | 101 | 102 | if __name__ == '__main__': 103 | config = define_argparser() 104 | main(config) 105 | -------------------------------------------------------------------------------- /simple_ntc/models/cnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class CNNClassifier(nn.Module): 6 | 7 | def __init__( 8 | self, 9 | input_size, 10 | word_vec_size, 11 | n_classes, 12 | use_batch_norm=False, 13 | dropout_p=.5, 14 | window_sizes=[3, 4, 5], 15 | n_filters=[100, 100, 100], 16 | ): 17 | self.input_size = input_size # vocabulary size 18 | self.word_vec_size = word_vec_size 19 | self.n_classes = n_classes 20 | self.use_batch_norm = use_batch_norm 21 | self.dropout_p = dropout_p 22 | # window_size means that how many words a pattern covers. 23 | self.window_sizes = window_sizes 24 | # n_filters means that how many patterns to cover. 25 | self.n_filters = n_filters 26 | 27 | super().__init__() 28 | 29 | self.emb = nn.Embedding(input_size, word_vec_size) 30 | # Use nn.ModuleList to register each sub-modules. 31 | self.feature_extractors = nn.ModuleList() 32 | for window_size, n_filter in zip(window_sizes, n_filters): 33 | self.feature_extractors.append( 34 | nn.Sequential( 35 | nn.Conv2d( 36 | in_channels=1, # We only use one embedding layer. 37 | out_channels=n_filter, 38 | kernel_size=(window_size, word_vec_size), 39 | ), 40 | nn.ReLU(), 41 | nn.BatchNorm2d(n_filter) if use_batch_norm else nn.Dropout(dropout_p), 42 | ) 43 | ) 44 | 45 | # An input of generator layer is max values from each filter. 46 | self.generator = nn.Linear(sum(n_filters), n_classes) 47 | # We use LogSoftmax + NLLLoss instead of Softmax + CrossEntropy 48 | self.activation = nn.LogSoftmax(dim=-1) 49 | 50 | def forward(self, x): 51 | # |x| = (batch_size, length) 52 | x = self.emb(x) 53 | # |x| = (batch_size, length, word_vec_size) 54 | min_length = max(self.window_sizes) 55 | if min_length > x.size(1): 56 | # Because some input does not long enough for maximum length of window size, 57 | # we add zero tensor for padding. 
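            # x.new(...).zero_() builds a zero tensor on the same device and with the same dtype as x,
            # so even a very short sentence yields at least one output position for every window size.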
58 | pad = x.new(x.size(0), min_length - x.size(1), self.word_vec_size).zero_() 59 | # |pad| = (batch_size, min_length - length, word_vec_size) 60 | x = torch.cat([x, pad], dim=1) 61 | # |x| = (batch_size, min_length, word_vec_size) 62 | 63 | # In ordinary case of vision task, you may have 3 channels on tensor, 64 | # but in this case, you would have just 1 channel, 65 | # which is added by 'unsqueeze' method in below: 66 | x = x.unsqueeze(1) 67 | # |x| = (batch_size, 1, length, word_vec_size) 68 | 69 | cnn_outs = [] 70 | for block in self.feature_extractors: 71 | cnn_out = block(x) 72 | # |cnn_out| = (batch_size, n_filter, length - window_size + 1, 1) 73 | 74 | # In case of max pooling, we does not know the pooling size, 75 | # because it depends on the length of the sentence. 76 | # Therefore, we use instant function using 'nn.functional' package. 77 | # This is the beauty of PyTorch. :) 78 | cnn_out = nn.functional.max_pool1d( 79 | input=cnn_out.squeeze(-1), 80 | kernel_size=cnn_out.size(-2) 81 | ).squeeze(-1) 82 | # |cnn_out| = (batch_size, n_filter) 83 | cnn_outs += [cnn_out] 84 | # Merge output tensors from each convolution layer. 85 | cnn_outs = torch.cat(cnn_outs, dim=-1) 86 | # |cnn_outs| = (batch_size, sum(n_filters)) 87 | y = self.activation(self.generator(cnn_outs)) 88 | # |y| = (batch_size, n_classes) 89 | 90 | return y 91 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | 7 | from simple_ntc.trainer import Trainer 8 | from simple_ntc.data_loader import DataLoader 9 | 10 | from simple_ntc.models.rnn import RNNClassifier 11 | from simple_ntc.models.cnn import CNNClassifier 12 | 13 | 14 | def define_argparser(): 15 | ''' 16 | Define argument parser to set hyper-parameters. 
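    Both --rnn and --cnn can be given at the same time; each selected architecture is trained and saved for ensembling at inference time.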
17 | ''' 18 | p = argparse.ArgumentParser() 19 | 20 | p.add_argument('--model_fn', required=True) 21 | p.add_argument('--train_fn', required=True) 22 | 23 | p.add_argument('--gpu_id', type=int, default=-1) 24 | p.add_argument('--verbose', type=int, default=2) 25 | 26 | p.add_argument('--min_vocab_freq', type=int, default=5) 27 | p.add_argument('--max_vocab_size', type=int, default=999999) 28 | 29 | p.add_argument('--batch_size', type=int, default=256) 30 | p.add_argument('--n_epochs', type=int, default=10) 31 | 32 | p.add_argument('--word_vec_size', type=int, default= 256) 33 | p.add_argument('--dropout', type=float, default=.3) 34 | 35 | p.add_argument('--max_length', type=int, default=256) 36 | 37 | p.add_argument('--rnn', action='store_true') 38 | p.add_argument('--hidden_size', type=int, default=512) 39 | p.add_argument('--n_layers', type=int, default=4) 40 | 41 | p.add_argument('--cnn', action='store_true') 42 | p.add_argument('--use_batch_norm', action='store_true') 43 | p.add_argument('--window_sizes', type=int, nargs='*', default=[3, 4, 5]) 44 | p.add_argument('--n_filters', type=int, nargs='*', default=[100, 100, 100]) 45 | 46 | config = p.parse_args() 47 | 48 | return config 49 | 50 | 51 | def main(config): 52 | loaders = DataLoader( 53 | train_fn=config.train_fn, 54 | batch_size=config.batch_size, 55 | min_freq=config.min_vocab_freq, 56 | max_vocab=config.max_vocab_size, 57 | device=config.gpu_id 58 | ) 59 | 60 | print( 61 | '|train| =', len(loaders.train_loader.dataset), 62 | '|valid| =', len(loaders.valid_loader.dataset), 63 | ) 64 | 65 | vocab_size = len(loaders.text.vocab) 66 | n_classes = len(loaders.label.vocab) 67 | print('|vocab| =', vocab_size, '|classes| =', n_classes) 68 | 69 | if config.rnn is False and config.cnn is False: 70 | raise Exception('You need to specify an architecture to train. (--rnn or --cnn)') 71 | 72 | if config.rnn: 73 | # Declare model and loss. 74 | model = RNNClassifier( 75 | input_size=vocab_size, 76 | word_vec_size=config.word_vec_size, 77 | hidden_size=config.hidden_size, 78 | n_classes=n_classes, 79 | n_layers=config.n_layers, 80 | dropout_p=config.dropout, 81 | ) 82 | optimizer = optim.Adam(model.parameters()) 83 | crit = nn.NLLLoss() 84 | print(model) 85 | 86 | if config.gpu_id >= 0: 87 | model.cuda(config.gpu_id) 88 | crit.cuda(config.gpu_id) 89 | 90 | rnn_trainer = Trainer(config) 91 | rnn_model = rnn_trainer.train( 92 | model, 93 | crit, 94 | optimizer, 95 | loaders.train_loader, 96 | loaders.valid_loader 97 | ) 98 | if config.cnn: 99 | # Declare model and loss. 
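        # The CNN below follows the multi-window architecture of Kim (2014); see simple_ntc/models/cnn.py.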
100 | model = CNNClassifier( 101 | input_size=vocab_size, 102 | word_vec_size=config.word_vec_size, 103 | n_classes=n_classes, 104 | use_batch_norm=config.use_batch_norm, 105 | dropout_p=config.dropout, 106 | window_sizes=config.window_sizes, 107 | n_filters=config.n_filters, 108 | ) 109 | optimizer = optim.Adam(model.parameters()) 110 | crit = nn.NLLLoss() 111 | print(model) 112 | 113 | if config.gpu_id >= 0: 114 | model.cuda(config.gpu_id) 115 | crit.cuda(config.gpu_id) 116 | 117 | cnn_trainer = Trainer(config) 118 | cnn_model = cnn_trainer.train( 119 | model, 120 | crit, 121 | optimizer, 122 | loaders.train_loader, 123 | loaders.valid_loader 124 | ) 125 | 126 | torch.save({ 127 | 'rnn': rnn_model.state_dict() if config.rnn else None, 128 | 'cnn': cnn_model.state_dict() if config.cnn else None, 129 | 'config': config, 130 | 'vocab': loaders.text.vocab, 131 | 'classes': loaders.label.vocab, 132 | }, config.model_fn) 133 | 134 | 135 | if __name__ == '__main__': 136 | config = define_argparser() 137 | main(config) 138 | -------------------------------------------------------------------------------- /simple_ntc/bert_trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.utils as torch_utils 3 | 4 | from ignite.engine import Events 5 | 6 | from simple_ntc.utils import get_grad_norm, get_parameter_norm 7 | 8 | VERBOSE_SILENT = 0 9 | VERBOSE_EPOCH_WISE = 1 10 | VERBOSE_BATCH_WISE = 2 11 | 12 | from simple_ntc.trainer import Trainer, MyEngine 13 | 14 | 15 | class EngineForBert(MyEngine): 16 | 17 | def __init__(self, func, model, crit, optimizer, scheduler, config): 18 | self.scheduler = scheduler 19 | 20 | super().__init__(func, model, crit, optimizer, config) 21 | 22 | @staticmethod 23 | def train(engine, mini_batch): 24 | # You have to reset the gradients of all model parameters 25 | # before to take another step in gradient descent. 26 | engine.model.train() # Because we assign model as class variable, we can easily access to it. 27 | engine.optimizer.zero_grad() 28 | 29 | x, y = mini_batch['input_ids'], mini_batch['labels'] 30 | x, y = x.to(engine.device), y.to(engine.device) 31 | mask = mini_batch['attention_mask'] 32 | mask = mask.to(engine.device) 33 | 34 | x = x[:, :engine.config.max_length] 35 | 36 | # Take feed-forward 37 | y_hat = engine.model(x, attention_mask=mask).logits 38 | 39 | loss = engine.crit(y_hat, y) 40 | loss.backward() 41 | 42 | # Calculate accuracy only if 'y' is LongTensor, 43 | # which means that 'y' is one-hot representation. 44 | if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor): 45 | accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0)) 46 | else: 47 | accuracy = 0 48 | 49 | p_norm = float(get_parameter_norm(engine.model.parameters())) 50 | g_norm = float(get_grad_norm(engine.model.parameters())) 51 | 52 | # Take a step of gradient descent. 
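        # optimizer.step() updates the weights, then scheduler.step() advances the learning-rate
        # schedule by one iteration, since linear warmup/decay is applied per mini-batch rather than per epoch.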
53 |         engine.optimizer.step()
54 |         engine.scheduler.step()
55 | 
56 |         return {
57 |             'loss': float(loss),
58 |             'accuracy': float(accuracy),
59 |             '|param|': p_norm,
60 |             '|g_param|': g_norm,
61 |         }
62 | 
63 |     @staticmethod
64 |     def validate(engine, mini_batch):
65 |         engine.model.eval()
66 | 
67 |         with torch.no_grad():
68 |             x, y = mini_batch['input_ids'], mini_batch['labels']
69 |             x, y = x.to(engine.device), y.to(engine.device)
70 |             mask = mini_batch['attention_mask']
71 |             mask = mask.to(engine.device)
72 | 
73 |             x = x[:, :engine.config.max_length]
74 | 
75 |             # Take feed-forward
76 |             y_hat = engine.model(x, attention_mask=mask).logits
77 | 
78 |             loss = engine.crit(y_hat, y)
79 | 
80 |             if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor):
81 |                 accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
82 |             else:
83 |                 accuracy = 0
84 | 
85 |         return {
86 |             'loss': float(loss),
87 |             'accuracy': float(accuracy),
88 |         }
89 | 
90 | 
91 | class BertTrainer(Trainer):
92 | 
93 |     def __init__(self, config):
94 |         self.config = config
95 | 
96 |     def train(
97 |         self,
98 |         model, crit, optimizer, scheduler,
99 |         train_loader, valid_loader,
100 |     ):
101 |         train_engine = EngineForBert(
102 |             EngineForBert.train,
103 |             model, crit, optimizer, scheduler, self.config
104 |         )
105 |         validation_engine = EngineForBert(
106 |             EngineForBert.validate,
107 |             model, crit, optimizer, scheduler, self.config
108 |         )
109 | 
110 |         EngineForBert.attach(
111 |             train_engine,
112 |             validation_engine,
113 |             verbose=self.config.verbose
114 |         )
115 | 
116 |         def run_validation(engine, validation_engine, valid_loader):
117 |             validation_engine.run(valid_loader, max_epochs=1)
118 | 
119 |         train_engine.add_event_handler(
120 |             Events.EPOCH_COMPLETED, # event
121 |             run_validation, # function
122 |             validation_engine, valid_loader, # arguments
123 |         )
124 |         validation_engine.add_event_handler(
125 |             Events.EPOCH_COMPLETED, # event
126 |             EngineForBert.check_best, # function
127 |         )
128 | 
129 |         train_engine.run(
130 |             train_loader,
131 |             max_epochs=self.config.n_epochs,
132 |         )
133 | 
134 |         model.load_state_dict(validation_engine.best_model)
135 | 
136 |         return model
137 | 
--------------------------------------------------------------------------------
/classify.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | import torchtext  # needed for the version check below
4 | import torch
5 | import torch.nn as nn
6 | version = list(map(int, torchtext.__version__.split('.')))
7 | if version[0] <= 0 and version[1] < 9:
8 |     from torchtext import data
9 | else:
10 |     from torchtext.legacy import data
11 | 
12 | from simple_ntc.models.rnn import RNNClassifier
13 | from simple_ntc.models.cnn import CNNClassifier
14 | 
15 | 
16 | def define_argparser():
17 |     '''
18 |     Define an argument parser for inference with a pre-trained model.
19 |     '''
20 |     p = argparse.ArgumentParser()
21 | 
22 |     p.add_argument('--model_fn', required=True)
23 |     p.add_argument('--gpu_id', type=int, default=-1)
24 |     p.add_argument('--batch_size', type=int, default=256)
25 |     p.add_argument('--top_k', type=int, default=1)
26 |     p.add_argument('--max_length', type=int, default=256)
27 | 
28 |     p.add_argument('--drop_rnn', action='store_true')
29 |     p.add_argument('--drop_cnn', action='store_true')
30 | 
31 |     config = p.parse_args()
32 | 
33 |     return config
34 | 
35 | 
36 | def read_text(max_length=256):
37 |     '''
38 |     Read text from standard input for inference.
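    Each line is split on spaces and truncated to max_length tokens.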
39 | ''' 40 | lines = [] 41 | 42 | for line in sys.stdin: 43 | if line.strip() != '': 44 | lines += [line.strip().split(' ')[:max_length]] 45 | 46 | return lines 47 | 48 | 49 | def define_field(): 50 | ''' 51 | To avoid use DataLoader class, just declare dummy fields. 52 | With those fields, we can retore mapping table between words and indice. 53 | ''' 54 | return ( 55 | data.Field( 56 | use_vocab=True, 57 | batch_first=True, 58 | include_lengths=False, 59 | ), 60 | data.Field( 61 | sequential=False, 62 | use_vocab=True, 63 | unk_token=None, 64 | ) 65 | ) 66 | 67 | 68 | def main(config): 69 | saved_data = torch.load( 70 | config.model_fn, 71 | map_location='cpu' if config.gpu_id < 0 else 'cuda:%d' % config.gpu_id 72 | ) 73 | 74 | train_config = saved_data['config'] 75 | rnn_best = saved_data['rnn'] 76 | cnn_best = saved_data['cnn'] 77 | vocab = saved_data['vocab'] 78 | classes = saved_data['classes'] 79 | 80 | vocab_size = len(vocab) 81 | n_classes = len(classes) 82 | 83 | text_field, label_field = define_field() 84 | text_field.vocab = vocab 85 | label_field.vocab = classes 86 | 87 | lines = read_text(max_length=config.max_length) 88 | 89 | with torch.no_grad(): 90 | ensemble = [] 91 | if rnn_best is not None and not config.drop_rnn: 92 | # Declare model and load pre-trained weights. 93 | model = RNNClassifier( 94 | input_size=vocab_size, 95 | word_vec_size=train_config.word_vec_size, 96 | hidden_size=train_config.hidden_size, 97 | n_classes=n_classes, 98 | n_layers=train_config.n_layers, 99 | dropout_p=train_config.dropout, 100 | ) 101 | model.load_state_dict(rnn_best) 102 | ensemble += [model] 103 | if cnn_best is not None and not config.drop_cnn: 104 | # Declare model and load pre-trained weights. 105 | model = CNNClassifier( 106 | input_size=vocab_size, 107 | word_vec_size=train_config.word_vec_size, 108 | n_classes=n_classes, 109 | use_batch_norm=train_config.use_batch_norm, 110 | dropout_p=train_config.dropout, 111 | window_sizes=train_config.window_sizes, 112 | n_filters=train_config.n_filters, 113 | ) 114 | model.load_state_dict(cnn_best) 115 | ensemble += [model] 116 | 117 | y_hats = [] 118 | # Get prediction with iteration on ensemble. 119 | for model in ensemble: 120 | if config.gpu_id >= 0: 121 | model.cuda(config.gpu_id) 122 | # Don't forget turn-on evaluation mode. 123 | model.eval() 124 | 125 | y_hat = [] 126 | for idx in range(0, len(lines), config.batch_size): 127 | # Converts string to list of index. 128 | x = text_field.numericalize( 129 | text_field.pad(lines[idx:idx + config.batch_size]), 130 | device='cuda:%d' % config.gpu_id if config.gpu_id >= 0 else 'cpu', 131 | ) 132 | 133 | y_hat += [model(x).cpu()] 134 | # Concatenate the mini-batch wise result 135 | y_hat = torch.cat(y_hat, dim=0) 136 | # |y_hat| = (len(lines), n_classes) 137 | 138 | y_hats += [y_hat] 139 | 140 | model.cpu() 141 | # Merge to one tensor for ensemble result and make probability from log-prob. 
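        # Both classifiers end with LogSoftmax, so exp() turns log-probabilities back into
        # probabilities before averaging over the ensemble.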
142 | y_hats = torch.stack(y_hats).exp() 143 | # |y_hats| = (len(ensemble), len(lines), n_classes) 144 | y_hats = y_hats.sum(dim=0) / len(ensemble) # Get average 145 | # |y_hats| = (len(lines), n_classes) 146 | 147 | probs, indice = y_hats.topk(config.top_k) 148 | 149 | for i in range(len(lines)): 150 | sys.stdout.write('%s\t%s\n' % ( 151 | ' '.join([classes.itos[indice[i][j]] for j in range(config.top_k)]), 152 | ' '.join(lines[i]) 153 | )) 154 | 155 | 156 | if __name__ == '__main__': 157 | config = define_argparser() 158 | main(config) 159 | -------------------------------------------------------------------------------- /finetune_plm_hftrainer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | 4 | from sklearn.metrics import accuracy_score 5 | 6 | import torch 7 | 8 | from transformers import BertTokenizerFast 9 | from transformers import BertForSequenceClassification, AlbertForSequenceClassification, RobertaForSequenceClassification 10 | from transformers import Trainer 11 | from transformers import TrainingArguments 12 | 13 | from simple_ntc.bert_dataset import TextClassificationCollator 14 | from simple_ntc.bert_dataset import TextClassificationDataset 15 | from simple_ntc.utils import read_text 16 | 17 | 18 | def define_argparser(): 19 | p = argparse.ArgumentParser() 20 | 21 | p.add_argument('--model_fn', required=True) 22 | p.add_argument('--train_fn', required=True) 23 | # Recommended model list: 24 | # - kykim/bert-kor-base 25 | # - kykim/albert-kor-base 26 | # - beomi/kcbert-base 27 | # - beomi/kcbert-large 28 | p.add_argument('--pretrained_model_name', type=str, default='beomi/kcbert-base') 29 | p.add_argument('--use_albert', action='store_true') 30 | p.add_argument('--use_roberta', action='store_true') 31 | 32 | p.add_argument('--valid_ratio', type=float, default=.2) 33 | p.add_argument('--batch_size_per_device', type=int, default=32) 34 | p.add_argument('--n_epochs', type=int, default=5) 35 | 36 | p.add_argument('--warmup_ratio', type=float, default=.2) 37 | 38 | p.add_argument('--max_length', type=int, default=100) 39 | 40 | config = p.parse_args() 41 | 42 | return config 43 | 44 | 45 | def get_datasets(fn, valid_ratio=.2): 46 | # Get list of labels and list of texts. 47 | labels, texts = read_text(fn) 48 | 49 | # Generate label to index map. 50 | unique_labels = list(set(labels)) 51 | label_to_index = {} 52 | index_to_label = {} 53 | for i, label in enumerate(unique_labels): 54 | label_to_index[label] = i 55 | index_to_label[i] = label 56 | 57 | # Convert label text to integer value. 58 | labels = list(map(label_to_index.get, labels)) 59 | 60 | # Shuffle before split into train and validation set. 61 | shuffled = list(zip(texts, labels)) 62 | random.shuffle(shuffled) 63 | texts = [e[0] for e in shuffled] 64 | labels = [e[1] for e in shuffled] 65 | idx = int(len(texts) * (1 - valid_ratio)) 66 | 67 | train_dataset = TextClassificationDataset(texts[:idx], labels[:idx]) 68 | valid_dataset = TextClassificationDataset(texts[idx:], labels[idx:]) 69 | 70 | return train_dataset, valid_dataset, index_to_label 71 | 72 | 73 | def main(config): 74 | # Get pretrained tokenizer. 75 | tokenizer = BertTokenizerFast.from_pretrained(config.pretrained_model_name) 76 | # Get datasets and index to label map. 
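    # get_datasets() shuffles the corpus and splits it into train/valid sets according to valid_ratio.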
77 | train_dataset, valid_dataset, index_to_label = get_datasets( 78 | config.train_fn, 79 | valid_ratio=config.valid_ratio 80 | ) 81 | 82 | print( 83 | '|train| =', len(train_dataset), 84 | '|valid| =', len(valid_dataset), 85 | ) 86 | 87 | total_batch_size = config.batch_size_per_device * torch.cuda.device_count() 88 | n_total_iterations = int(len(train_dataset) / total_batch_size * config.n_epochs) 89 | n_warmup_steps = int(n_total_iterations * config.warmup_ratio) 90 | print( 91 | '#total_iters =', n_total_iterations, 92 | '#warmup_iters =', n_warmup_steps, 93 | ) 94 | 95 | # Get pretrained model with specified softmax layer. 96 | assert not (config.use_albert and config.use_roberta), 'Only one of use_albert and use_roberta can be True.' 97 | if config.use_albert: 98 | model_loader = AlbertForSequenceClassification 99 | elif config.use_roberta: 100 | model_loader = RobertaForSequenceClassification 101 | else: 102 | model_loader = BertForSequenceClassification 103 | 104 | model = model_loader.from_pretrained( 105 | config.pretrained_model_name, 106 | num_labels=len(index_to_label) 107 | ) 108 | 109 | training_args = TrainingArguments( 110 | output_dir='./.checkpoints', 111 | num_train_epochs=config.n_epochs, 112 | per_device_train_batch_size=config.batch_size_per_device, 113 | per_device_eval_batch_size=config.batch_size_per_device, 114 | warmup_steps=n_warmup_steps, 115 | weight_decay=0.01, 116 | fp16=True, 117 | evaluation_strategy='epoch', 118 | save_strategy='epoch', 119 | logging_steps=n_total_iterations // 100, 120 | save_steps=n_total_iterations // config.n_epochs, 121 | load_best_model_at_end=True, 122 | ) 123 | 124 | def compute_metrics(pred): 125 | labels = pred.label_ids 126 | preds = pred.predictions.argmax(-1) 127 | 128 | return { 129 | 'accuracy': accuracy_score(labels, preds) 130 | } 131 | 132 | trainer = Trainer( 133 | model=model, 134 | args=training_args, 135 | data_collator=TextClassificationCollator(tokenizer, 136 | config.max_length, 137 | with_text=False), 138 | train_dataset=train_dataset, 139 | eval_dataset=valid_dataset, 140 | compute_metrics=compute_metrics, 141 | ) 142 | 143 | trainer.train() 144 | 145 | torch.save({ 146 | 'rnn': None, 147 | 'cnn': None, 148 | 'bert': trainer.model.state_dict(), 149 | 'config': config, 150 | 'vocab': None, 151 | 'classes': index_to_label, 152 | 'tokenizer': tokenizer, 153 | }, config.model_fn) 154 | 155 | 156 | if __name__ == '__main__': 157 | config = define_argparser() 158 | main(config) 159 | -------------------------------------------------------------------------------- /finetune_plm_native.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.utils.data import DataLoader 8 | 9 | from transformers import BertTokenizerFast 10 | from transformers import BertForSequenceClassification, AlbertForSequenceClassification 11 | from transformers import AdamW 12 | from transformers import get_linear_schedule_with_warmup 13 | 14 | import torch_optimizer as custom_optim 15 | 16 | from simple_ntc.bert_trainer import BertTrainer as Trainer 17 | from simple_ntc.bert_dataset import TextClassificationDataset, TextClassificationCollator 18 | from simple_ntc.utils import read_text 19 | 20 | 21 | def define_argparser(): 22 | p = argparse.ArgumentParser() 23 | 24 | p.add_argument('--model_fn', required=True) 25 | p.add_argument('--train_fn', required=True) 26 | # Recommended model 
list: 27 | # - kykim/bert-kor-base 28 | # - kykim/albert-kor-base 29 | # - beomi/kcbert-base 30 | # - beomi/kcbert-large 31 | p.add_argument('--pretrained_model_name', type=str, default='beomi/kcbert-base') 32 | p.add_argument('--use_albert', action='store_true') 33 | 34 | p.add_argument('--gpu_id', type=int, default=-1) 35 | p.add_argument('--verbose', type=int, default=2) 36 | 37 | p.add_argument('--batch_size', type=int, default=32) 38 | p.add_argument('--n_epochs', type=int, default=5) 39 | 40 | p.add_argument('--lr', type=float, default=5e-5) 41 | p.add_argument('--warmup_ratio', type=float, default=.2) 42 | p.add_argument('--adam_epsilon', type=float, default=1e-8) 43 | # If you want to use RAdam, I recommend to use LR=1e-4. 44 | # Also, you can set warmup_ratio=0. 45 | p.add_argument('--use_radam', action='store_true') 46 | p.add_argument('--valid_ratio', type=float, default=.2) 47 | 48 | p.add_argument('--max_length', type=int, default=100) 49 | 50 | config = p.parse_args() 51 | 52 | return config 53 | 54 | 55 | def get_loaders(fn, tokenizer, valid_ratio=.2): 56 | # Get list of labels and list of texts. 57 | labels, texts = read_text(fn) 58 | 59 | # Generate label to index map. 60 | unique_labels = list(set(labels)) 61 | label_to_index = {} 62 | index_to_label = {} 63 | for i, label in enumerate(unique_labels): 64 | label_to_index[label] = i 65 | index_to_label[i] = label 66 | 67 | # Convert label text to integer value. 68 | labels = list(map(label_to_index.get, labels)) 69 | 70 | # Shuffle before split into train and validation set. 71 | shuffled = list(zip(texts, labels)) 72 | random.shuffle(shuffled) 73 | texts = [e[0] for e in shuffled] 74 | labels = [e[1] for e in shuffled] 75 | idx = int(len(texts) * (1 - valid_ratio)) 76 | 77 | # Get dataloaders using given tokenizer as collate_fn. 78 | train_loader = DataLoader( 79 | TextClassificationDataset(texts[:idx], labels[:idx]), 80 | batch_size=config.batch_size, 81 | shuffle=True, 82 | collate_fn=TextClassificationCollator(tokenizer, config.max_length), 83 | ) 84 | valid_loader = DataLoader( 85 | TextClassificationDataset(texts[idx:], labels[idx:]), 86 | batch_size=config.batch_size, 87 | collate_fn=TextClassificationCollator(tokenizer, config.max_length), 88 | ) 89 | 90 | return train_loader, valid_loader, index_to_label 91 | 92 | 93 | def get_optimizer(model, config): 94 | if config.use_radam: 95 | optimizer = custom_optim.RAdam(model.parameters(), lr=config.lr) 96 | else: 97 | # Prepare optimizer and schedule (linear warmup and decay) 98 | no_decay = ['bias', 'LayerNorm.weight'] 99 | optimizer_grouped_parameters = [ 100 | { 101 | 'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 102 | 'weight_decay': 0.01 103 | }, 104 | { 105 | 'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 106 | 'weight_decay': 0.0 107 | } 108 | ] 109 | 110 | optimizer = optim.AdamW( 111 | optimizer_grouped_parameters, 112 | lr=config.lr, 113 | eps=config.adam_epsilon 114 | ) 115 | 116 | return optimizer 117 | 118 | 119 | def main(config): 120 | # Get pretrained tokenizer. 121 | tokenizer = BertTokenizerFast.from_pretrained(config.pretrained_model_name) 122 | # Get dataloaders using tokenizer from untokenized corpus. 
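    # Note that get_loaders() reads `config` from the module scope (it is set in __main__ below),
    # and the collator tokenizes and pads each mini-batch on the fly.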
123 |     train_loader, valid_loader, index_to_label = get_loaders(
124 |         config.train_fn,
125 |         tokenizer,
126 |         valid_ratio=config.valid_ratio
127 |     )
128 | 
129 |     print(
130 |         '|train| =', len(train_loader) * config.batch_size,
131 |         '|valid| =', len(valid_loader) * config.batch_size,
132 |     )
133 | 
134 |     n_total_iterations = len(train_loader) * config.n_epochs
135 |     n_warmup_steps = int(n_total_iterations * config.warmup_ratio)
136 |     print(
137 |         '#total_iters =', n_total_iterations,
138 |         '#warmup_iters =', n_warmup_steps,
139 |     )
140 | 
141 |     # Get pretrained model with specified softmax layer.
142 |     model_loader = AlbertForSequenceClassification if config.use_albert else BertForSequenceClassification
143 |     model = model_loader.from_pretrained(
144 |         config.pretrained_model_name,
145 |         num_labels=len(index_to_label)
146 |     )
147 |     optimizer = get_optimizer(model, config)
148 | 
149 |     # By default, the model returns hidden representations before the softmax function.
150 |     # Thus, we need to use CrossEntropyLoss, which combines LogSoftmax and NLLLoss.
151 |     crit = nn.CrossEntropyLoss()
152 |     scheduler = get_linear_schedule_with_warmup(
153 |         optimizer,
154 |         n_warmup_steps,
155 |         n_total_iterations
156 |     )
157 | 
158 |     if config.gpu_id >= 0:
159 |         model.cuda(config.gpu_id)
160 |         crit.cuda(config.gpu_id)
161 | 
162 |     # Start training.
163 |     trainer = Trainer(config)
164 |     model = trainer.train(
165 |         model,
166 |         crit,
167 |         optimizer,
168 |         scheduler,
169 |         train_loader,
170 |         valid_loader,
171 |     )
172 | 
173 |     torch.save({
174 |         'rnn': None,
175 |         'cnn': None,
176 |         'bert': model.state_dict(),
177 |         'config': config,
178 |         'vocab': None,
179 |         'classes': index_to_label,
180 |         'tokenizer': tokenizer,
181 |     }, config.model_fn)
182 | 
183 | if __name__ == '__main__':
184 |     config = define_argparser()
185 |     main(config)
186 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simple Neural Text Classification (NTC)
2 | 
3 | This repository contains a simple, naive implementation of text classification using a recurrent neural network (LSTM) and a convolutional neural network (from [[Kim 2014](http://arxiv.org/abs/1408.5882)]). You need to specify the architecture to train, and you can select both. If you choose both architectures to classify sentences, inference will be done by an ensemble (a simple average).
4 | 
5 | In addition, this repo accompanies my [lecture](https://www.fastcampus.co.kr/data_camp_nlpbasic/) and [book](https://kh-kim.gitbook.io/natural-language-processing-with-pytorch/). Please refer to those sites for further information.
6 | 
7 | ## Pre-requisite
8 | 
9 | - Python 3.6 or higher
10 | - PyTorch 1.6 or higher
11 | - PyTorch Ignite
12 | - TorchText 0.5 or higher
13 | - [torch-optimizer 0.0.1a15](https://pypi.org/project/torch-optimizer/)
14 | - Tokenized corpus (e.g. [Moses](https://www.nltk.org/_modules/nltk/tokenize/moses.html), Mecab, [Jieba](https://github.com/fxsjy/jieba))
15 | 
16 | If you want to use BERT fine-tuning, you also need:
17 | 
18 | - Huggingface Transformers
19 | 
20 | ## Usage
21 | 
22 | ### Preparation
23 | 
24 | #### Format
25 | 
26 | The input file should have two tab-delimited columns: class and sentence. The class does not need to be a number; any word (without white-space) works. Below is an example corpus.
27 | 
28 | ```bash
29 | $ cat ./data/raw_corpus.txt | shuf | head
30 | positive 나름 괜찬항요 막 엄청 좋은건 아님 그냥 그럭저럭임... 아직 까지 인생 디퓨져는 못찾은느낌
31 | negative 재질은플라스틱부분이많고요...금방깨질거같아요..당장 물은나오게해야하기에..그냥설치했어요..지금도 조금은후회중.....
32 | positive 평소 신던 신발보다 크긴하지만 운동화라 끈 조절해서 신으려구요 신발 이쁘고 편하네요
33 | positive 두개사서 직장에 구비해두고 먹고있어요 양 많아서 오래쓸듯
34 | positive 생일선물로 샀는데 받으시는 분도 만족하시구 배송도 빨라서 좋았네요
35 | positive 아이가 너무 좋아합니다 크롱도 좋아라하지만 루피를 더..
36 | negative 배송은 기다릴수 있었는데 8개나 주문했는데 샘플을 너무 적게보내주시네요ㅡㅡ;;
37 | positive 너무귀여워요~~ㅎ아직사용은 못해? f지만 이젠 모기땜에 잠설치는일은 ? j겟죠
38 | positive 13개월 아가 제일좋은 간식이네요
39 | positive 지인추천으로 샀어요~ 싸고 가성비 좋다해서 낮기저귀로 써보려구요~
40 | ```
41 | 
42 | #### Tokenization (Optional)
43 | 
44 | You may need to tokenize the sentences in the corpus. Select a tokenizer suitable for your language (e.g. Mecab for Korean).
45 | 
46 | ```bash
47 | $ cat ./data/raw_corpus.txt | awk -F'\t' '{ print $2 }' | mecab -O wakati > ./data/tmp.txt
48 | $ cat ./data/raw_corpus.txt | awk -F'\t' '{ print $1 }' > ./data/tmp_class.txt
49 | $ paste ./data/tmp_class.txt ./data/tmp.txt > ./data/corpus.txt
50 | $ rm ./data/tmp.txt ./data/tmp_class.txt
51 | ```
52 | 
53 | #### Shuffle and Split for Train-set and Valid-set
54 | 
55 | After formatting and tokenization, you need to split the corpus into a train-set and a valid-set.
56 | 
57 | ```bash
58 | $ wc -l ./data/corpus.txt
59 | 302680 ./data/corpus.txt
60 | ```
61 | 
62 | As you can see, we have more than 300k samples in the corpus.
63 | 
64 | ```bash
65 | $ cat ./data/corpus.txt | shuf > ./data/corpus.shuf.txt
66 | $ head -n 62680 ./data/corpus.shuf.txt > ./data/corpus.valid.txt
67 | $ tail -n 240000 ./data/corpus.shuf.txt > ./data/corpus.train.txt
68 | ```
69 | 
70 | Now you have 240,000 samples for the train-set and 62,680 samples for the valid-set. Note that you can use the 'rl' command instead of 'shuf' if you are using macOS.
71 | 
72 | ### Train
73 | 
74 | Below is an example command for training. You can set your own hyper-parameter values via the command-line arguments.
75 | 
76 | ```bash
77 | python train.py --model_fn ./models/model.pth --train_fn ./data/corpus.train.txt --rnn --cnn --gpu_id 0
78 | ```
79 | 
80 | Note that you need to specify an architecture for training, and you can select both RNN and CNN for the ensemble method. You can also select the device to use for training: to use the CPU only, pass -1 for the '--gpu_id' argument, which is the default value.
81 | 
82 | ```bash
83 | $ python ./train.py --help
84 | usage: train.py [-h] --model_fn MODEL_FN --train_fn TRAIN_FN [--gpu_id GPU_ID]
85 |                 [--verbose VERBOSE] [--min_vocab_freq MIN_VOCAB_FREQ]
86 |                 [--max_vocab_size MAX_VOCAB_SIZE] [--batch_size BATCH_SIZE]
87 |                 [--n_epochs N_EPOCHS] [--word_vec_size WORD_VEC_SIZE]
88 |                 [--dropout DROPOUT] [--max_length MAX_LENGTH] [--rnn]
89 |                 [--hidden_size HIDDEN_SIZE] [--n_layers N_LAYERS] [--cnn]
90 |                 [--use_batch_norm]
91 |                 [--window_sizes [WINDOW_SIZES [WINDOW_SIZES ...]]]
92 |                 [--n_filters [N_FILTERS [N_FILTERS ...]]]
93 | ```
94 | 
95 | Or you can check the default hyper-parameters in train.py.
96 | 
97 | ### Inference
98 | 
99 | You can feed standard input to the classifier for inference, as below. Each prediction consists of two tab-delimited columns (the top-k classes and the input sentence). The result is written to standard output.
100 | 
101 | ```bash
102 | $ head ./data/review.sorted.uniq.refined.tok.shuf.test.tsv | awk -F'\t' '{ print $2 }' | python classify.py --model_fn ./models/model.pth --gpu_id -1 --top_k 1
103 | positive 생각 보다 밝 아요 ㅎㅎ
104 | negative 쓸 대 가 없 네요
105 | positive 깔 금 해요 . 가벼워 요 . 설치 가 쉬워요 . 타 사이트 에 비해 가격 도 저렴 하 답니다 .
106 | positive 크기 나 두께 가 딱 제 가 원 하 던 사이즈 네요 .
책상 의자 가 너무 딱딱 해서 쿠션 감 좋 은 방석 이 필요 하 던 차 에 좋 은 제품 만났 네요 . 냄새 얘기 하 시 는 분 도 더러 있 던데 별로 냄새 안 나 요 . 107 | positive 빠르 고 괜찬 습니다 . 108 | positive 유통 기한 도 넉넉 하 고 좋 아요 109 | positive 좋 은 가격 에 좋 은 상품 잘 쓰 겠 습니다 . 110 | negative 사이트 에서 늘 생리대 사 서 쓰 는데 오늘 처럼 이렇게 비닐 에 포장 되 어 받 아 본 건 처음 입니다 . 위생 용품 이 고 자체 도 비닐 포장 이 건만 소형 박스 에 라도 넣 어 보내 주 시 지 . .. 111 | negative 연결 부분 이 많이 티 가 납니다 . 재질 구김 도 좀 있 습니다 . 112 | positive 애기 태열 때문 에 구매 해서 잘 쓰 고 있 습니다 . 113 | ``` 114 | 115 | Also, you can see the arguments, and see the default values on classify.py. 116 | 117 | ```bash 118 | $ python classify.py -h 119 | usage: classify.py [-h] --model_fn MODEL [--gpu_id GPU_ID] 120 | [--batch_size BATCH_SIZE] [--top_k TOP_K] 121 | ``` 122 | 123 | ## Evaluation 124 | 125 | I split the corpus to make train-set and valid-set. 240,000 lines are sampled for train-set and 62,680 samples for valid-set. Architecture snapshots are like as below. You may increase the performance with hyper-parameter optimization. 126 | 127 | ```bash 128 | RNNClassifier( 129 | (emb): Embedding(35532, 128) 130 | (rnn): LSTM(128, 256, num_layers=4, batch_first=True, dropout=0.3, bidirectional=True) 131 | (generator): Linear(in_features=512, out_features=2, bias=True) 132 | (activation): LogSoftmax() 133 | ) 134 | ``` 135 | 136 | ```bash 137 | CNNClassifier( 138 | (emb): Embedding(35532, 256) 139 | (feature_extractors): ModuleList( 140 | (0): Sequential( 141 | (0): Conv2d(1, 100, kernel_size=(3, 256), stride=(1, 1)) 142 | (1): ReLU() 143 | (2): Dropout(p=0.3, inplace=False) 144 | ) 145 | (1): Sequential( 146 | (0): Conv2d(1, 100, kernel_size=(4, 256), stride=(1, 1)) 147 | (1): ReLU() 148 | (2): Dropout(p=0.3, inplace=False) 149 | ) 150 | (2): Sequential( 151 | (0): Conv2d(1, 100, kernel_size=(5, 256), stride=(1, 1)) 152 | (1): ReLU() 153 | (2): Dropout(p=0.3, inplace=False) 154 | ) 155 | ) 156 | (generator): Linear(in_features=300, out_features=2, bias=True) 157 | (activation): LogSoftmax() 158 | ) 159 | ``` 160 | 161 | |Architecture|Test Accuracy| 162 | |-|-| 163 | |Bi-LSTM|0.9035| 164 | |CNN|0.9090| 165 | |Bi-LSTM + CNN|0.9142| 166 | |KcBERT|0.9598| 167 | 168 | ## Author 169 | 170 | |Name|Kim, Ki Hyun| 171 | |-|-| 172 | |email|pointzz.ki@gmail.com| 173 | |github|https://github.com/kh-kim/| 174 | |linkedin|https://www.linkedin.com/in/ki-hyun-kim/| 175 | 176 | ## Reference 177 | 178 | - Kim, Convolutional neural networks for sentence classification, EMNLP, 2014 179 | - Devlin et al., BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, ACL, 2019 180 | - [Lee, KcBERT: Korean comments BERT, GitHub, 2020](https://github.com/Beomi/KcBERT) 181 | -------------------------------------------------------------------------------- /simple_ntc/trainer.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | 5 | import torch 6 | 7 | from ignite.engine import Engine 8 | from ignite.engine import Events 9 | from ignite.metrics import RunningAverage 10 | from ignite.contrib.handlers.tqdm_logger import ProgressBar 11 | 12 | from simple_ntc.utils import get_grad_norm, get_parameter_norm 13 | 14 | VERBOSE_SILENT = 0 15 | VERBOSE_EPOCH_WISE = 1 16 | VERBOSE_BATCH_WISE = 2 17 | 18 | 19 | class MyEngine(Engine): 20 | 21 | def __init__(self, func, model, crit, optimizer, config): 22 | # Ignite Engine does not have objects in below lines. 23 | # Thus, we assign class variables to access these object, during the procedure. 
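        # e.g. the static train/validate methods below reach them as engine.model, engine.crit, etc.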
24 | self.model = model 25 | self.crit = crit 26 | self.optimizer = optimizer 27 | self.config = config 28 | 29 | super().__init__(func) # Ignite Engine only needs function to run. 30 | 31 | self.best_loss = np.inf 32 | self.best_model = None 33 | 34 | self.device = next(model.parameters()).device 35 | 36 | @staticmethod 37 | def train(engine, mini_batch): 38 | # You have to reset the gradients of all model parameters 39 | # before to take another step in gradient descent. 40 | engine.model.train() # Because we assign model as class variable, we can easily access to it. 41 | engine.optimizer.zero_grad() 42 | 43 | x, y = mini_batch.text, mini_batch.label 44 | x, y = x.to(engine.device), y.to(engine.device) 45 | 46 | x = x[:, :engine.config.max_length] 47 | 48 | # Take feed-forward 49 | y_hat = engine.model(x) 50 | 51 | loss = engine.crit(y_hat, y) 52 | loss.backward() 53 | 54 | # Calculate accuracy only if 'y' is LongTensor, 55 | # which means that 'y' is one-hot representation. 56 | if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor): 57 | accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0)) 58 | else: 59 | accuracy = 0 60 | 61 | p_norm = float(get_parameter_norm(engine.model.parameters())) 62 | g_norm = float(get_grad_norm(engine.model.parameters())) 63 | 64 | # Take a step of gradient descent. 65 | engine.optimizer.step() 66 | 67 | return { 68 | 'loss': float(loss), 69 | 'accuracy': float(accuracy), 70 | '|param|': p_norm, 71 | '|g_param|': g_norm, 72 | } 73 | 74 | @staticmethod 75 | def validate(engine, mini_batch): 76 | engine.model.eval() 77 | 78 | with torch.no_grad(): 79 | x, y = mini_batch.text, mini_batch.label 80 | x, y = x.to(engine.device), y.to(engine.device) 81 | 82 | x = x[:, :engine.config.max_length] 83 | 84 | y_hat = engine.model(x) 85 | 86 | loss = engine.crit(y_hat, y) 87 | 88 | if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor): 89 | accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0)) 90 | else: 91 | accuracy = 0 92 | 93 | return { 94 | 'loss': float(loss), 95 | 'accuracy': float(accuracy), 96 | } 97 | 98 | @staticmethod 99 | def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE): 100 | # Attaching would be repaeted for serveral metrics. 101 | # Thus, we can reduce the repeated codes by using this function. 102 | def attach_running_average(engine, metric_name): 103 | RunningAverage(output_transform=lambda x: x[metric_name]).attach( 104 | engine, 105 | metric_name, 106 | ) 107 | 108 | training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|'] 109 | 110 | for metric_name in training_metric_names: 111 | attach_running_average(train_engine, metric_name) 112 | 113 | # If the verbosity is set, progress bar would be shown for mini-batch iterations. 114 | # Without ignite, you can use tqdm to implement progress bar. 115 | if verbose >= VERBOSE_BATCH_WISE: 116 | pbar = ProgressBar(bar_format=None, ncols=120) 117 | pbar.attach(train_engine, training_metric_names) 118 | 119 | # If the verbosity is set, statistics would be shown after each epoch. 
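        # The printed values are the running averages attached above, not single-batch numbers.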
120 | if verbose >= VERBOSE_EPOCH_WISE: 121 | @train_engine.on(Events.EPOCH_COMPLETED) 122 | def print_train_logs(engine): 123 | print('Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} accuracy={:.4f}'.format( 124 | engine.state.epoch, 125 | engine.state.metrics['|param|'], 126 | engine.state.metrics['|g_param|'], 127 | engine.state.metrics['loss'], 128 | engine.state.metrics['accuracy'], 129 | )) 130 | 131 | validation_metric_names = ['loss', 'accuracy'] 132 | 133 | for metric_name in validation_metric_names: 134 | attach_running_average(validation_engine, metric_name) 135 | 136 | # Do same things for validation engine. 137 | if verbose >= VERBOSE_BATCH_WISE: 138 | pbar = ProgressBar(bar_format=None, ncols=120) 139 | pbar.attach(validation_engine, validation_metric_names) 140 | 141 | if verbose >= VERBOSE_EPOCH_WISE: 142 | @validation_engine.on(Events.EPOCH_COMPLETED) 143 | def print_valid_logs(engine): 144 | print('Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}'.format( 145 | engine.state.metrics['loss'], 146 | engine.state.metrics['accuracy'], 147 | engine.best_loss, 148 | )) 149 | 150 | @staticmethod 151 | def check_best(engine): 152 | loss = float(engine.state.metrics['loss']) 153 | if loss <= engine.best_loss: # If current epoch returns lower validation loss, 154 | engine.best_loss = loss # Update lowest validation loss. 155 | engine.best_model = deepcopy(engine.model.state_dict()) # Update best model weights. 156 | 157 | @staticmethod 158 | def save_model(engine, train_engine, config, **kwargs): 159 | torch.save( 160 | { 161 | 'model': engine.best_model, 162 | 'config': config, 163 | **kwargs 164 | }, config.model_fn 165 | ) 166 | 167 | 168 | class Trainer(): 169 | 170 | def __init__(self, config): 171 | self.config = config 172 | 173 | def train( 174 | self, 175 | model, crit, optimizer, 176 | train_loader, valid_loader, 177 | ): 178 | train_engine = MyEngine( 179 | MyEngine.train, 180 | model, crit, optimizer, self.config 181 | ) 182 | validation_engine = MyEngine( 183 | MyEngine.validate, 184 | model, crit, optimizer, self.config 185 | ) 186 | 187 | MyEngine.attach( 188 | train_engine, 189 | validation_engine, 190 | verbose=self.config.verbose 191 | ) 192 | 193 | def run_validation(engine, validation_engine, valid_loader): 194 | validation_engine.run(valid_loader, max_epochs=1) 195 | 196 | train_engine.add_event_handler( 197 | Events.EPOCH_COMPLETED, # event 198 | run_validation, # function 199 | validation_engine, valid_loader, # arguments 200 | ) 201 | validation_engine.add_event_handler( 202 | Events.EPOCH_COMPLETED, # event 203 | MyEngine.check_best, # function 204 | ) 205 | 206 | train_engine.run( 207 | train_loader, 208 | max_epochs=self.config.n_epochs, 209 | ) 210 | 211 | model.load_state_dict(validation_engine.best_model) 212 | 213 | return model 214 | --------------------------------------------------------------------------------