├── simple_ntc ├── __init__.py ├── bert_dataset.py ├── utils.py ├── models │ ├── rnn.py │ └── cnn.py ├── data_loader.py ├── bert_trainer.py └── trainer.py ├── .gitignore ├── models └── get_accuracy.py ├── get_confusion_matrix.py ├── classify_plm.py ├── train.py ├── classify.py ├── finetune_plm_hftrainer.py ├── finetune_plm_native.py └── README.md /simple_ntc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | *.pyc 3 | data 4 | data/* 5 | .DS_Store 6 | ._.DS_Store 7 | models 8 | .checkpoints 9 | wandb/ 10 | -------------------------------------------------------------------------------- /models/get_accuracy.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def read_text(fn): 4 | with open(fn, 'r') as f: 5 | lines = f.readlines() 6 | 7 | return lines 8 | 9 | def main(ref_fn, hyp_fn): 10 | refs = read_text(ref_fn) 11 | hyps = read_text(hyp_fn) 12 | 13 | correcnt_cnt = 0 14 | for ref, hyp in zip(refs, hyps): 15 | if ref == hyp: 16 | correcnt_cnt += 1 17 | 18 | print('%d / %d = %.4f' % (correcnt_cnt, len(refs), float(correcnt_cnt) / len(refs))) 19 | 20 | if __name__ == '__main__': 21 | ref_fn = sys.argv[1] 22 | hyp_fn = sys.argv[2] 23 | 24 | main(ref_fn, hyp_fn) 25 | -------------------------------------------------------------------------------- /get_confusion_matrix.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | 5 | def read_stdin(): 6 | lines = [] 7 | 8 | for line in sys.stdin: 9 | if line.strip() != '': 10 | lines += [line.strip().split('\t')[0]] 11 | 12 | return lines 13 | 14 | 15 | def read_text(fn): 16 | lines = [] 17 | f = open(fn, 'r') 18 | 19 | for line in f: 20 | if line.strip() != '': 21 | lines += [line.strip().split('\t')[0]] 22 | 23 | f.close() 24 | 25 | return lines 26 | 27 | 28 | def get_confusion_matrix(classes, y, y_hat): 29 | confusion_matrix = np.zeros((len(classes), len(classes))) 30 | mapping_table = {} 31 | 32 | for idx, c in enumerate(classes): 33 | mapping_table[c] = idx 34 | 35 | for y_i, y_hat_i in zip(y, y_hat): 36 | confusion_matrix[mapping_table[y_hat_i], mapping_table[y_i]] += 1 37 | 38 | print('\t'.join(c for c in classes)) 39 | for i in range(len(classes)): 40 | print('\t'.join(['%4d' % confusion_matrix[i, j] for j in range(len(classes))])) 41 | 42 | 43 | if __name__ == '__main__': 44 | ref_fn = sys.argv[1] 45 | 46 | ref_lines = read_text(ref_fn) 47 | lines = read_stdin() 48 | 49 | min_length = min(len(ref_lines), len(lines)) 50 | ref_lines = ref_lines[:min_length] 51 | lines = lines[:min_length] 52 | 53 | classes = list(set(ref_lines + lines)) 54 | 55 | get_confusion_matrix(classes, ref_lines, lines) 56 | -------------------------------------------------------------------------------- /simple_ntc/bert_dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class TextClassificationCollator(): 6 | 7 | def __init__(self, tokenizer, max_length, with_text=True): 8 | self.tokenizer = tokenizer 9 | self.max_length = max_length 10 | self.with_text = with_text 11 | 12 | def __call__(self, samples): 13 | texts, labels = [], [] 14 | for text, label in samples: 15 | texts += [text] 16 | labels 
+= [label] 17 | 18 | encoding = self.tokenizer( 19 | texts, 20 | padding=True, 21 | truncation=True, 22 | return_tensors="pt", 23 | max_length=self.max_length 24 | ) 25 | 26 | return_value = { 27 | 'input_ids': encoding['input_ids'], 28 | 'attention_mask': encoding['attention_mask'], 29 | 'labels': torch.tensor(labels, dtype=torch.long), 30 | } 31 | if self.with_text: 32 | return_value['text'] = texts 33 | 34 | return return_value 35 | 36 | 37 | class TextClassificationDataset(Dataset): 38 | 39 | def __init__(self, texts, labels): 40 | self.texts = texts 41 | self.labels = labels 42 | 43 | def __len__(self): 44 | return len(self.texts) 45 | 46 | def __getitem__(self, item): 47 | text = str(self.texts[item]) 48 | label = self.labels[item] 49 | 50 | return text, label 51 | -------------------------------------------------------------------------------- /simple_ntc/utils.py: -------------------------------------------------------------------------------- 1 | def read_text(fn): 2 | with open(fn, 'r') as f: 3 | lines = f.readlines() 4 | 5 | labels, texts = [], [] 6 | for line in lines: 7 | if line.strip() != '': 8 | # The file should have tab delimited two columns. 9 | # First column indicates label field, 10 | # and second column indicates text field. 11 | try: 12 | label, text = line.strip().split('\t') 13 | labels += [label] 14 | texts += [text] 15 | except Exception as e: 16 | print(e) 17 | print(line) 18 | continue 19 | 20 | return labels, texts 21 | 22 | 23 | def get_grad_norm(parameters, norm_type=2): 24 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 25 | 26 | total_norm = 0 27 | 28 | try: 29 | for p in parameters: 30 | total_norm += (p.grad.data**norm_type).sum() 31 | total_norm = total_norm ** (1. / norm_type) 32 | except Exception as e: 33 | print(e) 34 | 35 | return total_norm 36 | 37 | 38 | def get_parameter_norm(parameters, norm_type=2): 39 | total_norm = 0 40 | 41 | try: 42 | for p in parameters: 43 | total_norm += (p.data**norm_type).sum() 44 | total_norm = total_norm ** (1. 
/ norm_type) 45 | except Exception as e: 46 | print(e) 47 | 48 | return total_norm 49 | -------------------------------------------------------------------------------- /simple_ntc/models/rnn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class RNNClassifier(nn.Module): 5 | 6 | def __init__( 7 | self, 8 | input_size, 9 | word_vec_size, 10 | hidden_size, 11 | n_classes, 12 | n_layers=4, 13 | dropout_p=.3, 14 | ): 15 | self.input_size = input_size # vocabulary_size 16 | self.word_vec_size = word_vec_size 17 | self.hidden_size = hidden_size 18 | self.n_classes = n_classes 19 | self.n_layers = n_layers 20 | self.dropout_p = dropout_p 21 | 22 | super().__init__() 23 | 24 | self.emb = nn.Embedding(input_size, word_vec_size) 25 | self.rnn = nn.LSTM( 26 | input_size=word_vec_size, 27 | hidden_size=hidden_size, 28 | num_layers=n_layers, 29 | dropout=dropout_p, 30 | batch_first=True, 31 | bidirectional=True, 32 | ) 33 | self.generator = nn.Linear(hidden_size * 2, n_classes) 34 | # We use LogSoftmax + NLLLoss instead of Softmax + CrossEntropy 35 | self.activation = nn.LogSoftmax(dim=-1) 36 | 37 | def forward(self, x): 38 | # |x| = (batch_size, length) 39 | x = self.emb(x) 40 | # |x| = (batch_size, length, word_vec_size) 41 | x, _ = self.rnn(x) 42 | # |x| = (batch_size, length, hidden_size * 2) 43 | y = self.activation(self.generator(x[:, -1])) 44 | # |y| = (batch_size, n_classes) 45 | 46 | return y 47 | -------------------------------------------------------------------------------- /simple_ntc/data_loader.py: -------------------------------------------------------------------------------- 1 | import torchtext 2 | version = list(map(int, torchtext.__version__.split('.'))) 3 | if version[0] <= 0 and version[1] < 9: 4 | from torchtext import data 5 | else: 6 | from torchtext.legacy import data 7 | 8 | 9 | class DataLoader(object): 10 | ''' 11 | Data loader class to load text file using torchtext library. 12 | ''' 13 | 14 | def __init__( 15 | self, train_fn, 16 | batch_size=64, 17 | valid_ratio=.2, 18 | device=-1, 19 | max_vocab=999999, 20 | min_freq=1, 21 | use_eos=False, 22 | shuffle=True, 23 | ): 24 | ''' 25 | DataLoader initialization. 26 | :param train_fn: Train-set filename 27 | :param batch_size: Batchify data fot certain batch size. 28 | :param device: Device-id to load data (-1 for CPU) 29 | :param max_vocab: Maximum vocabulary size 30 | :param min_freq: Minimum frequency for loaded word. 31 | :param use_eos: If it is True, put after every end of sentence. 32 | :param shuffle: If it is True, random shuffle the input data. 33 | ''' 34 | super().__init__() 35 | 36 | # Define field of the input file. 37 | # The input file consists of two fields. 38 | self.label = data.Field( 39 | sequential=False, 40 | use_vocab=True, 41 | unk_token=None 42 | ) 43 | self.text = data.Field( 44 | use_vocab=True, 45 | batch_first=True, 46 | include_lengths=False, 47 | eos_token='' if use_eos else None, 48 | ) 49 | 50 | # Those defined two columns will be delimited by TAB. 51 | # Thus, we use TabularDataset to load two columns in the input file. 52 | # We would have two separate input file: train_fn, valid_fn 53 | # Files consist of two columns: label field and text field. 
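        # split() below divides the loaded examples by the given ratio, so the first
        # returned dataset keeps (1 - valid_ratio) of the corpus and the second keeps the rest.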
54 | train, valid = data.TabularDataset( 55 | path=train_fn, 56 | format='tsv', 57 | fields=[ 58 | ('label', self.label), 59 | ('text', self.text), 60 | ], 61 | ).split(split_ratio=(1 - valid_ratio)) 62 | 63 | # Those loaded dataset would be feeded into each iterator: 64 | # train iterator and valid iterator. 65 | # We sort input sentences by length, to group similar lengths. 66 | self.train_loader, self.valid_loader = data.BucketIterator.splits( 67 | (train, valid), 68 | batch_size=batch_size, 69 | device='cuda:%d' % device if device >= 0 else 'cpu', 70 | shuffle=shuffle, 71 | sort_key=lambda x: len(x.text), 72 | sort_within_batch=True, 73 | ) 74 | 75 | # At last, we make a vocabulary for label and text field. 76 | # It is making mapping table between words and indice. 77 | self.label.build_vocab(train) 78 | self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq) 79 | -------------------------------------------------------------------------------- /classify_plm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from transformers import BertTokenizerFast 9 | from transformers import BertForSequenceClassification, AlbertForSequenceClassification 10 | 11 | 12 | def define_argparser(): 13 | ''' 14 | Define argument parser to take inference using pre-trained model. 15 | ''' 16 | p = argparse.ArgumentParser() 17 | 18 | p.add_argument('--model_fn', required=True) 19 | p.add_argument('--gpu_id', type=int, default=-1) 20 | p.add_argument('--batch_size', type=int, default=256) 21 | p.add_argument('--top_k', type=int, default=1) 22 | 23 | config = p.parse_args() 24 | 25 | return config 26 | 27 | 28 | def read_text(): 29 | ''' 30 | Read text from standard input for inference. 31 | ''' 32 | lines = [] 33 | 34 | for line in sys.stdin: 35 | if line.strip() != '': 36 | lines += [line.strip()] 37 | 38 | return lines 39 | 40 | 41 | def main(config): 42 | saved_data = torch.load( 43 | config.model_fn, 44 | map_location='cpu' if config.gpu_id < 0 else 'cuda:%d' % config.gpu_id 45 | ) 46 | 47 | train_config = saved_data['config'] 48 | bert_best = saved_data['bert'] 49 | index_to_label = saved_data['classes'] 50 | 51 | lines = read_text() 52 | 53 | with torch.no_grad(): 54 | # Declare model and load pre-trained weights. 55 | tokenizer = BertTokenizerFast.from_pretrained(train_config.pretrained_model_name) 56 | model_loader = AlbertForSequenceClassification if train_config.use_albert else BertForSequenceClassification 57 | model = model_loader.from_pretrained( 58 | train_config.pretrained_model_name, 59 | num_labels=len(index_to_label) 60 | ) 61 | model.load_state_dict(bert_best) 62 | 63 | if config.gpu_id >= 0: 64 | model.cuda(config.gpu_id) 65 | device = next(model.parameters()).device 66 | 67 | # Don't forget turn-on evaluation mode. 
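        # eval() turns off dropout, and the surrounding torch.no_grad() disables gradient tracking,
        # so memory usage stays low during inference.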
68 | model.eval() 69 | 70 | y_hats = [] 71 | for idx in range(0, len(lines), config.batch_size): 72 | mini_batch = tokenizer( 73 | lines[idx:idx + config.batch_size], 74 | padding=True, 75 | truncation=True, 76 | return_tensors="pt", 77 | ) 78 | 79 | x = mini_batch['input_ids'] 80 | x = x.to(device) 81 | mask = mini_batch['attention_mask'] 82 | mask = mask.to(device) 83 | 84 | # Take feed-forward 85 | y_hat = F.softmax(model(x, attention_mask=mask).logits, dim=-1) 86 | 87 | y_hats += [y_hat] 88 | # Concatenate the mini-batch wise result 89 | y_hats = torch.cat(y_hats, dim=0) 90 | # |y_hats| = (len(lines), n_classes) 91 | 92 | probs, indice = y_hats.cpu().topk(config.top_k) 93 | # |indice| = (len(lines), top_k) 94 | 95 | for i in range(len(lines)): 96 | sys.stdout.write('%s\t%s\n' % ( 97 | ' '.join([index_to_label[int(indice[i][j])] for j in range(config.top_k)]), 98 | lines[i] 99 | )) 100 | 101 | 102 | if __name__ == '__main__': 103 | config = define_argparser() 104 | main(config) 105 | -------------------------------------------------------------------------------- /simple_ntc/models/cnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class CNNClassifier(nn.Module): 6 | 7 | def __init__( 8 | self, 9 | input_size, 10 | word_vec_size, 11 | n_classes, 12 | use_batch_norm=False, 13 | dropout_p=.5, 14 | window_sizes=[3, 4, 5], 15 | n_filters=[100, 100, 100], 16 | ): 17 | self.input_size = input_size # vocabulary size 18 | self.word_vec_size = word_vec_size 19 | self.n_classes = n_classes 20 | self.use_batch_norm = use_batch_norm 21 | self.dropout_p = dropout_p 22 | # window_size means that how many words a pattern covers. 23 | self.window_sizes = window_sizes 24 | # n_filters means that how many patterns to cover. 25 | self.n_filters = n_filters 26 | 27 | super().__init__() 28 | 29 | self.emb = nn.Embedding(input_size, word_vec_size) 30 | # Use nn.ModuleList to register each sub-modules. 31 | self.feature_extractors = nn.ModuleList() 32 | for window_size, n_filter in zip(window_sizes, n_filters): 33 | self.feature_extractors.append( 34 | nn.Sequential( 35 | nn.Conv2d( 36 | in_channels=1, # We only use one embedding layer. 37 | out_channels=n_filter, 38 | kernel_size=(window_size, word_vec_size), 39 | ), 40 | nn.ReLU(), 41 | nn.BatchNorm2d(n_filter) if use_batch_norm else nn.Dropout(dropout_p), 42 | ) 43 | ) 44 | 45 | # An input of generator layer is max values from each filter. 46 | self.generator = nn.Linear(sum(n_filters), n_classes) 47 | # We use LogSoftmax + NLLLoss instead of Softmax + CrossEntropy 48 | self.activation = nn.LogSoftmax(dim=-1) 49 | 50 | def forward(self, x): 51 | # |x| = (batch_size, length) 52 | x = self.emb(x) 53 | # |x| = (batch_size, length, word_vec_size) 54 | min_length = max(self.window_sizes) 55 | if min_length > x.size(1): 56 | # Because some input does not long enough for maximum length of window size, 57 | # we add zero tensor for padding. 
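            # x.new(...).zero_() builds a zero tensor on the same device and with the same dtype as x,
            # so even a very short sentence yields at least one output position for every window size.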
58 | pad = x.new(x.size(0), min_length - x.size(1), self.word_vec_size).zero_() 59 | # |pad| = (batch_size, min_length - length, word_vec_size) 60 | x = torch.cat([x, pad], dim=1) 61 | # |x| = (batch_size, min_length, word_vec_size) 62 | 63 | # In ordinary case of vision task, you may have 3 channels on tensor, 64 | # but in this case, you would have just 1 channel, 65 | # which is added by 'unsqueeze' method in below: 66 | x = x.unsqueeze(1) 67 | # |x| = (batch_size, 1, length, word_vec_size) 68 | 69 | cnn_outs = [] 70 | for block in self.feature_extractors: 71 | cnn_out = block(x) 72 | # |cnn_out| = (batch_size, n_filter, length - window_size + 1, 1) 73 | 74 | # In case of max pooling, we does not know the pooling size, 75 | # because it depends on the length of the sentence. 76 | # Therefore, we use instant function using 'nn.functional' package. 77 | # This is the beauty of PyTorch. :) 78 | cnn_out = nn.functional.max_pool1d( 79 | input=cnn_out.squeeze(-1), 80 | kernel_size=cnn_out.size(-2) 81 | ).squeeze(-1) 82 | # |cnn_out| = (batch_size, n_filter) 83 | cnn_outs += [cnn_out] 84 | # Merge output tensors from each convolution layer. 85 | cnn_outs = torch.cat(cnn_outs, dim=-1) 86 | # |cnn_outs| = (batch_size, sum(n_filters)) 87 | y = self.activation(self.generator(cnn_outs)) 88 | # |y| = (batch_size, n_classes) 89 | 90 | return y 91 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | 7 | from simple_ntc.trainer import Trainer 8 | from simple_ntc.data_loader import DataLoader 9 | 10 | from simple_ntc.models.rnn import RNNClassifier 11 | from simple_ntc.models.cnn import CNNClassifier 12 | 13 | 14 | def define_argparser(): 15 | ''' 16 | Define argument parser to set hyper-parameters. 
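    Both --rnn and --cnn can be given at the same time; each selected architecture is trained and saved for ensembling at inference time.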
17 | ''' 18 | p = argparse.ArgumentParser() 19 | 20 | p.add_argument('--model_fn', required=True) 21 | p.add_argument('--train_fn', required=True) 22 | 23 | p.add_argument('--gpu_id', type=int, default=-1) 24 | p.add_argument('--verbose', type=int, default=2) 25 | 26 | p.add_argument('--min_vocab_freq', type=int, default=5) 27 | p.add_argument('--max_vocab_size', type=int, default=999999) 28 | 29 | p.add_argument('--batch_size', type=int, default=256) 30 | p.add_argument('--n_epochs', type=int, default=10) 31 | 32 | p.add_argument('--word_vec_size', type=int, default= 256) 33 | p.add_argument('--dropout', type=float, default=.3) 34 | 35 | p.add_argument('--max_length', type=int, default=256) 36 | 37 | p.add_argument('--rnn', action='store_true') 38 | p.add_argument('--hidden_size', type=int, default=512) 39 | p.add_argument('--n_layers', type=int, default=4) 40 | 41 | p.add_argument('--cnn', action='store_true') 42 | p.add_argument('--use_batch_norm', action='store_true') 43 | p.add_argument('--window_sizes', type=int, nargs='*', default=[3, 4, 5]) 44 | p.add_argument('--n_filters', type=int, nargs='*', default=[100, 100, 100]) 45 | 46 | config = p.parse_args() 47 | 48 | return config 49 | 50 | 51 | def main(config): 52 | loaders = DataLoader( 53 | train_fn=config.train_fn, 54 | batch_size=config.batch_size, 55 | min_freq=config.min_vocab_freq, 56 | max_vocab=config.max_vocab_size, 57 | device=config.gpu_id 58 | ) 59 | 60 | print( 61 | '|train| =', len(loaders.train_loader.dataset), 62 | '|valid| =', len(loaders.valid_loader.dataset), 63 | ) 64 | 65 | vocab_size = len(loaders.text.vocab) 66 | n_classes = len(loaders.label.vocab) 67 | print('|vocab| =', vocab_size, '|classes| =', n_classes) 68 | 69 | if config.rnn is False and config.cnn is False: 70 | raise Exception('You need to specify an architecture to train. (--rnn or --cnn)') 71 | 72 | if config.rnn: 73 | # Declare model and loss. 74 | model = RNNClassifier( 75 | input_size=vocab_size, 76 | word_vec_size=config.word_vec_size, 77 | hidden_size=config.hidden_size, 78 | n_classes=n_classes, 79 | n_layers=config.n_layers, 80 | dropout_p=config.dropout, 81 | ) 82 | optimizer = optim.Adam(model.parameters()) 83 | crit = nn.NLLLoss() 84 | print(model) 85 | 86 | if config.gpu_id >= 0: 87 | model.cuda(config.gpu_id) 88 | crit.cuda(config.gpu_id) 89 | 90 | rnn_trainer = Trainer(config) 91 | rnn_model = rnn_trainer.train( 92 | model, 93 | crit, 94 | optimizer, 95 | loaders.train_loader, 96 | loaders.valid_loader 97 | ) 98 | if config.cnn: 99 | # Declare model and loss. 
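        # The CNN below follows the multi-window architecture of Kim (2014); see simple_ntc/models/cnn.py.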
100 | model = CNNClassifier( 101 | input_size=vocab_size, 102 | word_vec_size=config.word_vec_size, 103 | n_classes=n_classes, 104 | use_batch_norm=config.use_batch_norm, 105 | dropout_p=config.dropout, 106 | window_sizes=config.window_sizes, 107 | n_filters=config.n_filters, 108 | ) 109 | optimizer = optim.Adam(model.parameters()) 110 | crit = nn.NLLLoss() 111 | print(model) 112 | 113 | if config.gpu_id >= 0: 114 | model.cuda(config.gpu_id) 115 | crit.cuda(config.gpu_id) 116 | 117 | cnn_trainer = Trainer(config) 118 | cnn_model = cnn_trainer.train( 119 | model, 120 | crit, 121 | optimizer, 122 | loaders.train_loader, 123 | loaders.valid_loader 124 | ) 125 | 126 | torch.save({ 127 | 'rnn': rnn_model.state_dict() if config.rnn else None, 128 | 'cnn': cnn_model.state_dict() if config.cnn else None, 129 | 'config': config, 130 | 'vocab': loaders.text.vocab, 131 | 'classes': loaders.label.vocab, 132 | }, config.model_fn) 133 | 134 | 135 | if __name__ == '__main__': 136 | config = define_argparser() 137 | main(config) 138 | -------------------------------------------------------------------------------- /simple_ntc/bert_trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.utils as torch_utils 3 | 4 | from ignite.engine import Events 5 | 6 | from simple_ntc.utils import get_grad_norm, get_parameter_norm 7 | 8 | VERBOSE_SILENT = 0 9 | VERBOSE_EPOCH_WISE = 1 10 | VERBOSE_BATCH_WISE = 2 11 | 12 | from simple_ntc.trainer import Trainer, MyEngine 13 | 14 | 15 | class EngineForBert(MyEngine): 16 | 17 | def __init__(self, func, model, crit, optimizer, scheduler, config): 18 | self.scheduler = scheduler 19 | 20 | super().__init__(func, model, crit, optimizer, config) 21 | 22 | @staticmethod 23 | def train(engine, mini_batch): 24 | # You have to reset the gradients of all model parameters 25 | # before to take another step in gradient descent. 26 | engine.model.train() # Because we assign model as class variable, we can easily access to it. 27 | engine.optimizer.zero_grad() 28 | 29 | x, y = mini_batch['input_ids'], mini_batch['labels'] 30 | x, y = x.to(engine.device), y.to(engine.device) 31 | mask = mini_batch['attention_mask'] 32 | mask = mask.to(engine.device) 33 | 34 | x = x[:, :engine.config.max_length] 35 | 36 | # Take feed-forward 37 | y_hat = engine.model(x, attention_mask=mask).logits 38 | 39 | loss = engine.crit(y_hat, y) 40 | loss.backward() 41 | 42 | # Calculate accuracy only if 'y' is LongTensor, 43 | # which means that 'y' is one-hot representation. 44 | if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor): 45 | accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0)) 46 | else: 47 | accuracy = 0 48 | 49 | p_norm = float(get_parameter_norm(engine.model.parameters())) 50 | g_norm = float(get_grad_norm(engine.model.parameters())) 51 | 52 | # Take a step of gradient descent. 
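        # optimizer.step() updates the weights, then scheduler.step() advances the learning-rate
        # schedule by one iteration, since linear warmup/decay is applied per mini-batch rather than per epoch.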
53 |         engine.optimizer.step()
54 |         engine.scheduler.step()
55 | 
56 |         return {
57 |             'loss': float(loss),
58 |             'accuracy': float(accuracy),
59 |             '|param|': p_norm,
60 |             '|g_param|': g_norm,
61 |         }
62 | 
63 |     @staticmethod
64 |     def validate(engine, mini_batch):
65 |         engine.model.eval()
66 | 
67 |         with torch.no_grad():
68 |             x, y = mini_batch['input_ids'], mini_batch['labels']
69 |             x, y = x.to(engine.device), y.to(engine.device)
70 |             mask = mini_batch['attention_mask']
71 |             mask = mask.to(engine.device)
72 | 
73 |             x = x[:, :engine.config.max_length]
74 | 
75 |             # Take feed-forward
76 |             y_hat = engine.model(x, attention_mask=mask).logits
77 | 
78 |             loss = engine.crit(y_hat, y)
79 | 
80 |             if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor):
81 |                 accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0))
82 |             else:
83 |                 accuracy = 0
84 | 
85 |         return {
86 |             'loss': float(loss),
87 |             'accuracy': float(accuracy),
88 |         }
89 | 
90 | 
91 | class BertTrainer(Trainer):
92 | 
93 |     def __init__(self, config):
94 |         self.config = config
95 | 
96 |     def train(
97 |         self,
98 |         model, crit, optimizer, scheduler,
99 |         train_loader, valid_loader,
100 |     ):
101 |         train_engine = EngineForBert(
102 |             EngineForBert.train,
103 |             model, crit, optimizer, scheduler, self.config
104 |         )
105 |         validation_engine = EngineForBert(
106 |             EngineForBert.validate,
107 |             model, crit, optimizer, scheduler, self.config
108 |         )
109 | 
110 |         EngineForBert.attach(
111 |             train_engine,
112 |             validation_engine,
113 |             verbose=self.config.verbose
114 |         )
115 | 
116 |         def run_validation(engine, validation_engine, valid_loader):
117 |             validation_engine.run(valid_loader, max_epochs=1)
118 | 
119 |         train_engine.add_event_handler(
120 |             Events.EPOCH_COMPLETED, # event
121 |             run_validation, # function
122 |             validation_engine, valid_loader, # arguments
123 |         )
124 |         validation_engine.add_event_handler(
125 |             Events.EPOCH_COMPLETED, # event
126 |             EngineForBert.check_best, # function
127 |         )
128 | 
129 |         train_engine.run(
130 |             train_loader,
131 |             max_epochs=self.config.n_epochs,
132 |         )
133 | 
134 |         model.load_state_dict(validation_engine.best_model)
135 | 
136 |         return model
137 | 
--------------------------------------------------------------------------------
/classify.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import argparse
3 | import torchtext  # needed for the version check below
4 | import torch
5 | import torch.nn as nn
6 | version = list(map(int, torchtext.__version__.split('.')))
7 | if version[0] <= 0 and version[1] < 9:
8 |     from torchtext import data
9 | else:
10 |     from torchtext.legacy import data
11 | 
12 | from simple_ntc.models.rnn import RNNClassifier
13 | from simple_ntc.models.cnn import CNNClassifier
14 | 
15 | 
16 | def define_argparser():
17 |     '''
18 |     Define an argument parser for inference with a pre-trained model.
19 |     '''
20 |     p = argparse.ArgumentParser()
21 | 
22 |     p.add_argument('--model_fn', required=True)
23 |     p.add_argument('--gpu_id', type=int, default=-1)
24 |     p.add_argument('--batch_size', type=int, default=256)
25 |     p.add_argument('--top_k', type=int, default=1)
26 |     p.add_argument('--max_length', type=int, default=256)
27 | 
28 |     p.add_argument('--drop_rnn', action='store_true')
29 |     p.add_argument('--drop_cnn', action='store_true')
30 | 
31 |     config = p.parse_args()
32 | 
33 |     return config
34 | 
35 | 
36 | def read_text(max_length=256):
37 |     '''
38 |     Read text from standard input for inference.
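    Each line is split on spaces and truncated to max_length tokens.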
39 | ''' 40 | lines = [] 41 | 42 | for line in sys.stdin: 43 | if line.strip() != '': 44 | lines += [line.strip().split(' ')[:max_length]] 45 | 46 | return lines 47 | 48 | 49 | def define_field(): 50 | ''' 51 | To avoid use DataLoader class, just declare dummy fields. 52 | With those fields, we can retore mapping table between words and indice. 53 | ''' 54 | return ( 55 | data.Field( 56 | use_vocab=True, 57 | batch_first=True, 58 | include_lengths=False, 59 | ), 60 | data.Field( 61 | sequential=False, 62 | use_vocab=True, 63 | unk_token=None, 64 | ) 65 | ) 66 | 67 | 68 | def main(config): 69 | saved_data = torch.load( 70 | config.model_fn, 71 | map_location='cpu' if config.gpu_id < 0 else 'cuda:%d' % config.gpu_id 72 | ) 73 | 74 | train_config = saved_data['config'] 75 | rnn_best = saved_data['rnn'] 76 | cnn_best = saved_data['cnn'] 77 | vocab = saved_data['vocab'] 78 | classes = saved_data['classes'] 79 | 80 | vocab_size = len(vocab) 81 | n_classes = len(classes) 82 | 83 | text_field, label_field = define_field() 84 | text_field.vocab = vocab 85 | label_field.vocab = classes 86 | 87 | lines = read_text(max_length=config.max_length) 88 | 89 | with torch.no_grad(): 90 | ensemble = [] 91 | if rnn_best is not None and not config.drop_rnn: 92 | # Declare model and load pre-trained weights. 93 | model = RNNClassifier( 94 | input_size=vocab_size, 95 | word_vec_size=train_config.word_vec_size, 96 | hidden_size=train_config.hidden_size, 97 | n_classes=n_classes, 98 | n_layers=train_config.n_layers, 99 | dropout_p=train_config.dropout, 100 | ) 101 | model.load_state_dict(rnn_best) 102 | ensemble += [model] 103 | if cnn_best is not None and not config.drop_cnn: 104 | # Declare model and load pre-trained weights. 105 | model = CNNClassifier( 106 | input_size=vocab_size, 107 | word_vec_size=train_config.word_vec_size, 108 | n_classes=n_classes, 109 | use_batch_norm=train_config.use_batch_norm, 110 | dropout_p=train_config.dropout, 111 | window_sizes=train_config.window_sizes, 112 | n_filters=train_config.n_filters, 113 | ) 114 | model.load_state_dict(cnn_best) 115 | ensemble += [model] 116 | 117 | y_hats = [] 118 | # Get prediction with iteration on ensemble. 119 | for model in ensemble: 120 | if config.gpu_id >= 0: 121 | model.cuda(config.gpu_id) 122 | # Don't forget turn-on evaluation mode. 123 | model.eval() 124 | 125 | y_hat = [] 126 | for idx in range(0, len(lines), config.batch_size): 127 | # Converts string to list of index. 128 | x = text_field.numericalize( 129 | text_field.pad(lines[idx:idx + config.batch_size]), 130 | device='cuda:%d' % config.gpu_id if config.gpu_id >= 0 else 'cpu', 131 | ) 132 | 133 | y_hat += [model(x).cpu()] 134 | # Concatenate the mini-batch wise result 135 | y_hat = torch.cat(y_hat, dim=0) 136 | # |y_hat| = (len(lines), n_classes) 137 | 138 | y_hats += [y_hat] 139 | 140 | model.cpu() 141 | # Merge to one tensor for ensemble result and make probability from log-prob. 
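        # Both classifiers end with LogSoftmax, so exp() turns log-probabilities back into
        # probabilities before averaging over the ensemble.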
142 | y_hats = torch.stack(y_hats).exp() 143 | # |y_hats| = (len(ensemble), len(lines), n_classes) 144 | y_hats = y_hats.sum(dim=0) / len(ensemble) # Get average 145 | # |y_hats| = (len(lines), n_classes) 146 | 147 | probs, indice = y_hats.topk(config.top_k) 148 | 149 | for i in range(len(lines)): 150 | sys.stdout.write('%s\t%s\n' % ( 151 | ' '.join([classes.itos[indice[i][j]] for j in range(config.top_k)]), 152 | ' '.join(lines[i]) 153 | )) 154 | 155 | 156 | if __name__ == '__main__': 157 | config = define_argparser() 158 | main(config) 159 | -------------------------------------------------------------------------------- /finetune_plm_hftrainer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | 4 | from sklearn.metrics import accuracy_score 5 | 6 | import torch 7 | 8 | from transformers import BertTokenizerFast 9 | from transformers import BertForSequenceClassification, AlbertForSequenceClassification, RobertaForSequenceClassification 10 | from transformers import Trainer 11 | from transformers import TrainingArguments 12 | 13 | from simple_ntc.bert_dataset import TextClassificationCollator 14 | from simple_ntc.bert_dataset import TextClassificationDataset 15 | from simple_ntc.utils import read_text 16 | 17 | 18 | def define_argparser(): 19 | p = argparse.ArgumentParser() 20 | 21 | p.add_argument('--model_fn', required=True) 22 | p.add_argument('--train_fn', required=True) 23 | # Recommended model list: 24 | # - kykim/bert-kor-base 25 | # - kykim/albert-kor-base 26 | # - beomi/kcbert-base 27 | # - beomi/kcbert-large 28 | p.add_argument('--pretrained_model_name', type=str, default='beomi/kcbert-base') 29 | p.add_argument('--use_albert', action='store_true') 30 | p.add_argument('--use_roberta', action='store_true') 31 | 32 | p.add_argument('--valid_ratio', type=float, default=.2) 33 | p.add_argument('--batch_size_per_device', type=int, default=32) 34 | p.add_argument('--n_epochs', type=int, default=5) 35 | 36 | p.add_argument('--warmup_ratio', type=float, default=.2) 37 | 38 | p.add_argument('--max_length', type=int, default=100) 39 | 40 | config = p.parse_args() 41 | 42 | return config 43 | 44 | 45 | def get_datasets(fn, valid_ratio=.2): 46 | # Get list of labels and list of texts. 47 | labels, texts = read_text(fn) 48 | 49 | # Generate label to index map. 50 | unique_labels = list(set(labels)) 51 | label_to_index = {} 52 | index_to_label = {} 53 | for i, label in enumerate(unique_labels): 54 | label_to_index[label] = i 55 | index_to_label[i] = label 56 | 57 | # Convert label text to integer value. 58 | labels = list(map(label_to_index.get, labels)) 59 | 60 | # Shuffle before split into train and validation set. 61 | shuffled = list(zip(texts, labels)) 62 | random.shuffle(shuffled) 63 | texts = [e[0] for e in shuffled] 64 | labels = [e[1] for e in shuffled] 65 | idx = int(len(texts) * (1 - valid_ratio)) 66 | 67 | train_dataset = TextClassificationDataset(texts[:idx], labels[:idx]) 68 | valid_dataset = TextClassificationDataset(texts[idx:], labels[idx:]) 69 | 70 | return train_dataset, valid_dataset, index_to_label 71 | 72 | 73 | def main(config): 74 | # Get pretrained tokenizer. 75 | tokenizer = BertTokenizerFast.from_pretrained(config.pretrained_model_name) 76 | # Get datasets and index to label map. 
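    # get_datasets() shuffles the corpus and splits it into train/valid sets according to valid_ratio.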
77 | train_dataset, valid_dataset, index_to_label = get_datasets( 78 | config.train_fn, 79 | valid_ratio=config.valid_ratio 80 | ) 81 | 82 | print( 83 | '|train| =', len(train_dataset), 84 | '|valid| =', len(valid_dataset), 85 | ) 86 | 87 | total_batch_size = config.batch_size_per_device * torch.cuda.device_count() 88 | n_total_iterations = int(len(train_dataset) / total_batch_size * config.n_epochs) 89 | n_warmup_steps = int(n_total_iterations * config.warmup_ratio) 90 | print( 91 | '#total_iters =', n_total_iterations, 92 | '#warmup_iters =', n_warmup_steps, 93 | ) 94 | 95 | # Get pretrained model with specified softmax layer. 96 | assert not (config.use_albert and config.use_roberta), 'Only one of use_albert and use_roberta can be True.' 97 | if config.use_albert: 98 | model_loader = AlbertForSequenceClassification 99 | elif config.use_roberta: 100 | model_loader = RobertaForSequenceClassification 101 | else: 102 | model_loader = BertForSequenceClassification 103 | 104 | model = model_loader.from_pretrained( 105 | config.pretrained_model_name, 106 | num_labels=len(index_to_label) 107 | ) 108 | 109 | training_args = TrainingArguments( 110 | output_dir='./.checkpoints', 111 | num_train_epochs=config.n_epochs, 112 | per_device_train_batch_size=config.batch_size_per_device, 113 | per_device_eval_batch_size=config.batch_size_per_device, 114 | warmup_steps=n_warmup_steps, 115 | weight_decay=0.01, 116 | fp16=True, 117 | evaluation_strategy='epoch', 118 | save_strategy='epoch', 119 | logging_steps=n_total_iterations // 100, 120 | save_steps=n_total_iterations // config.n_epochs, 121 | load_best_model_at_end=True, 122 | ) 123 | 124 | def compute_metrics(pred): 125 | labels = pred.label_ids 126 | preds = pred.predictions.argmax(-1) 127 | 128 | return { 129 | 'accuracy': accuracy_score(labels, preds) 130 | } 131 | 132 | trainer = Trainer( 133 | model=model, 134 | args=training_args, 135 | data_collator=TextClassificationCollator(tokenizer, 136 | config.max_length, 137 | with_text=False), 138 | train_dataset=train_dataset, 139 | eval_dataset=valid_dataset, 140 | compute_metrics=compute_metrics, 141 | ) 142 | 143 | trainer.train() 144 | 145 | torch.save({ 146 | 'rnn': None, 147 | 'cnn': None, 148 | 'bert': trainer.model.state_dict(), 149 | 'config': config, 150 | 'vocab': None, 151 | 'classes': index_to_label, 152 | 'tokenizer': tokenizer, 153 | }, config.model_fn) 154 | 155 | 156 | if __name__ == '__main__': 157 | config = define_argparser() 158 | main(config) 159 | -------------------------------------------------------------------------------- /finetune_plm_native.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | from torch.utils.data import DataLoader 8 | 9 | from transformers import BertTokenizerFast 10 | from transformers import BertForSequenceClassification, AlbertForSequenceClassification 11 | from transformers import AdamW 12 | from transformers import get_linear_schedule_with_warmup 13 | 14 | import torch_optimizer as custom_optim 15 | 16 | from simple_ntc.bert_trainer import BertTrainer as Trainer 17 | from simple_ntc.bert_dataset import TextClassificationDataset, TextClassificationCollator 18 | from simple_ntc.utils import read_text 19 | 20 | 21 | def define_argparser(): 22 | p = argparse.ArgumentParser() 23 | 24 | p.add_argument('--model_fn', required=True) 25 | p.add_argument('--train_fn', required=True) 26 | # Recommended model 
list: 27 | # - kykim/bert-kor-base 28 | # - kykim/albert-kor-base 29 | # - beomi/kcbert-base 30 | # - beomi/kcbert-large 31 | p.add_argument('--pretrained_model_name', type=str, default='beomi/kcbert-base') 32 | p.add_argument('--use_albert', action='store_true') 33 | 34 | p.add_argument('--gpu_id', type=int, default=-1) 35 | p.add_argument('--verbose', type=int, default=2) 36 | 37 | p.add_argument('--batch_size', type=int, default=32) 38 | p.add_argument('--n_epochs', type=int, default=5) 39 | 40 | p.add_argument('--lr', type=float, default=5e-5) 41 | p.add_argument('--warmup_ratio', type=float, default=.2) 42 | p.add_argument('--adam_epsilon', type=float, default=1e-8) 43 | # If you want to use RAdam, I recommend to use LR=1e-4. 44 | # Also, you can set warmup_ratio=0. 45 | p.add_argument('--use_radam', action='store_true') 46 | p.add_argument('--valid_ratio', type=float, default=.2) 47 | 48 | p.add_argument('--max_length', type=int, default=100) 49 | 50 | config = p.parse_args() 51 | 52 | return config 53 | 54 | 55 | def get_loaders(fn, tokenizer, valid_ratio=.2): 56 | # Get list of labels and list of texts. 57 | labels, texts = read_text(fn) 58 | 59 | # Generate label to index map. 60 | unique_labels = list(set(labels)) 61 | label_to_index = {} 62 | index_to_label = {} 63 | for i, label in enumerate(unique_labels): 64 | label_to_index[label] = i 65 | index_to_label[i] = label 66 | 67 | # Convert label text to integer value. 68 | labels = list(map(label_to_index.get, labels)) 69 | 70 | # Shuffle before split into train and validation set. 71 | shuffled = list(zip(texts, labels)) 72 | random.shuffle(shuffled) 73 | texts = [e[0] for e in shuffled] 74 | labels = [e[1] for e in shuffled] 75 | idx = int(len(texts) * (1 - valid_ratio)) 76 | 77 | # Get dataloaders using given tokenizer as collate_fn. 78 | train_loader = DataLoader( 79 | TextClassificationDataset(texts[:idx], labels[:idx]), 80 | batch_size=config.batch_size, 81 | shuffle=True, 82 | collate_fn=TextClassificationCollator(tokenizer, config.max_length), 83 | ) 84 | valid_loader = DataLoader( 85 | TextClassificationDataset(texts[idx:], labels[idx:]), 86 | batch_size=config.batch_size, 87 | collate_fn=TextClassificationCollator(tokenizer, config.max_length), 88 | ) 89 | 90 | return train_loader, valid_loader, index_to_label 91 | 92 | 93 | def get_optimizer(model, config): 94 | if config.use_radam: 95 | optimizer = custom_optim.RAdam(model.parameters(), lr=config.lr) 96 | else: 97 | # Prepare optimizer and schedule (linear warmup and decay) 98 | no_decay = ['bias', 'LayerNorm.weight'] 99 | optimizer_grouped_parameters = [ 100 | { 101 | 'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 102 | 'weight_decay': 0.01 103 | }, 104 | { 105 | 'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 106 | 'weight_decay': 0.0 107 | } 108 | ] 109 | 110 | optimizer = optim.AdamW( 111 | optimizer_grouped_parameters, 112 | lr=config.lr, 113 | eps=config.adam_epsilon 114 | ) 115 | 116 | return optimizer 117 | 118 | 119 | def main(config): 120 | # Get pretrained tokenizer. 121 | tokenizer = BertTokenizerFast.from_pretrained(config.pretrained_model_name) 122 | # Get dataloaders using tokenizer from untokenized corpus. 
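    # Note that get_loaders() reads `config` from the module scope (it is set in __main__ below),
    # and the collator tokenizes and pads each mini-batch on the fly.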
123 |     train_loader, valid_loader, index_to_label = get_loaders(
124 |         config.train_fn,
125 |         tokenizer,
126 |         valid_ratio=config.valid_ratio
127 |     )
128 | 
129 |     print(
130 |         '|train| =', len(train_loader) * config.batch_size,
131 |         '|valid| =', len(valid_loader) * config.batch_size,
132 |     )
133 | 
134 |     n_total_iterations = len(train_loader) * config.n_epochs
135 |     n_warmup_steps = int(n_total_iterations * config.warmup_ratio)
136 |     print(
137 |         '#total_iters =', n_total_iterations,
138 |         '#warmup_iters =', n_warmup_steps,
139 |     )
140 | 
141 |     # Get pretrained model with specified softmax layer.
142 |     model_loader = AlbertForSequenceClassification if config.use_albert else BertForSequenceClassification
143 |     model = model_loader.from_pretrained(
144 |         config.pretrained_model_name,
145 |         num_labels=len(index_to_label)
146 |     )
147 |     optimizer = get_optimizer(model, config)
148 | 
149 |     # By default, the model returns hidden representations before the softmax function.
150 |     # Thus, we need to use CrossEntropyLoss, which combines LogSoftmax and NLLLoss.
151 |     crit = nn.CrossEntropyLoss()
152 |     scheduler = get_linear_schedule_with_warmup(
153 |         optimizer,
154 |         n_warmup_steps,
155 |         n_total_iterations
156 |     )
157 | 
158 |     if config.gpu_id >= 0:
159 |         model.cuda(config.gpu_id)
160 |         crit.cuda(config.gpu_id)
161 | 
162 |     # Start training.
163 |     trainer = Trainer(config)
164 |     model = trainer.train(
165 |         model,
166 |         crit,
167 |         optimizer,
168 |         scheduler,
169 |         train_loader,
170 |         valid_loader,
171 |     )
172 | 
173 |     torch.save({
174 |         'rnn': None,
175 |         'cnn': None,
176 |         'bert': model.state_dict(),
177 |         'config': config,
178 |         'vocab': None,
179 |         'classes': index_to_label,
180 |         'tokenizer': tokenizer,
181 |     }, config.model_fn)
182 | 
183 | if __name__ == '__main__':
184 |     config = define_argparser()
185 |     main(config)
186 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Simple Neural Text Classification (NTC)
2 | 
3 | This repository contains a simple, naive implementation of text classification using a recurrent neural network (LSTM) and a convolutional neural network (from [[Kim 2014](http://arxiv.org/abs/1408.5882)]). You need to specify the architecture to train, and you can select both. If you choose both architectures to classify sentences, inference will be done by an ensemble (a simple average).
4 | 
5 | In addition, this repo accompanies my [lecture](https://www.fastcampus.co.kr/data_camp_nlpbasic/) and [book](https://kh-kim.gitbook.io/natural-language-processing-with-pytorch/). Please refer to those sites for further information.
6 | 
7 | ## Pre-requisite
8 | 
9 | - Python 3.6 or higher
10 | - PyTorch 1.6 or higher
11 | - PyTorch Ignite
12 | - TorchText 0.5 or higher
13 | - [torch-optimizer 0.0.1a15](https://pypi.org/project/torch-optimizer/)
14 | - Tokenized corpus (e.g. [Moses](https://www.nltk.org/_modules/nltk/tokenize/moses.html), Mecab, [Jieba](https://github.com/fxsjy/jieba))
15 | 
16 | If you want to use BERT fine-tuning, you also need:
17 | 
18 | - Huggingface Transformers
19 | 
20 | ## Usage
21 | 
22 | ### Preparation
23 | 
24 | #### Format
25 | 
26 | The input file should have two tab-delimited columns: class and sentence. The class does not need to be a number; any word (without white-space) works. Below is an example corpus.
27 | 
28 | ```bash
29 | $ cat ./data/raw_corpus.txt | shuf | head
30 | positive 나름 괜찬항요 막 엄청 좋은건 아님 그냥 그럭저럭임... 아직 까지 인생 디퓨져는 못찾은느낌
31 | negative 재질은플라스틱부분이많고요...금방깨질거같아요..당장 물은나오게해야하기에..그냥설치했어요..지금도 조금은후회중.....
32 | positive 평소 신던 신발보다 크긴하지만 운동화라 끈 조절해서 신으려구요 신발 이쁘고 편하네요
33 | positive 두개사서 직장에 구비해두고 먹고있어요 양 많아서 오래쓸듯
34 | positive 생일선물로 샀는데 받으시는 분도 만족하시구 배송도 빨라서 좋았네요
35 | positive 아이가 너무 좋아합니다 크롱도 좋아라하지만 루피를 더..
36 | negative 배송은 기다릴수 있었는데 8개나 주문했는데 샘플을 너무 적게보내주시네요ㅡㅡ;;
37 | positive 너무귀여워요~~ㅎ아직사용은 못해? f지만 이젠 모기땜에 잠설치는일은 ? j겟죠
38 | positive 13개월 아가 제일좋은 간식이네요
39 | positive 지인추천으로 샀어요~ 싸고 가성비 좋다해서 낮기저귀로 써보려구요~
40 | ```
41 | 
42 | #### Tokenization (Optional)
43 | 
44 | You may need to tokenize the sentences in the corpus. Select a tokenizer suitable for your language (e.g. Mecab for Korean).
45 | 
46 | ```bash
47 | $ cat ./data/raw_corpus.txt | awk -F'\t' '{ print $2 }' | mecab -O wakati > ./data/tmp.txt
48 | $ cat ./data/raw_corpus.txt | awk -F'\t' '{ print $1 }' > ./data/tmp_class.txt
49 | $ paste ./data/tmp_class.txt ./data/tmp.txt > ./data/corpus.txt
50 | $ rm ./data/tmp.txt ./data/tmp_class.txt
51 | ```
52 | 
53 | #### Shuffle and Split for Train-set and Valid-set
54 | 
55 | After formatting and tokenization, you need to split the corpus into a train-set and a valid-set.
56 | 
57 | ```bash
58 | $ wc -l ./data/corpus.txt
59 | 302680 ./data/corpus.txt
60 | ```
61 | 
62 | As you can see, we have more than 300k samples in the corpus.
63 | 
64 | ```bash
65 | $ cat ./data/corpus.txt | shuf > ./data/corpus.shuf.txt
66 | $ head -n 62680 ./data/corpus.shuf.txt > ./data/corpus.valid.txt
67 | $ tail -n 240000 ./data/corpus.shuf.txt > ./data/corpus.train.txt
68 | ```
69 | 
70 | Now you have 240,000 samples for the train-set and 62,680 samples for the valid-set. Note that you can use the 'rl' command instead of 'shuf' if you are using macOS.
71 | 
72 | ### Train
73 | 
74 | Below is an example command for training. You can set your own hyper-parameter values via the command-line arguments.
75 | 
76 | ```bash
77 | python train.py --model_fn ./models/model.pth --train_fn ./data/corpus.train.txt --rnn --cnn --gpu_id 0
78 | ```
79 | 
80 | Note that you need to specify an architecture for training, and you can select both RNN and CNN for the ensemble method. You can also select the device to use for training: to use the CPU only, pass -1 for the '--gpu_id' argument, which is the default value.
81 | 
82 | ```bash
83 | $ python ./train.py --help
84 | usage: train.py [-h] --model_fn MODEL_FN --train_fn TRAIN_FN [--gpu_id GPU_ID]
85 |                 [--verbose VERBOSE] [--min_vocab_freq MIN_VOCAB_FREQ]
86 |                 [--max_vocab_size MAX_VOCAB_SIZE] [--batch_size BATCH_SIZE]
87 |                 [--n_epochs N_EPOCHS] [--word_vec_size WORD_VEC_SIZE]
88 |                 [--dropout DROPOUT] [--max_length MAX_LENGTH] [--rnn]
89 |                 [--hidden_size HIDDEN_SIZE] [--n_layers N_LAYERS] [--cnn]
90 |                 [--use_batch_norm]
91 |                 [--window_sizes [WINDOW_SIZES [WINDOW_SIZES ...]]]
92 |                 [--n_filters [N_FILTERS [N_FILTERS ...]]]
93 | ```
94 | 
95 | Or you can check the default hyper-parameters in train.py.
96 | 
97 | ### Inference
98 | 
99 | You can feed standard input to the classifier for inference, as below. Each prediction consists of two tab-delimited columns (the top-k classes and the input sentence). The result is written to standard output.
100 | 
101 | ```bash
102 | $ head ./data/review.sorted.uniq.refined.tok.shuf.test.tsv | awk -F'\t' '{ print $2 }' | python classify.py --model_fn ./models/model.pth --gpu_id -1 --top_k 1
103 | positive 생각 보다 밝 아요 ㅎㅎ
104 | negative 쓸 대 가 없 네요
105 | positive 깔 금 해요 . 가벼워 요 . 설치 가 쉬워요 . 타 사이트 에 비해 가격 도 저렴 하 답니다 .
106 | positive 크기 나 두께 가 딱 제 가 원 하 던 사이즈 네요 .
책상 의자 가 너무 딱딱 해서 쿠션 감 좋 은 방석 이 필요 하 던 차 에 좋 은 제품 만났 네요 . 냄새 얘기 하 시 는 분 도 더러 있 던데 별로 냄새 안 나 요 . 107 | positive 빠르 고 괜찬 습니다 . 108 | positive 유통 기한 도 넉넉 하 고 좋 아요 109 | positive 좋 은 가격 에 좋 은 상품 잘 쓰 겠 습니다 . 110 | negative 사이트 에서 늘 생리대 사 서 쓰 는데 오늘 처럼 이렇게 비닐 에 포장 되 어 받 아 본 건 처음 입니다 . 위생 용품 이 고 자체 도 비닐 포장 이 건만 소형 박스 에 라도 넣 어 보내 주 시 지 . .. 111 | negative 연결 부분 이 많이 티 가 납니다 . 재질 구김 도 좀 있 습니다 . 112 | positive 애기 태열 때문 에 구매 해서 잘 쓰 고 있 습니다 . 113 | ``` 114 | 115 | Also, you can see the arguments, and see the default values on classify.py. 116 | 117 | ```bash 118 | $ python classify.py -h 119 | usage: classify.py [-h] --model_fn MODEL [--gpu_id GPU_ID] 120 | [--batch_size BATCH_SIZE] [--top_k TOP_K] 121 | ``` 122 | 123 | ## Evaluation 124 | 125 | I split the corpus to make train-set and valid-set. 240,000 lines are sampled for train-set and 62,680 samples for valid-set. Architecture snapshots are like as below. You may increase the performance with hyper-parameter optimization. 126 | 127 | ```bash 128 | RNNClassifier( 129 | (emb): Embedding(35532, 128) 130 | (rnn): LSTM(128, 256, num_layers=4, batch_first=True, dropout=0.3, bidirectional=True) 131 | (generator): Linear(in_features=512, out_features=2, bias=True) 132 | (activation): LogSoftmax() 133 | ) 134 | ``` 135 | 136 | ```bash 137 | CNNClassifier( 138 | (emb): Embedding(35532, 256) 139 | (feature_extractors): ModuleList( 140 | (0): Sequential( 141 | (0): Conv2d(1, 100, kernel_size=(3, 256), stride=(1, 1)) 142 | (1): ReLU() 143 | (2): Dropout(p=0.3, inplace=False) 144 | ) 145 | (1): Sequential( 146 | (0): Conv2d(1, 100, kernel_size=(4, 256), stride=(1, 1)) 147 | (1): ReLU() 148 | (2): Dropout(p=0.3, inplace=False) 149 | ) 150 | (2): Sequential( 151 | (0): Conv2d(1, 100, kernel_size=(5, 256), stride=(1, 1)) 152 | (1): ReLU() 153 | (2): Dropout(p=0.3, inplace=False) 154 | ) 155 | ) 156 | (generator): Linear(in_features=300, out_features=2, bias=True) 157 | (activation): LogSoftmax() 158 | ) 159 | ``` 160 | 161 | |Architecture|Test Accuracy| 162 | |-|-| 163 | |Bi-LSTM|0.9035| 164 | |CNN|0.9090| 165 | |Bi-LSTM + CNN|0.9142| 166 | |KcBERT|0.9598| 167 | 168 | ## Author 169 | 170 | |Name|Kim, Ki Hyun| 171 | |-|-| 172 | |email|pointzz.ki@gmail.com| 173 | |github|https://github.com/kh-kim/| 174 | |linkedin|https://www.linkedin.com/in/ki-hyun-kim/| 175 | 176 | ## Reference 177 | 178 | - Kim, Convolutional neural networks for sentence classification, EMNLP, 2014 179 | - Devlin et al., BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, ACL, 2019 180 | - [Lee, KcBERT: Korean comments BERT, GitHub, 2020](https://github.com/Beomi/KcBERT) 181 | -------------------------------------------------------------------------------- /simple_ntc/trainer.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | 5 | import torch 6 | 7 | from ignite.engine import Engine 8 | from ignite.engine import Events 9 | from ignite.metrics import RunningAverage 10 | from ignite.contrib.handlers.tqdm_logger import ProgressBar 11 | 12 | from simple_ntc.utils import get_grad_norm, get_parameter_norm 13 | 14 | VERBOSE_SILENT = 0 15 | VERBOSE_EPOCH_WISE = 1 16 | VERBOSE_BATCH_WISE = 2 17 | 18 | 19 | class MyEngine(Engine): 20 | 21 | def __init__(self, func, model, crit, optimizer, config): 22 | # Ignite Engine does not have objects in below lines. 23 | # Thus, we assign class variables to access these object, during the procedure. 
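        # e.g. the static train/validate methods below reach them as engine.model, engine.crit, etc.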
24 | self.model = model 25 | self.crit = crit 26 | self.optimizer = optimizer 27 | self.config = config 28 | 29 | super().__init__(func) # Ignite Engine only needs function to run. 30 | 31 | self.best_loss = np.inf 32 | self.best_model = None 33 | 34 | self.device = next(model.parameters()).device 35 | 36 | @staticmethod 37 | def train(engine, mini_batch): 38 | # You have to reset the gradients of all model parameters 39 | # before to take another step in gradient descent. 40 | engine.model.train() # Because we assign model as class variable, we can easily access to it. 41 | engine.optimizer.zero_grad() 42 | 43 | x, y = mini_batch.text, mini_batch.label 44 | x, y = x.to(engine.device), y.to(engine.device) 45 | 46 | x = x[:, :engine.config.max_length] 47 | 48 | # Take feed-forward 49 | y_hat = engine.model(x) 50 | 51 | loss = engine.crit(y_hat, y) 52 | loss.backward() 53 | 54 | # Calculate accuracy only if 'y' is LongTensor, 55 | # which means that 'y' is one-hot representation. 56 | if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor): 57 | accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0)) 58 | else: 59 | accuracy = 0 60 | 61 | p_norm = float(get_parameter_norm(engine.model.parameters())) 62 | g_norm = float(get_grad_norm(engine.model.parameters())) 63 | 64 | # Take a step of gradient descent. 65 | engine.optimizer.step() 66 | 67 | return { 68 | 'loss': float(loss), 69 | 'accuracy': float(accuracy), 70 | '|param|': p_norm, 71 | '|g_param|': g_norm, 72 | } 73 | 74 | @staticmethod 75 | def validate(engine, mini_batch): 76 | engine.model.eval() 77 | 78 | with torch.no_grad(): 79 | x, y = mini_batch.text, mini_batch.label 80 | x, y = x.to(engine.device), y.to(engine.device) 81 | 82 | x = x[:, :engine.config.max_length] 83 | 84 | y_hat = engine.model(x) 85 | 86 | loss = engine.crit(y_hat, y) 87 | 88 | if isinstance(y, torch.LongTensor) or isinstance(y, torch.cuda.LongTensor): 89 | accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(y.size(0)) 90 | else: 91 | accuracy = 0 92 | 93 | return { 94 | 'loss': float(loss), 95 | 'accuracy': float(accuracy), 96 | } 97 | 98 | @staticmethod 99 | def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE): 100 | # Attaching would be repaeted for serveral metrics. 101 | # Thus, we can reduce the repeated codes by using this function. 102 | def attach_running_average(engine, metric_name): 103 | RunningAverage(output_transform=lambda x: x[metric_name]).attach( 104 | engine, 105 | metric_name, 106 | ) 107 | 108 | training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|'] 109 | 110 | for metric_name in training_metric_names: 111 | attach_running_average(train_engine, metric_name) 112 | 113 | # If the verbosity is set, progress bar would be shown for mini-batch iterations. 114 | # Without ignite, you can use tqdm to implement progress bar. 115 | if verbose >= VERBOSE_BATCH_WISE: 116 | pbar = ProgressBar(bar_format=None, ncols=120) 117 | pbar.attach(train_engine, training_metric_names) 118 | 119 | # If the verbosity is set, statistics would be shown after each epoch. 
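        # The printed values are the running averages attached above, not single-batch numbers.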
120 | if verbose >= VERBOSE_EPOCH_WISE: 121 | @train_engine.on(Events.EPOCH_COMPLETED) 122 | def print_train_logs(engine): 123 | print('Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} accuracy={:.4f}'.format( 124 | engine.state.epoch, 125 | engine.state.metrics['|param|'], 126 | engine.state.metrics['|g_param|'], 127 | engine.state.metrics['loss'], 128 | engine.state.metrics['accuracy'], 129 | )) 130 | 131 | validation_metric_names = ['loss', 'accuracy'] 132 | 133 | for metric_name in validation_metric_names: 134 | attach_running_average(validation_engine, metric_name) 135 | 136 | # Do same things for validation engine. 137 | if verbose >= VERBOSE_BATCH_WISE: 138 | pbar = ProgressBar(bar_format=None, ncols=120) 139 | pbar.attach(validation_engine, validation_metric_names) 140 | 141 | if verbose >= VERBOSE_EPOCH_WISE: 142 | @validation_engine.on(Events.EPOCH_COMPLETED) 143 | def print_valid_logs(engine): 144 | print('Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}'.format( 145 | engine.state.metrics['loss'], 146 | engine.state.metrics['accuracy'], 147 | engine.best_loss, 148 | )) 149 | 150 | @staticmethod 151 | def check_best(engine): 152 | loss = float(engine.state.metrics['loss']) 153 | if loss <= engine.best_loss: # If current epoch returns lower validation loss, 154 | engine.best_loss = loss # Update lowest validation loss. 155 | engine.best_model = deepcopy(engine.model.state_dict()) # Update best model weights. 156 | 157 | @staticmethod 158 | def save_model(engine, train_engine, config, **kwargs): 159 | torch.save( 160 | { 161 | 'model': engine.best_model, 162 | 'config': config, 163 | **kwargs 164 | }, config.model_fn 165 | ) 166 | 167 | 168 | class Trainer(): 169 | 170 | def __init__(self, config): 171 | self.config = config 172 | 173 | def train( 174 | self, 175 | model, crit, optimizer, 176 | train_loader, valid_loader, 177 | ): 178 | train_engine = MyEngine( 179 | MyEngine.train, 180 | model, crit, optimizer, self.config 181 | ) 182 | validation_engine = MyEngine( 183 | MyEngine.validate, 184 | model, crit, optimizer, self.config 185 | ) 186 | 187 | MyEngine.attach( 188 | train_engine, 189 | validation_engine, 190 | verbose=self.config.verbose 191 | ) 192 | 193 | def run_validation(engine, validation_engine, valid_loader): 194 | validation_engine.run(valid_loader, max_epochs=1) 195 | 196 | train_engine.add_event_handler( 197 | Events.EPOCH_COMPLETED, # event 198 | run_validation, # function 199 | validation_engine, valid_loader, # arguments 200 | ) 201 | validation_engine.add_event_handler( 202 | Events.EPOCH_COMPLETED, # event 203 | MyEngine.check_best, # function 204 | ) 205 | 206 | train_engine.run( 207 | train_loader, 208 | max_epochs=self.config.n_epochs, 209 | ) 210 | 211 | model.load_state_dict(validation_engine.best_model) 212 | 213 | return model 214 | --------------------------------------------------------------------------------