├── README.md
├── cnn
│   ├── README.md
│   ├── model.py
│   ├── parameters.py
│   ├── predict.py
│   └── train.py
├── embedding.py
├── evaluate.py
├── prepare.py
├── rnn
│   ├── README.md
│   ├── model.py
│   ├── parameters.py
│   ├── predict.py
│   └── train.py
├── utils.py
└── vdcnn
    ├── README.md
    ├── model.py
    ├── predict.py
    └── train.py
/README.md: -------------------------------------------------------------------------------- 1 | # Text Classification in PyTorch 2 | 3 | Minimal implementations of text classification models in PyTorch. 4 | 5 | - Word-level Convolutional Neural Networks (Kim 2014) 6 | - Character-level Convolutional Neural Networks (Zhang et al 2015) 7 | - Very Deep Convolutional Networks (VDCNNs; Conneau et al 2017) 8 | - Recurrent Neural Networks with attention 9 | - Hierarchical Attention Networks (HANs; Yang et al 2016) 10 | 11 | ## References 12 | 13 | Alexis Conneau, Holger Schwenk, Loïc Barrault, Yann LeCun. 2017. [Very Deep Convolutional Networks for Text Classification.](https://arxiv.org/abs/1606.01781) arXiv:1606.01781. 14 | 15 | Bjarke Felbo, Alan Mislove, Anders Søgaard, Iyad Rahwan, Sune Lehmann. 2017. [Using Millions of Emoji Occurrences to Learn Any-domain Representations for Detecting Sentiment, Emotion and Sarcasm.](https://arxiv.org/abs/1708.00524) arXiv:1708.00524. 16 | 17 | Linyuan Gong, Ruyi Ji. 2018. [What Does a TextCNN Learn?](https://arxiv.org/abs/1801.06287) arXiv:1801.06287. 18 | 19 | Yoon Kim. 2014. [Convolutional Neural Networks for Sentence Classification.](https://arxiv.org/abs/1408.5882) arXiv:1408.5882. 20 | 21 | Siwei Lai, Liheng Xu, Kang Liu, Jun Zhao. 2015. [Recurrent Convolutional Neural Networks for Text Classification.](https://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/view/9745/9552) In Proceedings of the 29th AAAI Conference on Artificial Intelligence. 22 | 23 | Hoa T. Le, Christophe Cerisara, Alexandre Denis. 2017. [Do Convolutional Networks need to be Deep for Text Classification?](https://arxiv.org/abs/1707.04108) arXiv:1707.04108. 24 | 25 | Shuming Ma, Xu Sun, Junyang Lin, Xuancheng Ren. 2018. [A Hierarchical End-to-End Model for Jointly Improving Text Summarization and Sentiment Classification.](https://arxiv.org/abs/1805.01089) In IJCAI. 26 | 27 | Hao Peng, Jianxin Li, Yu He, Yaopeng Liu, Mengjiao Bao, Lihong Wang, Yangqiu Song, Qiang Yang. 2018. [Large-Scale Hierarchical Text Classification with Recursively Regularized Deep Graph-CNN.](http://www.cse.ust.hk/~yqsong/papers/2018-WWW-Text-GraphCNN.pdf) In Proceedings of the International World Wide Web Conference Committee (IW3C2). 28 | 29 | Yunlun Yang, Yunhai Tong, Shulei Ma, Zhi-Hong Deng. 2016. [A Position Encoding Convolutional Neural Network Based on Dependency Tree for Relation Classification.](https://aclweb.org/anthology/D16-1007) In EMNLP. 30 | 31 | Zichao Yang, Diyi Yang, Chris Dyer, Xiaodong He, Alex Smola, Eduard Hovy. 2016. [Hierarchical Attention Networks for Document Classification.](http://www.aclweb.org/anthology/N16-1174) In Proceedings of NAACL-HLT 2016. 32 | 33 | Wenpeng Yin, Katharina Kann, Mo Yu, Hinrich Schütze. 2017. [Comparative Study of CNN and RNN for Natural Language Processing.](https://arxiv.org/abs/1702.01923) arXiv:1702.01923. 34 | 35 | Zenan Zhai, Dat Quoc Nguyen, Karin Verspoor. 2018. [Comparing CNN and LSTM character-level embeddings in BiLSTM-CRF models for chemical and disease named entity recognition.](https://www.aclweb.org/anthology/W18-5605) In Proceedings of the 9th International Workshop on Health Text Mining and Information Analysis (LOUHI).
36 | 37 | Ye Zhang, Byron Wallace. 2016. [A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification.](https://arxiv.org/abs/1510.03820) arXiv:1510.03820. 38 | 39 | Xiang Zhang, Junbo Zhao, Yann LeCun. 2015. [Character-level Convolutional Networks for Text Classification.](https://arxiv.org/abs/1509.01626) arXiv:1509.01626. 40 | -------------------------------------------------------------------------------- /cnn/README.md: -------------------------------------------------------------------------------- 1 | # CNNs for Text Classification in PyTorch 2 | 3 | A minimal PyTorch implementation of Convolutional Neural Networks (CNNs) for text classification. 4 | 5 | Supported features: 6 | - Mini-batch training with CUDA 7 | - Lookup, CNNs, RNNs and/or self-attentive encoding in the embedding layer 8 | 9 | ## Usage 10 | 11 | Training data should be formatted as below: 12 | ``` 13 | sentence \t label 14 | sentence \t label 15 | ... 16 | ``` 17 | 18 | To prepare data: 19 | ``` 20 | python3 prepare.py training_data 21 | ``` 22 | 23 | To train: 24 | ``` 25 | python3 train.py model char_to_idx word_to_idx tag_to_idx training_data.csv (validation_data) num_epoch 26 | ``` 27 | 28 | To predict: 29 | ``` 30 | python3 predict.py model.epochN char_to_idx word_to_idx tag_to_idx test_data 31 | ``` 32 | 33 | To evaluate: 34 | ``` 35 | python3 evaluate.py model.epochN char_to_idx word_to_idx tag_to_idx test_data 36 | ``` 37 | -------------------------------------------------------------------------------- /cnn/model.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from embedding import embed 3 | 4 | class cnn(nn.Module): 5 | def __init__(self, char_vocab_size, word_vocab_size, num_labels): 6 | super().__init__() 7 | 8 | # architecture 9 | self.embed = embed(char_vocab_size, word_vocab_size) 10 | self.conv = nn.ModuleList([nn.Conv2d( 11 | in_channels = 1, # Ci 12 | out_channels = NUM_FEATMAPS, # Co 13 | kernel_size = (i, sum(EMBED.values())) # height, width 14 | ) for i in KERNEL_SIZES]) # num_kernels (K) 15 | self.dropout = nn.Dropout(DROPOUT) 16 | self.fc = nn.Linear(len(KERNEL_SIZES) * NUM_FEATMAPS, num_labels) 17 | self.softmax = nn.LogSoftmax(1) 18 | 19 | if CUDA: 20 | self = self.cuda() 21 | 22 | def forward(self, xc, xw): 23 | x = self.embed(xc, xw) # [batch_size (B), seq_len (L), embed_size (H)] 24 | x = x.unsqueeze(1) # [B, Ci, L, H] 25 | h = [conv(x) for conv in self.conv] # [B, Co, L, 1] * K 26 | h = [F.relu(k).squeeze(3) for k in h] # [B, Co, L] * K 27 | h = [F.max_pool1d(k, k.size(2)).squeeze(2) for k in h] # [B, Co] * K 28 | h = torch.cat(h, 1) # [B, Co * K] 29 | h = self.dropout(h) 30 | h = self.fc(h) # fully connected layer 31 | y = self.softmax(h) 32 | return y 33 | --------------------------------------------------------------------------------
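
The forward pass above follows Kim (2014): each kernel of height i slides over the i-gram windows of the embedded sentence, and max-over-time pooling reduces each feature map to a single value. A quick way to sanity-check the tensor shapes is to push a dummy batch through the model. This is only an illustrative sketch: the vocabulary sizes and sequence lengths below are made up, and it assumes the defaults in cnn/parameters.py (BATCH_SIZE = 64, char-cnn + lookup embeddings, KERNEL_SIZES = [2, 3, 4]) with model.py, embedding.py, parameters.py and utils.py importable from one directory.

```python
# Smoke test for the cnn classifier with dummy data (hypothetical sizes).
import torch
from model import cnn

model = cnn(30, 1000, 5)  # char vocab, word vocab, labels: all made up
model.eval()

B, L, Lc = 64, 10, 8      # B must equal BATCH_SIZE; L >= max(KERNEL_SIZES)
device = "cuda" if torch.cuda.is_available() else "cpu"
xc = torch.randint(0, 30, (B, L, Lc), device = device)  # char indices per word
xw = torch.randint(0, 1000, (B, L), device = device)    # word indices

with torch.no_grad():
    y = model(xc, xw)     # log-probabilities over the 5 labels
print(y.shape)            # torch.Size([64, 5])
```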
= "" # end of sequence 20 | UNK = "" # unknown token 21 | 22 | PAD_IDX = 0 23 | SOS_IDX = 1 24 | EOS_IDX = 2 25 | UNK_IDX = 3 26 | 27 | CUDA = torch.cuda.is_available() 28 | torch.manual_seed(0) # for reproducibility 29 | 30 | DELIM = "\t" # delimiter 31 | KEEP_IDX = False # use the existing indices when preparing additional data 32 | NUM_DIGITS = 4 # number of digits to print 33 | -------------------------------------------------------------------------------- /cnn/predict.py: -------------------------------------------------------------------------------- 1 | from model import * 2 | from utils import * 3 | 4 | def load_model(): 5 | cti = load_tkn_to_idx(sys.argv[2]) # char_to_idx 6 | wti = load_tkn_to_idx(sys.argv[3]) # word_to_idx 7 | itt = load_idx_to_tkn(sys.argv[4]) # idx_to_tag 8 | model = cnn(len(cti), len(wti), len(itt)) 9 | print(model) 10 | load_checkpoint(sys.argv[1], model) 11 | return model, cti, wti, itt 12 | 13 | def run_model(model, itt, batch): 14 | batch_size = len(batch) # real batch size 15 | while len(batch) < BATCH_SIZE: 16 | batch.append([-1, "", [], [], ""]) 17 | batch.sort(key = lambda x: -len(x[2])) 18 | xc, xw = batchify(*zip(*[(x[2], x[3]) for x in batch]), True, True, max(KERNEL_SIZES)) 19 | result = model(xc, xw) 20 | for i in range(batch_size): 21 | y = itt[result[i].argmax()] 22 | p = result[i].max().exp().item() 23 | batch[i].append(y) 24 | batch[i].append(p) 25 | if VERBOSE: 26 | print() 27 | print(batch[i]) 28 | y = torch.exp(result[i]).tolist() 29 | for j, p in sorted(enumerate(y), key = lambda x: -x[1]): 30 | print("%s %f" % (itt[j], p)) 31 | return [(x[1], *x[4:]) for x in sorted(batch[:batch_size])] 32 | 33 | def predict(filename, model, cti, wti, itt): 34 | data = [] 35 | fo = open(filename) 36 | for idx, line in enumerate(fo): 37 | line = line.strip() 38 | line, y = line.split("\t") if line.count("\t") else [line, None] 39 | x = tokenize(line, UNIT) 40 | xc = [[cti[c] if c in cti else UNK_IDX for c in w] for w in x] 41 | xw = [wti[w] if w in wti else UNK_IDX for w in x] 42 | data.append([idx, line, xc, xw, y]) 43 | fo.close() 44 | with torch.no_grad(): 45 | model.eval() 46 | for i in range(0, len(data), BATCH_SIZE): 47 | batch = data[i:i + BATCH_SIZE] 48 | for y in run_model(model, itt, batch): 49 | yield y 50 | 51 | if __name__ == "__main__": 52 | if len(sys.argv) != 6: 53 | sys.exit("Usage: %s model char_to_idx word_to_idx tag_to_idx test_data" % sys.argv[0]) 54 | result = predict(sys.argv[5], *load_model()) 55 | for x, y0, y1, p in result: 56 | print((x, y0, y1, p) if y0 else (x, y1, p)) 57 | -------------------------------------------------------------------------------- /cnn/train.py: -------------------------------------------------------------------------------- 1 | from model import * 2 | from utils import * 3 | from evaluate import * 4 | 5 | def load_data(): 6 | bxc = [] # character sequence batch 7 | bxw = [] # word sequence batch 8 | by = [] # label batch 9 | data = [] 10 | cti = load_tkn_to_idx(sys.argv[2]) # char_to_idx 11 | wti = load_tkn_to_idx(sys.argv[3]) # word_to_idx 12 | itt = load_idx_to_tkn(sys.argv[4]) # idx_to_tkn 13 | print("loading %s..." 
/cnn/train.py: -------------------------------------------------------------------------------- 1 | from model import * 2 | from utils import * 3 | from evaluate import * 4 | 5 | def load_data(): 6 | bxc = [] # character sequence batch 7 | bxw = [] # word sequence batch 8 | by = [] # label batch 9 | data = [] 10 | cti = load_tkn_to_idx(sys.argv[2]) # char_to_idx 11 | wti = load_tkn_to_idx(sys.argv[3]) # word_to_idx 12 | itt = load_idx_to_tkn(sys.argv[4]) # idx_to_tkn 13 | print("loading %s..." % sys.argv[5]) 14 | fo = open(sys.argv[5], "r") 15 | for line in fo: 16 | line = line.strip() 17 | *x, y = [x.split(":") for x in line.split(" ")] 18 | xc, xw = zip(*[(list(map(int, xc.split("+"))), int(xw)) for xc, xw in x]) 19 | bxc.append(xc) 20 | bxw.append(xw) 21 | by.append(int(y[0])) 22 | if len(by) == BATCH_SIZE: 23 | bxc, bxw = batchify(bxc, bxw, True, True, max(KERNEL_SIZES)) 24 | data.append((bxc, bxw, LongTensor(by))) 25 | bxc = [] 26 | bxw = [] 27 | by = [] 28 | fo.close() 29 | print("data size: %d" % (len(data) * BATCH_SIZE)) 30 | print("batch size: %d" % BATCH_SIZE) 31 | return data, cti, wti, itt 32 | 33 | def train(): 34 | num_epochs = int(sys.argv[-1]) 35 | data, cti, wti, itt = load_data() 36 | model = cnn(len(cti), len(wti), len(itt)) 37 | optim = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE) 38 | print(model) 39 | epoch = load_checkpoint(sys.argv[1], model) if isfile(sys.argv[1]) else 0 40 | filename = re.sub(r"\.epoch[0-9]+$", "", sys.argv[1]) 41 | print("training model...") 42 | for ei in range(epoch + 1, epoch + num_epochs + 1): 43 | loss_sum = 0 44 | timer = time() 45 | for xc, xw, y in data: 46 | model.zero_grad() 47 | loss = F.nll_loss(model(xc, xw), y) # forward pass and compute loss 48 | loss.backward() # compute gradients 49 | optim.step() # update parameters 50 | loss_sum += loss.item() 51 | timer = time() - timer 52 | loss_sum /= len(data) 53 | if ei % SAVE_EVERY and ei != epoch + num_epochs: 54 | save_checkpoint("", None, ei, loss_sum, timer) 55 | else: 56 | save_checkpoint(filename, model, ei, loss_sum, timer) 57 | if EVAL_EVERY and (ei % EVAL_EVERY == 0 or ei == epoch + num_epochs): 58 | args = [model, cti, wti, itt] 59 | evaluate(predict(sys.argv[6], *args), True) 60 | model.train() 61 | print() 62 | 63 | if __name__ == "__main__": 64 | if len(sys.argv) not in [7, 8]: 65 | sys.exit("Usage: %s model char_to_idx word_to_idx tag_to_idx training_data (validation_data) num_epoch" % sys.argv[0]) 66 | if len(sys.argv) == 7: 67 | EVAL_EVERY = False 68 | train() 69 | -------------------------------------------------------------------------------- /embedding.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | 3 | class embed(nn.Module): 4 | def __init__(self, char_vocab_size, word_vocab_size): 5 | super().__init__() 6 | 7 | # architecture 8 | for model, dim in EMBED.items(): 9 | if model == "char-cnn": 10 | self.char_embed = self.cnn(char_vocab_size, dim) 11 | elif model == "char-rnn": 12 | self.char_embed = self.rnn(char_vocab_size, dim) 13 | if model == "lookup": 14 | self.word_embed = nn.Embedding(word_vocab_size, dim, padding_idx = PAD_IDX) 15 | elif model == "sae": 16 | self.word_embed = self.sae(word_vocab_size, dim) 17 | 18 | if CUDA: 19 | self = self.cuda() 20 | 21 | def forward(self, xc, xw): 22 | hc = self.char_embed(xc) if "char-cnn" in EMBED or "char-rnn" in EMBED else None 23 | hw = self.word_embed(xw) if "lookup" in EMBED or "sae" in EMBED else None 24 | h = torch.cat([h for h in [hc, hw] if type(h) == torch.Tensor], 2) 25 | return h 26 | 27 | class cnn(nn.Module): 28 | def __init__(self, vocab_size, embed_size): 29 | super().__init__() 30 | dim = 50 31 | num_featmaps = 50 # feature maps generated by each kernel 32 | kernel_sizes = [3] 33 | 34 | # architecture 35 | self.embed = nn.Embedding(vocab_size, dim, padding_idx = PAD_IDX) 36 | self.conv = nn.ModuleList([nn.Conv2d( 37 | in_channels = 1, # Ci 38 | out_channels = num_featmaps, # Co 39 | kernel_size = (i, dim) # height, width
40 | ) for i in kernel_sizes]) # num_kernels (K) 41 | self.dropout = nn.Dropout(DROPOUT) 42 | self.fc = nn.Linear(len(kernel_sizes) * num_featmaps, embed_size) 43 | 44 | def forward(self, x): 45 | x = x.view(-1, x.size(2)) # [batch_size (B) * word_seq_len (Lw), char_seq_len (Lc)] 46 | x = self.embed(x) # [B * Lw, Lc, dim (H)] 47 | x = x.unsqueeze(1) # [B * Lw, Ci, Lc, W] 48 | h = [conv(x) for conv in self.conv] # [B * Lw, Co, Lc, 1] * K 49 | h = [F.relu(k).squeeze(3) for k in h] # [B * Lw, Co, Lc] * K 50 | h = [F.max_pool1d(k, k.size(2)).squeeze(2) for k in h] # [B * Lw, Co] * K 51 | h = torch.cat(h, 1) # [B * Lw, Co * K] 52 | h = self.dropout(h) 53 | h = self.fc(h) # fully connected layer [B * Lw, embed_size] 54 | h = h.view(BATCH_SIZE, -1, h.size(1)) # [B, Lw, embed_size] 55 | return h 56 | 57 | class rnn(nn.Module): 58 | def __init__(self, vocab_size, embed_size): 59 | super().__init__() 60 | self.dim = embed_size 61 | self.rnn_type = "GRU" # LSTM, GRU 62 | self.num_dirs = 2 # unidirectional: 1, bidirectional: 2 63 | self.num_layers = 2 64 | 65 | # architecture 66 | self.embed = nn.Embedding(vocab_size, embed_size, padding_idx = PAD_IDX) 67 | self.rnn = getattr(nn, self.rnn_type)( 68 | input_size = self.dim, 69 | hidden_size = self.dim // self.num_dirs, 70 | num_layers = self.num_layers, 71 | bias = True, 72 | batch_first = True, 73 | dropout = DROPOUT, 74 | bidirectional = self.num_dirs == 2 75 | ) 76 | 77 | def init_state(self, b): # initialize RNN states 78 | n = self.num_layers * self.num_dirs 79 | h = self.dim // self.num_dirs 80 | hs = zeros(n, b, h) # hidden state 81 | if self.rnn_type == "LSTM": 82 | cs = zeros(n, b, h) # LSTM cell state 83 | return (hs, cs) 84 | return hs 85 | 86 | def forward(self, x): 87 | s = self.init_state(x.size(0) * x.size(1)) 88 | x = x.view(-1, x.size(2)) # [batch_size (B) * word_seq_len (Lw), char_seq_len (Lc)] 89 | x = self.embed(x) # [B * Lw, Lc, embed_size (H)] 90 | h, s = self.rnn(x, s) 91 | h = s if self.rnn_type == "GRU" else s[-1] 92 | h = torch.cat([x for x in h[-self.num_dirs:]], 1) # final hidden state [B * Lw, H] 93 | h = h.view(BATCH_SIZE, -1, h.size(1)) # [B, Lw, H] 94 | return h 95 | 96 | class sae(nn.Module): # self attentive encoder 97 | def __init__(self, vocab_size, embed_size = 512): 98 | super().__init__() 99 | dim = embed_size 100 | num_layers = 1 101 | 102 | # architecture 103 | self.embed = nn.Embedding(vocab_size, dim, padding_idx = PAD_IDX) 104 | self.pe = self.pos_encoding(dim) 105 | self.layers = nn.ModuleList([self.layer(dim) for _ in range(num_layers)]) 106 | 107 | def forward(self, x): 108 | mask = self.maskset(x) 109 | x = self.embed(x) 110 | h = x + self.pe[:x.size(1)] 111 | for layer in self.layers: 112 | h = layer(h, mask[0]) 113 | return h 114 | 115 | @staticmethod 116 | def maskset(x): # set of mask and lengths 117 | mask = x.eq(PAD_IDX) 118 | return (mask.view(BATCH_SIZE, 1, 1, -1), x.size(1) - mask.sum(1)) 119 | 120 | @staticmethod 121 | def pos_encoding(dim, maxlen = 1000): # positional encoding 122 | pe = Tensor(maxlen, dim) 123 | pos = torch.arange(0, maxlen, 1.).unsqueeze(1) 124 | k = torch.exp(-np.log(10000) * torch.arange(0, dim, 2.) 
/ dim) 125 | pe[:, 0::2] = torch.sin(pos * k) 126 | pe[:, 1::2] = torch.cos(pos * k) 127 | return pe 128 | 129 | class layer(nn.Module): # encoder layer 130 | def __init__(self, dim): 131 | super().__init__() 132 | 133 | # architecture 134 | self.attn = embed.sae.attn_mh(dim) 135 | self.ffn = embed.sae.ffn(dim) 136 | 137 | def forward(self, x, mask): 138 | z = self.attn(x, x, x, mask) 139 | z = self.ffn(z) 140 | return z 141 | 142 | class attn_mh(nn.Module): # multi-head attention 143 | def __init__(self, dim): 144 | super().__init__() 145 | self.D = dim # dimension of model 146 | self.H = 8 # number of heads 147 | self.Dk = self.D // self.H # dimension of key 148 | self.Dv = self.D // self.H # dimension of value 149 | 150 | # architecture 151 | self.Wq = nn.Linear(self.D, self.H * self.Dk) # query 152 | self.Wk = nn.Linear(self.D, self.H * self.Dk) # key for attention distribution 153 | self.Wv = nn.Linear(self.D, self.H * self.Dv) # value for context representation 154 | self.Wo = nn.Linear(self.H * self.Dv, self.D) 155 | self.dropout = nn.Dropout(DROPOUT) 156 | self.norm = nn.LayerNorm(self.D) 157 | 158 | def attn_sdp(self, q, k, v, mask): # scaled dot-product attention 159 | c = np.sqrt(self.Dk) # scale factor 160 | a = torch.matmul(q, k.transpose(2, 3)) / c # compatibility function 161 | a = a.masked_fill(mask, -10000) # masking in log space 162 | a = F.softmax(a, -1) 163 | a = torch.matmul(a, v) 164 | return a # attention weights 165 | 166 | def forward(self, q, k, v, mask): 167 | x = q # identity 168 | q = self.Wq(q).view(BATCH_SIZE, -1, self.H, self.Dk).transpose(1, 2) 169 | k = self.Wk(k).view(BATCH_SIZE, -1, self.H, self.Dk).transpose(1, 2) 170 | v = self.Wv(v).view(BATCH_SIZE, -1, self.H, self.Dv).transpose(1, 2) 171 | z = self.attn_sdp(q, k, v, mask) 172 | z = z.transpose(1, 2).contiguous().view(BATCH_SIZE, -1, self.H * self.Dv) 173 | z = self.Wo(z) 174 | z = self.norm(x + self.dropout(z)) # residual connection and dropout 175 | return z 176 | 177 | class ffn(nn.Module): # position-wise feed-forward networks 178 | def __init__(self, dim): 179 | super().__init__() 180 | dim_ffn = 2048 181 | 182 | # architecture 183 | self.layers = nn.Sequential( 184 | nn.Linear(dim, dim_ffn), 185 | nn.ReLU(), 186 | nn.Dropout(DROPOUT), 187 | nn.Linear(dim_ffn, dim) 188 | ) 189 | self.norm = nn.LayerNorm(dim) 190 | 191 | def forward(self, x): 192 | z = x + self.layers(x) # residual connection 193 | z = self.norm(z) # layer normalization 194 | return z 195 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | from predict import * 2 | 3 | def evaluate(result, summary = False): 4 | avg = defaultdict(float) # average 5 | tp = defaultdict(int) # true positives 6 | tpfn = defaultdict(int) # true positives + false negatives 7 | tpfp = defaultdict(int) # true positives + false positives 8 | for _, y0, y1, _ in result: # actual value, prediction 9 | tp[y0] += y0 == y1 10 | tpfn[y0] += 1 11 | tpfp[y1] += 1 12 | print() 13 | for y in sorted(tpfn.keys()): 14 | pr = (tp[y] / tpfp[y]) if tpfp[y] else 0 15 | rc = (tp[y] / tpfn[y]) if tpfn[y] else 0 16 | avg["macro_pr"] += pr 17 | avg["macro_rc"] += rc 18 | if not summary: 19 | print("label = %s" % y) 20 | print("precision = %f (%d/%d)" % (pr, tp[y], tpfp[y])) 21 | print("recall = %f (%d/%d)" % (rc, tp[y], tpfn[y])) 22 | print("f1 = %f\n" % f1(pr, rc)) 23 | avg["macro_pr"] /= len(tpfn) 24 | avg["macro_rc"] /= len(tpfn) 25 | 
avg["micro_f1"] = sum(tp.values()) / sum(tpfp.values()) 26 | print("macro precision = %f" % avg["macro_pr"]) 27 | print("macro recall = %f" % avg["macro_rc"]) 28 | print("macro f1 = %f" % f1(avg["macro_pr"], avg["macro_rc"])) 29 | print("micro f1 = %f" % avg["micro_f1"]) 30 | 31 | if __name__ == "__main__": 32 | if len(sys.argv) != 6: 33 | sys.exit("Usage: %s model char_to_idx word_to_idx tag_to_idx test_data" % sys.argv[0]) 34 | evaluate(predict(sys.argv[5], *load_model())) 35 | -------------------------------------------------------------------------------- /prepare.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | 3 | def load_data(): 4 | data = [] 5 | cti = {PAD: PAD_IDX, SOS: SOS_IDX, EOS: EOS_IDX, UNK: UNK_IDX} # char_to_idx 6 | wti = {PAD: PAD_IDX, SOS: SOS_IDX, EOS: EOS_IDX, UNK: UNK_IDX} # word_to_idx 7 | tti = {} # tag_to_idx 8 | fo = open(sys.argv[1]) 9 | for line in fo: 10 | x, y = line.split("\t") 11 | x = tokenize(x, UNIT) 12 | y = y.strip() 13 | for w in x: 14 | for c in w: 15 | if c not in cti: 16 | cti[c] = len(cti) 17 | if w not in wti: 18 | wti[w] = len(wti) 19 | if y not in tti: 20 | tti[y] = len(tti) 21 | x = ["+".join(str(cti[c]) for c in w) + ":%d" % wti[w] for w in x] 22 | y = [str(tti[y])] 23 | data.append(x + y) 24 | fo.close() 25 | data.sort(key = len, reverse = True) 26 | return data, cti, wti, tti 27 | 28 | if __name__ == "__main__": 29 | if len(sys.argv) != 2: 30 | sys.exit("Usage: %s training_data" % sys.argv[0]) 31 | data, cti, wti, tti = load_data() 32 | save_data(sys.argv[1] + ".csv", data) 33 | save_tkn_to_idx(sys.argv[1] + ".char_to_idx", cti) 34 | save_tkn_to_idx(sys.argv[1] + ".word_to_idx", wti) 35 | save_tkn_to_idx(sys.argv[1] + ".tag_to_idx", tti) 36 | -------------------------------------------------------------------------------- /rnn/README.md: -------------------------------------------------------------------------------- 1 | # RNNs for Text Classification in PyTorch 2 | 3 | A PyTorch implementation of Recurrent Neural Networks (RNNs) for text classification. 4 | 5 | Supported features: 6 | - Mini-batch training with CUDA 7 | - Global attention (Luong et al 2015) 8 | - Multi-head attention (Vaswani et al 2017) 9 | - Self attention (Vaswani et al 2017) 10 | - Lookup, CNNs, RNNs and/or self-attentive encoding in the embedding layer 11 | 12 | ## Usage 13 | 14 | Training data should be formatted as below: 15 | ``` 16 | sentence \t label 17 | sentence \t label 18 | ... 
19 | ``` 20 | 21 | To prepare data: 22 | ``` 23 | python3 prepare.py training_data 24 | ``` 25 | 26 | To train: 27 | ``` 28 | python3 train.py model char_to_idx word_to_idx tag_to_idx training_data.csv (validation_data) num_epoch 29 | ``` 30 | 31 | To predict: 32 | ``` 33 | python3 predict.py model.epochN char_to_idx word_to_idx tag_to_idx test_data 34 | ``` 35 | 36 | To evaluate: 37 | ``` 38 | python3 evaluate.py model.epochN char_to_idx word_to_idx tag_to_idx test_data 39 | ``` 40 | -------------------------------------------------------------------------------- /rnn/model.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from embedding import embed 3 | 4 | class rnn(nn.Module): 5 | def __init__(self, char_vocab_size, word_vocab_size, num_labels): 6 | super().__init__() 7 | 8 | # architecture 9 | self.embed = embed(char_vocab_size, word_vocab_size) 10 | self.rnn1 = getattr(nn, RNN_TYPE)( 11 | input_size = sum(EMBED.values()), 12 | hidden_size = HIDDEN_SIZE // NUM_DIRS, 13 | num_layers = NUM_LAYERS, 14 | batch_first = True, 15 | bidirectional = NUM_DIRS == 2 16 | ) 17 | self.rnn2 = getattr(nn, RNN_TYPE)( 18 | input_size = HIDDEN_SIZE, 19 | hidden_size = HIDDEN_SIZE // NUM_DIRS, 20 | num_layers = NUM_LAYERS, 21 | batch_first = True, 22 | bidirectional = NUM_DIRS == 2 23 | ) 24 | self.attn = None 25 | if ATTN == "attn": # global attention 26 | self.attn = attn(HIDDEN_SIZE) 27 | if ATTN == "attn-rc": # global attention with residual connection 28 | self.attn = attn(sum(EMBED.values()) + HIDDEN_SIZE * 2) 29 | if ATTN == "mh-attn": # multi-head attention 30 | self.attn = attn_mh() 31 | self.fc = nn.Linear(HIDDEN_SIZE, num_labels) 32 | self.softmax = nn.LogSoftmax(1) 33 | 34 | if CUDA: 35 | self = self.cuda() 36 | 37 | def init_state(self): # initialize RNN states 38 | args = (NUM_LAYERS * NUM_DIRS, BATCH_SIZE, HIDDEN_SIZE // NUM_DIRS) 39 | hs = zeros(*args) # hidden state 40 | if RNN_TYPE == "LSTM": 41 | cs = zeros(*args) # LSTM cell state 42 | return (hs, cs) 43 | return hs 44 | 45 | def forward(self, xc, xw, mask): 46 | s1 = self.init_state() 47 | s2 = self.init_state() 48 | x = self.embed(xc, xw) 49 | x = nn.utils.rnn.pack_padded_sequence(x, mask[1], batch_first = True) 50 | h1, s1 = self.rnn1(x, s1) 51 | h2, s2 = self.rnn2(h1, s2) 52 | h = s2 if RNN_TYPE == "GRU" else s2[-1] 53 | h = torch.cat([x for x in h[-NUM_DIRS:]], 1) # final hidden state (GRU) or cell state (LSTM) 54 | if self.attn: 55 | h1, _ = nn.utils.rnn.pad_packed_sequence(h1, batch_first = True) 56 | h2, _ = nn.utils.rnn.pad_packed_sequence(h2, batch_first = True) 57 | if ATTN == "attn": 58 | h = self.attn(h, h2, mask[0]) 59 | if ATTN == "attn-rc": 60 | h = self.attn(h, torch.cat((x, h1, h2), 2), mask[0]) 61 | if ATTN == "mh-attn": 62 | h = self.attn(h, h2, h2, mask[0].view(BATCH_SIZE, 1, 1, -1)) 63 | h = self.fc(h) 64 | y = self.softmax(h) 65 | return y 66 | 67 | class attn(nn.Module): # global attention 68 | def __init__(self, attn_size): 69 | super().__init__() 70 | self.Va = None # attention weights 71 | 72 | # architecture 73 | self.Wa = nn.Linear(attn_size, 1) 74 | self.Wc = nn.Linear(HIDDEN_SIZE + attn_size, HIDDEN_SIZE) 75 | self.dropout = nn.Dropout(DROPOUT) 76 | 77 | def forward(self, hc, ho, mask): 78 | a = self.Wa(ho).transpose(1, 2) 79 | a = a.masked_fill(mask.unsqueeze(1), -10000) # masking in log space 80 | a = self.Va = F.softmax(a, 2) # attention vector [B, 1, L] 81 | c = a.bmm(ho).squeeze(1) # context vector [B, H] 82 | h = self.Wc(torch.cat((hc, self.dropout(c)), 1)) 83 | return h 84 | 85 | class attn_mh(nn.Module): # multi-head attention 86 | def __init__(self): 87 | super().__init__() 88 | self.Va = None # query-key attention weights 89 | 90 | # architecture 91 | self.Wq = nn.Linear(HIDDEN_SIZE, NUM_HEADS * DK) # query 92 | self.Wk = nn.Linear(HIDDEN_SIZE, NUM_HEADS * DK) # key for attention distribution 93 | self.Wv = nn.Linear(HIDDEN_SIZE, NUM_HEADS * DV) # value for context representation 94 | self.Wo = nn.Linear(NUM_HEADS * DV, HIDDEN_SIZE) 95 | self.dropout = nn.Dropout(DROPOUT) 96 | self.norm = nn.LayerNorm(HIDDEN_SIZE) 97 | 98 | def attn_sdp(self, q, k, v, mask): # scaled dot-product attention 99 | c = np.sqrt(DK) # scale factor 100 | a = torch.matmul(q, k.transpose(2, 3)) / c # compatibility function 101 | a = a.masked_fill(mask, -10000) # masking in log space 102 | a = F.softmax(a, 3) # [B, NUM_HEADS, 1, L] 103 | self.Va = a.squeeze(2) 104 | a = torch.matmul(a, v) # [B, NUM_HEADS, 1, DV] 105 | return a # attention weights 106 | 107 | def forward(self, q, k, v, mask): 108 | x = q # identity 109 | q = self.Wq(q).view(BATCH_SIZE, -1, NUM_HEADS, DK).transpose(1, 2) 110 | k = self.Wk(k).view(BATCH_SIZE, -1, NUM_HEADS, DK).transpose(1, 2) 111 | v = self.Wv(v).view(BATCH_SIZE, -1, NUM_HEADS, DV).transpose(1, 2) 112 | z = self.attn_sdp(q, k, v, mask) 113 | z = z.transpose(1, 2).contiguous().view(BATCH_SIZE, -1, NUM_HEADS * DV) 114 | z = self.Wo(z).squeeze(1) 115 | z = self.norm(x + self.dropout(z)) # residual connection and dropout 116 | return z 117 | --------------------------------------------------------------------------------
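
As with the CNN, a dummy batch makes the shape bookkeeping above concrete. The sketch below is illustrative only: the sizes are made up, it assumes the defaults in rnn/parameters.py (BATCH_SIZE = 64, bidirectional LSTM, global attention) and maskset() from utils.py, and it assumes a CPU run (with CUDA, xc and xw must be moved to the GPU first, and recent PyTorch versions additionally want the lengths passed to pack_padded_sequence() on the CPU).

```python
# Smoke test for the rnn classifier with dummy data (hypothetical sizes).
import torch
from model import rnn
from utils import maskset

model = rnn(30, 1000, 5)  # char vocab, word vocab, labels: all made up
model.eval()

B, L, Lc = 64, 12, 8      # B must equal BATCH_SIZE
xc = torch.randint(4, 30, (B, L, Lc))  # indices >= 4 avoid PAD/SOS/EOS/UNK
xw = torch.randint(4, 1000, (B, L))    # equal lengths, so already sorted

with torch.no_grad():
    y = model(xc, xw, maskset(xw))     # maskset() = (pad mask, lengths)
print(y.shape)                         # torch.Size([64, 5])
```

Since pack_padded_sequence() expects sequences sorted by decreasing length, real batches must be sorted before batchify(), which is exactly what run_model() in rnn/predict.py does.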
/rnn/parameters.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | UNIT = "char" # unit of tokenization (char, word) 7 | RNN_TYPE = "LSTM" # LSTM, GRU 8 | NUM_DIRS = 2 # unidirectional: 1, bidirectional: 2 9 | NUM_LAYERS = 1 10 | BATCH_SIZE = 64 11 | EMBED = {"char-cnn": 50, "lookup": 250} # embeddings (char-cnn, char-rnn, lookup, sae) 12 | HIDDEN_SIZE = 500 13 | ATTN = "attn" # attention (attn: global, attn-rc: with residual connection, mh-attn: multi-head) 14 | DROPOUT = 0.5 15 | NUM_HEADS = 8 16 | DK = HIDDEN_SIZE // NUM_HEADS # dimension of key 17 | DV = HIDDEN_SIZE // NUM_HEADS # dimension of value 18 | LEARNING_RATE = 1e-4 19 | VERBOSE = False 20 | EVAL_EVERY = 10 21 | SAVE_EVERY = 10 22 | 23 | PAD = "<PAD>" # padding 24 | SOS = "<SOS>" # start of sequence 25 | EOS = "<EOS>" # end of sequence 26 | UNK = "<UNK>" # unknown token 27 | 28 | PAD_IDX = 0 29 | SOS_IDX = 1 30 | EOS_IDX = 2 31 | UNK_IDX = 3 32 | 33 | torch.manual_seed(1) 34 | CUDA = torch.cuda.is_available() 35 | 36 | DELIM = "\t" # delimiter 37 | KEEP_IDX = False # use the existing indices when preparing additional data 38 | NUM_DIGITS = 4 # number of digits to print 39 | --------------------------------------------------------------------------------
/rnn/predict.py: -------------------------------------------------------------------------------- 1 | from model import * 2 | from utils import * 3 | 4 | def load_model(): 5 | cti = load_tkn_to_idx(sys.argv[2]) # char_to_idx 6 | wti = load_tkn_to_idx(sys.argv[3]) # word_to_idx 7 | itt = load_idx_to_tkn(sys.argv[4]) # idx_to_tag 8 | itw = idx_to_tkn(wti) # idx_to_word 9 | model = rnn(len(cti), len(wti), len(itt)) 10 | print(model) 11 | load_checkpoint(sys.argv[1], model) 12 | return model, cti, wti, itt, itw 13 | 14 | def run_model(model, itw, itt, batch): 15 | batch_size = len(batch) # real batch size 16 | while len(batch) < BATCH_SIZE: 17 | batch.append([-1, "", [], [], ""]) 18 | batch.sort(key = lambda x: -len(x[3])) 19 | xc, xw = batchify(*zip(*[(x[2], x[3]) for x in batch]), True, True) 20 | result = model(xc, xw, maskset(xw)) 21 | if VERBOSE: 22 | Va = model.attn.Va.tolist() # attention weights 23 | for i in range(batch_size): 24 | y = itt[result[i].argmax()] 25 | p = round(max(result[i]).exp().item(), NUM_DIGITS) 26 | batch[i].append(y) 27 | batch[i].append(p) 28 | if VERBOSE: 29 | print(batch[i][1]) 30 | y = enumerate(result[i].exp().tolist()) 31 | for a, b in sorted(y, key = lambda x: -x[1]): 32 | print(itt[a], round(b, NUM_DIGITS)) 33 | print(heatmap(Va[i], batch[i][3], itw, sos = True, eos = True)) # attention heatmap 34 | return [(x[1], *x[4:]) for x in sorted(batch[:batch_size])] 35 | 36 | def predict(filename, model, cti, wti, itt, itw): 37 | data = [] 38 | fo = open(filename) 39 | for idx, line in enumerate(fo): 40 | line = line.strip() 41 | line, y = line.split("\t") if line.count("\t") else [line, None] 42 | x = tokenize(line, UNIT) 43 | xc = [[cti[c] if c in cti else UNK_IDX for c in w] for w in x] 44 | xw = [wti[w] if w in wti else UNK_IDX for w in x] 45 | data.append([idx, line, xc, xw, y]) 46 | fo.close() 47 | with torch.no_grad(): 48 | model.eval() 49 | for i in range(0, len(data), BATCH_SIZE): 50 | batch = data[i:i + BATCH_SIZE] 51 | for y in run_model(model, itw, itt, batch): 52 | yield y 53 | 54 | if __name__ == "__main__": 55 | if len(sys.argv) != 6: 56 | sys.exit("Usage: %s model char_to_idx word_to_idx tag_to_idx test_data" % sys.argv[0]) 57 | result = predict(sys.argv[5], *load_model()) 58 | for x, y0, y1, p in result: 59 | print((x, y0, y1, p) if y0 else (x, y1, p)) 60 | -------------------------------------------------------------------------------- /rnn/train.py: -------------------------------------------------------------------------------- 1 | from model import * 2 | from utils import * 3 | from evaluate import * 4 | 5 | def load_data(): 6 | bxc = [] # character sequence batch 7 | bxw = [] # word sequence batch 8 | by = [] # label batch 9 | data = [] 10 | cti = load_tkn_to_idx(sys.argv[2]) # char_to_idx 11 | wti = load_tkn_to_idx(sys.argv[3]) # word_to_idx 12 | itt = load_idx_to_tkn(sys.argv[4]) # idx_to_tag 13 | itw = idx_to_tkn(wti) # idx_to_word 14 | print("loading %s..." % sys.argv[5]) 15 | fo = open(sys.argv[5], "r") 16 | for line in fo: 17 | line = line.strip() 18 | *x, y = [x.split(":") for x in line.split(" ")] 19 | xc, xw = zip(*[(list(map(int, xc.split("+"))), int(xw)) for xc, xw in x]) 20 | bxc.append(xc) 21 | bxw.append(xw) 22 | by.append(int(y[0])) 23 | if len(by) == BATCH_SIZE: 24 | bxc, bxw = batchify(bxc, bxw, True, True) 25 | data.append((bxc, bxw, LongTensor(by))) 26 | bxc = [] 27 | bxw = [] 28 | by = [] 29 | fo.close() 30 | print("data size: %d" % (len(data) * BATCH_SIZE)) 31 | print("batch size: %d" % BATCH_SIZE) 32 | return data, cti, wti, itt, itw 33 | 34 | def train(): 35 | num_epochs = int(sys.argv[-1]) 36 | data, cti, wti, itt, itw = load_data() 37 | model = rnn(len(cti), len(wti), len(itt)) 38 | optim = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE) 39 | print(model) 40 | epoch = load_checkpoint(sys.argv[1], model) if isfile(sys.argv[1]) else 0 41 | filename = re.sub(r"\.epoch[0-9]+$", "", sys.argv[1]) 42 | print("training model...") 43 | for ei in range(epoch + 1, epoch + num_epochs + 1): 44 | ii = 0 45 | loss_sum = 0 46 | timer = time() 47 | for xc, xw, y in data: 48 | ii += 1 49 | model.zero_grad() 50 | mask = maskset(xw)
51 | loss = F.nll_loss(model(xc, xw, mask), y) # forward pass and compute loss 52 | loss.backward() # compute gradients 53 | optim.step() # update parameters 54 | loss_sum += loss.item() 55 | timer = time() - timer 56 | loss_sum /= len(data) 57 | if ei % SAVE_EVERY and ei != epoch + num_epochs: 58 | save_checkpoint("", None, ei, loss_sum, timer) 59 | else: 60 | save_checkpoint(filename, model, ei, loss_sum, timer) 61 | if EVAL_EVERY and (ei % EVAL_EVERY == 0 or ei == epoch + num_epochs): 62 | args = [model, cti, wti, itt, itw] 63 | evaluate(predict(sys.argv[6], *args), True) 64 | model.train() 65 | print() 66 | if __name__ == "__main__": 67 | if len(sys.argv) not in [7, 8]: 68 | sys.exit("Usage: %s model char_to_idx word_to_idx tag_to_idx training_data (validation_data) num_epoch" % sys.argv[0]) 69 | if len(sys.argv) == 7: 70 | EVAL_EVERY = False 71 | train() 72 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | from time import time 4 | from os.path import isfile 5 | from parameters import * 6 | from collections import defaultdict 7 | 8 | def normalize(x): 9 | # x = re.sub("[\uAC00-\uD7A3]+", "\uAC00", x) # convert Hangeul to 가 10 | # x = re.sub("[\u3040-\u30FF]+", "\u3042", x) # convert Hiragana and Katakana to あ 11 | # x = re.sub("[\u4E00-\u9FFF]+", "\u6F22", x) # convert CJK unified ideographs to 漢 12 | x = re.sub(r"\s+", " ", x) 13 | x = re.sub("^ | $", "", x) 14 | x = x.lower() 15 | return x 16 | 17 | def tokenize(x, unit): 18 | x = normalize(x) 19 | if unit == "char": 20 | return x 21 | if unit == "word": 22 | return x.split(" ") 23 | 24 | def save_data(filename, data): 25 | fo = open(filename, "w") 26 | for seq in data: 27 | fo.write(" ".join(seq) + "\n") 28 | fo.close() 29 | 30 | def load_tkn_to_idx(filename): 31 | print("loading %s" % filename) 32 | tkn_to_idx = {} 33 | fo = open(filename) 34 | for line in fo: 35 | line = line[:-1] 36 | tkn_to_idx[line] = len(tkn_to_idx) 37 | fo.close() 38 | return tkn_to_idx 39 | 40 | def load_idx_to_tkn(filename): 41 | print("loading %s" % filename) 42 | idx_to_tkn = [] 43 | fo = open(filename) 44 | for line in fo: 45 | line = line[:-1] 46 | idx_to_tkn.append(line) 47 | fo.close() 48 | return idx_to_tkn 49 | 50 | def save_tkn_to_idx(filename, tkn_to_idx): 51 | fo = open(filename, "w") 52 | for tkn, _ in sorted(tkn_to_idx.items(), key = lambda x: x[1]): 53 | fo.write("%s\n" % tkn) 54 | fo.close() 55 | 56 | def load_checkpoint(filename, model = None): 57 | print("loading %s" % filename) 58 | checkpoint = torch.load(filename) 59 | if model: 60 | model.load_state_dict(checkpoint["state_dict"]) 61 | epoch = checkpoint["epoch"] 62 | loss = checkpoint["loss"] 63 | print("saved model: epoch = %d, loss = %f" % (checkpoint["epoch"], checkpoint["loss"])) 64 | return epoch 65 | 66 | def save_checkpoint(filename, model, epoch, loss, time): 67 | print("epoch = %d, loss = %f, time = %f" % (epoch, loss, time)) 68 | if filename and model: 69 | print("saving %s" % filename) 70 | checkpoint = {} 71 | checkpoint["state_dict"] = model.state_dict() 72 | checkpoint["epoch"] = epoch 73 | checkpoint["loss"] = loss 74 | torch.save(checkpoint, filename + ".epoch%d" % epoch) 75 | print("saved model at epoch %d" % epoch) 76 | 77 | def cudify(f): 78 | return lambda *x: f(*x).cuda() if CUDA else f(*x) 79 | 80 | Tensor = cudify(torch.Tensor) 81 | LongTensor = cudify(torch.LongTensor) 82 | zeros = cudify(torch.zeros) 83 | 84 | def maskset(x): 85 | mask = x.eq(PAD_IDX) 86 | return (mask, x.size(1) -
mask.sum(1)) # set of mask and lengths 87 | 88 | def idx_to_tkn(tkn_to_idx): 89 | return [x for x, _ in sorted(tkn_to_idx.items(), key = lambda x: x[1])] 90 | 91 | def batchify(xc, xw, sos = False, eos = False, minlen = 0): 92 | xw_len = max(minlen, max(len(x) for x in xw)) 93 | if xc: 94 | xc_len = max(minlen, max(len(w) for x in xc for w in x)) 95 | pad = [[PAD_IDX] * (xc_len + 2)] 96 | xc = [[[SOS_IDX] + w + [EOS_IDX] + [PAD_IDX] * (xc_len - len(w)) for w in x] for x in xc] 97 | xc = [(pad if sos else []) + x + (pad * (xw_len - len(x) + eos)) for x in xc] 98 | xc = LongTensor(xc) 99 | sos = [SOS_IDX] if sos else [] 100 | eos = [EOS_IDX] if eos else [] 101 | xw = [sos + list(x) + eos + [PAD_IDX] * (xw_len - len(x)) for x in xw] 102 | return xc, LongTensor(xw) 103 | 104 | def heatmap(m, x, itw, ch = True, rh = False, sos = False, eos = False): # attention heatmap 105 | f = "%%.%df" % NUM_DIGITS 106 | m = [v[:len(x) + sos + eos] for v in m] # remove padding 107 | m = [([SOS] if sos else []) + [itw[i] for i in x] + ([EOS] if eos else [])] + m 108 | if ch: # column header 109 | csv = DELIM.join([x for x in m[0]]) + "\n" # source sequence 110 | for row in m[ch:]: 111 | if rh: # row header 112 | csv += row[0] + DELIM # target sequence 113 | csv += DELIM.join([f % x for x in row[rh:]]) + "\n" 114 | return csv 115 | 116 | def f1(p, r): 117 | return 2 * p * r / (p + r) if p + r else 0 118 | -------------------------------------------------------------------------------- /vdcnn/README.md: -------------------------------------------------------------------------------- 1 | # VDCNNs for Text Classification in PyTorch 2 | 3 | A minimal PyTorch implementation of Very Deep Convolutional Networks (VDCNNs) for text classification. 4 | 5 | ## Usage 6 | 7 | Training data should be formatted as below: 8 | ``` 9 | sentence \t label 10 | sentence \t label 11 | ... 
12 | ``` 13 | 14 | To prepare data: 15 | ``` 16 | python3 prepare.py training_data 17 | ``` 18 | 19 | To train: 20 | ``` 21 | python3 train.py model word_to_idx tag_to_idx training_data.csv num_epoch 22 | ``` 23 | 24 | To predict: 25 | ``` 26 | python3 predict.py model.epochN word_to_idx tag_to_idx test_data 27 | ``` 28 | 29 | To evaluate: 30 | ``` 31 | python3 evaluate.py model.epochN word_to_idx tag_to_idx test_data 32 | ``` 33 | -------------------------------------------------------------------------------- /vdcnn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | SEQ_LEN = 1024 # maximum length of an input sequence 6 | BATCH_SIZE = 64 7 | EMBED_SIZE = 16 8 | KERNEL_SIZE = 3 9 | STRIDE = 1 10 | PADDING = 1 # KERNEL_SIZE - 2 11 | LEARNING_RATE = 0.01 12 | WEIGHT_DECAY = 1e-4 13 | VERBOSE = False 14 | SAVE_EVERY = 10 15 | 16 | PAD = "<PAD>" # padding 17 | PAD_IDX = 0 18 | 19 | torch.manual_seed(1) 20 | CUDA = torch.cuda.is_available() 21 | 22 | class vdcnn(nn.Module): 23 | def __init__(self, vocab_size, num_labels): 24 | super().__init__() 25 | self.k = 8 # k-max pooling 26 | 27 | # architecture 28 | self.embed = nn.Embedding(vocab_size, EMBED_SIZE, padding_idx = PAD_IDX) 29 | self.conv = nn.Conv1d(EMBED_SIZE, 64, KERNEL_SIZE, STRIDE, PADDING) 30 | self.res_blocks = nn.Sequential( # residual blocks 31 | res_block(64, 64), 32 | res_block(64, 64, "vgg"), 33 | res_block(64, 128), 34 | res_block(128, 128, "vgg"), 35 | res_block(128, 256), 36 | res_block(256, 256, "vgg"), 37 | res_block(256, 512), 38 | res_block(512, 512) 39 | ) 40 | self.fc = nn.Sequential( # fully connected layers 41 | nn.Linear(512 * self.k, 2048), 42 | nn.ReLU(), 43 | nn.Linear(2048, 2048), 44 | nn.ReLU(), 45 | nn.Linear(2048, num_labels) 46 | ) 47 | self.softmax = nn.LogSoftmax(1) 48 | 49 | if CUDA: 50 | self = self.cuda() 51 | 52 | def forward(self, x): 53 | x = self.embed(x) # embedding 54 | x = x.transpose(1, 2) # [batch_size (N), num_feature_maps (D), seq_len (L)] 55 | h = self.conv(x) # temporal convolution 56 | h = self.res_blocks(h) # residual blocks 57 | h = h.topk(self.k)[0].view(BATCH_SIZE, -1) # k-max pooling 58 | h = self.fc(h) # fully connected layers 59 | y = self.softmax(h) 60 | return y 61 | 62 | class res_block(nn.Module): # residual block 63 | def __init__(self, in_channels, out_channels, downsample = None): 64 | super().__init__() 65 | first_stride = 2 if downsample == "resnet" else 1 66 | pool_stride = 2 if downsample else 1 67 | 68 | # architecture 69 | self.conv_block = conv_block(in_channels, out_channels, first_stride) 70 | self.pool = None 71 | if downsample == "kmax": # k-max pooling (Kalchbrenner et al 2014) 72 | self.pool = lambda x: x.topk(x.size(2) // 2)[0] 73 | elif downsample == "vgg": # VGG-like 74 | self.pool = nn.MaxPool1d(KERNEL_SIZE, pool_stride, PADDING) 75 | self.shortcut = nn.Conv1d(in_channels, out_channels, 1, pool_stride) 76 | 77 | def forward(self, x): 78 | y = self.conv_block(x) 79 | if self.pool: 80 | y = self.pool(y) 81 | y += self.shortcut(x) # ResNet shortcut connections 82 | return y 83 | 84 | class conv_block(nn.Module): # convolutional block 85 | def __init__(self, in_channels, out_channels, first_stride): 86 | super().__init__() 87 | 88 | # architecture 89 | self.sequential = nn.Sequential( 90 | nn.Conv1d(in_channels, out_channels, KERNEL_SIZE, first_stride, PADDING), 91 | nn.BatchNorm1d(out_channels), 92 | nn.ReLU(), 93 | nn.Conv1d(out_channels,
out_channels, KERNEL_SIZE, STRIDE, PADDING), 94 | nn.BatchNorm1d(out_channels), 95 | nn.ReLU() 96 | ) 97 | 98 | def forward(self, x): 99 | return self.sequential(x) 100 | 101 | def LongTensor(*args): 102 | x = torch.LongTensor(*args) 103 | return x.cuda() if CUDA else x 104 | 105 | def scalar(x): 106 | return x.view(-1).data.tolist()[0] 107 | 108 | def argmax(x): 109 | return scalar(torch.max(x, 0)[1]) # for 1D tensor 110 | -------------------------------------------------------------------------------- /vdcnn/predict.py: -------------------------------------------------------------------------------- 1 | from model import * 2 | from utils import * 3 | 4 | def load_model(): 5 | word_to_idx = load_word_to_idx(sys.argv[2]) 6 | tag_to_idx = load_tag_to_idx(sys.argv[3]) 7 | idx_to_tag = [tag for tag, _ in sorted(tag_to_idx.items(), key = lambda x: x[1])] 8 | model = vdcnn(len(word_to_idx), len(tag_to_idx)) 9 | model.eval() 10 | print(model) 11 | load_checkpoint(sys.argv[1], model) 12 | return model, word_to_idx, tag_to_idx, idx_to_tag 13 | 14 | def run_model(model, idx_to_tag, data): 15 | pred = [] 16 | z = len(data) 17 | while len(data) < BATCH_SIZE: 18 | data.append(["", []]) 19 | data.sort(key = lambda x: -len(x[1])) 20 | batch = [x + [PAD_IDX] * (SEQ_LEN - len(x)) for _, x in data] 21 | result = model(LongTensor(batch)) 22 | for i in range(z): 23 | m = argmax(result[i]) 24 | y = idx_to_tag[m] 25 | data[i].append(y) 26 | return data[:z] 27 | 28 | def predict(): 29 | data = [] 30 | model, word_to_idx, tag_to_idx, idx_to_tag = load_model() 31 | fo = open(sys.argv[4]) 32 | for line in fo: 33 | line = line.strip() 34 | x = tokenize(line, "char")[:SEQ_LEN] 35 | x = [word_to_idx[i] for i in x if i in word_to_idx] 36 | data.append([line, x]) 37 | if len(data) == BATCH_SIZE: 38 | result = run_model(model, idx_to_tag, data) 39 | for x in result: 40 | print(x) 41 | data = [] 42 | fo.close() 43 | if len(data): 44 | result = run_model(model, idx_to_tag, data) 45 | for x in result: 46 | print(x) 47 | 48 | if __name__ == "__main__": 49 | if len(sys.argv) != 5: 50 | sys.exit("Usage: %s model word_to_idx tag_to_idx test_data" % sys.argv[0]) 51 | print("cuda: %s" % CUDA) 52 | predict() 53 | -------------------------------------------------------------------------------- /vdcnn/train.py: -------------------------------------------------------------------------------- 1 | from model import * 2 | from utils import * 3 | 4 | def load_data(): 5 | data = [] 6 | batch_x = [] 7 | batch_y = [] 8 | print("loading data...") 9 | word_to_idx = load_word_to_idx(sys.argv[2]) 10 | tag_to_idx = load_tag_to_idx(sys.argv[3]) 11 | fo = open(sys.argv[4], "r") 12 | for line in fo: 13 | line = line.strip() 14 | seq = [int(i) for i in line.split(" ")][:SEQ_LEN] 15 | label = seq.pop() 16 | pad = [PAD_IDX] * (SEQ_LEN - len(seq)) 17 | batch_x.append(seq + pad) 18 | batch_y.append(label) 19 | if len(batch_x) == BATCH_SIZE: 20 | data.append((LongTensor(batch_x), LongTensor(batch_y))) # append a mini-batch 21 | batch_x = [] 22 | batch_y = [] 23 | fo.close() 24 | print("data size: %d" % (len(data) * BATCH_SIZE)) 25 | print("batch size: %d" % BATCH_SIZE) 26 | return data, word_to_idx, tag_to_idx 27 | 28 | def train(): 29 | num_epochs = int(sys.argv[5]) 30 | data, word_to_idx, tag_to_idx = load_data() 31 | model = vdcnn(len(word_to_idx), len(tag_to_idx)) 32 | print(model) 33 | optim = torch.optim.SGD(model.parameters(), lr = LEARNING_RATE, weight_decay = WEIGHT_DECAY) 34 | epoch = load_checkpoint(sys.argv[1], model) if isfile(sys.argv[1]) 
else 0 35 | filename = re.sub(r"\.epoch[0-9]+$", "", sys.argv[1]) 36 | print("training model...") 37 | for ei in range(epoch + 1, epoch + num_epochs + 1): 38 | loss_sum = 0 39 | timer = time.time() 40 | for x, y in data: 41 | model.zero_grad() 42 | loss = F.nll_loss(model(x), y) # forward pass and compute loss 43 | loss.backward() # compute gradients 44 | optim.step() # update parameters 45 | loss = scalar(loss) 46 | loss_sum += loss 47 | timer = time.time() - timer 48 | loss_sum /= len(data) 49 | if ei % SAVE_EVERY and ei != epoch + num_epochs: 50 | save_checkpoint("", None, ei, loss_sum, timer) 51 | else: 52 | save_checkpoint(filename, model, ei, loss_sum, timer) 53 | 54 | if __name__ == "__main__": 55 | if len(sys.argv) != 6: 56 | sys.exit("Usage: %s model word_to_idx tag_to_idx training_data num_epoch" % sys.argv[0]) 57 | print("cuda: %s" % CUDA) 58 | train() 59 | --------------------------------------------------------------------------------
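
A closing note on the k-max pooling used in vdcnn.forward() and in the "kmax" branch of res_block: it is implemented with Tensor.topk, which keeps the k largest activations per feature map but returns them sorted by value, not in their original sequence positions (the order-preserving variant of Kalchbrenner et al 2014 would need the returned indices sorted back). A toy example:

```python
# What h.topk(k)[0] does in vdcnn/model.py, on a tiny made-up tensor.
import torch

h = torch.tensor([[[3., 1., 4., 1., 5., 9., 2., 6.]]])  # [N, D, L] = [1, 1, 8]
print(h.topk(3)[0])  # tensor([[[9., 6., 5.]]]): 3 largest values per channel
```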