├── README.md
├── layers.py
└── models.py

/README.md:
--------------------------------------------------------------------------------
# hcan-pytorch
Hierarchical convolutional attention networks for text classification.

```
from models import Proto_CNN as HCAN

model = HCAN(input_dim, hidden_dim, kernel_dim,
             sent_maxlen, dropout_rate, num_emb, pretrained_weight)
```
If you don't have `pretrained_weight`, modify the class `Proto` to make `pretrained_weight` optional.

```
logits = model(x, None)
```
The second argument is the length of `x` (`l`); since it is no longer used, pass `None`.
`x` is a tensor of word indices.
--------------------------------------------------------------------------------
/layers.py:
--------------------------------------------------------------------------------
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.module import Module


class ConvolutionalMultiheadAttention(Module):
    def __init__(self, input_dim, kernel_dim, multihead_cnt, conv_cnt):
        super(ConvolutionalMultiheadAttention, self).__init__()
        self.input_dim = input_dim
        self.multihead_cnt = multihead_cnt

        self.convs = nn.ModuleList([nn.Conv1d(input_dim, input_dim, kernel_dim)
                                    for _ in range(conv_cnt)])
        for w in self.convs:
            nn.init.xavier_normal_(w.weight)

    def attention(self, q, k, v):
        # Scaled dot-product attention; q, k, v are (batch, head_dim, seq_len).
        return torch.softmax(torch.div(torch.bmm(q.permute(0, 2, 1), k),
                                       np.sqrt(self.input_dim)), 2).bmm(v.permute(0, 2, 1)).permute(0, 2, 1)

    def multihead(self, hiddens):
        # Split q, k, v into multihead_cnt chunks along the channel dimension,
        # attend per head, then concatenate the heads back together.
        hiddens = [torch.chunk(hidden, self.multihead_cnt, 1) for hidden in hiddens]
        hiddens = torch.cat([self.attention(hiddens[0][i], hiddens[1][i], hiddens[2][i])
                             for i in range(self.multihead_cnt)], 1)

        return hiddens


class ConvolutionalMultiheadSelfAttention(ConvolutionalMultiheadAttention):
    def __init__(self, input_dim, kernel_dim, multihead_cnt=10, conv_cnt=6):
        super(ConvolutionalMultiheadSelfAttention, self).__init__(
            input_dim, kernel_dim, multihead_cnt, conv_cnt)

    def forward(self, input):
        # Six convolutions: the first three (ELU) feed one attention block,
        # the last three (two ELU, one tanh) feed a second, gating block.
        hiddens = [F.elu(conv(input)) for conv in self.convs[:-1]]
        hiddens.append(torch.tanh(self.convs[-1](input)))

        elu_hid = self.multihead(hiddens[:3])
        tanh_hid = self.multihead(hiddens[3:])
        output = F.layer_norm(torch.mul(elu_hid, tanh_hid), elu_hid.size()[1:])

        return output


class ConvolutionalMultiheadTargetAttention(ConvolutionalMultiheadAttention):
    def __init__(self, input_dim, kernel_dim, multihead_cnt=10, conv_cnt=2):
        super(ConvolutionalMultiheadTargetAttention, self).__init__(
            input_dim, kernel_dim, multihead_cnt, conv_cnt)
        # Learned target (query) vector that attends over the convolved sequence.
        self.target = nn.Parameter(torch.randn(input_dim, 1))
        stdv = 1. / math.sqrt(self.target.size(1))
        self.target.data.uniform_(-stdv, stdv)

    def forward(self, input):
        batch_size = input.size(0)
        hiddens = [F.elu(conv(input)) for conv in self.convs]
        output = self.multihead([self.target.expand(batch_size, self.input_dim, 1)] + hiddens)

        return output
--------------------------------------------------------------------------------
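
A minimal shape-check sketch for the two layers above, under assumed sizes (batch 4, 100-dim embeddings divisible by the 10 default heads, sentence length 60, kernel widths 3 and 5; none of these values come from the repository). The self-attention layer preserves the channel dimension and shrinks the sequence by `kernel_dim - 1`, while the target-attention layer collapses the sequence to a single attended vector per example:

```
import torch

from layers import ConvolutionalMultiheadSelfAttention as CMSA
from layers import ConvolutionalMultiheadTargetAttention as CMTA

batch, input_dim, sent_maxlen = 4, 100, 60   # illustrative sizes; input_dim divisible by 10 heads
cmsa = CMSA(input_dim, kernel_dim=3)         # 6 convs -> gated multihead self-attention
cmta = CMTA(input_dim, kernel_dim=5)         # 2 convs -> attention against a learned target

x = torch.randn(batch, input_dim, sent_maxlen)   # (batch, channels, length)
h = cmsa(x)        # (4, 100, 58): length shrinks by kernel_dim - 1
out = cmta(h)      # (4, 100, 1): one attended vector per example
print(h.shape, out.shape)
```
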
/models.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils as utils
import torch.nn.init as init

from layers import ConvolutionalMultiheadSelfAttention as CMSA
from layers import ConvolutionalMultiheadTargetAttention as CMTA


class Proto(nn.Module):
    def __init__(self, num_emb, input_dim, pretrained_weight):
        super(Proto, self).__init__()
        self.id2vec = nn.Embedding(num_emb, input_dim, padding_idx=1)
        # rows 0-2 are reserved (unk, pad, ...); the rest hold the pretrained keyword vectors
        self.id2vec.weight.data[3:].copy_(torch.from_numpy(pretrained_weight))
        self.id2vec.weight.requires_grad = True

        # BCEWithLogitsLoss combines a Sigmoid and BCE
        # self.loss = nn.BCEWithLogitsLoss()
        self.loss = nn.CrossEntropyLoss()

    def binary_accuracy(self, y_pred, y):
        ge = torch.ge(y_pred.type(y.type()), 0.5).float()
        correct = torch.eq(ge, y).view(-1)

        return torch.sum(correct).item() / correct.shape[0]

    def accuracy(self, y_pred, y):
        _, pred = y_pred.max(1)
        return torch.sum(pred == y).item() / y.size(0)

    def predict(self, x, l):
        # Baseline: average the word embeddings over the sentence length.
        # Subclasses such as Proto_CNN override this method.
        input = self.id2vec(x)
        input = torch.div(torch.sum(input, 1), l)
        return self.model(input)

    def forward(self, data, sent_maxlen):
        # Each row of data packs [word indices (sent_maxlen) | length (1) | label (1)].
        x, l, y = torch.split(data, [sent_maxlen, 1, 1], 1)
        logits = self.predict(x, l.float())
        loss = self.loss(logits, y.squeeze())
        accuracy = self.accuracy(logits, y.squeeze())

        return loss, accuracy


class Proto_CNN(Proto):
    def __init__(self, input_dim, hidden_dim, kernel_dim,
                 sent_maxlen, dropout_rate, num_emb, pretrained_weight):
        super(Proto_CNN, self).__init__(num_emb, input_dim, pretrained_weight)

        # Learned position embeddings added to the word embeddings.
        self.positions = nn.Parameter(torch.randn(sent_maxlen, input_dim))
        stdv = 1. / self.positions.size(1) ** 0.5
        self.positions.data.uniform_(-stdv, stdv)
        self.cmsa = CMSA(input_dim, kernel_dim[0])
        self.cmta = CMTA(input_dim, kernel_dim[1])
        self.dropout = nn.Dropout(dropout_rate)
        self.cls = nn.Linear(input_dim, hidden_dim[-1])
        nn.init.xavier_normal_(self.cls.weight)

    def predict(self, x, l):
        # l is unused here; it is kept for interface compatibility with Proto.predict.
        input = self.id2vec(x)
        input = self.dropout(input + self.positions)

        hidden = self.cmsa(input.permute(0, 2, 1))
        hidden = self.cmta(hidden)

        logits = self.cls(hidden.squeeze(-1))

        return logits
--------------------------------------------------------------------------------
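
Putting the pieces together, a sketch of one training step is below, under assumed sizes (1,000-word vocabulary, 100-dim embeddings, 60-token sentences, 5 classes) and a random stand-in for `pretrained_weight`; none of these values come from the repository. It shows the input layout that `Proto.forward` expects: each row packs the padded word indices, the (now unused) sentence length, and the class label into one tensor.

```
import numpy as np
import torch

from models import Proto_CNN as HCAN

num_emb, input_dim, sent_maxlen = 1000, 100, 60
# Rows 0-2 of the embedding are reserved, so the pretrained matrix has num_emb - 3 rows.
pretrained_weight = np.random.randn(num_emb - 3, input_dim).astype(np.float32)

model = HCAN(input_dim, hidden_dim=[5], kernel_dim=[3, 5],
             sent_maxlen=sent_maxlen, dropout_rate=0.1,
             num_emb=num_emb, pretrained_weight=pretrained_weight)

batch = 4
x = torch.randint(3, num_emb, (batch, sent_maxlen))  # word indices (1 is the padding index)
l = torch.randint(1, sent_maxlen, (batch, 1))        # sentence lengths (unused by Proto_CNN)
y = torch.randint(0, 5, (batch, 1))                  # class labels for the assumed 5-way classifier
data = torch.cat([x, l, y], dim=1)                   # each row: [indices | length | label]

loss, accuracy = model(data, sent_maxlen)
loss.backward()
print(loss.item(), accuracy)
```
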