├── sequence_module
│   ├── __init__.py
│   ├── encoder.py
│   ├── attention.py
│   ├── decoder.py
│   └── evaluation.py
├── .gitignore
├── README.md
└── models
    ├── SiaGRU.py
    ├── ESIM.py
    ├── ABCNN2.py
    ├── BiMPM.py
    └── ABCNN.py

/sequence_module/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.csv
*.pickle
.idea
__pycache__
.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Text Similarity Methods Implemented in PyTorch

## Method list

1. ESIM (Enhanced LSTM for Natural Language Inference)
2. SiaGRU (Siamese Recurrent Architectures for Learning Sentence Similarity)
3. ABCNN (ABCNN: Attention-Based Convolutional Neural Network for Modeling Sentence Pairs)
4. BiMPM (Bilateral Multi-Perspective Matching for Natural Language Sentences)
--------------------------------------------------------------------------------
/models/SiaGRU.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn


class SiaGRU(nn.Module):
    def __init__(self, args):
        super(SiaGRU, self).__init__()
        self.args = args
        self.embeds_dim = args.embeds_dim
        num_word = 20000
        self.embeds = nn.Embedding(num_word, self.embeds_dim)
        self.ln_embeds = nn.LayerNorm(self.embeds_dim)
        self.hidden_size = args.hidden_size
        self.num_layer = args.num_layer
        self.gru = nn.GRU(self.embeds_dim, self.hidden_size, batch_first=True,
                          bidirectional=True, num_layers=self.num_layer)
        self.h0 = self.init_hidden((2 * self.num_layer, 1, self.hidden_size))
        # classification head called in forward (assumed to be a simple Linear(1, 2))
        self.fc = nn.Linear(1, 2)

    def init_hidden(self, size):
        h0 = nn.Parameter(torch.randn(size))
        nn.init.xavier_normal_(h0)
        return h0

    def forward_once(self, x):
        # expand the learned initial state to the current batch size
        h0 = self.h0.expand(-1, x.size(0), -1).contiguous()
        # hidden: (2 * num_layer) * batch_size * hidden_size
        output, hidden = self.gru(x, h0)
        # concat the last layer's forward and backward states
        return torch.cat([hidden[-2], hidden[-1]], dim=-1)

    def forward(self, *input):
        # sent1, sent2: batch_size * seq_len
        sent1 = input[0]
        sent2 = input[1]

        # embeds: batch_size * seq_len => batch_size * seq_len * dim
        # LayerNorm normalizes the last dimension, so no transposing is needed
        x1 = self.ln_embeds(self.embeds(sent1))
        x2 = self.ln_embeds(self.embeds(sent2))

        encoding1 = self.forward_once(x1)
        encoding2 = self.forward_once(x2)

        # similarity in (0, 1]: exp(-||h1 - h2||_2)
        sim = torch.exp(-torch.norm(encoding1 - encoding2, p=2, dim=-1, keepdim=True))
        return self.fc(sim)
--------------------------------------------------------------------------------
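A minimal smoke test for SiaGRU (not part of the repository; the args namespace and the sizes below are illustrative assumptions about the fields the constructor reads):

from types import SimpleNamespace
import torch
from models.SiaGRU import SiaGRU  # assuming models/ is on the path

args = SimpleNamespace(embeds_dim=128, hidden_size=64, num_layer=2)  # assumed fields
model = SiaGRU(args)

# two padded batches of token ids (0 would be padding), batch_size=4, seq_len=10
sent1 = torch.randint(1, 20000, (4, 10))
sent2 = torch.randint(1, 20000, (4, 10))
logits = model(sent1, sent2)  # batch_size * 2
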
/sequence_module/encoder.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, n_layers=1, bidirec=False, dropout_p=0.5, use_cuda=False):
        super(Encoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.use_cuda = use_cuda

        if bidirec:
            self.n_direction = 2
            self.gru = nn.GRU(embedding_size, hidden_size, n_layers, batch_first=True, bidirectional=True)
        else:
            self.n_direction = 1
            self.gru = nn.GRU(embedding_size, hidden_size, n_layers, batch_first=True)

    def init_hidden(self, inputs):
        hidden = Variable(torch.zeros(self.n_layers * self.n_direction, inputs.size(0), self.hidden_size))
        return hidden.cuda() if self.use_cuda else hidden

    def init_weight(self):
        # the xavier initializers operate in place
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_normal_(self.gru.weight_hh_l0)
        nn.init.xavier_normal_(self.gru.weight_ih_l0)

    def forward(self, inputs, input_lengths):
        """
        inputs : B,T (LongTensor)
        input_lengths : real lengths of input batch (list)
        """
        hidden = self.init_hidden(inputs)

        embedded = self.embedding(inputs)
        embedded = self.dropout(embedded)
        packed = pack_padded_sequence(embedded, input_lengths, batch_first=True)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = pad_packed_sequence(outputs, batch_first=True)  # unpack (back to padded)

        # keep only the last layer's hidden state(s)
        if self.n_layers > 1:
            if self.n_direction == 2:
                hidden = hidden[-2:]
            else:
                hidden = hidden[-1:]

        # B,T,D / B,1,D
        return outputs, torch.cat([h for h in hidden], 1).unsqueeze(1)
--------------------------------------------------------------------------------
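A hypothetical usage sketch for the Encoder; note that pack_padded_sequence expects the batch sorted by length in descending order:

import torch
from sequence_module.encoder import Encoder

encoder = Encoder(input_size=1000, embedding_size=64, hidden_size=32, bidirec=True)
encoder.init_weight()

inputs = torch.tensor([[4, 8, 15, 16, 23],
                       [42, 7, 9, 0, 0]])  # 0 = padding, rows sorted by length
lengths = [5, 3]
outputs, last_hidden = encoder(inputs, lengths)
# outputs: 2 x 5 x 64 (hidden_size * n_direction), last_hidden: 2 x 1 x 64
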
/sequence_module/attention.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F


class Attention(nn.Module):
    def __init__(self, hidden_size, method='general'):
        super(Attention, self).__init__()

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs, encoder_lengths=None, return_weight=False):
        """
        hidden : query (previous hidden) B,1,D
        encoder_outputs : context (encoder outputs) B,T,D
        encoder_lengths : list[int]
        """
        q, c = hidden, encoder_outputs

        batch_size_q, n_q, dim_q = q.size()
        batch_size_c, n_c, dim_c = c.size()

        if batch_size_q != batch_size_c:
            msg = 'batch size mismatch (query: {}, context: {})'
            raise ValueError(msg.format(q.size(), c.size()))

        batch_size = batch_size_q

        s = self.score(q, c)

        # mask the padded positions of the encoder outputs
        if encoder_lengths is not None:
            mask = s.data.new(batch_size, n_q, n_c)
            mask = self.fill_context_mask(mask, sizes=encoder_lengths, v_mask=float('-inf'), v_unmask=0)
            s = Variable(mask) + s

        # normalize the scores with softmax
        s_flat = s.view(batch_size * n_q, n_c)
        w_flat = F.softmax(s_flat, 1)
        w = w_flat.view(batch_size, n_q, n_c)

        # combine: weighted sum of the context
        z = w.bmm(c)
        if return_weight:
            return w, z
        return z

    def score(self, q, c):
        """
        q: B,1,D
        c: B,T,D
        """
        if self.method == 'dot':
            return q.bmm(c.transpose(1, 2))  # B,1,D * B,D,T => B,1,T

        elif self.method == 'general':
            energy = self.attn(c)  # B,T,D => B,T,D
            return q.bmm(energy.transpose(1, 2))  # B,1,D * B,D,T => B,1,T

        elif self.method == 'concat':
            q = q.repeat(1, c.size(1), 1)  # B,T,D
            energy = self.attn(torch.cat([q, c], 2))  # B,T,2D => B,T,D
            v = self.v.repeat(c.size(0), 1).unsqueeze(1)  # repeat over the batch: B,1,D
            return v.bmm(energy.transpose(1, 2))  # B,1,D * B,D,T => B,1,T

    def fill_context_mask(self, mask, sizes, v_mask, v_unmask):
        """Fill attention mask inplace for a variable length context.
        Args
        ----
        mask: Tensor of size (B, N, T)
            Tensor to fill with mask values.
        sizes: list[int]
            List giving the size of the context for each item in
            the batch. Positions beyond each size will be masked.
        v_mask: float
            Value to use for masked positions.
        v_unmask: float
            Value to use for unmasked positions.
        Returns
        -------
        mask:
            Filled with values in {v_mask, v_unmask}
        """
        mask.fill_(v_unmask)
        n_context = mask.size(2)
        for i, size in enumerate(sizes):
            if size < n_context:
                mask[i, :, size:] = v_mask
        return mask
--------------------------------------------------------------------------------
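A hypothetical call showing the masking path; positions beyond each encoder length receive -inf scores and therefore near-zero attention weight:

import torch
from sequence_module.attention import Attention

attn = Attention(hidden_size=32, method='general')
query = torch.randn(2, 1, 32)    # B,1,D (e.g. the previous decoder hidden state)
context = torch.randn(2, 7, 32)  # B,T,D (encoder outputs)
weights, summary = attn(query, context, encoder_lengths=[7, 4], return_weight=True)
# weights: 2 x 1 x 7, summary: 2 x 1 x 32
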
/models/ESIM.py:
--------------------------------------------------------------------------------
from torch import nn
import torch
import torch.nn.functional as F


class ESIM(nn.Module):
    def __init__(self, args):
        super(ESIM, self).__init__()
        self.args = args
        self.dropout = 0.5
        self.hidden_size = args.hidden_size
        self.embeds_dim = args.embeds_dim
        num_word = 20000
        self.embeds = nn.Embedding(num_word, self.embeds_dim)
        self.bn_embeds = nn.BatchNorm1d(self.embeds_dim)
        self.lstm1 = nn.LSTM(self.embeds_dim, self.hidden_size, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(self.hidden_size * 8, self.hidden_size, batch_first=True, bidirectional=True)

        self.fc = nn.Sequential(
            nn.BatchNorm1d(self.hidden_size * 8),
            nn.Linear(self.hidden_size * 8, args.linear_size),
            nn.ELU(inplace=True),
            nn.BatchNorm1d(args.linear_size),
            nn.Dropout(self.dropout),
            nn.Linear(args.linear_size, args.linear_size),
            nn.ELU(inplace=True),
            nn.BatchNorm1d(args.linear_size),
            nn.Dropout(self.dropout),
            nn.Linear(args.linear_size, 2),
            nn.Softmax(dim=-1)
        )

    def soft_attention_align(self, x1, x2, mask1, mask2):
        '''
        x1: batch_size * seq_len * dim
        x2: batch_size * seq_len * dim
        '''
        # attention: batch_size * seq_len * seq_len
        attention = torch.matmul(x1, x2.transpose(1, 2))
        mask1 = mask1.float().masked_fill_(mask1, float('-inf'))
        mask2 = mask2.float().masked_fill_(mask2, float('-inf'))

        # weight: batch_size * seq_len * seq_len
        weight1 = F.softmax(attention + mask2.unsqueeze(1), dim=-1)
        x1_align = torch.matmul(weight1, x2)
        weight2 = F.softmax(attention.transpose(1, 2) + mask1.unsqueeze(1), dim=-1)
        x2_align = torch.matmul(weight2, x1)
        # x_align: batch_size * seq_len * (2 * hidden_size)

        return x1_align, x2_align

    def submul(self, x1, x2):
        mul = x1 * x2
        sub = x1 - x2
        return torch.cat([sub, mul], -1)

    def apply_multiple(self, x):
        # input: batch_size * seq_len * (2 * hidden_size)
        p1 = F.avg_pool1d(x.transpose(1, 2), x.size(1)).squeeze(-1)
        p2 = F.max_pool1d(x.transpose(1, 2), x.size(1)).squeeze(-1)
        # output: batch_size * (4 * hidden_size)
        return torch.cat([p1, p2], 1)

    def forward(self, *input):
        # batch_size * seq_len
        sent1, sent2 = input[0], input[1]
        mask1, mask2 = sent1.eq(0), sent2.eq(0)

        # embeds: batch_size * seq_len => batch_size * seq_len * dim
        x1 = self.bn_embeds(self.embeds(sent1).transpose(1, 2).contiguous()).transpose(1, 2)
        x2 = self.bn_embeds(self.embeds(sent2).transpose(1, 2).contiguous()).transpose(1, 2)

        # batch_size * seq_len * dim => batch_size * seq_len * (2 * hidden_size)
        o1, _ = self.lstm1(x1)
        o2, _ = self.lstm1(x2)

        # Attention
        # batch_size * seq_len * (2 * hidden_size)
        q1_align, q2_align = self.soft_attention_align(o1, o2, mask1, mask2)

        # Compose
        # batch_size * seq_len * (8 * hidden_size)
        q1_combined = torch.cat([o1, q1_align, self.submul(o1, q1_align)], -1)
        q2_combined = torch.cat([o2, q2_align, self.submul(o2, q2_align)], -1)

        # batch_size * seq_len * (2 * hidden_size)
        q1_compose, _ = self.lstm2(q1_combined)
        q2_compose, _ = self.lstm2(q2_combined)

        # Aggregate
        # input: batch_size * seq_len * (2 * hidden_size)
        # output: batch_size * (4 * hidden_size)
        q1_rep = self.apply_multiple(q1_compose)
        q2_rep = self.apply_multiple(q2_compose)

        # Classifier
        x = torch.cat([q1_rep, q2_rep], -1)
        similarity = self.fc(x)
        return similarity
--------------------------------------------------------------------------------
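A minimal smoke test for ESIM under the same assumptions about the args namespace (eval mode uses BatchNorm's running statistics, so the sketch also works with a batch of one):

from types import SimpleNamespace
import torch
from models.ESIM import ESIM

args = SimpleNamespace(hidden_size=64, embeds_dim=128, linear_size=128)  # assumed fields
model = ESIM(args)
model.eval()

sent1 = torch.randint(1, 20000, (4, 12))
sent2 = torch.randint(1, 20000, (4, 12))
probs = model(sent1, sent2)  # batch_size * 2; rows sum to 1 (final Softmax)
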
/models/ABCNN2.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F


class ABCNN3(nn.Module):
    def __init__(self, args):
        super(ABCNN3, self).__init__()
        self.args = args
        self.embeds_dim = args.embeds_dim
        num_word = 20000
        self.embeds = nn.Embedding(num_word, self.embeds_dim)
        self.linear_size = args.linear_size
        self.num_layer = args.num_layer
        self.conv = nn.ModuleList([Wide_Conv(args.max_length, self.embeds_dim) for _ in range(self.num_layer)])

        self.fc = nn.Sequential(
            nn.Linear(self.embeds_dim * (1 + self.num_layer) * 2, self.linear_size),
            nn.LayerNorm(self.linear_size),
            nn.ReLU(inplace=True),
            nn.Linear(self.linear_size, 2),
            nn.Softmax(dim=-1)
        )

    def forward(self, *input):
        s1, s2 = input[0], input[1]
        mask1, mask2 = s1.eq(0), s2.eq(0)
        res = [[], []]
        s1, s2 = self.embeds(s1), self.embeds(s2)
        # e.g. s1 => res[0]
        # (batch_size, seq_len, dim) => (batch_size, dim)
        # if num_layer == 0
        res[0].append(F.avg_pool1d(s1.transpose(1, 2), kernel_size=s1.size(1)).squeeze(-1))
        res[1].append(F.avg_pool1d(s2.transpose(1, 2), kernel_size=s2.size(1)).squeeze(-1))
        for i, conv in enumerate(self.conv):
            o1, o2 = conv(s1, s2, mask1, mask2)
            res[0].append(F.avg_pool1d(o1.transpose(1, 2), kernel_size=o1.size(1)).squeeze(-1))
            res[1].append(F.avg_pool1d(o2.transpose(1, 2), kernel_size=o2.size(1)).squeeze(-1))
            o1, o2 = attention_avg_pooling(o1, o2, mask1, mask2)
            s1, s2 = o1 + s1, o2 + s2
        # batch_size * (dim * (1 + num_layer) * 2) => batch_size * linear_size
        x = torch.cat([torch.cat(res[0], 1), torch.cat(res[1], 1)], 1)
        sim = self.fc(x)
        return sim


class Wide_Conv(nn.Module):
    def __init__(self, seq_len, embeds_size):
        super(Wide_Conv, self).__init__()
        self.seq_len = seq_len
        self.embeds_size = embeds_size
        self.W = nn.Parameter(torch.randn([seq_len, embeds_size]))
        nn.init.xavier_normal_(self.W)
        self.conv = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=3, padding=(1, 1), stride=1)
        self.tanh = nn.Tanh()

    def forward(self, sent1, sent2, mask1, mask2):
        '''
        sent1, sent2: batch_size * seq_len * dim
        '''
        # A: batch_size * seq_len * seq_len
        A = match_score(sent1, sent2, mask1, mask2)
        # attn_feature_map1: batch_size * seq_len * dim
        attn_feature_map1 = A.matmul(self.W)
        attn_feature_map2 = A.transpose(1, 2).matmul(self.W)
        # x1: batch_size * 2 * seq_len * dim
        x1 = torch.cat([sent1.unsqueeze(1), attn_feature_map1.unsqueeze(1)], 1)
        x2 = torch.cat([sent2.unsqueeze(1), attn_feature_map2.unsqueeze(1)], 1)
        o1, o2 = self.conv(x1).squeeze(1), self.conv(x2).squeeze(1)
        o1, o2 = self.tanh(o1), self.tanh(o2)
        return o1, o2


def match_score(s1, s2, mask1, mask2):
    '''
    s1, s2: batch_size * seq_len * dim
    '''
    batch, seq_len, dim = s1.shape
    s1 = s1 * mask1.eq(0).unsqueeze(2).float()
    s2 = s2 * mask2.eq(0).unsqueeze(2).float()
    s1 = s1.unsqueeze(2).repeat(1, 1, seq_len, 1)
    s2 = s2.unsqueeze(1).repeat(1, seq_len, 1, 1)
    a = s1 - s2
    a = torch.norm(a, dim=-1, p=2)
    return 1.0 / (1.0 + a)


def attention_avg_pooling(sent1, sent2, mask1, mask2):
    # A: batch_size * seq_len * seq_len
    A = match_score(sent1, sent2, mask1, mask2)
    weight1 = torch.sum(A, -1)
    weight2 = torch.sum(A.transpose(1, 2), -1)
    s1 = sent1 * weight1.unsqueeze(2)
    s2 = sent2 * weight2.unsqueeze(2)
    s1 = F.avg_pool1d(s1.transpose(1, 2), kernel_size=3, padding=1, stride=1)
    s2 = F.avg_pool1d(s2.transpose(1, 2), kernel_size=3, padding=1, stride=1)
    s1, s2 = s1.transpose(1, 2), s2.transpose(1, 2)
    return s1, s2
--------------------------------------------------------------------------------
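A minimal smoke test for the ABCNN variant above; inputs must be padded or truncated to exactly args.max_length, since Wide_Conv's attention weight matrix is sized seq_len x dim:

from types import SimpleNamespace
import torch
from models.ABCNN2 import ABCNN3

args = SimpleNamespace(embeds_dim=50, linear_size=100, num_layer=2, max_length=16)  # assumed fields
model = ABCNN3(args)

sent1 = torch.randint(1, 20000, (4, 16))
sent2 = torch.randint(1, 20000, (4, 16))
probs = model(sent1, sent2)  # batch_size * 2
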
/sequence_module/decoder.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from .attention import Attention


class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, n_layers=1, dropout_p=0.3, use_cuda=False):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Define the layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout_p)

        self.gru = nn.GRU(embedding_size + hidden_size, hidden_size, n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size * 2, input_size)
        self.attention = Attention(hidden_size)
        self.use_cuda = use_cuda

    def init_hidden(self, inputs):
        hidden = Variable(torch.zeros(self.n_layers, inputs.size(0), self.hidden_size))
        return hidden.cuda() if self.use_cuda else hidden

    def init_weight(self):
        # the xavier initializers operate in place
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_normal_(self.gru.weight_hh_l0)
        nn.init.xavier_normal_(self.gru.weight_ih_l0)
        nn.init.xavier_uniform_(self.linear.weight)

    def greedy_decode(self, inputs, context, max_length, encoder_outputs):
        embedded = self.embedding(inputs)
        hidden = self.init_hidden(inputs)

        decode = []
        i = 0
        # Apply the GRU one step at a time, feeding back the argmax prediction
        decoded = inputs
        while decoded.data.tolist()[0] != 3 and i < max_length:  # stop at EOS (id 3) or max_length
            _, hidden = self.gru(torch.cat((embedded, context), 2), hidden)  # h_t = f(h_{t-1}, y_{t-1}, c)
            concated = torch.cat((hidden, context.transpose(0, 1)), 2)  # y_t = g(h_t, y_{t-1}, c)
            score = self.linear(concated.squeeze(0))
            softmaxed = F.log_softmax(score, 1)
            decode.append(softmaxed)
            decoded = softmaxed.max(1)[1]
            embedded = self.embedding(decoded).unsqueeze(1)  # y_{t-1}

            # compute the next context vector using attention
            # (B,1,D query; the loop assumes n_layers == 1 and a batch of one)
            context = self.attention(hidden.transpose(0, 1), encoder_outputs, None)
            i += 1
        # column-wise concat, reshape!!
        scores = torch.cat(decode)
        return scores.max(1)[1]

    def beam_search_decode(self, inputs, context, max_length, encoder_outputs):
        embedded = self.embedding(inputs)
        hidden = self.init_hidden(inputs)
        _, hidden = self.gru(torch.cat((embedded, context), 2), hidden)  # h_t = f(h_{t-1}, y_{t-1}, c)
        concated = torch.cat((hidden, context.transpose(0, 1)), 2)  # y_t = g(h_t, y_{t-1}, c)
        score = self.linear(concated.squeeze(0))

        beam = Beam([score, hidden], 3)  # beam init
        nodes = beam.get_next_nodes()
        i = 1

        while i
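A hypothetical end-to-end sketch wiring the Encoder and Decoder together. greedy_decode assumes a batch of one and hard-codes token id 3 as EOS; the start token id 2 below is an illustrative assumption:

import torch
from sequence_module.encoder import Encoder
from sequence_module.decoder import Decoder

V, E, H = 1000, 64, 32
encoder = Encoder(input_size=V, embedding_size=E, hidden_size=H)
decoder = Decoder(input_size=V, embedding_size=E, hidden_size=H)

src = torch.tensor([[5, 8, 13, 21, 3]])       # B=1, T=5
encoder_outputs, context = encoder(src, [5])  # 1,5,H and 1,1,H
start = torch.tensor([[2]])                   # assumed SOS id
pred = decoder.greedy_decode(start, context, 20, encoder_outputs)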