├── LICENSE
├── README.md
└── cdssm.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Nishant Nikhil

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Deep-Semantic-Similarity-Model-PyTorch
A PyTorch implementation of C-DSSM (Microsoft Research's Convolutional Deep Semantic Similarity Model), described [here](http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf). A random data generator is included in the code, so you can play with it or plug in your own data.

Keras model: [airalcorn2/Deep-Semantic-Similarity-Model](https://github.com/airalcorn2/Deep-Semantic-Similarity-Model).

The corresponding blog post is on [Medium](https://medium.com/towards-data-science/pytorch-first-program-and-walk-through-ceb739134ab9).
--------------------------------------------------------------------------------
/cdssm.py:
--------------------------------------------------------------------------------
# Nishant Nikhil (i.nishantnikhil@gmail.com)
# An implementation of the Convolutional Deep Semantic Similarity Model (C-DSSM) described in [1].
# [1] Shen, Y., He, X., Gao, J., Deng, L., and Mesnil, G. 2014. A latent semantic model
#     with convolutional-pooling structure for information retrieval. In CIKM, pp. 101-110.
#     http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf
# [2] http://research.microsoft.com/en-us/projects/dssm/
# [3] http://research.microsoft.com/pubs/238873/wsdm2015.v3.pdf

import torch
import torch.nn as nn
import torch.nn.functional as F


LETTER_GRAM_SIZE = 3  # See section 3.2.
WINDOW_SIZE = 3  # See section 3.2.
TOTAL_LETTER_GRAMS = int(3 * 1e4)  # Determined from the data. See section 3.2.
WORD_DEPTH = WINDOW_SIZE * TOTAL_LETTER_GRAMS  # See equation (1).
# Uncomment the following line for quick testing with a smaller input dimensionality.
# WORD_DEPTH = 1000
K = 300  # Dimensionality of the convolutional/max-pooling layer. See section 3.4.
L = 128  # Dimensionality of the latent semantic space. See section 3.5.
J = 4  # Number of random unclicked documents serving as negative examples for a query. See section 4.
FILTER_LENGTH = 1  # We only consider one time step for convolutions.
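
# --- Illustrative only: letter-trigram word hashing (section 3.2). ---
# The demo at the bottom of this file feeds random dense vectors into the network.
# In the paper, each word is instead padded with word-boundary marks (e.g. "#cat#"),
# broken into letter trigrams ("#ca", "cat", "at#"), and represented as a count
# vector over a vocabulary of TOTAL_LETTER_GRAMS trigrams; WORD_DEPTH is WINDOW_SIZE
# such vectors concatenated (the sliding contextual window of equation (1)).
# The helper below is a minimal sketch of that hashing step, not used by the demo;
# `trigram_index`, a dict mapping trigrams to vector positions, is assumed to be
# built from your own corpus and is not part of this repository.
def letter_trigram_vector(word, trigram_index):
    vec = torch.zeros(TOTAL_LETTER_GRAMS)
    padded = '#' + word + '#'
    for start in range(len(padded) - LETTER_GRAM_SIZE + 1):
        trigram = padded[start:start + LETTER_GRAM_SIZE]
        if trigram in trigram_index:
            vec[trigram_index[trigram]] += 1
    return vec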


def kmax_pooling(x, dim, k):
    # k-max pooling along `dim`, keeping the selected values in their original order.
    index = x.topk(k, dim=dim)[1].sort(dim=dim)[0]
    return x.gather(dim, index)


class CDSSM(nn.Module):
    def __init__(self):
        super(CDSSM, self).__init__()
        # layers for the query
        self.query_conv = nn.Conv1d(WORD_DEPTH, K, FILTER_LENGTH)
        self.query_sem = nn.Linear(K, L)
        # layers for the documents
        self.doc_conv = nn.Conv1d(WORD_DEPTH, K, FILTER_LENGTH)
        self.doc_sem = nn.Linear(K, L)
        # gamma is learned as a single 1 x 1 convolution (see the end of forward()).
        self.learn_gamma = nn.Conv1d(1, 1, 1)

    def forward(self, q, pos, negs):
        # Query model. The paper uses separate neural nets for queries and documents (see section 5.2).
        # To make the input compatible with the Conv1d layer, we reshape it to
        # (batch_size, WORD_DEPTH, query_len).
        q = q.transpose(1, 2)
        # In this step, we transform each word vector with WORD_DEPTH dimensions into its
        # convolved representation with K dimensions. K is the number of kernels/filters
        # used in the operation. Essentially, the operation takes the dot product of a
        # single weight matrix (W_c) with each of the word vectors (l_t) from the query
        # matrix (l_Q), adds a bias vector (b_c), and applies the tanh activation.
        # That is, h_Q = tanh(W_c • l_Q + b_c). Note: the paper does not include bias units.
        q_c = torch.tanh(self.query_conv(q))
        # Next, we apply a max-pooling layer to the convolved query matrix.
        q_k = kmax_pooling(q_c, 2, 1)
        q_k = q_k.transpose(1, 2)
        # In this step, we generate the semantic vector representation of the query. This
        # is a standard dense layer, i.e., y = tanh(W_s • v + b_s). Again, the paper
        # does not include bias units.
        q_s = torch.tanh(self.query_sem(q_k))
        q_s = q_s.view(L)
        # The document equivalent of the above query model, for the positive document.
        pos = pos.transpose(1, 2)
        pos_c = torch.tanh(self.doc_conv(pos))
        pos_k = kmax_pooling(pos_c, 2, 1)
        pos_k = pos_k.transpose(1, 2)
        pos_s = torch.tanh(self.doc_sem(pos_k))
        pos_s = pos_s.view(L)
        # The document equivalent of the above query model, for the negative documents.
        negs = [neg.transpose(1, 2) for neg in negs]
        neg_cs = [torch.tanh(self.doc_conv(neg)) for neg in negs]
        neg_ks = [kmax_pooling(neg_c, 2, 1) for neg_c in neg_cs]
        neg_ks = [neg_k.transpose(1, 2) for neg_k in neg_ks]
        neg_ss = [torch.tanh(self.doc_sem(neg_k)) for neg_k in neg_ks]
        neg_ss = [neg_s.view(L) for neg_s in neg_ss]
        # Now we score the query against each document with the dot product of their
        # semantic vectors (the paper uses the cosine similarity of unit-normalised
        # vectors; the unnormalised dot product is a simplification here).
        # dots[0] is the score for the positive document; this matters because the
        # target label below is set accordingly.
        dots = [q_s.dot(pos_s)]
        dots = dots + [q_s.dot(neg_s) for neg_s in neg_ss]
        # dots is a Python list so far; stack it into a single tensor.
        dots = torch.stack(dots)
        # In this step, we multiply each score by gamma. In the paper, gamma is described
        # as a smoothing factor for the softmax function, set empirically on a held-out
        # data set. Here we learn gamma's value by treating it as a single 1 x 1
        # convolution kernel.
        with_gamma = self.learn_gamma(dots.view(J + 1, 1, 1))
        # You could apply the softmax function here to calculate P(D+|Q), but we return
        # the logits because CrossEntropyLoss applies the softmax itself.
        # prob = F.softmax(with_gamma.view(-1), dim=0)
        return with_gamma
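
# --- Illustrative only: turning the logits into P(D+|Q) at inference time. ---
# forward() returns gamma-scaled similarity logits; the softmax that produces the
# posterior P(D|Q) from the paper is left to CrossEntropyLoss during training.
# The sketch below shows one way a trained model could rank a query against a list
# of exactly J + 1 candidate documents. `rank_documents` is a hypothetical helper,
# not part of the original code, and it assumes the inputs are prepared the same
# way as the tensors in the demo below.
def rank_documents(model, query, docs):
    # docs must contain exactly J + 1 candidates; index 0 of the returned
    # probabilities corresponds to docs[0].
    with torch.no_grad():
        logits = model(query, docs[0], docs[1:]).view(-1)
        probs = F.softmax(logits, dim=0)
    return probs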
model = CDSSM()

# Build a random data set.
import numpy as np

sample_size = 10
l_Qs = []
pos_l_Ds = []

for i in range(sample_size):
    query_len = np.random.randint(1, 10)
    l_Q = np.random.rand(1, query_len, WORD_DEPTH)
    l_Qs.append(l_Q)

    doc_len = np.random.randint(50, 500)
    l_D = np.random.rand(1, doc_len, WORD_DEPTH)
    pos_l_Ds.append(l_D)

# For each query, use J of the other queries' positive documents as its negatives.
neg_l_Ds = [[] for j in range(J)]
for i in range(sample_size):
    possibilities = list(range(sample_size))
    possibilities.remove(i)
    negatives = np.random.choice(possibilities, J, replace=False)
    for j in range(J):
        negative = negatives[j]
        neg_l_Ds[j].append(pos_l_Ds[negative])

# So far we have built a complete numpy data set.
# Now convert the numpy arrays to torch tensors.
for i in range(len(l_Qs)):
    l_Qs[i] = torch.from_numpy(l_Qs[i]).float()
    pos_l_Ds[i] = torch.from_numpy(pos_l_Ds[i]).float()
    for j in range(J):
        neg_l_Ds[j][i] = torch.from_numpy(neg_l_Ds[j][i]).float()


# Loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

# Target label: the score for the positive document sits at index 0, and
# CrossEntropyLoss expects only that class index, as a long tensor.
y = torch.tensor([0], dtype=torch.long)

for i in range(sample_size):
    y_pred = model(l_Qs[i], pos_l_Ds[i], [neg_l_Ds[j][i] for j in range(J)])
    loss = criterion(y_pred.view(1, J + 1), y)
    print(i, loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
--------------------------------------------------------------------------------