├── LICENSE
├── README.md
└── cdssm.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Nishant Nikhil

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Deep-Semantic-Similarity-Model-PyTorch
A PyTorch implementation of C-DSSM (Microsoft Research's Convolutional Deep Semantic Similarity Model), described [here](http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf). A random data generator is included in the code, so you can play with it or plug in your own data.

Keras model: [airalcorn2/Deep-Semantic-Similarity-Model](https://github.com/airalcorn2/Deep-Semantic-Similarity-Model).

The corresponding blog post is on [Medium](https://medium.com/towards-data-science/pytorch-first-program-and-walk-through-ceb739134ab9).
--------------------------------------------------------------------------------
/cdssm.py:
--------------------------------------------------------------------------------
# Nishant Nikhil (i.nishantnikhil@gmail.com)
# An implementation of the Convolutional Deep Semantic Similarity Model (C-DSSM) described in [1].
# [1] Shen, Y., He, X., Gao, J., Deng, L., and Mesnil, G. 2014. A latent semantic model
#     with convolutional-pooling structure for information retrieval. In CIKM, pp. 101-110.
#     http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf
# [2] http://research.microsoft.com/en-us/projects/dssm/
# [3] http://research.microsoft.com/pubs/238873/wsdm2015.v3.pdf

import torch
import torch.nn as nn
import torch.nn.functional as F


LETTER_GRAM_SIZE = 3  # See section 3.2.
WINDOW_SIZE = 3  # See section 3.2.
TOTAL_LETTER_GRAMS = int(3 * 1e4)  # Determined from the data. See section 3.2.
WORD_DEPTH = WINDOW_SIZE * TOTAL_LETTER_GRAMS  # See equation (1).
# Uncomment the following line for quick testing with a smaller input dimensionality.
# WORD_DEPTH = 1000
K = 300  # Dimensionality of the convolutional/max-pooling layer. See section 3.4.
L = 128  # Dimensionality of the latent semantic space. See section 3.5.
J = 4  # Number of random unclicked documents serving as negative examples for a query. See section 4.
FILTER_LENGTH = 1  # We only consider one time step for convolutions.
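
# --- Illustrative only: letter-trigram word hashing (section 3.2). ---
# The demo at the bottom of this file feeds random dense vectors into the network.
# In the paper, each word is instead padded with word-boundary marks (e.g. "#cat#"),
# broken into letter trigrams ("#ca", "cat", "at#"), and represented as a count
# vector over a vocabulary of TOTAL_LETTER_GRAMS trigrams; WORD_DEPTH is WINDOW_SIZE
# such vectors concatenated (the sliding contextual window of equation (1)).
# The helper below is a minimal sketch of that hashing step, not used by the demo;
# `trigram_index`, a dict mapping trigrams to vector positions, is assumed to be
# built from your own corpus and is not part of this repository.
def letter_trigram_vector(word, trigram_index):
    vec = torch.zeros(TOTAL_LETTER_GRAMS)
    padded = '#' + word + '#'
    for start in range(len(padded) - LETTER_GRAM_SIZE + 1):
        trigram = padded[start:start + LETTER_GRAM_SIZE]
        if trigram in trigram_index:
            vec[trigram_index[trigram]] += 1
    return vec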


def kmax_pooling(x, dim, k):
    # k-max pooling along `dim`, keeping the selected values in their original order.
    index = x.topk(k, dim=dim)[1].sort(dim=dim)[0]
    return x.gather(dim, index)


class CDSSM(nn.Module):
    def __init__(self):
        super(CDSSM, self).__init__()
        # layers for the query
        self.query_conv = nn.Conv1d(WORD_DEPTH, K, FILTER_LENGTH)
        self.query_sem = nn.Linear(K, L)
        # layers for the documents
        self.doc_conv = nn.Conv1d(WORD_DEPTH, K, FILTER_LENGTH)
        self.doc_sem = nn.Linear(K, L)
        # gamma is learned as a single 1 x 1 convolution (see the end of forward()).
        self.learn_gamma = nn.Conv1d(1, 1, 1)

    def forward(self, q, pos, negs):
        # Query model. The paper uses separate neural nets for queries and documents (see section 5.2).
        # To make the input compatible with the Conv1d layer, we reshape it to
        # (batch_size, WORD_DEPTH, query_len).
        q = q.transpose(1, 2)
        # In this step, we transform each word vector with WORD_DEPTH dimensions into its
        # convolved representation with K dimensions. K is the number of kernels/filters
        # used in the operation. Essentially, the operation takes the dot product of a
        # single weight matrix (W_c) with each of the word vectors (l_t) from the query
        # matrix (l_Q), adds a bias vector (b_c), and applies the tanh activation.
        # That is, h_Q = tanh(W_c • l_Q + b_c). Note: the paper does not include bias units.
        q_c = torch.tanh(self.query_conv(q))
        # Next, we apply a max-pooling layer to the convolved query matrix.
        q_k = kmax_pooling(q_c, 2, 1)
        q_k = q_k.transpose(1, 2)
        # In this step, we generate the semantic vector representation of the query. This
        # is a standard dense layer, i.e., y = tanh(W_s • v + b_s). Again, the paper
        # does not include bias units.
        q_s = torch.tanh(self.query_sem(q_k))
        q_s = q_s.view(L)
        # The document equivalent of the above query model, for the positive document.
        pos = pos.transpose(1, 2)
        pos_c = torch.tanh(self.doc_conv(pos))
        pos_k = kmax_pooling(pos_c, 2, 1)
        pos_k = pos_k.transpose(1, 2)
        pos_s = torch.tanh(self.doc_sem(pos_k))
        pos_s = pos_s.view(L)
        # The document equivalent of the above query model, for the negative documents.
        negs = [neg.transpose(1, 2) for neg in negs]
        neg_cs = [torch.tanh(self.doc_conv(neg)) for neg in negs]
        neg_ks = [kmax_pooling(neg_c, 2, 1) for neg_c in neg_cs]
        neg_ks = [neg_k.transpose(1, 2) for neg_k in neg_ks]
        neg_ss = [torch.tanh(self.doc_sem(neg_k)) for neg_k in neg_ks]
        neg_ss = [neg_s.view(L) for neg_s in neg_ss]
        # Now we score the query against each document with the dot product of their
        # semantic vectors (the paper uses the cosine similarity of unit-normalised
        # vectors; the unnormalised dot product is a simplification here).
        # dots[0] is the score for the positive document; this matters because the
        # target label below is set accordingly.
        dots = [q_s.dot(pos_s)]
        dots = dots + [q_s.dot(neg_s) for neg_s in neg_ss]
        # dots is a Python list so far; stack it into a single tensor.
        dots = torch.stack(dots)
        # In this step, we multiply each score by gamma. In the paper, gamma is described
        # as a smoothing factor for the softmax function, set empirically on a held-out
        # data set. Here we learn gamma's value by treating it as a single 1 x 1
        # convolution kernel.
        with_gamma = self.learn_gamma(dots.view(J + 1, 1, 1))
        # You could apply the softmax function here to calculate P(D+|Q), but we return
        # the logits because CrossEntropyLoss applies the softmax itself.
        # prob = F.softmax(with_gamma.view(-1), dim=0)
        return with_gamma
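
# --- Illustrative only: turning the logits into P(D+|Q) at inference time. ---
# forward() returns gamma-scaled similarity logits; the softmax that produces the
# posterior P(D|Q) from the paper is left to CrossEntropyLoss during training.
# The sketch below shows one way a trained model could rank a query against a list
# of exactly J + 1 candidate documents. `rank_documents` is a hypothetical helper,
# not part of the original code, and it assumes the inputs are prepared the same
# way as the tensors in the demo below.
def rank_documents(model, query, docs):
    # docs must contain exactly J + 1 candidates; index 0 of the returned
    # probabilities corresponds to docs[0].
    with torch.no_grad():
        logits = model(query, docs[0], docs[1:]).view(-1)
        probs = F.softmax(logits, dim=0)
    return probs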
model = CDSSM()

# Build a random data set.
import numpy as np

sample_size = 10
l_Qs = []
pos_l_Ds = []

for i in range(sample_size):
    query_len = np.random.randint(1, 10)
    l_Q = np.random.rand(1, query_len, WORD_DEPTH)
    l_Qs.append(l_Q)

    doc_len = np.random.randint(50, 500)
    l_D = np.random.rand(1, doc_len, WORD_DEPTH)
    pos_l_Ds.append(l_D)

# For each query, use J of the other queries' positive documents as its negatives.
neg_l_Ds = [[] for j in range(J)]
for i in range(sample_size):
    possibilities = list(range(sample_size))
    possibilities.remove(i)
    negatives = np.random.choice(possibilities, J, replace=False)
    for j in range(J):
        negative = negatives[j]
        neg_l_Ds[j].append(pos_l_Ds[negative])

# So far we have built a complete numpy data set.
# Now convert the numpy arrays to torch tensors.
for i in range(len(l_Qs)):
    l_Qs[i] = torch.from_numpy(l_Qs[i]).float()
    pos_l_Ds[i] = torch.from_numpy(pos_l_Ds[i]).float()
    for j in range(J):
        neg_l_Ds[j][i] = torch.from_numpy(neg_l_Ds[j][i]).float()


# Loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

# Target label: the score for the positive document sits at index 0, and
# CrossEntropyLoss expects only that class index, as a long tensor.
y = torch.tensor([0], dtype=torch.long)

for i in range(sample_size):
    y_pred = model(l_Qs[i], pos_l_Ds[i], [neg_l_Ds[j][i] for j in range(J)])
    loss = criterion(y_pred.view(1, J + 1), y)
    print(i, loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
--------------------------------------------------------------------------------