├── README.md
└── cbow_model_pytorch.py


/README.md:
--------------------------------------------------------------------------------
### continuous-bag-of-words (CBOW) - pytorch
An implementation of the CBOW model in PyTorch. CBOW learns a word representation (the word's probability) by looking at its surrounding context. A single fixed-size window is used for learning word contexts. One objective of the CBOW model is to measure the similarity between two words.
Nowadays CBOW is frequently used in many NLP deep learning tasks.

## RUN
```bash
python3 cbow_model_pytorch.py
```
It prints the average training loss every few epochs.
The **test_cbow** function shows the similarity between two words after the model has learned the corpus context.
--------------------------------------------------------------------------------
/cbow_model_pytorch.py:
--------------------------------------------------------------------------------
import torch
from torch import nn
from torch.optim import SGD
import torch.nn.functional as F

CONTEXT_SIZE = 4     # number of context words on each side of the target
EMBEDDING_DIM = 300
EPOCH = 20
VERBOSE = 5          # print the loss every VERBOSE epochs

corpus_text = "This tutorial will walk you through the key ideas of deep learning programming using Pytorch." \
              " Many of the concepts (such as the computation graph abstraction and autograd) " \
              "are not unique to Pytorch and are relevant to any deep learning tool kit out there.".split(' ')


class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size, context_size):
        super(CBOW, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.context_size = context_size
        self.embeddings = nn.Embedding(self.vocab_size, self.embedding_size)
        # the concatenated context vector has size context_size * 2 * embedding_size
        self.lin1 = nn.Linear(self.context_size * 2 * self.embedding_size, 512)
        self.lin2 = nn.Linear(512, self.vocab_size)

    def forward(self, inp):
        # flatten the 2 * context_size embeddings into a single row vector
        out = self.embeddings(inp).view(1, -1)
        out = self.lin1(out)
        out = F.relu(out)
        out = self.lin2(out)
        out = F.log_softmax(out, dim=1)
        return out

    def get_word_vector(self, word_idx):
        word = torch.LongTensor([word_idx])
        return self.embeddings(word).view(1, -1)


def train_cbow(data, unique_vocab, word_to_idx):
    cbow = CBOW(len(unique_vocab), EMBEDDING_DIM, CONTEXT_SIZE)

    nll_loss = nn.NLLLoss()  # negative log-likelihood, paired with log_softmax above
    optimizer = SGD(cbow.parameters(), lr=0.001)

    print("Training examples:", len(data))

    for epoch in range(EPOCH):
        total_loss = 0.0
        for context, target in data:
            inp_var = torch.LongTensor([word_to_idx[word] for word in context])
            target_var = torch.LongTensor([word_to_idx[target]])

            cbow.zero_grad()
            log_prob = cbow(inp_var)
            loss = nll_loss(log_prob, target_var)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        if epoch % VERBOSE == 0:
            loss_avg = total_loss / len(data)
            print("{}/{} loss {:.2f}".format(epoch, EPOCH, loss_avg))
    return cbow


def test_cbow(cbow, unique_vocab, word_to_idx):
    # test word similarity between two vocabulary entries
    word_1 = unique_vocab[2]
    word_2 = unique_vocab[3]

    word_1_vec = cbow.get_word_vector(word_to_idx[word_1]).view(-1)
    word_2_vec = cbow.get_word_vector(word_to_idx[word_2]).view(-1)

    # cosine similarity of the two embedding vectors
    word_similarity = (torch.dot(word_1_vec, word_2_vec) /
                       (torch.norm(word_1_vec) * torch.norm(word_2_vec))).item()
    print("Similarity between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_similarity))


def main():
    # build (context, target) pairs: the 2 * CONTEXT_SIZE surrounding words
    # form the context and the middle word is the target
    data = list()
    for i in range(CONTEXT_SIZE, len(corpus_text) - CONTEXT_SIZE):
        data_context = list()
        for j in range(CONTEXT_SIZE):
            data_context.append(corpus_text[i - CONTEXT_SIZE + j])

        for j in range(1, CONTEXT_SIZE + 1):
            data_context.append(corpus_text[i + j])
        data_target = corpus_text[i]
        data.append((data_context, data_target))

    print("Some data: ", data[:3])

    unique_vocab = list(set(corpus_text))

    # map each word to an index
    word_to_idx = {w: i for i, w in enumerate(unique_vocab)}

    # train the model (change the global hyperparameters if needed)
    cbow = train_cbow(data, unique_vocab, word_to_idx)

    # report the similarity of two words
    test_cbow(cbow, unique_vocab, word_to_idx)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
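A minimal usage sketch (not part of the repository): once `main()` has trained the model, the same cosine measure can compare any two in-vocabulary words. The helper name `similarity` below is hypothetical; it assumes only the `cbow` model and the `word_to_idx` mapping built in `main()`.

```python
import torch.nn.functional as F

def similarity(cbow, word_to_idx, w1, w2):
    # hypothetical helper: cosine similarity of two learned word vectors
    v1 = cbow.get_word_vector(word_to_idx[w1])
    v2 = cbow.get_word_vector(word_to_idx[w2])
    return F.cosine_similarity(v1, v2, dim=-1).item()

# e.g. similarity(cbow, word_to_idx, "deep", "learning")
```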