├── README.md
└── cbow_model_pytorch.py


/README.md:
--------------------------------------------------------------------------------
### continuous-bag-of-words (CBOW) - pytorch
An implementation of the CBOW model in PyTorch. CBOW learns a word representation (the word's probability) by looking at its surrounding context. A single fixed-size window is used for learning word contexts. One objective of the CBOW model is to measure the similarity between two words.
Nowadays CBOW is frequently used in many NLP deep learning tasks.

## RUN
```bash
python3 cbow_model_pytorch.py
```
It prints the average training loss every few epochs.
The **test_cbow** function shows the similarity between two words after the model has learned the corpus context.
--------------------------------------------------------------------------------
/cbow_model_pytorch.py:
--------------------------------------------------------------------------------
import torch
from torch import nn
from torch.optim import SGD
import torch.nn.functional as F

CONTEXT_SIZE = 4     # number of context words on each side of the target
EMBEDDING_DIM = 300
EPOCH = 20
VERBOSE = 5          # print the loss every VERBOSE epochs

corpus_text = "This tutorial will walk you through the key ideas of deep learning programming using Pytorch." \
              " Many of the concepts (such as the computation graph abstraction and autograd) " \
              "are not unique to Pytorch and are relevant to any deep learning tool kit out there.".split(' ')


class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size, context_size):
        super(CBOW, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.context_size = context_size
        self.embeddings = nn.Embedding(self.vocab_size, self.embedding_size)
        # the concatenated context vector has size context_size * 2 * embedding_size
        self.lin1 = nn.Linear(self.context_size * 2 * self.embedding_size, 512)
        self.lin2 = nn.Linear(512, self.vocab_size)

    def forward(self, inp):
        # flatten the 2 * context_size embeddings into a single row vector
        out = self.embeddings(inp).view(1, -1)
        out = self.lin1(out)
        out = F.relu(out)
        out = self.lin2(out)
        out = F.log_softmax(out, dim=1)
        return out

    def get_word_vector(self, word_idx):
        word = torch.LongTensor([word_idx])
        return self.embeddings(word).view(1, -1)


def train_cbow(data, unique_vocab, word_to_idx):
    cbow = CBOW(len(unique_vocab), EMBEDDING_DIM, CONTEXT_SIZE)

    nll_loss = nn.NLLLoss()  # negative log-likelihood, paired with log_softmax above
    optimizer = SGD(cbow.parameters(), lr=0.001)

    print("Training examples:", len(data))

    for epoch in range(EPOCH):
        total_loss = 0.0
        for context, target in data:
            inp_var = torch.LongTensor([word_to_idx[word] for word in context])
            target_var = torch.LongTensor([word_to_idx[target]])

            cbow.zero_grad()
            log_prob = cbow(inp_var)
            loss = nll_loss(log_prob, target_var)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        if epoch % VERBOSE == 0:
            loss_avg = total_loss / len(data)
            print("{}/{} loss {:.2f}".format(epoch, EPOCH, loss_avg))
    return cbow


def test_cbow(cbow, unique_vocab, word_to_idx):
    # test word similarity between two vocabulary entries
    word_1 = unique_vocab[2]
    word_2 = unique_vocab[3]

    word_1_vec = cbow.get_word_vector(word_to_idx[word_1]).view(-1)
    word_2_vec = cbow.get_word_vector(word_to_idx[word_2]).view(-1)

    # cosine similarity of the two embedding vectors
    word_similarity = (torch.dot(word_1_vec, word_2_vec) /
                       (torch.norm(word_1_vec) * torch.norm(word_2_vec))).item()
    print("Similarity between '{}' & '{}' : {:0.4f}".format(word_1, word_2, word_similarity))


def main():
    # build (context, target) pairs: the 2 * CONTEXT_SIZE surrounding words
    # form the context and the middle word is the target
    data = list()
    for i in range(CONTEXT_SIZE, len(corpus_text) - CONTEXT_SIZE):
        data_context = list()
        for j in range(CONTEXT_SIZE):
            data_context.append(corpus_text[i - CONTEXT_SIZE + j])

        for j in range(1, CONTEXT_SIZE + 1):
            data_context.append(corpus_text[i + j])
        data_target = corpus_text[i]
        data.append((data_context, data_target))

    print("Some data: ", data[:3])

    unique_vocab = list(set(corpus_text))

    # map each word to an index
    word_to_idx = {w: i for i, w in enumerate(unique_vocab)}

    # train the model (change the global hyperparameters if needed)
    cbow = train_cbow(data, unique_vocab, word_to_idx)

    # report the similarity of two words
    test_cbow(cbow, unique_vocab, word_to_idx)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
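A minimal usage sketch (not part of the repository): once `main()` has trained the model, the same cosine measure can compare any two in-vocabulary words. The helper name `similarity` below is hypothetical; it assumes only the `cbow` model and the `word_to_idx` mapping built in `main()`.

```python
import torch.nn.functional as F

def similarity(cbow, word_to_idx, w1, w2):
    # hypothetical helper: cosine similarity of two learned word vectors
    v1 = cbow.get_word_vector(word_to_idx[w1])
    v2 = cbow.get_word_vector(word_to_idx[w2])
    return F.cosine_similarity(v1, v2, dim=-1).item()

# e.g. similarity(cbow, word_to_idx, "deep", "learning")
```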