├── .gitignore
├── README.md
├── setup.py
└── word2vec
    ├── __init__.py
    ├── data_reader.py
    ├── model.py
    └── trainer.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__/*
.idea/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Word2vec Pytorch

Fast word2vec implementation whose training speed is competitive with fastText. The slowest part is the Python data loader; Python is not the fastest language, so improvements to the loader are welcome :)

## Advantages

* Easy to understand, solid code
* Easy to extend for new experiments
* You can try advanced optimizers and new learning techniques
* GPU support

## Supported features

* Skip-gram
* Batch update
* Cosine annealing
* Negative sampling
* Sub-sampling of frequent words
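
## Usage

A minimal training sketch. `input.txt` is assumed to be a plain-text corpus with one whitespace-tokenized sentence per line; the keyword arguments shown are simply the defaults from `trainer.py`.

```python
from word2vec.trainer import Word2VecTrainer

# input.txt: plain-text corpus, one sentence per line
# out.vec:   embeddings written in the word2vec text format
w2v = Word2VecTrainer(input_file="input.txt",
                      output_file="out.vec",
                      emb_dimension=100,
                      batch_size=32,
                      window_size=5,
                      iterations=3,
                      initial_lr=0.001,
                      min_count=12)
w2v.train()
```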
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

setup(
    name='word2vec',
    version='0.0.1',
    description='Skip-gram PyTorch implementation',
    packages=find_packages(include=['word2vec']),
    # runtime dependencies, taken from the package imports
    install_requires=['numpy', 'torch', 'tqdm'],
)
--------------------------------------------------------------------------------
/word2vec/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Andras7/word2vec-pytorch/36b93a503e8b3b5448abbc0e18f2a6bd3e017fc9/word2vec/__init__.py
--------------------------------------------------------------------------------
/word2vec/data_reader.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
from torch.utils.data import Dataset

np.random.seed(12345)


class DataReader:
    NEGATIVE_TABLE_SIZE = 1e8

    def __init__(self, inputFileName, min_count):

        self.negatives = []
        self.discards = []
        self.negpos = 0

        self.word2id = dict()
        self.id2word = dict()
        self.sentences_count = 0
        self.token_count = 0
        self.word_frequency = dict()

        self.inputFileName = inputFileName
        self.read_words(min_count)
        self.initTableNegatives()
        self.initTableDiscards()

    def read_words(self, min_count):
        word_frequency = dict()
        for line in open(self.inputFileName, encoding="utf8"):
            line = line.split()
            if len(line) > 1:
                self.sentences_count += 1
                for word in line:
                    if len(word) > 0:
                        self.token_count += 1
                        word_frequency[word] = word_frequency.get(word, 0) + 1

                        if self.token_count % 1000000 == 0:
                            print("Read " + str(int(self.token_count / 1000000)) + "M words.")

        # keep only words that occur at least min_count times
        wid = 0
        for w, c in word_frequency.items():
            if c < min_count:
                continue
            self.word2id[w] = wid
            self.id2word[wid] = w
            self.word_frequency[wid] = c
            wid += 1
        print("Total embeddings: " + str(len(self.word2id)))

    def initTableDiscards(self):
        # keep-probability for sub-sampling of frequent words (word2vec C formula, t = 1e-4)
        t = 0.0001
        f = np.array(list(self.word_frequency.values())) / self.token_count
        self.discards = np.sqrt(t / f) + (t / f)

    def initTableNegatives(self):
        # unigram table for negative sampling, with word counts raised to a power
        pow_frequency = np.array(list(self.word_frequency.values())) ** 0.5
        words_pow = sum(pow_frequency)
        ratio = pow_frequency / words_pow
        count = np.round(ratio * DataReader.NEGATIVE_TABLE_SIZE)
        for wid, c in enumerate(count):
            self.negatives += [wid] * int(c)
        self.negatives = np.array(self.negatives)
        np.random.shuffle(self.negatives)

    def getNegatives(self, target, size):  # TODO check equality with target
        response = self.negatives[self.negpos:self.negpos + size]
        self.negpos = (self.negpos + size) % len(self.negatives)
        if len(response) != size:
            return np.concatenate((response, self.negatives[0:self.negpos]))
        return response


# -----------------------------------------------------------------------------------------------------------------

class Word2vecDataset(Dataset):
    def __init__(self, data, window_size):
        self.data = data
        self.window_size = window_size
        self.input_file = open(data.inputFileName, encoding="utf8")

    def __len__(self):
        return self.data.sentences_count

    def __getitem__(self, idx):
        while True:
            line = self.input_file.readline()
            if not line:
                self.input_file.seek(0, 0)
                line = self.input_file.readline()

            if len(line) > 1:
                words = line.split()

                if len(words) > 1:
                    # sub-sample frequent words, then emit (center, context, negatives) triples
                    word_ids = [self.data.word2id[w] for w in words if
                                w in self.data.word2id and np.random.rand() < self.data.discards[self.data.word2id[w]]]

                    boundary = np.random.randint(1, self.window_size)
                    return [(u, v, self.data.getNegatives(v, 5)) for i, u in enumerate(word_ids) for j, v in
                            enumerate(word_ids[max(i - boundary, 0):i + boundary]) if u != v]

    @staticmethod
    def collate(batches):
        all_u = [u for batch in batches for u, _, _ in batch if len(batch) > 0]
        all_v = [v for batch in batches for _, v, _ in batch if len(batch) > 0]
        all_neg_v = [neg_v for batch in batches for _, _, neg_v in batch if len(batch) > 0]

        return torch.LongTensor(all_u), torch.LongTensor(all_v), torch.LongTensor(all_neg_v)
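
A small sanity check of the reader on a toy corpus (the file, corpus, and parameter values are made up for the demo). The negative table is shrunk so the snippet stays light-weight, and because sub-sampling discards most tokens of such a tiny corpus, the sketch inspects `DataReader` directly rather than drawing batches:

```python
import os
import tempfile

from word2vec.data_reader import DataReader

# Shrink the negative-sampling table: the 1e8 default is sized for real corpora.
DataReader.NEGATIVE_TABLE_SIZE = 1e5

# Toy corpus: one whitespace-tokenized sentence per line.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf8") as f:
    f.write("the quick brown fox jumps over the lazy dog\n" * 50)
    path = f.name

data = DataReader(path, min_count=1)
print("vocabulary size:", len(data.word2id))                        # 8 distinct words
fox = data.word2id["fox"]
print("keep probability for 'fox':", min(1.0, data.discards[fox]))
print("five negative sample ids:", data.getNegatives(fox, 5))

os.remove(path)
```

During training the reader is wrapped in `Word2vecDataset` and a `DataLoader` with `collate_fn=dataset.collate`, as done in `trainer.py`.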
9 | """ 10 | 11 | 12 | class SkipGramModel(nn.Module): 13 | 14 | def __init__(self, emb_size, emb_dimension): 15 | super(SkipGramModel, self).__init__() 16 | self.emb_size = emb_size 17 | self.emb_dimension = emb_dimension 18 | self.u_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True) 19 | self.v_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True) 20 | 21 | initrange = 1.0 / self.emb_dimension 22 | init.uniform_(self.u_embeddings.weight.data, -initrange, initrange) 23 | init.constant_(self.v_embeddings.weight.data, 0) 24 | 25 | def forward(self, pos_u, pos_v, neg_v): 26 | emb_u = self.u_embeddings(pos_u) 27 | emb_v = self.v_embeddings(pos_v) 28 | emb_neg_v = self.v_embeddings(neg_v) 29 | 30 | score = torch.sum(torch.mul(emb_u, emb_v), dim=1) 31 | score = torch.clamp(score, max=10, min=-10) 32 | score = -F.logsigmoid(score) 33 | 34 | neg_score = torch.bmm(emb_neg_v, emb_u.unsqueeze(2)).squeeze() 35 | neg_score = torch.clamp(neg_score, max=10, min=-10) 36 | neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1) 37 | 38 | return torch.mean(score + neg_score) 39 | 40 | def save_embedding(self, id2word, file_name): 41 | embedding = self.u_embeddings.weight.cpu().data.numpy() 42 | with open(file_name, 'w') as f: 43 | f.write('%d %d\n' % (len(id2word), self.emb_dimension)) 44 | for wid, w in id2word.items(): 45 | e = ' '.join(map(lambda x: str(x), embedding[wid])) 46 | f.write('%s %s\n' % (w, e)) 47 | -------------------------------------------------------------------------------- /word2vec/trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.optim as optim 3 | from torch.utils.data import DataLoader 4 | from tqdm import tqdm 5 | 6 | from word2vec.data_reader import DataReader, Word2vecDataset 7 | from word2vec.model import SkipGramModel 8 | 9 | 10 | class Word2VecTrainer: 11 | def __init__(self, input_file, output_file, emb_dimension=100, batch_size=32, window_size=5, iterations=3, 12 | initial_lr=0.001, min_count=12): 13 | 14 | self.data = DataReader(input_file, min_count) 15 | dataset = Word2vecDataset(self.data, window_size) 16 | self.dataloader = DataLoader(dataset, batch_size=batch_size, 17 | shuffle=False, num_workers=0, collate_fn=dataset.collate) 18 | 19 | self.output_file_name = output_file 20 | self.emb_size = len(self.data.word2id) 21 | self.emb_dimension = emb_dimension 22 | self.batch_size = batch_size 23 | self.iterations = iterations 24 | self.initial_lr = initial_lr 25 | self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension) 26 | 27 | self.use_cuda = torch.cuda.is_available() 28 | self.device = torch.device("cuda" if self.use_cuda else "cpu") 29 | if self.use_cuda: 30 | self.skip_gram_model.cuda() 31 | 32 | def train(self): 33 | 34 | for iteration in range(self.iterations): 35 | 36 | print("\n\n\nIteration: " + str(iteration + 1)) 37 | optimizer = optim.SparseAdam(self.skip_gram_model.parameters(), lr=self.initial_lr) 38 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(self.dataloader)) 39 | 40 | running_loss = 0.0 41 | for i, sample_batched in enumerate(tqdm(self.dataloader)): 42 | 43 | if len(sample_batched[0]) > 1: 44 | pos_u = sample_batched[0].to(self.device) 45 | pos_v = sample_batched[1].to(self.device) 46 | neg_v = sample_batched[2].to(self.device) 47 | 48 | scheduler.step() 49 | optimizer.zero_grad() 50 | loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v) 51 | loss.backward() 52 | optimizer.step() 53 | 54 | running_loss = 
--------------------------------------------------------------------------------
/word2vec/trainer.py:
--------------------------------------------------------------------------------
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

from word2vec.data_reader import DataReader, Word2vecDataset
from word2vec.model import SkipGramModel


class Word2VecTrainer:
    def __init__(self, input_file, output_file, emb_dimension=100, batch_size=32, window_size=5, iterations=3,
                 initial_lr=0.001, min_count=12):

        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                     shuffle=False, num_workers=0, collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()

    def train(self):

        for iteration in range(self.iterations):

            print("\n\n\nIteration: " + str(iteration + 1))
            optimizer = optim.SparseAdam(self.skip_gram_model.parameters(), lr=self.initial_lr)
            scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(self.dataloader))

            running_loss = 0.0
            for i, sample_batched in enumerate(tqdm(self.dataloader)):

                if len(sample_batched[0]) > 1:
                    pos_u = sample_batched[0].to(self.device)
                    pos_v = sample_batched[1].to(self.device)
                    neg_v = sample_batched[2].to(self.device)

                    optimizer.zero_grad()
                    loss = self.skip_gram_model(pos_u, pos_v, neg_v)
                    loss.backward()
                    optimizer.step()
                    # step the scheduler after the optimizer (the order required since PyTorch 1.1)
                    scheduler.step()

                    running_loss = running_loss * 0.9 + loss.item() * 0.1
                    if i > 0 and i % 500 == 0:
                        print(" Loss: " + str(running_loss))

            self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name)


if __name__ == '__main__':
    w2v = Word2VecTrainer(input_file="input.txt", output_file="out.vec")
    w2v.train()
--------------------------------------------------------------------------------
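
The `out.vec` file written by `save_embedding` uses the plain word2vec text format (a "vocab_size dim" header followed by one "word v1 v2 ..." line per word), so it can also be loaded with, for example, gensim's `KeyedVectors.load_word2vec_format`. A dependency-free sketch that parses it with NumPy and looks up nearest neighbours by cosine similarity (the helper names are made up for the example):

```python
import numpy as np

def load_vec(path):
    # parse the word2vec text format written by SkipGramModel.save_embedding
    with open(path, encoding="utf8") as f:
        vocab_size, dim = map(int, f.readline().split())
        words = []
        vecs = np.zeros((vocab_size, dim), dtype=np.float32)
        for i, line in enumerate(f):
            parts = line.rstrip().split(' ')
            words.append(parts[0])
            vecs[i] = np.asarray(parts[1:], dtype=np.float32)
    return words, vecs

def nearest(words, vecs, query, k=5):
    # cosine similarity against every row, skipping the query word itself
    unit = vecs / (np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-9)
    sims = unit @ unit[words.index(query)]
    return [(words[i], float(sims[i])) for i in np.argsort(-sims)[1:k + 1]]

words, vecs = load_vec("out.vec")
print(nearest(words, vecs, words[0]))
```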