├── .gitignore
├── README.md
├── corpus.txt
├── main.py
└── model.py

/.gitignore:
--------------------------------------------------------------------------------
### Vim ###
[._]*.s[a-w][a-z]
[._]s[a-w][a-z]
*.un~
Session.vim
.netrwhist
*~


### IPythonNotebook ###
# Temporary data
.ipynb_checkpoints/


### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Word2Vec in Tensorflow
======================

Tensorflow implementation of [Word2Vec](https://code.google.com/p/word2vec). The referenced torch code and dataset can be found [here](https://github.com/yoonkim/word2vec_torch).
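
Usage
-----

The hyperparameters live in the `config` dict at the top of `main.py`, and the input corpus defaults to the bundled `corpus.txt` (plain whitespace-tokenized text). Assuming Python 2 (the code uses `xrange`) with TensorFlow and NumPy installed, running it should be as simple as:

    $ python main.py

Training prints progress every 100k trained pairs and finishes by listing the nearest neighbors of a few query words via `get_sim_words`.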

Author
------

Taehoon Kim / [@carpedm20](http://carpedm20.github.io/)

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import os

import tensorflow as tf

from model import Word2Vec

config = {}
config['corpus'] = "corpus.txt"       # input data
config['window'] = 5                  # (maximum) window size
config['embed_size'] = 100            # dimensionality of word embeddings
config['alpha'] = 0.75                # smooth out unigram frequencies
config['table_size'] = int(1E8)       # table size from which to sample neg samples
config['neg_sample_size'] = 5         # number of negative samples for each positive sample
config['min_frequency'] = 10          # threshold for vocab frequency
config['lr'] = 0.025                  # initial learning rate
config['min_lr'] = 0.001              # min learning rate
config['epochs'] = 3                  # number of epochs to train
config['gpu'] = 0                     # 1 = use gpu, 0 = use cpu
config['stream'] = 1                  # 1 = stream from hard drive, 0 = copy to memory first

with tf.Session() as sess:
    w2v = Word2Vec(config, sess)
    w2v.build_vocab(config['corpus'])
    w2v.build_table()

    for idx in xrange(config['epochs']):
        w2v.lr = config['lr']
        w2v.train_model(config['corpus'])

    w2v.get_sim_words(['the', 'he', 'can'], 5)

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import math
import random
import numpy as np
import tensorflow as tf
from collections import Counter


class Word2Vec(object):
    def __init__(self, config, sess):
        self.sess = sess

        self.alpha = config['alpha']
        self.embed_size = config['embed_size']
        self.neg_sample_size = config['neg_sample_size']
        self.min_frequency = config['min_frequency']
        self.window = config['window']
        self.lr = config['lr']
        self.min_lr = config['min_lr']
        self.table_size = config['table_size']  # unigram table size

    def build_vocab(self, filename):
        start_time = time.time()
        with open(filename) as f:
            words = [word for line in f.readlines() for word in line.split()]
        self.total_count = len(words)
        self.counter = [['UNK', 0]]
        # keep only words whose count exceeds min_frequency; everything else
        # is mapped to the UNK token at index 0
        self.counter.extend([list(item) for item in Counter(words).most_common()
                             if item[1] > self.min_frequency])
        self.vocab_size = len(self.counter)
        word2idx = dict()
        for word, _ in self.counter:
            word2idx[word] = len(word2idx)
        data = list()
        unk_count = 0
        for word in words:
            if word in word2idx:
                idx = word2idx[word]
            else:
                idx = 0  # word2idx['UNK']
                unk_count = unk_count + 1
            data.append(idx)
        self.counter[0][1] = unk_count
        idx2word = dict(zip(word2idx.values(), word2idx.keys()))
        duration = time.time() - start_time

        print("%d words processed in %.2f seconds" % (self.total_count, duration))
        print("Vocab size after eliminating words occurring less than %d times: %d" % (self.min_frequency, self.vocab_size))

        self.data = data
        self.words = words
        self.word2idx = word2idx
        self.idx2word = idx2word

        self.decay = (self.min_lr - self.lr) / (self.total_count * self.window)
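        # per-pair targets: slot 0 is the true context word (label 1) and the
        # remaining neg_sample_size slots hold the negative samples (label 0)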
        self.labels = np.zeros([1, 1 + self.neg_sample_size], dtype=np.float32)
        self.labels[0][0] = 1
        self.contexts = np.ndarray(1 + self.neg_sample_size, dtype=np.int32)
        self.build_model()

    def build_model(self):
        self.x = tf.placeholder(tf.int32, [1], name='pos_x')
        self.y = tf.placeholder(tf.int32, [1 + self.neg_sample_size], name='pos_y')
        self.lr_t = tf.placeholder(tf.float32, [], name='lr')  # current (decayed) learning rate

        init_width = 0.5 / self.embed_size
        self.embed = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_size], -init_width, init_width), name='embed')
        self.w = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size], stddev=1.0 / math.sqrt(self.embed_size)), name='w')

        self.x_embed = tf.nn.embedding_lookup(self.embed, self.x, name='pos_embed')
        self.y_w = tf.nn.embedding_lookup(self.w, self.y, name='ctx_embed')

        # one positive and neg_sample_size negative scores for the input word
        self.mul = tf.matmul(self.x_embed, self.y_w, transpose_b=True)
        self.p = tf.nn.sigmoid(self.mul)

        # the cross-entropy op expects raw logits, not the sigmoid probabilities
        self.loss = tf.nn.softmax_cross_entropy_with_logits(self.mul, self.labels)
        self.train = tf.train.GradientDescentOptimizer(self.lr_t).minimize(self.loss)

        self.sess.run(tf.initialize_all_variables())

    def train_pair(self, word_idx, contexts):
        self.sess.run(self.train,
                      feed_dict={self.x: [word_idx], self.y: contexts, self.lr_t: self.lr})

    def build_table(self):
        start_time = time.time()
        total_count_pow = 0
        for _, count in self.counter:
            total_count_pow += math.pow(count, self.alpha)
        word_idx = 1
        self.table = np.zeros([self.table_size], dtype=np.int32)
        word_prob = math.pow(self.counter[word_idx][1], self.alpha) / total_count_pow
        # fill the unigram table so each word occupies a share of slots
        # proportional to count^alpha
        for idx in xrange(self.table_size):
            self.table[idx] = word_idx
            if idx / float(self.table_size) > word_prob:
                word_idx = min(word_idx + 1, self.vocab_size - 1)
                word_prob += math.pow(self.counter[word_idx][1], self.alpha) / total_count_pow
        print("Done in %.2f seconds." % (time.time() - start_time))

    def sample_contexts(self, context):
        self.contexts[0] = context
        # draw neg_sample_size negative words from the unigram table,
        # redrawing whenever a sample collides with the true context
        idx = 0
        while idx < self.neg_sample_size:
            neg_context = self.table[random.randrange(self.table_size)]
            if context != neg_context:
                self.contexts[idx + 1] = neg_context
                idx += 1

    def train_stream(self, filename):
        print("Training...")

        start_time = time.time()
        c = 0
        with open(filename) as f:
            words = [word for line in f.readlines() for word in line.split()]
            for idx, word in enumerate(words):
                try:
                    word_idx = self.word2idx[word]
                except KeyError:  # word was pruned from the vocabulary
                    continue
                reduced_window = random.randrange(self.window)
                for jdx in xrange(max(0, idx - reduced_window), min(len(words), idx + reduced_window + 1)):
                    if jdx == idx:
                        continue
                    try:
                        context_idx = self.word2idx[words[jdx]]
                    except KeyError:  # context word was pruned
                        continue
                    self.sample_contexts(context_idx)
                    self.train_pair(word_idx, self.contexts)
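                    # self.decay is negative, so this steps the learning rate
                    # linearly down toward min_lr as more pairs are trained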
                    self.lr = max(self.min_lr, self.lr + self.decay)
                    c += 1
                    if c % 100000 == 0:
                        loss = self.sess.run(self.loss, feed_dict={self.x: [word_idx], self.y: self.contexts})
                        print("%d words trained in %.2f seconds. Learning rate: %.4f, Loss: %.4f"
                              % (c, time.time() - start_time, self.lr, loss))

    def get_sim_words(self, idxs, k):
        # accept either raw words or vocabulary indices
        if type(idxs[0]) == str:
            idxs = np.array([self.word2idx[word] for word in idxs])
        else:
            idxs = np.array(idxs)

        # rank the whole vocabulary by cosine similarity to each query word
        embed = self.sess.run(self.embed)
        norm = embed / np.sqrt(np.sum(embed ** 2, axis=1))[:, np.newaxis]
        for i in xrange(len(idxs)):
            sims = np.dot(norm, norm[idxs[i]])
            nearest = (-sims).argsort()[1:k + 1]  # drop the query word itself
            print(self.idx2word[idxs[i]])
            print()
            for neighbor in nearest:
                print("%-20s %6.4f" % (self.idx2word[neighbor], sims[neighbor]))

    def train_model(self, corpus):
        self.train_stream(corpus)

--------------------------------------------------------------------------------