├── .gitignore
├── README.md
├── corpus.txt
├── main.py
└── model.py

/.gitignore:
--------------------------------------------------------------------------------
### Vim ###
[._]*.s[a-w][a-z]
[._]s[a-w][a-z]
*.un~
Session.vim
.netrwhist
*~


### IPythonNotebook ###
# Temporary data
.ipynb_checkpoints/


### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Word2Vec in Tensorflow
======================

Tensorflow implementation of [Word2Vec](https://code.google.com/p/word2vec). The referenced torch code and dataset can be found [here](https://github.com/yoonkim/word2vec_torch).
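
Usage
-----

The hyperparameters live in the `config` dict at the top of `main.py`, and the input corpus defaults to the bundled `corpus.txt` (plain whitespace-tokenized text). Assuming Python 2 (the code uses `xrange`) with TensorFlow and NumPy installed, running it should be as simple as:

    $ python main.py

Training prints progress every 100k trained pairs and finishes by listing the nearest neighbors of a few query words via `get_sim_words`.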

Author
------

Taehoon Kim / [@carpedm20](http://carpedm20.github.io/)

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import os

import tensorflow as tf

from model import Word2Vec

config = {}
config['corpus'] = "corpus.txt"       # input data
config['window'] = 5                  # (maximum) window size
config['embed_size'] = 100            # dimensionality of word embeddings
config['alpha'] = 0.75                # smooth out unigram frequencies
config['table_size'] = int(1E8)       # table size from which to sample neg samples
config['neg_sample_size'] = 5         # number of negative samples for each positive sample
config['min_frequency'] = 10          # threshold for vocab frequency
config['lr'] = 0.025                  # initial learning rate
config['min_lr'] = 0.001              # min learning rate
config['epochs'] = 3                  # number of epochs to train
config['gpu'] = 0                     # 1 = use gpu, 0 = use cpu
config['stream'] = 1                  # 1 = stream from hard drive, 0 = copy to memory first

with tf.Session() as sess:
    w2v = Word2Vec(config, sess)
    w2v.build_vocab(config['corpus'])
    w2v.build_table()

    for idx in xrange(config['epochs']):
        w2v.lr = config['lr']
        w2v.train_model(config['corpus'])

    w2v.get_sim_words(['the', 'he', 'can'], 5)

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import math
import random
import numpy as np
import tensorflow as tf
from collections import Counter


class Word2Vec(object):
    def __init__(self, config, sess):
        self.sess = sess

        self.alpha = config['alpha']
        self.embed_size = config['embed_size']
        self.neg_sample_size = config['neg_sample_size']
        self.min_frequency = config['min_frequency']
        self.window = config['window']
        self.lr = config['lr']
        self.min_lr = config['min_lr']
        self.table_size = config['table_size']  # unigram table size

    def build_vocab(self, filename):
        start_time = time.time()
        with open(filename) as f:
            words = [word for line in f.readlines() for word in line.split()]
        self.total_count = len(words)
        self.counter = [['UNK', 0]]
        # keep only words whose count exceeds min_frequency; everything else
        # is mapped to the UNK token at index 0
        self.counter.extend([list(item) for item in Counter(words).most_common()
                             if item[1] > self.min_frequency])
        self.vocab_size = len(self.counter)
        word2idx = dict()
        for word, _ in self.counter:
            word2idx[word] = len(word2idx)
        data = list()
        unk_count = 0
        for word in words:
            if word in word2idx:
                idx = word2idx[word]
            else:
                idx = 0  # word2idx['UNK']
                unk_count = unk_count + 1
            data.append(idx)
        self.counter[0][1] = unk_count
        idx2word = dict(zip(word2idx.values(), word2idx.keys()))
        duration = time.time() - start_time

        print("%d words processed in %.2f seconds" % (self.total_count, duration))
        print("Vocab size after eliminating words occurring less than %d times: %d" % (self.min_frequency, self.vocab_size))

        self.data = data
        self.words = words
        self.word2idx = word2idx
        self.idx2word = idx2word

        self.decay = (self.min_lr - self.lr) / (self.total_count * self.window)
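        # per-pair targets: slot 0 is the true context word (label 1) and the
        # remaining neg_sample_size slots hold the negative samples (label 0)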
        self.labels = np.zeros([1, 1 + self.neg_sample_size], dtype=np.float32)
        self.labels[0][0] = 1
        self.contexts = np.ndarray(1 + self.neg_sample_size, dtype=np.int32)
        self.build_model()

    def build_model(self):
        self.x = tf.placeholder(tf.int32, [1], name='pos_x')
        self.y = tf.placeholder(tf.int32, [1 + self.neg_sample_size], name='pos_y')
        self.lr_t = tf.placeholder(tf.float32, [], name='lr')  # current (decayed) learning rate

        init_width = 0.5 / self.embed_size
        self.embed = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_size], -init_width, init_width), name='embed')
        self.w = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size], stddev=1.0 / math.sqrt(self.embed_size)), name='w')

        self.x_embed = tf.nn.embedding_lookup(self.embed, self.x, name='pos_embed')
        self.y_w = tf.nn.embedding_lookup(self.w, self.y, name='ctx_embed')

        # one positive and neg_sample_size negative scores for the input word
        self.mul = tf.matmul(self.x_embed, self.y_w, transpose_b=True)
        self.p = tf.nn.sigmoid(self.mul)

        # the cross-entropy op expects raw logits, not the sigmoid probabilities
        self.loss = tf.nn.softmax_cross_entropy_with_logits(self.mul, self.labels)
        self.train = tf.train.GradientDescentOptimizer(self.lr_t).minimize(self.loss)

        self.sess.run(tf.initialize_all_variables())

    def train_pair(self, word_idx, contexts):
        self.sess.run(self.train,
                      feed_dict={self.x: [word_idx], self.y: contexts, self.lr_t: self.lr})

    def build_table(self):
        start_time = time.time()
        total_count_pow = 0
        for _, count in self.counter:
            total_count_pow += math.pow(count, self.alpha)
        word_idx = 1
        self.table = np.zeros([self.table_size], dtype=np.int32)
        word_prob = math.pow(self.counter[word_idx][1], self.alpha) / total_count_pow
        # fill the unigram table so each word occupies a share of slots
        # proportional to count^alpha
        for idx in xrange(self.table_size):
            self.table[idx] = word_idx
            if idx / float(self.table_size) > word_prob:
                word_idx = min(word_idx + 1, self.vocab_size - 1)
                word_prob += math.pow(self.counter[word_idx][1], self.alpha) / total_count_pow
        print("Done in %.2f seconds." % (time.time() - start_time))

    def sample_contexts(self, context):
        self.contexts[0] = context
        # draw neg_sample_size negative words from the unigram table,
        # redrawing whenever a sample collides with the true context
        idx = 0
        while idx < self.neg_sample_size:
            neg_context = self.table[random.randrange(self.table_size)]
            if context != neg_context:
                self.contexts[idx + 1] = neg_context
                idx += 1

    def train_stream(self, filename):
        print("Training...")

        start_time = time.time()
        c = 0
        with open(filename) as f:
            words = [word for line in f.readlines() for word in line.split()]
            for idx, word in enumerate(words):
                try:
                    word_idx = self.word2idx[word]
                except KeyError:  # word was pruned from the vocabulary
                    continue
                reduced_window = random.randrange(self.window)
                for jdx in xrange(max(0, idx - reduced_window), min(len(words), idx + reduced_window + 1)):
                    if jdx == idx:
                        continue
                    try:
                        context_idx = self.word2idx[words[jdx]]
                    except KeyError:  # context word was pruned
                        continue
                    self.sample_contexts(context_idx)
                    self.train_pair(word_idx, self.contexts)
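                    # self.decay is negative, so this steps the learning rate
                    # linearly down toward min_lr as more pairs are trained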
                    self.lr = max(self.min_lr, self.lr + self.decay)
                    c += 1
                    if c % 100000 == 0:
                        loss = self.sess.run(self.loss, feed_dict={self.x: [word_idx], self.y: self.contexts})
                        print("%d words trained in %.2f seconds. Learning rate: %.4f, Loss: %.4f"
                              % (c, time.time() - start_time, self.lr, loss))

    def get_sim_words(self, idxs, k):
        # accept either raw words or vocabulary indices
        if type(idxs[0]) == str:
            idxs = np.array([self.word2idx[word] for word in idxs])
        else:
            idxs = np.array(idxs)

        # rank the whole vocabulary by cosine similarity to each query word
        embed = self.sess.run(self.embed)
        norm = embed / np.sqrt(np.sum(embed ** 2, axis=1))[:, np.newaxis]
        for i in xrange(len(idxs)):
            sims = np.dot(norm, norm[idxs[i]])
            nearest = (-sims).argsort()[1:k + 1]  # drop the query word itself
            print(self.idx2word[idxs[i]])
            print()
            for neighbor in nearest:
                print("%-20s %6.4f" % (self.idx2word[neighbor], sims[neighbor]))

    def train_model(self, corpus):
        self.train_stream(corpus)

--------------------------------------------------------------------------------