├── README.md
├── word_segmenter.py
└── data_iterator.py


/README.md:
--------------------------------------------------------------------------------
# char2vec
Implementation of the char2vec model from http://www.aclweb.org/anthology/W/W16/W16-1603.pdf

The two Python files implement the Char2Vec model from "A Joint Model for Word Embedding and Word Morphology" by
Kris Cao and Marek Rei, 2016. To get started:

* Create a corpus
* Create a WordSegmenter object (this is the model)
* Create a data iterator
* Build and train the model
* ???
* PROFIT

(A minimal corpus sketch and an end-to-end example are given at the bottom of this README.)

## Creating a corpus
A corpus is any iterator that yields lists of tokens appearing in the same context. The Gensim Text8Corpus class is
one example: given the path to a local copy of the Text8 corpus, it yields 1000-word chunks of the corpus.

Alternatively, if you have a file where each line is a sentence, the FileIterator class in data_iterator.py does the
same job.

## Creating a WordSegmenter object
The WordSegmenter object is the implementation of the model. The `word2vec_model` argument of the constructor lets you
pass in a pretrained Gensim Word2Vec model, which speeds up building the word indexes for the data iterator and lets
you load pre-trained context vectors to help train the model. The `build_iterator` method builds a data iterator to
train the model with: read on for more information.

## Creating a data iterator
Call `build_iterator` with your preferred hyperparameters to construct a data iterator for training. For the values we
used, read the paper.

## Training the model
Call the `build` method to construct the model. If you passed a pre-trained word2vec model to the constructor, you can
also pass `learn_context_weights=False` to `build` to reuse the pre-trained word2vec context weights.

Then call the `train` method to train the model for the specified number of epochs and save a plot of the training
loss under the specified model name.

## ???
Watch the numbers go down.

## PROFIT
* The `predict` method predicts an embedding and a segmentation for any word you give it. Play around, even with OOV words!
* The `most_similar` method gives you the most similar words to your target word in the training vocabulary.
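
## Example
Concretely, the corpus contract is just "a re-iterable object that yields lists of lowercase tokens". Below is a
minimal hand-rolled sketch (the `TinyCorpus` name and sentences are made up for illustration). It is a class with
`__iter__` rather than a one-shot generator, because the corpus gets walked more than once (for vocabulary counts,
for computing the epoch length, and once per training epoch), and tokens should be lowercase a-z only, since
`word_to_indices` only maps the letters a-z.

```python
class TinyCorpus(object):
    """Any object whose __iter__ yields lists of tokens in the same context."""

    def __init__(self, sentences):
        self.sentences = sentences

    def __iter__(self):
        for sentence in self.sentences:
            yield sentence.lower().split()


corpus = TinyCorpus(['the cat sat on the mat',
                     'the dog chased the cat'])
```

Putting the steps above together, an end-to-end run might look like the sketch below. The paths and hyperparameter
values are illustrative only (not the paper's settings), and the pre-training step assumes an older Gensim release
whose Word2Vec model exposes `.vocab` and `.syn1neg`, which is what this code expects.

```python
from gensim.models.word2vec import Text8Corpus, Word2Vec

from word_segmenter import WordSegmenter

corpus = Text8Corpus('/path/to/text8')  # or FileIterator('my_corpus.txt'), or your own iterator

# Optional pre-training of the context vectors; size must be 256 to match the
# context embedding built by WordSegmenter.build().
w2v = Word2Vec(corpus, size=256, window=5, negative=5, min_count=5)

model = WordSegmenter(w2v)
model.build_iterator(corpus, n_neg=5, batch_size=128, window=5, min_count=5)
model.build(learn_context_weights=False)  # reuse the pre-trained context weights

# NB: train() runs the MEN similarity callback, which reads a hard-coded local
# path in data_iterator.MENEvaluator -- point that at your own copy of the MEN
# dev set (or drop the callback) before training.
model.train(epochs=5, model_name='char2vec_text8')

embedding, attention = model.predict('unhappiness')  # works for OOV words too
print(list(model.most_similar('king', n=10)))  # first call embeds the whole vocabulary, so it is slow
```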

--------------------------------------------------------------------------------
/word_segmenter.py:
--------------------------------------------------------------------------------
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import numpy
from scipy.spatial.distance import cosine

from keras.engine import Model, merge
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import (Flatten, Dense, Activation, Lambda)
from keras.layers import Input
from keras.layers.wrappers import TimeDistributed
import keras.backend as K


from data_iterator import (Word2VecIterator, MENEvaluator,
                           infinite_cycle, LossHistory, word_to_indices)


class WordSegmenter(object):
    def __init__(self, word2vec_model):
        self.word_vectors = None
        self.word2vec_model = word2vec_model

    def build_iterator(self, corpus, n_neg, batch_size, window, min_count):
        self.iterator = Word2VecIterator(corpus, n_neg, batch_size, window,
                                         min_count, self.word2vec_model)

    def build(self, learn_context_weights=True):
        content_forward = Input(shape=(None,), dtype='int32',
                                name='content_forward')
        content_backward = Input(shape=(None,), dtype='int32',
                                 name='content_backward')
        context = Input(shape=(1,), dtype='int32', name='context')

        if learn_context_weights:
            context_weights = None
        else:
            context_weights = [self.word2vec_model.syn1neg]
        context_embedding = Embedding(input_dim=len(self.iterator.word_index),
                                      output_dim=256, input_length=1,
                                      weights=context_weights)
        if not learn_context_weights:
            context_embedding.trainable = False
        context_flat = Flatten()(context_embedding(context))

        # 29 character indices: 0 is padding, 1-26 are a-z, 27/28 are word boundary markers
        char_embedding = Embedding(
            input_dim=29, output_dim=64, mask_zero=True)

        embed_forward = char_embedding(content_forward)
        embed_backward = char_embedding(content_backward)

        rnn_forward = LSTM(output_dim=256, return_sequences=True,
                           activation='tanh')(embed_forward)
        backwards_lstm = LSTM(output_dim=256, return_sequences=True,
                              activation='tanh', go_backwards=True)

        def reverse_tensor(inputs, mask):
            return inputs[:, ::-1, :]

        def reverse_tensor_shape(input_shapes):
            return input_shapes

        reverse = Lambda(reverse_tensor, output_shape=reverse_tensor_shape)
        reverse.supports_masking = True

        rnn_backward = reverse(backwards_lstm(embed_backward))

        rnn_bidi = TimeDistributed(Dense(output_dim=256))(
            merge([rnn_forward, rnn_backward], mode='concat'))

        attention_1 = TimeDistributed(Dense(output_dim=256,
                                            activation='tanh',
                                            bias=False))(rnn_bidi)
        attention_2 = TimeDistributed(Dense(output_dim=1,
                                            activity_regularizer='activity_l2',
                                            bias=False))(attention_1)

        def attn_merge(inputs, mask):
            vectors = inputs[0]
            logits = inputs[1]
            # Flatten the logits and take a softmax
            logits = K.squeeze(logits, axis=2)
            pre_softmax = K.switch(mask[0], logits, -numpy.inf)
            weights = K.expand_dims(K.softmax(pre_softmax))
            return K.sum(vectors * weights, axis=1)

        def attn_merge_shape(input_shapes):
            return (input_shapes[0][0], input_shapes[0][2])

        attn = Lambda(attn_merge, output_shape=attn_merge_shape)
        attn.supports_masking = True
        attn.compute_mask = lambda inputs, mask: None
        content_flat = attn([rnn_bidi, attention_2])

        output = Activation('sigmoid', name='output')(
            merge([content_flat, context_flat], mode='dot',
                  dot_axes=(1, 1)))


        model = Model(input=[content_forward, content_backward, context],
                      output=output)
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])

        inputs = [content_forward, content_backward]

        self._predict = K.function(inputs, content_flat)
        self._attention = K.function(inputs, K.squeeze(attention_2, axis=2))
        self.model = model

    def train(self, epochs, model_name):
        history = LossHistory()
        evaluation = MENEvaluator(self)

        self.model.fit_generator(iter(infinite_cycle(self.iterator)),
                                 len(self.iterator),
                                 epochs,
                                 callbacks=[history, evaluation])

        plt.figure()
        plt.plot(history.total_seen, history.losses)
        plt.ylim((0.0, 0.5))
        plt.savefig(model_name + '.png')

    def predict(self, word):
        indices = word_to_indices(word)

        # 27 and 28 are the beginning-of-word and end-of-word markers
        forward = numpy.array([[27] + indices])
        backward = numpy.array([indices + [28]])

        embedding = self._predict([forward, backward])
        attention = self._attention([forward, backward])

        return embedding, attention

    def most_similar(self, word, n=10, context=False, cutoff=None):
        if not self.word_vectors:
            self.word_vectors = [self.predict(x)[0][0] for x in self.iterator.word_index]

        distances = numpy.array(
            [cosine(y, self.word_vectors[self.iterator.word2index[word]])
             for y in self.word_vectors])

        sorted_args = numpy.argsort(distances)
        if cutoff:
            # only keep words seen at least `cutoff` times in the training corpus
            sorted_args = [arg for arg in sorted_args
                           if self.iterator.word_counts[self.iterator.word_index[arg]] >= cutoff]
        words = [self.iterator.word_index[y] for y in sorted_args]

        return zip(distances[sorted_args][:n], words[:n])

--------------------------------------------------------------------------------
/data_iterator.py:
--------------------------------------------------------------------------------
from __future__ import print_function

from collections import defaultdict
import numpy
numpy.random.seed(45)
import random
random.seed(45)

import pdb

from scipy.stats import spearmanr

import keras.backend as K
from keras import initializations, regularizers
from keras.callbacks import Callback

import theano.tensor as T


def word_to_indices(word):
    """Converts a word to a sequence of character IDs"""
    indices = [ord(letter) - ord('a') + 1 for letter in word]
    assert(max(indices) < 27)
    return indices


def infinite_cycle(iterator):
    while True:
        for item in iterator:
            yield item


class FileIterator(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        with open(self.filename, 'r') as f:
            for line in f:
                yield line.split()


class Word2VecIterator(object):
    """Reads a corpus, and spits out pairs to feed to a model"""

    def __init__(self, corpus, n_neg, batch_size, window, min_count,
                 model=None):
        self.corpus = corpus
        self.n_neg = n_neg
        self.batch_size = batch_size
        self.window = window
        self.min_buflen = 100000
        self.max_buflen = 2 * self.min_buflen

        self.min_count = min_count

        self.neg_table_size = 50000000

        self._length = 0

        self.word_counts = defaultdict(int)

        if not model:
            self.get_counts()
            self.process_vocab(self.min_count)
            self.make_neg_table()

        else:
            self.word2index = {}
            for word in model.vocab:
                self.word_counts[word] = model.vocab[word].count
                self.word2index[word] = model.vocab[word].index

            self.word_index = sorted(list(self.word2index.keys()),
                                     key=lambda x: model.vocab[x].index)
            self.make_neg_table()

    def get_counts(self):
        for line in self.corpus:
            for word in line:
                self.word_counts[word] += 1

    def process_vocab(self, min_count):
        self.word_counts = {word: self.word_counts[word] for word in self.word_counts
                            if self.word_counts[word] >= min_count}
        self.word_index = list(self.word_counts.keys())
        self.word2index = {self.word_index[i]: i for i in range(len(self.word_index))}

    def make_neg_table(self):
        # make negative sampling table
        vocab_size = len(self.word_counts)
        self.neg_table = numpy.zeros(self.neg_table_size, dtype=numpy.uint32)
        train_words_pow = float(sum(self.word_counts[word]**0.75 for word in self.word_counts))
        cumulative = 0
        table_idx = 0
        for index in range(vocab_size):
            cumulative += self.word_counts[self.word_index[index]]**0.75 / train_words_pow
            while table_idx < int(cumulative * self.neg_table_size):
                self.neg_table[table_idx] = index
                table_idx += 1

        while table_idx < self.neg_table_size:
            self.neg_table[table_idx] = vocab_size - 1
            table_idx += 1
        assert self.neg_table[-1] == vocab_size - 1

    def __len__(self):
        # account for negative samples

        if self._length == 0:
            for line in self.corpus:
                # filter the line of OOV words and subsample common words
                line = [word for word in line if word in self.word_counts]
                for pos, word in enumerate(line):
                    start = max(0, pos - self.window)
                    for pos2, word2 in enumerate(line[start:(pos + self.window)], start):
                        if pos2 != pos:
                            # account for negative samples
                            self._length += (1 + self.n_neg)

        return self._length

    def __iter__(self):
        buffer = {}
        current_key = 0

        def make_batch(batch_size):
            content_forward = []
            content_backward = []
            # contents = []
            contexts = []
            labels = []
            weights = []
            keys = list(buffer.keys())
            for i, key in enumerate(keys):
                if i < batch_size:
                    item = buffer.pop(key)
                    content_forward.append([27] + item[0])
                    content_backward.append(item[0] + [28])
                    # contents.append(item[0])
                    contexts.append(item[1])
                    labels.append(item[2])
                    weights.append(item[3])
                else:
                    break
            max_content_length = max(len(content)
                                     for content in content_forward)
            l1 = numpy.zeros((batch_size, max_content_length), dtype=numpy.int32)
            l2 = numpy.zeros((batch_size, max_content_length), dtype=numpy.int32)
            for i, (forward, backward) in enumerate(zip(content_forward,
                                                        content_backward)):
                l1[i, :len(forward)] = forward
                l2[i, :len(backward)] = backward
            # l = numpy.array(contents, dtype=numpy.int32)
            r = numpy.array(contexts, dtype=numpy.int32)
            labels = numpy.array(labels)
            weights = numpy.array(weights)
            return ([l1, l2, r],  # these are the inputs
                    labels,       # these are the targets
                    weights)      # these are the sample weights

        for line in self.corpus:
            # filter the line of OOV words and subsample common words
            line = [word for word in line
                    if word in self.word_counts]
            for pos, word in enumerate(line):
                word_chars = word_to_indices(word)
                start = max(0, pos - self.window)
                for pos2, word2 in enumerate(line[start:(pos + self.window)], start):
                    if pos2 != pos:
                        # add both words to the batch with label 1
                        buffer[current_key] = (word_chars, [self.word2index[word2]], 1., 1.)
                        current_key += 1

                        neg_samples = []
                        # add n_neg negative samples
                        while len(neg_samples) < self.n_neg:
                            word2 = self.neg_table[numpy.random.randint(0, self.neg_table_size)]
                            if word2 not in [self.word2index[word]] + neg_samples:
                                neg_samples.append(word2)
                                buffer[current_key] = (word_chars, [word2], 0., 1.)
                                current_key += 1

                if len(buffer) > self.max_buflen:
                    while len(buffer) > self.min_buflen:
                        # yield some batches to clear space in the buffer
                        yield make_batch(min(self.batch_size, len(buffer)))

        # empty the buffer when we've reached the end of the corpus
        while len(buffer) > 0:
            yield make_batch(min(self.batch_size, len(buffer)))


class LossHistory(Callback):
    def on_train_begin(self, logs={}):
        self.losses = []
        self.seen = 0
        self.total_seen = []

    def on_batch_begin(self, batch, logs={}):
        self.seen += logs.get('size')

    def on_batch_end(self, batch, logs={}):
        self.total_seen.append(self.seen)
        self.losses.append(logs.get('loss'))


class MENEvaluator(Callback):
    def __init__(self, model):
        self.wordseg_model = model

    def on_train_begin(self, logs={}):
        self.similarities = {}
        # load the MEN dev set and strip pos tags
        with open('/local/scratch/kc391/word_similarity_data/MEN/MEN_dataset_lemma_form.dev', 'r') as f:
            for line in f:
                line = line.split()
                word_pair = (line[0][:-2], line[1][:-2])
                sim = float(line[2])
                self.similarities[word_pair] = sim

    def on_epoch_end(self, epoch, logs={}):
        predicted_similarities_all = []
        gold_similarities_all = []
        predicted_similarities_res = []
        gold_similarities_res = []
        for word_pair in self.similarities.keys():
            word1_vec = self.wordseg_model.predict(word_pair[0])[0][0]
            word2_vec = self.wordseg_model.predict(word_pair[1])[0][0]

            sim = (numpy.dot(word1_vec, word2_vec.T) /
                   (numpy.linalg.norm(word1_vec) * numpy.linalg.norm(word2_vec)))

            predicted_similarities_all.append(sim)
            gold_similarities_all.append(self.similarities[word_pair])

            if all(word in self.wordseg_model.word2vec_model
                   for word in word_pair):
                predicted_similarities_res.append(sim)
                gold_similarities_res.append(self.similarities[word_pair])

        r_all = spearmanr(predicted_similarities_all, gold_similarities_all)
        r_res = spearmanr(predicted_similarities_res, gold_similarities_res)

        print("r all: {0:.4f}; r res: {1:.4f}".format(r_all[0], r_res[0]))

--------------------------------------------------------------------------------