├── README.md
├── word_segmenter.py
└── data_iterator.py


/README.md:
--------------------------------------------------------------------------------
# char2vec
Implementation of the char2vec model from http://www.aclweb.org/anthology/W/W16/W16-1603.pdf

The two Python files implement the Char2Vec model from "A Joint Model for Word Embedding and Word Morphology" by
Kris Cao and Marek Rei, 2016. To get started:

* Create a corpus
* Create a WordSegmenter object (this is the model)
* Create a data iterator
* Build and train the model
* ???
* PROFIT

(A minimal corpus sketch and an end-to-end example are given at the bottom of this README.)

## Creating a corpus
A corpus is any iterator that yields lists of tokens appearing in the same context. The Gensim Text8Corpus class is
one example: given the path to a local copy of the Text8 corpus, it yields 1000-word chunks of the corpus.

Alternatively, if you have a file where each line is a sentence, the FileIterator class in data_iterator.py does the
same job.

## Creating a WordSegmenter object
The WordSegmenter object is the implementation of the model. The `word2vec_model` argument of the constructor lets you
pass in a pretrained Gensim Word2Vec model, which speeds up building the word indexes for the data iterator and lets
you load pre-trained context vectors to help train the model. The `build_iterator` method builds a data iterator to
train the model with: read on for more information.

## Creating a data iterator
Call `build_iterator` with your preferred hyperparameters to construct a data iterator for training. For the values we
used, read the paper.

## Training the model
Call the `build` method to construct the model. If you passed a pre-trained word2vec model to the constructor, you can
also pass `learn_context_weights=False` to `build` to reuse the pre-trained word2vec context weights.

Then call the `train` method to train the model for the specified number of epochs and save a plot of the training
loss under the specified model name.

## ???
Watch the numbers go down.

## PROFIT
* The `predict` method predicts an embedding and a segmentation for any word you give it. Play around, even with OOV words!
* The `most_similar` method gives you the most similar words to your target word in the training vocabulary.
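
## Example
Concretely, the corpus contract is just "a re-iterable object that yields lists of lowercase tokens". Below is a
minimal hand-rolled sketch (the `TinyCorpus` name and sentences are made up for illustration). It is a class with
`__iter__` rather than a one-shot generator, because the corpus gets walked more than once (for vocabulary counts,
for computing the epoch length, and once per training epoch), and tokens should be lowercase a-z only, since
`word_to_indices` only maps the letters a-z.

```python
class TinyCorpus(object):
    """Any object whose __iter__ yields lists of tokens in the same context."""

    def __init__(self, sentences):
        self.sentences = sentences

    def __iter__(self):
        for sentence in self.sentences:
            yield sentence.lower().split()


corpus = TinyCorpus(['the cat sat on the mat',
                     'the dog chased the cat'])
```

Putting the steps above together, an end-to-end run might look like the sketch below. The paths and hyperparameter
values are illustrative only (not the paper's settings), and the pre-training step assumes an older Gensim release
whose Word2Vec model exposes `.vocab` and `.syn1neg`, which is what this code expects.

```python
from gensim.models.word2vec import Text8Corpus, Word2Vec

from word_segmenter import WordSegmenter

corpus = Text8Corpus('/path/to/text8')  # or FileIterator('my_corpus.txt'), or your own iterator

# Optional pre-training of the context vectors; size must be 256 to match the
# context embedding built by WordSegmenter.build().
w2v = Word2Vec(corpus, size=256, window=5, negative=5, min_count=5)

model = WordSegmenter(w2v)
model.build_iterator(corpus, n_neg=5, batch_size=128, window=5, min_count=5)
model.build(learn_context_weights=False)  # reuse the pre-trained context weights

# NB: train() runs the MEN similarity callback, which reads a hard-coded local
# path in data_iterator.MENEvaluator -- point that at your own copy of the MEN
# dev set (or drop the callback) before training.
model.train(epochs=5, model_name='char2vec_text8')

embedding, attention = model.predict('unhappiness')  # works for OOV words too
print(list(model.most_similar('king', n=10)))  # first call embeds the whole vocabulary, so it is slow
```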

--------------------------------------------------------------------------------
/word_segmenter.py:
--------------------------------------------------------------------------------
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import numpy
from scipy.spatial.distance import cosine

from keras.engine import Model, merge
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import (Flatten, Dense, Activation, Lambda)
from keras.layers import Input
from keras.layers.wrappers import TimeDistributed
import keras.backend as K


from data_iterator import (Word2VecIterator, MENEvaluator,
                           infinite_cycle, LossHistory, word_to_indices)


class WordSegmenter(object):
    def __init__(self, word2vec_model):
        self.word_vectors = None
        self.word2vec_model = word2vec_model

    def build_iterator(self, corpus, n_neg, batch_size, window, min_count):
        self.iterator = Word2VecIterator(corpus, n_neg, batch_size, window,
                                         min_count, self.word2vec_model)

    def build(self, learn_context_weights=True):
        content_forward = Input(shape=(None,), dtype='int32',
                                name='content_forward')
        content_backward = Input(shape=(None,), dtype='int32',
                                 name='content_backward')
        context = Input(shape=(1,), dtype='int32', name='context')

        if learn_context_weights:
            context_weights = None
        else:
            context_weights = [self.word2vec_model.syn1neg]
        context_embedding = Embedding(input_dim=len(self.iterator.word_index),
                                      output_dim=256, input_length=1,
                                      weights=context_weights)
        if not learn_context_weights:
            context_embedding.trainable = False
        context_flat = Flatten()(context_embedding(context))

        # 29 character indices: 0 is padding, 1-26 are a-z, 27/28 are word boundary markers
        char_embedding = Embedding(
            input_dim=29, output_dim=64, mask_zero=True)

        embed_forward = char_embedding(content_forward)
        embed_backward = char_embedding(content_backward)

        rnn_forward = LSTM(output_dim=256, return_sequences=True,
                           activation='tanh')(embed_forward)
        backwards_lstm = LSTM(output_dim=256, return_sequences=True,
                              activation='tanh', go_backwards=True)

        def reverse_tensor(inputs, mask):
            return inputs[:, ::-1, :]

        def reverse_tensor_shape(input_shapes):
            return input_shapes

        reverse = Lambda(reverse_tensor, output_shape=reverse_tensor_shape)
        reverse.supports_masking = True

        rnn_backward = reverse(backwards_lstm(embed_backward))

        rnn_bidi = TimeDistributed(Dense(output_dim=256))(
            merge([rnn_forward, rnn_backward], mode='concat'))

        attention_1 = TimeDistributed(Dense(output_dim=256,
                                            activation='tanh',
                                            bias=False))(rnn_bidi)
        attention_2 = TimeDistributed(Dense(output_dim=1,
                                            activity_regularizer='activity_l2',
                                            bias=False))(attention_1)

        def attn_merge(inputs, mask):
            vectors = inputs[0]
            logits = inputs[1]
            # Flatten the logits and take a softmax
            logits = K.squeeze(logits, axis=2)
            pre_softmax = K.switch(mask[0], logits, -numpy.inf)
            weights = K.expand_dims(K.softmax(pre_softmax))
            return K.sum(vectors * weights, axis=1)

        def attn_merge_shape(input_shapes):
            return (input_shapes[0][0], input_shapes[0][2])

        attn = Lambda(attn_merge, output_shape=attn_merge_shape)
        attn.supports_masking = True
        attn.compute_mask = lambda inputs, mask: None
        content_flat = attn([rnn_bidi, attention_2])

        output = Activation('sigmoid', name='output')(
            merge([content_flat, context_flat], mode='dot',
                  dot_axes=(1, 1)))


        model = Model(input=[content_forward, content_backward, context],
                      output=output)
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])

        inputs = [content_forward, content_backward]

        self._predict = K.function(inputs, content_flat)
        self._attention = K.function(inputs, K.squeeze(attention_2, axis=2))
        self.model = model

    def train(self, epochs, model_name):
        history = LossHistory()
        evaluation = MENEvaluator(self)

        self.model.fit_generator(iter(infinite_cycle(self.iterator)),
                                 len(self.iterator),
                                 epochs,
                                 callbacks=[history, evaluation])

        plt.figure()
        plt.plot(history.total_seen, history.losses)
        plt.ylim((0.0, 0.5))
        plt.savefig(model_name + '.png')

    def predict(self, word):
        indices = word_to_indices(word)

        # 27 and 28 are the beginning-of-word and end-of-word markers
        forward = numpy.array([[27] + indices])
        backward = numpy.array([indices + [28]])

        embedding = self._predict([forward, backward])
        attention = self._attention([forward, backward])

        return embedding, attention

    def most_similar(self, word, n=10, context=False, cutoff=None):
        if not self.word_vectors:
            self.word_vectors = [self.predict(x)[0][0] for x in self.iterator.word_index]

        distances = numpy.array(
            [cosine(y, self.word_vectors[self.iterator.word2index[word]])
             for y in self.word_vectors])

        sorted_args = numpy.argsort(distances)
        if cutoff:
            # only keep words seen at least `cutoff` times in the training corpus
            sorted_args = [arg for arg in sorted_args
                           if self.iterator.word_counts[self.iterator.word_index[arg]] >= cutoff]
        words = [self.iterator.word_index[y] for y in sorted_args]

        return zip(distances[sorted_args][:n], words[:n])

--------------------------------------------------------------------------------
/data_iterator.py:
--------------------------------------------------------------------------------
from __future__ import print_function

from collections import defaultdict
import numpy
numpy.random.seed(45)
import random
random.seed(45)

import pdb

from scipy.stats import spearmanr

import keras.backend as K
from keras import initializations, regularizers
from keras.callbacks import Callback

import theano.tensor as T


def word_to_indices(word):
    """Converts a word to a sequence of character IDs"""
    indices = [ord(letter) - ord('a') + 1 for letter in word]
    assert(max(indices) < 27)
    return indices


def infinite_cycle(iterator):
    while True:
        for item in iterator:
            yield item


class FileIterator(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        with open(self.filename, 'r') as f:
            for line in f:
                yield line.split()


class Word2VecIterator(object):
    """Reads a corpus, and spits out pairs to feed to a model"""

    def __init__(self, corpus, n_neg, batch_size, window, min_count,
                 model=None):
        self.corpus = corpus
        self.n_neg = n_neg
        self.batch_size = batch_size
        self.window = window
        self.min_buflen = 100000
        self.max_buflen = 2 * self.min_buflen

        self.min_count = min_count

        self.neg_table_size = 50000000

        self._length = 0

        self.word_counts = defaultdict(int)

        if not model:
            self.get_counts()
            self.process_vocab(self.min_count)
            self.make_neg_table()

        else:
            self.word2index = {}
            for word in model.vocab:
                self.word_counts[word] = model.vocab[word].count
                self.word2index[word] = model.vocab[word].index

            self.word_index = sorted(list(self.word2index.keys()),
                                     key=lambda x: model.vocab[x].index)
            self.make_neg_table()

    def get_counts(self):
        for line in self.corpus:
            for word in line:
                self.word_counts[word] += 1

    def process_vocab(self, min_count):
        self.word_counts = {word: self.word_counts[word] for word in self.word_counts
                            if self.word_counts[word] >= min_count}
        self.word_index = list(self.word_counts.keys())
        self.word2index = {self.word_index[i]: i for i in range(len(self.word_index))}

    def make_neg_table(self):
        # make negative sampling table
        vocab_size = len(self.word_counts)
        self.neg_table = numpy.zeros(self.neg_table_size, dtype=numpy.uint32)
        train_words_pow = float(sum(self.word_counts[word]**0.75 for word in self.word_counts))
        cumulative = 0
        table_idx = 0
        for index in range(vocab_size):
            cumulative += self.word_counts[self.word_index[index]]**0.75 / train_words_pow
            while table_idx < int(cumulative * self.neg_table_size):
                self.neg_table[table_idx] = index
                table_idx += 1

        while table_idx < self.neg_table_size:
            self.neg_table[table_idx] = vocab_size - 1
            table_idx += 1
        assert self.neg_table[-1] == vocab_size - 1

    def __len__(self):
        # account for negative samples

        if self._length == 0:
            for line in self.corpus:
                # filter the line of OOV words and subsample common words
                line = [word for word in line if word in self.word_counts]
                for pos, word in enumerate(line):
                    start = max(0, pos - self.window)
                    for pos2, word2 in enumerate(line[start:(pos + self.window)], start):
                        if pos2 != pos:
                            # account for negative samples
                            self._length += (1 + self.n_neg)

        return self._length

    def __iter__(self):
        buffer = {}
        current_key = 0

        def make_batch(batch_size):
            content_forward = []
            content_backward = []
            # contents = []
            contexts = []
            labels = []
            weights = []
            keys = list(buffer.keys())
            for i, key in enumerate(keys):
                if i < batch_size:
                    item = buffer.pop(key)
                    content_forward.append([27] + item[0])
                    content_backward.append(item[0] + [28])
                    # contents.append(item[0])
                    contexts.append(item[1])
                    labels.append(item[2])
                    weights.append(item[3])
                else:
                    break
            max_content_length = max(len(content)
                                     for content in content_forward)
            l1 = numpy.zeros((batch_size, max_content_length), dtype=numpy.int32)
            l2 = numpy.zeros((batch_size, max_content_length), dtype=numpy.int32)
            for i, (forward, backward) in enumerate(zip(content_forward,
                                                        content_backward)):
                l1[i, :len(forward)] = forward
                l2[i, :len(backward)] = backward
            # l = numpy.array(contents, dtype=numpy.int32)
            r = numpy.array(contexts, dtype=numpy.int32)
            labels = numpy.array(labels)
            weights = numpy.array(weights)
            return ([l1, l2, r],  # these are the inputs
                    labels,       # these are the targets
                    weights)      # these are the sample weights

        for line in self.corpus:
            # filter the line of OOV words and subsample common words
            line = [word for word in line
                    if word in self.word_counts]
            for pos, word in enumerate(line):
                word_chars = word_to_indices(word)
                start = max(0, pos - self.window)
                for pos2, word2 in enumerate(line[start:(pos + self.window)], start):
                    if pos2 != pos:
                        # add both words to the batch with label 1
                        buffer[current_key] = (word_chars, [self.word2index[word2]], 1., 1.)
                        current_key += 1

                        neg_samples = []
                        # add n_neg negative samples
                        while len(neg_samples) < self.n_neg:
                            word2 = self.neg_table[numpy.random.randint(0, self.neg_table_size)]
                            if word2 not in [self.word2index[word]] + neg_samples:
                                neg_samples.append(word2)
                                buffer[current_key] = (word_chars, [word2], 0., 1.)
                                current_key += 1

                if len(buffer) > self.max_buflen:
                    while len(buffer) > self.min_buflen:
                        # yield some batches to clear space in the buffer
                        yield make_batch(min(self.batch_size, len(buffer)))

        # empty the buffer when we've reached the end of the corpus
        while len(buffer) > 0:
            yield make_batch(min(self.batch_size, len(buffer)))


class LossHistory(Callback):
    def on_train_begin(self, logs={}):
        self.losses = []
        self.seen = 0
        self.total_seen = []

    def on_batch_begin(self, batch, logs={}):
        self.seen += logs.get('size')

    def on_batch_end(self, batch, logs={}):
        self.total_seen.append(self.seen)
        self.losses.append(logs.get('loss'))


class MENEvaluator(Callback):
    def __init__(self, model):
        self.wordseg_model = model

    def on_train_begin(self, logs={}):
        self.similarities = {}
        # load the MEN dev set and strip pos tags
        with open('/local/scratch/kc391/word_similarity_data/MEN/MEN_dataset_lemma_form.dev', 'r') as f:
            for line in f:
                line = line.split()
                word_pair = (line[0][:-2], line[1][:-2])
                sim = float(line[2])
                self.similarities[word_pair] = sim

    def on_epoch_end(self, epoch, logs={}):
        predicted_similarities_all = []
        gold_similarities_all = []
        predicted_similarities_res = []
        gold_similarities_res = []
        for word_pair in self.similarities.keys():
            word1_vec = self.wordseg_model.predict(word_pair[0])[0][0]
            word2_vec = self.wordseg_model.predict(word_pair[1])[0][0]

            sim = (numpy.dot(word1_vec, word2_vec.T) /
                   (numpy.linalg.norm(word1_vec) * numpy.linalg.norm(word2_vec)))

            predicted_similarities_all.append(sim)
            gold_similarities_all.append(self.similarities[word_pair])

            if all(word in self.wordseg_model.word2vec_model
                   for word in word_pair):
                predicted_similarities_res.append(sim)
                gold_similarities_res.append(self.similarities[word_pair])

        r_all = spearmanr(predicted_similarities_all, gold_similarities_all)
        r_res = spearmanr(predicted_similarities_res, gold_similarities_res)

        print("r all: {0:.4f}; r res: {1:.4f}".format(r_all[0], r_res[0]))

--------------------------------------------------------------------------------