├── helper.py
└── model.py


/helper.py:
--------------------------------------------------------------------------------
"""
Helper abstract classes for WordContextModel
"""

import logging


class CorpusReader(object):
    """ CorpusReader object. Works as an iterator over longer texts.

    Parameters:
        path (str): the path to a text file
    """
    def __init__(self, path):
        self._path = path
        self._corpus_size = None

    def __iter__(self):
        with open(self._path) as fin:
            self._corpus_size = 0
            for i, line in enumerate(fin, 1):
                self._corpus_size += 1
                if not i % 1000:
                    logging.info('Done %d sequences', i)
                yield line.strip()

    @property
    def corpus_size(self):
        """ Return the corpus size (populated after a full pass) """
        return self._corpus_size

    @property
    def path(self):
        """ Return the corpus path """
        return self._path


class Model(object):
    """ Abstract class for Keras models.

    Parameters:
        loss_function (str or func): either a Keras loss function (see the
                                     online docs) or a Theano function
                                     which, given two arrays, returns a
                                     scalar
        optimizer (str or class): any of the Keras optimizers
    """
    def __init__(self, loss_function='binary_crossentropy',
                 optimizer='rmsprop'):
        self.loss_function = loss_function
        self.optimizer = optimizer

    def prepare_model(self, *args, **kwargs):
        """ Build and store the instantiated Keras model """
        raise NotImplementedError

    def save_model(self, path):
        """ Save the model to the indicated path """
        raise NotImplementedError
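

# A minimal usage sketch, assuming a plain-text corpus with one sequence
# per line at 'data/corpus.txt' (a hypothetical path):
if __name__ == '__main__':
    reader = CorpusReader('data/corpus.txt')
    for sentence in reader:
        pass  # each item is one stripped line of the corpus
    print reader.corpus_size  # only populated after a full pass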
--------------------------------------------------------------------------------


/model.py:
--------------------------------------------------------------------------------
"""
Neural Embeddings in Keras

This module emulates the functionality of word2vec with negative
sampling, written in Keras.

__authors__ = 'Dimitrios Alikaniotis'
__affiliation__ = 'University of Cambridge'
__email__ = 'da352@cam.ac.uk'
"""

import os
import logging
import cPickle as pkl

import numpy as np

from keras.models import Sequential
from keras.layers.core import Merge, Reshape, Dense
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text

from helper import CorpusReader, Model

FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=FORMAT, level=logging.DEBUG)


class WordContextModel(Model):
    """ WordContextModel implements a word2vec-like model trained with
    negative sampling. The word indices (of both the target word and
    its context) are passed to two distinct networks and the goal is to
    learn to discriminate between 'true' and 'false' contexts.

    Parameters:
        corpus_path (str): the path to a text file
        embedding_size (int): the size of the neural embeddings
    """
    def __init__(self, corpus_path, embedding_size, *args, **kwargs):
        self.corpus_path = corpus_path
        self.embedding_size = embedding_size
        self.tokenizer = None
        self.corpus = None
        self.model = None
        self._vocab_size = None
        super(WordContextModel, self).__init__(*args, **kwargs)

    @staticmethod
    def _prepare_model(vocab_size, vector_dim, loss_function,
                       optimizer):
        """ Build and compile the two-branch Keras model """
        logging.info('Building word model...')
        word = Sequential()
        word.add(Embedding(vocab_size, vector_dim, input_length=1))
        word.add(Reshape((vector_dim, 1)))

        logging.info('Building context model...')
        context = Sequential()
        context.add(Embedding(vocab_size, vector_dim, input_length=1))
        context.add(Reshape((vector_dim, 1)))

        logging.info('Building composite graph')
        model = Sequential()
        # the dot product of the two embeddings gives a single matching
        # score, which the sigmoid squashes into a probability
        model.add(Merge([word, context], mode='dot', dot_axes=1))
        model.add(Reshape((1, )))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss=loss_function, optimizer=optimizer)
        return model

    def prepare_model(self):
        """ Instantiate and store the Keras model """
        self.model = self._prepare_model(self.vocab_size,
                                         self.embedding_size,
                                         self.loss_function,
                                         self.optimizer)

    @property
    def vocab_size(self):
        """ Return the size of the vocabulary """
        if self._vocab_size is None:
            logging.error('Please tokenize the corpus first')
        return self._vocab_size

    def tokenize_corpus(self):
        """ Tokenize the corpus using the Keras helper classes """
        self.corpus = CorpusReader(self.corpus_path)
        logging.info('Tokenizing the corpus')
        self.tokenizer = text.Tokenizer()
        self.tokenizer.fit_on_texts(self.corpus)
        # Tokenizer indices start at 1, so reserve an extra slot for 0
        self._vocab_size = len(self.tokenizer.word_counts) + 1

    def train_corpus(self, negative_samples=20, window_size=4):
        """ Train the model on the given corpus

        Parameters:
            negative_samples (int): the number of 'false' contexts for
                                    each word
            window_size (int): the size of each context

        Returns:
            list: the per-batch training losses
        """
        logging.info('Initialising sampling table')
        sampling_table = sequence.make_sampling_table(self.vocab_size)
        losses = []
        for i, seq in enumerate(
                self.tokenizer.texts_to_sequences_generator(self.corpus)):
            logging.info('Training on sequence %d', i)
            couples, labels = sequence.skipgrams(
                seq, self.vocab_size, window_size=window_size,
                negative_samples=negative_samples,
                sampling_table=sampling_table)
            if couples:
                word_target, word_context = zip(*couples)
                word_target = np.array(word_target, dtype="int32")
                word_context = np.array(word_context, dtype="int32")
                labels = np.array(labels, dtype="int32")
                loss = self.model.train_on_batch([word_target, word_context],
                                                 labels)
                losses.append(loss)
        return losses

    def save_model(self, path):
        """ Save the model architecture as JSON and the weights as HDF5 """
        # to_json() returns a string; it does not write to disk itself
        with open(os.path.join(path, 'model.json'), 'w') as fout:
            fout.write(self.model.to_json())
        self.model.save_weights(os.path.join(path, 'model_weights.hdf5'))
dictionary """ 128 | with open(os.path.join(path, 'embeddings.pkl', 'wb')) as fout: 129 | pkl.dump([self.model.get_weights()[0], 130 | self.tokenizer.word_index], fout, -1) 131 | --------------------------------------------------------------------------------
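
A minimal end-to-end sketch of how the two modules fit together, assuming
a plain-text corpus at data/corpus.txt, an existing output directory out/,
and an embedding size of 100 (all three are hypothetical choices):

    import os
    import cPickle as pkl

    from model import WordContextModel

    wcm = WordContextModel('data/corpus.txt', embedding_size=100)
    wcm.tokenize_corpus()   # fits the Keras Tokenizer and sets vocab_size
    wcm.prepare_model()     # builds and compiles the two-branch graph
    losses = wcm.train_corpus(negative_samples=20, window_size=4)
    wcm.save_model('out')        # model.json + model_weights.hdf5
    wcm.save_embeddings('out')   # embeddings.pkl

    # The pickle holds [embedding_matrix, word_index]; a word's vector is
    # the row of the matrix at that word's (1-based) Tokenizer index
    with open(os.path.join('out', 'embeddings.pkl'), 'rb') as fin:
        embeddings, word_index = pkl.load(fin)
    vector = embeddings[word_index['word']]  # assumes 'word' is in the vocab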