├── helper.py
└── model.py


/helper.py:
--------------------------------------------------------------------------------
"""
Helper abstract classes for WordContextModel
"""

import logging


class CorpusReader(object):
    """ CorpusReader object. Works as an iterator over longer texts.

    Parameters:
        path (str): the path to a text file
    """
    def __init__(self, path):
        self._path = path
        self._corpus_size = None

    def __iter__(self):
        with open(self._path) as fin:
            self._corpus_size = 0
            for i, line in enumerate(fin, 1):
                self._corpus_size += 1
                if not i % 1000:
                    logging.info('Done %d sequences', i)
                yield line.strip()

    @property
    def corpus_size(self):
        """ Return the corpus size (populated after a full pass) """
        return self._corpus_size

    @property
    def path(self):
        """ Return the corpus path """
        return self._path


class Model(object):
    """ Abstract class for Keras models.

    Parameters:
        loss_function (str or func): either a Keras loss function (see the
                                     online docs) or a Theano function
                                     which, given two arrays, returns a
                                     scalar
        optimizer (str or class): any of the Keras optimizers
    """
    def __init__(self, loss_function='binary_crossentropy',
                 optimizer='rmsprop'):
        self.loss_function = loss_function
        self.optimizer = optimizer

    def prepare_model(self, *args, **kwargs):
        """ Build and store the instantiated Keras model """
        raise NotImplementedError

    def save_model(self, path):
        """ Save the model to the indicated path """
        raise NotImplementedError
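

# A minimal usage sketch, assuming a plain-text corpus with one sequence
# per line at 'data/corpus.txt' (a hypothetical path):
if __name__ == '__main__':
    reader = CorpusReader('data/corpus.txt')
    for sentence in reader:
        pass  # each item is one stripped line of the corpus
    print reader.corpus_size  # only populated after a full pass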
--------------------------------------------------------------------------------


/model.py:
--------------------------------------------------------------------------------
"""
Neural Embeddings in Keras

This module emulates the functionality of word2vec with negative
sampling, written in Keras.

__authors__ = 'Dimitrios Alikaniotis'
__affiliation__ = 'University of Cambridge'
__email__ = 'da352@cam.ac.uk'
"""

import os
import logging
import cPickle as pkl

import numpy as np

from keras.models import Sequential
from keras.layers.core import Merge, Reshape, Dense
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text

from helper import CorpusReader, Model

FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=FORMAT, level=logging.DEBUG)


class WordContextModel(Model):
    """ WordContextModel implements a word2vec-like model trained with
    negative sampling. The word indices (of both the target word and
    its context) are passed to two distinct networks and the goal is to
    learn to discriminate between 'true' and 'false' contexts.

    Parameters:
        corpus_path (str): the path to a text file
        embedding_size (int): the size of the neural embeddings
    """
    def __init__(self, corpus_path, embedding_size, *args, **kwargs):
        self.corpus_path = corpus_path
        self.embedding_size = embedding_size
        self.tokenizer = None
        self.corpus = None
        self.model = None
        self._vocab_size = None
        super(WordContextModel, self).__init__(*args, **kwargs)

    @staticmethod
    def _prepare_model(vocab_size, vector_dim, loss_function,
                       optimizer):
        """ Build and compile the two-branch Keras model """
        logging.info('Building word model...')
        word = Sequential()
        word.add(Embedding(vocab_size, vector_dim, input_length=1))
        word.add(Reshape((vector_dim, 1)))

        logging.info('Building context model...')
        context = Sequential()
        context.add(Embedding(vocab_size, vector_dim, input_length=1))
        context.add(Reshape((vector_dim, 1)))

        logging.info('Building composite graph')
        model = Sequential()
        # the dot product of the two embeddings gives a single matching
        # score, which the sigmoid squashes into a probability
        model.add(Merge([word, context], mode='dot', dot_axes=1))
        model.add(Reshape((1, )))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss=loss_function, optimizer=optimizer)
        return model

    def prepare_model(self):
        """ Instantiate and store the Keras model """
        self.model = self._prepare_model(self.vocab_size,
                                         self.embedding_size,
                                         self.loss_function,
                                         self.optimizer)

    @property
    def vocab_size(self):
        """ Return the size of the vocabulary """
        if self._vocab_size is None:
            logging.error('Please tokenize the corpus first')
        return self._vocab_size

    def tokenize_corpus(self):
        """ Tokenize the corpus using the Keras helper classes """
        self.corpus = CorpusReader(self.corpus_path)
        logging.info('Tokenizing the corpus')
        self.tokenizer = text.Tokenizer()
        self.tokenizer.fit_on_texts(self.corpus)
        # Tokenizer indices start at 1, so reserve an extra slot for 0
        self._vocab_size = len(self.tokenizer.word_counts) + 1

    def train_corpus(self, negative_samples=20, window_size=4):
        """ Train the model on the given corpus

        Parameters:
            negative_samples (int): the number of 'false' contexts for
                                    each word
            window_size (int): the size of each context

        Returns:
            list: the per-batch training losses
        """
        logging.info('Initialising sampling table')
        sampling_table = sequence.make_sampling_table(self.vocab_size)
        losses = []
        for i, seq in enumerate(
                self.tokenizer.texts_to_sequences_generator(self.corpus)):
            logging.info('Training on sequence %d', i)
            couples, labels = sequence.skipgrams(
                seq, self.vocab_size, window_size=window_size,
                negative_samples=negative_samples,
                sampling_table=sampling_table)
            if couples:
                word_target, word_context = zip(*couples)
                word_target = np.array(word_target, dtype="int32")
                word_context = np.array(word_context, dtype="int32")
                labels = np.array(labels, dtype="int32")
                loss = self.model.train_on_batch([word_target, word_context],
                                                 labels)
                losses.append(loss)
        return losses

    def save_model(self, path):
        """ Save the model architecture as JSON and the weights as HDF5 """
        # to_json() returns a string; it does not write to disk itself
        with open(os.path.join(path, 'model.json'), 'w') as fout:
            fout.write(self.model.to_json())
        self.model.save_weights(os.path.join(path, 'model_weights.hdf5'))
dictionary """ 128 | with open(os.path.join(path, 'embeddings.pkl', 'wb')) as fout: 129 | pkl.dump([self.model.get_weights()[0], 130 | self.tokenizer.word_index], fout, -1) 131 | --------------------------------------------------------------------------------
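
A minimal end-to-end sketch of how the two modules fit together, assuming
a plain-text corpus at data/corpus.txt, an existing output directory out/,
and an embedding size of 100 (all three are hypothetical choices):

    import os
    import cPickle as pkl

    from model import WordContextModel

    wcm = WordContextModel('data/corpus.txt', embedding_size=100)
    wcm.tokenize_corpus()   # fits the Keras Tokenizer and sets vocab_size
    wcm.prepare_model()     # builds and compiles the two-branch graph
    losses = wcm.train_corpus(negative_samples=20, window_size=4)
    wcm.save_model('out')        # model.json + model_weights.hdf5
    wcm.save_embeddings('out')   # embeddings.pkl

    # The pickle holds [embedding_matrix, word_index]; a word's vector is
    # the row of the matrix at that word's (1-based) Tokenizer index
    with open(os.path.join('out', 'embeddings.pkl'), 'rb') as fin:
        embeddings, word_index = pkl.load(fin)
    vector = embeddings[word_index['word']]  # assumes 'word' is in the vocab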