├── CNN_text.py ├── RoB_CNN_redux.py ├── rationale_CNN.py └── rationale_CNN_2.py /CNN_text.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @author Byron Wallace 3 | A Keras implementation of CNNs for text classification. 4 | 5 | Credit for initial pass of implementation to: Cheng Guo (https://gist.github.com/entron). 6 | 7 | References 8 | -- 9 | Yoon Kim. "Convolutional Neural Networks for Sentence Classification". EMNLP 2014. 10 | Ye Zhang and Byron Wallace. "A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification". http://arxiv.org/abs/1510.03820. 11 | & also: http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/ 12 | ''' 13 | 14 | from __future__ import print_function 15 | import pdb 16 | import sys 17 | reload(sys) 18 | sys.setdefaultencoding('utf8') 19 | 20 | import numpy as np 21 | 22 | from keras.preprocessing import sequence 23 | from keras.preprocessing.sequence import pad_sequences 24 | from keras.models import Graph 25 | from keras.layers.core import Dense, Dropout, Activation, Flatten 26 | from keras.layers.embeddings import Embedding 27 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 28 | from keras.datasets import imdb 29 | from keras.utils.np_utils import accuracy 30 | from keras.preprocessing.text import text_to_word_sequence, Tokenizer 31 | from keras.callbacks import ModelCheckpoint 32 | 33 | class TextCNN: 34 | 35 | def __init__(self, preprocessor, filters=None, n_filters=100, dropout=0.0): 36 | ''' 37 | parameters 38 | --- 39 | preprocessor: an instance of the Preprocessor class, defined below 40 | ''' 41 | self.preprocessor = preprocessor 42 | 43 | if filters is None: 44 | self.ngram_filters = [3, 4, 5] 45 | else: 46 | self.ngram_filters = filters 47 | 48 | self.nb_filter = n_filters 49 | self.dropout = dropout 50 | 51 | self.build_model() # build model 52 | 53 | def train(self, X_train, y_train, X_val=None, y_val=None, 54 | nb_epoch=5, batch_size=32, optimizer='adam'): 55 | ''' 56 | Accepts an X matrix (presumably some slice of self.X) and corresponding 57 | vector of labels. May want to revisit this. 58 | 59 | X_val and y_val are to be used to validate during training. 
60 | ''' 61 | 62 | 63 | checkpointer = ModelCheckpoint(filepath="weights.hdf5", 64 | verbose=1, 65 | save_best_only=(X_val is not None)) 66 | 67 | if X_val is not None: 68 | self.model.fit({'input': X_train, 'output': y_train}, 69 | batch_size=batch_size, nb_epoch=nb_epoch, 70 | validation_data={'input': X_val, 'output': y_val}, 71 | verbose=2, callbacks=[checkpointer]) 72 | else: 73 | print("no validation data provided!") 74 | self.model.fit({'input': X_train, 'output': y_train}, 75 | batch_size=batch_size, nb_epoch=nb_epoch, 76 | verbose=2, callbacks=[checkpointer]) 77 | 78 | 79 | def predict(self, X_test, batch_size=32, binarize=False): 80 | raw_preds = self.model.predict({'input': X_test}, batch_size=batch_size)['output'] 81 | 82 | #np.array(self.model.predict({'input': X_test}, 83 | # batch_size=batch_size)['output']) 84 | if binarize: 85 | return np.round(raw_preds) 86 | return raw_preds 87 | 88 | 89 | def build_model(self): 90 | # again, credit to Cheng Guo 91 | self.model = Graph() 92 | self.model.add_input(name='input', input_shape=(self.preprocessor.maxlen,), dtype=int) 93 | 94 | #pdb.set_trace() 95 | self.model.add_node(Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 96 | input_length=self.preprocessor.maxlen, weights=self.preprocessor.init_vectors), 97 | name='embedding', input='input') 98 | self.model.add_node(Dropout(0.), name='dropout_embedding', input='embedding') 99 | for n_gram in self.ngram_filters: 100 | self.model.add_node(Convolution1D(nb_filter=self.nb_filter, 101 | filter_length=n_gram, 102 | border_mode='valid', 103 | activation='relu', 104 | subsample_length=1, 105 | input_dim=self.preprocessor.embedding_dims, 106 | input_length=self.preprocessor.maxlen), 107 | name='conv_' + str(n_gram), 108 | input='dropout_embedding') 109 | self.model.add_node(MaxPooling1D(pool_length=self.preprocessor.maxlen - n_gram + 1), 110 | name='maxpool_' + str(n_gram), 111 | input='conv_' + str(n_gram)) 112 | self.model.add_node(Flatten(), 113 | name='flat_' + str(n_gram), 114 | input='maxpool_' + str(n_gram)) 115 | self.model.add_node(Dropout(self.dropout), name='dropout', inputs=['flat_' + str(n) for n in self.ngram_filters]) 116 | self.model.add_node(Dense(1, input_dim=self.nb_filter * len(self.ngram_filters)), 117 | name='dense', input='dropout') 118 | self.model.add_node(Activation('sigmoid'), name='sigmoid', input='dense') 119 | self.model.add_output(name='output', input='sigmoid') 120 | print("model built") 121 | print(self.model.summary()) 122 | self.model.compile(loss={'output': 'binary_crossentropy'}, 123 | optimizer="adam")#optimizer) 124 | 125 | class Preprocessor: 126 | def __init__(self, max_features, maxlen, embedding_dims=200, wvs=None): 127 | ''' 128 | max_features: the upper bound to be placed on the vocabulary size. 129 | maxlen: the maximum length (in terms of tokens) of the instances/texts. 130 | embedding_dims: size of the token embeddings; over-ridden if pre-trained 131 | vectors is provided (if wvs is not None). 132 | ''' 133 | 134 | self.max_features = max_features 135 | self.tokenizer = Tokenizer(nb_words=self.max_features) 136 | self.maxlen = maxlen 137 | 138 | self.use_pretrained_embeddings = False 139 | self.init_vectors = None 140 | if wvs is None: 141 | self.embedding_dims = embedding_dims 142 | else: 143 | # note that these are only for initialization; 144 | # they will be tuned! 
145 | self.use_pretrained_embeddings = True 146 | self.embedding_dims = wvs.vector_size 147 | self.word_embeddings = wvs 148 | 149 | 150 | def preprocess(self, all_texts): 151 | ''' 152 | This fits tokenizer and builds up input vectors (X) from the list 153 | of texts in all_texts. Needs to be called before train! 154 | ''' 155 | self.raw_texts = all_texts 156 | #self.build_sequences() 157 | self.fit_tokenizer() 158 | if self.use_pretrained_embeddings: 159 | self.init_word_vectors() 160 | 161 | def fit_tokenizer(self): 162 | ''' Fits tokenizer to all raw texts; remembers indices->words mappings. ''' 163 | self.tokenizer.fit_on_texts(self.raw_texts) 164 | self.word_indices_to_words = {} 165 | for token, idx in self.tokenizer.word_index.items(): 166 | self.word_indices_to_words[idx] = token 167 | 168 | def build_sequences(self, texts): 169 | X = list(self.tokenizer.texts_to_sequences_generator(texts)) 170 | X = np.array(pad_sequences(X, maxlen=self.maxlen)) 171 | return X 172 | 173 | def init_word_vectors(self): 174 | ''' 175 | Initialize word vectors. 176 | ''' 177 | self.init_vectors = [] 178 | unknown_words_to_vecs = {} 179 | for t, token_idx in self.tokenizer.word_index.items(): 180 | if token_idx <= self.max_features: 181 | try: 182 | self.init_vectors.append(self.word_embeddings[t]) 183 | except: 184 | if t not in unknown_words_to_vecs: 185 | # randomly initialize 186 | unknown_words_to_vecs[t] = np.random.random( 187 | self.embedding_dims)*-2 + 1 188 | 189 | self.init_vectors.append(unknown_words_to_vecs[t]) 190 | 191 | # note that we make this a singleton list because that's 192 | # what Keras wants. 193 | self.init_vectors = [np.vstack(self.init_vectors)] 194 | 195 | -------------------------------------------------------------------------------- /RoB_CNN_redux.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import os 4 | csv.field_size_limit(sys.maxsize) 5 | 6 | import sklearn 7 | from sklearn.metrics import accuracy_score 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | import gensim 13 | from gensim.models import Word2Vec 14 | 15 | import CNN_text 16 | 17 | 18 | def load_trained_w2v_model(path="/Users/byron/dev/Deep-PICO/PubMed-w2v.bin"): 19 | m = Word2Vec.load_word2vec_format(path, binary=True) 20 | return m 21 | 22 | 23 | def read_RoB_data(path="RoB-data/train-Xy-Random-sequence-generation.txt", 24 | y_tuples=False, zero_one=True): 25 | ''' 26 | Assumes data is in CSV with label as second entry. 
27 |     '''
28 |     raw_texts, y = [], []
29 |     with open(path) as input_file:
30 |         rows = csv.reader(input_file)
31 |         for row in rows:
32 |             doc_text, lbl = row
33 |             raw_texts.append(doc_text)
34 |             cur_y = int(lbl)
35 |             if y_tuples:
36 |                 if cur_y > 0:
37 |                     y.append(np.array([0,1]))
38 |                 else:
39 |                     y.append(np.array([1,0]))
40 |             else:
41 |                 if cur_y < 1:
42 |                     if zero_one:
43 |                         y.append(0)
44 |                     else:
45 |                         y.append(-1)
46 | 
47 |                 else:
48 |                     y.append(1)
49 | 
50 |     return raw_texts, y
51 | 
52 | 
53 | 
54 | def RoB_CNN(total_epochs=60, weights_file=None):
55 |     train_docs, y_train = read_RoB_data(path="RoB-data/train-Xy-Random-sequence-generation.txt",
56 |                                         y_tuples=False)
57 | 
58 |     test_docs, y_test = read_RoB_data(path="RoB-data/test-Xy-Random-sequence-generation.txt",
59 |                                       y_tuples=False)
60 | 
61 | 
62 |     train_docs = train_docs#[:500]
63 |     y_train = y_train#[:500]
64 | 
65 |     wvs = load_trained_w2v_model()
66 |     # preprocessor for texts
67 | 
68 |     # then the CNN
69 |     p = CNN_text.Preprocessor(max_features=10000, maxlen=5000, wvs=wvs)
70 |     all_docs = train_docs + test_docs
71 | 
72 |     print("preprocessing...")
73 |     p.preprocess(all_docs)
74 |     train_X = p.build_sequences(train_docs)
75 |     test_X = p.build_sequences(test_docs)
76 | 
77 | 
78 |     cnn = CNN_text.TextCNN(p, filters=[2,3,5], n_filters=100, dropout=0.0)
79 | 
80 |     # write the model out
81 |     json_string = cnn.model.to_json()
82 |     open('RoB_model_architecture.json', 'w').write(json_string)
83 |     print("dumped model!")
84 | 
85 |     if weights_file:
86 |         cnn.model.load_weights(weights_file)
87 | 
88 |     epochs_per_iter = 10
89 |     epochs_so_far = 0
90 |     while epochs_so_far < total_epochs:
91 |         cnn.train(train_X, y_train, nb_epoch=epochs_per_iter)#, X_val=test_X, y_val=y_test)
92 |         epochs_so_far += epochs_per_iter
93 | 
94 |         yhat = cnn.predict(test_X, binarize=True)
95 |         #import pdb; pdb.set_trace()
96 |         print("acc @ epoch %s: %s" % (epochs_so_far, accuracy_score(y_test, yhat)))
97 | 
98 |     #cnn.initialize_sequences_and_vocab(all_docs)
99 |     #cnn.train(X_train, y_train, X_val=None, y_val=None
100 | 
101 | 
102 | # note that on TACC you need:
103 | # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/apps/intel14/hdf5/1.8.12/x86_64/lib/
104 | if __name__ == '__main__':
105 |     RoB_CNN()
106 | 
--------------------------------------------------------------------------------
/rationale_CNN.py:
--------------------------------------------------------------------------------
1 | '''
2 | @author Byron Wallace
3 | A Keras implementation of our "rationale augmented CNN" (https://arxiv.org/abs/1605.04469). Please note that
4 | the model was originally implemented in Theano -- this version is a work in progress.
5 | 
6 | Credit for initial pass of basic CNN implementation to: Cheng Guo (https://gist.github.com/entron).
7 | 
8 | References
9 | --
10 | Ye Zhang, Iain J. Marshall and Byron C. Wallace. "Rationale-Augmented Convolutional Neural Networks for Text Classification". http://arxiv.org/abs/1605.04469
11 | Yoon Kim. "Convolutional Neural Networks for Sentence Classification". EMNLP 2014.
12 | Ye Zhang and Byron Wallace. "A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification". http://arxiv.org/abs/1510.03820.
13 | & also: http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/ 14 | ''' 15 | 16 | from __future__ import print_function 17 | import pdb 18 | import sys 19 | import random 20 | reload(sys) 21 | sys.setdefaultencoding('utf8') 22 | 23 | import numpy as np 24 | 25 | from keras.preprocessing import sequence 26 | from keras.engine.topology import Layer 27 | from keras.preprocessing.sequence import pad_sequences 28 | from keras.models import Graph 29 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Merge, Reshape, Permute, Lambda 30 | from keras.layers.embeddings import Embedding 31 | from keras.layers.convolutional import Convolution1D, Convolution2D, MaxPooling1D, MaxPooling2D 32 | from keras.datasets import imdb 33 | from keras.utils.np_utils import accuracy 34 | from keras.preprocessing.text import text_to_word_sequence, Tokenizer 35 | from keras.callbacks import ModelCheckpoint 36 | 37 | ## 38 | from keras.layers import Input, Embedding, Dense, merge 39 | from keras.models import Model, Sequential 40 | 41 | from keras import backend as K 42 | 43 | class SentenceConvolutionLayer(Layer): 44 | # input shape: (sentences, max_tokens) 45 | 46 | def __init__(self): 47 | pass 48 | 49 | class WeightedSumSentenceVector2(Layer): 50 | # input shape: (sentences, sentence_embedding_dim) 51 | # # documents[0].sentence_sequences 52 | # output shape: (1, sentence_embedding_dim) 53 | 54 | def __init__(self, sentence_model, **kwargs): 55 | #self.sentence_cnn = sentence_cnn 56 | #self.sentence_model = sentence_model 57 | pass 58 | 59 | class WeightedSumSentenceVector(Layer): 60 | # input shape: (sentences, sentence_embedding_dim) 61 | # # documents[0].sentence_sequences 62 | # output shape: (1, sentence_embedding_dim) 63 | 64 | def __init__(self, sentence_model, **kwargs): 65 | #self.sentence_cnn = sentence_cnn 66 | self.sentence_model = sentence_model 67 | 68 | 69 | super(WeightedSumSentenceVector, self).__init__(**kwargs) 70 | 71 | def get_output_shape_for(self, input_shape): 72 | assert len(shape) == 2 73 | return (1, input_shape[1]) 74 | 75 | def call(self, X, mask=None): 76 | ''' 77 | X assumed to be a document, with each row a list of 78 | token indicators in the corresponding sentences. 
79 | ''' 80 | conv_f = K.function( 81 | [self.sentence_model.layers[0].input, K.learning_phase()], 82 | [self.sentence_model.layers[-2].output]) 83 | 84 | import pdb; pdb.set_trace() 85 | sentence_vectors = conv_f([X,1])[0] 86 | 87 | sentence_predictions = sentence_model.predict([test_sentences]) 88 | weights = np.amax(sentence_predictions[:,0:2],axis=1) 89 | 90 | return np.matrix(np.dot(weights, vecs)) 91 | 92 | def build(self, input_shape): 93 | 94 | #input_dim = input_shape[1] 95 | #initial_weight_value = np.random.random((input_dim, output_dim)) 96 | #self.W = K.variable(initial_weight_value) 97 | #self.trainable_weights = [self.W] 98 | self.trainable_weights = self.sentence_model.trainable_weights 99 | 100 | class RationaleCNN: 101 | 102 | def __init__(self, preprocessor, filters=None, n_filters=100, dropout=0.0): 103 | ''' 104 | parameters 105 | --- 106 | preprocessor: an instance of the Preprocessor class, defined below 107 | ''' 108 | self.preprocessor = preprocessor 109 | 110 | if filters is None: 111 | self.ngram_filters = [3, 4, 5] 112 | else: 113 | self.ngram_filters = filters 114 | 115 | self.nb_filter = n_filters 116 | self.dropout = dropout 117 | self.sentence_model_trained = False 118 | 119 | #self.build_model() # build model 120 | #self.train_sentence_model() 121 | 122 | @staticmethod 123 | def weighted_sum(X): 124 | # @TODO.. add sentence preds! 125 | return K.sum(X, axis=0) # I *think* axis 0 is correct... 126 | 127 | @staticmethod 128 | def weighted_sum_output_shape(input_shape): 129 | # expects something like (None, max_doc_len, num_features) 130 | shape = list(input_shape) 131 | #assert len(shape) == 2 # not sure if correct... 132 | #print len(shape) 133 | print("shape: %s" % shape) 134 | # (1 x num_features) 135 | return tuple((1, shape[-1])) 136 | 137 | @staticmethod 138 | def balanced_sample(X, y): 139 | _, pos_rationale_indices = np.where([y[:,0] > 0]) 140 | _, neg_rationale_indices = np.where([y[:,1] > 0]) 141 | _, non_rationale_indices = np.where([y[:,2] > 0]) 142 | 143 | # sample a number of non-rationales equal to the total 144 | # number of pos/neg rationales. 145 | m = pos_rationale_indices.shape[0] + neg_rationale_indices.shape[0] 146 | sampled_non_rationale_indices = np.array(random.sample(non_rationale_indices, m)) 147 | 148 | train_indices = np.concatenate([pos_rationale_indices, neg_rationale_indices, sampled_non_rationale_indices]) 149 | np.random.shuffle(train_indices) # why not 150 | return X[train_indices,:], y[train_indices] 151 | 152 | # r_CNN.sentence_model.predict(X[:10], batch_size=128) 153 | def train_sentence_model(self, train_documents, nb_epoch=5, downsample=True, batch_size=128, optimizer='adam'): 154 | # assumes sentence sequences have been generated! 155 | assert(train_documents[0].sentence_sequences is not None) 156 | 157 | X, y= [], [] 158 | # flatten sentences/sentence labels 159 | for d in train_documents: 160 | X.extend(d.sentence_sequences) 161 | y.extend(d.sentences_y) 162 | 163 | # @TODO sub-sample magic? 164 | X, y = np.asarray(X), np.asarray(y) 165 | 166 | # downsample 167 | if downsample: 168 | X, y = RationaleCNN.balanced_sample(X, y) 169 | 170 | #self.train(X[:1000], y[:1000]) 171 | self.train(X, y) 172 | 173 | self.sentence_model_trained = True 174 | 175 | 176 | def train(self, X_train, y_train, X_val=None, y_val=None, 177 | nb_epoch=5, batch_size=32, optimizer='adam'): 178 | ''' 179 | Accepts an X matrix (presumably some slice of self.X) and corresponding 180 | vector of labels. May want to revisit this. 
181 | 182 | X_val and y_val are to be used to validate during training. 183 | ''' 184 | checkpointer = ModelCheckpoint(filepath="weights.hdf5", 185 | verbose=1, 186 | save_best_only=(X_val is not None)) 187 | 188 | if X_val is not None: 189 | self.sentence_model.fit({'input': X_train, 'output': y_train}, 190 | batch_size=batch_size, nb_epoch=nb_epoch, 191 | validation_data={'input': X_val, 'output': y_val}, 192 | verbose=2, callbacks=[checkpointer]) 193 | else: 194 | print("no validation data provided!") 195 | #self.sentence_model.fit({'input': X_train, 'output': y_train}, 196 | # batch_size=batch_size, nb_epoch=nb_epoch, 197 | # verbose=2, callbacks=[checkpointer]) 198 | self.sentence_model.fit(X_train, y_train, 199 | batch_size=batch_size, nb_epoch=nb_epoch, 200 | verbose=2, callbacks=[checkpointer]) 201 | 202 | 203 | ''' 204 | def predict(self, X_test, batch_size=32, binarize=False): 205 | raw_preds = self.model.predict({'input': X_test}, batch_size=batch_size)['output'] 206 | 207 | #np.array(self.model.predict({'input': X_test}, 208 | # batch_size=batch_size)['output']) 209 | if binarize: 210 | return np.round(raw_preds) 211 | return raw_preds 212 | ''' 213 | 214 | 215 | def build_sentence_model(self): 216 | ''' 217 | Build the *sentence* level model, which operates over, erm, sentences. 218 | The task is to predict which sentences are pos/neg rationales. 219 | ''' 220 | tokens_input = Input(name='input', shape=(self.preprocessor.max_sent_len,), dtype='int32') 221 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 222 | input_length=self.preprocessor.max_sent_len, 223 | weights=self.preprocessor.init_vectors)(tokens_input) 224 | 225 | x = Dropout(0.1)(x) 226 | 227 | convolutions = [] 228 | for n_gram in self.ngram_filters: 229 | cur_conv = Convolution1D(nb_filter=self.nb_filter, 230 | filter_length=n_gram, 231 | border_mode='valid', 232 | activation='relu', 233 | subsample_length=1, 234 | input_dim=self.preprocessor.embedding_dims, 235 | input_length=self.preprocessor.max_sent_len)(x) 236 | # pool 237 | one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 238 | flattened = Flatten()(one_max) 239 | convolutions.append(flattened) 240 | 241 | sentence_vector = merge(convolutions, name="sentence_vector") # hang on to this layer! 242 | output = Dense(3, activation="softmax")(sentence_vector) 243 | 244 | self.sentence_model = Model(input=tokens_input, output=output) 245 | print("model built") 246 | print(self.sentence_model.summary()) 247 | self.sentence_model.compile(loss='categorical_crossentropy', optimizer="adam") 248 | 249 | self.sentence_embedding_dim = self.sentence_model.layers[-2].output_shape[1] 250 | 251 | return self.sentence_model 252 | 253 | 254 | 255 | 256 | def build_doc_model_fixed(self): 257 | # no magic here. 
258 | #input_layer = Dense(1, batch_input_shape=(None, self.sentence_embedding_dim))#input_shape=(self.sentence_embedding_dim, )) 259 | #output_layer = Activation('sigmoid')(input_layer) 260 | 261 | self.document_model = Sequential() 262 | self.document_model.add(Dense(1, input_dim=self.sentence_embedding_dim)) 263 | self.document_model.add(Activation("sigmoid")) 264 | 265 | #self.document_model = Model(input=tokens_input, output=output) 266 | self.document_model.compile(loss='binary_crossentropy', optimizer="adam") 267 | 268 | 269 | def train_doc_model_fixed(self, train_documents): 270 | conv_f = K.function( 271 | [self.sentence_model.layers[0].input, K.learning_phase()], 272 | [self.sentence_model.layers[-2].output]) 273 | 274 | X, y = [], [] 275 | 276 | for d in train_documents: 277 | sentence_vectors = np.matrix([conv_f([np.matrix(sent_seq),1])[0][0] for 278 | sent_seq in d.sentence_sequences]) 279 | 280 | #sentence_predictions = self.sentence_model.predict(d.sentence_sequences) 281 | sentence_predictions = self.sentence_model.predict(d.sentence_sequences) 282 | weights = np.amax(sentence_predictions[:,0:2],axis=1) 283 | weighted = np.dot(weights, sentence_vectors) 284 | X.append(weighted) 285 | y.append(d.doc_y) 286 | #train_sequences = 287 | 288 | X = np.vstack(X) 289 | y = np.array(y) 290 | #import pdb; pdb.set_trace() 291 | self.document_model.fit(X, y) 292 | 293 | 294 | #return np.matrix(np.dot(weights, vecs)) 295 | 296 | def train_document_model(self, train_documents, 297 | nb_epoch=5, downsample=True, 298 | batch_size=128, optimizer='adam'): 299 | # assumes sentence sequences have been generated! 300 | assert(train_documents[0].sentence_sequences is not None) 301 | 302 | X, y= [], [] 303 | # flatten sentences/sentence labels 304 | for d in train_documents: 305 | X.extend(d.sentence_sequences) 306 | y.extend(d.sentences_y) 307 | 308 | # @TODO sub-sample magic? 309 | X, y = np.asarray(X), np.asarray(y) 310 | 311 | # downsample 312 | if downsample: 313 | X, y = RationaleCNN.balanced_sample(X, y) 314 | 315 | #self.train(X[:1000], y[:1000]) 316 | self.train(X, y) 317 | 318 | self.sentence_model_trained = True 319 | 320 | 321 | def build_doc_model_concat(self): 322 | # the idea is here is to concatenate the sentence inputs; so represent each document 323 | # by one very long row 324 | doc_len = self.preprocessor.max_sent_len * self.preprocessor.max_doc_len 325 | tokens_input = Input(name='input', 326 | shape=(doc_len,), dtype='int32') 327 | 328 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 329 | input_length=doc_len, 330 | weights=self.preprocessor.init_vectors)(tokens_input) 331 | 332 | 333 | def build_sequential_doc_model(self): 334 | #self.document_model = Sequential() 335 | m = Sequential() 336 | 337 | # input layer. 
this is a matrix with dimensions: 338 | # (max_doc_length x max_sent_length) 339 | # 340 | m.add(Dense(100, input_shape=(p.max_sent_len,))) 341 | 342 | #pass 343 | 344 | def build_doc_model3(self): 345 | model = Sequential() 346 | 347 | # 32 is just n_filters; 1 is n_gram 348 | nb_feature_maps = n_filters = 32 349 | 350 | maxlen = self.preprocessor.max_sent_len 351 | 352 | conv_filters = [] 353 | for n_gram in self.ngram_filters: 354 | sequential = Sequential() 355 | conv_filters.append(sequential) 356 | 357 | sequential.add(Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims)) 358 | sequential.add(Reshape(1, maxlen, self.preprocessor.embedding_dims)) 359 | sequential.add(Convolution2D(nb_feature_maps, 1, n_gram, self.preprocessor.embedding_dims)) 360 | sequential.add(Activation("relu")) 361 | sequential.add(MaxPooling2D(poolsize=(maxlen - n_gram + 1, 1))) 362 | sequential.add(Flatten()) 363 | 364 | model = Sequential() 365 | model.add(Merge(conv_filters, mode='concat')) 366 | model.add(Dropout(0.5)) 367 | model.add(Dense(nb_feature_maps * len(conv_filters), 1)) 368 | model.add(Activation("sigmoid")) 369 | 370 | ''' 371 | convolutions = [] 372 | for n_gram in self.ngram_filters: 373 | cur_conv = Convolution2D(n_filters, 1, n_gram, 374 | input_shape=(1, p.max_doc_len, p.max_sent_len), 375 | activation='relu', border_mode='valid') 376 | 377 | #Convolution1D(nb_filter=self.nb_filter, 378 | # filter_length=n_gram, 379 | # border_mode='valid', 380 | # activation='relu', 381 | # subsample_length=1, 382 | # input_dim=self.preprocessor.embedding_dims, 383 | # input_length=self.preprocessor.max_sent_len)(x) 384 | # pool 385 | one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 386 | flattened = Flatten()(one_max) 387 | convolutions.append(flattened) 388 | 389 | ''' 390 | 391 | #model.add( 392 | # Convolution2D(n_filters, 1, n_gram, 393 | # input_shape=(1, p.max_doc_len, p.max_sent_len)) 394 | 395 | # get vectors for each sentence 396 | #MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1) 397 | 398 | 399 | 400 | #one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 401 | 402 | ''' 403 | document_input = Input(name='input', 404 | shape=(None, self.preprocessor.max_doc_len, 405 | self.preprocessor.max_sent_len), dtype='int32') 406 | 407 | # filter, nb_rows, nb_cols 408 | n_gram = 1 409 | cur_conv = Convolution2D(32, 410 | n_gram, self.preprocessor.embedding_dims, 411 | 412 | activation='relu', 413 | # samples, channels, rows, cols 414 | input_shape=(1, 415 | self.preprocessor.max_doc_len, 416 | self.preprocessor.embedding_dims, 417 | ))(document_input) 418 | ''' 419 | def build_doc_model2(self): 420 | document_input = Input(name='input', 421 | shape=(self.preprocessor.max_doc_len, 422 | self.preprocessor.max_sent_len,), dtype='int32') 423 | 424 | document_vector = WeightedSumSentenceVector(self.sentence_model)(document_input) 425 | 426 | # sentence_vectors = 427 | 428 | # 429 | #conv_f = K.function([self.sentence_model.layers[0].input, K.learning_phase()], 430 | # [self.sentence_model.layers[-2].output]) 431 | # test_sent.shape 432 | # (1,50) ### this is the list of token indices! 433 | # sentence_v = conv_f([test_sent,1])[0] 434 | 435 | ''' 436 | Re-construct the (start of) the *sentence* level model, which operates over, erm, sentences. 437 | The task is to predict which sentences are pos/neg rationales. 
438 | ''' 439 | # 440 | 441 | 442 | 443 | ''' 444 | tokens_input = Input(name='input', shape=(self.preprocessor.max_sent_len,), dtype='int32') 445 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 446 | input_length=self.preprocessor.max_sent_len, 447 | weights=self.preprocessor.init_vectors)(tokens_input) 448 | 449 | x = Dropout(0.1)(x) 450 | 451 | convolutions = [] 452 | for n_gram in self.ngram_filters: 453 | cur_conv = Convolution1D(nb_filter=self.nb_filter, 454 | filter_length=n_gram, 455 | border_mode='valid', 456 | activation='relu', 457 | subsample_length=1, 458 | input_dim=self.preprocessor.embedding_dims, 459 | input_length=self.preprocessor.max_sent_len)(x) 460 | # pool 461 | one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 462 | flattened = Flatten()(one_max) 463 | convolutions.append(flattened) 464 | 465 | sentence_vector = merge(convolutions, name="sentence_vector") # hang on to this layer! 466 | ''' 467 | 468 | # ok initialize each layer with parameters! 469 | 470 | 471 | ### 472 | # 473 | ''' 474 | output = Dense(3, activation="softmax")(self.penultimate_layer) 475 | 476 | self.sentence_model = Model(input=tokens_input, output=output) 477 | ''' 478 | 479 | 480 | ''' 481 | In [137]: model.summary() 482 | ____________________________________________________________________________________________________ 483 | Layer (type) Output Shape Param # Connected to 484 | ==================================================================================================== 485 | input (InputLayer) (None, 500, 50) 0 486 | ____________________________________________________________________________________________________ 487 | reshape_16 (Reshape) (None, 25000) 0 input[0][0] 488 | ____________________________________________________________________________________________________ 489 | embedding_12 (Embedding) (None, 25000, 200) 2000000 reshape_16[0][0] 490 | ____________________________________________________________________________________________________ 491 | reshape_17 (Reshape) (None, 500, 10000) 0 embedding_12[0][0] 492 | ____________________________________________________________________________________________________ 493 | reshape_18 (Reshape) (None, 1, 500, 100000 reshape_17[0][0] 494 | ____________________________________________________________________________________________________ 495 | convolution2d_4 (Convolution2D) (None, 32, 500, 50) 6432 reshape_18[0][0] 496 | ____________________________________________________________________________________________________ 497 | maxpooling2d_1 (MaxPooling2D) (None, 32, 500, 1) 0 convolution2d_4[0][0] 498 | ____________________________________________________________________________________________________ 499 | permute_2 (Permute) (None, 1, 500, 32) 0 maxpooling2d_1[0][0] 500 | ____________________________________________________________________________________________________ 501 | reshape_19 (Reshape) (None, 500, 32) 0 permute_2[0][0] 502 | ===================================================================================== 503 | ''' 504 | def build_doc_model_clean(self, n_filters=32): 505 | # input dim is (max_doc_len x max_sent_len) -- eliding the batch size 506 | tokens_input = Input(name='input', 507 | shape=(self.preprocessor.max_doc_len, self.preprocessor.max_sent_len), 508 | dtype='int32') 509 | # flatten; create a very wide matrix to hand to embedding layer 510 | tokens_reshaped = 
Reshape([self.preprocessor.max_doc_len*self.preprocessor.max_sent_len])(tokens_input)
511 |         # embed the tokens; output will be (p.max_doc_len*p.max_sent_len x embedding_dims)
512 |         x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 
513 |                       weights=self.preprocessor.init_vectors)(tokens_reshaped)
514 | 
515 |         # reshape to preserve document structure; each doc will now be
516 |         # a row in this matrix
517 |         x = Reshape((1, self.preprocessor.max_doc_len, 
518 |                      self.preprocessor.max_sent_len*self.preprocessor.embedding_dims))(x)
519 | 
520 |         #x = Reshape((1, p.max_doc_len, p.max_sent_len*p.embedding_dims))(x)
521 | 
522 |         x = Dropout(0.1)(x)
523 | 
524 |         ####
525 |         # @TODO wrap in loop to include all n_grams!
526 |         n_gram = 1 # tmp
527 | 
528 | 
529 |         cur_conv = Convolution2D(n_filters, 1, 
530 |                                  n_gram*self.preprocessor.embedding_dims, 
531 |                                  subsample=(1, self.preprocessor.embedding_dims))(x)
532 |         # model = Model(input=tokens_input, output=cur_conv)
533 | 
534 |         # this outputs (n_filters x max_doc_len x 1)
535 |         one_max = MaxPooling2D(pool_size=(1, self.preprocessor.max_sent_len - n_gram + 1))(cur_conv)
536 |         # flip around, to get (1 x max_doc_len x n_filters)
537 |         permuted = Permute((3,2,1)) (one_max)
538 |         # drop extra dimension
539 |         r = Reshape((self.preprocessor.max_doc_len, n_filters))(permuted)
540 |         # now we want to average the sentence vectors!
541 |         x_doc = Lambda(RationaleCNN.weighted_sum, 
542 |                        output_shape=RationaleCNN.weighted_sum_output_shape)(r)
543 | 
544 |         # finally, the sigmoid layer for classification
545 |         y_hat = Dense(1, activation="sigmoid")(x_doc)
546 |         model = Model(input=tokens_input, output=y_hat)
547 |         return model
548 |         #model.summary()
549 | 
550 |     def build_doc_model(self):
551 |         '''
552 |         Builds the *document* level model, which uses the sentence level model to inform
553 |         its predictions.
554 | ''' 555 | #tokens_input = Input(name='input', shape=(None, 556 | # self.preprocessor.max_doc_len, 557 | # self.preprocessor.max_sent_len), dtype='int32') 558 | tokens_input = Input(name='input', shape=(p.max_doc_len, p.max_sent_len), dtype='int32') 559 | 560 | tokens_reshaped = Reshape([p.max_doc_len*p.max_sent_len])(tokens_input) 561 | 562 | x = Embedding(p.max_features, p.embedding_dims, weights=p.init_vectors)(tokens_reshaped) 563 | #tokens_reshaped = Reshape((self.preprocessor.max_doc_len, 564 | # self.preprocessor.max_sent_len*self.preprocessor.embedding_dims))(tokens_input) 565 | 566 | # so this will be (max_doc_len, max_sent_len, wv_size), i think 567 | #x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 568 | # weights=self.preprocessor.init_vectors)(tokens_input) 569 | #input_length=self.preprocessor.max_sent_len, 570 | #weights=self.preprocessor.init_vectors)(tokens_input) 571 | 572 | x = Reshape((p.max_doc_len, p.max_sent_len*p.embedding_dims))(x) 573 | x = Dropout(0.1)(x) 574 | 575 | # (max_doc_len, max_sent_len, wv_size) -> (max_doc_len, max_sent_len * wv_size) 576 | #r = Reshape(self.preprocessor.max_doc_len, 577 | # self.preprocessor.max_sent_len * self.preprocessor.embedding_dims)(x) 578 | convolutions = [] 579 | for n_gram in self.ngram_filters: 580 | #cur_conv = Convolution1D(nb_filter=self.nb_filter, filter_length=n_gram) 581 | 582 | ''' 583 | # filter, nb_rows, nb_cols 584 | cur_conv = Convolution2D(self.nb_filter, 585 | 1, self.preprocessor.embedding_dims, 586 | filter_length=n_gram, 587 | activation='relu', 588 | input_dim=self.preprocessor.embedding_dims, 589 | input_length=self.preprocessor.max_sent_len)(x) 590 | ''' 591 | 592 | # cur_conv = Convolution2D(32, p.embedding_dims, n_gram, input_shape=(1, p.embedding_dims, p.max_sent_len))(x) 593 | cur_conv = Convolution1D(nb_filter=self.nb_filter, 594 | filter_length=n_gram, 595 | border_mode='valid', 596 | activation='relu', 597 | subsample_length=1, 598 | input_dim=self.preprocessor.embedding_dims, 599 | input_length=self.preprocessor.max_sent_len)(x) 600 | 601 | 602 | # pool 603 | #one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 604 | one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 605 | flattened = Flatten()(one_max) 606 | convolutions.append(flattened) 607 | 608 | penultimate_layer = merge(convolutions) 609 | 610 | output = Dense(1, activation="sigmoid")(penultimate_layer) 611 | 612 | self.document_model = Model(input=tokens_input, output=output) 613 | 614 | print(self.document_model.summary()) 615 | self.document_model.compile(loss='binary_crossentropy', optimizer="adam") 616 | 617 | return self.document_model 618 | 619 | 620 | ''' 621 | self.model = Graph() 622 | self.model.add_input(name='input', input_shape=(self.preprocessor.maxlen,), dtype=int) 623 | 624 | self.model.add_node(Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 625 | input_length=self.preprocessor.maxlen, weights=self.preprocessor.init_vectors), 626 | name='embedding', input='input') 627 | self.model.add_node(Dropout(0.), name='dropout_embedding', input='embedding') 628 | for n_gram in self.ngram_filters: 629 | self.model.add_node(Convolution1D(nb_filter=self.nb_filter, 630 | filter_length=n_gram, 631 | border_mode='valid', 632 | activation='relu', 633 | subsample_length=1, 634 | input_dim=self.preprocessor.embedding_dims, 635 | input_length=self.preprocessor.maxlen), 636 | name='conv_' + str(n_gram), 
637 | input='dropout_embedding') 638 | self.model.add_node(MaxPooling1D(pool_length=self.preprocessor.maxlen - n_gram + 1), 639 | name='maxpool_' + str(n_gram), 640 | input='conv_' + str(n_gram)) 641 | self.model.add_node(Flatten(), 642 | name='flat_' + str(n_gram), 643 | input='maxpool_' + str(n_gram)) 644 | self.model.add_node(Dropout(self.dropout), name='dropout', inputs=['flat_' + str(n) for n in self.ngram_filters]) 645 | self.model.add_node(Dense(1, input_dim=self.nb_filter * len(self.ngram_filters)), 646 | name='dense', input='dropout') 647 | self.model.add_node(Activation('sigmoid'), name='sigmoid', input='dense') 648 | self.model.add_output(name='output', input='sigmoid') 649 | print("model built") 650 | print(self.model.summary()) 651 | self.model.compile(loss={'output': 'binary_crossentropy'}, 652 | optimizer="adam")#optimizer) 653 | ''' 654 | 655 | class Document: 656 | def __init__(self, doc_id, sentences, doc_label=None, sentences_labels=None): 657 | self.doc_id = doc_id 658 | self.doc_y = doc_label 659 | 660 | self.sentences = sentences 661 | self.sentence_sequences = None 662 | 663 | self.sentences_y = sentences_labels 664 | 665 | self.sentence_idx = 0 666 | self.n = len(self.sentences) 667 | 668 | 669 | def __len__(self): 670 | return self.n 671 | 672 | def generate_sequences(self, p): 673 | ''' 674 | p is a preprocessor that has been instantiated 675 | elsewhere! this will be used to map sentences to 676 | integer sequences here. 677 | ''' 678 | self.sentence_sequences = p.build_sequences(self.sentences) 679 | 680 | 681 | 682 | ''' 683 | def __iter__(self): 684 | return self 685 | 686 | def next(self): 687 | if self.sentence_idx < self.n: 688 | cur_sentence_idx = self.sentence_idx 689 | self.sentence_idx += 1 690 | 691 | return self.sentences[cur_sentence_idx] 692 | else: 693 | raise StopIteration() 694 | 695 | ''' 696 | 697 | 698 | class Preprocessor: 699 | def __init__(self, max_features, max_sent_len, embedding_dims=200, wvs=None, max_doc_len=500): 700 | ''' 701 | max_features: the upper bound to be placed on the vocabulary size. 702 | max_sent_len: the maximum length (in terms of tokens) of the instances/texts. 703 | embedding_dims: size of the token embeddings; over-ridden if pre-trained 704 | vectors is provided (if wvs is not None). 705 | ''' 706 | 707 | self.max_features = max_features 708 | self.tokenizer = Tokenizer(nb_words=self.max_features) 709 | self.max_sent_len = max_sent_len # the max sentence length! @TODO rename; this is confusing. 710 | self.max_doc_len = max_doc_len # w.r.t. number of sentences! 711 | 712 | self.use_pretrained_embeddings = False 713 | self.init_vectors = None 714 | if wvs is None: 715 | self.embedding_dims = embedding_dims 716 | else: 717 | # note that these are only for initialization; 718 | # they will be tuned! 719 | self.use_pretrained_embeddings = True 720 | self.embedding_dims = wvs.vector_size 721 | self.word_embeddings = wvs 722 | 723 | 724 | def preprocess(self, all_docs): 725 | ''' 726 | This fits tokenizer and builds up input vectors (X) from the list 727 | of texts in all_texts. Needs to be called before train! 728 | ''' 729 | self.raw_texts = all_docs 730 | #self.build_sequences() 731 | self.fit_tokenizer() 732 | if self.use_pretrained_embeddings: 733 | self.init_word_vectors() 734 | 735 | 736 | def fit_tokenizer(self): 737 | ''' Fits tokenizer to all raw texts; remembers indices->words mappings. 
''' 738 | self.tokenizer.fit_on_texts(self.raw_texts) 739 | self.word_indices_to_words = {} 740 | for token, idx in self.tokenizer.word_index.items(): 741 | self.word_indices_to_words[idx] = token 742 | 743 | def build_sequences(self, texts): 744 | X = list(self.tokenizer.texts_to_sequences_generator(texts)) 745 | X = np.array(pad_sequences(X, maxlen=self.max_sent_len)) 746 | return X 747 | 748 | def init_word_vectors(self): 749 | ''' 750 | Initialize word vectors. 751 | ''' 752 | self.init_vectors = [] 753 | unknown_words_to_vecs = {} 754 | for t, token_idx in self.tokenizer.word_index.items(): 755 | if token_idx <= self.max_features: 756 | try: 757 | self.init_vectors.append(self.word_embeddings[t]) 758 | except: 759 | if t not in unknown_words_to_vecs: 760 | # randomly initialize 761 | unknown_words_to_vecs[t] = np.random.random( 762 | self.embedding_dims)*-2 + 1 763 | 764 | self.init_vectors.append(unknown_words_to_vecs[t]) 765 | 766 | # note that we make this a singleton list because that's 767 | # what Keras wants. 768 | self.init_vectors = [np.vstack(self.init_vectors)] 769 | 770 | -------------------------------------------------------------------------------- /rationale_CNN_2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @authors Byron Wallace, Edward Banner, Ye Zhang 3 | 4 | A Keras implementation of our "rationale augmented CNN" (https://arxiv.org/abs/1605.04469). Please note that 5 | the model was originally implemented in Theano -- this version is a work in progress. 6 | 7 | Credit for initial pass of basic CNN implementation to: Cheng Guo (https://gist.github.com/entron). 8 | 9 | References 10 | -- 11 | Ye Zhang, Iain J. Marshall and Byron C. Wallace. "Rationale-Augmented Convolutional Neural Networks for Text Classification". http://arxiv.org/abs/1605.04469 12 | Yoon Kim. "Convolutional Neural Networks for Sentence Classification". EMNLP 2014. 13 | Ye Zhang and Byron Wallace. "A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification". http://arxiv.org/abs/1510.03820. 
14 | & also: http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/ 15 | ''' 16 | 17 | from __future__ import print_function 18 | import pdb 19 | import sys 20 | import random 21 | reload(sys) 22 | sys.setdefaultencoding('utf8') 23 | 24 | import numpy as np 25 | 26 | from keras import backend as K 27 | from keras.models import Graph, Model, Sequential 28 | from keras.preprocessing import sequence 29 | from keras.engine.topology import Layer 30 | from keras.preprocessing.sequence import pad_sequences 31 | from keras.layers import Input, Embedding, Dense, merge 32 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Merge, Reshape, Permute, Lambda 33 | from keras.layers.embeddings import Embedding 34 | from keras.layers.convolutional import Convolution1D, Convolution2D, MaxPooling1D, MaxPooling2D 35 | from keras.utils.np_utils import accuracy 36 | from keras.preprocessing.text import text_to_word_sequence, Tokenizer 37 | from keras.callbacks import ModelCheckpoint 38 | 39 | 40 | 41 | class RationaleCNN: 42 | 43 | def __init__(self, preprocessor, filters=None, n_filters=32, dropout=0.0): 44 | ''' 45 | parameters 46 | --- 47 | preprocessor: an instance of the Preprocessor class, defined below 48 | ''' 49 | self.preprocessor = preprocessor 50 | 51 | if filters is None: 52 | self.ngram_filters = [3, 4, 5] 53 | else: 54 | self.ngram_filters = filters 55 | 56 | self.n_filters = n_filters 57 | self.dropout = dropout 58 | self.sentence_model_trained = False 59 | 60 | @staticmethod 61 | def weighted_sum(X): 62 | # @TODO.. add sentence preds! 63 | return K.sum(X, axis=0) 64 | 65 | @staticmethod 66 | def weighted_sum_output_shape(input_shape): 67 | # expects something like (None, max_doc_len, num_features) 68 | # returns (1 x num_features) 69 | shape = list(input_shape) 70 | return tuple((1, shape[-1])) 71 | 72 | @staticmethod 73 | def balanced_sample(X, y): 74 | _, pos_rationale_indices = np.where([y[:,0] > 0]) 75 | _, neg_rationale_indices = np.where([y[:,1] > 0]) 76 | _, non_rationale_indices = np.where([y[:,2] > 0]) 77 | 78 | # sample a number of non-rationales equal to the total 79 | # number of pos/neg rationales. 
80 | m = pos_rationale_indices.shape[0] + neg_rationale_indices.shape[0] 81 | sampled_non_rationale_indices = np.array(random.sample(non_rationale_indices, m)) 82 | 83 | train_indices = np.concatenate([pos_rationale_indices, neg_rationale_indices, sampled_non_rationale_indices]) 84 | np.random.shuffle(train_indices) # why not 85 | return X[train_indices,:], y[train_indices] 86 | 87 | def get_conv_layers_from_sentence_model(): 88 | layers_to_weights = {} 89 | for ngram in self.ngram_filters: 90 | layer_name = "conv_" + str(ngram) 91 | cur_conv_layer = self.sentence_model.get_layer(layer_name) 92 | weights, biases = cur_conv_layer.get_weights() 93 | 94 | # here it gets tricky because we need 95 | # so, e.g., (32 x 200 x 3 x 1) -> (32 x 3 x 200 x 1) 96 | # we do this because reshape by default iterates over 97 | # the last dimension fastest 98 | # swapped = np.swapaxes(X, 1, 2) 99 | # Xp = swapped.reshape(32, 1, 1, 600) 100 | 101 | def build_doc_model(self): 102 | #assert self.sentence_model_trained 103 | 104 | # input dim is (max_doc_len x max_sent_len) -- eliding the batch size 105 | tokens_input = Input(name='input', 106 | shape=(self.preprocessor.max_doc_len, self.preprocessor.max_sent_len), 107 | dtype='int32') 108 | # flatten; create a very wide matrix to hand to embedding layer 109 | tokens_reshaped = Reshape([self.preprocessor.max_doc_len*self.preprocessor.max_sent_len])(tokens_input) 110 | # embed the tokens; output will be (p.max_doc_len*p.max_sent_len x embedding_dims) 111 | # here we should initialize with weights from sentence model embedding layer! 112 | 113 | 114 | ### 115 | # getting weights for initialization 116 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 117 | weights=self.sentence_model.get_layer("embedding").get_weights(), 118 | #weights=self.preprocessor.init_vectors, 119 | name="embedding")(tokens_reshaped) 120 | 121 | # reshape to preserve document structure; each doc will now be a 122 | # a row in this matrix 123 | x = Reshape((1, self.preprocessor.max_doc_len, 124 | self.preprocessor.max_sent_len*self.preprocessor.embedding_dims), 125 | name="reshape")(x) 126 | 127 | x = Dropout(0.1, name="dropout")(x) 128 | 129 | convolutions = [] 130 | for n_gram in self.ngram_filters: 131 | 132 | 133 | #import pdb; pdb.set_trace() 134 | 135 | ### here is where we pull out weights 136 | layer_name = "conv_" + str(n_gram) 137 | cur_conv_layer = self.sentence_model.get_layer(layer_name) 138 | weights, biases = cur_conv_layer.get_weights() 139 | # here it gets a bit tricky; we need dims 140 | # (nb_filters x 1 x 1 x (n_gram*embedding_dim)) 141 | # for 2d conv; our 1d conv model, though, will have 142 | # (nb_filters x embedding_dim x n_gram x 1) 143 | # need to reshape this. but first need to swap around 144 | # axes due to how reshape works (it iterates over last 145 | # dimension first). 
in particular, e.g.,:
146 |             # (32 x 200 x 3 x 1) -> (32 x 3 x 200 x 1)
147 |             # swapped = np.swapaxes(X, 1, 2)
148 |             swapped_weights = np.swapaxes(weights, 1, 2)
149 |             init_weights = swapped_weights.reshape(self.n_filters, 
150 |                                                    1, 1, n_gram*self.preprocessor.embedding_dims)
151 | 
152 |             cur_conv = Convolution2D(self.n_filters, 1, 
153 |                                      n_gram*self.preprocessor.embedding_dims, 
154 |                                      subsample=(1, self.preprocessor.embedding_dims),
155 |                                      name="conv2d_"+str(n_gram),
156 |                                      weights=[init_weights, biases])(x)
157 | 
158 |             # this outputs (n_filters x max_doc_len x 1)
159 |             one_max = MaxPooling2D(pool_size=(1, self.preprocessor.max_sent_len - n_gram + 1), 
160 |                                    name="pooling_"+str(n_gram))(cur_conv)
161 | 
162 |             # flip around, to get (1 x max_doc_len x n_filters)
163 |             permuted = Permute((3,2,1), name="permute_"+str(n_gram)) (one_max)
164 | 
165 |             # drop extra dimension
166 |             r = Reshape((self.preprocessor.max_doc_len, self.n_filters), 
167 |                         name="conv_"+str(n_gram))(permuted)
168 | 
169 |             convolutions.append(r)
170 | 
171 |         # merge the filter size convolutions
172 |         r = merge(convolutions, name="sentence_vectors")
173 | 
174 |         # now we take a weighted sum of the sentence vectors
175 |         # to induce a document representation
176 |         x_doc = Lambda(RationaleCNN.weighted_sum, 
177 |                        output_shape=RationaleCNN.weighted_sum_output_shape,
178 |                        name="weighted_doc_vector")(r)
179 | 
180 |         # finally, the sigmoid layer for classification
181 |         y_hat = Dense(1, activation="sigmoid", name="document_prediction")(x_doc)
182 |         model = Model(input=tokens_input, output=y_hat)
183 |         return model
184 | 
185 | 
186 |     def train(self, X_train, y_train, X_val=None, y_val=None, 
187 |               nb_epoch=5, batch_size=32, optimizer='adam'):
188 |         '''
189 |         Accepts an X matrix (presumably some slice of self.X) and corresponding
190 |         vector of labels. May want to revisit this. 
191 | 
192 |         X_val and y_val are to be used to validate during training.
193 | ''' 194 | checkpointer = ModelCheckpoint(filepath="weights.hdf5", 195 | verbose=1, 196 | save_best_only=(X_val is not None)) 197 | 198 | if X_val is not None: 199 | self.sentence_model.fit(X_train, y_train, 200 | batch_size=batch_size, nb_epoch=nb_epoch, 201 | validation_data=(X_val, y_val), 202 | verbose=2, callbacks=[checkpointer]) 203 | else: 204 | # no validation provided 205 | self.sentence_model.fit(X_train, y_train, 206 | batch_size=batch_size, nb_epoch=nb_epoch, 207 | verbose=2, callbacks=[checkpointer]) 208 | 209 | 210 | ''' 211 | def build_sentence_model(self): 212 | 213 | # input dim is (max_doc_len x max_sent_len) -- eliding the batch size 214 | tokens_input = Input(name='input', 215 | shape=(self.preprocessor.max_sent_len,), 216 | dtype='int32') 217 | 218 | # embed the tokens; output will be (p.max_doc_len*p.max_sent_len x embedding_dims) 219 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 220 | weights=self.preprocessor.init_vectors, name="embedding")(tokens_input) 221 | 222 | x = Dropout(0.1, name="dropout")(x) 223 | 224 | convolutions = [] 225 | for n_gram in self.ngram_filters: 226 | cur_conv = Convolution2D(self.n_filters, 1, 227 | n_gram*self.preprocessor.embedding_dims, 228 | subsample=(1, self.preprocessor.embedding_dims), 229 | name="conv2d_"+str(n_gram))(x) 230 | 231 | # this output (n_filters x max_doc_len x 1) 232 | one_max = MaxPooling2D(pool_size=(1, self.preprocessor.max_sent_len - n_gram + 1), 233 | name="pooling_"+str(n_gram))(cur_conv) 234 | 235 | # flip around, to get (1 x max_doc_len x n_filters) 236 | permuted = Permute((3,2,1), name="permute_"+str(n_gram)) (one_max) 237 | 238 | # drop extra dimension 239 | r = Reshape((self.preprocessor.max_doc_len, self.n_filters), 240 | name="conv_"+str(n_gram))(permuted) 241 | 242 | convolutions.append(r) 243 | 244 | # merge the filter size convolutions 245 | r = merge(convolutions, name="sentence_vectors") 246 | 247 | # now the classification layer... 248 | ''' 249 | 250 | def build_sentence_model(self): 251 | ''' 252 | Build the *sentence* level model, which operates over, erm, sentences. 253 | The task is to predict which sentences are pos/neg rationales. 254 | ''' 255 | tokens_input = Input(name='input', shape=(self.preprocessor.max_sent_len,), dtype='int32') 256 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 257 | name="embedding", 258 | input_length=self.preprocessor.max_sent_len, 259 | weights=self.preprocessor.init_vectors)(tokens_input) 260 | 261 | x = Dropout(0.1)(x) 262 | 263 | convolutions = [] 264 | for n_gram in self.ngram_filters: 265 | cur_conv = Convolution1D(name="conv_" + str(n_gram), 266 | nb_filter=self.n_filters, 267 | filter_length=n_gram, 268 | border_mode='valid', 269 | activation='relu', 270 | subsample_length=1, 271 | input_dim=self.preprocessor.embedding_dims, 272 | input_length=self.preprocessor.max_sent_len)(x) 273 | # pool 274 | one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 275 | flattened = Flatten()(one_max) 276 | convolutions.append(flattened) 277 | 278 | sentence_vector = merge(convolutions, name="sentence_vector") # hang on to this layer! 
279 | output = Dense(3, activation="softmax", name="sentence_prediction")(sentence_vector) 280 | 281 | self.sentence_model = Model(input=tokens_input, output=output) 282 | print("model built") 283 | print(self.sentence_model.summary()) 284 | self.sentence_model.compile(loss='categorical_crossentropy', optimizer="adam") 285 | 286 | self.sentence_embedding_dim = self.sentence_model.layers[-2].output_shape[1] 287 | 288 | return self.sentence_model 289 | 290 | 291 | def train_sentence_model(self, train_documents, nb_epoch=5, downsample=True, 292 | batch_size=128, optimizer='adam'): 293 | # assumes sentence sequences have been generated! 294 | assert(train_documents[0].sentence_sequences is not None) 295 | 296 | X, y= [], [] 297 | # flatten sentences/sentence labels 298 | for d in train_documents: 299 | X.extend(d.sentence_sequences) 300 | y.extend(d.sentences_y) 301 | 302 | # @TODO sub-sample magic? 303 | X, y = np.asarray(X), np.asarray(y) 304 | 305 | # downsample 306 | if downsample: 307 | X, y = RationaleCNN.balanced_sample(X, y) 308 | 309 | #self.train(X[:1000], y[:1000]) 310 | self.train(X, y) 311 | 312 | self.sentence_model_trained = True 313 | 314 | 315 | class Preprocessor: 316 | def __init__(self, max_features, max_sent_len, embedding_dims=200, wvs=None, max_doc_len=500): 317 | ''' 318 | max_features: the upper bound to be placed on the vocabulary size. 319 | max_sent_len: the maximum length (in terms of tokens) of the instances/texts. 320 | embedding_dims: size of the token embeddings; over-ridden if pre-trained 321 | vectors is provided (if wvs is not None). 322 | ''' 323 | 324 | self.max_features = max_features 325 | self.tokenizer = Tokenizer(nb_words=self.max_features) 326 | self.max_sent_len = max_sent_len # the max sentence length! @TODO rename; this is confusing. 327 | self.max_doc_len = max_doc_len # w.r.t. number of sentences! 328 | 329 | self.use_pretrained_embeddings = False 330 | self.init_vectors = None 331 | if wvs is None: 332 | self.embedding_dims = embedding_dims 333 | else: 334 | # note that these are only for initialization; 335 | # they will be tuned! 336 | self.use_pretrained_embeddings = True 337 | self.embedding_dims = wvs.vector_size 338 | self.word_embeddings = wvs 339 | 340 | 341 | def preprocess(self, all_docs): 342 | ''' 343 | This fits tokenizer and builds up input vectors (X) from the list 344 | of texts in all_texts. Needs to be called before train! 345 | ''' 346 | self.raw_texts = all_docs 347 | #self.build_sequences() 348 | self.fit_tokenizer() 349 | if self.use_pretrained_embeddings: 350 | self.init_word_vectors() 351 | 352 | 353 | def fit_tokenizer(self): 354 | ''' Fits tokenizer to all raw texts; remembers indices->words mappings. ''' 355 | self.tokenizer.fit_on_texts(self.raw_texts) 356 | self.word_indices_to_words = {} 357 | for token, idx in self.tokenizer.word_index.items(): 358 | self.word_indices_to_words[idx] = token 359 | 360 | def build_sequences(self, texts): 361 | X = list(self.tokenizer.texts_to_sequences_generator(texts)) 362 | X = np.array(pad_sequences(X, maxlen=self.max_sent_len)) 363 | return X 364 | 365 | def init_word_vectors(self): 366 | ''' 367 | Initialize word vectors. 
368 | ''' 369 | self.init_vectors = [] 370 | unknown_words_to_vecs = {} 371 | for t, token_idx in self.tokenizer.word_index.items(): 372 | if token_idx <= self.max_features: 373 | try: 374 | self.init_vectors.append(self.word_embeddings[t]) 375 | except: 376 | if t not in unknown_words_to_vecs: 377 | # randomly initialize 378 | unknown_words_to_vecs[t] = np.random.random( 379 | self.embedding_dims)*-2 + 1 380 | 381 | self.init_vectors.append(unknown_words_to_vecs[t]) 382 | 383 | # note that we make this a singleton list because that's 384 | # what Keras wants. 385 | self.init_vectors = [np.vstack(self.init_vectors)] 386 | --------------------------------------------------------------------------------
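Usage note (not one of the repository files above): the sketch below shows one way to wire together the Preprocessor, Document, and RationaleCNN classes from rationale_CNN.py to fit the sentence-level rationale model. It assumes the legacy environment these modules target (Python 2, Keras 1.x) and uses a tiny hypothetical document with made-up sentence labels purely for illustration; real use would pass pre-trained word vectors (e.g., via load_trained_w2v_model in RoB_CNN_redux.py) and many labeled documents.

import numpy as np
import rationale_CNN

# hypothetical toy data: one document, three sentences, with one-hot
# per-sentence labels over {positive rationale, negative rationale, neither}
sentences = ["patients were randomized via a computer generated sequence",
             "the trial enrolled 120 participants",
             "baseline characteristics were similar between groups"]
sentence_labels = np.array([[1, 0, 0],
                            [0, 0, 1],
                            [0, 0, 1]])

# fit the tokenizer; no wvs given here, so embeddings are randomly
# initialized at the default embedding_dims and tuned during training
p = rationale_CNN.Preprocessor(max_features=10000, max_sent_len=25, max_doc_len=50)
p.preprocess(sentences)

# wrap the raw sentences in a Document and map them to padded index sequences
doc = rationale_CNN.Document(doc_id=0, sentences=sentences,
                             doc_label=1, sentences_labels=sentence_labels)
doc.generate_sequences(p)

# build and fit the sentence-level model (pos/neg rationale vs. neither);
# downsample=False because this toy set is too small to balance
model = rationale_CNN.RationaleCNN(p, filters=[3, 4, 5], n_filters=100)
model.build_sentence_model()
model.train_sentence_model([doc], downsample=False)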