├── CNN_text.py ├── RoB_CNN_redux.py ├── rationale_CNN.py └── rationale_CNN_2.py /CNN_text.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @author Byron Wallace 3 | A Keras implementation of CNNs for text classification. 4 | 5 | Credit for initial pass of implementation to: Cheng Guo (https://gist.github.com/entron). 6 | 7 | References 8 | -- 9 | Yoon Kim. "Convolutional Neural Networks for Sentence Classification". EMNLP 2014. 10 | Ye Zhang and Byron Wallace. "A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification". http://arxiv.org/abs/1510.03820. 11 | & also: http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/ 12 | ''' 13 | 14 | from __future__ import print_function 15 | import pdb 16 | import sys 17 | reload(sys) 18 | sys.setdefaultencoding('utf8') 19 | 20 | import numpy as np 21 | 22 | from keras.preprocessing import sequence 23 | from keras.preprocessing.sequence import pad_sequences 24 | from keras.models import Graph 25 | from keras.layers.core import Dense, Dropout, Activation, Flatten 26 | from keras.layers.embeddings import Embedding 27 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 28 | from keras.datasets import imdb 29 | from keras.utils.np_utils import accuracy 30 | from keras.preprocessing.text import text_to_word_sequence, Tokenizer 31 | from keras.callbacks import ModelCheckpoint 32 | 33 | class TextCNN: 34 | 35 | def __init__(self, preprocessor, filters=None, n_filters=100, dropout=0.0): 36 | ''' 37 | parameters 38 | --- 39 | preprocessor: an instance of the Preprocessor class, defined below 40 | ''' 41 | self.preprocessor = preprocessor 42 | 43 | if filters is None: 44 | self.ngram_filters = [3, 4, 5] 45 | else: 46 | self.ngram_filters = filters 47 | 48 | self.nb_filter = n_filters 49 | self.dropout = dropout 50 | 51 | self.build_model() # build model 52 | 53 | def train(self, X_train, y_train, X_val=None, y_val=None, 54 | nb_epoch=5, batch_size=32, optimizer='adam'): 55 | ''' 56 | Accepts an X matrix (presumably some slice of self.X) and corresponding 57 | vector of labels. May want to revisit this. 58 | 59 | X_val and y_val are to be used to validate during training. 
60 | ''' 61 | 62 | 63 | checkpointer = ModelCheckpoint(filepath="weights.hdf5", 64 | verbose=1, 65 | save_best_only=(X_val is not None)) 66 | 67 | if X_val is not None: 68 | self.model.fit({'input': X_train, 'output': y_train}, 69 | batch_size=batch_size, nb_epoch=nb_epoch, 70 | validation_data={'input': X_val, 'output': y_val}, 71 | verbose=2, callbacks=[checkpointer]) 72 | else: 73 | print("no validation data provided!") 74 | self.model.fit({'input': X_train, 'output': y_train}, 75 | batch_size=batch_size, nb_epoch=nb_epoch, 76 | verbose=2, callbacks=[checkpointer]) 77 | 78 | 79 | def predict(self, X_test, batch_size=32, binarize=False): 80 | raw_preds = self.model.predict({'input': X_test}, batch_size=batch_size)['output'] 81 | 82 | #np.array(self.model.predict({'input': X_test}, 83 | # batch_size=batch_size)['output']) 84 | if binarize: 85 | return np.round(raw_preds) 86 | return raw_preds 87 | 88 | 89 | def build_model(self): 90 | # again, credit to Cheng Guo 91 | self.model = Graph() 92 | self.model.add_input(name='input', input_shape=(self.preprocessor.maxlen,), dtype=int) 93 | 94 | #pdb.set_trace() 95 | self.model.add_node(Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 96 | input_length=self.preprocessor.maxlen, weights=self.preprocessor.init_vectors), 97 | name='embedding', input='input') 98 | self.model.add_node(Dropout(0.), name='dropout_embedding', input='embedding') 99 | for n_gram in self.ngram_filters: 100 | self.model.add_node(Convolution1D(nb_filter=self.nb_filter, 101 | filter_length=n_gram, 102 | border_mode='valid', 103 | activation='relu', 104 | subsample_length=1, 105 | input_dim=self.preprocessor.embedding_dims, 106 | input_length=self.preprocessor.maxlen), 107 | name='conv_' + str(n_gram), 108 | input='dropout_embedding') 109 | self.model.add_node(MaxPooling1D(pool_length=self.preprocessor.maxlen - n_gram + 1), 110 | name='maxpool_' + str(n_gram), 111 | input='conv_' + str(n_gram)) 112 | self.model.add_node(Flatten(), 113 | name='flat_' + str(n_gram), 114 | input='maxpool_' + str(n_gram)) 115 | self.model.add_node(Dropout(self.dropout), name='dropout', inputs=['flat_' + str(n) for n in self.ngram_filters]) 116 | self.model.add_node(Dense(1, input_dim=self.nb_filter * len(self.ngram_filters)), 117 | name='dense', input='dropout') 118 | self.model.add_node(Activation('sigmoid'), name='sigmoid', input='dense') 119 | self.model.add_output(name='output', input='sigmoid') 120 | print("model built") 121 | print(self.model.summary()) 122 | self.model.compile(loss={'output': 'binary_crossentropy'}, 123 | optimizer="adam")#optimizer) 124 | 125 | class Preprocessor: 126 | def __init__(self, max_features, maxlen, embedding_dims=200, wvs=None): 127 | ''' 128 | max_features: the upper bound to be placed on the vocabulary size. 129 | maxlen: the maximum length (in terms of tokens) of the instances/texts. 130 | embedding_dims: size of the token embeddings; over-ridden if pre-trained 131 | vectors is provided (if wvs is not None). 132 | ''' 133 | 134 | self.max_features = max_features 135 | self.tokenizer = Tokenizer(nb_words=self.max_features) 136 | self.maxlen = maxlen 137 | 138 | self.use_pretrained_embeddings = False 139 | self.init_vectors = None 140 | if wvs is None: 141 | self.embedding_dims = embedding_dims 142 | else: 143 | # note that these are only for initialization; 144 | # they will be tuned! 
145 | self.use_pretrained_embeddings = True 146 | self.embedding_dims = wvs.vector_size 147 | self.word_embeddings = wvs 148 | 149 | 150 | def preprocess(self, all_texts): 151 | ''' 152 | This fits tokenizer and builds up input vectors (X) from the list 153 | of texts in all_texts. Needs to be called before train! 154 | ''' 155 | self.raw_texts = all_texts 156 | #self.build_sequences() 157 | self.fit_tokenizer() 158 | if self.use_pretrained_embeddings: 159 | self.init_word_vectors() 160 | 161 | def fit_tokenizer(self): 162 | ''' Fits tokenizer to all raw texts; remembers indices->words mappings. ''' 163 | self.tokenizer.fit_on_texts(self.raw_texts) 164 | self.word_indices_to_words = {} 165 | for token, idx in self.tokenizer.word_index.items(): 166 | self.word_indices_to_words[idx] = token 167 | 168 | def build_sequences(self, texts): 169 | X = list(self.tokenizer.texts_to_sequences_generator(texts)) 170 | X = np.array(pad_sequences(X, maxlen=self.maxlen)) 171 | return X 172 | 173 | def init_word_vectors(self): 174 | ''' 175 | Initialize word vectors. 176 | ''' 177 | self.init_vectors = [] 178 | unknown_words_to_vecs = {} 179 | for t, token_idx in self.tokenizer.word_index.items(): 180 | if token_idx <= self.max_features: 181 | try: 182 | self.init_vectors.append(self.word_embeddings[t]) 183 | except: 184 | if t not in unknown_words_to_vecs: 185 | # randomly initialize 186 | unknown_words_to_vecs[t] = np.random.random( 187 | self.embedding_dims)*-2 + 1 188 | 189 | self.init_vectors.append(unknown_words_to_vecs[t]) 190 | 191 | # note that we make this a singleton list because that's 192 | # what Keras wants. 193 | self.init_vectors = [np.vstack(self.init_vectors)] 194 | 195 | -------------------------------------------------------------------------------- /RoB_CNN_redux.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import os 4 | csv.field_size_limit(sys.maxsize) 5 | 6 | import sklearn 7 | from sklearn.metrics import accuracy_score 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | import gensim 13 | from gensim.models import Word2Vec 14 | 15 | import CNN_text 16 | 17 | 18 | def load_trained_w2v_model(path="/Users/byron/dev/Deep-PICO/PubMed-w2v.bin"): 19 | m = Word2Vec.load_word2vec_format(path, binary=True) 20 | return m 21 | 22 | 23 | def read_RoB_data(path="RoB-data/train-Xy-Random-sequence-generation.txt", 24 | y_tuples=False, zero_one=True): 25 | ''' 26 | Assumes data is in CSV with label as second entry. 
27 |     '''
28 |     raw_texts, y = [], []
29 |     with open(path) as input_file:
30 |         rows = csv.reader(input_file)
31 |         for row in rows:
32 |             doc_text, lbl = row
33 |             raw_texts.append(doc_text)
34 |             cur_y = int(lbl)
35 |             if y_tuples:
36 |                 if cur_y > 0:
37 |                     y.append(np.array([0,1]))
38 |                 else:
39 |                     y.append(np.array([1,0]))
40 |             else:
41 |                 if cur_y < 1:
42 |                     if zero_one:
43 |                         y.append(0)
44 |                     else:
45 |                         y.append(-1)
46 | 
47 |                 else:
48 |                     y.append(1)
49 | 
50 |     return raw_texts, y
51 | 
52 | 
53 | 
54 | def RoB_CNN(total_epochs=60, weights_file=None):
55 |     train_docs, y_train = read_RoB_data(path="RoB-data/train-Xy-Random-sequence-generation.txt",
56 |                                         y_tuples=False)
57 | 
58 |     test_docs, y_test = read_RoB_data(path="RoB-data/test-Xy-Random-sequence-generation.txt",
59 |                                       y_tuples=False)
60 | 
61 | 
62 |     train_docs = train_docs#[:500]
63 |     y_train = y_train#[:500]
64 | 
65 |     wvs = load_trained_w2v_model()
66 |     # preprocessor for texts
67 | 
68 |     # then the CNN
69 |     p = CNN_text.Preprocessor(max_features=10000, maxlen=5000, wvs=wvs)
70 |     all_docs = train_docs + test_docs
71 | 
72 |     print("preprocessing...")
73 |     p.preprocess(all_docs)
74 |     train_X = p.build_sequences(train_docs)
75 |     test_X = p.build_sequences(test_docs)
76 | 
77 | 
78 |     cnn = CNN_text.TextCNN(p, filters=[2,3,5], n_filters=100, dropout=0.0)
79 | 
80 |     # write the model out
81 |     json_string = cnn.model.to_json()
82 |     open('RoB_model_architecture.json', 'w').write(json_string)
83 |     print("dumped model!")
84 | 
85 |     if weights_file:
86 |         cnn.model.load_weights(weights_file)
87 | 
88 |     epochs_per_iter = 10
89 |     epochs_so_far = 0
90 |     while epochs_so_far < total_epochs:
91 |         cnn.train(train_X, y_train, nb_epoch=epochs_per_iter)#, X_val=test_X, y_val=y_test)
92 |         epochs_so_far += epochs_per_iter
93 | 
94 |         yhat = cnn.predict(test_X, binarize=True)
95 |         #import pdb; pdb.set_trace()
96 |         print("acc @ epoch %s: %s" % (epochs_so_far, accuracy_score(y_test, yhat)))
97 | 
98 |     #cnn.initialize_sequences_and_vocab(all_docs)
99 |     #cnn.train(X_train, y_train, X_val=None, y_val=None
100 | 
101 | 
102 | # note that on TACC you need:
103 | # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/apps/intel14/hdf5/1.8.12/x86_64/lib/
104 | if __name__ == '__main__':
105 |     RoB_CNN()
106 | 
--------------------------------------------------------------------------------
/rationale_CNN.py:
--------------------------------------------------------------------------------
1 | '''
2 | @author Byron Wallace
3 | A Keras implementation of our "rationale augmented CNN" (https://arxiv.org/abs/1605.04469). Please note that
4 | the model was originally implemented in Theano -- this version is a work in progress.
5 | 
6 | Credit for initial pass of basic CNN implementation to: Cheng Guo (https://gist.github.com/entron).
7 | 
8 | References
9 | --
10 | Ye Zhang, Iain J. Marshall and Byron C. Wallace. "Rationale-Augmented Convolutional Neural Networks for Text Classification". http://arxiv.org/abs/1605.04469
11 | Yoon Kim. "Convolutional Neural Networks for Sentence Classification". EMNLP 2014.
12 | Ye Zhang and Byron Wallace. "A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification". http://arxiv.org/abs/1510.03820.
13 | & also: http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/ 14 | ''' 15 | 16 | from __future__ import print_function 17 | import pdb 18 | import sys 19 | import random 20 | reload(sys) 21 | sys.setdefaultencoding('utf8') 22 | 23 | import numpy as np 24 | 25 | from keras.preprocessing import sequence 26 | from keras.engine.topology import Layer 27 | from keras.preprocessing.sequence import pad_sequences 28 | from keras.models import Graph 29 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Merge, Reshape, Permute, Lambda 30 | from keras.layers.embeddings import Embedding 31 | from keras.layers.convolutional import Convolution1D, Convolution2D, MaxPooling1D, MaxPooling2D 32 | from keras.datasets import imdb 33 | from keras.utils.np_utils import accuracy 34 | from keras.preprocessing.text import text_to_word_sequence, Tokenizer 35 | from keras.callbacks import ModelCheckpoint 36 | 37 | ## 38 | from keras.layers import Input, Embedding, Dense, merge 39 | from keras.models import Model, Sequential 40 | 41 | from keras import backend as K 42 | 43 | class SentenceConvolutionLayer(Layer): 44 | # input shape: (sentences, max_tokens) 45 | 46 | def __init__(self): 47 | pass 48 | 49 | class WeightedSumSentenceVector2(Layer): 50 | # input shape: (sentences, sentence_embedding_dim) 51 | # # documents[0].sentence_sequences 52 | # output shape: (1, sentence_embedding_dim) 53 | 54 | def __init__(self, sentence_model, **kwargs): 55 | #self.sentence_cnn = sentence_cnn 56 | #self.sentence_model = sentence_model 57 | pass 58 | 59 | class WeightedSumSentenceVector(Layer): 60 | # input shape: (sentences, sentence_embedding_dim) 61 | # # documents[0].sentence_sequences 62 | # output shape: (1, sentence_embedding_dim) 63 | 64 | def __init__(self, sentence_model, **kwargs): 65 | #self.sentence_cnn = sentence_cnn 66 | self.sentence_model = sentence_model 67 | 68 | 69 | super(WeightedSumSentenceVector, self).__init__(**kwargs) 70 | 71 | def get_output_shape_for(self, input_shape): 72 | assert len(shape) == 2 73 | return (1, input_shape[1]) 74 | 75 | def call(self, X, mask=None): 76 | ''' 77 | X assumed to be a document, with each row a list of 78 | token indicators in the corresponding sentences. 
79 | ''' 80 | conv_f = K.function( 81 | [self.sentence_model.layers[0].input, K.learning_phase()], 82 | [self.sentence_model.layers[-2].output]) 83 | 84 | import pdb; pdb.set_trace() 85 | sentence_vectors = conv_f([X,1])[0] 86 | 87 | sentence_predictions = sentence_model.predict([test_sentences]) 88 | weights = np.amax(sentence_predictions[:,0:2],axis=1) 89 | 90 | return np.matrix(np.dot(weights, vecs)) 91 | 92 | def build(self, input_shape): 93 | 94 | #input_dim = input_shape[1] 95 | #initial_weight_value = np.random.random((input_dim, output_dim)) 96 | #self.W = K.variable(initial_weight_value) 97 | #self.trainable_weights = [self.W] 98 | self.trainable_weights = self.sentence_model.trainable_weights 99 | 100 | class RationaleCNN: 101 | 102 | def __init__(self, preprocessor, filters=None, n_filters=100, dropout=0.0): 103 | ''' 104 | parameters 105 | --- 106 | preprocessor: an instance of the Preprocessor class, defined below 107 | ''' 108 | self.preprocessor = preprocessor 109 | 110 | if filters is None: 111 | self.ngram_filters = [3, 4, 5] 112 | else: 113 | self.ngram_filters = filters 114 | 115 | self.nb_filter = n_filters 116 | self.dropout = dropout 117 | self.sentence_model_trained = False 118 | 119 | #self.build_model() # build model 120 | #self.train_sentence_model() 121 | 122 | @staticmethod 123 | def weighted_sum(X): 124 | # @TODO.. add sentence preds! 125 | return K.sum(X, axis=0) # I *think* axis 0 is correct... 126 | 127 | @staticmethod 128 | def weighted_sum_output_shape(input_shape): 129 | # expects something like (None, max_doc_len, num_features) 130 | shape = list(input_shape) 131 | #assert len(shape) == 2 # not sure if correct... 132 | #print len(shape) 133 | print("shape: %s" % shape) 134 | # (1 x num_features) 135 | return tuple((1, shape[-1])) 136 | 137 | @staticmethod 138 | def balanced_sample(X, y): 139 | _, pos_rationale_indices = np.where([y[:,0] > 0]) 140 | _, neg_rationale_indices = np.where([y[:,1] > 0]) 141 | _, non_rationale_indices = np.where([y[:,2] > 0]) 142 | 143 | # sample a number of non-rationales equal to the total 144 | # number of pos/neg rationales. 145 | m = pos_rationale_indices.shape[0] + neg_rationale_indices.shape[0] 146 | sampled_non_rationale_indices = np.array(random.sample(non_rationale_indices, m)) 147 | 148 | train_indices = np.concatenate([pos_rationale_indices, neg_rationale_indices, sampled_non_rationale_indices]) 149 | np.random.shuffle(train_indices) # why not 150 | return X[train_indices,:], y[train_indices] 151 | 152 | # r_CNN.sentence_model.predict(X[:10], batch_size=128) 153 | def train_sentence_model(self, train_documents, nb_epoch=5, downsample=True, batch_size=128, optimizer='adam'): 154 | # assumes sentence sequences have been generated! 155 | assert(train_documents[0].sentence_sequences is not None) 156 | 157 | X, y= [], [] 158 | # flatten sentences/sentence labels 159 | for d in train_documents: 160 | X.extend(d.sentence_sequences) 161 | y.extend(d.sentences_y) 162 | 163 | # @TODO sub-sample magic? 164 | X, y = np.asarray(X), np.asarray(y) 165 | 166 | # downsample 167 | if downsample: 168 | X, y = RationaleCNN.balanced_sample(X, y) 169 | 170 | #self.train(X[:1000], y[:1000]) 171 | self.train(X, y) 172 | 173 | self.sentence_model_trained = True 174 | 175 | 176 | def train(self, X_train, y_train, X_val=None, y_val=None, 177 | nb_epoch=5, batch_size=32, optimizer='adam'): 178 | ''' 179 | Accepts an X matrix (presumably some slice of self.X) and corresponding 180 | vector of labels. May want to revisit this. 
181 | 182 | X_val and y_val are to be used to validate during training. 183 | ''' 184 | checkpointer = ModelCheckpoint(filepath="weights.hdf5", 185 | verbose=1, 186 | save_best_only=(X_val is not None)) 187 | 188 | if X_val is not None: 189 | self.sentence_model.fit({'input': X_train, 'output': y_train}, 190 | batch_size=batch_size, nb_epoch=nb_epoch, 191 | validation_data={'input': X_val, 'output': y_val}, 192 | verbose=2, callbacks=[checkpointer]) 193 | else: 194 | print("no validation data provided!") 195 | #self.sentence_model.fit({'input': X_train, 'output': y_train}, 196 | # batch_size=batch_size, nb_epoch=nb_epoch, 197 | # verbose=2, callbacks=[checkpointer]) 198 | self.sentence_model.fit(X_train, y_train, 199 | batch_size=batch_size, nb_epoch=nb_epoch, 200 | verbose=2, callbacks=[checkpointer]) 201 | 202 | 203 | ''' 204 | def predict(self, X_test, batch_size=32, binarize=False): 205 | raw_preds = self.model.predict({'input': X_test}, batch_size=batch_size)['output'] 206 | 207 | #np.array(self.model.predict({'input': X_test}, 208 | # batch_size=batch_size)['output']) 209 | if binarize: 210 | return np.round(raw_preds) 211 | return raw_preds 212 | ''' 213 | 214 | 215 | def build_sentence_model(self): 216 | ''' 217 | Build the *sentence* level model, which operates over, erm, sentences. 218 | The task is to predict which sentences are pos/neg rationales. 219 | ''' 220 | tokens_input = Input(name='input', shape=(self.preprocessor.max_sent_len,), dtype='int32') 221 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 222 | input_length=self.preprocessor.max_sent_len, 223 | weights=self.preprocessor.init_vectors)(tokens_input) 224 | 225 | x = Dropout(0.1)(x) 226 | 227 | convolutions = [] 228 | for n_gram in self.ngram_filters: 229 | cur_conv = Convolution1D(nb_filter=self.nb_filter, 230 | filter_length=n_gram, 231 | border_mode='valid', 232 | activation='relu', 233 | subsample_length=1, 234 | input_dim=self.preprocessor.embedding_dims, 235 | input_length=self.preprocessor.max_sent_len)(x) 236 | # pool 237 | one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 238 | flattened = Flatten()(one_max) 239 | convolutions.append(flattened) 240 | 241 | sentence_vector = merge(convolutions, name="sentence_vector") # hang on to this layer! 242 | output = Dense(3, activation="softmax")(sentence_vector) 243 | 244 | self.sentence_model = Model(input=tokens_input, output=output) 245 | print("model built") 246 | print(self.sentence_model.summary()) 247 | self.sentence_model.compile(loss='categorical_crossentropy', optimizer="adam") 248 | 249 | self.sentence_embedding_dim = self.sentence_model.layers[-2].output_shape[1] 250 | 251 | return self.sentence_model 252 | 253 | 254 | 255 | 256 | def build_doc_model_fixed(self): 257 | # no magic here. 
258 | #input_layer = Dense(1, batch_input_shape=(None, self.sentence_embedding_dim))#input_shape=(self.sentence_embedding_dim, )) 259 | #output_layer = Activation('sigmoid')(input_layer) 260 | 261 | self.document_model = Sequential() 262 | self.document_model.add(Dense(1, input_dim=self.sentence_embedding_dim)) 263 | self.document_model.add(Activation("sigmoid")) 264 | 265 | #self.document_model = Model(input=tokens_input, output=output) 266 | self.document_model.compile(loss='binary_crossentropy', optimizer="adam") 267 | 268 | 269 | def train_doc_model_fixed(self, train_documents): 270 | conv_f = K.function( 271 | [self.sentence_model.layers[0].input, K.learning_phase()], 272 | [self.sentence_model.layers[-2].output]) 273 | 274 | X, y = [], [] 275 | 276 | for d in train_documents: 277 | sentence_vectors = np.matrix([conv_f([np.matrix(sent_seq),1])[0][0] for 278 | sent_seq in d.sentence_sequences]) 279 | 280 | #sentence_predictions = self.sentence_model.predict(d.sentence_sequences) 281 | sentence_predictions = self.sentence_model.predict(d.sentence_sequences) 282 | weights = np.amax(sentence_predictions[:,0:2],axis=1) 283 | weighted = np.dot(weights, sentence_vectors) 284 | X.append(weighted) 285 | y.append(d.doc_y) 286 | #train_sequences = 287 | 288 | X = np.vstack(X) 289 | y = np.array(y) 290 | #import pdb; pdb.set_trace() 291 | self.document_model.fit(X, y) 292 | 293 | 294 | #return np.matrix(np.dot(weights, vecs)) 295 | 296 | def train_document_model(self, train_documents, 297 | nb_epoch=5, downsample=True, 298 | batch_size=128, optimizer='adam'): 299 | # assumes sentence sequences have been generated! 300 | assert(train_documents[0].sentence_sequences is not None) 301 | 302 | X, y= [], [] 303 | # flatten sentences/sentence labels 304 | for d in train_documents: 305 | X.extend(d.sentence_sequences) 306 | y.extend(d.sentences_y) 307 | 308 | # @TODO sub-sample magic? 309 | X, y = np.asarray(X), np.asarray(y) 310 | 311 | # downsample 312 | if downsample: 313 | X, y = RationaleCNN.balanced_sample(X, y) 314 | 315 | #self.train(X[:1000], y[:1000]) 316 | self.train(X, y) 317 | 318 | self.sentence_model_trained = True 319 | 320 | 321 | def build_doc_model_concat(self): 322 | # the idea is here is to concatenate the sentence inputs; so represent each document 323 | # by one very long row 324 | doc_len = self.preprocessor.max_sent_len * self.preprocessor.max_doc_len 325 | tokens_input = Input(name='input', 326 | shape=(doc_len,), dtype='int32') 327 | 328 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 329 | input_length=doc_len, 330 | weights=self.preprocessor.init_vectors)(tokens_input) 331 | 332 | 333 | def build_sequential_doc_model(self): 334 | #self.document_model = Sequential() 335 | m = Sequential() 336 | 337 | # input layer. 
this is a matrix with dimensions: 338 | # (max_doc_length x max_sent_length) 339 | # 340 | m.add(Dense(100, input_shape=(p.max_sent_len,))) 341 | 342 | #pass 343 | 344 | def build_doc_model3(self): 345 | model = Sequential() 346 | 347 | # 32 is just n_filters; 1 is n_gram 348 | nb_feature_maps = n_filters = 32 349 | 350 | maxlen = self.preprocessor.max_sent_len 351 | 352 | conv_filters = [] 353 | for n_gram in self.ngram_filters: 354 | sequential = Sequential() 355 | conv_filters.append(sequential) 356 | 357 | sequential.add(Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims)) 358 | sequential.add(Reshape(1, maxlen, self.preprocessor.embedding_dims)) 359 | sequential.add(Convolution2D(nb_feature_maps, 1, n_gram, self.preprocessor.embedding_dims)) 360 | sequential.add(Activation("relu")) 361 | sequential.add(MaxPooling2D(poolsize=(maxlen - n_gram + 1, 1))) 362 | sequential.add(Flatten()) 363 | 364 | model = Sequential() 365 | model.add(Merge(conv_filters, mode='concat')) 366 | model.add(Dropout(0.5)) 367 | model.add(Dense(nb_feature_maps * len(conv_filters), 1)) 368 | model.add(Activation("sigmoid")) 369 | 370 | ''' 371 | convolutions = [] 372 | for n_gram in self.ngram_filters: 373 | cur_conv = Convolution2D(n_filters, 1, n_gram, 374 | input_shape=(1, p.max_doc_len, p.max_sent_len), 375 | activation='relu', border_mode='valid') 376 | 377 | #Convolution1D(nb_filter=self.nb_filter, 378 | # filter_length=n_gram, 379 | # border_mode='valid', 380 | # activation='relu', 381 | # subsample_length=1, 382 | # input_dim=self.preprocessor.embedding_dims, 383 | # input_length=self.preprocessor.max_sent_len)(x) 384 | # pool 385 | one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 386 | flattened = Flatten()(one_max) 387 | convolutions.append(flattened) 388 | 389 | ''' 390 | 391 | #model.add( 392 | # Convolution2D(n_filters, 1, n_gram, 393 | # input_shape=(1, p.max_doc_len, p.max_sent_len)) 394 | 395 | # get vectors for each sentence 396 | #MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1) 397 | 398 | 399 | 400 | #one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 401 | 402 | ''' 403 | document_input = Input(name='input', 404 | shape=(None, self.preprocessor.max_doc_len, 405 | self.preprocessor.max_sent_len), dtype='int32') 406 | 407 | # filter, nb_rows, nb_cols 408 | n_gram = 1 409 | cur_conv = Convolution2D(32, 410 | n_gram, self.preprocessor.embedding_dims, 411 | 412 | activation='relu', 413 | # samples, channels, rows, cols 414 | input_shape=(1, 415 | self.preprocessor.max_doc_len, 416 | self.preprocessor.embedding_dims, 417 | ))(document_input) 418 | ''' 419 | def build_doc_model2(self): 420 | document_input = Input(name='input', 421 | shape=(self.preprocessor.max_doc_len, 422 | self.preprocessor.max_sent_len,), dtype='int32') 423 | 424 | document_vector = WeightedSumSentenceVector(self.sentence_model)(document_input) 425 | 426 | # sentence_vectors = 427 | 428 | # 429 | #conv_f = K.function([self.sentence_model.layers[0].input, K.learning_phase()], 430 | # [self.sentence_model.layers[-2].output]) 431 | # test_sent.shape 432 | # (1,50) ### this is the list of token indices! 433 | # sentence_v = conv_f([test_sent,1])[0] 434 | 435 | ''' 436 | Re-construct the (start of) the *sentence* level model, which operates over, erm, sentences. 437 | The task is to predict which sentences are pos/neg rationales. 
438 | ''' 439 | # 440 | 441 | 442 | 443 | ''' 444 | tokens_input = Input(name='input', shape=(self.preprocessor.max_sent_len,), dtype='int32') 445 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 446 | input_length=self.preprocessor.max_sent_len, 447 | weights=self.preprocessor.init_vectors)(tokens_input) 448 | 449 | x = Dropout(0.1)(x) 450 | 451 | convolutions = [] 452 | for n_gram in self.ngram_filters: 453 | cur_conv = Convolution1D(nb_filter=self.nb_filter, 454 | filter_length=n_gram, 455 | border_mode='valid', 456 | activation='relu', 457 | subsample_length=1, 458 | input_dim=self.preprocessor.embedding_dims, 459 | input_length=self.preprocessor.max_sent_len)(x) 460 | # pool 461 | one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 462 | flattened = Flatten()(one_max) 463 | convolutions.append(flattened) 464 | 465 | sentence_vector = merge(convolutions, name="sentence_vector") # hang on to this layer! 466 | ''' 467 | 468 | # ok initialize each layer with parameters! 469 | 470 | 471 | ### 472 | # 473 | ''' 474 | output = Dense(3, activation="softmax")(self.penultimate_layer) 475 | 476 | self.sentence_model = Model(input=tokens_input, output=output) 477 | ''' 478 | 479 | 480 | ''' 481 | In [137]: model.summary() 482 | ____________________________________________________________________________________________________ 483 | Layer (type) Output Shape Param # Connected to 484 | ==================================================================================================== 485 | input (InputLayer) (None, 500, 50) 0 486 | ____________________________________________________________________________________________________ 487 | reshape_16 (Reshape) (None, 25000) 0 input[0][0] 488 | ____________________________________________________________________________________________________ 489 | embedding_12 (Embedding) (None, 25000, 200) 2000000 reshape_16[0][0] 490 | ____________________________________________________________________________________________________ 491 | reshape_17 (Reshape) (None, 500, 10000) 0 embedding_12[0][0] 492 | ____________________________________________________________________________________________________ 493 | reshape_18 (Reshape) (None, 1, 500, 100000 reshape_17[0][0] 494 | ____________________________________________________________________________________________________ 495 | convolution2d_4 (Convolution2D) (None, 32, 500, 50) 6432 reshape_18[0][0] 496 | ____________________________________________________________________________________________________ 497 | maxpooling2d_1 (MaxPooling2D) (None, 32, 500, 1) 0 convolution2d_4[0][0] 498 | ____________________________________________________________________________________________________ 499 | permute_2 (Permute) (None, 1, 500, 32) 0 maxpooling2d_1[0][0] 500 | ____________________________________________________________________________________________________ 501 | reshape_19 (Reshape) (None, 500, 32) 0 permute_2[0][0] 502 | ===================================================================================== 503 | ''' 504 | def build_doc_model_clean(self, n_filters=32): 505 | # input dim is (max_doc_len x max_sent_len) -- eliding the batch size 506 | tokens_input = Input(name='input', 507 | shape=(self.preprocessor.max_doc_len, self.preprocessor.max_sent_len), 508 | dtype='int32') 509 | # flatten; create a very wide matrix to hand to embedding layer 510 | tokens_reshaped = 
Reshape([self.preprocessor.max_doc_len*self.preprocessor.max_sent_len])(tokens_input)
511 |         # embed the tokens; output will be (p.max_doc_len*p.max_sent_len x embedding_dims)
512 |         x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 
513 |                       weights=self.preprocessor.init_vectors)(tokens_reshaped)
514 | 
515 |         # reshape to preserve document structure; each doc will now be
516 |         # a row in this matrix
517 |         x = Reshape((1, self.preprocessor.max_doc_len, 
518 |                      self.preprocessor.max_sent_len*self.preprocessor.embedding_dims))(x)
519 | 
520 |         #x = Reshape((1, p.max_doc_len, p.max_sent_len*p.embedding_dims))(x)
521 | 
522 |         x = Dropout(0.1)(x)
523 | 
524 |         ####
525 |         # @TODO wrap in loop to include all n_grams!
526 |         n_gram = 1 # tmp
527 | 
528 | 
529 |         cur_conv = Convolution2D(n_filters, 1, 
530 |                                  n_gram*self.preprocessor.embedding_dims, 
531 |                                  subsample=(1, self.preprocessor.embedding_dims))(x)
532 |         # model = Model(input=tokens_input, output=cur_conv)
533 | 
534 |         # this outputs (n_filters x max_doc_len x 1)
535 |         one_max = MaxPooling2D(pool_size=(1, self.preprocessor.max_sent_len - n_gram + 1))(cur_conv)
536 |         # flip around, to get (1 x max_doc_len x n_filters)
537 |         permuted = Permute((3,2,1)) (one_max)
538 |         # drop extra dimension
539 |         r = Reshape((self.preprocessor.max_doc_len, n_filters))(permuted)
540 |         # now we want to average the sentence vectors!
541 |         x_doc = Lambda(RationaleCNN.weighted_sum, 
542 |                        output_shape=RationaleCNN.weighted_sum_output_shape)(r)
543 | 
544 |         # finally, the sigmoid layer for classification
545 |         y_hat = Dense(1, activation="sigmoid")(x_doc)
546 |         model = Model(input=tokens_input, output=y_hat)
547 |         return model
548 |         #model.summary()
549 | 
550 |     def build_doc_model(self):
551 |         '''
552 |         Builds the *document* level model, which uses the sentence level model to inform
553 |         its predictions.
554 | ''' 555 | #tokens_input = Input(name='input', shape=(None, 556 | # self.preprocessor.max_doc_len, 557 | # self.preprocessor.max_sent_len), dtype='int32') 558 | tokens_input = Input(name='input', shape=(p.max_doc_len, p.max_sent_len), dtype='int32') 559 | 560 | tokens_reshaped = Reshape([p.max_doc_len*p.max_sent_len])(tokens_input) 561 | 562 | x = Embedding(p.max_features, p.embedding_dims, weights=p.init_vectors)(tokens_reshaped) 563 | #tokens_reshaped = Reshape((self.preprocessor.max_doc_len, 564 | # self.preprocessor.max_sent_len*self.preprocessor.embedding_dims))(tokens_input) 565 | 566 | # so this will be (max_doc_len, max_sent_len, wv_size), i think 567 | #x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 568 | # weights=self.preprocessor.init_vectors)(tokens_input) 569 | #input_length=self.preprocessor.max_sent_len, 570 | #weights=self.preprocessor.init_vectors)(tokens_input) 571 | 572 | x = Reshape((p.max_doc_len, p.max_sent_len*p.embedding_dims))(x) 573 | x = Dropout(0.1)(x) 574 | 575 | # (max_doc_len, max_sent_len, wv_size) -> (max_doc_len, max_sent_len * wv_size) 576 | #r = Reshape(self.preprocessor.max_doc_len, 577 | # self.preprocessor.max_sent_len * self.preprocessor.embedding_dims)(x) 578 | convolutions = [] 579 | for n_gram in self.ngram_filters: 580 | #cur_conv = Convolution1D(nb_filter=self.nb_filter, filter_length=n_gram) 581 | 582 | ''' 583 | # filter, nb_rows, nb_cols 584 | cur_conv = Convolution2D(self.nb_filter, 585 | 1, self.preprocessor.embedding_dims, 586 | filter_length=n_gram, 587 | activation='relu', 588 | input_dim=self.preprocessor.embedding_dims, 589 | input_length=self.preprocessor.max_sent_len)(x) 590 | ''' 591 | 592 | # cur_conv = Convolution2D(32, p.embedding_dims, n_gram, input_shape=(1, p.embedding_dims, p.max_sent_len))(x) 593 | cur_conv = Convolution1D(nb_filter=self.nb_filter, 594 | filter_length=n_gram, 595 | border_mode='valid', 596 | activation='relu', 597 | subsample_length=1, 598 | input_dim=self.preprocessor.embedding_dims, 599 | input_length=self.preprocessor.max_sent_len)(x) 600 | 601 | 602 | # pool 603 | #one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 604 | one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 605 | flattened = Flatten()(one_max) 606 | convolutions.append(flattened) 607 | 608 | penultimate_layer = merge(convolutions) 609 | 610 | output = Dense(1, activation="sigmoid")(penultimate_layer) 611 | 612 | self.document_model = Model(input=tokens_input, output=output) 613 | 614 | print(self.document_model.summary()) 615 | self.document_model.compile(loss='binary_crossentropy', optimizer="adam") 616 | 617 | return self.document_model 618 | 619 | 620 | ''' 621 | self.model = Graph() 622 | self.model.add_input(name='input', input_shape=(self.preprocessor.maxlen,), dtype=int) 623 | 624 | self.model.add_node(Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 625 | input_length=self.preprocessor.maxlen, weights=self.preprocessor.init_vectors), 626 | name='embedding', input='input') 627 | self.model.add_node(Dropout(0.), name='dropout_embedding', input='embedding') 628 | for n_gram in self.ngram_filters: 629 | self.model.add_node(Convolution1D(nb_filter=self.nb_filter, 630 | filter_length=n_gram, 631 | border_mode='valid', 632 | activation='relu', 633 | subsample_length=1, 634 | input_dim=self.preprocessor.embedding_dims, 635 | input_length=self.preprocessor.maxlen), 636 | name='conv_' + str(n_gram), 
637 | input='dropout_embedding') 638 | self.model.add_node(MaxPooling1D(pool_length=self.preprocessor.maxlen - n_gram + 1), 639 | name='maxpool_' + str(n_gram), 640 | input='conv_' + str(n_gram)) 641 | self.model.add_node(Flatten(), 642 | name='flat_' + str(n_gram), 643 | input='maxpool_' + str(n_gram)) 644 | self.model.add_node(Dropout(self.dropout), name='dropout', inputs=['flat_' + str(n) for n in self.ngram_filters]) 645 | self.model.add_node(Dense(1, input_dim=self.nb_filter * len(self.ngram_filters)), 646 | name='dense', input='dropout') 647 | self.model.add_node(Activation('sigmoid'), name='sigmoid', input='dense') 648 | self.model.add_output(name='output', input='sigmoid') 649 | print("model built") 650 | print(self.model.summary()) 651 | self.model.compile(loss={'output': 'binary_crossentropy'}, 652 | optimizer="adam")#optimizer) 653 | ''' 654 | 655 | class Document: 656 | def __init__(self, doc_id, sentences, doc_label=None, sentences_labels=None): 657 | self.doc_id = doc_id 658 | self.doc_y = doc_label 659 | 660 | self.sentences = sentences 661 | self.sentence_sequences = None 662 | 663 | self.sentences_y = sentences_labels 664 | 665 | self.sentence_idx = 0 666 | self.n = len(self.sentences) 667 | 668 | 669 | def __len__(self): 670 | return self.n 671 | 672 | def generate_sequences(self, p): 673 | ''' 674 | p is a preprocessor that has been instantiated 675 | elsewhere! this will be used to map sentences to 676 | integer sequences here. 677 | ''' 678 | self.sentence_sequences = p.build_sequences(self.sentences) 679 | 680 | 681 | 682 | ''' 683 | def __iter__(self): 684 | return self 685 | 686 | def next(self): 687 | if self.sentence_idx < self.n: 688 | cur_sentence_idx = self.sentence_idx 689 | self.sentence_idx += 1 690 | 691 | return self.sentences[cur_sentence_idx] 692 | else: 693 | raise StopIteration() 694 | 695 | ''' 696 | 697 | 698 | class Preprocessor: 699 | def __init__(self, max_features, max_sent_len, embedding_dims=200, wvs=None, max_doc_len=500): 700 | ''' 701 | max_features: the upper bound to be placed on the vocabulary size. 702 | max_sent_len: the maximum length (in terms of tokens) of the instances/texts. 703 | embedding_dims: size of the token embeddings; over-ridden if pre-trained 704 | vectors is provided (if wvs is not None). 705 | ''' 706 | 707 | self.max_features = max_features 708 | self.tokenizer = Tokenizer(nb_words=self.max_features) 709 | self.max_sent_len = max_sent_len # the max sentence length! @TODO rename; this is confusing. 710 | self.max_doc_len = max_doc_len # w.r.t. number of sentences! 711 | 712 | self.use_pretrained_embeddings = False 713 | self.init_vectors = None 714 | if wvs is None: 715 | self.embedding_dims = embedding_dims 716 | else: 717 | # note that these are only for initialization; 718 | # they will be tuned! 719 | self.use_pretrained_embeddings = True 720 | self.embedding_dims = wvs.vector_size 721 | self.word_embeddings = wvs 722 | 723 | 724 | def preprocess(self, all_docs): 725 | ''' 726 | This fits tokenizer and builds up input vectors (X) from the list 727 | of texts in all_texts. Needs to be called before train! 728 | ''' 729 | self.raw_texts = all_docs 730 | #self.build_sequences() 731 | self.fit_tokenizer() 732 | if self.use_pretrained_embeddings: 733 | self.init_word_vectors() 734 | 735 | 736 | def fit_tokenizer(self): 737 | ''' Fits tokenizer to all raw texts; remembers indices->words mappings. 
''' 738 | self.tokenizer.fit_on_texts(self.raw_texts) 739 | self.word_indices_to_words = {} 740 | for token, idx in self.tokenizer.word_index.items(): 741 | self.word_indices_to_words[idx] = token 742 | 743 | def build_sequences(self, texts): 744 | X = list(self.tokenizer.texts_to_sequences_generator(texts)) 745 | X = np.array(pad_sequences(X, maxlen=self.max_sent_len)) 746 | return X 747 | 748 | def init_word_vectors(self): 749 | ''' 750 | Initialize word vectors. 751 | ''' 752 | self.init_vectors = [] 753 | unknown_words_to_vecs = {} 754 | for t, token_idx in self.tokenizer.word_index.items(): 755 | if token_idx <= self.max_features: 756 | try: 757 | self.init_vectors.append(self.word_embeddings[t]) 758 | except: 759 | if t not in unknown_words_to_vecs: 760 | # randomly initialize 761 | unknown_words_to_vecs[t] = np.random.random( 762 | self.embedding_dims)*-2 + 1 763 | 764 | self.init_vectors.append(unknown_words_to_vecs[t]) 765 | 766 | # note that we make this a singleton list because that's 767 | # what Keras wants. 768 | self.init_vectors = [np.vstack(self.init_vectors)] 769 | 770 | -------------------------------------------------------------------------------- /rationale_CNN_2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @authors Byron Wallace, Edward Banner, Ye Zhang 3 | 4 | A Keras implementation of our "rationale augmented CNN" (https://arxiv.org/abs/1605.04469). Please note that 5 | the model was originally implemented in Theano -- this version is a work in progress. 6 | 7 | Credit for initial pass of basic CNN implementation to: Cheng Guo (https://gist.github.com/entron). 8 | 9 | References 10 | -- 11 | Ye Zhang, Iain J. Marshall and Byron C. Wallace. "Rationale-Augmented Convolutional Neural Networks for Text Classification". http://arxiv.org/abs/1605.04469 12 | Yoon Kim. "Convolutional Neural Networks for Sentence Classification". EMNLP 2014. 13 | Ye Zhang and Byron Wallace. "A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification". http://arxiv.org/abs/1510.03820. 
14 | & also: http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/ 15 | ''' 16 | 17 | from __future__ import print_function 18 | import pdb 19 | import sys 20 | import random 21 | reload(sys) 22 | sys.setdefaultencoding('utf8') 23 | 24 | import numpy as np 25 | 26 | from keras import backend as K 27 | from keras.models import Graph, Model, Sequential 28 | from keras.preprocessing import sequence 29 | from keras.engine.topology import Layer 30 | from keras.preprocessing.sequence import pad_sequences 31 | from keras.layers import Input, Embedding, Dense, merge 32 | from keras.layers.core import Dense, Dropout, Activation, Flatten, Merge, Reshape, Permute, Lambda 33 | from keras.layers.embeddings import Embedding 34 | from keras.layers.convolutional import Convolution1D, Convolution2D, MaxPooling1D, MaxPooling2D 35 | from keras.utils.np_utils import accuracy 36 | from keras.preprocessing.text import text_to_word_sequence, Tokenizer 37 | from keras.callbacks import ModelCheckpoint 38 | 39 | 40 | 41 | class RationaleCNN: 42 | 43 | def __init__(self, preprocessor, filters=None, n_filters=32, dropout=0.0): 44 | ''' 45 | parameters 46 | --- 47 | preprocessor: an instance of the Preprocessor class, defined below 48 | ''' 49 | self.preprocessor = preprocessor 50 | 51 | if filters is None: 52 | self.ngram_filters = [3, 4, 5] 53 | else: 54 | self.ngram_filters = filters 55 | 56 | self.n_filters = n_filters 57 | self.dropout = dropout 58 | self.sentence_model_trained = False 59 | 60 | @staticmethod 61 | def weighted_sum(X): 62 | # @TODO.. add sentence preds! 63 | return K.sum(X, axis=0) 64 | 65 | @staticmethod 66 | def weighted_sum_output_shape(input_shape): 67 | # expects something like (None, max_doc_len, num_features) 68 | # returns (1 x num_features) 69 | shape = list(input_shape) 70 | return tuple((1, shape[-1])) 71 | 72 | @staticmethod 73 | def balanced_sample(X, y): 74 | _, pos_rationale_indices = np.where([y[:,0] > 0]) 75 | _, neg_rationale_indices = np.where([y[:,1] > 0]) 76 | _, non_rationale_indices = np.where([y[:,2] > 0]) 77 | 78 | # sample a number of non-rationales equal to the total 79 | # number of pos/neg rationales. 
80 | m = pos_rationale_indices.shape[0] + neg_rationale_indices.shape[0] 81 | sampled_non_rationale_indices = np.array(random.sample(non_rationale_indices, m)) 82 | 83 | train_indices = np.concatenate([pos_rationale_indices, neg_rationale_indices, sampled_non_rationale_indices]) 84 | np.random.shuffle(train_indices) # why not 85 | return X[train_indices,:], y[train_indices] 86 | 87 | def get_conv_layers_from_sentence_model(): 88 | layers_to_weights = {} 89 | for ngram in self.ngram_filters: 90 | layer_name = "conv_" + str(ngram) 91 | cur_conv_layer = self.sentence_model.get_layer(layer_name) 92 | weights, biases = cur_conv_layer.get_weights() 93 | 94 | # here it gets tricky because we need 95 | # so, e.g., (32 x 200 x 3 x 1) -> (32 x 3 x 200 x 1) 96 | # we do this because reshape by default iterates over 97 | # the last dimension fastest 98 | # swapped = np.swapaxes(X, 1, 2) 99 | # Xp = swapped.reshape(32, 1, 1, 600) 100 | 101 | def build_doc_model(self): 102 | #assert self.sentence_model_trained 103 | 104 | # input dim is (max_doc_len x max_sent_len) -- eliding the batch size 105 | tokens_input = Input(name='input', 106 | shape=(self.preprocessor.max_doc_len, self.preprocessor.max_sent_len), 107 | dtype='int32') 108 | # flatten; create a very wide matrix to hand to embedding layer 109 | tokens_reshaped = Reshape([self.preprocessor.max_doc_len*self.preprocessor.max_sent_len])(tokens_input) 110 | # embed the tokens; output will be (p.max_doc_len*p.max_sent_len x embedding_dims) 111 | # here we should initialize with weights from sentence model embedding layer! 112 | 113 | 114 | ### 115 | # getting weights for initialization 116 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 117 | weights=self.sentence_model.get_layer("embedding").get_weights(), 118 | #weights=self.preprocessor.init_vectors, 119 | name="embedding")(tokens_reshaped) 120 | 121 | # reshape to preserve document structure; each doc will now be a 122 | # a row in this matrix 123 | x = Reshape((1, self.preprocessor.max_doc_len, 124 | self.preprocessor.max_sent_len*self.preprocessor.embedding_dims), 125 | name="reshape")(x) 126 | 127 | x = Dropout(0.1, name="dropout")(x) 128 | 129 | convolutions = [] 130 | for n_gram in self.ngram_filters: 131 | 132 | 133 | #import pdb; pdb.set_trace() 134 | 135 | ### here is where we pull out weights 136 | layer_name = "conv_" + str(n_gram) 137 | cur_conv_layer = self.sentence_model.get_layer(layer_name) 138 | weights, biases = cur_conv_layer.get_weights() 139 | # here it gets a bit tricky; we need dims 140 | # (nb_filters x 1 x 1 x (n_gram*embedding_dim)) 141 | # for 2d conv; our 1d conv model, though, will have 142 | # (nb_filters x embedding_dim x n_gram x 1) 143 | # need to reshape this. but first need to swap around 144 | # axes due to how reshape works (it iterates over last 145 | # dimension first). 
in particular, e.g.,:
146 |             # (32 x 200 x 3 x 1) -> (32 x 3 x 200 x 1)
147 |             # swapped = np.swapaxes(X, 1, 2)
148 |             swapped_weights = np.swapaxes(weights, 1, 2)
149 |             init_weights = swapped_weights.reshape(self.n_filters, 
150 |                                                    1, 1, n_gram*self.preprocessor.embedding_dims)
151 | 
152 |             cur_conv = Convolution2D(self.n_filters, 1, 
153 |                                      n_gram*self.preprocessor.embedding_dims, 
154 |                                      subsample=(1, self.preprocessor.embedding_dims),
155 |                                      name="conv2d_"+str(n_gram),
156 |                                      weights=[init_weights, biases])(x)
157 | 
158 |             # this outputs (n_filters x max_doc_len x 1)
159 |             one_max = MaxPooling2D(pool_size=(1, self.preprocessor.max_sent_len - n_gram + 1), 
160 |                                    name="pooling_"+str(n_gram))(cur_conv)
161 | 
162 |             # flip around, to get (1 x max_doc_len x n_filters)
163 |             permuted = Permute((3,2,1), name="permute_"+str(n_gram)) (one_max)
164 | 
165 |             # drop extra dimension
166 |             r = Reshape((self.preprocessor.max_doc_len, self.n_filters), 
167 |                         name="conv_"+str(n_gram))(permuted)
168 | 
169 |             convolutions.append(r)
170 | 
171 |         # merge the filter size convolutions
172 |         r = merge(convolutions, name="sentence_vectors")
173 | 
174 |         # now we take a weighted sum of the sentence vectors
175 |         # to induce a document representation
176 |         x_doc = Lambda(RationaleCNN.weighted_sum, 
177 |                        output_shape=RationaleCNN.weighted_sum_output_shape,
178 |                        name="weighted_doc_vector")(r)
179 | 
180 |         # finally, the sigmoid layer for classification
181 |         y_hat = Dense(1, activation="sigmoid", name="document_prediction")(x_doc)
182 |         model = Model(input=tokens_input, output=y_hat)
183 |         return model
184 | 
185 | 
186 |     def train(self, X_train, y_train, X_val=None, y_val=None, 
187 |               nb_epoch=5, batch_size=32, optimizer='adam'):
188 |         '''
189 |         Accepts an X matrix (presumably some slice of self.X) and corresponding
190 |         vector of labels. May want to revisit this. 
191 | 
192 |         X_val and y_val are to be used to validate during training.
193 | ''' 194 | checkpointer = ModelCheckpoint(filepath="weights.hdf5", 195 | verbose=1, 196 | save_best_only=(X_val is not None)) 197 | 198 | if X_val is not None: 199 | self.sentence_model.fit(X_train, y_train, 200 | batch_size=batch_size, nb_epoch=nb_epoch, 201 | validation_data=(X_val, y_val), 202 | verbose=2, callbacks=[checkpointer]) 203 | else: 204 | # no validation provided 205 | self.sentence_model.fit(X_train, y_train, 206 | batch_size=batch_size, nb_epoch=nb_epoch, 207 | verbose=2, callbacks=[checkpointer]) 208 | 209 | 210 | ''' 211 | def build_sentence_model(self): 212 | 213 | # input dim is (max_doc_len x max_sent_len) -- eliding the batch size 214 | tokens_input = Input(name='input', 215 | shape=(self.preprocessor.max_sent_len,), 216 | dtype='int32') 217 | 218 | # embed the tokens; output will be (p.max_doc_len*p.max_sent_len x embedding_dims) 219 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 220 | weights=self.preprocessor.init_vectors, name="embedding")(tokens_input) 221 | 222 | x = Dropout(0.1, name="dropout")(x) 223 | 224 | convolutions = [] 225 | for n_gram in self.ngram_filters: 226 | cur_conv = Convolution2D(self.n_filters, 1, 227 | n_gram*self.preprocessor.embedding_dims, 228 | subsample=(1, self.preprocessor.embedding_dims), 229 | name="conv2d_"+str(n_gram))(x) 230 | 231 | # this output (n_filters x max_doc_len x 1) 232 | one_max = MaxPooling2D(pool_size=(1, self.preprocessor.max_sent_len - n_gram + 1), 233 | name="pooling_"+str(n_gram))(cur_conv) 234 | 235 | # flip around, to get (1 x max_doc_len x n_filters) 236 | permuted = Permute((3,2,1), name="permute_"+str(n_gram)) (one_max) 237 | 238 | # drop extra dimension 239 | r = Reshape((self.preprocessor.max_doc_len, self.n_filters), 240 | name="conv_"+str(n_gram))(permuted) 241 | 242 | convolutions.append(r) 243 | 244 | # merge the filter size convolutions 245 | r = merge(convolutions, name="sentence_vectors") 246 | 247 | # now the classification layer... 248 | ''' 249 | 250 | def build_sentence_model(self): 251 | ''' 252 | Build the *sentence* level model, which operates over, erm, sentences. 253 | The task is to predict which sentences are pos/neg rationales. 254 | ''' 255 | tokens_input = Input(name='input', shape=(self.preprocessor.max_sent_len,), dtype='int32') 256 | x = Embedding(self.preprocessor.max_features, self.preprocessor.embedding_dims, 257 | name="embedding", 258 | input_length=self.preprocessor.max_sent_len, 259 | weights=self.preprocessor.init_vectors)(tokens_input) 260 | 261 | x = Dropout(0.1)(x) 262 | 263 | convolutions = [] 264 | for n_gram in self.ngram_filters: 265 | cur_conv = Convolution1D(name="conv_" + str(n_gram), 266 | nb_filter=self.n_filters, 267 | filter_length=n_gram, 268 | border_mode='valid', 269 | activation='relu', 270 | subsample_length=1, 271 | input_dim=self.preprocessor.embedding_dims, 272 | input_length=self.preprocessor.max_sent_len)(x) 273 | # pool 274 | one_max = MaxPooling1D(pool_length=self.preprocessor.max_sent_len - n_gram + 1)(cur_conv) 275 | flattened = Flatten()(one_max) 276 | convolutions.append(flattened) 277 | 278 | sentence_vector = merge(convolutions, name="sentence_vector") # hang on to this layer! 
279 | output = Dense(3, activation="softmax", name="sentence_prediction")(sentence_vector) 280 | 281 | self.sentence_model = Model(input=tokens_input, output=output) 282 | print("model built") 283 | print(self.sentence_model.summary()) 284 | self.sentence_model.compile(loss='categorical_crossentropy', optimizer="adam") 285 | 286 | self.sentence_embedding_dim = self.sentence_model.layers[-2].output_shape[1] 287 | 288 | return self.sentence_model 289 | 290 | 291 | def train_sentence_model(self, train_documents, nb_epoch=5, downsample=True, 292 | batch_size=128, optimizer='adam'): 293 | # assumes sentence sequences have been generated! 294 | assert(train_documents[0].sentence_sequences is not None) 295 | 296 | X, y= [], [] 297 | # flatten sentences/sentence labels 298 | for d in train_documents: 299 | X.extend(d.sentence_sequences) 300 | y.extend(d.sentences_y) 301 | 302 | # @TODO sub-sample magic? 303 | X, y = np.asarray(X), np.asarray(y) 304 | 305 | # downsample 306 | if downsample: 307 | X, y = RationaleCNN.balanced_sample(X, y) 308 | 309 | #self.train(X[:1000], y[:1000]) 310 | self.train(X, y) 311 | 312 | self.sentence_model_trained = True 313 | 314 | 315 | class Preprocessor: 316 | def __init__(self, max_features, max_sent_len, embedding_dims=200, wvs=None, max_doc_len=500): 317 | ''' 318 | max_features: the upper bound to be placed on the vocabulary size. 319 | max_sent_len: the maximum length (in terms of tokens) of the instances/texts. 320 | embedding_dims: size of the token embeddings; over-ridden if pre-trained 321 | vectors is provided (if wvs is not None). 322 | ''' 323 | 324 | self.max_features = max_features 325 | self.tokenizer = Tokenizer(nb_words=self.max_features) 326 | self.max_sent_len = max_sent_len # the max sentence length! @TODO rename; this is confusing. 327 | self.max_doc_len = max_doc_len # w.r.t. number of sentences! 328 | 329 | self.use_pretrained_embeddings = False 330 | self.init_vectors = None 331 | if wvs is None: 332 | self.embedding_dims = embedding_dims 333 | else: 334 | # note that these are only for initialization; 335 | # they will be tuned! 336 | self.use_pretrained_embeddings = True 337 | self.embedding_dims = wvs.vector_size 338 | self.word_embeddings = wvs 339 | 340 | 341 | def preprocess(self, all_docs): 342 | ''' 343 | This fits tokenizer and builds up input vectors (X) from the list 344 | of texts in all_texts. Needs to be called before train! 345 | ''' 346 | self.raw_texts = all_docs 347 | #self.build_sequences() 348 | self.fit_tokenizer() 349 | if self.use_pretrained_embeddings: 350 | self.init_word_vectors() 351 | 352 | 353 | def fit_tokenizer(self): 354 | ''' Fits tokenizer to all raw texts; remembers indices->words mappings. ''' 355 | self.tokenizer.fit_on_texts(self.raw_texts) 356 | self.word_indices_to_words = {} 357 | for token, idx in self.tokenizer.word_index.items(): 358 | self.word_indices_to_words[idx] = token 359 | 360 | def build_sequences(self, texts): 361 | X = list(self.tokenizer.texts_to_sequences_generator(texts)) 362 | X = np.array(pad_sequences(X, maxlen=self.max_sent_len)) 363 | return X 364 | 365 | def init_word_vectors(self): 366 | ''' 367 | Initialize word vectors. 
368 | ''' 369 | self.init_vectors = [] 370 | unknown_words_to_vecs = {} 371 | for t, token_idx in self.tokenizer.word_index.items(): 372 | if token_idx <= self.max_features: 373 | try: 374 | self.init_vectors.append(self.word_embeddings[t]) 375 | except: 376 | if t not in unknown_words_to_vecs: 377 | # randomly initialize 378 | unknown_words_to_vecs[t] = np.random.random( 379 | self.embedding_dims)*-2 + 1 380 | 381 | self.init_vectors.append(unknown_words_to_vecs[t]) 382 | 383 | # note that we make this a singleton list because that's 384 | # what Keras wants. 385 | self.init_vectors = [np.vstack(self.init_vectors)] 386 | --------------------------------------------------------------------------------
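Usage note (not one of the repository files above): the sketch below shows one way to wire together the Preprocessor, Document, and RationaleCNN classes from rationale_CNN.py to fit the sentence-level rationale model. It assumes the legacy environment these modules target (Python 2, Keras 1.x) and uses a tiny hypothetical document with made-up sentence labels purely for illustration; real use would pass pre-trained word vectors (e.g., via load_trained_w2v_model in RoB_CNN_redux.py) and many labeled documents.

import numpy as np
import rationale_CNN

# hypothetical toy data: one document, three sentences, with one-hot
# per-sentence labels over {positive rationale, negative rationale, neither}
sentences = ["patients were randomized via a computer generated sequence",
             "the trial enrolled 120 participants",
             "baseline characteristics were similar between groups"]
sentence_labels = np.array([[1, 0, 0],
                            [0, 0, 1],
                            [0, 0, 1]])

# fit the tokenizer; no wvs given here, so embeddings are randomly
# initialized at the default embedding_dims and tuned during training
p = rationale_CNN.Preprocessor(max_features=10000, max_sent_len=25, max_doc_len=50)
p.preprocess(sentences)

# wrap the raw sentences in a Document and map them to padded index sequences
doc = rationale_CNN.Document(doc_id=0, sentences=sentences,
                             doc_label=1, sentences_labels=sentence_labels)
doc.generate_sequences(p)

# build and fit the sentence-level model (pos/neg rationale vs. neither);
# downsample=False because this toy set is too small to balance
model = rationale_CNN.RationaleCNN(p, filters=[3, 4, 5], n_filters=100)
model.build_sentence_model()
model.train_sentence_model([doc], downsample=False)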