├── LICENSE
├── README.md
├── imdb_train.py
└── model.py

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2017, Synthesio
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# hierarchical-attention-networks
Implementation of Hierarchical Attention Networks as presented in https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf

imdb_train.py trains the model on the IMDB sentiment data.

The model takes input of shape (documents, sentences, words). The IMDB data has a single "sentence" per document, so an extra sentence dimension is added before training (see the reshaping step in imdb_train.py).

--------------------------------------------------------------------------------
/imdb_train.py:
--------------------------------------------------------------------------------
'''Trains a Hierarchical Attention Model on the IMDB sentiment classification task.
Modified from Keras' examples/imdb_lstm.py.
'''
from __future__ import print_function
import numpy as np
from model import createHierarchicalAttentionModel
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.datasets import imdb

max_features = 20000
maxlen = 80  # cut texts after this number of words (among the top max_features most common words)
batch_size = 32

print('Loading data...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
# add one extra dimension as the sentence (1 sentence per doc!)
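# The hierarchical model built in model.py expects documents of shape
# (samples, sentences, words). Each padded IMDB review is treated as a
# one-sentence document, so the (samples, maxlen) matrices are expanded to
# (samples, 1, maxlen) below.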
X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Build model...')
model, modelAttEval = createHierarchicalAttentionModel(maxlen, embeddingSize=200, vocabSize=max_features)

print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=10,
          validation_data=(X_test, y_test))
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
## Model definition (Keras 1.x functional API)


from keras.models import Model
from keras.layers import Input
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.core import Dropout, Dense, Lambda, Masking
from keras.engine.topology import merge, Layer

from keras import backend as K, initializations


class AttentionLayer(Layer):
    '''
    Attention layer used at both the word and the sentence level.
    Learns a single context vector Uw and returns, for each timestep,
    the weight exp(x . Uw) normalised over the (masked) sequence.
    '''
    def __init__(self, init='glorot_uniform', **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializations.get(init)

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.Uw = self.init((input_dim,))
        self.trainable_weights = [self.Uw]
        super(AttentionLayer, self).build(input_shape)

    def compute_mask(self, input, mask):
        return mask

    def call(self, x, mask=None):
        multData = K.exp(K.dot(x, self.Uw))
        if mask is not None:
            # zero out padded timesteps (cast the boolean mask so it can be multiplied)
            multData = K.cast(mask, K.floatx()) * multData
        output = multData / (K.sum(multData, axis=1) + K.epsilon())[:, None]
        return K.reshape(output, (output.shape[0], output.shape[1], 1))

    def get_output_shape_for(self, input_shape):
        newShape = list(input_shape)
        newShape[-1] = 1
        return tuple(newShape)

# dropSentenceRnnOut = 0.5


def createHierarchicalAttentionModel(maxSeq,
                                     embWeights=None, embeddingSize=None, vocabSize=None,  # embedding
                                     recursiveClass=GRU, wordRnnSize=100, sentenceRnnSize=100,  # rnn
                                     # wordDenseSize=100, sentenceHiddenSize=128,  # dense
                                     dropWordEmb=0.2, dropWordRnnOut=0.2, dropSentenceRnnOut=0.5):
    '''
    Creates a model based on the Hierarchical Attention model, according to
    https://arxiv.org/abs/1606.02393

    inputs:
        maxSeq : maximum sentence length (in words)
        Embedding
            embWeights : numpy matrix with pre-trained embedding values
            embeddingSize (if embWeights is None) : embedding size
            vocabSize (if embWeights is None) : vocabulary size
        Recurrent layers
            recursiveClass : recurrent layer class; default is GRU
            wordRnnSize : RNN size for the word sequence (within a sentence)
            sentenceRnnSize : RNN size for the sentence sequence (within a document)
        Dense layers (parameters currently commented out and unused)
            wordDenseSize : dense layer after the word-level RNN
            sentenceHiddenSize : dense layer after the sentence-level RNN
        Dropout
            dropWordEmb : dropout applied to the word embeddings
            dropWordRnnOut : dropout applied to the word-level RNN output
            dropSentenceRnnOut : dropout applied to the sentence-level RNN output

    returns : two models. They share the same layers, but the second one also exposes
        the attention weights as extra outputs, so it can be used to analyse attention.
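
    Example (illustrative call mirroring imdb_train.py; X_train has shape
    (documents, sentences, maxSeq) and y_train holds one binary label per document):
        model, modelAttEval = createHierarchicalAttentionModel(80, embeddingSize=200, vocabSize=20000)
        model.fit(X_train, y_train, batch_size=32, nb_epoch=10)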
    '''

    ##
    ## Sentence level: encode each sentence from its words
    wordsInputs = Input(shape=(maxSeq,), dtype='int32', name='words_input')
    if embWeights is None:
        emb = Embedding(vocabSize, embeddingSize, mask_zero=True)(wordsInputs)
    else:
        emb = Embedding(embWeights.shape[0], embWeights.shape[1], mask_zero=True, weights=[embWeights], trainable=False)(wordsInputs)
    if dropWordEmb != 0.0:
        emb = Dropout(dropWordEmb)(emb)
    wordRnn = Bidirectional(recursiveClass(wordRnnSize, return_sequences=True), merge_mode='concat')(emb)
    if dropWordRnnOut > 0.0:
        wordRnn = Dropout(dropWordRnnOut)(wordRnn)
    attention = AttentionLayer()(wordRnn)
    # attention-weighted sum of the word-level RNN outputs -> sentence embedding
    sentenceEmb = merge([wordRnn, attention], mode=lambda x: x[1] * x[0], output_shape=lambda x: x[0])
    sentenceEmb = Lambda(lambda x: K.sum(x, axis=1), output_shape=lambda x: (x[0], x[2]))(sentenceEmb)
    modelSentence = Model(wordsInputs, sentenceEmb)
    modelSentAttention = Model(wordsInputs, attention)

    ##
    ## Document level: encode each document from its sentence embeddings
    documentInputs = Input(shape=(None, maxSeq), dtype='int32', name='document_input')
    sentenceMasking = Masking(mask_value=0)(documentInputs)
    sentenceEmbedding = TimeDistributed(modelSentence)(sentenceMasking)
    sentenceAttention = TimeDistributed(modelSentAttention)(sentenceMasking)
    sentenceRnn = Bidirectional(recursiveClass(sentenceRnnSize, return_sequences=True), merge_mode='concat')(sentenceEmbedding)
    if dropSentenceRnnOut > 0.0:
        sentenceRnn = Dropout(dropSentenceRnnOut)(sentenceRnn)
    attentionSent = AttentionLayer()(sentenceRnn)
    # attention-weighted sum of the sentence-level RNN outputs -> document embedding
    documentEmb = merge([sentenceRnn, attentionSent], mode=lambda x: x[1] * x[0], output_shape=lambda x: x[0])
    documentEmb = Lambda(lambda x: K.sum(x, axis=1), output_shape=lambda x: (x[0], x[2]), name="att2")(documentEmb)
    documentOut = Dense(1, activation="sigmoid", name="documentOut")(documentEmb)

    model = Model(input=[documentInputs], output=[documentOut])
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    modelAttentionEv = Model(input=[documentInputs], output=[documentOut, sentenceAttention, attentionSent])
    modelAttentionEv.compile(loss='binary_crossentropy',
                             optimizer='rmsprop',
                             metrics=['accuracy'])
    return model, modelAttentionEv

--------------------------------------------------------------------------------
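
A minimal usage sketch (illustrative, reusing the variables from imdb_train.py) of the second model returned by createHierarchicalAttentionModel, whose extra outputs expose the attention weights:

    # Outputs follow the order defined in model.py: document prediction,
    # word-level attention (one weight per word in each sentence),
    # sentence-level attention (one weight per sentence in each document).
    preds, wordAttention, sentAttention = modelAttEval.predict(X_test, batch_size=batch_size)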