├── LICENSE
├── README.md
├── imdb_train.py
└── model.py

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2017, Synthesio
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# hierarchical-attention-networks
Implementation of Hierarchical Attention Networks as presented in https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf

imdb_train.py trains the model on the IMDB sentiment data.

The model takes input of shape (documents, sentences, words). The IMDB data has a single "sentence" per document, so an extra sentence dimension is added before training (see the reshaping step in imdb_train.py).

--------------------------------------------------------------------------------
/imdb_train.py:
--------------------------------------------------------------------------------
'''Trains a Hierarchical Attention Model on the IMDB sentiment classification task.
Modified from Keras' examples/imdb_lstm.py.
'''
from __future__ import print_function
import numpy as np
from model import createHierarchicalAttentionModel
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.datasets import imdb

max_features = 20000
maxlen = 80  # cut texts after this number of words (among the top max_features most common words)
batch_size = 32

print('Loading data...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
# add one extra dimension as the sentence (1 sentence per doc!)
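# The hierarchical model built in model.py expects documents of shape
# (samples, sentences, words). Each padded IMDB review is treated as a
# one-sentence document, so the (samples, maxlen) matrices are expanded to
# (samples, 1, maxlen) below.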
X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Build model...')
model, modelAttEval = createHierarchicalAttentionModel(maxlen, embeddingSize=200, vocabSize=max_features)

print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=10,
          validation_data=(X_test, y_test))
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
## Model definition (Keras 1.x functional API)


from keras.models import Model
from keras.layers import Input
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.core import Dropout, Dense, Lambda, Masking
from keras.engine.topology import merge, Layer

from keras import backend as K, initializations


class AttentionLayer(Layer):
    '''
    Attention layer used at both the word and the sentence level.
    Learns a single context vector Uw and returns, for each timestep,
    the weight exp(x . Uw) normalised over the (masked) sequence.
    '''
    def __init__(self, init='glorot_uniform', **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializations.get(init)

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.Uw = self.init((input_dim,))
        self.trainable_weights = [self.Uw]
        super(AttentionLayer, self).build(input_shape)

    def compute_mask(self, input, mask):
        return mask

    def call(self, x, mask=None):
        multData = K.exp(K.dot(x, self.Uw))
        if mask is not None:
            # zero out padded timesteps (cast the boolean mask so it can be multiplied)
            multData = K.cast(mask, K.floatx()) * multData
        output = multData / (K.sum(multData, axis=1) + K.epsilon())[:, None]
        return K.reshape(output, (output.shape[0], output.shape[1], 1))

    def get_output_shape_for(self, input_shape):
        newShape = list(input_shape)
        newShape[-1] = 1
        return tuple(newShape)

# dropSentenceRnnOut = 0.5


def createHierarchicalAttentionModel(maxSeq,
                                     embWeights=None, embeddingSize=None, vocabSize=None,  # embedding
                                     recursiveClass=GRU, wordRnnSize=100, sentenceRnnSize=100,  # rnn
                                     # wordDenseSize=100, sentenceHiddenSize=128,  # dense
                                     dropWordEmb=0.2, dropWordRnnOut=0.2, dropSentenceRnnOut=0.5):
    '''
    Creates a model based on the Hierarchical Attention model, according to
    https://arxiv.org/abs/1606.02393

    inputs:
        maxSeq : maximum sentence length (in words)
        Embedding
            embWeights : numpy matrix with pre-trained embedding values
            embeddingSize (if embWeights is None) : embedding size
            vocabSize (if embWeights is None) : vocabulary size
        Recurrent layers
            recursiveClass : recurrent layer class; default is GRU
            wordRnnSize : RNN size for the word sequence (within a sentence)
            sentenceRnnSize : RNN size for the sentence sequence (within a document)
        Dense layers (parameters currently commented out and unused)
            wordDenseSize : dense layer after the word-level RNN
            sentenceHiddenSize : dense layer after the sentence-level RNN
        Dropout
            dropWordEmb : dropout applied to the word embeddings
            dropWordRnnOut : dropout applied to the word-level RNN output
            dropSentenceRnnOut : dropout applied to the sentence-level RNN output

    returns : two models. They share the same layers, but the second one also exposes
        the attention weights as extra outputs, so it can be used to analyse attention.
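
    Example (illustrative call mirroring imdb_train.py; X_train has shape
    (documents, sentences, maxSeq) and y_train holds one binary label per document):
        model, modelAttEval = createHierarchicalAttentionModel(80, embeddingSize=200, vocabSize=20000)
        model.fit(X_train, y_train, batch_size=32, nb_epoch=10)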
    '''

    ##
    ## Sentence level: encode each sentence from its words
    wordsInputs = Input(shape=(maxSeq,), dtype='int32', name='words_input')
    if embWeights is None:
        emb = Embedding(vocabSize, embeddingSize, mask_zero=True)(wordsInputs)
    else:
        emb = Embedding(embWeights.shape[0], embWeights.shape[1], mask_zero=True, weights=[embWeights], trainable=False)(wordsInputs)
    if dropWordEmb != 0.0:
        emb = Dropout(dropWordEmb)(emb)
    wordRnn = Bidirectional(recursiveClass(wordRnnSize, return_sequences=True), merge_mode='concat')(emb)
    if dropWordRnnOut > 0.0:
        wordRnn = Dropout(dropWordRnnOut)(wordRnn)
    attention = AttentionLayer()(wordRnn)
    # attention-weighted sum of the word-level RNN outputs -> sentence embedding
    sentenceEmb = merge([wordRnn, attention], mode=lambda x: x[1] * x[0], output_shape=lambda x: x[0])
    sentenceEmb = Lambda(lambda x: K.sum(x, axis=1), output_shape=lambda x: (x[0], x[2]))(sentenceEmb)
    modelSentence = Model(wordsInputs, sentenceEmb)
    modelSentAttention = Model(wordsInputs, attention)

    ##
    ## Document level: encode each document from its sentence embeddings
    documentInputs = Input(shape=(None, maxSeq), dtype='int32', name='document_input')
    sentenceMasking = Masking(mask_value=0)(documentInputs)
    sentenceEmbedding = TimeDistributed(modelSentence)(sentenceMasking)
    sentenceAttention = TimeDistributed(modelSentAttention)(sentenceMasking)
    sentenceRnn = Bidirectional(recursiveClass(sentenceRnnSize, return_sequences=True), merge_mode='concat')(sentenceEmbedding)
    if dropSentenceRnnOut > 0.0:
        sentenceRnn = Dropout(dropSentenceRnnOut)(sentenceRnn)
    attentionSent = AttentionLayer()(sentenceRnn)
    # attention-weighted sum of the sentence-level RNN outputs -> document embedding
    documentEmb = merge([sentenceRnn, attentionSent], mode=lambda x: x[1] * x[0], output_shape=lambda x: x[0])
    documentEmb = Lambda(lambda x: K.sum(x, axis=1), output_shape=lambda x: (x[0], x[2]), name="att2")(documentEmb)
    documentOut = Dense(1, activation="sigmoid", name="documentOut")(documentEmb)

    model = Model(input=[documentInputs], output=[documentOut])
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    modelAttentionEv = Model(input=[documentInputs], output=[documentOut, sentenceAttention, attentionSent])
    modelAttentionEv.compile(loss='binary_crossentropy',
                             optimizer='rmsprop',
                             metrics=['accuracy'])
    return model, modelAttentionEv

--------------------------------------------------------------------------------
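
A minimal usage sketch (illustrative, reusing the variables from imdb_train.py) of the second model returned by createHierarchicalAttentionModel, whose extra outputs expose the attention weights:

    # Outputs follow the order defined in model.py: document prediction,
    # word-level attention (one weight per word in each sentence),
    # sentence-level attention (one weight per sentence in each document).
    preds, wordAttention, sentAttention = modelAttEval.predict(X_test, batch_size=batch_size)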