├── 0.0- Hierarchical Attention.py ├── 0.1 -Hierarchical Attention.py ├── 0.2 Hierarchical Attention.py ├── 1.0- addictive attention.py ├── 2.0- Bahdanau_attention.py ├── 3.0- Soft_attention.py ├── 4.0 -Luong_attention.py ├── 5.0-recognizing_entailment.py ├── Images ├── Bahdanau_attention.png ├── alignments.png ├── attention-mechanisms.png ├── demo.txt ├── diff.png ├── ml.png └── white.png ├── Keras_Multi-head_attention.py ├── Keras_Multihead_attention_1.py ├── Multi-Head_attention.py ├── Multiple_Multi_head_attention.py ├── README.md ├── Sentence_level_Hierarchical_Attention.py ├── Tensorflow_Attention_apis ├── Bahdanau_attention.ipynb └── Luong_Attention.ipynb ├── Word_level_Hierarchical_Attention.py ├── scaled_dot_product_attention.py └── simplest_self_attention.py /0.0- Hierarchical Attention.py: -------------------------------------------------------------------------------- 1 | #From https://github.com/ilivans/tf-rnn-attention/blob/master/attention.py 2 | 3 | import tensorflow as tf 4 | 5 | 6 | def attention(inputs, attention_size, time_major=False, return_alphas=False): 7 | """ 8 | Attention mechanism layer which reduces RNN/Bi-RNN outputs with Attention vector. 9 | The idea was proposed in the article by Z. Yang et al., "Hierarchical Attention Networks 10 | for Document Classification", 2016: http://www.aclweb.org/anthology/N16-1174. 11 | Variables notation is also inherited from the article 12 | 13 | Args: 14 | inputs: The Attention inputs. 15 | Matches outputs of RNN/Bi-RNN layer (not final state): 16 | In case of RNN, this must be RNN outputs `Tensor`: 17 | If time_major == False (default), this must be a tensor of shape: 18 | `[batch_size, max_time, cell.output_size]`. 19 | If time_major == True, this must be a tensor of shape: 20 | `[max_time, batch_size, cell.output_size]`. 21 | In case of Bidirectional RNN, this must be a tuple (outputs_fw, outputs_bw) containing the forward and 22 | the backward RNN outputs `Tensor`. 23 | If time_major == False (default), 24 | outputs_fw is a `Tensor` shaped: 25 | `[batch_size, max_time, cell_fw.output_size]` 26 | and outputs_bw is a `Tensor` shaped: 27 | `[batch_size, max_time, cell_bw.output_size]`. 28 | If time_major == True, 29 | outputs_fw is a `Tensor` shaped: 30 | `[max_time, batch_size, cell_fw.output_size]` 31 | and outputs_bw is a `Tensor` shaped: 32 | `[max_time, batch_size, cell_bw.output_size]`. 33 | attention_size: Linear size of the Attention weights. 34 | time_major: The shape format of the `inputs` Tensors. 35 | If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`. 36 | If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`. 37 | Using `time_major = True` is a bit more efficient because it avoids 38 | transposes at the beginning and end of the RNN calculation. However, 39 | most TensorFlow data is batch-major, so by default this function 40 | accepts input and emits output in batch-major form. 41 | return_alphas: Whether to return attention coefficients variable along with layer's output. 42 | Used for visualization purpose. 43 | Returns: 44 | The Attention output `Tensor`. 45 | In case of RNN, this will be a `Tensor` shaped: 46 | `[batch_size, cell.output_size]`. 47 | In case of Bidirectional RNN, this will be a `Tensor` shaped: 48 | `[batch_size, cell_fw.output_size + cell_bw.output_size]`. 49 | """ 50 | 51 | if isinstance(inputs, tuple): 52 | # In case of Bi-RNN, concatenate the forward and the backward RNN outputs. 
53 | inputs = tf.concat(inputs, 2) 54 | 55 | if time_major: 56 | # (T,B,D) => (B,T,D) 57 | inputs = tf.array_ops.transpose(inputs, [1, 0, 2]) 58 | 59 | hidden_size = inputs.shape[2].value # D value - hidden size of the RNN layer 60 | 61 | # Trainable parameters 62 | w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1)) 63 | b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1)) 64 | u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1)) 65 | 66 | with tf.name_scope('v'): 67 | # Applying fully connected layer with non-linear activation to each of the B*T timestamps; 68 | # the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size 69 | v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega) 70 | 71 | # For each of the timestamps its vector of size A from `v` is reduced with `u` vector 72 | vu = tf.tensordot(v, u_omega, axes=1, name='vu') # (B,T) shape 73 | alphas = tf.nn.softmax(vu, name='alphas') # (B,T) shape 74 | 75 | # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape 76 | output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1) 77 | 78 | if not return_alphas: 79 | return output 80 | else: 81 | return output, alphas 82 | -------------------------------------------------------------------------------- /0.1 -Hierarchical Attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | def attention(inputs, attention_size, time_major=False, return_alphas=False): 5 | 6 | if isinstance(inputs, tuple): 7 | inputs = tf.concat(inputs, 2) 8 | 9 | if time_major: 10 | inputs = tf.array_ops.transpose(inputs, [1, 0, 2]) 11 | 12 | inputs = tf.transpose(inputs, [1, 0, 2]) 13 | sequence_length = inputs.shape[1].value # the length of sequences processed in the antecedent RNN layer 14 | hidden_size = inputs.shape[2].value # hidden size of the RNN layer 15 | 16 | # Attention mechanism 17 | W_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1)) 18 | b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1)) 19 | u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1)) 20 | 21 | v = tf.tanh(tf.matmul(tf.reshape(inputs, [-1, hidden_size]), W_omega) + tf.reshape(b_omega, [1, -1])) 22 | vu = tf.matmul(v, tf.reshape(u_omega, [-1, 1])) 23 | exps = tf.reshape(tf.exp(vu), [-1, sequence_length]) 24 | alphas = exps / tf.reshape(tf.reduce_sum(exps, 1), [-1, 1]) 25 | 26 | # Output of Bi-RNN is reduced with attention vector 27 | output = tf.reduce_sum(inputs * tf.reshape(alphas, [-1, sequence_length, 1]), 1) 28 | 29 | if not return_alphas: 30 | return output 31 | else: 32 | return output, alphas 33 | -------------------------------------------------------------------------------- /0.2 Hierarchical Attention.py: -------------------------------------------------------------------------------- 1 | import datetime, pickle, os 2 | import numpy as np 3 | import keras 4 | from keras.models import * 5 | from keras.layers import * 6 | from keras.optimizers import * 7 | from keras.callbacks import * 8 | from keras import regularizers 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.preprocessing.sequence import pad_sequences 11 | from keras import backend as K 12 | from keras.utils import CustomObjectScope 13 | from keras.engine.topology import Layer 14 | from keras import initializers 15 | 16 | from util.text_util import normalize 17 | from util.glove import 
load_glove_embedding 18 | 19 | # Uncomment below for debugging 20 | # from tensorflow.python import debug as tf_debug 21 | # sess = K.get_session() 22 | # sess = tf_debug.LocalCLIDebugWrapperSession(sess) 23 | # K.set_session(sess) 24 | 25 | TOKENIZER_STATE_PATH = 'saved_models/tokenizer.p' 26 | GLOVE_EMBEDDING_PATH = 'saved_models/glove.6B.100d.txt' 27 | 28 | class Attention(Layer): 29 | def __init__(self, regularizer=None, **kwargs): 30 | super(Attention, self).__init__(**kwargs) 31 | self.regularizer = regularizer 32 | self.supports_masking = True 33 | 34 | def build(self, input_shape): 35 | # Create a trainable weight variable for this layer. 36 | self.context = self.add_weight(name='context', 37 | shape=(input_shape[-1], 1), 38 | initializer=initializers.RandomNormal( 39 | mean=0.0, stddev=0.05, seed=None), 40 | regularizer=self.regularizer, 41 | trainable=True) 42 | super(Attention, self).build(input_shape) 43 | 44 | def call(self, x, mask=None): 45 | attention_in = K.exp(K.squeeze(K.dot(x, self.context), axis=-1)) 46 | attention = attention_in/K.expand_dims(K.sum(attention_in, axis=-1), -1) 47 | 48 | if mask is not None: 49 | # use only the inputs specified by the mask 50 | # import pdb; pdb.set_trace() 51 | attention = attention*K.cast(mask, 'float32') 52 | 53 | weighted_sum = K.batch_dot(K.permute_dimensions(x, [0, 2, 1]), attention) 54 | return weighted_sum 55 | 56 | def compute_output_shape(self, input_shape): 57 | print(input_shape) 58 | return (input_shape[0], input_shape[-1]) 59 | 60 | class HNATT(): 61 | def __init__(self): 62 | self.model = None 63 | self.MAX_SENTENCE_LENGTH = 0 64 | self.MAX_SENTENCE_COUNT = 0 65 | self.VOCABULARY_SIZE = 0 66 | self.word_embedding = None 67 | self.model = None 68 | self.word_attention_model = None 69 | self.tokenizer = None 70 | self.class_count = 2 71 | 72 | def _generate_embedding(self, path, dim): 73 | return load_glove_embedding(path, dim, self.tokenizer.word_index) 74 | 75 | def _build_model(self, n_classes=2, embedding_dim=100, embeddings_path=False): 76 | l2_reg = regularizers.l2(1e-8) 77 | # embedding_weights = np.random.normal(0, 1, (len(self.tokenizer.word_index) + 1, embedding_dim)) 78 | # embedding_weights = np.zeros((len(self.tokenizer.word_index) + 1, embedding_dim)) 79 | embedding_weights = np.random.normal(0, 1, (len(self.tokenizer.word_index) + 1, embedding_dim)) 80 | if embeddings_path: 81 | embedding_weights = self._generate_embedding(embeddings_path, embedding_dim) 82 | 83 | # Generate word-attention-weighted sentence scores 84 | sentence_in = Input(shape=(self.MAX_SENTENCE_LENGTH,), dtype='int32') 85 | embedded_word_seq = Embedding( 86 | self.VOCABULARY_SIZE, 87 | embedding_dim, 88 | weights=[embedding_weights], 89 | input_length=self.MAX_SENTENCE_LENGTH, 90 | trainable=True, 91 | mask_zero=True, 92 | name='word_embeddings',)(sentence_in) 93 | word_encoder = Bidirectional( 94 | GRU(50, return_sequences=True, kernel_regularizer=l2_reg))(embedded_word_seq) 95 | dense_transform_w = Dense( 96 | 100, 97 | activation='relu', 98 | name='dense_transform_w', 99 | kernel_regularizer=l2_reg)(word_encoder) 100 | attention_weighted_sentence = Model( 101 | sentence_in, Attention(name='word_attention', regularizer=l2_reg)(dense_transform_w)) 102 | self.word_attention_model = attention_weighted_sentence 103 | attention_weighted_sentence.summary() 104 | 105 | # Generate sentence-attention-weighted document scores 106 | texts_in = Input(shape=(self.MAX_SENTENCE_COUNT, self.MAX_SENTENCE_LENGTH), dtype='int32') 107 | 
attention_weighted_sentences = TimeDistributed(attention_weighted_sentence)(texts_in) 108 | sentence_encoder = Bidirectional( 109 | GRU(50, return_sequences=True, kernel_regularizer=l2_reg))(attention_weighted_sentences) 110 | dense_transform_s = Dense( 111 | 100, 112 | activation='relu', 113 | name='dense_transform_s', 114 | kernel_regularizer=l2_reg)(sentence_encoder) 115 | attention_weighted_text = Attention(name='sentence_attention', regularizer=l2_reg)(dense_transform_s) 116 | prediction = Dense(n_classes, activation='softmax')(attention_weighted_text) 117 | model = Model(texts_in, prediction) 118 | model.summary() 119 | 120 | model.compile(#optimizer=RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0), 121 | #optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 122 | optimizer=Adam(lr=0.001), 123 | loss='categorical_crossentropy', 124 | metrics=['acc']) 125 | 126 | return model 127 | 128 | def load_weights(self, saved_model_dir, saved_model_filename): 129 | with CustomObjectScope({'Attention': Attention}): 130 | self.model = load_model(os.path.join(saved_model_dir, saved_model_filename)) 131 | self.word_attention_model = self.model.get_layer('time_distributed_1').layer 132 | tokenizer_path = os.path.join( 133 | saved_model_dir, self._get_tokenizer_filename(saved_model_filename)) 134 | tokenizer_state = pickle.load(open(tokenizer_path, "rb" )) 135 | self.tokenizer = tokenizer_state['tokenizer'] 136 | self.MAX_SENTENCE_COUNT = tokenizer_state['maxSentenceCount'] 137 | self.MAX_SENTENCE_LENGTH = tokenizer_state['maxSentenceLength'] 138 | self.VOCABULARY_SIZE = tokenizer_state['vocabularySize'] 139 | self._create_reverse_word_index() 140 | 141 | def _get_tokenizer_filename(self, saved_model_filename): 142 | return saved_model_filename + '.tokenizer' 143 | 144 | def _fit_on_texts(self, texts): 145 | self.tokenizer = Tokenizer(filters='"()*,-/;[\]^_`{|}~', oov_token='UNK'); 146 | all_sentences = [] 147 | max_sentence_count = 0 148 | max_sentence_length = 0 149 | for text in texts: 150 | sentence_count = len(text) 151 | if sentence_count > max_sentence_count: 152 | max_sentence_count = sentence_count 153 | for sentence in text: 154 | sentence_length = len(sentence) 155 | if sentence_length > max_sentence_length: 156 | max_sentence_length = sentence_length 157 | all_sentences.append(sentence) 158 | 159 | self.MAX_SENTENCE_COUNT = min(max_sentence_count, 20) 160 | self.MAX_SENTENCE_LENGTH = min(max_sentence_length, 50) 161 | self.tokenizer.fit_on_texts(all_sentences) 162 | self.VOCABULARY_SIZE = len(self.tokenizer.word_index) + 1 163 | self._create_reverse_word_index() 164 | 165 | def _create_reverse_word_index(self): 166 | self.reverse_word_index = {value:key for key,value in self.tokenizer.word_index.items()} 167 | 168 | def _encode_texts(self, texts): 169 | encoded_texts = np.zeros((len(texts), self.MAX_SENTENCE_COUNT, self.MAX_SENTENCE_LENGTH)) 170 | for i, text in enumerate(texts): 171 | encoded_text = np.array(pad_sequences( 172 | self.tokenizer.texts_to_sequences(text), 173 | maxlen=self.MAX_SENTENCE_LENGTH))[:self.MAX_SENTENCE_COUNT] 174 | encoded_texts[i][-len(encoded_text):] = encoded_text 175 | return encoded_texts 176 | 177 | def _save_tokenizer_on_epoch_end(self, path, epoch): 178 | if epoch == 0: 179 | tokenizer_state = { 180 | 'tokenizer': self.tokenizer, 181 | 'maxSentenceCount': self.MAX_SENTENCE_COUNT, 182 | 'maxSentenceLength': self.MAX_SENTENCE_LENGTH, 183 | 'vocabularySize': self.VOCABULARY_SIZE 184 | } 185 | pickle.dump(tokenizer_state, open(path, "wb" ) ) 186 | 187 | def 
train(self, train_x, train_y, 188 | batch_size=16, epochs=1, 189 | embedding_dim=100, 190 | embeddings_path=False, 191 | saved_model_dir='saved_models', saved_model_filename=None,): 192 | # fit tokenizer 193 | self._fit_on_texts(train_x) 194 | self.model = self._build_model( 195 | n_classes=train_y.shape[-1], 196 | embedding_dim=100, 197 | embeddings_path=embeddings_path) 198 | encoded_train_x = self._encode_texts(train_x) 199 | callbacks = [ 200 | # EarlyStopping( 201 | # monitor='acc', 202 | # patience=2, 203 | # ), 204 | ReduceLROnPlateau(), 205 | # keras.callbacks.TensorBoard( 206 | # log_dir="logs/final/{}".format(datetime.datetime.now()), 207 | # histogram_freq=1, 208 | # write_graph=True, 209 | # write_images=True 210 | # ) 211 | LambdaCallback( 212 | on_epoch_end=lambda epoch, logs: self._save_tokenizer_on_epoch_end( 213 | os.path.join(saved_model_dir, 214 | self._get_tokenizer_filename(saved_model_filename)), epoch)) 215 | ] 216 | 217 | if saved_model_filename: 218 | callbacks.append( 219 | ModelCheckpoint( 220 | filepath=os.path.join(saved_model_dir, saved_model_filename), 221 | monitor='val_acc', 222 | save_best_only=True, 223 | save_weights_only=False, 224 | ) 225 | ) 226 | self.model.fit(x=encoded_train_x, y=train_y, 227 | batch_size=batch_size, 228 | epochs=epochs, 229 | verbose=1, 230 | callbacks=callbacks, 231 | validation_split=0.1, 232 | shuffle=True) 233 | 234 | def _encode_input(self, x, log=False): 235 | x = np.array(x) 236 | if not x.shape: 237 | x = np.expand_dims(x, 0) 238 | texts = np.array([normalize(text) for text in x]) 239 | return self._encode_texts(texts) 240 | 241 | def predict(self, x): 242 | encoded_x = self._encode_texts(x) 243 | return self.model.predict(encoded_x) 244 | 245 | def activation_maps(self, text, websafe=False): 246 | normalized_text = normalize(text) 247 | encoded_text = self._encode_input(text)[0] 248 | 249 | # get word activations 250 | hidden_word_encoding_out = Model(inputs=self.word_attention_model.input, 251 | outputs=self.word_attention_model.get_layer('dense_transform_w').output) 252 | hidden_word_encodings = hidden_word_encoding_out.predict(encoded_text) 253 | word_context = self.word_attention_model.get_layer('word_attention').get_weights()[0] 254 | u_wattention = encoded_text*np.exp(np.squeeze(np.dot(hidden_word_encodings, word_context))) 255 | if websafe: 256 | u_wattention = u_wattention.astype(float) 257 | 258 | # generate word, activation pairs 259 | nopad_encoded_text = encoded_text[-len(normalized_text):] 260 | nopad_encoded_text = [list(filter(lambda x: x > 0, sentence)) for sentence in nopad_encoded_text] 261 | reconstructed_texts = [[self.reverse_word_index[int(i)] 262 | for i in sentence] for sentence in nopad_encoded_text] 263 | nopad_wattention = u_wattention[-len(normalized_text):] 264 | nopad_wattention = nopad_wattention/np.expand_dims(np.sum(nopad_wattention, -1), -1) 265 | nopad_wattention = np.array([attention_seq[-len(sentence):] 266 | for attention_seq, sentence in zip(nopad_wattention, nopad_encoded_text)]) 267 | word_activation_maps = [] 268 | for i, text in enumerate(reconstructed_texts): 269 | word_activation_maps.append(list(zip(text, nopad_wattention[i]))) 270 | 271 | # get sentence activations 272 | hidden_sentence_encoding_out = Model(inputs=self.model.input, 273 | outputs=self.model.get_layer('dense_transform_s').output) 274 | hidden_sentence_encodings = np.squeeze( 275 | hidden_sentence_encoding_out.predict(np.expand_dims(encoded_text, 0)), 0) 276 | sentence_context = 
self.model.get_layer('sentence_attention').get_weights()[0] 277 | u_sattention = np.exp(np.squeeze(np.dot(hidden_sentence_encodings, sentence_context), -1)) 278 | if websafe: 279 | u_sattention = u_sattention.astype(float) 280 | nopad_sattention = u_sattention[-len(normalized_text):] 281 | 282 | nopad_sattention = nopad_sattention/np.expand_dims(np.sum(nopad_sattention, -1), -1) 283 | 284 | activation_map = list(zip(word_activation_maps, nopad_sattention)) 285 | 286 | return activation_map 287 | 288 | # source https://github.com/minqi/hnatt/blob/master/hnatt.py 289 | -------------------------------------------------------------------------------- /1.0- addictive attention.py: -------------------------------------------------------------------------------- 1 | #From suriyadeepan code library 2 | 3 | def additive_attention(ref, query, ref_dim, qdim, 4 | normalize=False, blend=False): 5 | # infer timesteps 6 | timesteps = tf.shape(ref)[1] 7 | 8 | U = tf.get_variable('U', 9 | shape=[ref_dim, qdim], 10 | dtype=tf.float32, 11 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 12 | V = tf.get_variable('V', 13 | shape=[qdim, qdim], 14 | dtype=tf.float32, 15 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 16 | Av = tf.get_variable('Av', 17 | shape=[qdim, 1], 18 | dtype=tf.float32, 19 | initializer=tf.random_uniform_initializer(-0.01, 0.01)) 20 | # NOTE : reference should be in batch_major format 21 | ref_proj = tf.reshape( 22 | tf.matmul(tf.reshape(ref, [-1, ref_dim]), U), # collapse dims to matmul 23 | [-1, timesteps, qdim]) # expand again 24 | hi = tf.expand_dims(tf.matmul(query, V), 25 | axis=1) # expand time dim to add to reference 26 | 27 | # sum up ref, query 28 | blended = (ref_proj + hi) 29 | scores = tf.reshape(tf.matmul( 30 | tf.reshape(blended, [-1, qdim]), # collapse dims 31 | Av), # matmul with attention vector 32 | [-1, timesteps]) # attention weights across timesteps 33 | 34 | # normalize scores 35 | probs = tf.nn.softmax(scores) 36 | if normalize: 37 | return probs 38 | if blend: # reduce reference based on attention weights 39 | return tf.reduce_sum(ref * tf.expand_dims(probs, axis=-1), 40 | axis=1) # reduce across time dimension 41 | return scores # return score 42 | -------------------------------------------------------------------------------- /2.0- Bahdanau_attention.py: -------------------------------------------------------------------------------- 1 | #from pytorch 2 | 3 | class BahdanauAttnDecoderRNN(nn.Module): 4 | def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=0.1): 5 | super(AttnDecoderRNN, self).__init__() 6 | 7 | # Define parameters 8 | self.hidden_size = hidden_size 9 | self.output_size = output_size 10 | self.n_layers = n_layers 11 | self.dropout_p = dropout_p 12 | self.max_length = max_length 13 | 14 | # Define layers 15 | self.embedding = nn.Embedding(output_size, hidden_size) 16 | self.dropout = nn.Dropout(dropout_p) 17 | self.attn = GeneralAttn(hidden_size) 18 | self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p) 19 | self.out = nn.Linear(hidden_size, output_size) 20 | 21 | def forward(self, word_input, last_hidden, encoder_outputs): 22 | # Note that we will only be running forward for a single decoder time step, but will use all encoder outputs 23 | 24 | # Get the embedding of the current input word (last output word) 25 | word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N 26 | word_embedded = self.dropout(word_embedded) 27 | 28 | # Calculate attention weights and apply to 
encoder outputs 29 | attn_weights = self.attn(last_hidden[-1], encoder_outputs) 30 | context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N 31 | 32 | # Combine embedded input word and attended context, run through RNN 33 | rnn_input = torch.cat((word_embedded, context), 2) 34 | output, hidden = self.gru(rnn_input, last_hidden) 35 | 36 | # Final output layer 37 | output = output.squeeze(0) # B x N 38 | output = F.log_softmax(self.out(torch.cat((output, context), 1))) 39 | 40 | # Return final output, hidden state, and attention weights (for visualization) 41 | return output, hidden, attn_weights 42 | -------------------------------------------------------------------------------- /3.0- Soft_attention.py: -------------------------------------------------------------------------------- 1 | #after getting output from bidirectional rnn 2 | 3 | #Attention_layer 4 | 5 | x_attention = tf.reshape(transpose,[-1,rnn_num_units*2]) 6 | attention_size=tf.get_variable(name='attention',shape=[rnn_num_units*2,1],dtype=tf.float32,initializer=tf.random_uniform_initializer(-0.01,0.01)) 7 | bias_ = tf.get_variable(name='bias_',shape=[1],dtype=tf.float32,initializer=tf.random_uniform_initializer(-0.01,0.01)) 8 | linear_projection = tf.add(tf.matmul(x_attention,attention_size),bias_) 9 | # print(sentence_input.shape[0]) 10 | reshape_ = tf.reshape(linear_projection,[tf.shape(sentence_input)[0],tf.shape(sentence_input)[1],-1]) 11 | attention_output=tf.nn.softmax(reshape_,dim=1) 12 | 13 | atten_visualize=tf.reshape(attention_output,[tf.shape(sentence_input)[0],tf.shape(sentence_input)[1]],name='plot_dis') 14 | 15 | multi = tf.multiply(attention_output,transpose) 16 | 17 | 18 | atten_out_s = tf.reduce_sum(multi,1) 19 | 20 | # attention_visualize = tf.reshape(atten_out,[tf.shape(sentence_input)[0],tf.shape(sentence_input)[1]]) 21 | -------------------------------------------------------------------------------- /4.0 -Luong_attention.py: -------------------------------------------------------------------------------- 1 | class Attn(nn.Module): 2 | def __init__(self, method, hidden_size, max_length=MAX_LENGTH): 3 | super(Attn, self).__init__() 4 | 5 | self.method = method 6 | self.hidden_size = hidden_size 7 | 8 | if self.method == 'general': 9 | self.attn = nn.Linear(self.hidden_size, hidden_size) 10 | 11 | elif self.method == 'concat': 12 | self.attn = nn.Linear(self.hidden_size * 2, hidden_size) 13 | self.other = nn.Parameter(torch.FloatTensor(1, hidden_size)) 14 | 15 | def forward(self, hidden, encoder_outputs): 16 | seq_len = len(encoder_outputs) 17 | 18 | # Create variable to store attention energies 19 | attn_energies = Variable(torch.zeros(seq_len)) # B x 1 x S 20 | if USE_CUDA: attn_energies = attn_energies.cuda() 21 | 22 | # Calculate energies for each encoder output 23 | for i in range(seq_len): 24 | attn_energies[i] = self.score(hidden, encoder_outputs[i]) 25 | 26 | # Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len 27 | return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0) 28 | 29 | def score(self, hidden, encoder_output): 30 | 31 | if self.method == 'dot': 32 | energy = hidden.dot(encoder_output) 33 | return energy 34 | 35 | elif self.method == 'general': 36 | energy = self.attn(encoder_output) 37 | energy = hidden.dot(energy) 38 | return energy 39 | 40 | elif self.method == 'concat': 41 | energy = self.attn(torch.cat((hidden, encoder_output), 1)) 42 | energy = self.other.dot(energy) 43 | return energy 44 | 
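The `score` method above implements Luong et al.'s three alignment functions (`dot`, `general`, `concat`). As a quick, framework-free illustration, here is a minimal NumPy sketch of the same three scoring rules applied to one decoder state against all encoder outputs at once; the names `luong_scores`, `W_a`, and `v_a` are placeholders introduced for this sketch (biases and batching are omitted), not part of the file above.

import numpy as np

def luong_scores(hidden, encoder_outputs, method="dot", W_a=None, v_a=None):
    # hidden:          (hidden_size,)           current decoder state
    # encoder_outputs: (seq_len, hidden_size)   encoder states
    # returns softmax-normalized attention weights of shape (seq_len,)
    if method == "dot":
        # e_i = h_dec . h_enc_i
        energies = encoder_outputs @ hidden
    elif method == "general":
        # e_i = h_dec . (W_a h_enc_i)
        energies = (encoder_outputs @ W_a.T) @ hidden
    elif method == "concat":
        # e_i = v_a . tanh(W_a [h_dec ; h_enc_i])
        tiled = np.tile(hidden, (encoder_outputs.shape[0], 1))
        energies = np.tanh(np.concatenate([tiled, encoder_outputs], axis=1) @ W_a.T) @ v_a
    else:
        raise ValueError(method)
    exp = np.exp(energies - energies.max())  # numerically stable softmax
    return exp / exp.sum()

# toy check with hidden_size=4 and seq_len=6
rng = np.random.default_rng(0)
h = rng.standard_normal(4)
enc = rng.standard_normal((6, 4))
print(luong_scores(h, enc, "dot"))
print(luong_scores(h, enc, "general", W_a=rng.standard_normal((4, 4))))
print(luong_scores(h, enc, "concat", W_a=rng.standard_normal((4, 8)), v_a=rng.standard_normal(4)))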
-------------------------------------------------------------------------------- /5.0-recognizing_entailment.py: -------------------------------------------------------------------------------- 1 | #paper 2 | #Reasoning about Entailment with Neural Attention 3 | #https://arxiv.org/pdf/1509.06664v1.pdf 4 | 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | 9 | batch_size = 3 10 | seq_len = 5 11 | dim = 2 12 | # [batch_size x seq_len x dim] -- hidden states 13 | Y = tf.constant(np.random.randn(batch_size, seq_len, dim), tf.float32) 14 | # [batch_size x dim] -- h_N 15 | h = tf.constant(np.random.randn(batch_size, dim), tf.float32) 16 | 17 | initializer = tf.random_uniform_initializer() 18 | W = tf.get_variable("weights_Y", [dim, dim], initializer=initializer) 19 | w = tf.get_variable("weights_w", [dim], initializer=initializer) 20 | 21 | # [batch_size x seq_len x dim] -- tanh(W^{Y}Y) 22 | M = tf.tanh(tf.einsum("aij,jk->aik", Y, W)) 23 | # [batch_size x seq_len] -- softmax(Y w^T) 24 | a = tf.nn.softmax(tf.einsum("aij,j->ai", M, w)) 25 | # [batch_size x dim] -- Ya^T 26 | r = tf.einsum("aij,ai->aj", Y, a) 27 | 28 | with tf.Session() as sess: 29 | sess.run(tf.global_variables_initializer()) 30 | a_val, r_val = sess.run([a, r]) 31 | print("a:", a_val, "\nr:", r_val) 32 | 33 | 34 | 35 | 36 | #I came across this here https://stackoverflow.com/questions/42507030/implementing-attention-in-tensorflow 37 | -------------------------------------------------------------------------------- /Images/ Bahdanau_attention.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monk1337/Various-Attention-mechanisms/b4462102dcecb05c544a31aae8973a5477cc838d/Images/ Bahdanau_attention.png -------------------------------------------------------------------------------- /Images/alignments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monk1337/Various-Attention-mechanisms/b4462102dcecb05c544a31aae8973a5477cc838d/Images/alignments.png -------------------------------------------------------------------------------- /Images/attention-mechanisms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monk1337/Various-Attention-mechanisms/b4462102dcecb05c544a31aae8973a5477cc838d/Images/attention-mechanisms.png -------------------------------------------------------------------------------- /Images/demo.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Images/diff.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monk1337/Various-Attention-mechanisms/b4462102dcecb05c544a31aae8973a5477cc838d/Images/diff.png -------------------------------------------------------------------------------- /Images/ml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monk1337/Various-Attention-mechanisms/b4462102dcecb05c544a31aae8973a5477cc838d/Images/ml.png -------------------------------------------------------------------------------- /Images/white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monk1337/Various-Attention-mechanisms/b4462102dcecb05c544a31aae8973a5477cc838d/Images/white.png 
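The einsum calls in 5.0-recognizing_entailment.py above map one-to-one onto batched matrix products. The sketch below re-expresses the same three steps in NumPy with the shapes noted in that file's comments; the wrapper name `entailment_attention` is introduced here for illustration, and, like the snippet above, it applies only the W^{Y}Y term (the h_N projection from the full paper is not part of that example).

import numpy as np

def entailment_attention(Y, W, w):
    # Y: [batch, seq_len, dim], W: [dim, dim], w: [dim]
    M = np.tanh(np.einsum("aij,jk->aik", Y, W))       # [batch, seq_len, dim] -- tanh(W^{Y}Y)
    a = np.einsum("aij,j->ai", M, w)                  # [batch, seq_len] -- unnormalized scores
    a = np.exp(a - a.max(axis=1, keepdims=True))
    a = a / a.sum(axis=1, keepdims=True)              # softmax over seq_len
    r = np.einsum("aij,ai->aj", Y, a)                 # [batch, dim] -- attention-weighted sum
    return a, r

batch_size, seq_len, dim = 3, 5, 2
rng = np.random.default_rng(0)
a_val, r_val = entailment_attention(
    rng.standard_normal((batch_size, seq_len, dim)),
    rng.standard_normal((dim, dim)),
    rng.standard_normal(dim))
print(a_val.shape, r_val.shape)  # (3, 5) (3, 2)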
-------------------------------------------------------------------------------- /Keras_Multi-head_attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import keras 3 | from tensorflow.keras import layers 4 | 5 | 6 | class MultiHeadSelfAttention(layers.Layer): 7 | def __init__(self, embed_dim, num_heads=8): 8 | super(MultiHeadSelfAttention, self).__init__() 9 | self.embed_dim = embed_dim 10 | self.num_heads = num_heads 11 | if embed_dim % num_heads != 0: 12 | raise ValueError( 13 | f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}" 14 | ) 15 | self.projection_dim = embed_dim // num_heads 16 | self.query_dense = layers.Dense(embed_dim) 17 | self.key_dense = layers.Dense(embed_dim) 18 | self.value_dense = layers.Dense(embed_dim) 19 | self.combine_heads = layers.Dense(embed_dim) 20 | 21 | def attention(self, query, key, value): 22 | score = tf.matmul(query, key, transpose_b=True) 23 | dim_key = tf.cast(tf.shape(key)[-1], tf.float32) 24 | scaled_score = score / tf.math.sqrt(dim_key) 25 | weights = tf.nn.softmax(scaled_score, axis=-1) 26 | output = tf.matmul(weights, value) 27 | return output, weights 28 | 29 | def separate_heads(self, x, batch_size): 30 | x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim)) 31 | return tf.transpose(x, perm=[0, 2, 1, 3]) 32 | 33 | def call(self, inputs): 34 | # x.shape = [batch_size, seq_len, embedding_dim] 35 | batch_size = tf.shape(inputs)[0] 36 | query = self.query_dense(inputs) # (batch_size, seq_len, embed_dim) 37 | key = self.key_dense(inputs) # (batch_size, seq_len, embed_dim) 38 | value = self.value_dense(inputs) # (batch_size, seq_len, embed_dim) 39 | query = self.separate_heads( 40 | query, batch_size 41 | ) # (batch_size, num_heads, seq_len, projection_dim) 42 | key = self.separate_heads( 43 | key, batch_size 44 | ) # (batch_size, num_heads, seq_len, projection_dim) 45 | value = self.separate_heads( 46 | value, batch_size 47 | ) # (batch_size, num_heads, seq_len, projection_dim) 48 | attention, weights = self.attention(query, key, value) 49 | attention = tf.transpose( 50 | attention, perm=[0, 2, 1, 3] 51 | ) # (batch_size, seq_len, num_heads, projection_dim) 52 | concat_attention = tf.reshape( 53 | attention, (batch_size, -1, self.embed_dim) 54 | ) # (batch_size, seq_len, embed_dim) 55 | output = self.combine_heads( 56 | concat_attention 57 | ) # (batch_size, seq_len, embed_dim) 58 | return output 59 | -------------------------------------------------------------------------------- /Keras_Multihead_attention_1.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | import time 5 | import numpy as np 6 | 7 | def scaled_dot_product_attention(q, k, v, mask): 8 | """Calculate the attention weights. 9 | q, k, v must have matching leading dimensions. 10 | k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v. 11 | The mask has different shapes depending on its type(padding or look ahead) 12 | but it must be broadcastable for addition. 13 | 14 | Args: 15 | q: query shape == (..., seq_len_q, depth) 16 | k: key shape == (..., seq_len_k, depth) 17 | v: value shape == (..., seq_len_v, depth_v) 18 | mask: Float tensor with shape broadcastable 19 | to (..., seq_len_q, seq_len_k). Defaults to None. 
20 | 21 | Returns: 22 | output, attention_weights 23 | """ 24 | 25 | matmul_qk = tf.matmul(q, k, transpose_b=True) # (..., seq_len_q, seq_len_k) 26 | 27 | # scale matmul_qk 28 | dk = tf.cast(tf.shape(k)[-1], tf.float32) 29 | scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) 30 | 31 | # add the mask to the scaled tensor. 32 | if mask is not None: 33 | scaled_attention_logits += (mask * -1e9) 34 | 35 | # softmax is normalized on the last axis (seq_len_k) so that the scores 36 | # add up to 1. 37 | attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) # (..., seq_len_q, seq_len_k) 38 | 39 | output = tf.matmul(attention_weights, v) # (..., seq_len_q, depth_v) 40 | 41 | return output, attention_weights 42 | 43 | 44 | 45 | class MultiHeadAttention(tf.keras.layers.Layer): 46 | def __init__(self, d_model, num_heads): 47 | super(MultiHeadAttention, self).__init__() 48 | self.num_heads = num_heads 49 | self.d_model = d_model 50 | 51 | assert d_model % self.num_heads == 0 52 | 53 | self.depth = d_model // self.num_heads 54 | 55 | self.wq = tf.keras.layers.Dense(d_model) 56 | self.wk = tf.keras.layers.Dense(d_model) 57 | self.wv = tf.keras.layers.Dense(d_model) 58 | 59 | self.dense = tf.keras.layers.Dense(d_model) 60 | 61 | def split_heads(self, x, batch_size): 62 | """Split the last dimension into (num_heads, depth). 63 | Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth) 64 | """ 65 | x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth)) 66 | return tf.transpose(x, perm=[0, 2, 1, 3]) 67 | 68 | def call(self, v, k, q, mask): 69 | batch_size = tf.shape(q)[0] 70 | 71 | q = self.wq(q) # (batch_size, seq_len, d_model) 72 | k = self.wk(k) # (batch_size, seq_len, d_model) 73 | v = self.wv(v) # (batch_size, seq_len, d_model) 74 | 75 | q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth) 76 | k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth) 77 | v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth) 78 | 79 | # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth) 80 | # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k) 81 | scaled_attention, attention_weights = scaled_dot_product_attention( 82 | q, k, v, mask) 83 | 84 | scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3]) # (batch_size, seq_len_q, num_heads, depth) 85 | 86 | concat_attention = tf.reshape(scaled_attention, 87 | (batch_size, -1, self.d_model)) # (batch_size, seq_len_q, d_model) 88 | 89 | output = self.dense(concat_attention) # (batch_size, seq_len_q, d_model) 90 | 91 | return output, attention_weights 92 | -------------------------------------------------------------------------------- /Multi-Head_attention.py: -------------------------------------------------------------------------------- 1 | def multihead_attention(queries, 2 | keys, 3 | num_units=None, 4 | num_heads=8, 5 | dropout_rate=0, 6 | is_training=True, 7 | causality=False, 8 | scope="multihead_attention", 9 | reuse=None): 10 | with tf.variable_scope(scope, reuse=reuse): 11 | if num_units is None: # set default size for attention size C 12 | num_units = queries.get_shape().as_list()[-1] 13 | 14 | # Linear Projections 15 | Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu) # [N, T_q, C] 16 | K = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # [N, T_k, C] 17 | V = tf.layers.dense(keys, num_units, activation=tf.nn.relu) # [N, T_k, C] 18 | 19 | # Split and concat 20 | 
Q_ = tf.concat(tf.split(Q, num_heads, axis=-1), axis=0) # [num_heads * N, T_q, C/num_heads] 21 | K_ = tf.concat(tf.split(K, num_heads, axis=-1), axis=0) # [num_heads * N, T_k, C/num_heads] 22 | V_ = tf.concat(tf.split(V, num_heads, axis=-1), axis=0) # [num_heads * N, T_k, C/num_heads] 23 | 24 | # Attention 25 | outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (num_heads * N, T_q, T_k) 26 | 27 | # Scale : outputs = outputs / sqrt( d_k) 28 | outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5) 29 | 30 | # Key Masking 31 | # see : https://github.com/Kyubyong/transformer/issues/3 32 | key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1))) # (N, T_k) 33 | key_masks = tf.tile(key_masks, [num_heads, 1]) # (h*N, T_k) 34 | key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1]) # (h*N, T_q, T_k) 35 | 36 | paddings = tf.ones_like(outputs) * (-2 ** 32 + 1) # -infinity 37 | outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs) # (h*N, T_q, T_k) 38 | 39 | # Causality = Future blinding 40 | if causality: 41 | diag_vals = tf.ones_like(outputs[0, :, :]) # (T_q, T_k) 42 | tril = tf.contrib.linalg.LinearOperatorTriL(diag_vals).to_dense() # (T_q, T_k) 43 | masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1]) # (h*N, T_q, T_k) 44 | 45 | paddings = tf.ones_like(masks) * (-2 ** 32 + 1) 46 | outputs = tf.where(tf.equal(masks, 0), paddings, outputs) # (h*N, T_q, T_k) 47 | 48 | # Activation: outputs is a weight matrix 49 | outputs = tf.nn.softmax(outputs) # (h*N, T_q, T_k) 50 | 51 | # Query Masking 52 | query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1))) # (N, T_q) 53 | query_masks = tf.tile(query_masks, [num_heads, 1]) # (h*N, T_q) 54 | query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]]) # (h*N, T_q, T_k) 55 | outputs *= query_masks # broadcasting. (N, T_q, C) 56 | 57 | # dropouts 58 | outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training)) 59 | 60 | # weighted sum 61 | outputs = tf.matmul(outputs, V_) # ( h*N, T_q, C/h) 62 | 63 | # reshape 64 | outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, C) 65 | 66 | # residual connection 67 | outputs += queries 68 | 69 | # layer normaliztion 70 | outputs = layer_normalization(outputs) 71 | return outputs 72 | # https://github.com/TobiasLee/Text-Classification/blob/master/models/modules/multihead.py 73 | -------------------------------------------------------------------------------- /Multiple_Multi_head_attention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #test self-attention 3 | import tensorflow as tf 4 | import time 5 | """ 6 | multi head attention. 7 | 1.linearly project the queries,keys and values h times(with different,learned linear projections to d_k,d_k,d_v dimensions) 8 | 2.scaled dot product attention for each projected version of Q,K,V 9 | 3.concatenated result 10 | 4.linear projection to get final result 11 | three kinds of usage: 12 | 1. attention for encoder 13 | 2. attention for decoder(need a mask to pay attention for only known position) 14 | 3. 
attention as bridge of encoder and decoder 15 | """ 16 | class MultiHeadAttention(object): 17 | """ multi head attention""" 18 | def __init__(self,Q,K_s,V_s,d_model,d_k,d_v,sequence_length,h,type=None,is_training=None,mask=None,dropout_rate=0.1): 19 | self.d_model=d_model 20 | self.d_k=d_k 21 | self.d_v=d_v 22 | self.sequence_length=sequence_length 23 | self.h=h 24 | self.Q=Q 25 | self.K_s=K_s 26 | self.V_s=V_s 27 | self.type=type 28 | self.is_training=is_training 29 | self.mask=mask 30 | self.dropout_rate=dropout_rate 31 | print("MultiHeadAttention.self.dropout_rate:",self.dropout_rate) 32 | 33 | def multi_head_attention_fn(self): 34 | """ 35 | multi head attention 36 | :param Q: query. shape:[batch,sequence_length,d_model] 37 | :param K_s: keys. shape:[batch,sequence_length,d_model]. 38 | :param V_s:values.shape:[batch,sequence_length,d_model]. 39 | :param h: h times 40 | :return: result of scaled dot product attention. shape:[sequence_length,d_model] 41 | """ 42 | # 1. linearly project the queries,keys and values h times(with different,learned linear projections to d_k,d_k,d_v dimensions) 43 | Q_projected = tf.layers.dense(self.Q,units=self.d_model) # [batch,sequence_length,d_model] 44 | K_s_projected = tf.layers.dense(self.K_s, units=self.d_model) # [batch,sequence_length,d_model] 45 | V_s_projected = tf.layers.dense(self.V_s, units=self.d_model) # [batch,sequence_length,d_model] 46 | # 2. scaled dot product attention for each projected version of Q,K,V 47 | dot_product=self.scaled_dot_product_attention_batch(Q_projected,K_s_projected,V_s_projected) # [batch,h,sequence_length,d_k] 48 | # 3. concatenated 49 | print("dot_product:====================================================================================>",dot_product) #dot_product:(128, 8, 6, 64) 50 | batch_size,h,length,d_k=dot_product.get_shape().as_list() 51 | print("self.sequence_length:",self.sequence_length) #5 52 | dot_product=tf.reshape(dot_product,shape=(-1,length,self.d_model)) 53 | # 4. linear projection 54 | output=tf.layers.dense(dot_product,units=self.d_model) # [batch,sequence_length,d_model] 55 | return output #[batch,sequence_length,d_model] 56 | 57 | def scaled_dot_product_attention_batch_mine(self,Q,K_s,V_s): #my own implementation of scaled dot product attention. 58 | """ 59 | scaled dot product attention 60 | :param Q: query. shape:[batch,sequence_length,d_model] 61 | :param K_s: keys. shape:[batch,sequence_length,d_model] 62 | :param V_s:values. shape:[batch,sequence_length,d_model] 63 | :param mask: shape:[batch,sequence_length] 64 | :return: result of scaled dot product attention. shape:[batch,h,sequence_length,d_k] 65 | """ 66 | # 1. split Q,K,V 67 | Q_heads = tf.stack(tf.split(Q,self.h,axis=2),axis=1) # [batch,h,sequence_length,d_k] 68 | K_heads = tf.stack(tf.split(K_s, self.h, axis=2), axis=1) # [batch,h,sequence_length,d_k] 69 | V_heads = tf.stack(tf.split(V_s, self.h, axis=2), axis=1) # [batch,h,sequence_length,d_k] 70 | dot_product=tf.multiply(Q_heads,K_heads) # [batch,h,sequence_length,d_k] 71 | # 2. dot product 72 | dot_product=dot_product*(1.0/tf.sqrt(tf.cast(self.d_model,tf.float32))) # [batch,h,sequence_length,d_k] 73 | dot_product=tf.reduce_sum(dot_product,axis=-1,keep_dims=True) # [batch,h,sequence_length,1] 74 | # 3. add mask if it is none 75 | if self.mask is not None: 76 | mask = tf.expand_dims(self.mask, axis=-1) # [batch,sequence_length,1] 77 | mask = tf.expand_dims(mask, axis=1) # [batch,1,sequence_length,1] 78 | dot_product=dot_product+mask # [batch,h,sequence_length,1] 79 | # 4. 
get possibility 80 | p=tf.nn.softmax(dot_product) # [batch,h,sequence_length,1] 81 | # 5. final output 82 | output=tf.multiply(p,V_heads) # [batch,h,sequence_length,d_k] 83 | return output # [batch,h,sequence_length,d_k] 84 | 85 | def scaled_dot_product_attention_batch(self, Q, K_s, V_s):# scaled dot product attention: implementation style like tensor2tensor from google 86 | """ 87 | scaled dot product attention 88 | :param Q: query. shape:[batch,sequence_length,d_model] 89 | :param K_s: keys. shape:[batch,sequence_length,d_model] 90 | :param V_s:values. shape:[batch,sequence_length,d_model] 91 | :param mask: shape:[sequence_length,sequence_length] 92 | :return: result of scaled dot product attention. shape:[batch,h,sequence_length,d_k] 93 | """ 94 | # 1. split Q,K,V 95 | Q_heads = tf.stack(tf.split(Q,self.h,axis=2),axis=1) # [batch,h,sequence_length,d_k] 96 | K_heads = tf.stack(tf.split(K_s, self.h, axis=2), axis=1) # [batch,h,sequence_length,d_k] 97 | V_heads = tf.stack(tf.split(V_s, self.h, axis=2), axis=1) # [batch,h,sequence_length,d_k] 98 | # 2. dot product of Q,K 99 | dot_product=tf.matmul(Q_heads,K_heads,transpose_b=True) # [batch,h,sequence_length,sequence_length] 100 | dot_product=dot_product*(1.0/tf.sqrt(tf.cast(self.d_model,tf.float32))) # [batch,h,sequence_length,sequence_length] 101 | # 3. add mask if it is none 102 | print("scaled_dot_product_attention_batch.===============================================================>mask is not none?",self.mask is not None) 103 | if self.mask is not None: 104 | mask_expand=tf.expand_dims(tf.expand_dims(self.mask,axis=0),axis=0) # [1,1,sequence_length,sequence_length] 105 | #dot_product:(128, 8, 6, 6);mask_expand:(1, 1, 5, 5) 106 | print("scaled_dot_product_attention_batch.===============================================================>dot_product:",dot_product,";mask_expand:",mask_expand) 107 | dot_product=dot_product+mask_expand # [batch,h,sequence_length,sequence_length] 108 | # 4.get possibility 109 | weights=tf.nn.softmax(dot_product) # [batch,h,sequence_length,sequence_length] 110 | # drop out weights 111 | weights=tf.nn.dropout(weights,1.0-self.dropout_rate) # [batch,h,sequence_length,sequence_length] 112 | # 5. 
final output 113 | output=tf.matmul(weights,V_heads) # [batch,h,sequence_length,d_model] 114 | return output 115 | 116 | 117 | #vectorized implementation of multi head attention for sentences with batch 118 | def multi_head_attention_for_sentence_vectorized(layer_number): 119 | print("started...") 120 | start = time.time() 121 | # 1.set parameter 122 | d_model = 512 123 | d_k = 64 124 | d_v = 64 125 | sequence_length = 1000 126 | h = 8 127 | batch_size=128 128 | initializer = tf.random_normal_initializer(stddev=0.1) 129 | # 2.set Q,K,V 130 | vocab_size=1000 131 | embed_size=d_model 132 | type='decoder' 133 | Embedding = tf.get_variable("Embedding_", shape=[vocab_size, embed_size],initializer=initializer) 134 | input_x = tf.placeholder(tf.int32, [batch_size,sequence_length], name="input_x") 135 | embedded_words = tf.nn.embedding_lookup(Embedding, input_x) #[batch_size,sequence_length,embed_size] 136 | mask=get_mask(batch_size,sequence_length) #tf.ones((batch_size,sequence_length))*-1e8 #[batch,sequence_length] 137 | with tf.variable_scope("query_at_each_sentence"+str(layer_number)): 138 | Q = embedded_words # [batch_size*sequence_length,embed_size] 139 | K_s=embedded_words #[batch_size*sequence_length,embed_size] 140 | #V_s=tf.get_variable("V_s_original_", shape=embedded_words.get_shape().as_list(),initializer=initializer) #[batch_size,sequence_length,embed_size] 141 | V_s=K_s 142 | # 3.call method to get result 143 | multi_head_attention_class = MultiHeadAttention(Q, K_s, V_s, d_model, d_k, d_v, sequence_length, h,type='decoder',mask=mask) 144 | encoder_output=multi_head_attention_class.multi_head_attention_fn() #shape:[sequence_length,d_model] 145 | encoder_output=tf.reshape(encoder_output,shape=(batch_size,sequence_length,d_model)) 146 | end = time.time() 147 | print("input_x:",input_x) 148 | print("encoder_output:",encoder_output,";time_spent:",(end-start)) 149 | 150 | def get_mask(batch_size,sequence_length): 151 | lower_triangle=tf.matrix_band_part(tf.ones([sequence_length,sequence_length]),-1,0) 152 | result=-1e9*(1.0-lower_triangle) 153 | print("get_mask==>result:",result) 154 | return result 155 | 156 | #multi_head_attention_for_sentence_vectorized(0) 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |