├── ABC
│   ├── ABC.png
│   ├── README.md
│   ├── abCNN.py
│   └── localAtt.py
├── CoA
│   ├── CoA_arch.png
│   ├── README.md
│   ├── co_attention.py
│   └── selfDef.py
├── NIPS2016
│   ├── README.md
│   ├── VQA.png
│   ├── co_attention.py
│   └── selfDef.py
├── README.md
└── TAB-LSTM
    ├── README.md
    ├── TAB-LSTM.png
    ├── topicAtt.py
    └── topicLSTM.py

/ABC/ABC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhangtd/Models-reproducing/48cff578ef2b1a0f0a265bc7a0d10b17bd12e3fd/ABC/ABC.png
--------------------------------------------------------------------------------
/ABC/README.md:
--------------------------------------------------------------------------------
 1 | # ABC model
 2 | 
 3 | ### Proposed in [Hashtag Recommendation Using Attention-Based Convolutional Neural Network](https://www.ijcai.org/Proceedings/16/Papers/395.pdf) IJCAI 2016, by Yuyun Gong et al.
 4 | 
 5 | ## **Overview**
 6 | ### ABC adopts an attention-based CNN architecture to recommend hashtags for textual content. It models the sequential order of the text with both a local attention channel and a global attention channel.
 7 | ### Model Architecture
 8 | ![ABC model](ABC.png)
 9 | 
10 | ## **Usage**
11 | ### Source files in this repository cannot be executed as-is, because the data preprocessing and data input statements are omitted. Prepare your data according to your needs.
12 | 
13 | ## **Requirements**
14 | - Python 3.x
15 | - Tensorflow >= 1.7
16 | - Keras >= 2.1.5
17 | 
18 | 
19 | 
20 | 
--------------------------------------------------------------------------------
/ABC/abCNN.py:
--------------------------------------------------------------------------------
 1 | from keras.models import Model
 2 | from keras.layers import Input, Reshape, Dense, Embedding, Dropout, LSTM, MaxPooling1D, Lambda, Concatenate, \
 3 |     Multiply, RepeatVector, Flatten, Activation, Permute, Conv1D
 4 | import keras.backend as K
 5 | from localAtt import LocalAttention
 6 | import numpy as np
 7 | 
 8 | 
 9 | num_tags = 2207
10 | num_words = 20000
11 | index_from = 3
12 | seq_length = 30
13 | batch_size = 256
14 | embedding_size = 100
15 | drop_rate = 0.75
16 | num_epoch = 35
17 | 
18 | # prepare the following data.
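# --- added placeholder (not in the original repo) --------------------------------------
# A minimal sketch that fabricates random arrays with the shapes this script expects
# (texts: padded word-index sequences, tags: multi-hot hashtag vectors), so the model
# can at least be built and smoke-tested. The names and sizes below are assumptions;
# replace this block with your real preprocessing.
num_train, num_test = 1000, 200
texts_train = np.random.randint(1, num_words + index_from, size=(num_train, seq_length))
texts_test = np.random.randint(1, num_words + index_from, size=(num_test, seq_length))
tags_train = np.zeros((num_train, num_tags))
tags_train[np.arange(num_train), np.random.randint(0, num_tags, num_train)] = 1.0
tags_test = np.zeros((num_test, num_tags))
tags_test[np.arange(num_test), np.random.randint(0, num_tags, num_test)] = 1.0
# ----------------------------------------------------------------------------------------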
19 | # required arrays: texts_train, tags_train, texts_test, tags_test
20 | 
21 | 
22 | def myLossFunc(y_true, y_pred):
23 |     probs_log = -K.log(y_pred)
24 |     loss = K.mean(K.sum(probs_log*y_true, axis=-1))
25 |     return loss
26 | 
27 | 
28 | def modelDef():
29 |     input_text = Input(shape=(seq_length, ))
30 |     embeddings = Embedding(input_dim=num_words + index_from, output_dim=embedding_size,
31 |                            mask_zero=False, input_length=seq_length)(input_text)
32 |     # Global channel
33 |     gc1 = Conv1D(filters=embedding_size, kernel_size=1, activation="tanh", use_bias=True)(embeddings)
34 |     gc2 = Conv1D(filters=embedding_size, kernel_size=2, activation="tanh", use_bias=True)(embeddings)
35 |     gc3 = Conv1D(filters=embedding_size, kernel_size=3, activation="tanh", use_bias=True)(embeddings)
36 | 
37 |     gc1m = MaxPooling1D(pool_size=30)(gc1)
38 |     gc2m = MaxPooling1D(pool_size=29)(gc2)
39 |     gc3m = MaxPooling1D(pool_size=28)(gc3)
40 |     gc = MaxPooling1D(pool_size=3)(Concatenate(axis=1)([gc1m, gc2m, gc3m]))
41 |     gc = Lambda(lambda x: K.squeeze(x, axis=1))(gc)
42 |     # print(gc)
43 | 
44 |     # Local channel
45 |     lcs = Conv1D(filters=1, kernel_size=5, activation="tanh", use_bias=True, padding="same")(embeddings)
46 |     lcs = Lambda(lambda x: K.squeeze(x, axis=-1))(lcs)
47 |     bools = LocalAttention()(lcs)
48 |     bools = Permute([2, 1])(RepeatVector(embedding_size)(bools))
49 |     lca = Multiply()([embeddings, bools])
50 |     lcf = Activation("tanh")(Lambda(lambda x: K.sum(x, axis=1))(lca))
51 |     # print(lcf)
52 | 
53 |     gc = RepeatVector(1)(gc)
54 |     lcf = RepeatVector(1)(lcf)
55 |     h = Concatenate(axis=1)([gc, lcf])
56 |     h = Conv1D(filters=embedding_size, kernel_size=2, activation="tanh", use_bias=True)(h)
57 |     dropout = Dropout(drop_rate)(Lambda(lambda x: K.squeeze(x, axis=1))(h))
58 | 
59 |     Softmax = Dense(num_tags, activation="softmax", use_bias=True)(dropout)
60 |     model = Model(inputs=input_text, outputs=Softmax)
61 |     model.compile(optimizer="adam", loss=myLossFunc)
62 |     return model
63 | 
64 | 
65 | def evaluation(y_true, y_pred, top_K):
66 |     acc_count = 0
67 |     correct = 0
68 | 
69 |     for i in range(y_pred.shape[0]):
70 |         top_indices = y_pred[i].argsort()[-top_K:]
71 |         if np.sum(y_true[i, top_indices]) >= 1:
72 |             acc_count += 1
73 |         correct += np.sum(y_true[i, top_indices])
74 | 
75 |     acc_K = acc_count * 1.0 / y_pred.shape[0]
76 |     precision_K = correct * 1.0 / (top_K * y_pred.shape[0])
77 |     recall_K = correct * 1.0 / np.sum(y_true)
78 |     f1_K = 2 * precision_K * recall_K / (precision_K + recall_K) if (precision_K + recall_K) != 0 else 0.0
79 | 
80 |     return acc_K, precision_K, recall_K, f1_K
81 | 
82 | 
83 | if __name__ == "__main__":
84 |     myModel = modelDef()
85 |     history = myModel.fit(x=texts_train,
86 |                           y=tags_train,
87 |                           batch_size=batch_size,
88 |                           epochs=1,
89 |                           verbose=1,)
90 |     y_pred = myModel.predict(x=[texts_test])
91 |     acc, precision, recall, f1 = evaluation(tags_test, y_pred, 3)
--------------------------------------------------------------------------------
/ABC/localAtt.py:
--------------------------------------------------------------------------------
 1 | import keras.backend as K
 2 | from keras.engine.topology import Layer, InputSpec
 3 | from keras.utils import conv_utils
 4 | from keras import activations, initializers
 5 | 
 6 | theta = 0.8
 7 | 
 8 | 
 9 | class LocalAttention(Layer):
10 |     """
11 |     generate local attention maps based on the input score tensor.
12 |     input: score tensor
13 |     output: thresholded attention weights (positions scoring below the threshold are zeroed)
14 | 
15 |     input shape: (batch_size, seq_length)
16 |     output shape: (batch_size, seq_length)
17 |     """
18 |     def __init__(self, **kwargs):
19 |         super(LocalAttention, self).__init__(**kwargs)
20 |         self.input_spec = InputSpec(min_ndim=2)
21 | 
22 |     def build(self, input_shape):
23 |         super(LocalAttention, self).build(input_shape)  # Be sure to call this somewhere!
24 | 
25 |     def call(self, x, mask=None):
26 |         max_score = K.max(x)
27 |         min_score = K.min(x)
28 |         threshold = theta*min_score + (1-theta)*max_score
29 |         thresholds = threshold * K.ones_like(x)
30 |         output = K.relu(x-thresholds)
31 |         return output
32 | 
33 |     def compute_output_shape(self, input_shape):
34 |         output_shape = input_shape
35 |         return output_shape
36 | 
--------------------------------------------------------------------------------
/CoA/CoA_arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhangtd/Models-reproducing/48cff578ef2b1a0f0a265bc7a0d10b17bd12e3fd/CoA/CoA_arch.png
--------------------------------------------------------------------------------
/CoA/README.md:
--------------------------------------------------------------------------------
 1 | # CoA model
 2 | 
 3 | ### Proposed in [Hashtag Recommendation for Multimodal Microblog Using Co-Attention Networks](https://www.ijcai.org/proceedings/2017/0478.pdf) IJCAI 2017, by Qi Zhang et al.
 4 | 
 5 | ## **Overview**
 6 | ### CoA can be used to recommend hashtags for multimodal microblogs that are composed of both textual and visual content. The main contribution of CoA lies in an alternative co-attention mechanism that extracts post features.
 7 | ### Model Architecture
 8 | ![CoA model](CoA_arch.png)
 9 | 
10 | ## **Usage**
11 | ### Source files in this repository cannot be executed as-is, because the data preprocessing and data input statements are omitted. Prepare your data according to your needs.
12 | 
13 | ## **Requirements**
14 | - Python 3.x
15 | - Tensorflow >= 1.7
16 | - Keras >= 2.1.5
17 | 
18 | 
19 | 
20 | 
--------------------------------------------------------------------------------
/CoA/co_attention.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | hierarchical co-attention model based on the IJCAI 2017 article
 3 | '''
 4 | from keras.models import Model
 5 | from keras.layers.core import Activation, Flatten, Reshape, RepeatVector
 6 | from keras.layers.recurrent import LSTM
 7 | from keras.layers.convolutional import AveragePooling1D
 8 | from keras.layers.wrappers import TimeDistributed
 9 | from keras.layers import Input, Dense, Embedding, Dropout, Lambda
10 | import keras.backend as K
11 | 
12 | from selfDef import coAttention_alt, myLossFunc
13 | import numpy as np
14 | 
15 | num_tags = 3896
16 | num_words = 212000
17 | index_from = 3
18 | seq_length = 30
19 | batch_size = 512
20 | embedding_size = 200
21 | hidden_size = 100
22 | attention_size = 200
23 | dim_k = 100
24 | num_region = 7*7
25 | drop_rate = 0.5
26 | TopK = 6
27 | 
28 | # prepare the following data.
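# --- added note (not in the original repo) ----------------------------------------------
# Expected shapes, inferred from modelDef() below; dataset sizes are assumptions:
#   img_*:  (num_samples, 7, 7, 512)   the 7x7x512 convolutional feature map produced by
#           VGG-16 for 224x224 input images
#   text_*: (num_samples, seq_length)  padded word-index sequences
#   tag_*:  (num_samples, num_tags)    multi-hot hashtag vectors
# ----------------------------------------------------------------------------------------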
img data is the output of VGG-16 29 | img_train, text_train, tag_train, img_test, text_test, tag_test 30 | 31 | 32 | def imageFeature(inputs): 33 | features = Reshape(target_shape=(num_region, 512))(inputs) 34 | features = Dense(embedding_size, activation="tanh", use_bias=False)(features) 35 | features_pooling = AveragePooling1D(pool_size=num_region, padding="same")(features) 36 | features_pooling = Lambda(lambda x: K.squeeze(x, axis=1))(features_pooling) 37 | 38 | return features, features_pooling 39 | 40 | 41 | def textFeature(X): 42 | embeddings = Embedding(input_dim=num_words + index_from, output_dim=embedding_size, 43 | mask_zero=True, input_length=seq_length)(X) 44 | tFeature = LSTM(units=embedding_size, return_sequences=True)(embeddings) 45 | 46 | return tFeature 47 | 48 | 49 | def modelDef(): 50 | inputs_img = Input(shape=(7, 7, 512)) 51 | inputs_text = Input(shape=(seq_length,)) 52 | 53 | iFeature, iFeature_pooling = imageFeature(inputs_img) 54 | tFeature = textFeature(inputs_text) 55 | co_feature = coAttention_alt(dim_k=dim_k)([iFeature, tFeature]) 56 | dropout = Dropout(drop_rate)(co_feature) 57 | Softmax = Dense(num_tags, activation="softmax", use_bias=True)(dropout) 58 | model = Model(inputs=[inputs_img, inputs_text], 59 | outputs=[Softmax]) 60 | # adam = optimizers.adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.00001) 61 | model.compile(optimizer="adam", loss=myLossFunc) 62 | return model 63 | 64 | 65 | def evaluation(y_true, y_pred, top_K): 66 | acc_count = 0 67 | precision_K = [] 68 | recall_K = [] 69 | f1_K = [] 70 | 71 | for i in range(y_pred.shape[0]): 72 | top_indices = y_pred[i].argsort()[-top_K:] 73 | if np.sum(y_true[i, top_indices]) >= 1: 74 | acc_count += 1 75 | p = np.sum(y_true[i, top_indices])/top_K 76 | r = np.sum(y_true[i, top_indices])/np.sum(y_true[i, :]) 77 | precision_K.append(p) 78 | recall_K.append(r) 79 | if p != 0 or r != 0: 80 | f1_K.append(2 * p * r / (p + r)) 81 | else: 82 | f1_K.append(0) 83 | 84 | acc_K = acc_count * 1.0 / y_pred.shape[0] 85 | 86 | return acc_K, np.mean(np.array(precision_K)), np.mean(np.array(recall_K)), np.mean(np.array(f1_K)) 87 | 88 | 89 | if __name__ == "__main__": 90 | model = modelDef() 91 | history = model.fit(x=[img_train, text_train], 92 | y=tag_train, 93 | batch_size=batch_size, 94 | epochs=1, 95 | verbose=1,) 96 | y_pred = model.predict(x=[test_img, test_text]) 97 | acc_K, precision_K, recall_K, f1_K = evaluation(test_tag, y_pred, TopK) 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /CoA/selfDef.py: -------------------------------------------------------------------------------- 1 | """ 2 | some DIY components used in co_attention.py 3 | 4 | coAttention_alt -- DIY coAttention layer using alternative mechanism 5 | 6 | myLossFunc -- DIY loss function. Loss is defined as the sum of -log(p), 7 | where p is the probability of a hashtag in a train instance 8 | 9 | """ 10 | import keras.backend as K 11 | from keras.engine.topology import Layer, InputSpec 12 | 13 | 14 | class coAttention_alt(Layer): 15 | """ 16 | self defined co-attention layer. 
17 | alternative co-attention 18 | inputs: [image feature tensor, hidden text feature tensor] 19 | output: co-Attention feature of image and text 20 | 21 | input dimensions:[(batchSize, num_region, CNN_dimension), 22 | (batchSize, seq_length, CNN_dimension)] 23 | output dimension: batch_size*CNN_dimension 24 | """ 25 | def __init__(self, dim_k, **kwargs): 26 | super(coAttention_alt, self).__init__(**kwargs) 27 | self.dim_k = dim_k # internal tensor dimension 28 | # self.input_spec = InputSpec(min_ndim=3) 29 | self.supports_masking = True 30 | 31 | def build(self, input_shape): 32 | if not isinstance(input_shape, list): 33 | raise ValueError('A Co-Attention_alt layer should be called ' 34 | 'on a list of inputs.') 35 | if len(input_shape) != 2: 36 | raise ValueError('A Co-Attention_alt layer should be called on a list of 3 inputs.' 37 | 'Got '+str(len(input_shape))+'inputs.') 38 | # print(input_shape) 39 | self.num_imgRegion = input_shape[0][1] 40 | self.seq_len = input_shape[1][1] 41 | self.output_dim = input_shape[0][2] 42 | 43 | """trainable variables naming rule: 44 | w/b + '_Dense_' + Vi/Vt + '_' + 0/1 45 | w: weight 46 | b: bias 47 | Vi: about image feature 48 | Vt: about text feature 49 | 0: phase 0 50 | 1: phase 1 51 | """ 52 | self.w_Dense_Vi_0 = self.add_weight(name='w_Dense_Vi_0', 53 | shape=(self.output_dim, self.dim_k), 54 | initializer='random_normal', 55 | trainable=True) 56 | self.w_Dense_Vt_0 = self.add_weight(name='w_Dense_Vt_0', 57 | shape=(self.output_dim, self.dim_k), 58 | initializer='random_normal', 59 | trainable=True) 60 | self.w_Dense_Pi_0 = self.add_weight(name='w_Dense_Pi_0', 61 | shape=(2*self.dim_k, 1), 62 | initializer='random_normal', 63 | trainable=True) 64 | self.b_Dense_Pi_0 = self.add_weight(name='b_Dense_Pi_0', 65 | shape=(self.num_imgRegion,), 66 | initializer='zeros', 67 | trainable=True) 68 | 69 | self.w_Dense_Vi_1 = self.add_weight(name='w_Dense_Vi_1', 70 | shape=(self.output_dim, self.dim_k), 71 | initializer='random_normal', 72 | trainable=True) 73 | self.w_Dense_Vt_1 = self.add_weight(name='w_Dense_Vt_1', 74 | shape=(self.output_dim, self.dim_k), 75 | initializer='random_normal', 76 | trainable=True) 77 | self.w_Dense_Pi_1 = self.add_weight(name='w_Dense_Pi_1', 78 | shape=(2*self.dim_k, 1), 79 | initializer='random_normal', 80 | trainable=True) 81 | self.b_Dense_Pi_1 = self.add_weight(name='b_Dense_Pi_1', 82 | shape=(self.seq_len,), 83 | initializer='zeros', 84 | trainable=True) 85 | 86 | super(coAttention_alt, self).build(input_shape) # Be sure to call this somewhere! 
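    # --- added summary (not in the original repo) ---------------------------------------
    # call() below implements the alternating co-attention in two phases:
    #   phase 0: the mean of the LSTM hidden states guides attention over image regions,
    #            producing the attended image vector Vi;
    #   phase 1: Vi in turn guides attention over the text hidden states, producing Vt.
    # The layer returns Vi + Vt as the joint post representation.
    # -------------------------------------------------------------------------------------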
87 | 88 | def call(self, x, mask=None): 89 | ifeature = x[0] 90 | tfeature_h = x[1] 91 | # tfeature = x[2] 92 | output_dim = self.output_dim 93 | num_imgRegion = self.num_imgRegion 94 | dim_k = self.dim_k 95 | seq_len = self.seq_len 96 | tfeature = K.mean(tfeature_h, axis=1) 97 | # print(tfeature_h, tfeature) 98 | 99 | # phase 0: text-guided image feature computation 100 | w_Vi_0 = K.dot(K.reshape(ifeature, [-1, output_dim]), self.w_Dense_Vi_0) 101 | # shape=((batchSize*num_imgRegion),dim_k) 102 | w_Vi_0 = K.reshape(w_Vi_0, [-1, num_imgRegion, dim_k]) # shape=(batchSize,num_imgRegion,dim_k) 103 | w_Vt_0 = K.repeat(K.dot(tfeature, self.w_Dense_Vt_0), num_imgRegion) # shape=(batchSize,num_imgRegion,dim_k) 104 | Vi_Vt_0 = K.concatenate([w_Vi_0, w_Vt_0], axis=-1) # shape=(batchSize,num_imgRegion,2*dim_k) 105 | Hi = K.tanh(Vi_Vt_0) 106 | # Hi_w = K.squeeze(K.dot(K.reshape(Hi, [-1, 2*dim_k]), self.w_Dense_Pi_0), axis=-1) 107 | # Hi_w_b = K.reshape(Hi_w, [-1, num_imgRegion]) + self.b_Dense_Pi_0 108 | Hi_w_b = K.squeeze(K.dot(Hi, self.w_Dense_Pi_0), axis=-1) + self.b_Dense_Pi_0 # shape=(batchSize,num_imgRegion) 109 | Pi = K.softmax(Hi_w_b) 110 | Pi = K.permute_dimensions(K.repeat(Pi, output_dim), (0, 2, 1)) # shape=(batchSize,num_imgRegion,output_dim) 111 | Pi_Vi = Pi*ifeature 112 | Vi = K.sum(Pi_Vi, axis=1) # shape=(batchSize,output_dim) 113 | 114 | # phase 1: image-guided text feature computation 115 | w_Vi_1 = K.repeat(K.dot(Vi, self.w_Dense_Vi_1), seq_len) # shape=(batchSize,seq_len,dim_k) 116 | w_Vt_1 = K.dot(K.reshape(tfeature_h, [-1, output_dim]), self.w_Dense_Vt_1) # shape=((batchSize*seq_len),dim_k) 117 | w_Vt_1 = K.reshape(w_Vt_1, (-1, seq_len, dim_k)) # shape= (batchSize, seq_len, dim_k) 118 | Vi_Vt_1 = K.concatenate([w_Vi_1, w_Vt_1], axis=-1) # shape=(batchSize, seq_len, 2*dim_k) 119 | Ht = K.tanh(Vi_Vt_1) 120 | Ht_b = K.squeeze(K.dot(Ht, self.w_Dense_Pi_1), axis=-1) + self.b_Dense_Pi_1 # shape=(batch_size, seq_len) 121 | Pt = K.softmax(Ht_b) 122 | Pt = K.permute_dimensions(K.repeat(Pt, output_dim), (0, 2, 1)) # shape=(batchSize, seq_len, output_dim) 123 | Pt_Vt = Pt*tfeature_h 124 | Vt = K.sum(Pt_Vt, axis=1) # shape=(batchSize, output_dim) 125 | 126 | return Vi+Vt 127 | 128 | def compute_output_shape(self, input_shape): 129 | output_shape = (input_shape[0][0], input_shape[0][-1]) 130 | return output_shape 131 | 132 | def get_config(self): 133 | return super(coAttention_alt, self).get_config() 134 | 135 | 136 | def myLossFunc(y_true, y_pred): 137 | probs_log = -K.log(y_pred) 138 | loss = K.mean(K.sum(probs_log*y_true, axis=-1)) 139 | # loss = K.mean(K.sum(K.clip(probs_log * y_true, -1e40, 100), axis=-1)) 140 | return loss 141 | 142 | 143 | if __name__ == "__main__": 144 | from keras.layers import Input 145 | i1 = Input(batch_shape=(10, 25, 100)) 146 | i2 = Input(batch_shape=(10, 36, 100)) 147 | y = coAttention_alt(100)([i1, i2]) 148 | print(y) 149 | -------------------------------------------------------------------------------- /NIPS2016/README.md: -------------------------------------------------------------------------------- 1 | # Hierarchical Co-Attention VQA 2 | 3 | Proposed in [Hierarchical Question-Image Co-Attention for Visual Question Answering](http://papers.nips.cc/paper/6202-hierarchical-question-image-co-attention-for-visual-question-answering.pdf) NIPS 2016, by Jiasen Lu et al. 4 | 5 | ## **Overview** 6 | ### Two kinds of co-attention mechanisms are proposedin this article. 
Basic idea of co-attention mechanism is that for certain target, the importance of subparts in text and image vary so they should be treated differently. Moreover, question text and image are usually semantically related and one can be used to understand the other. So they resort to co-attention mechanism and propose co-attention in alternative way and parallel way. 7 | ### Model Architecture 8 | ![VQA model](VQA.png) 9 | 10 | ## **Usage** 11 | ### Source files in this repository can not be executed immediately, as there is no data preprocessing and data inputs statements. So you should get your data prepared according to your need. 12 | 13 | ## **Requirements** 14 | - Python 3.x 15 | - Tensorflow >= 1.7 16 | - Keras >= 2.1.5 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /NIPS2016/VQA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhangtd/Models-reproducing/48cff578ef2b1a0f0a265bc7a0d10b17bd12e3fd/NIPS2016/VQA.png -------------------------------------------------------------------------------- /NIPS2016/co_attention.py: -------------------------------------------------------------------------------- 1 | from keras.applications.inception_v3 import InceptionV3 2 | from keras.models import Model, model_from_json 3 | from keras.layers import Input, Reshape, Dense, Embedding, Bidirectional, GRU, Dropout 4 | from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard 5 | from keras import optimizers 6 | 7 | from selfDef import coAttention_alt, coAttention_para, myLossFunc, self_conv1d, \ 8 | self_maxpooling, text_attention, encoding 9 | 10 | 11 | def textFeature(X): 12 | embeddings = Embedding(input_dim=num_words + 3, output_dim=embedding_size, 13 | weights=[init_embeddings], mask_zero=True, input_length=seq_length)(X) 14 | 15 | word_level = embeddings 16 | phrase_level_1 = self_conv1d(filters=embedding_size, 17 | kernel_size=1, 18 | padding="same", 19 | activation="tanh")(word_level) 20 | phrase_level_2 = self_conv1d(filters=embedding_size, 21 | kernel_size=2, 22 | padding="same", 23 | activation="tanh")(word_level) 24 | phrase_level_3 = self_conv1d(filters=embedding_size, 25 | kernel_size=3, 26 | padding="same", 27 | activation="tanh")(word_level) 28 | 29 | phrase_level = self_maxpooling()([phrase_level_1, phrase_level_2, phrase_level_3]) 30 | 31 | text_level = Bidirectional(GRU(units=hidden_size, return_sequences=True))(phrase_level) 32 | return word_level, phrase_level, text_level 33 | 34 | 35 | def imageFeature(inputs): 36 | imageModel = InceptionV3(weights='imagenet', include_top=False, ) 37 | for layer in imageModel.layers: 38 | layer.trainable = False 39 | features = imageModel(inputs) 40 | features = Reshape(target_shape=(num_region, 2048))(features) 41 | features = Dense(hidden_size * 2, activation="tanh", use_bias=False)(features) 42 | return features 43 | 44 | 45 | def modelDef(): 46 | inputs_img = Input(shape=(299, 299, 3,)) 47 | inputs_text = Input(shape=(seq_length,)) 48 | text_mask = Input(shape=(seq_length,)) 49 | 50 | iFeature = imageFeature(inputs_img) 51 | tFeature_word, tFeature_phrase, tFeature_text = textFeature(inputs_text) 52 | 53 | sum_tFeature_word = text_attention(attention_size)(tFeature_word) 54 | sum_tFeature_phrase = text_attention(attention_size)(tFeature_phrase) 55 | sum_tFeature_text = text_attention(attention_size)(tFeature_text) 56 | 57 | co_feature_word = coAttention_alt(dim_k=dim_k)([iFeature, tFeature_word, 
sum_tFeature_word]) 58 | co_feature_phrase = coAttention_alt(dim_k=dim_k)([iFeature, tFeature_phrase, sum_tFeature_phrase]) 59 | co_feature_text = coAttention_alt(dim_k=dim_k)([iFeature, tFeature_text, sum_tFeature_text]) 60 | 61 | h = encoding()([co_feature_word, co_feature_phrase, co_feature_text]) 62 | dropout = Dropout(drop_rate)(h) 63 | 64 | Softmax = Dense(num_tags, activation="softmax", use_bias=True)(dropout) 65 | 66 | model = Model(inputs=[inputs_img, inputs_text, text_mask], 67 | outputs=[Softmax]) 68 | sgd = optimizers.SGD(lr=0.15, momentum=0.9, clipnorm=1.0) 69 | model.compile(optimizer=sgd, loss=myLossFunc, metrics=[accuracy]) 70 | # res = model.predict(x=..) 71 | return model 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /NIPS2016/selfDef.py: -------------------------------------------------------------------------------- 1 | import keras.backend as K 2 | from keras.engine.topology import Layer, InputSpec 3 | from keras.utils import conv_utils 4 | from keras import activations, initializers, regularizers, constraints 5 | 6 | top_K = 1 7 | REMOVE_FACTOR = -10000 8 | 9 | 10 | class text_attention(Layer): 11 | """ 12 | self defined text attention layer. 13 | input: hidden text feature 14 | output: summarized text feature with attention mechanism 15 | 16 | input shape: (batch_size, seq_length, embedding_size) 17 | output shape: (batch_size, embedding_size) 18 | """ 19 | def __init__(self, units, return_alphas=False, **kwargs): 20 | super(text_attention, self).__init__(**kwargs) 21 | self.units = units 22 | self.input_spec = InputSpec(min_ndim=3) 23 | self.supports_masking = True 24 | self.return_alphas = return_alphas 25 | 26 | def build(self, input_shape): 27 | input_dim = input_shape[-1] 28 | # Create a trainable weight variable for this layer. 29 | self.w_omega = self.add_weight(name='w_omega', 30 | shape=(input_dim, self.units), 31 | initializer='random_normal', 32 | trainable=True) 33 | self.b_omega = self.add_weight(name='b_omega', 34 | shape=(self.units,), 35 | initializer='zeros', 36 | trainable=True) 37 | self.u_omega = self.add_weight(name='u_omega', 38 | shape=(self.units,), 39 | initializer='random_normal', 40 | trainable=True) 41 | super(text_attention, self).build(input_shape) # Be sure to call this somewhere! 42 | 43 | def call(self, x, mask=None): 44 | input_dim = K.shape(x)[-1] 45 | v = K.tanh(K.dot(K.reshape(x, [-1, input_dim]), self.w_omega) + K.expand_dims(self.b_omega, 0)) 46 | vu = K.dot(v, K.expand_dims(self.u_omega, -1)) 47 | vu = K.reshape(vu, K.shape(x)[:2]) 48 | m = K.cast(mask, dtype='float32') 49 | m = m - 1 50 | m = m * REMOVE_FACTOR 51 | vu = vu + m 52 | alphas = K.softmax(vu) 53 | output = K.sum(x * K.expand_dims(alphas, -1), 1) 54 | if self.return_alphas: 55 | return [output] + [alphas] 56 | else: 57 | return output 58 | 59 | def compute_mask(self, inputs, mask=None): 60 | return None 61 | 62 | def compute_output_shape(self, input_shape): 63 | output_shape = (input_shape[0], input_shape[2]) 64 | if self.return_alphas: 65 | alphas_shape = [(input_shape[0], input_shape[1])] 66 | return [output_shape] + alphas_shape 67 | else: 68 | return output_shape 69 | 70 | def get_config(self): 71 | return super(text_attention, self).get_config() 72 | 73 | 74 | class coAttention_alt(Layer): 75 | """ 76 | self defined co-attention layer. 
77 | alternative co-attention 78 | inputs: [image feature tensor, hidden text feature tensor, summarized text feature tensor(after attention)] 79 | output: co-Attention feature of image and text 80 | 81 | input dimensions:[(batchSize, num_region, CNN_dimension), 82 | (batchSize, seq_length, CNN_dimension),(batchSize, CNN_dimension)] 83 | output dimension: batch_size*CNN_dimension 84 | """ 85 | def __init__(self, dim_k, **kwargs): 86 | super(coAttention_alt, self).__init__(**kwargs) 87 | self.dim_k = dim_k # internal tensor dimension 88 | # self.input_spec = InputSpec(min_ndim=3) 89 | self.supports_masking = True 90 | 91 | def build(self, input_shape): 92 | if not isinstance(input_shape, list): 93 | raise ValueError('A Co-Attention_alt layer should be called ' 94 | 'on a list of inputs.') 95 | if len(input_shape) != 3: 96 | raise ValueError('A Co-Attention_alt layer should be called on a list of 3 inputs.' 97 | 'Got '+str(len(input_shape))+'inputs.') 98 | self.num_imgRegion = input_shape[0][1] 99 | self.seq_len = input_shape[1][1] 100 | self.output_dim = input_shape[0][2] 101 | 102 | """trainable variables naming rule: 103 | w/b + '_Dense_' + Vi/Vt + '_' + 0/1 104 | w: weight 105 | b: bias 106 | Vi: about image feature 107 | Vt: about text feature 108 | 0: phase 0 109 | 1: phase 1 110 | """ 111 | self.w_Dense_Vi_0 = self.add_weight(name='w_Dense_Vi_0', 112 | shape=(self.output_dim, self.dim_k), 113 | initializer='random_normal', 114 | trainable=True) 115 | self.w_Dense_Vt_0 = self.add_weight(name='w_Dense_Vt_0', 116 | shape=(self.output_dim, self.dim_k), 117 | initializer='random_normal', 118 | trainable=True) 119 | self.w_Dense_Pi_0 = self.add_weight(name='w_Dense_Pi_0', 120 | shape=(2*self.dim_k, 1), 121 | initializer='random_normal', 122 | trainable=True) 123 | self.b_Dense_Pi_0 = self.add_weight(name='b_Dense_Pi_0', 124 | shape=(self.num_imgRegion,), 125 | initializer='zeros', 126 | trainable=True) 127 | 128 | self.w_Dense_Vi_1 = self.add_weight(name='w_Dense_Vi_1', 129 | shape=(self.output_dim, self.dim_k), 130 | initializer='random_normal', 131 | trainable=True) 132 | self.w_Dense_Vt_1 = self.add_weight(name='w_Dense_Vt_1', 133 | shape=(self.output_dim, self.dim_k), 134 | initializer='random_normal', 135 | trainable=True) 136 | self.w_Dense_Pi_1 = self.add_weight(name='w_Dense_Pi_1', 137 | shape=(2*self.dim_k, 1), 138 | initializer='random_normal', 139 | trainable=True) 140 | self.b_Dense_Pi_1 = self.add_weight(name='b_Dense_Pi_1', 141 | shape=(self.seq_len,), 142 | initializer='zeros', 143 | trainable=True) 144 | 145 | super(coAttention_alt, self).build(input_shape) # Be sure to call this somewhere! 
146 | 147 | def call(self, x, mask=None): 148 | ifeature = x[0] 149 | tfeature_h = x[1] 150 | tfeature = x[2] 151 | output_dim = self.output_dim 152 | num_imgRegion = self.num_imgRegion 153 | dim_k = self.dim_k 154 | seq_len = self.seq_len 155 | 156 | # phase 0: text-guided image feature computation 157 | w_Vi_0 = K.dot(K.reshape(ifeature, [-1, output_dim]), self.w_Dense_Vi_0) 158 | # shape=((batchSize*num_imgRegion),dim_k) 159 | w_Vi_0 = K.reshape(w_Vi_0, [-1, num_imgRegion, dim_k]) # shape=(batchSize,num_imgRegion,dim_k) 160 | w_Vt_0 = K.repeat(K.dot(tfeature, self.w_Dense_Vt_0), num_imgRegion) # shape=(batchSize,num_imgRegion,dim_k) 161 | Vi_Vt_0 = K.concatenate([w_Vi_0, w_Vt_0], axis=-1) # shape=(batchSize,num_imgRegion,2*dim_k) 162 | Hi = K.tanh(Vi_Vt_0) 163 | # Hi_w = K.squeeze(K.dot(K.reshape(Hi, [-1, 2*dim_k]), self.w_Dense_Pi_0), axis=-1) 164 | # Hi_w_b = K.reshape(Hi_w, [-1, num_imgRegion]) + self.b_Dense_Pi_0 165 | Hi_w_b = K.squeeze(K.dot(Hi, self.w_Dense_Pi_0), axis=-1) + self.b_Dense_Pi_0 # shape=(batchSize,num_imgRegion) 166 | Pi = K.softmax(Hi_w_b) 167 | Pi = K.permute_dimensions(K.repeat(Pi, output_dim), (0, 2, 1)) # shape=(batchSize,num_imgRegion,output_dim) 168 | Pi_Vi = Pi*ifeature 169 | Vi = K.sum(Pi_Vi, axis=1) # shape=(batchSize,output_dim) 170 | 171 | # phase 1: image-guided text feature computation 172 | w_Vi_1 = K.repeat(K.dot(Vi, self.w_Dense_Vi_1), seq_len) # shape=(batchSize,seq_len,dim_k) 173 | w_Vt_1 = K.dot(K.reshape(tfeature_h, [-1, output_dim]), self.w_Dense_Vt_1) # shape=((batchSize*seq_len),dim_k) 174 | w_Vt_1 = K.reshape(w_Vt_1, (-1, seq_len, dim_k)) # shape= (batchSize, seq_len, dim_k) 175 | Vi_Vt_1 = K.concatenate([w_Vi_1, w_Vt_1], axis=-1) # shape=(batchSize, seq_len, 2*dim_k) 176 | Ht = K.tanh(Vi_Vt_1) 177 | Ht_b = K.squeeze(K.dot(Ht, self.w_Dense_Pi_1), axis=-1) + self.b_Dense_Pi_1 # shape=(batch_size, seq_len) 178 | Pt = K.softmax(Ht_b) 179 | Pt = K.permute_dimensions(K.repeat(Pt, output_dim), (0, 2, 1)) # shape=(batchSize, seq_len, output_dim) 180 | Pt_Vt = Pt*tfeature_h 181 | Vt = K.sum(Pt_Vt, axis=1) # shape=(batchSize, output_dim) 182 | 183 | return Vi+Vt 184 | 185 | def compute_output_shape(self, input_shape): 186 | output_shape = (input_shape[0][0], input_shape[0][-1]) 187 | return output_shape 188 | 189 | def get_config(self): 190 | return super(coAttention_alt, self).get_config() 191 | 192 | 193 | class coAttention_para(Layer): 194 | """ 195 | self-defined parallel co-attention layer. 196 | inputs: [tFeature, iFeature] 197 | outputs: [coFeature] 198 | 199 | dimension: 200 | input dimensions: [(batch_size, seq_length, embedding_size), (batch_size, num_img_region, 2*hidden_size)] 201 | considering subsequent operation, better to set embedding_size == 2*hidden_size 202 | output dimensions:[(batch_size, 2*hidden_size)] 203 | """ 204 | def __init__(self, dim_k, **kwargs): 205 | super(coAttention_para, self).__init__(**kwargs) 206 | self.dim_k = dim_k # internal tensor dimension 207 | self.supports_masking = True 208 | 209 | def build(self, input_shape): 210 | if not isinstance(input_shape, list): 211 | raise ValueError('A Co-Attention_para layer should be called ' 212 | 'on a list of inputs.') 213 | if len(input_shape) != 2: 214 | raise ValueError('A Co-Attention_para layer should be called on a list of 2 inputs.' 
215 | 'Got '+str(len(input_shape))+'inputs.') 216 | self.embedding_size = input_shape[0][-1] 217 | self.num_region = input_shape[1][1] 218 | self.seq_len = input_shape[0][1] 219 | """ 220 | naming variables following the VQA paper 221 | """ 222 | self.Wb = self.add_weight(name="Wb", 223 | initializer="random_normal", 224 | # initializer="ones", 225 | shape=(self.embedding_size, self.embedding_size), 226 | trainable=True) 227 | self.Wq = self.add_weight(name="Wq", 228 | initializer="random_normal", 229 | # initializer="ones", 230 | shape=(self.embedding_size, self.dim_k), 231 | trainable=True) 232 | self.Wv = self.add_weight(name="Wv", 233 | initializer="random_normal", 234 | # initializer="ones", 235 | shape=(self.embedding_size, self.dim_k), 236 | trainable=True) 237 | self.Whv = self.add_weight(name="Whv", 238 | initializer="random_normal", 239 | # initializer="ones", 240 | shape=(self.dim_k, 1), 241 | trainable=True) 242 | self.Whq = self.add_weight(name="Whq", 243 | initializer="random_normal", 244 | # initializer="ones", 245 | shape=(self.dim_k, 1), 246 | trainable=True) 247 | 248 | super(coAttention_para, self).build(input_shape) # Be sure to call this somewhere! 249 | 250 | def call(self, inputs, mask=None): 251 | tFeature = inputs[0] 252 | iFeature = inputs[1] 253 | # affinity matrix C 254 | affi_mat = K.dot(tFeature, self.Wb) 255 | affi_mat = K.batch_dot(affi_mat, K.permute_dimensions(iFeature, (0, 2, 1))) # (batch_size, seq_len, num_region) 256 | # Hq, Hv, av, aq 257 | tmp_Hv = K.dot(tFeature, self.Wq) 258 | Hv = K.dot(iFeature, self.Wv) + K.batch_dot(K.permute_dimensions(affi_mat, (0, 2, 1)), tmp_Hv) 259 | Hv = K.tanh(Hv) 260 | av = K.softmax(K.squeeze(K.dot(Hv, self.Whv), axis=-1)) 261 | 262 | tmp_Hq = K.dot(iFeature, self.Wv) 263 | Hq = K.dot(tFeature, self.Wq) + K.batch_dot(affi_mat, tmp_Hq) 264 | Hq = K.tanh(Hq) 265 | aq = K.softmax(K.squeeze(K.dot(Hq, self.Whq), axis=-1)) 266 | 267 | av = K.permute_dimensions(K.repeat(av, self.embedding_size), (0, 2, 1)) 268 | aq = K.permute_dimensions(K.repeat(aq, self.embedding_size), (0, 2, 1)) 269 | 270 | tfeature = K.sum(aq * tFeature, axis=1) 271 | ifeature = K.sum(av * iFeature, axis=1) 272 | 273 | return tfeature+ifeature 274 | 275 | def get_config(self): 276 | return super(coAttention_para, self).get_config() 277 | 278 | 279 | class encoding(Layer): 280 | """ 281 | self defined encoding layer, summarize total co-feature based on three level co-features 282 | input: [co_feature_word, co_feature_phrase, co_feature_text] 283 | output: total co_feature 284 | 285 | dimension : 286 | input dimensions : [(batch_size, embedding_size)]*3 287 | output dimension: (batch_size, embedding_size) 288 | """ 289 | def __init__(self, **kwargs): 290 | super(encoding, self).__init__(**kwargs) 291 | # self.input_spec = InputSpec(min_ndim=3) 292 | self.supports_masking = True 293 | 294 | def build(self, input_shape): 295 | if not isinstance(input_shape, list): 296 | raise ValueError('A Co-Attention_alt layer should be called ' 297 | 'on a list of inputs.') 298 | if len(input_shape) != 3: 299 | raise ValueError('A Co-Attention_alt layer should be called on a list of 3 inputs.' 
300 | 'Got '+str(len(input_shape))+'inputs.') 301 | self.output_dim = input_shape[0][-1] 302 | 303 | self.w_word = self.add_weight(name='w_word', 304 | shape=(self.output_dim, self.output_dim), 305 | initializer='random_normal', 306 | trainable=True) 307 | self.w_phrase = self.add_weight(name="w_phrase", 308 | shape=(self.output_dim*2, self.output_dim), 309 | initializer="random_normal", 310 | trainable=True) 311 | self.w_text = self.add_weight(name="w_text", 312 | shape=(self.output_dim*2, self.output_dim), 313 | initializer="random_normal", 314 | trainable=True) 315 | super(encoding, self).build(input_shape) # Be sure to call this somewhere! 316 | 317 | def call(self, inputs, mask=None): 318 | feature_word = inputs[0] 319 | feature_phrase = inputs[1] 320 | feature_text = inputs[2] 321 | 322 | h_w = K.tanh(K.dot(feature_word, self.w_word)) 323 | h_p = K.tanh(K.dot(K.concatenate([feature_phrase, h_w]), self.w_phrase)) 324 | h_t = K.tanh(K.dot(K.concatenate([feature_text, h_p]), self.w_text)) 325 | 326 | return h_t 327 | 328 | def compute_output_shape(self, input_shape): 329 | output_shape = (input_shape[0][0], input_shape[0][1]) 330 | return output_shape 331 | 332 | def get_config(self): 333 | return super(encoding, self).get_config() 334 | 335 | 336 | class self_conv1d(Layer): 337 | def __init__(self, filters, 338 | kernel_size, 339 | strides=1, 340 | padding='same', 341 | data_format=None, 342 | activation='tanh', 343 | kernel_initializer='glorot_uniform', 344 | bias_initializer='zeros', 345 | use_bias=True, 346 | ** kwargs): 347 | super(self_conv1d, self).__init__(**kwargs) 348 | self.rank = 1 349 | self.filters = filters 350 | self.data_format = conv_utils.normalize_data_format(data_format) 351 | self.kernel_size = conv_utils.normalize_tuple(kernel_size, self.rank, 'kernel_size') 352 | self.strides = conv_utils.normalize_tuple(strides, self.rank, 'strides') 353 | self.padding = conv_utils.normalize_padding(padding) 354 | self.activation = activations.get(activation) 355 | self.use_bias = use_bias 356 | self.supports_masking = True 357 | self.kernel_initializer = initializers.get(kernel_initializer) 358 | self.bias_initializer = initializers.get(bias_initializer) 359 | 360 | def build(self, input_shape): 361 | if len(input_shape) != 3: 362 | raise ValueError('A Co-Attention_alt layer should be called on a tensor of 3 dims.' 363 | 'Got '+str(len(input_shape))+'dims.') 364 | if self.data_format == "channels_first": 365 | channel_axis = 1 366 | else: 367 | channel_axis = -1 368 | if input_shape[channel_axis] is None: 369 | raise ValueError('The channel dimension of the inputs ' 370 | 'should be defined. 
Found `None`.')
371 |         input_dim = input_shape[channel_axis]
372 |         kernel_shape = self.kernel_size + (input_dim, self.filters)
373 |         self.kernel = self.add_weight(shape=kernel_shape,
374 |                                       initializer=self.kernel_initializer,
375 |                                       # initializer='ones',
376 |                                       name='kernel',
377 |                                       trainable=True)
378 |         if self.use_bias:
379 |             self.bias = self.add_weight(shape=(self.filters,),
380 |                                         initializer=self.bias_initializer,
381 |                                         # initializer='zeros',
382 |                                         name='bias',
383 |                                         trainable=True)
384 |         else:
385 |             self.bias = None
386 | 
387 |         self.built = True
388 | 
389 |     def call(self, inputs, mask=None):
390 |         # print(K.get_value(self.kernel))
391 |         outputs = K.conv1d(
392 |             inputs,
393 |             self.kernel,
394 |             strides=self.strides[0],
395 |             padding=self.padding,
396 |             data_format=self.data_format)
397 | 
398 |         if self.use_bias:
399 |             outputs = K.bias_add(
400 |                 outputs,
401 |                 self.bias,
402 |                 data_format=self.data_format)
403 | 
404 |         if self.activation is not None:
405 |             return self.activation(outputs)
406 | 
407 |         return outputs
408 | 
409 |     def compute_mask(self, inputs, mask=None):
410 |         if isinstance(mask, list):
411 |             mask = mask[0]
412 |         return mask
413 | 
414 |     def compute_output_shape(self, input_shape):
415 |         output_shape = input_shape
416 |         return output_shape
417 | 
418 |     def get_config(self):
419 |         return super(self_conv1d, self).get_config()
420 | 
421 | 
422 | class self_maxpooling(Layer):
423 |     def __init__(self, **kwargs):
424 |         self.supports_masking = True
425 |         super(self_maxpooling, self).__init__(**kwargs)
426 | 
427 |     def build(self, input_shape):
428 |         if not isinstance(input_shape, list):
429 |             raise ValueError('A self_maxpooling layer should be called '
430 |                              'on a list of inputs.')
431 |         self.num_inputs = len(input_shape)
432 | 
433 |     def call(self, inputs, mask=None):
434 |         tmp = K.stack([inputs[0], inputs[1], inputs[2]], axis=1)
435 | 
436 |         outputs = K.max(tmp, axis=1)
437 | 
438 |         return outputs
439 | 
440 |     def compute_mask(self, inputs, mask=None):
441 |         if isinstance(mask, list):
442 |             mask = mask[0]
443 |         return mask
444 | 
445 |     def compute_output_shape(self, input_shape):
446 |         output_shape = input_shape[0]
447 |         return output_shape
448 | 
449 |     def get_config(self):
450 |         return super(self_maxpooling, self).get_config()
451 | 
452 | 
453 | def myLossFunc(y_true, y_pred):
454 |     probs_log = -K.log(y_pred)
455 |     loss = K.mean(K.sum(probs_log*y_true, axis=-1))
456 |     return loss
457 | 
458 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Models-reproducing
 2 | Some models from AI research papers, implemented by myself.
 3 | 
 4 | **The most useful parts are the self-defined Keras layers.**
 5 | ### 1. **CoA** from [Hashtag Recommendation for Multimodal Microblog Using Co-Attention Networks.](https://www.ijcai.org/proceedings/2017/0478.pdf)
 6 | A hashtag recommendation model using a co-attention mechanism.
 7 | 
 8 | ### 2. **TAB-LSTM** from [Hashtag recommendation with topical attention-based LSTM](http://www.aclweb.org/anthology/C16-1284).
 9 | A hashtag recommendation model for textual content using an attention mechanism and topical distributions.
10 | 
11 | ### 3. **ABC (attention-based convolutional)** from [Hashtag Recommendation Using Attention-Based Convolutional Neural Network](https://www.ijcai.org/Proceedings/16/Papers/395.pdf).
12 | A hashtag recommendation model for textual content using a two-level (local and global) CNN-based attention mechanism.
13 | 
14 | ### 4. **Hierarchical Co-Attention VQA** from [Hierarchical Question-Image Co-Attention for Visual Question Answering](http://papers.nips.cc/paper/6202-hierarchical-question-image-co-attention-for-visual-question-answering.pdf).
15 | The co-attention mechanism was first proposed in this article.
--------------------------------------------------------------------------------
/TAB-LSTM/README.md:
--------------------------------------------------------------------------------
 1 | # TAB-LSTM model
 2 | 
 3 | ### Proposed in [Hashtag recommendation with topical attention-based LSTM](http://www.aclweb.org/anthology/C16-1284) COLING 2016, by Yang Li et al.
 4 | 
 5 | ## **Overview**
 6 | ### TAB-LSTM recommends hashtags for textual content. It employs an LSTM to extract textual features and integrates an attention mechanism guided by pre-trained topical distributions.
 7 | 
 8 | ### Model Architecture
 9 | ![TAB-LSTM model](TAB-LSTM.png)
10 | 
11 | ## **Usage**
12 | ### Source files in this repository cannot be executed as-is, because the data preprocessing and data input statements are omitted. Prepare your data according to your needs.
13 | 
14 | ### Topical distributions can be pre-trained with an LDA model.
15 | 
16 | ## **Requirements**
17 | - Python 3.x
18 | - Tensorflow >= 1.7
19 | - Keras >= 2.1.5
20 | 
21 | 
22 | 
23 | 
--------------------------------------------------------------------------------
/TAB-LSTM/TAB-LSTM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Zhangtd/Models-reproducing/48cff578ef2b1a0f0a265bc7a0d10b17bd12e3fd/TAB-LSTM/TAB-LSTM.png
--------------------------------------------------------------------------------
/TAB-LSTM/topicAtt.py:
--------------------------------------------------------------------------------
 1 | from keras.models import Model
 2 | from keras.layers import Input
 3 | import keras.backend as K
 4 | from keras.engine.topology import Layer, InputSpec
 5 | from keras.utils import conv_utils
 6 | from keras import activations, initializers
 7 | 
 8 | 
 9 | class topicAttention(Layer):
10 |     """
11 |     self defined topical attention layer.
12 |     input: [hiddenStates, topicDistribution]
13 |     input_shape: [(batch_size, seq_len, embedding_size), (batch_size, topic_num)]
14 |     output: topical_text_feature
15 |     output shape: (batch_size, embedding_size)
16 |     """
17 |     def __init__(self, **kwargs):
18 |         super(topicAttention, self).__init__(**kwargs)
19 |         # self.input_spec = InputSpec(min_ndim=3)
20 |         self.supports_masking = True
21 | 
22 |     def build(self, input_shape):
23 |         self.embedding_size = input_shape[0][-1]
24 |         self.topic_num = input_shape[1][-1]
25 |         self.seq_len = input_shape[0][1]
26 | 
27 |         # Create a trainable weight variable for this layer.
28 |         self.w = self.add_weight(name='w',
29 |                                  shape=(self.embedding_size, self.topic_num),
30 |                                  initializer='random_normal',
31 |                                  trainable=True)
32 |         self.v = self.add_weight(name='v',
33 |                                  shape=(self.embedding_size, 1),
34 |                                  initializer='zeros',
35 |                                  trainable=True)
36 |         self.u = self.add_weight(name='u',
37 |                                  shape=(self.embedding_size, self.embedding_size),
38 |                                  initializer='random_normal',
39 |                                  trainable=True)
40 |         super(topicAttention, self).build(input_shape)  # Be sure to call this somewhere!
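    # --- added summary (not in the original repo) ---------------------------------------
    # call() below scores each hidden state h_t against the post's topic distribution
    # theta, roughly g_t = v^T * tanh(w * theta + u * h_t), normalizes the scores with a
    # softmax over the sequence, and returns the attention-weighted sum of hidden states.
    # -------------------------------------------------------------------------------------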
41 | 42 | def call(self, x, mask=None): 43 | h = x[0] 44 | theta = x[1] 45 | 46 | theta_w = K.dot(theta, K.transpose(self.w)) 47 | theta_w = K.repeat(theta_w, self.seq_len) 48 | h_ = K.reshape(h, [-1, self.embedding_size]) 49 | h_u = K.dot(h_, self.u) 50 | h_u = K.reshape(h_u, [-1, self.seq_len, self.embedding_size]) 51 | 52 | g = K.dot(K.tanh(theta_w+h_u), self.v) 53 | weight = K.softmax(K.squeeze(g, axis=-1)) 54 | weight = K.expand_dims(weight, axis=-1) 55 | weight = K.repeat_elements(weight, self.embedding_size, axis=-1) 56 | vec = weight * h 57 | vec = K.sum(vec, axis=1) 58 | 59 | return vec 60 | 61 | def compute_mask(self, inputs, mask=None): 62 | return None 63 | 64 | def compute_output_shape(self, input_shape): 65 | output_shape = (input_shape[0][0], input_shape[0][-1]) 66 | return output_shape 67 | 68 | 69 | if __name__ == "__main__": 70 | input1 = Input(batch_shape=(10, 25, 50)) 71 | input2 = Input(batch_shape=(10, 20)) 72 | 73 | topic_h = topicAttention()([input1, input2]) 74 | print(topic_h) 75 | -------------------------------------------------------------------------------- /TAB-LSTM/topicLSTM.py: -------------------------------------------------------------------------------- 1 | 2 | from keras.models import Model 3 | from keras.layers import Input, Reshape, Dense, Embedding, Dropout, LSTM, AveragePooling1D, Lambda, Concatenate, \ 4 | Multiply, RepeatVector, Flatten, Activation, Permute, merge 5 | import keras.backend as K 6 | 7 | from topicAtt import topicAttention 8 | import numpy as np 9 | 10 | num_tags = 3896 11 | num_words = 212000 12 | index_from = 3 13 | seq_length = 30 14 | batch_size = 512 15 | embedding_size = 300 16 | attention_size = 200 17 | topic_num = 100 18 | dim_k = 100 19 | drop_rate = 0.75 20 | 21 | # prepare the following data. 
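# --- added note (not in the original repo) ----------------------------------------------
# Expected shapes, inferred from modelDef() below; dataset sizes are assumptions:
#   texts_*:  (num_samples, seq_length) padded word-index sequences
#   topics_*: (num_samples, topic_num)  document-topic distributions, e.g. from a
#             pre-trained LDA model as suggested in the README
#   tags_*:   (num_samples, num_tags)   multi-hot hashtag vectors
# ----------------------------------------------------------------------------------------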
22 | # required arrays: texts_train, topics_train, tags_train, texts_test, topics_test, tags_test
23 | 
24 | 
25 | def myLossFunc(y_true, y_pred):
26 |     probs_log = -K.log(y_pred)
27 |     loss = K.mean(K.sum(probs_log*y_true, axis=-1))
28 |     # loss = K.mean(K.sum(K.clip(probs_log * y_true, -1e40, 100), axis=-1))
29 |     return loss
30 | 
31 | 
32 | def modelDef():
33 |     input_text = Input(shape=(seq_length, ))
34 |     input_topic = Input(shape=(topic_num,))
35 | 
36 |     embeddings = Embedding(input_dim=num_words+index_from, output_dim=embedding_size,
37 |                            mask_zero=True, input_length=seq_length)(input_text)
38 |     tFeature = LSTM(units=embedding_size, return_sequences=True)(embeddings)
39 |     topic_h = topicAttention()([tFeature, input_topic])
40 |     dropout = Dropout(drop_rate)(topic_h)
41 |     Softmax = Dense(num_tags, activation="softmax", use_bias=True)(dropout)
42 | 
43 |     model = Model(inputs=[input_text, input_topic], outputs=[Softmax])
44 |     model.compile(optimizer="adam", loss=myLossFunc)
45 | 
46 |     return model
47 | 
48 | 
49 | def evaluation(y_true, y_pred, top_K):
50 |     acc_count = 0
51 |     precision_K = []
52 |     recall_K = []
53 |     f1_K = []
54 | 
55 |     for i in range(y_pred.shape[0]):
56 |         top_indices = y_pred[i].argsort()[-top_K:]
57 |         if np.sum(y_true[i, top_indices]) >= 1:
58 |             acc_count += 1
59 |         p = np.sum(y_true[i, top_indices])/top_K
60 |         r = np.sum(y_true[i, top_indices])/np.sum(y_true[i, :])
61 |         precision_K.append(p)
62 |         recall_K.append(r)
63 |         if p != 0 or r != 0:
64 |             f1_K.append(2 * p * r / (p + r))
65 |         else:
66 |             f1_K.append(0)
67 | 
68 |     acc_K = acc_count * 1.0 / y_pred.shape[0]
69 | 
70 |     return acc_K, np.mean(np.array(precision_K)), np.mean(np.array(recall_K)), np.mean(np.array(f1_K))
71 | 
72 | 
73 | if __name__ == "__main__":
74 |     top_K = 5  # number of recommended hashtags used for evaluation
75 |     myModel = modelDef()
76 |     history = myModel.fit(x=[texts_train, topics_train],
77 |                           y=tags_train,
78 |                           batch_size=batch_size,
79 |                           epochs=1,
80 |                           verbose=1, )
81 |     y_pred = myModel.predict(x=[texts_test, topics_test])
82 |     acc, precision, recall, f1 = evaluation(tags_test, y_pred, top_K)
83 | 
84 | 
--------------------------------------------------------------------------------