├── .gitignore
├── LICENSE.txt
├── README.md
├── keras_vse
    ├── __init__.py
    ├── layers.py
    ├── models.py
    └── tools.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.DS_Store
 2 | *.pyc
 3 | *.swp
 4 | *.egg-info
 5 | build/
 6 | !examples/README.md
 7 | vgg16_weights.h5
 8 | venv
 9 | MANIFEST
10 | dist/
11 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | Copyright (c) 2016 Adam Wentz
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 5 | this software and associated documentation files (the "Software"), to deal in the
 6 | Software without restriction, including without limitation the rights to use,
 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
 8 | Software, and to permit persons to whom the Software is furnished to do so,
 9 | subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in all
12 | copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
19 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Visual-Semantic Embedding with Keras
2 | =====================================
3 | [Keras](http://keras.io/) implementation of https://github.com/ryankiros/visual-semantic-embedding
4 | 
5 | Pre-trained model
6 | -----------------
7 |  * [Visual-Semantic Embedding](http://www.cs.toronto.edu/~rkiros/models/vse.zip)
8 | 


--------------------------------------------------------------------------------
/keras_vse/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/keras_vse/layers.py:
--------------------------------------------------------------------------------
 1 | from keras import backend as K
 2 | from keras.layers import Layer
 3 | 
 4 | 
 5 | class L2Normalize(Layer):
 6 |     def __init__(self, axis=-1, **kwargs):
 7 |         self.axis = axis
 8 |         return super(L2Normalize, self).__init__(**kwargs)
 9 | 
10 |     def call(self, x, mask=None):
11 |         return K.l2_normalize(x, axis=self.axis)
12 | 
13 |     def get_config(self):
14 |         base_config = super(L2Normalize, self).get_config()
15 |         base_config.update(dict(axis=self.axis))
16 |         return base_config
17 | 


--------------------------------------------------------------------------------
/keras_vse/models.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import six
 3 | from keras import backend as K
 4 | from keras.layers import Convolution2D, Dense, Embedding, GRU, Input
 5 | from keras.models import Model, Sequential
 6 | 
 7 | from .layers import L2Normalize
 8 | from .tools import encode_sentences
 9 | 
10 | 
def build_image_encoder(weights=None, input_dim=4096, embedding_dim=1024, normalize=True):
    '''Build the model that projects image features into the embedding space.

    A single `Dense` layer maps `input_dim`-sized inputs to `embedding_dim`
    outputs. `weights` is handed to that layer unchanged. When `normalize`
    is true the projection is followed by an `L2Normalize` layer.
    '''
    features_in = Input(shape=(input_dim,))
    projection = Dense(embedding_dim, weights=weights)
    embedded = projection(features_in)
    if normalize:
        embedded = L2Normalize()(embedded)
    return Model(input=features_in, output=embedded)
21 | 
22 | 
def build_sentence_encoder(embedding_weights=None, gru_weights=None, input_length=None, vocab_dim=32198,
        vocab_embedding_dim=300, embedding_dim=1024, normalize=True):
    '''Build the model that encodes word-index sequences into the embedding space.

    The model is an `Embedding` layer (`vocab_dim` x `vocab_embedding_dim`,
    with index 0 masked) followed by a `GRU` with `embedding_dim` units.
    `embedding_weights` and `gru_weights` are handed to the respective
    layers unchanged. When `normalize` is true an `L2Normalize` layer is
    appended.
    '''
    # NOTE: This gives slightly different results than the original model.
    # I think it's because the original has a different masking scheme.
    word_embedding = Embedding(
        vocab_dim, vocab_embedding_dim,
        input_length=input_length,
        weights=embedding_weights,
        mask_zero=True,  # TODO: masking isn't quite right
    )
    recurrent = GRU(embedding_dim, weights=gru_weights, inner_activation='sigmoid')
    model = Sequential([word_embedding, recurrent])
    if normalize:
        model.add(L2Normalize())
    return model
37 | 
38 | 
def build_pretrained_models(model_filename, input_length=None, normalize=True):
    '''Build both encoders initialised from a pre-trained parameter file.

    The parameters come from `load_pretrained_parameters(model_filename)`.
    The sentence encoder's vocabulary size is the number of entries in the
    loaded vocabulary map.

    Returns an `(image_encoder, sentence_encoder, vocab_map)` tuple.
    '''
    pretrained = load_pretrained_parameters(model_filename)
    img_enc_weights, embedding_weights, gru_weights, vocab_map = pretrained

    image_encoder = build_image_encoder(weights=img_enc_weights, normalize=normalize)

    sentence_options = dict(
        embedding_weights=embedding_weights,
        gru_weights=gru_weights,
        input_length=input_length,
        vocab_dim=len(vocab_map),
        normalize=normalize,
    )
    sentence_encoder = build_sentence_encoder(**sentence_options)
    return image_encoder, sentence_encoder, vocab_map
48 | 
49 | 
def load_pretrained_parameters(filename):
    '''Load up the pre-trained weights from the @ryankiros implementation.

    `filename` is the `.npz` parameter archive; the vocabulary map is
    unpickled from `<filename>.dictionary.pkl`.

    Returns a tuple `(img_enc_weights, embedding_weights, gru_weights,
    vocab_map)` where

    * `img_enc_weights` is `[ff_image_W, ff_image_b]`, or `None` when the
      archive has no `ff_image_W` entry,
    * `embedding_weights` is `[Wemb]`,
    * `gru_weights` is `[W_z, U_z, b_z, W_r, U_r, b_r, W_h, U_h, b_h]`,
    * `vocab_map` is whatever object the dictionary pickle contains.

    A missing sentence-encoder entry surfaces as the lookup error raised by
    the archive.
    '''
    import pickle

    # The vocabulary is a plain pickle, so read it with `pickle` directly:
    # recent NumPy releases refuse pickled payloads in `np.load` by default.
    # NOTE(review): unpickling can execute arbitrary code -- only point this
    # at model files from a source you trust.
    with open('{}.dictionary.pkl'.format(filename), 'rb') as dictionary_file:
        vocab_map = pickle.load(dictionary_file)

    # The context manager closes the archive's file handle once the arrays
    # we need have been read into memory.
    with np.load(filename) as params:
        # image encoder weights (optional)
        if 'ff_image_W' in params.files:
            img_enc_weights = [params['ff_image_W'], params['ff_image_b']]
        else:
            img_enc_weights = None
        # sentence encoder weights
        embedding_weights = [params['Wemb']]
        W_h = params['encoder_Wx']
        U_h = params['encoder_Ux']
        b_h = params['encoder_bx']
        # The archive stores two gates side by side; the first half is
        # taken as the reset gate and the second as the update gate.
        W_r, W_z = np.split(params['encoder_W'], 2, axis=1)
        U_r, U_z = np.split(params['encoder_U'], 2, axis=1)
        b_r, b_z = np.split(params['encoder_b'], 2)

    gru_weights = [
        W_z, U_z, b_z,
        W_r, U_r, b_r,
        W_h, U_h, b_h,
    ]
    return img_enc_weights, embedding_weights, gru_weights, vocab_map
74 | 
75 | 
if __name__ == '__main__':
    # Smoke test: build both pre-trained encoders, then print the embeddings
    # of two random image feature vectors and of two sample sentences.
    import argparse

    arg_parser = argparse.ArgumentParser('Visual semantic embeddings')
    arg_parser.add_argument('model_file', type=six.text_type)
    arg_parser.add_argument('--length', type=int, default=None)
    options = arg_parser.parse_args()

    image_encoder, sentence_encoder, vocab_map = build_pretrained_models(
        options.model_file, input_length=options.length)
    for encoder in (image_encoder, sentence_encoder):
        encoder.compile(optimizer='adam', loss='mse')

    random_image_features = np.random.uniform(0, 500, (2, 4096,))
    print(image_encoder.predict(random_image_features))

    sample_sentences = ['contains fancy gold', 'contains fancy gold']
    sentence_vectors = encode_sentences(
        sentence_encoder, vocab_map, sample_sentences,
        max_length=options.length)
    print(sentence_vectors)
93 | 


--------------------------------------------------------------------------------
/keras_vse/tools.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | 
 3 | import numpy as np
 4 | 
 5 | 
 6 | def encode_sentences(model, vocab_map, X, max_length=None,
 7 |         embedding_dim=1024, verbose=False, batch_size=128):
 8 |     '''Encode sentences into the joint embedding space.
 9 | 
10 |     This is mostly from the original @ryankiros implementation.
11 |     '''
12 |     n_words = len(vocab_map)
13 |     features = np.zeros((len(X), embedding_dim), dtype='float32')
14 | 
15 |     captions = [s.split() for s in X]
16 |     if max_length is None:
17 |         max_length = max((len(c) for c in captions))
18 |     else:
19 |         captions = [s[:max_length - 1] for s in captions]
20 |     # quick check if a word is in the dictionary
21 |     d = defaultdict(bool)
22 |     for w in vocab_map.keys():
23 |         d[w] = True
24 | 
25 |     k = max_length - 1
26 |     if verbose:
27 |         print k
28 |     numbatches = len(captions) / batch_size + 1
29 |     for minibatch in range(0, len(captions), batch_size):
30 |         caption = captions[minibatch:minibatch + batch_size]
31 | 
32 |         seqs = []
33 |         for i, cc in enumerate(caption):
34 |             seqs.append([
35 |                 vocab_map[w] if d[w] > 0 and vocab_map[w] < n_words else 1 for w in cc
36 |             ])
37 |         x = np.zeros((k + 1, len(caption))).astype('int64')
38 |         x_mask = np.zeros((k + 1, len(caption))).astype('float32')
39 |         for idx, s in enumerate(seqs):
40 |             x[:len(s), idx] = s
41 |             x_mask[:len(s) + 1, idx] = 1.
42 | 
43 |         ff = model.predict(x.transpose(1, 0))
44 |         for ind, c in enumerate(range(minibatch, minibatch + len(caption))):
45 |             features[c] = ff[ind]
46 | 
47 |     return features
48 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from distutils.core import setup
 3 | from setuptools import find_packages
 4 | 
 5 | setup(
 6 |     name='keras-visual-semantic-embedding',
 7 |     version='0.0.1',
 8 |     description='Keras implementation of visual-semantic embedding.',
 9 |     author='Adam Wentz',
10 |     author_email='adam@adamwentz.com',
11 |     url='https://github.com/awentzonline/keras-visual-semantic-embedding/',
12 |     packages=find_packages(),
13 |     install_requires=[
14 |         'h5py>=2.5.0',
15 |         'Keras>=1.0.1',
16 |         'numpy>=1.10.4',
17 |         'six>=1.10.0',
18 |     ],
19 | )
20 | 


--------------------------------------------------------------------------------