├── .gitignore ├── LICENSE.txt ├── README.md ├── keras_vse ├── __init__.py ├── layers.py ├── models.py └── tools.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | *.pyc 3 | *.swp 4 | *.egg-info 5 | build/ 6 | !examples/README.md 7 | vgg16_weights.h5 8 | venv 9 | MANIFEST 10 | dist/ 11 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2016 Adam Wentz 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | this software and associated documentation files (the "Software"), to deal in the 6 | Software without restriction, including without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the 8 | Software, and to permit persons to whom the Software is furnished to do so, 9 | subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 19 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Visual-Semantic Embedding with Keras 2 | ===================================== 3 | [Keras](http://keras.io/) implementation of https://github.com/ryankiros/visual-semantic-embedding 4 | 5 | Pre-trained model 6 | ----------------- 7 | * [Visual-Semantic Embedding](http://www.cs.toronto.edu/~rkiros/models/vse.zip) 8 | -------------------------------------------------------------------------------- /keras_vse/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /keras_vse/layers.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras.layers import Layer 3 | 4 | 5 | class L2Normalize(Layer): 6 | def __init__(self, axis=-1, **kwargs): 7 | self.axis = axis 8 | return super(L2Normalize, self).__init__(**kwargs) 9 | 10 | def call(self, x, mask=None): 11 | return K.l2_normalize(x, axis=self.axis) 12 | 13 | def get_config(self): 14 | base_config = super(L2Normalize, self).get_config() 15 | base_config.update(dict(axis=self.axis)) 16 | return base_config 17 | -------------------------------------------------------------------------------- /keras_vse/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import six 3 | from keras import backend as K 4 | from keras.layers import Convolution2D, Dense, Embedding, GRU, Input 5 | from keras.models import Model, Sequential 6 | 7 | from .layers import L2Normalize 8 | from .tools import encode_sentences 9 | 10 | 11 | def build_image_encoder(weights=None, input_dim=4096, embedding_dim=1024, normalize=True): 12 | input = Input(shape=(input_dim,)) 13 | x = Dense( 14 | embedding_dim, 15 | weights=weights 16 | )(input) 17 | if normalize: 18 | x = L2Normalize()(x) 19 | model = Model(input=input, output=x) 20 | return model 21 | 22 | 23 | def build_sentence_encoder(embedding_weights=None, gru_weights=None, input_length=None, vocab_dim=32198, 24 | vocab_embedding_dim=300, embedding_dim=1024, normalize=True): 25 | # NOTE: This gives slightly different results than the original model. 26 | # I think it's because the original has a different masking scheme. 27 | model = Sequential([ 28 | Embedding( 29 | vocab_dim, vocab_embedding_dim, input_length=input_length, 30 | weights=embedding_weights, mask_zero=True # TODO: masking isn't quite right 31 | ), 32 | GRU(embedding_dim, weights=gru_weights, inner_activation='sigmoid'), 33 | ]) 34 | if normalize: 35 | model.add(L2Normalize()) 36 | return model 37 | 38 | 39 | def build_pretrained_models(model_filename, input_length=None, normalize=True): 40 | img_enc_weights, embedding_weights, gru_weights, vocab_map = load_pretrained_parameters(model_filename) 41 | image_encoder = build_image_encoder(weights=img_enc_weights, normalize=normalize) 42 | sentence_encoder = build_sentence_encoder( 43 | embedding_weights=embedding_weights, 44 | gru_weights=gru_weights, 45 | input_length=input_length, vocab_dim=len(vocab_map), 46 | normalize=normalize) 47 | return image_encoder, sentence_encoder, vocab_map 48 | 49 | 50 | def load_pretrained_parameters(filename): 51 | '''Load up the pre-trained weights from the @ryankiros implementation. 52 | ''' 53 | params = np.load(filename) 54 | vocab_map = np.load('{}.dictionary.pkl'.format(filename)) 55 | # image encoder weights 56 | if params: 57 | img_enc_weights = [params['ff_image_W'], params['ff_image_b']] 58 | else: 59 | img_enc_weights = None 60 | # sentence encoder weights 61 | embedding_weights = [params['Wemb']] 62 | W_h = params['encoder_Wx'] 63 | U_h = params['encoder_Ux'] 64 | b_h = params['encoder_bx'] 65 | W_r, W_z = np.split(params['encoder_W'], 2, axis=1) 66 | U_r, U_z = np.split(params['encoder_U'], 2, axis=1) 67 | b_r, b_z = np.split(params['encoder_b'], 2) 68 | gru_weights = [ 69 | W_z, U_z, b_z, 70 | W_r, U_r, b_r, 71 | W_h, U_h, b_h, 72 | ] 73 | return img_enc_weights, embedding_weights, gru_weights, vocab_map 74 | 75 | 76 | if __name__ == '__main__': 77 | import argparse 78 | 79 | parser = argparse.ArgumentParser('Visual semantic embeddings') 80 | parser.add_argument('model_file', type=six.text_type) 81 | parser.add_argument('--length', type=int, default=None) 82 | args = parser.parse_args() 83 | 84 | image_encoder, sentence_encoder, vocab_map = \ 85 | build_pretrained_models(args.model_file, input_length=args.length) 86 | for enc in image_encoder, sentence_encoder: 87 | enc.compile(optimizer='adam', loss='mse') 88 | print(image_encoder.predict(np.random.uniform(0, 500, (2, 4096,)))) 89 | print(encode_sentences( 90 | sentence_encoder, vocab_map, 91 | ['contains fancy gold', 'contains fancy gold'], 92 | max_length=args.length)) 93 | -------------------------------------------------------------------------------- /keras_vse/tools.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | 5 | 6 | def encode_sentences(model, vocab_map, X, max_length=None, 7 | embedding_dim=1024, verbose=False, batch_size=128): 8 | '''Encode sentences into the joint embedding space. 9 | 10 | This is mostly from the original @ryankiros implementation. 11 | ''' 12 | n_words = len(vocab_map) 13 | features = np.zeros((len(X), embedding_dim), dtype='float32') 14 | 15 | captions = [s.split() for s in X] 16 | if max_length is None: 17 | max_length = max((len(c) for c in captions)) 18 | else: 19 | captions = [s[:max_length - 1] for s in captions] 20 | # quick check if a word is in the dictionary 21 | d = defaultdict(bool) 22 | for w in vocab_map.keys(): 23 | d[w] = True 24 | 25 | k = max_length - 1 26 | if verbose: 27 | print k 28 | numbatches = len(captions) / batch_size + 1 29 | for minibatch in range(0, len(captions), batch_size): 30 | caption = captions[minibatch:minibatch + batch_size] 31 | 32 | seqs = [] 33 | for i, cc in enumerate(caption): 34 | seqs.append([ 35 | vocab_map[w] if d[w] > 0 and vocab_map[w] < n_words else 1 for w in cc 36 | ]) 37 | x = np.zeros((k + 1, len(caption))).astype('int64') 38 | x_mask = np.zeros((k + 1, len(caption))).astype('float32') 39 | for idx, s in enumerate(seqs): 40 | x[:len(s), idx] = s 41 | x_mask[:len(s) + 1, idx] = 1. 42 | 43 | ff = model.predict(x.transpose(1, 0)) 44 | for ind, c in enumerate(range(minibatch, minibatch + len(caption))): 45 | features[c] = ff[ind] 46 | 47 | return features 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from distutils.core import setup 3 | from setuptools import find_packages 4 | 5 | setup( 6 | name='keras-visual-semantic-embedding', 7 | version='0.0.1', 8 | description='Keras implementation of visual-semantic embedding.', 9 | author='Adam Wentz', 10 | author_email='adam@adamwentz.com', 11 | url='https://github.com/awentzonline/keras-visual-semantic-embedding/', 12 | packages=find_packages(), 13 | install_requires=[ 14 | 'h5py>=2.5.0', 15 | 'Keras>=1.0.1', 16 | 'numpy>=1.10.4', 17 | 'six>=1.10.0', 18 | ], 19 | ) 20 | --------------------------------------------------------------------------------