├── imagernn
    ├── __init__.py
    ├── README.md
    ├── utils.py
    ├── imagernn_utils.py
    ├── data_provider.py
    ├── solver.py
    ├── generic_batch_generator.py
    ├── rnn_generator.py
    └── lstm_generator.py
├── .gitignore
├── delete_index.sh
├── docker_install.sh
├── static
    ├── favicon.png
    ├── uploads
    │   ├── surf.png
    │   ├── output.png
    │   └── cricket1.jpg
    └── database
    │   ├── img1.jpg
    │   ├── img2.jpg
    │   ├── img3.jpg
    │   ├── img4.jpg
    │   └── cricket1.jpg
├── .vscode
    └── settings.json
├── models
    └── README.md
├── cmd
    ├── index.py
    ├── query.py
    └── main.py
├── docker-compose.yaml
├── requirements.txt
├── index_database.py
├── templates
    ├── home.html
    ├── database.html
    ├── caption.html
    ├── search.html
    └── layout.html
├── Dockerfile
├── LICENSE.md
├── capgen.py
├── README.md
└── server.py


/imagernn/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.p
2 | *.pyc
3 | image-search/*
4 | 


--------------------------------------------------------------------------------
/delete_index.sh:
--------------------------------------------------------------------------------
1 | curl -X DELETE "localhost:9200/desearch"


--------------------------------------------------------------------------------
/docker_install.sh:
--------------------------------------------------------------------------------
1 | docker build . -t image_to_image_search:version2
2 | docker-compose up -d
3 | 


--------------------------------------------------------------------------------
/static/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/favicon.png


--------------------------------------------------------------------------------
/static/uploads/surf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/uploads/surf.png


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "git.ignoreLimitWarning": true,
3 |     "python.pythonPath": "image-search/bin/python3.6"
4 | }


--------------------------------------------------------------------------------
/static/database/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/database/img1.jpg


--------------------------------------------------------------------------------
/static/database/img2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/database/img2.jpg


--------------------------------------------------------------------------------
/static/database/img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/database/img3.jpg


--------------------------------------------------------------------------------
/static/database/img4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/database/img4.jpg


--------------------------------------------------------------------------------
/static/uploads/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/uploads/output.png


--------------------------------------------------------------------------------
/static/database/cricket1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/database/cricket1.jpg


--------------------------------------------------------------------------------
/static/uploads/cricket1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/uploads/cricket1.jpg


--------------------------------------------------------------------------------
/models/README.md:
--------------------------------------------------------------------------------
1 | Place the [Flickr 8K lSTM weights](https://cs.stanford.edu/people/karpathy/neuraltalk/flickr8k_cnn_lstm_v1.zip) here 
2 | 


--------------------------------------------------------------------------------
/cmd/index.py:
--------------------------------------------------------------------------------
 1 | ''' 
 2 | Indexes dataset.json in the elastic search server
 3 | '''
 4 | from elasticsearch import Elasticsearch
 5 | from elasticsearch.helpers import bulk
 6 | import json
 7 | 
 8 | es = Elasticsearch()
 9 | with open("dataset.json") as f:
10 |     data = json.load(f)
11 | actions = []
12 | for i in range(len(data['images'])):
13 |         doc = {'id': i, 'imgurl': data['images'][i]['filename'], 'description': data['images'][i]['sentences'][0]['raw'] }
14 |         actions.append(doc)
15 | bulk(es,actions,index="desearch",doc_type="json")
16 | es.indices.refresh(index="desearch")
17 | 


--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
 1 | version: "2"
 2 | services:
 3 |   elasticsearch:
 4 |     image: docker.elastic.co/elasticsearch/elasticsearch:6.6.0
 5 |     container_name: elasticsearch
 6 |     ports:
 7 |       - "9200:9200"
 8 |       - "9300:9300"
 9 |     environment:
10 |       ES_JAVA_OPTS: "-Xms256m -Xmx256m"
11 |       network.bind_host: 0.0.0.0
12 |       network.host: 0.0.0.0
13 |       discovery.type: single-node
14 |   # website:
15 |   #   image: image_to_image_search:version2
16 |   #   volumes:
17 |   #     - ~/.keras:/root/.keras
18 |   #   ports:
19 |   #     - "5000:5000"
20 |   #   depends_on:
21 |   #     - elasticsearch
22 |   #   working_dir: /image_search
23 |   #   command: python3 server.py
24 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | absl-py==0.7.1
 2 | astor==0.7.1
 3 | bleach==1.5.0
 4 | Click==7.0
 5 | elasticsearch==6.3.1
 6 | enum34==1.1.6
 7 | Flask==1.0.3
 8 | gast==0.2.2
 9 | grpcio==1.21.1
10 | h5py==2.9.0
11 | html5lib==1.0.1
12 | itsdangerous==1.1.0
13 | Jinja2==2.10.1
14 | Keras==2.2.4
15 | Keras-Applications==1.0.7
16 | Keras-Preprocessing==1.0.9
17 | Markdown==3.1
18 | MarkupSafe==1.1.1
19 | mock==2.0.0
20 | numpy==1.16.2
21 | pbr==5.1.3
22 | Pillow==6.2.0
23 | protobuf==3.7.1
24 | PyYAML==5.1
25 | scipy==1.2.1
26 | six==1.12.0
27 | tensorboard==1.13.1
28 | tensorflow==1.15.0
29 | tensorflow-estimator==1.13.0
30 | tensorflow-tensorboard==0.1.8
31 | termcolor==1.1.0
32 | urllib3==1.24.2
33 | Werkzeug==0.15.3
34 | 


--------------------------------------------------------------------------------
/index_database.py:
--------------------------------------------------------------------------------
 1 | from elasticsearch import Elasticsearch
 2 | from elasticsearch.helpers import bulk
 3 | from capgen import CaptionGenerator
 4 | import glob
 5 | import os
 6 | 
 7 | os.environ['CUDA_VISIBLE_DEVICES'] = ''
 8 | es = Elasticsearch()
 9 | gencap = CaptionGenerator()
10 | 
11 | def index_database():
12 |     images = glob.glob('static/database/*')
13 |     actions = []
14 |     for i, image in enumerate(images):
15 |         cap = gencap.get_caption(image)
16 |         doc = {'imgurl': image, 'description': cap}
17 |         actions.append(doc)
18 |     bulk(es,actions,index="desearch",doc_type="json")
19 | 
20 | if __name__ == "__main__":
21 |     index_database()
22 |     print('Images from static/img are indexed successfully')


--------------------------------------------------------------------------------
/templates/home.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}            
 2 | {% block content %}
 3 | 
 4 | <div class="row"><div class="s2"></div><div class="s9 center"><img src="static/logo.jpg" width=60%></div></div>
 5 |                   <div class="row">
 6 |                         <div class="col s2">
 7 |                           <!-- Grey navigation panel -->
 8 |                         </div>
 9 |                   
10 |                         <div class="col s9">
11 |                           <p class="caption"> SmartSearch is a reverse image search engine. Built on top of Tensorflow and ElasticSearch. It generates image captions to find similar images.</p>
12 |                           </div>
13 |                   </div>
14 |                   
15 |                     
16 | {% endblock %}
17 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:16.04
 2 | RUN apt-get update
 3 | RUN apt-get install -y apt-utils
 4 | RUN apt-get install -y curl
 5 | RUN apt-get install -y wget
 6 | RUN apt-get install -y vim
 7 | RUN apt-get install -y git-core
 8 | RUN apt-get install -y python3
 9 | RUN curl -O https://bootstrap.pypa.io/get-pip.py
10 | RUN python3 get-pip.py
11 | RUN rm get-pip.py
12 | RUN git clone -b docker_image https://github.com/sethuiyer/Image-to-Image-Search.git /image_search
13 | RUN pip3 install -r /image_search/requirements.txt
14 | RUN wget https://cs.stanford.edu/people/karpathy/neuraltalk/flickr8k_cnn_lstm_v1.zip
15 | RUN apt-get install -y unzip
16 | RUN unzip flickr8k_cnn_lstm_v1.zip
17 | RUN rm flickr8k_cnn_lstm_v1.zip
18 | RUN mv *.p /image_search/models/
19 | CMD ["python3","/image_search/server.py"]
20 | EXPOSE 5000
21 | 
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/cmd/query.py:
--------------------------------------------------------------------------------
 1 | from elasticsearch import Elasticsearch
 2 | es = Elasticsearch() 
 3 | 
 4 | def description_search(query):
 5 |     results = es.search(
 6 |         index="desearch",
 7 |         body={
 8 |             "size": 4,
 9 |             "query": {
10 |             "match": {"description": query}
11 |             }
12 |             })
13 |     hitCount = results['hits']['total']
14 |     if hitCount > 0:
15 |         if hitCount is 1:
16 |             print(str(hitCount),' result')
17 |         else:
18 |             print(str(hitCount), 'results')
19 |         answers =[]  
20 |         for hit in results['hits']['hits']:
21 |             desc = hit['_source']['description']
22 |             imgurl = hit['_source']['imgurl']
23 |             answers.append({'URL':imgurl,'Description':desc})
24 |     else:
25 |         answers = []
26 |     return answers
27 | 
28 | 


--------------------------------------------------------------------------------
/imagernn/README.md:
--------------------------------------------------------------------------------
 1 | The code is organized as follows:
 2 | 
 3 | - `data_provider.py` abstracts away the datasets and provides uniform API for the code.
 4 | - `utils.py` is what it sounds like it is :)
 5 | - `solver.py`: the solver class doesn't know anything about images or sentences, it gets a model and the gradients and performs a step update
 6 | - `generic_batch_generator.py` handles batching across a batch of image/sentences that need to be forwarded through the networks. It calls the
 7 | - `lstm_generator.py`, which is an implementation of the Google LSTM for generating images.
 8 | - `imagernn_utils.py` contains some image-rnn specific utilities, such as evaluation function etc. These come in handy when we want to use some functionality across different scripts (e.g. driver and evaluator)
 9 | - `rnn_generator.py` has a simple RNN implementation for now, an alternative to LSTM
10 | 


--------------------------------------------------------------------------------
/imagernn/utils.py:
--------------------------------------------------------------------------------
 1 | from random import uniform
 2 | import numpy as np
 3 | 
 4 | def randi(N):
 5 |   """ get random integer in range [0, N) """
 6 |   return int(uniform(0, N))
 7 | 
 8 | def merge_init_structs(s0, s1):
 9 |   """ merge struct s1 into s0 """
10 |   for k in s1['model']:
11 |     assert (not k in s0['model']), 'Error: looks like parameter %s is trying to be initialized twice!' % (k, )
12 |     s0['model'][k] = s1['model'][k] # copy over the pointer
13 |   s0['update'].extend(s1['update'])
14 |   s0['regularize'].extend(s1['regularize'])
15 | 
16 | def initw(n,d): # initialize matrix of this size
17 |   magic_number = 0.1
18 |   return (np.random.rand(n,d) * 2 - 1) * magic_number # U[-0.1, 0.1]
19 | 
20 | def accumNpDicts(d0, d1):
21 |   """ forall k in d0, d0 += d1 . d's are dictionaries of key -> numpy array """
22 |   for k in d1:
23 |     if k in d0:
24 |       d0[k] += d1[k]
25 |     else:
26 |       d0[k] = d1[k]


--------------------------------------------------------------------------------
/cmd/main.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  #surpress tensorflow warnings
 3 | from capgen import CaptionGenerator
 4 | try:
 5 | 	from query import description_search
 6 | except:
 7 | 	print('Please ensure the elastic search server is enabled')
 8 | from PIL import Image
 9 | c = CaptionGenerator()
10 | import index
11 | import query
12 | PATH_TO_FLICK8K_IMG="imgs/"
13 | while True:
14 | 	print('Enter the Image path to get Similar Images ( q to quit)')
15 | 	img_path = input()
16 | 	if img_path != "q":
17 | 		print('Processing Image Caption...')
18 | 		caption = c.get_caption(img_path)
19 | 		print('The Generated caption for this image is {} .'.format(caption))
20 | 		print('Matching Nearest Captions...')
21 | 		answers = query.description_search(caption)
22 | 		if len(answers) == 0:
23 | 			print('No similar images found')
24 | 		for _ in range(len(answers)):
25 | 			print(PATH_TO_FLICK8K_IMG+answers[_]['URL'])
26 | 			img = Image.open(PATH_TO_FLICK8K_IMG+answers[_]['URL'])
27 | 			img.show()
28 | 	else:
29 | 		break
30 | 
31 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 wx-chevalier
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/imagernn/imagernn_utils.py:
--------------------------------------------------------------------------------
 1 | from imagernn.generic_batch_generator import GenericBatchGenerator
 2 | import numpy as np
 3 | 
 4 | def decodeGenerator(params):
 5 |   """ 
 6 |   in the future we may want to have different classes
 7 |   and options for them. For now there is this one generator
 8 |   implemented and simply returned here.
 9 |   """ 
10 |   return GenericBatchGenerator
11 | 
12 | def eval_split(split, dp, model, params, misc, **kwargs):
13 |   """ evaluate performance on a given split """
14 |   # allow kwargs to override what is inside params
15 |   eval_batch_size = kwargs.get('eval_batch_size', params.get('eval_batch_size',100))
16 |   eval_max_images = kwargs.get('eval_max_images', params.get('eval_max_images', -1))
17 |   BatchGenerator = decodeGenerator(params)
18 |   wordtoix = misc['wordtoix']
19 | 
20 |   logppl = 0
21 |   logppln = 0
22 |   nsent = 0
23 |   for batch in dp.iterImageSentencePairBatch(split = split, max_batch_size = eval_batch_size, max_images = eval_max_images):
24 |     Ys, gen_caches = BatchGenerator.forward(batch, model, params, misc, predict_mode = True)
25 | 
26 |     for i,pair in enumerate(batch):
27 |       gtix = [ wordtoix[w] for w in pair['sentence']['tokens'] if w in wordtoix ]
28 |       gtix.append(0) # we expect END token at the end
29 |       Y = Ys[i]
30 |       maxes = np.amax(Y, axis=1, keepdims=True)
31 |       e = np.exp(Y - maxes) # for numerical stability shift into good numerical range
32 |       P = e / np.sum(e, axis=1, keepdims=True)
33 |       logppl += - np.sum(np.log2(1e-20 + P[range(len(gtix)),gtix])) # also accumulate log2 perplexities
34 |       logppln += len(gtix)
35 |       nsent += 1
36 | 
37 |   ppl2 = 2 ** (logppl / logppln) 
38 |   return ppl2 # return the perplexity
39 | 


--------------------------------------------------------------------------------
/templates/database.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}            
 2 | {% block content %}
 3 |                   <div class="row">
 4 |                                 <div class="col s2"></div>
 5 |                                 <div class="col s8">
 6 |                                 <form method="POST" enctype="multipart/form-data" action="upload">
 7 |                                   <div class = "row">
 8 |                                       <div class = "file-field input-field">
 9 |                                          <div>
10 |                                             <input type = "file" name="photos" multiple />
11 |                                             <span><button type="submit"  class = "btn waves-effect waves-light">Upload<i class="material-icons right">cloud_upload</i> </button></span>
12 |                                          </div>
13 |                                          <div class = "file-path-wrapper">
14 |                                             <input class = "file-path validate" type = "text"
15 |                                                placeholder = "Upload multiple images" />
16 |                                          </div>
17 |                                       </div>    
18 |                                    </div>
19 |                                 </form>
20 |                             </div>
21 |                     </div> 
22 |                     {% if database_images %}
23 |                     <div class="row">
24 |                         <div class="col s2"></div>
25 |                         <div class="col s8">
26 |                                 <h5>Database Images: </h5>
27 |                         </div>
28 |                     </div>
29 |                     <div class="photo-container">
30 |                     <div class="photos">
31 | 
32 |                         {% for db_img in database_images %}
33 |                             <img id="img" src="{{db_img}}" class="z-depth-2">
34 |                         {% endfor %}
35 |                     </div>
36 |                   </div>
37 |                     {% endif %}
38 |                     {% endblock %}
39 | 
40 |  


--------------------------------------------------------------------------------
/templates/caption.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}            
 2 | {% block content %}
 3 |                   <div class="row">
 4 |                                 <div class="col s2"></div>
 5 |                                 <div class="col s8">
 6 |                                 <form method="POST" enctype="multipart/form-data" action="caption">
 7 |                                     <div class = "row">
 8 |                                             <div class = "file-field input-field">
 9 |                                                <div>
10 |                                                   <input type = "file" name = 'query_img'/>
11 |                                                   <span><button type="submit" class="btn waves-effect waves-light">Caption<i class="material-icons right">remove_red_eye</i> </button></span>
12 |                                                </div>
13 |                                                
14 |                                                <div class = "file-path-wrapper">
15 |                                                   <input class = "file-path validate" type = "text"
16 |                                                      placeholder = "Upload Image" />
17 |                                                </div>
18 |                                             </div>
19 |                                          </div>
20 |                                 </form>
21 |                             </div>
22 |                     </div>
23 | 
24 |                     {% if query_path %}
25 |                     <div class="row">
26 |                         <div class="col s2"></div>
27 |                             <div class="col s8">
28 |                                 <h5>Given Image: </h5>
29 |                                 <img id="img" src="{{ query_path }}" width="40%" alt="Query" class="z-depth-2">
30 |                             </div>
31 |                     </div>
32 |                     {% endif %}
33 |                     
34 |                     {% if caption %}
35 |                     <div class="row">
36 |                         <div class="col s2"></div>
37 |                             <div class="col s8">
38 |                                 <h5>Caption: </h5>
39 |                                 <p> {{caption}} </p>
40 |                             </div>
41 |                     </div>
42 |                     {% endif %}
43 |                     {% endblock %}
44 | 
45 |                     
46 |  


--------------------------------------------------------------------------------
/capgen.py:
--------------------------------------------------------------------------------
 1 | # download checkpoint model from http://cs.stanford.edu/people/karpathy/neuraltalk/
 2 | import os
 3 | import numpy as np
 4 | from imagernn.imagernn_utils import decodeGenerator
 5 | import pickle
 6 | from keras.applications import VGG16,imagenet_utils
 7 | from keras.preprocessing.image import load_img,img_to_array
 8 | from keras.models import Model
 9 | import tensorflow as tf
10 | 
11 | preprocess = imagenet_utils.preprocess_input
12 | 
13 | os.environ['CUDA_VISIBLE_DEVICES'] = ''
14 | FILE_DIR = os.path.dirname(os.path.realpath(__file__))
15 | CHECKPOINT_PATH = os.path.join(FILE_DIR, 'models','flickr8k_cnn_lstm_v1.p')
16 | 
17 | class CaptionGenerator:
18 |     def __init__(self):
19 |         self.checkpoint = pickle.load(open(CHECKPOINT_PATH, 'rb'),encoding='latin1')
20 |         self.checkpoint_params = self.checkpoint['params']
21 |         self.language_model = self.checkpoint['model']
22 |         self.ixtoword = self.checkpoint['ixtoword']
23 |         model = VGG16(weights="imagenet")
24 |         self.visual_model = Model(input=model.input,output=model.layers[21].output)
25 |         self.visual_model._make_predict_function()
26 |         self.graph = tf.get_default_graph()
27 |         self.BEAM_SIZE = 2
28 | 
29 |     def convert_img_to_vector(self,img_path):
30 |         image = load_img(img_path,target_size=(224,224))
31 |         image = img_to_array(image)
32 |         image = np.expand_dims(image,axis=0)
33 |         image = preprocess(image)
34 |         return image
35 | 
36 |     def get_image_feature(self,img_path):
37 |         feats = np.transpose(self.visual_model.predict(self.convert_img_to_vector(img_path)))
38 |         return feats
39 | 
40 |     def predict(self, features):
41 |         BatchGenerator = decodeGenerator(CHECKPOINT_PATH)
42 |         img = {}
43 |         img['feat'] = features[:, 0]
44 |         kwparams = {'beam_size': self.BEAM_SIZE}
45 |         Ys = BatchGenerator.predict([{'image': img}], self.language_model, self.checkpoint_params, **kwparams)
46 |         top_predictions = Ys[0]  # take predictions for the first (and only) image we passed in
47 |         top_prediction = top_predictions[0]  # these are sorted with highest on top
48 |         candidate = ' '.join(
49 |             [self.ixtoword[ix] for ix in top_prediction[1] if ix > 0])  # ix 0 is the END token, skip that
50 |         return candidate
51 | 
52 |     def get_caption(self, file):
53 |         with self.graph.as_default():
54 |             feat = self.get_image_feature(file)
55 |         caption = self.predict(feat)
56 |         return caption
57 | 
58 | 
59 | 
60 | 
61 |     
62 | 


--------------------------------------------------------------------------------
/templates/search.html:
--------------------------------------------------------------------------------
 1 | {% extends "layout.html" %}            
 2 | {% block content %}
 3 |                   <div class="row">
 4 |                                 <div class="col s2"></div>
 5 |                                 <div class="col s8">
 6 |                                 <form method="POST" enctype="multipart/form-data" action="search">
 7 |                                     <div class = "row">
 8 |                                             <div class = "file-field input-field">
 9 |                                                <div>
10 |                                                   <input type = "file" name = 'query_img'/>
11 |                                                   <span><button type="submit" class="btn waves-effect waves-light">Search<i class="material-icons right">search</i> </button></span>
12 |                                                </div>
13 |                                                
14 |                                                <div class = "file-path-wrapper">
15 |                                                   <input class = "file-path validate" type = "text"
16 |                                                      placeholder = "Search image.." />
17 |                                                </div>
18 |                                             </div>
19 |                                          </div>
20 |                                 </form>
21 |                             </div>
22 |                     </div>
23 | 
24 |                     {% if query_path %}
25 |                     <div class="row">
26 |                         <div class="col s2"></div>
27 |                             <div class="col s8">
28 |                                 <h5>Given Image: </h5>
29 |                                 <img id="img" src="{{ query_path }}" width="40%" alt="Query" class="z-depth-2">
30 |                             </div>
31 |                     </div>
32 |                     {% endif %}
33 |                     {% if answers %}
34 |                             <div class="row">
35 |                                 <div class="col s2"></div>
36 |                                 <div class="co s8">
37 |                                         <h5>Results: </h5>
38 |                                 </div>
39 |                             </div>
40 |                             <div class="photo-container">
41 |                                     <div class="photos">
42 |                                     {% for answer in answers %}
43 |                                     <img id="img" src="{{ answer[0] }}" class="z-depth-2">
44 |                                     {% endfor %}
45 |                             </div>
46 |                         </div>
47 |                     {% endif %}
48 |                     {% endblock %}
49 | 


--------------------------------------------------------------------------------
/templates/layout.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 |   <html>
 3 |     <head>
 4 |       <!--Import Google Icon Font-->
 5 |       <link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet">
 6 |       <!--Import materialize.css-->
 7 |       <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/materialize/1.0.0/css/materialize.min.css">
 8 |     <title> Smart Search</title>
 9 |     <link rel=icon href=static/favicon.png>
10 |     <style>
11 |         .photo-container {
12 |   max-width: 960px;
13 |   margin: 0 auto;
14 | }
15 | 
16 | .photos {
17 |   display: flex;
18 |   flex-direction: row;
19 |   flex-wrap: wrap;
20 |   justify-content: flex-start;
21 |   align-content: stretch;
22 |   padding: 0;
23 |   max-width: 980px;
24 | }
25 | 
26 | .photos img {
27 |   display: block;
28 |   float: left;
29 |   flex: 0 0 auto;
30 |   margin-left: 4px;
31 |   max-height: 200px;
32 | }
33 | 
34 | 
35 | @media screen and (min-width: 1024px) {
36 |   .photos img {
37 |     width: calc(100%/6);
38 |     height: calc(100%/6);
39 |   }
40 | }
41 | 
42 | @media screen and (min-width: 769px) and (max-width: 1024px) {
43 |   .photos img {
44 |     width: calc(100%/4);
45 |     height: calc(100%/4);
46 |   }
47 | }
48 | 
49 | @media screen and (min-width: 481px) and (max-width: 768px) {
50 |   .photos img {
51 |     width: calc(100%/3);
52 |     height: calc(100%/3);
53 |   }
54 | }
55 | 
56 | @media screen and (min-width: 321px) and (max-width: 480px) {
57 |   .photos img {
58 |     width: calc(100%/2);
59 |     height: calc(100%/2);
60 |   }
61 | }
62 | 
63 | @media screen  and (max-width: 320px) {
64 |   .photos img {
65 |     width: 100%;
66 |     height: 100%;
67 |   }
68 | }
69 |     </style>
70 |         </head>
71 |     <body>
72 |     
73 |             <div class="container">
74 |                     <div class="container"><a href="#" data-target="nav-mobile" class="top-nav sidenav-trigger full medium black-text"><i class="material-icons">menu</i></a></div>
75 |                   <ul id="nav-mobile" class="sidenav z-depth-2">
76 |                         <li class="brand-logo center"><img src="static/logo.jpg" height="40%" width="40%"/></li>
77 |                         <hr>
78 |                         <li class="bold"><a href="{{ url_for('index') }}" class="collapsible-header waves-effect"><i class="material-icons">home</i>Home </a></li>
79 |                         <li class="bold"><a href="{{ url_for('search') }}" class="collapsible-header waves-effect"><i class="material-icons">search</i>Image Search</a></li>
80 |                         <li class="bold"><a href="{{ url_for('database') }}" class="collapsible-header waves-effect"><i class="material-icons">folder</i>Database</a></li>
81 |                         <li class="bold"><a href="{{ url_for('caption') }}" class="collapsible-header waves-effect"><i class="material-icons">remove_red_eye</i>Caption it!</a></li>
82 |                         <hr>
83 |                         <li class="bold center"> Made by <b>Sethu Iyer</b></li>
84 |                     </ul>  
85 |                   <br><br>
86 |                         {% block content %}{% endblock %}
87 |                   </div>
88 |                   
89 |                  
90 |                   <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
91 |             <script src="https://cdnjs.cloudflare.com/ajax/libs/materialize/1.0.0/js/materialize.min.js"></script>
92 |             <script>
93 |               M.AutoInit()
94 |               $(document).ready(function(){
95 |     $('.sidenav').sidenav();
96 |   });
97 |         </script>
98 |     </body>
99 |   </html>


--------------------------------------------------------------------------------
/imagernn/data_provider.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import random
  4 | import scipy.io
  5 | import codecs
  6 | from collections import defaultdict
  7 | 
  8 | class BasicDataProvider:
  9 |   def __init__(self, dataset):
 10 |     print 'Initializing data provider for dataset %s...' % (dataset, )
 11 | 
 12 |     # !assumptions on folder structure
 13 |     self.dataset_root = os.path.join('data', dataset)
 14 |     self.image_root = os.path.join('data', dataset, 'imgs')
 15 | 
 16 |     # load the dataset into memory
 17 |     dataset_path = os.path.join(self.dataset_root, 'dataset.json')
 18 |     print 'BasicDataProvider: reading %s' % (dataset_path, )
 19 |     self.dataset = json.load(open(dataset_path, 'r'))
 20 | 
 21 |     # load the image features into memory
 22 |     features_path = os.path.join(self.dataset_root, 'vgg_feats.mat')
 23 |     print 'BasicDataProvider: reading %s' % (features_path, )
 24 |     features_struct = scipy.io.loadmat(features_path)
 25 |     self.features = features_struct['feats']
 26 | 
 27 |     # group images by their train/val/test split into a dictionary -> list structure
 28 |     self.split = defaultdict(list)
 29 |     for img in self.dataset['images']:
 30 |       self.split[img['split']].append(img)
 31 | 
 32 |   # "PRIVATE" FUNCTIONS
 33 |   # in future we may want to create copies here so that we don't touch the 
 34 |   # data provider class data, but for now lets do the simple thing and 
 35 |   # just return raw internal img sent structs. This also has the advantage
 36 |   # that the driver could store various useful caching stuff in these structs
 37 |   # and they will be returned in the future with the cache present
 38 |   def _getImage(self, img):
 39 |     """ create an image structure for the driver """
 40 | 
 41 |     # lazily fill in some attributes
 42 |     if not 'local_file_path' in img: img['local_file_path'] = os.path.join(self.image_root, img['filename'])
 43 |     if not 'feat' in img: # also fill in the features
 44 |       feature_index = img['imgid'] # NOTE: imgid is an integer, and it indexes into features
 45 |       img['feat'] = self.features[:,feature_index]
 46 |     return img
 47 | 
 48 |   def _getSentence(self, sent):
 49 |     """ create a sentence structure for the driver """
 50 |     # NOOP for now
 51 |     return sent
 52 | 
 53 |   # PUBLIC FUNCTIONS
 54 | 
 55 |   def getSplitSize(self, split, ofwhat = 'sentences'):
 56 |     """ return size of a split, either number of sentences or number of images """
 57 |     if ofwhat == 'sentences': 
 58 |       return sum(len(img['sentences']) for img in self.split[split])
 59 |     else: # assume images
 60 |       return len(self.split[split])
 61 | 
 62 |   def sampleImageSentencePair(self, split = 'train'):
 63 |     """ sample image sentence pair from a split """
 64 |     images = self.split[split]
 65 | 
 66 |     img = random.choice(images)
 67 |     sent = random.choice(img['sentences'])
 68 | 
 69 |     out = {}
 70 |     out['image'] = self._getImage(img)
 71 |     out['sentence'] = self._getSentence(sent)
 72 |     return out
 73 | 
 74 |   def iterImageSentencePair(self, split = 'train', max_images = -1):
 75 |     for i,img in enumerate(self.split[split]):
 76 |       if max_images >= 0 and i >= max_images: break
 77 |       for sent in img['sentences']:
 78 |         out = {}
 79 |         out['image'] = self._getImage(img)
 80 |         out['sentence'] = self._getSentence(sent)
 81 |         yield out
 82 | 
 83 |   def iterImageSentencePairBatch(self, split = 'train', max_images = -1, max_batch_size = 100):
 84 |     batch = []
 85 |     for i,img in enumerate(self.split[split]):
 86 |       if max_images >= 0 and i >= max_images: break
 87 |       for sent in img['sentences']:
 88 |         out = {}
 89 |         out['image'] = self._getImage(img)
 90 |         out['sentence'] = self._getSentence(sent)
 91 |         batch.append(out)
 92 |         if len(batch) >= max_batch_size:
 93 |           yield batch
 94 |           batch = []
 95 |     if batch:
 96 |       yield batch
 97 | 
 98 |   def iterSentences(self, split = 'train'):
 99 |     for img in self.split[split]: 
100 |       for sent in img['sentences']:
101 |         yield self._getSentence(sent)
102 | 
103 |   def iterImages(self, split = 'train', shuffle = False, max_images = -1):
104 |     imglist = self.split[split]
105 |     ix = range(len(imglist))
106 |     if shuffle:
107 |       random.shuffle(ix)
108 |     if max_images > 0:
109 |       ix = ix[:min(len(ix),max_images)] # crop the list
110 |     for i in ix:
111 |       yield self._getImage(imglist[i])
112 | 
113 | def getDataProvider(dataset):
114 |   """ we could intercept a special dataset and return different data providers """
115 |   assert dataset in ['flickr8k', 'flickr30k', 'coco'], 'dataset %s unknown' % (dataset, )
116 |   return BasicDataProvider(dataset)
117 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Contributors][contributors-shield]][contributors-url]
  2 | [![Forks][forks-shield]][forks-url]
  3 | [![Stargazers][stars-shield]][stars-url]
  4 | [![Issues][issues-shield]][issues-url]
  5 | [![MIT License][license-shield]][license-url]
  6 | 
  7 | <!-- PROJECT LOGO -->
  8 | <br />
  9 | <p align="center">
 10 |   <a href="https://github.com/wx-chevalier/tensorflow-image-search">
 11 |     <img src="https://s2.ax1x.com/2020/01/16/ljgpUP.png" alt="Logo" width="120" height="80">
 12 |   </a>
 13 | 
 14 |   <h3 align="center">tensorflow-image-search</h3>
 15 | 
 16 |   <p align="center">
 17 |     A reverse image search engine powered by elastic search and tensorflow
 18 |     <br />
 19 |     <a href="https://github.com/wx-chevalier/tensorflow-image-search"><strong>Explore the docs »</strong></a>
 20 |     <br />
 21 |     <br />
 22 |     <a href="https://github.com/wx-chevalier/tensorflow-image-search">View Demo</a>
 23 |     ·
 24 |     <a href="https://github.com/wx-chevalier/tensorflow-image-search/issues">Report Bug</a>
 25 |     ·
 26 |     <a href="https://github.com/wx-chevalier/tensorflow-image-search/issues">Request Feature</a>
 27 |   </p>
 28 | </p>
 29 | 
 30 | <!-- ABOUT THE PROJECT -->
 31 | 
 32 | # Introduction
 33 | 
 34 | Fork from [sethuiyer/Image-to-Image-search](https://github.com/sethuiyer/Image-to-Image-search).
 35 | 
 36 | ## Nav | 导航
 37 | 
 38 | # Getting Started
 39 | 
 40 | ## Packages Required:
 41 | 
 42 | - Anaconda
 43 | - Keras with Tensorflow Backend (Python 3.6)
 44 | - Elastic Search and elasticsearch-py (Elastic Search 6.0)
 45 | 
 46 | For more, check out requirements.txt
 47 | 
 48 | ## Pre-trained models
 49 | 
 50 | - [Flickr-8k LSTM weights (flickr8k_cnn_lstm_v1.p)](https://cs.stanford.edu/people/karpathy/neuraltalk/flickr8k_cnn_lstm_v1.zip)
 51 | 
 52 | Download this and paste it inside models folder.
 53 | 
 54 | ## Output
 55 | 
 56 | <img src="static/screenshot-app.jpg"/>
 57 | 
 58 | <img src="https://github.com/sethuiyer/Image-to-Image-search/raw/bootstrap/webapp.png"/>
 59 | 
 60 | ## Tips
 61 | 
 62 | - Install elasticsearch and always check if elastic search process is running before launching server.py or index_database.py.
 63 | 
 64 | * Instead of using the upload functionality, paste all your images inside `static/img` folder followed by `python index_database.py` to index all those images.
 65 | 
 66 | - If you want to delete the indexed images, do `sh delete_index.sh`
 67 | 
 68 | # About
 69 | 
 70 | <!-- ROADMAP -->
 71 | 
 72 | ## Roadmap
 73 | 
 74 | See the [open issues](https://github.com/wx-chevalier/tensorflow-image-search/issues) for a list of proposed features (and known issues).
 75 | 
 76 | <!-- CONTRIBUTING -->
 77 | 
 78 | ## Contributing
 79 | 
 80 | Contributions are what make the open source community such an amazing place to be learn, inspire, and create. Any contributions you make are **greatly appreciated**.
 81 | 
 82 | 1. Fork the Project
 83 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
 84 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
 85 | 4. Push to the Branch (`git push origin feature/AmazingFeature`)
 86 | 5. Open a Pull Request
 87 | 
 88 | <!-- LICENSE -->
 89 | 
 90 | ## License
 91 | 
 92 | Distributed under the MIT License. See `LICENSE` for more information.
 93 | 
 94 | <!-- ACKNOWLEDGEMENTS -->
 95 | 
 96 | ## Acknowledgements
 97 | 
 98 | - [sethuiyer/Image-to-Image-search](https://github.com/sethuiyer/Image-to-Image-search)
 99 | 
100 | - [Sis](https://github.com/matsui528/sis)
101 | 
102 | ## Copyright & More | 延伸阅读
103 | 
104 | 笔者所有文章遵循[知识共享 署名 - 非商业性使用 - 禁止演绎 4.0 国际许可协议](https://creativecommons.org/licenses/by-nc-nd/4.0/deed.zh)，欢迎转载，尊重版权。如果觉得本系列对你有所帮助，欢迎给我家布丁买点狗粮(支付宝扫码)~
105 | 
106 | ![技术视野](https://s2.ax1x.com/2019/12/03/QQJLvt.png)
107 | 
108 | 您还可以前往 [NGTE Books](https://ng-tech.icu/books/) 主页浏览包含知识体系、编程语言、软件工程、模式与架构、Web 与大前端、服务端开发实践与工程架构、分布式基础架构、人工智能与深度学习、产品运营与创业等多类目的书籍列表：
109 | 
110 | ![NGTE Books](https://s2.ax1x.com/2020/01/18/19uXtI.png)
111 | 
112 | <!-- MARKDOWN LINKS & IMAGES -->
113 | <!-- https://www.markdownguide.org/basic-syntax/#reference-style-links -->
114 | 
115 | [contributors-shield]: https://img.shields.io/github/contributors/wx-chevalier/tensorflow-image-search.svg?style=flat-square
116 | [contributors-url]: https://github.com/wx-chevalier/tensorflow-image-search/graphs/contributors
117 | [forks-shield]: https://img.shields.io/github/forks/wx-chevalier/tensorflow-image-search.svg?style=flat-square
118 | [forks-url]: https://github.com/wx-chevalier/tensorflow-image-search/network/members
119 | [stars-shield]: https://img.shields.io/github/stars/wx-chevalier/tensorflow-image-search.svg?style=flat-square
120 | [stars-url]: https://github.com/wx-chevalier/tensorflow-image-search/stargazers
121 | [issues-shield]: https://img.shields.io/github/issues/wx-chevalier/tensorflow-image-search.svg?style=flat-square
122 | [issues-url]: https://github.com/wx-chevalier/tensorflow-image-search/issues
123 | [license-shield]: https://img.shields.io/github/license/wx-chevalier/tensorflow-image-search.svg?style=flat-square
124 | [license-url]: https://github.com/wx-chevalier/tensorflow-image-search/blob/master/LICENSE.txt
125 | 


--------------------------------------------------------------------------------
/imagernn/solver.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import numpy as np
  3 | from imagernn.utils import randi
  4 | 
  5 | class Solver:
  6 |   """
  7 |   solver worries about:
  8 |   - different optimization methods, updates, weight decays
  9 |   - it can also perform gradient check
 10 |   """
 11 |   def __init__(self):
 12 |     self.step_cache_ = {} # might need this
 13 |     self.step_cache2_ = {} # might need this
 14 | 
 15 |   def step(self, batch, model, cost_function, **kwargs):
 16 |     """ 
 17 |     perform a single batch update. Takes as input:
 18 |     - batch of data (X)
 19 |     - model (W)
 20 |     - cost function which takes batch, model
 21 |     """
 22 |     
 23 |     learning_rate = kwargs.get('learning_rate', 0.0)
 24 |     update = kwargs.get('update', model.keys())
 25 |     grad_clip = kwargs.get('grad_clip', -1)
 26 |     solver = kwargs.get('solver', 'vanilla')
 27 |     momentum = kwargs.get('momentum', 0)
 28 |     smooth_eps = kwargs.get('smooth_eps', 1e-8)
 29 |     decay_rate = kwargs.get('decay_rate', 0.999)
 30 | 
 31 |     if not (solver == 'vanilla' and momentum == 0):
 32 |       # lazily make sure we initialize step cache if needed
 33 |       for u in update:
 34 |         if not u in self.step_cache_: 
 35 |           self.step_cache_[u] = np.zeros(model[u].shape)
 36 |           if solver == 'adadelta':
 37 |             self.step_cache2_[u] = np.zeros(model[u].shape) # adadelta needs one more cache
 38 | 
 39 |     # compute cost and gradient
 40 |     cg = cost_function(batch, model)
 41 |     cost = cg['cost']
 42 |     grads = cg['grad']
 43 |     stats = cg['stats']
 44 | 
 45 |     # clip gradients if needed, simplest possible version
 46 |     # todo later: maybe implement the gradient direction conserving version
 47 |     if grad_clip > 0:
 48 |       for p in update:
 49 |         if p in grads:
 50 |           grads[p] = np.minimum(grads[p], grad_clip)
 51 |           grads[p] = np.maximum(grads[p], -grad_clip)
 52 | 
 53 |     # perform parameter update
 54 |     for p in update:
 55 |       if p in grads:
 56 | 
 57 |         if solver == 'vanilla': # vanilla sgd, optional with momentum
 58 |           if momentum > 0:
 59 |             dx = momentum * self.step_cache_[p] - learning_rate * grads[p]
 60 |             self.step_cache_[p] = dx
 61 |           else:
 62 |             dx = - learning_rate * grads[p]
 63 | 
 64 |         elif solver == 'rmsprop':
 65 |           self.step_cache_[p] = self.step_cache_[p] * decay_rate + (1.0 - decay_rate) * grads[p] ** 2
 66 |           dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache_[p] + smooth_eps)
 67 |         
 68 |         elif solver == 'adagrad':
 69 |           self.step_cache_[p] += grads[p] ** 2
 70 |           dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache_[p] + smooth_eps)
 71 | 
 72 |         elif solver == 'adadelta':
 73 |           self.step_cache_[p] = self.step_cache_[p] * decay_rate + (1.0 - decay_rate) * grads[p] ** 2
 74 |           dx = - np.sqrt( (self.step_cache2_[p] + smooth_eps) / (self.step_cache_[p] + smooth_eps) ) * grads[p]
 75 |           self.step_cache2_[p] = self.step_cache2_[p] * decay_rate + (1.0 - decay_rate) * (dx ** 2)
 76 | 
 77 |         else:
 78 |           raise Exception("solver %s not supported" % (solver, ))
 79 | 
 80 |         # perform the parameter update
 81 |         model[p] += dx
 82 | 
 83 |     # create output dict and return
 84 |     out = {}
 85 |     out['cost'] = cost
 86 |     out['stats'] = stats
 87 |     return out
 88 | 
 89 |   def gradCheck(self, batch, model, cost_function, **kwargs):
 90 |     """ 
 91 |     perform gradient check.
 92 |     since gradcheck can be tricky (especially with relus involved)
 93 |     this function prints to console for visual inspection
 94 |     """
 95 | 
 96 |     num_checks = kwargs.get('num_checks', 10)
 97 |     delta = kwargs.get('delta', 1e-5)
 98 |     rel_error_thr_warning = kwargs.get('rel_error_thr_warning', 1e-2)
 99 |     rel_error_thr_error = kwargs.get('rel_error_thr_error', 1)
100 | 
101 |     cg = cost_function(batch, model)
102 | 
103 |     print 'running gradient check...'
104 |     for p in model.keys():
105 |       print 'checking gradient on parameter %s of shape %s...' % (p, `model[p].shape`)
106 |       mat = model[p]
107 | 
108 |       s0 = cg['grad'][p].shape
109 |       s1 = mat.shape
110 |       assert s0 == s1, 'Error dims dont match: %s and %s.' % (`s0`, `s1`)
111 | 
112 |       for i in xrange(num_checks):
113 |         ri = randi(mat.size)
114 | 
115 |         # evluate cost at [x + delta] and [x - delta]
116 |         old_val = mat.flat[ri]
117 |         mat.flat[ri] = old_val + delta
118 |         cg0 = cost_function(batch, model)
119 |         mat.flat[ri] = old_val - delta
120 |         cg1 = cost_function(batch, model)
121 |         mat.flat[ri] = old_val # reset old value for this parameter
122 | 
123 |         # fetch both numerical and analytic gradient
124 |         grad_analytic = cg['grad'][p].flat[ri]
125 |         grad_numerical = (cg0['cost']['total_cost'] - cg1['cost']['total_cost']) / ( 2 * delta )
126 | 
127 |         # compare them
128 |         if grad_numerical == 0 and grad_analytic == 0:
129 |           rel_error = 0 # both are zero, OK.
130 |           status = 'OK'
131 |         elif abs(grad_numerical) < 1e-7 and abs(grad_analytic) < 1e-7:
132 |           rel_error = 0 # not enough precision to check this
133 |           status = 'VAL SMALL WARNING'
134 |         else:
135 |           rel_error = abs(grad_analytic - grad_numerical) / abs(grad_numerical + grad_analytic)
136 |           status = 'OK'
137 |           if rel_error > rel_error_thr_warning: status = 'WARNING'
138 |           if rel_error > rel_error_thr_error: status = '!!!!! NOTOK'
139 | 
140 |         # print stats
141 |         print '%s checking param %s index %8d (val = %+8f), analytic = %+8f, numerical = %+8f, relative error = %+8f' \
142 |               % (status, p, ri, old_val, grad_analytic, grad_numerical, rel_error)
143 | 
144 | 
145 | 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | 


--------------------------------------------------------------------------------
/imagernn/generic_batch_generator.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import code
  3 | from imagernn.utils import merge_init_structs, initw, accumNpDicts
  4 | from imagernn.lstm_generator import LSTMGenerator
  5 | from imagernn.rnn_generator import RNNGenerator
  6 | 
  7 | def decodeGenerator(generator):
  8 |   if generator == 'lstm':
  9 |     return LSTMGenerator
 10 |   if generator == 'rnn':
 11 |     return RNNGenerator
 12 |   else:
 13 |     raise Exception('generator %s is not yet supported' % (base_generator_str,))
 14 | 
 15 | class GenericBatchGenerator:
 16 |   """ 
 17 |   Base batch generator class. 
 18 |   This class is aware of the fact that we are generating
 19 |   sentences from images.
 20 |   """
 21 | 
 22 |   @staticmethod
 23 |   def init(params, misc):
 24 | 
 25 |     # inputs
 26 |     image_encoding_size = params.get('image_encoding_size', 128)
 27 |     word_encoding_size = params.get('word_encoding_size', 128)
 28 |     hidden_size = params.get('hidden_size', 128)
 29 |     generator = params.get('generator', 'lstm')
 30 |     vocabulary_size = len(misc['wordtoix'])
 31 |     output_size = len(misc['ixtoword']) # these should match though
 32 |     image_size = 4096 # size of CNN vectors hardcoded here
 33 | 
 34 |     if generator == 'lstm':
 35 |       assert image_encoding_size == word_encoding_size, 'this implementation does not support different sizes for these parameters'
 36 | 
 37 |     # initialize the encoder models
 38 |     model = {}
 39 |     model['We'] = initw(image_size, image_encoding_size) # image encoder
 40 |     model['be'] = np.zeros((1,image_encoding_size))
 41 |     model['Ws'] = initw(vocabulary_size, word_encoding_size) # word encoder
 42 |     update = ['We', 'be', 'Ws']
 43 |     regularize = ['We', 'Ws']
 44 |     init_struct = { 'model' : model, 'update' : update, 'regularize' : regularize}
 45 | 
 46 |     # descend into the specific Generator and initialize it
 47 |     Generator = decodeGenerator(generator)
 48 |     generator_init_struct = Generator.init(word_encoding_size, hidden_size, output_size)
 49 |     merge_init_structs(init_struct, generator_init_struct)
 50 |     return init_struct
 51 | 
 52 |   @staticmethod
 53 |   def forward(batch, model, params, misc, predict_mode = False):
 54 |     """ iterates over items in the batch and calls generators on them """
 55 |     # we do the encoding here across all images/words in batch in single matrix
 56 |     # multiplies to gain efficiency. The RNNs are then called individually
 57 |     # in for loop on per-image-sentence pair and all they are concerned about is
 58 |     # taking single matrix of vectors and doing the forward/backward pass without
 59 |     # knowing anything about images, sentences or anything of that sort.
 60 | 
 61 |     # encode all images
 62 |     # concatenate as rows. If N is number of image-sentence pairs,
 63 |     # F will be N x image_size
 64 |     F = np.row_stack(x['image']['feat'] for x in batch) 
 65 |     We = model['We']
 66 |     be = model['be']
 67 |     Xe = F.dot(We) + be # Xe becomes N x image_encoding_size
 68 | 
 69 |     # decode the generator we wish to use
 70 |     generator_str = params.get('generator', 'lstm') 
 71 |     Generator = decodeGenerator(generator_str)
 72 | 
 73 |     # encode all words in all sentences (which exist in our vocab)
 74 |     wordtoix = misc['wordtoix']
 75 |     Ws = model['Ws']
 76 |     gen_caches = []
 77 |     Ys = [] # outputs
 78 |     for i,x in enumerate(batch):
 79 |       # take all words in this sentence and pluck out their word vectors
 80 |       # from Ws. Then arrange them in a single matrix Xs
 81 |       # Note that we are setting the start token as first vector
 82 |       # and then all the words afterwards. And start token is the first row of Ws
 83 |       ix = [0] + [ wordtoix[w] for w in x['sentence']['tokens'] if w in wordtoix ]
 84 |       Xs = np.row_stack( [Ws[j, :] for j in ix] )
 85 |       Xi = Xe[i,:]
 86 | 
 87 |       # forward prop through the RNN
 88 |       gen_Y, gen_cache = Generator.forward(Xi, Xs, model, params, predict_mode = predict_mode)
 89 |       gen_caches.append((ix, gen_cache))
 90 |       Ys.append(gen_Y)
 91 | 
 92 |     # back up information we need for efficient backprop
 93 |     cache = {}
 94 |     if not predict_mode:
 95 |       # ok we need cache as well because we'll do backward pass
 96 |       cache['gen_caches'] = gen_caches
 97 |       cache['Xe'] = Xe
 98 |       cache['Ws_shape'] = Ws.shape
 99 |       cache['F'] = F
100 |       cache['generator_str'] = generator_str
101 | 
102 |     return Ys, cache
103 |     
104 |   @staticmethod
105 |   def backward(dY, cache):
106 |     Xe = cache['Xe']
107 |     generator_str = cache['generator_str']
108 |     dWs = np.zeros(cache['Ws_shape'])
109 |     gen_caches = cache['gen_caches']
110 |     F = cache['F']
111 |     dXe = np.zeros(Xe.shape)
112 | 
113 |     Generator = decodeGenerator(generator_str)
114 | 
115 |     # backprop each item in the batch
116 |     grads = {}
117 |     for i in xrange(len(gen_caches)):
118 |       ix, gen_cache = gen_caches[i] # unpack
119 |       local_grads = Generator.backward(dY[i], gen_cache)
120 |       dXs = local_grads['dXs'] # intercept the gradients wrt Xi and Xs
121 |       del local_grads['dXs']
122 |       dXi = local_grads['dXi']
123 |       del local_grads['dXi']
124 |       accumNpDicts(grads, local_grads) # add up the gradients wrt model parameters
125 | 
126 |       # now backprop from dXs to the image vector and word vectors
127 |       dXe[i,:] += dXi # image vector
128 |       for n,j in enumerate(ix): # and now all the other words
129 |         dWs[j,:] += dXs[n,:]
130 | 
131 |     # finally backprop into the image encoder
132 |     dWe = F.transpose().dot(dXe)
133 |     dbe = np.sum(dXe, axis=0, keepdims = True)
134 | 
135 |     accumNpDicts(grads, { 'We':dWe, 'be':dbe, 'Ws':dWs })
136 |     return grads
137 | 
138 |   @staticmethod
139 |   def predict(batch, model, params, **kwparams):
140 |     """ some code duplication here with forward pass, but I think we want the freedom in future """
141 |     F = np.row_stack(x['image']['feat'] for x in batch) 
142 |     We = model['We']
143 |     be = model['be']
144 |     Xe = F.dot(We) + be # Xe becomes N x image_encoding_size
145 |     generator_str = params['generator']
146 |     Generator = decodeGenerator(generator_str)
147 |     Ys = []
148 |     for i,x in enumerate(batch):
149 |       gen_Y = Generator.predict(Xe[i, :], model, model['Ws'], params, **kwparams)
150 |       Ys.append(gen_Y)
151 |     return Ys
152 | 
153 | 
154 | 


--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
  1 | import glob
  2 | import os
  3 | 
  4 | from PIL import Image
  5 | from elasticsearch import Elasticsearch
  6 | from elasticsearch.helpers import bulk
  7 | from flask import Flask, render_template, request, Response
  8 | from werkzeug.utils import secure_filename
  9 | import json
 10 | 
 11 | from capgen import CaptionGenerator
 12 | 
 13 | os.environ['CUDA_VISIBLE_DEVICES'] = ''
 14 | os.environ['KMP_DUPLICATE_LIB_OK']='True'
 15 | es = Elasticsearch()
 16 | gencap = CaptionGenerator()
 17 | 
 18 | 
 19 | def description_search(query):
 20 |     global es
 21 |     results = es.search(
 22 |         index="desearch",
 23 |         body={
 24 |             "size": 20,
 25 |             "query": {
 26 |                 "match": {"description": query}
 27 |             }
 28 |         })
 29 | 
 30 |     hitCount = results['hits']['total']
 31 | 
 32 |     if hitCount > 0:
 33 |         if hitCount is 1:
 34 |             print(str(hitCount), ' result')
 35 |         else:
 36 |             print(str(hitCount), 'results')
 37 |         answers = []
 38 |         max_score = results['hits']['max_score']
 39 | 
 40 |         if max_score >= 0.35:
 41 |             for hit in results['hits']['hits']:
 42 |                 if hit['_score'] > 0.5 * max_score:
 43 |                     desc = hit['_source']['description']
 44 |                     imgurl = hit['_source']['imgurl']
 45 |                     answers.append([imgurl, desc])
 46 |     else:
 47 |         answers = []
 48 |     return answers
 49 | 
 50 | 
 51 | app = Flask(__name__)
 52 | app.config['UPLOAD_FOLDER'] = os.path.join('static', 'database')
 53 | app.config['TEMP_UPLOAD_FOLDER'] = os.path.join('static', 'uploads')
 54 | app.config['ALLOWED_EXTENSIONS'] = set(['jpg', 'jpeg', 'png'])
 55 | 
 56 | 
 57 | def allowed_file(filename):
 58 |     return '.' in filename and \
 59 |            filename.rsplit('.', 1)[1] in app.config['ALLOWED_EXTENSIONS']
 60 | 
 61 | 
 62 | @app.route('/')
 63 | def index():
 64 |     return render_template('home.html')
 65 | 
 66 | 
 67 | @app.route('/search', methods=['GET', 'POST'])
 68 | def search():
 69 |     global gencap
 70 |     if request.method == 'POST':
 71 |         if 'query_img' not in request.files or request.files['query_img'].filename == '' or not allowed_file(
 72 |                 request.files['query_img'].filename):
 73 |             return render_template('search.html')
 74 |         file = request.files['query_img']
 75 |         img = Image.open(file.stream)  # PIL image
 76 |         uploaded_img_path = os.path.join(app.config['TEMP_UPLOAD_FOLDER'], file.filename)
 77 |         img.save(uploaded_img_path)
 78 |         query = gencap.get_caption(uploaded_img_path)
 79 |         answers = description_search(query)
 80 | 
 81 |         return render_template('search.html',
 82 |                                query_path=uploaded_img_path,
 83 |                                answers=answers)
 84 |     else:
 85 |         return render_template('search.html')
 86 | 
 87 | 
 88 | @app.route('/api/search', methods=['POST'])
 89 | def api_search():
 90 |     global gencap
 91 |     if 'query_img' not in request.files or request.files['query_img'].filename == '' or not allowed_file(
 92 |             request.files['query_img'].filename):
 93 |         return Response(response=json.dumps({'success': False, 'message': 'Uploaded image is invalid or not allowed'}),
 94 |                         status=400, mimetype="application/json")
 95 |     file = request.files['query_img']
 96 |     img = Image.open(file.stream)  # PIL image
 97 |     uploaded_img_path = os.path.join(app.config['TEMP_UPLOAD_FOLDER'], file.filename)
 98 |     img.save(uploaded_img_path)
 99 |     query = gencap.get_caption(uploaded_img_path)
100 |     answers = description_search(query)
101 | 
102 |     return Response(response=json.dumps({'success': True, 'answers': answers}),
103 |                     status=200, mimetype="application/json")
104 | 
105 | 
106 | @app.route('/database')
107 | def database():
108 |     images = glob.glob(os.path.join(app.config['UPLOAD_FOLDER'], '*'))
109 |     return render_template('database.html', database_images=images)
110 | 
111 | 
112 | @app.route('/upload', methods=['GET', 'POST'])
113 | def upload():
114 |     if request.method == 'POST':
115 |         if 'photos' not in request.files:
116 |             return render_template('database.html')
117 |         actions = []
118 |         for file in request.files.getlist('photos'):
119 |             if file and allowed_file(file.filename):
120 |                 filename = secure_filename(file.filename)
121 |                 file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
122 |                 file.save(file_path)
123 |                 cap = gencap.get_caption(file_path)
124 |                 doc = {'imgurl': file_path, 'description': cap}
125 |                 actions.append(doc)
126 |         bulk(es, actions, index="desearch", doc_type="json")
127 |         return render_template('database.html')
128 | 
129 | 
130 | @app.route('/caption', methods=['GET', 'POST'])
131 | def caption():
132 |     if request.method == 'POST':
133 |         if 'query_img' not in request.files or request.files['query_img'].filename == '' or not allowed_file(
134 |                 request.files['query_img'].filename):
135 |             return render_template('caption.html')
136 |         file = request.files['query_img']
137 |         img = Image.open(file.stream)  # PIL image
138 |         uploaded_img_path = os.path.join(app.config['TEMP_UPLOAD_FOLDER'], file.filename)
139 |         img.save(uploaded_img_path)
140 |         cap = gencap.get_caption(uploaded_img_path)
141 |         return render_template('caption.html', caption=cap, query_path=uploaded_img_path)
142 |     else:
143 |         return render_template('caption.html')
144 | 
145 | 
146 | @app.route('/api/caption', methods=['POST'])
147 | def caption_api():
148 |     if 'query_img' not in request.files or request.files['query_img'].filename == '' or not allowed_file(
149 |             request.files['query_img'].filename):
150 |         return Response(response=json.dumps({'success': False, 'message': 'Uploaded image is invalid or not allowed'}),
151 |                         status=400, mimetype="application/json")
152 |     file = request.files['query_img']
153 |     img = Image.open(file.stream)  # PIL image
154 |     uploaded_img_path = os.path.join(app.config['TEMP_UPLOAD_FOLDER'], file.filename)
155 |     img.save(uploaded_img_path)
156 |     cap = gencap.get_caption(uploaded_img_path)
157 |     return Response(response=json.dumps({'success': True, 'caption': cap}),
158 |                     status=200, mimetype="application/json")
159 | 
160 | 
161 | if __name__ == "__main__":
162 |     app.run(host="0.0.0.0", port=5000)
163 | 


--------------------------------------------------------------------------------
/imagernn/rnn_generator.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import code
  3 | 
  4 | from imagernn.utils import initw
  5 | 
  6 | class RNNGenerator:
  7 |   """ 
  8 |   An RNN generator.
  9 |   This class is as stupid as possible. It gets some conditioning vector,
 10 |   a sequence of input vectors, and produces a sequence of output vectors
 11 |   """
 12 |   
 13 |   @staticmethod
 14 |   def init(input_size, hidden_size, output_size):
 15 | 
 16 |     model = {}
 17 |     # connections to x_t
 18 |     model['Wxh'] = initw(input_size, hidden_size)
 19 |     model['bxh'] = np.zeros((1, hidden_size))
 20 |     # connections to h_{t-1}
 21 |     model['Whh'] = initw(hidden_size, hidden_size)
 22 |     model['bhh'] = np.zeros((1, hidden_size))
 23 |     # Decoder weights (e.g. mapping to vocabulary)
 24 |     model['Wd'] = initw(hidden_size, output_size) * 0.1 # decoder
 25 |     model['bd'] = np.zeros((1, output_size))
 26 | 
 27 |     update = ['Whh', 'bhh', 'Wxh', 'bxh', 'Wd', 'bd']
 28 |     regularize = ['Whh', 'Wxh', 'Wd']
 29 |     return { 'model' : model, 'update' : update, 'regularize' : regularize }
 30 | 
 31 |   @staticmethod
 32 |   def forward(Xi, Xs, model, params, **kwargs):
 33 |     """
 34 |     Xi is 1-d array of size D1 (containing the image representation)
 35 |     Xs is N x D2 (N time steps, rows are data containng word representations), and
 36 |     it is assumed that the first row is already filled in as the start token. So a
 37 |     sentence with 10 words will be of size 11xD2 in Xs.
 38 |     """
 39 |     predict_mode = kwargs.get('predict_mode', False)
 40 | 
 41 |     # options
 42 |     drop_prob_encoder = params.get('drop_prob_encoder', 0.0)
 43 |     drop_prob_decoder = params.get('drop_prob_decoder', 0.0)
 44 |     relu_encoders = params.get('rnn_relu_encoders', 0)
 45 |     rnn_feed_once = params.get('rnn_feed_once', 0)
 46 | 
 47 |     if drop_prob_encoder > 0: # if we want dropout on the encoder
 48 |       # inverted version of dropout here. Suppose the drop_prob is 0.5, then during training
 49 |       # we are going to drop half of the units. In this inverted version we also boost the activations
 50 |       # of the remaining 50% by 2.0 (scale). The nice property of this is that during prediction time
 51 |       # we don't have to do any scailing, since all 100% of units will be active, but at their base
 52 |       # firing rate, giving 100% of the "energy". So the neurons later in the pipeline dont't change
 53 |       # their expected firing rate magnitudes
 54 |       if not predict_mode: # and we are in training mode
 55 |         scale = 1.0 / (1.0 - drop_prob_encoder)
 56 |         Us = (np.random.rand(*(Xs.shape)) < (1 - drop_prob_encoder)) * scale # generate scaled mask
 57 |         Xs *= Us # drop!
 58 |         Ui = (np.random.rand(*(Xi.shape)) < (1 - drop_prob_encoder)) * scale
 59 |         Xi *= Ui # drop!
 60 | 
 61 |     # encode input vectors
 62 |     Wxh = model['Wxh']
 63 |     bxh = model['bxh']
 64 |     Xsh = Xs.dot(Wxh) + bxh
 65 | 
 66 |     if relu_encoders:
 67 |       Xsh = np.maximum(Xsh, 0)
 68 |       Xi = np.maximum(Xi, 0)
 69 | 
 70 |     # recurrence iteration for the Multimodal RNN similar to one described in Karpathy et al.
 71 |     d = model['Wd'].shape[0] # size of hidden layer
 72 |     n = Xs.shape[0]
 73 |     H = np.zeros((n, d)) # hidden layer representation
 74 |     Whh = model['Whh']
 75 |     bhh = model['bhh']
 76 |     for t in xrange(n):
 77 |       
 78 |       prev = np.zeros(d) if t == 0 else H[t-1]
 79 |       if not rnn_feed_once or t == 0:
 80 |         # feed the image in if feedonce is false. And it it is true, then
 81 |         # only feed the image in if its the first iteration
 82 |         H[t] = np.maximum(Xi + Xsh[t] + prev.dot(Whh) + bhh, 0) # also ReLU
 83 |       else:
 84 |         H[t] = np.maximum(Xsh[t] + prev.dot(Whh) + bhh, 0) # also ReLU
 85 | 
 86 |     if drop_prob_decoder > 0: # if we want dropout on the decoder
 87 |       if not predict_mode: # and we are in training mode
 88 |         scale2 = 1.0 / (1.0 - drop_prob_decoder)
 89 |         U2 = (np.random.rand(*(H.shape)) < (1 - drop_prob_decoder)) * scale2 # generate scaled mask
 90 |         H *= U2 # drop!
 91 | 
 92 |     # decoder at the end
 93 |     Wd = model['Wd']
 94 |     bd = model['bd']
 95 |     Y = H.dot(Wd) + bd
 96 | 
 97 |     cache = {}
 98 |     if not predict_mode:
 99 |       # we can expect to do a backward pass
100 |       cache['Whh'] = Whh
101 |       cache['H'] = H
102 |       cache['Wd'] = Wd
103 |       cache['Xs'] = Xs
104 |       cache['Xsh'] = Xsh
105 |       cache['Wxh'] = Wxh
106 |       cache['Xi'] = Xi
107 |       cache['relu_encoders'] = relu_encoders
108 |       cache['drop_prob_encoder'] = drop_prob_encoder
109 |       cache['drop_prob_decoder'] = drop_prob_decoder
110 |       cache['rnn_feed_once'] = rnn_feed_once
111 |       if drop_prob_encoder > 0: 
112 |         cache['Us'] = Us # keep the dropout masks around for backprop
113 |         cache['Ui'] = Ui
114 |       if drop_prob_decoder > 0: cache['U2'] = U2
115 | 
116 |     return Y, cache
117 | 
118 |   @staticmethod
119 |   def backward(dY, cache):
120 | 
121 |     Wd = cache['Wd']
122 |     H = cache['H']
123 |     Xs = cache['Xs']
124 |     Xsh = cache['Xsh']
125 |     Whh = cache['Whh']
126 |     Wxh = cache['Wxh']
127 |     Xi = cache['Xi']
128 |     drop_prob_encoder = cache['drop_prob_encoder']
129 |     drop_prob_decoder = cache['drop_prob_decoder']
130 |     relu_encoders = cache['relu_encoders']
131 |     rnn_feed_once = cache['rnn_feed_once']
132 |     n,d = H.shape
133 | 
134 |     # backprop the decoder
135 |     dWd = H.transpose().dot(dY)
136 |     dbd = np.sum(dY, axis=0, keepdims = True)
137 |     dH = dY.dot(Wd.transpose())
138 | 
139 |     # backprop dropout, if it was applied
140 |     if drop_prob_decoder > 0:
141 |       dH *= cache['U2']
142 | 
143 |     # backprop the recurrent connections
144 |     dXsh = np.zeros(Xsh.shape)
145 |     dXi = np.zeros(d)
146 |     dWhh = np.zeros(Whh.shape)
147 |     dbhh = np.zeros((1,d))
148 |     for t in reversed(xrange(n)):
149 |       dht = (H[t] > 0) * dH[t] # backprop ReLU
150 | 
151 |       if not rnn_feed_once or t == 0:
152 |         dXi += dht # backprop to Xi
153 |         
154 |       dXsh[t] += dht # backprop to word encodings
155 |       dbhh[0] += dht # backprop to bias
156 | 
157 |       if t > 0:
158 |         dH[t-1] += dht.dot(Whh.transpose())
159 |         dWhh += np.outer(H[t-1], dht)
160 | 
161 |     if relu_encoders:
162 |       # backprop relu
163 |       dXsh[Xsh <= 0] = 0
164 |       dXi[Xi <= 0] = 0
165 | 
166 |     # backprop the word encoder
167 |     dWxh = Xs.transpose().dot(dXsh)
168 |     dbxh = np.sum(dXsh, axis=0, keepdims = True)
169 |     dXs = dXsh.dot(Wxh.transpose())
170 | 
171 |     if drop_prob_encoder > 0: # backprop encoder dropout
172 |       dXi *= cache['Ui']
173 |       dXs *= cache['Us']
174 | 
175 |     return { 'Whh': dWhh, 'bhh': dbhh, 'Wd': dWd, 'bd': dbd, 'Wxh':dWxh, 'bxh':dbxh, 'dXs' : dXs, 'dXi': dXi }
176 | 
177 |   @staticmethod
178 |   def predict(Xi, model, Ws, params, **kwargs):
179 |     
180 |     beam_size = kwargs.get('beam_size', 1)
181 |     relu_encoders = params.get('rnn_relu_encoders', 0)
182 |     rnn_feed_once = params.get('rnn_feed_once', 0)
183 | 
184 |     d = model['Wd'].shape[0] # size of hidden layer
185 |     Whh = model['Whh']
186 |     bhh = model['bhh']
187 |     Wd = model['Wd']
188 |     bd = model['bd']
189 |     Wxh = model['Wxh']
190 |     bxh = model['bxh']
191 | 
192 |     if relu_encoders:
193 |       Xi = np.maximum(Xi, 0)
194 | 
195 |     if beam_size > 1:
196 |       # perform beam search
197 |       # NOTE: code duplication here with lstm_generator
198 |       # ideally the beam search would be abstracted away nicely and would take
199 |       # a TICK function or something, but for now lets save time & copy code around. Sorry ;\
200 |       beams = [(0.0, [], np.zeros(d))] 
201 |       nsteps = 0
202 |       while True:
203 |         beam_candidates = []
204 |         for b in beams:
205 |           ixprev = b[1][-1] if b[1] else 0
206 |           if ixprev == 0 and b[1]:
207 |             # this beam predicted end token. Keep in the candidates but don't expand it out any more
208 |             beam_candidates.append(b)
209 |             continue
210 |           # tick the RNN for this beam
211 |           Xsh = Ws[ixprev].dot(Wxh) + bxh
212 |           if relu_encoders:
213 |             Xsh = np.maximum(Xsh, 0)
214 | 
215 |           if (not rnn_feed_once) or (not b[1]):
216 |             h1 = np.maximum(Xi + Xsh + b[2].dot(Whh) + bhh, 0)
217 |           else:
218 |             h1 = np.maximum(Xsh + b[2].dot(Whh) + bhh, 0)
219 | 
220 |           y1 = h1.dot(Wd) + bd
221 | 
222 |           # compute new candidates that expand out form this beam
223 |           y1 = y1.ravel() # make into 1D vector
224 |           maxy1 = np.amax(y1)
225 |           e1 = np.exp(y1 - maxy1) # for numerical stability shift into good numerical range
226 |           p1 = e1 / np.sum(e1)
227 |           y1 = np.log(1e-20 + p1) # and back to log domain
228 |           top_indices = np.argsort(-y1)  # we do -y because we want decreasing order
229 |           for i in xrange(beam_size):
230 |             wordix = top_indices[i]
231 |             beam_candidates.append((b[0] + y1[wordix], b[1] + [wordix], h1))
232 | 
233 |         beam_candidates.sort(reverse = True) # decreasing order
234 |         beams = beam_candidates[:beam_size] # truncate to get new beams
235 |         nsteps += 1
236 |         if nsteps >= 20: # bad things are probably happening, break out
237 |           break
238 |       # strip the intermediates
239 |       predictions = [(b[0], b[1]) for b in beams]
240 | 
241 |     else:
242 |       ixprev = 0 # start out on start token
243 |       nsteps = 0
244 |       predix = []
245 |       predlogprob = 0.0
246 |       hprev = np.zeros((1, d)) # hidden layer representation
247 |       xsprev = Ws[0] # start token
248 |       while True:
249 |         Xsh = Ws[ixprev].dot(Wxh) + bxh
250 |         if relu_encoders:
251 |           Xsh = np.maximum(Xsh, 0)
252 | 
253 |         if (not rnn_feed_once) or (nsteps == 0):
254 |           ht = np.maximum(Xi + Xsh + hprev.dot(Whh) + bhh, 0)
255 |         else:
256 |           ht = np.maximum(Xsh + hprev.dot(Whh) + bhh, 0)
257 | 
258 |         Y = ht.dot(Wd) + bd
259 |         hprev = ht
260 | 
261 |         ixprev, ixlogprob = ymax(Y)
262 |         predix.append(ixprev)
263 |         predlogprob += ixlogprob
264 | 
265 |         nsteps += 1
266 |         if ixprev == 0 or nsteps >= 20:
267 |           break
268 |       predictions = [(predlogprob, predix)]
269 |     return predictions
270 | 
271 | 
272 | def ymax(y):
273 |   """ simple helper function here that takes unnormalized logprobs """
274 |   y1 = y.ravel() # make sure 1d
275 |   maxy1 = np.amax(y1)
276 |   e1 = np.exp(y1 - maxy1) # for numerical stability shift into good numerical range
277 |   p1 = e1 / np.sum(e1)
278 |   y1 = np.log(1e-20 + p1) # guard against zero probabilities just in case
279 |   ix = np.argmax(y1)
280 |   return (ix, y1[ix])
281 | 


--------------------------------------------------------------------------------
/imagernn/lstm_generator.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import code
  3 | 
  4 | from imagernn.utils import initw
  5 | 
  6 | class LSTMGenerator:
  7 |   """ 
  8 |   A multimodal long short-term memory (LSTM) generator
  9 |   """
 10 |   
 11 |   @staticmethod
 12 |   def init(input_size, hidden_size, output_size):
 13 | 
 14 |     model = {}
 15 |     # Recurrent weights: take x_t, h_{t-1}, and bias unit
 16 |     # and produce the 3 gates and the input to cell signal
 17 |     model['WLSTM'] = initw(input_size + hidden_size + 1, 4 * hidden_size)
 18 |     # Decoder weights (e.g. mapping to vocabulary)
 19 |     model['Wd'] = initw(hidden_size, output_size) # decoder
 20 |     model['bd'] = np.zeros((1, output_size))
 21 | 
 22 |     update = ['WLSTM', 'Wd', 'bd']
 23 |     regularize = ['WLSTM', 'Wd']
 24 |     return { 'model' : model, 'update' : update, 'regularize' : regularize }
 25 | 
 26 |   @staticmethod
 27 |   def forward(Xi, Xs, model, params, **kwargs):
 28 |     """
 29 |     Xi is 1-d array of size D (containing the image representation)
 30 |     Xs is N x D (N time steps, rows are data containng word representations), and
 31 |     it is assumed that the first row is already filled in as the start token. So a
 32 |     sentence with 10 words will be of size 11xD in Xs.
 33 |     """
 34 |     predict_mode = kwargs.get('predict_mode', False)
 35 | 
 36 |     # Google paper concatenates the image to the word vectors as the first word vector
 37 |     X = np.row_stack([Xi, Xs])
 38 | 
 39 |     # options
 40 |     # use the version of LSTM with tanh? Otherwise dont use tanh (Google style)
 41 |     # following http://arxiv.org/abs/1409.3215
 42 |     tanhC_version = params.get('tanhC_version', 0)
 43 |     drop_prob_encoder = params.get('drop_prob_encoder', 0.0)
 44 |     drop_prob_decoder = params.get('drop_prob_decoder', 0.0)
 45 | 
 46 |     if drop_prob_encoder > 0: # if we want dropout on the encoder
 47 |       # inverted version of dropout here. Suppose the drop_prob is 0.5, then during training
 48 |       # we are going to drop half of the units. In this inverted version we also boost the activations
 49 |       # of the remaining 50% by 2.0 (scale). The nice property of this is that during prediction time
 50 |       # we don't have to do any scailing, since all 100% of units will be active, but at their base
 51 |       # firing rate, giving 100% of the "energy". So the neurons later in the pipeline dont't change
 52 |       # their expected firing rate magnitudes
 53 |       if not predict_mode: # and we are in training mode
 54 |         scale = 1.0 / (1.0 - drop_prob_encoder)
 55 |         U = (np.random.rand(*(X.shape)) < (1 - drop_prob_encoder)) * scale # generate scaled mask
 56 |         X *= U # drop!
 57 | 
 58 |     # follows http://arxiv.org/pdf/1409.2329.pdf
 59 |     WLSTM = model['WLSTM']
 60 |     n = X.shape[0]
 61 |     d = model['Wd'].shape[0] # size of hidden layer
 62 |     Hin = np.zeros((n, WLSTM.shape[0])) # xt, ht-1, bias
 63 |     Hout = np.zeros((n, d))
 64 |     IFOG = np.zeros((n, d * 4))
 65 |     IFOGf = np.zeros((n, d * 4)) # after nonlinearity
 66 |     C = np.zeros((n, d))
 67 |     for t in range(n):
 68 |       # set input
 69 |       prev = np.zeros(d) if t == 0 else Hout[t-1]
 70 |       Hin[t,0] = 1
 71 |       Hin[t,1:1+d] = X[t]
 72 |       Hin[t,1+d:] = prev
 73 | 
 74 |       # compute all gate activations. dots:
 75 |       IFOG[t] = Hin[t].dot(WLSTM)
 76 |       
 77 |       # non-linearities
 78 |       IFOGf[t,:3*d] = 1.0/(1.0+np.exp(-IFOG[t,:3*d])) # sigmoids; these are the gates
 79 |       IFOGf[t,3*d:] = np.tanh(IFOG[t, 3*d:]) # tanh
 80 | 
 81 |       # compute the cell activation
 82 |       C[t] = IFOGf[t,:d] * IFOGf[t, 3*d:]
 83 |       if t > 0: C[t] += IFOGf[t,d:2*d] * C[t-1]
 84 |       if tanhC_version:
 85 |         Hout[t] = IFOGf[t,2*d:3*d] * np.tanh(C[t])
 86 |       else:
 87 |         Hout[t] = IFOGf[t,2*d:3*d] * C[t]
 88 | 
 89 |     if drop_prob_decoder > 0: # if we want dropout on the decoder
 90 |       if not predict_mode: # and we are in training mode
 91 |         scale2 = 1.0 / (1.0 - drop_prob_decoder)
 92 |         U2 = (np.random.rand(*(Hout.shape)) < (1 - drop_prob_decoder)) * scale2 # generate scaled mask
 93 |         Hout *= U2 # drop!
 94 | 
 95 |     # decoder at the end
 96 |     Wd = model['Wd']
 97 |     bd = model['bd']
 98 |     # NOTE1: we are leaving out the first prediction, which was made for the image
 99 |     # and is meaningless.
100 |     Y = Hout[1:, :].dot(Wd) + bd 
101 | 
102 |     cache = {}
103 |     if not predict_mode:
104 |       # we can expect to do a backward pass
105 |       cache['WLSTM'] = WLSTM
106 |       cache['Hout'] = Hout
107 |       cache['Wd'] = Wd
108 |       cache['IFOGf'] = IFOGf
109 |       cache['IFOG'] = IFOG
110 |       cache['C'] = C
111 |       cache['X'] = X
112 |       cache['Hin'] = Hin
113 |       cache['tanhC_version'] = tanhC_version
114 |       cache['drop_prob_encoder'] = drop_prob_encoder
115 |       cache['drop_prob_decoder'] = drop_prob_decoder
116 |       if drop_prob_encoder > 0: cache['U'] = U # keep the dropout masks around for backprop
117 |       if drop_prob_decoder > 0: cache['U2'] = U2
118 | 
119 |     return Y, cache
120 | 
121 |   @staticmethod
122 |   def backward(dY, cache):
123 | 
124 |     Wd = cache['Wd']
125 |     Hout = cache['Hout']
126 |     IFOG = cache['IFOG']
127 |     IFOGf = cache['IFOGf']
128 |     C = cache['C']
129 |     Hin = cache['Hin']
130 |     WLSTM = cache['WLSTM']
131 |     X = cache['X']
132 |     tanhC_version = cache['tanhC_version']
133 |     drop_prob_encoder = cache['drop_prob_encoder']
134 |     drop_prob_decoder = cache['drop_prob_decoder']
135 |     n,d = Hout.shape
136 | 
137 |     # we have to add back a row of zeros, since in the forward pass
138 |     # this information was not used. See NOTE1 above.
139 |     dY = np.row_stack([np.zeros(dY.shape[1]), dY])
140 | 
141 |     # backprop the decoder
142 |     dWd = Hout.transpose().dot(dY)
143 |     dbd = np.sum(dY, axis=0, keepdims = True)
144 |     dHout = dY.dot(Wd.transpose())
145 | 
146 |     # backprop dropout, if it was applied
147 |     if drop_prob_decoder > 0:
148 |       dHout *= cache['U2']
149 | 
150 |     # backprop the LSTM
151 |     dIFOG = np.zeros(IFOG.shape)
152 |     dIFOGf = np.zeros(IFOGf.shape)
153 |     dWLSTM = np.zeros(WLSTM.shape)
154 |     dHin = np.zeros(Hin.shape)
155 |     dC = np.zeros(C.shape)
156 |     dX = np.zeros(X.shape)
157 |     for t in reversed(range(n)):
158 | 
159 |       if tanhC_version:
160 |         tanhCt = np.tanh(C[t]) # recompute this here
161 |         dIFOGf[t,2*d:3*d] = tanhCt * dHout[t]
162 |         # backprop tanh non-linearity first then continue backprop
163 |         dC[t] += (1-tanhCt**2) * (IFOGf[t,2*d:3*d] * dHout[t])
164 |       else:
165 |         dIFOGf[t,2*d:3*d] = C[t] * dHout[t]
166 |         dC[t] += IFOGf[t,2*d:3*d] * dHout[t]
167 | 
168 |       if t > 0:
169 |         dIFOGf[t,d:2*d] = C[t-1] * dC[t]
170 |         dC[t-1] += IFOGf[t,d:2*d] * dC[t]
171 |       dIFOGf[t,:d] = IFOGf[t, 3*d:] * dC[t]
172 |       dIFOGf[t, 3*d:] = IFOGf[t,:d] * dC[t]
173 |       
174 |       # backprop activation functions
175 |       dIFOG[t,3*d:] = (1 - IFOGf[t, 3*d:] ** 2) * dIFOGf[t,3*d:]
176 |       y = IFOGf[t,:3*d]
177 |       dIFOG[t,:3*d] = (y*(1.0-y)) * dIFOGf[t,:3*d]
178 | 
179 |       # backprop matrix multiply
180 |       dWLSTM += np.outer(Hin[t], dIFOG[t])
181 |       dHin[t] = dIFOG[t].dot(WLSTM.transpose())
182 | 
183 |       # backprop the identity transforms into Hin
184 |       dX[t] = dHin[t,1:1+d]
185 |       if t > 0:
186 |         dHout[t-1] += dHin[t,1+d:]
187 | 
188 |     if drop_prob_encoder > 0: # backprop encoder dropout
189 |       dX *= cache['U']
190 | 
191 |     return { 'WLSTM': dWLSTM, 'Wd': dWd, 'bd': dbd, 'dXi': dX[0,:], 'dXs': dX[1:,:] }
192 | 
193 |   @staticmethod
194 |   def predict(Xi, model, Ws, params, **kwargs):
195 |     """ 
196 |     Run in prediction mode with beam search. The input is the vector Xi, which 
197 |     should be a 1-D array that contains the encoded image vector. We go from there.
198 |     Ws should be NxD array where N is size of vocabulary + 1. So there should be exactly
199 |     as many rows in Ws as there are outputs in the decoder Y. We are passing in Ws like
200 |     this because we may not want it to be exactly model['Ws']. For example it could be
201 |     fixed word vectors from somewhere else.
202 |     """
203 |     tanhC_version = params['tanhC_version']
204 |     beam_size = kwargs.get('beam_size', 1)
205 | 
206 |     WLSTM = model['WLSTM']
207 |     d = model['Wd'].shape[0] # size of hidden layer
208 |     Wd = model['Wd']
209 |     bd = model['bd']
210 | 
211 |     # lets define a helper function that does a single LSTM tick
212 |     def LSTMtick(x, h_prev, c_prev):
213 |       t = 0
214 | 
215 |       # setup the input vector
216 |       Hin = np.zeros((1,WLSTM.shape[0])) # xt, ht-1, bias
217 |       Hin[t,0] = 1
218 |       Hin[t,1:1+d] = x
219 |       Hin[t,1+d:] = h_prev
220 | 
221 |       # LSTM tick forward
222 |       IFOG = np.zeros((1, d * 4))
223 |       IFOGf = np.zeros((1, d * 4))
224 |       C = np.zeros((1, d))
225 |       Hout = np.zeros((1, d))
226 |       IFOG[t] = Hin[t].dot(WLSTM)
227 |       IFOGf[t,:3*d] = 1.0/(1.0+np.exp(-IFOG[t,:3*d]))
228 |       IFOGf[t,3*d:] = np.tanh(IFOG[t, 3*d:])
229 |       C[t] = IFOGf[t,:d] * IFOGf[t, 3*d:] + IFOGf[t,d:2*d] * c_prev
230 |       if tanhC_version:
231 |         Hout[t] = IFOGf[t,2*d:3*d] * np.tanh(C[t])
232 |       else:
233 |         Hout[t] = IFOGf[t,2*d:3*d] * C[t]
234 |       Y = Hout.dot(Wd) + bd
235 |       return (Y, Hout, C) # return output, new hidden, new cell
236 | 
237 |     # forward prop the image
238 |     (y0, h, c) = LSTMtick(Xi, np.zeros(d), np.zeros(d))
239 |     
240 |     # perform BEAM search. NOTE: I am not very confident in this implementation since I don't have
241 |     # a lot of experience with these models. This implements my current understanding but I'm not
242 |     # sure how to handle beams that predict END tokens. TODO: research this more.
243 |     if beam_size > 1:
244 |       # log probability, indices of words predicted in this beam so far, and the hidden and cell states
245 |       beams = [(0.0, [], h, c)] 
246 |       nsteps = 0
247 |       while True:
248 |         beam_candidates = []
249 |         for b in beams:
250 |           ixprev = b[1][-1] if b[1] else 0 # start off with the word where this beam left off
251 |           if ixprev == 0 and b[1]:
252 |             # this beam predicted end token. Keep in the candidates but don't expand it out any more
253 |             beam_candidates.append(b)
254 |             continue
255 |           (y1, h1, c1) = LSTMtick(Ws[ixprev], b[2], b[3])
256 |           y1 = y1.ravel() # make into 1D vector
257 |           maxy1 = np.amax(y1)
258 |           e1 = np.exp(y1 - maxy1) # for numerical stability shift into good numerical range
259 |           p1 = e1 / np.sum(e1)
260 |           y1 = np.log(1e-20 + p1) # and back to log domain
261 |           top_indices = np.argsort(-y1)  # we do -y because we want decreasing order
262 |           for i in range(beam_size):
263 |             wordix = top_indices[i]
264 |             beam_candidates.append((b[0] + y1[wordix], b[1] + [wordix], h1, c1))
265 |         beam_candidates.sort(reverse = True) # decreasing order
266 |         beams = beam_candidates[:beam_size] # truncate to get new beams
267 |         nsteps += 1
268 |         if nsteps >= 20: # bad things are probably happening, break out
269 |           break
270 |       # strip the intermediates
271 |       predictions = [(b[0], b[1]) for b in beams]
272 |     else:
273 |       # greedy inference. lets write it up independently, should be bit faster and simpler
274 |       ixprev = 0
275 |       nsteps = 0
276 |       predix = []
277 |       predlogprob = 0.0
278 |       while True:
279 |         (y1, h, c) = LSTMtick(Ws[ixprev], h, c)
280 |         ixprev, ixlogprob = ymax(y1)
281 |         predix.append(ixprev)
282 |         predlogprob += ixlogprob
283 |         nsteps += 1
284 |         if ixprev == 0 or nsteps >= 20:
285 |           break
286 |       predictions = [(predlogprob, predix)]
287 | 
288 |     return predictions
289 | 
290 | def ymax(y):
291 |   """ simple helper function here that takes unnormalized logprobs """
292 |   y1 = y.ravel() # make sure 1d
293 |   maxy1 = np.amax(y1)
294 |   e1 = np.exp(y1 - maxy1) # for numerical stability shift into good numerical range
295 |   p1 = e1 / np.sum(e1)
296 |   y1 = np.log(1e-20 + p1) # guard against zero probabilities just in case
297 |   ix = np.argmax(y1)
298 |   return (ix, y1[ix])
299 | 


--------------------------------------------------------------------------------