├── imagernn
│   ├── __init__.py
│   ├── README.md
│   ├── utils.py
│   ├── imagernn_utils.py
│   ├── data_provider.py
│   ├── solver.py
│   ├── generic_batch_generator.py
│   ├── rnn_generator.py
│   └── lstm_generator.py
├── .gitignore
├── delete_index.sh
├── docker_install.sh
├── static
│   ├── favicon.png
│   ├── uploads
│   │   ├── surf.png
│   │   ├── output.png
│   │   └── cricket1.jpg
│   └── database
│       ├── img1.jpg
│       ├── img2.jpg
│       ├── img3.jpg
│       ├── img4.jpg
│       └── cricket1.jpg
├── .vscode
│   └── settings.json
├── models
│   └── README.md
├── cmd
│   ├── index.py
│   ├── query.py
│   └── main.py
├── docker-compose.yaml
├── requirements.txt
├── index_database.py
├── templates
│   ├── home.html
│   ├── database.html
│   ├── caption.html
│   ├── search.html
│   └── layout.html
├── Dockerfile
├── LICENSE.md
├── capgen.py
├── README.md
└── server.py

--------------------------------------------------------------------------------
/imagernn/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.p
2 | *.pyc
3 | image-search/*
--------------------------------------------------------------------------------
/delete_index.sh:
--------------------------------------------------------------------------------
1 | curl -X DELETE "localhost:9200/desearch"
--------------------------------------------------------------------------------
/docker_install.sh:
--------------------------------------------------------------------------------
1 | docker build . -t image_to_image_search:version2
2 | docker-compose up -d
--------------------------------------------------------------------------------
/static/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/favicon.png
--------------------------------------------------------------------------------
/static/uploads/surf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/uploads/surf.png
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |   "git.ignoreLimitWarning": true,
3 |   "python.pythonPath": "image-search/bin/python3.6"
4 | }
--------------------------------------------------------------------------------
/static/database/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/database/img1.jpg
--------------------------------------------------------------------------------
/static/database/img2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/database/img2.jpg
--------------------------------------------------------------------------------
/static/database/img3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/database/img3.jpg
--------------------------------------------------------------------------------
/static/database/img4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/database/img4.jpg
--------------------------------------------------------------------------------
/static/uploads/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/uploads/output.png
--------------------------------------------------------------------------------
/static/database/cricket1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/database/cricket1.jpg
--------------------------------------------------------------------------------
/static/uploads/cricket1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wx-chevalier/ai-tensorflow-image-search/master/static/uploads/cricket1.jpg
--------------------------------------------------------------------------------
/models/README.md:
--------------------------------------------------------------------------------
1 | Place the [Flickr 8K LSTM weights](https://cs.stanford.edu/people/karpathy/neuraltalk/flickr8k_cnn_lstm_v1.zip) here
--------------------------------------------------------------------------------
/cmd/index.py:
--------------------------------------------------------------------------------
1 | '''
2 | Indexes dataset.json in the Elasticsearch server
3 | '''
4 | from elasticsearch import Elasticsearch
5 | from elasticsearch.helpers import bulk
6 | import json
7 |
8 | es = Elasticsearch()
9 | with open("dataset.json") as f:
10 |     data = json.load(f)
11 | actions = []
12 | for i in range(len(data['images'])):
13 |     doc = {'id': i, 'imgurl': data['images'][i]['filename'], 'description': data['images'][i]['sentences'][0]['raw']}
14 |     actions.append(doc)
15 | bulk(es, actions, index="desearch", doc_type="json")
16 | es.indices.refresh(index="desearch")
--------------------------------------------------------------------------------
/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | version: "2"
2 | services:
3 |   elasticsearch:
4 |     image: docker.elastic.co/elasticsearch/elasticsearch:6.6.0
5 |     container_name: elasticsearch
6 |     ports:
7 |       - "9200:9200"
8 |       - "9300:9300"
9 |     environment:
10 |       ES_JAVA_OPTS: "-Xms256m -Xmx256m"
11 |       network.bind_host: 0.0.0.0
12 |       network.host: 0.0.0.0
13 |       discovery.type: single-node
14 |   # website:
15 |   #   image: image_to_image_search:version2
16 |   #   volumes:
17 |   #     - ~/.keras:/root/.keras
18 |   #   ports:
19 |   #     - "5000:5000"
20 |   #   depends_on:
21 |   #     - elasticsearch
22 |   #   working_dir: /image_search
23 |   #   command: python3 server.py
24 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.7.1
2 | astor==0.7.1
3 | bleach==1.5.0
4 | Click==7.0
5 | elasticsearch==6.3.1
6 | enum34==1.1.6
7 | Flask==1.0.3
8 | gast==0.2.2
9 | grpcio==1.21.1
10 | h5py==2.9.0
11 | html5lib==1.0.1
12 | itsdangerous==1.1.0
13 | Jinja2==2.10.1
14 | Keras==2.2.4
15 | Keras-Applications==1.0.7
16 | Keras-Preprocessing==1.0.9
17 | Markdown==3.1
18 | MarkupSafe==1.1.1
19 | mock==2.0.0
20 | numpy==1.16.2
21 | pbr==5.1.3
22 | Pillow==6.2.0
23 | protobuf==3.7.1
24 | PyYAML==5.1
25 | scipy==1.2.1
26 | six==1.12.0
27 | tensorboard==1.13.1
28 | tensorflow==1.15.0
29 | tensorflow-estimator==1.13.0
30 | tensorflow-tensorboard==0.1.8
31 | termcolor==1.1.0
32 | urllib3==1.24.2
33 | Werkzeug==0.15.3
34 |
--------------------------------------------------------------------------------
/index_database.py:
--------------------------------------------------------------------------------
1 | from elasticsearch import Elasticsearch
2 | from elasticsearch.helpers import bulk
3 | from capgen import CaptionGenerator
4 | import glob
5 | import os
6 |
7 | os.environ['CUDA_VISIBLE_DEVICES'] = ''
8 | es = Elasticsearch()
9 | gencap = CaptionGenerator()
10 |
11 | def index_database():
12 |     images = glob.glob('static/database/*')
13 |     actions = []
14 |     for i, image in enumerate(images):
15 |         cap = gencap.get_caption(image)
16 |         doc = {'imgurl': image, 'description': cap}
17 |         actions.append(doc)
18 |     bulk(es, actions, index="desearch", doc_type="json")
19 |
20 | if __name__ == "__main__":
21 |     index_database()
22 |     print('Images from static/database are indexed successfully')
--------------------------------------------------------------------------------
/templates/home.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 | {% block content %}
3 |
[The HTML markup of the templates was lost in extraction. The visible copy from home.html reads: "SmartSearch is a reverse image search engine built on top of TensorFlow and Elasticsearch. It generates image captions to find similar images." Of the remaining templates (database.html, caption.html, search.html, layout.html) and of Dockerfile, LICENSE.md, and capgen.py, only stray fragments such as a "{{caption}}" placeholder survive.]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
10 |
11 |
12 |
13 |
14 |
17 | A reverse image search engine powered by Elasticsearch and TensorFlow
18 |
19 | Explore the docs »
20 |
21 |
22 | View Demo
23 | ·
24 | Report Bug
25 | ·
26 | Request Feature
27 |
57 |
58 |
59 |
60 | ## Tips
61 |
62 | - Install Elasticsearch and always make sure the Elasticsearch process is running before launching `server.py` or `index_database.py`; a quick connectivity check is sketched after this list.
63 |
64 | - Instead of using the upload functionality, you can copy all your images into the `static/database` folder and run `python index_database.py` to index them.
65 |
66 | - If you want to delete the indexed images, run `sh delete_index.sh`.
67 |
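A minimal connectivity check, sketched with the same `elasticsearch` client the project already pins in `requirements.txt` (the host and port are the client's defaults, which match the `docker-compose.yaml` above):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch()  # defaults to localhost:9200
if not es.ping():
    raise SystemExit('Elasticsearch is not reachable; start it before server.py or index_database.py')
```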
68 | # About
69 |
70 |
71 |
72 | ## Roadmap
73 |
74 | See the [open issues](https://github.com/wx-chevalier/tensorflow-image-search/issues) for a list of proposed features (and known issues).
75 |
76 |
77 |
78 | ## Contributing
79 |
80 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**.
81 |
82 | 1. Fork the Project
83 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
84 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
85 | 4. Push to the Branch (`git push origin feature/AmazingFeature`)
86 | 5. Open a Pull Request
87 |
88 |
89 |
90 | ## License
91 |
92 | Distributed under the MIT License. See `LICENSE` for more information.
93 |
94 |
95 |
96 | ## Acknowledgements
97 |
98 | - [sethuiyer/Image-to-Image-search](https://github.com/sethuiyer/Image-to-Image-search)
99 |
100 | - [Sis](https://github.com/matsui528/sis)
101 |
102 | ## Copyright & More | 延伸阅读
103 |
104 | All of my articles follow the [Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License](https://creativecommons.org/licenses/by-nc-nd/4.0/deed.zh); you are welcome to repost them as long as the copyright is respected. If this series has helped you, feel free to buy my dog Pudding some dog food (Alipay QR code) ~
105 |
106 | 
107 |
108 | You can also visit the [NGTE Books](https://ng-tech.icu/books/) home page to browse book lists covering knowledge systems, programming languages, software engineering, patterns and architecture, Web and front-end, server-side development practice and engineering architecture, distributed infrastructure, AI and deep learning, product operations and entrepreneurship, and more:
109 |
110 | 
111 |
112 |
113 |
114 |
115 | [contributors-shield]: https://img.shields.io/github/contributors/wx-chevalier/tensorflow-image-search.svg?style=flat-square
116 | [contributors-url]: https://github.com/wx-chevalier/tensorflow-image-search/graphs/contributors
117 | [forks-shield]: https://img.shields.io/github/forks/wx-chevalier/tensorflow-image-search.svg?style=flat-square
118 | [forks-url]: https://github.com/wx-chevalier/tensorflow-image-search/network/members
119 | [stars-shield]: https://img.shields.io/github/stars/wx-chevalier/tensorflow-image-search.svg?style=flat-square
120 | [stars-url]: https://github.com/wx-chevalier/tensorflow-image-search/stargazers
121 | [issues-shield]: https://img.shields.io/github/issues/wx-chevalier/tensorflow-image-search.svg?style=flat-square
122 | [issues-url]: https://github.com/wx-chevalier/tensorflow-image-search/issues
123 | [license-shield]: https://img.shields.io/github/license/wx-chevalier/tensorflow-image-search.svg?style=flat-square
124 | [license-url]: https://github.com/wx-chevalier/tensorflow-image-search/blob/master/LICENSE.txt
125 |
--------------------------------------------------------------------------------
/imagernn/solver.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 | from imagernn.utils import randi
4 |
5 | class Solver:
6 | """
7 | solver worries about:
8 | - different optimization methods, updates, weight decays
9 | - it can also perform gradient check
10 | """
11 | def __init__(self):
12 | self.step_cache_ = {} # might need this
13 | self.step_cache2_ = {} # might need this
14 |
15 | def step(self, batch, model, cost_function, **kwargs):
16 | """
17 | perform a single batch update. Takes as input:
18 | - batch of data (X)
19 | - model (W)
20 | - cost function which takes batch, model
21 | """
22 |
23 | learning_rate = kwargs.get('learning_rate', 0.0)
24 | update = kwargs.get('update', model.keys())
25 | grad_clip = kwargs.get('grad_clip', -1)
26 | solver = kwargs.get('solver', 'vanilla')
27 | momentum = kwargs.get('momentum', 0)
28 | smooth_eps = kwargs.get('smooth_eps', 1e-8)
29 | decay_rate = kwargs.get('decay_rate', 0.999)
30 |
31 | if not (solver == 'vanilla' and momentum == 0):
32 | # lazily make sure we initialize step cache if needed
33 | for u in update:
34 | if not u in self.step_cache_:
35 | self.step_cache_[u] = np.zeros(model[u].shape)
36 | if solver == 'adadelta':
37 | self.step_cache2_[u] = np.zeros(model[u].shape) # adadelta needs one more cache
38 |
39 | # compute cost and gradient
40 | cg = cost_function(batch, model)
41 | cost = cg['cost']
42 | grads = cg['grad']
43 | stats = cg['stats']
44 |
45 | # clip gradients if needed, simplest possible version
46 | # todo later: maybe implement the gradient direction conserving version
47 | if grad_clip > 0:
48 | for p in update:
49 | if p in grads:
50 | grads[p] = np.minimum(grads[p], grad_clip)
51 | grads[p] = np.maximum(grads[p], -grad_clip)
52 |
53 | # perform parameter update
54 | for p in update:
55 | if p in grads:
56 |
57 | if solver == 'vanilla': # vanilla sgd, optional with momentum
58 | if momentum > 0:
59 | dx = momentum * self.step_cache_[p] - learning_rate * grads[p]
60 | self.step_cache_[p] = dx
61 | else:
62 | dx = - learning_rate * grads[p]
63 |
64 | elif solver == 'rmsprop':
65 | self.step_cache_[p] = self.step_cache_[p] * decay_rate + (1.0 - decay_rate) * grads[p] ** 2
66 | dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache_[p] + smooth_eps)
67 |
68 | elif solver == 'adagrad':
69 | self.step_cache_[p] += grads[p] ** 2
70 | dx = -(learning_rate * grads[p]) / np.sqrt(self.step_cache_[p] + smooth_eps)
71 |
72 | elif solver == 'adadelta':
73 | self.step_cache_[p] = self.step_cache_[p] * decay_rate + (1.0 - decay_rate) * grads[p] ** 2
74 | dx = - np.sqrt( (self.step_cache2_[p] + smooth_eps) / (self.step_cache_[p] + smooth_eps) ) * grads[p]
75 | self.step_cache2_[p] = self.step_cache2_[p] * decay_rate + (1.0 - decay_rate) * (dx ** 2)
76 |
77 | else:
78 | raise Exception("solver %s not supported" % (solver, ))
79 |
80 | # perform the parameter update
81 | model[p] += dx
82 |
83 | # create output dict and return
84 | out = {}
85 | out['cost'] = cost
86 | out['stats'] = stats
87 | return out
88 |
89 | def gradCheck(self, batch, model, cost_function, **kwargs):
90 | """
91 | perform gradient check.
92 | since gradcheck can be tricky (especially with relus involved)
93 | this function prints to console for visual inspection
94 | """
95 |
96 | num_checks = kwargs.get('num_checks', 10)
97 | delta = kwargs.get('delta', 1e-5)
98 | rel_error_thr_warning = kwargs.get('rel_error_thr_warning', 1e-2)
99 | rel_error_thr_error = kwargs.get('rel_error_thr_error', 1)
100 |
101 | cg = cost_function(batch, model)
102 |
103 | print('running gradient check...')
104 | for p in model.keys():
105 | print('checking gradient on parameter %s of shape %s...' % (p, model[p].shape))
106 | mat = model[p]
107 |
108 | s0 = cg['grad'][p].shape
109 | s1 = mat.shape
110 | assert s0 == s1, "Error: dims don't match: %s and %s." % (s0, s1)
111 |
112 | for i in range(num_checks):
113 | ri = randi(mat.size)
114 |
115 | # evaluate cost at [x + delta] and [x - delta]
116 | old_val = mat.flat[ri]
117 | mat.flat[ri] = old_val + delta
118 | cg0 = cost_function(batch, model)
119 | mat.flat[ri] = old_val - delta
120 | cg1 = cost_function(batch, model)
121 | mat.flat[ri] = old_val # reset old value for this parameter
122 |
123 | # fetch both numerical and analytic gradient
124 | grad_analytic = cg['grad'][p].flat[ri]
125 | grad_numerical = (cg0['cost']['total_cost'] - cg1['cost']['total_cost']) / ( 2 * delta )
126 |
127 | # compare them
128 | if grad_numerical == 0 and grad_analytic == 0:
129 | rel_error = 0 # both are zero, OK.
130 | status = 'OK'
131 | elif abs(grad_numerical) < 1e-7 and abs(grad_analytic) < 1e-7:
132 | rel_error = 0 # not enough precision to check this
133 | status = 'VAL SMALL WARNING'
134 | else:
135 | rel_error = abs(grad_analytic - grad_numerical) / abs(grad_numerical + grad_analytic)
136 | status = 'OK'
137 | if rel_error > rel_error_thr_warning: status = 'WARNING'
138 | if rel_error > rel_error_thr_error: status = '!!!!! NOTOK'
139 |
140 | # print stats
141 | print('%s checking param %s index %8d (val = %+8f), analytic = %+8f, numerical = %+8f, relative error = %+8f'
142 |       % (status, p, ri, old_val, grad_analytic, grad_numerical, rel_error))
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
--------------------------------------------------------------------------------
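As a sanity check on the update rules above, here is a minimal sketch of driving `Solver.step` with a toy quadratic cost. The `cost_function` contract (a dict with `'cost'`, `'grad'`, and `'stats'`) follows the docstring; the model and batch here are stand-ins, not part of the repo:

```python
import numpy as np
from imagernn.solver import Solver

# Toy cost: 0.5 * ||W||^2, whose gradient with respect to W is simply W.
def cost_function(batch, model):
    return {'cost': {'total_cost': 0.5 * np.sum(model['W'] ** 2)},
            'grad': {'W': model['W'].copy()},
            'stats': {}}

model = {'W': np.random.randn(4, 4)}
solver = Solver()
for _ in range(100):
    out = solver.step(None, model, cost_function,
                      learning_rate=0.1, solver='rmsprop', grad_clip=5.0)
print(out['cost']['total_cost'])  # should have shrunk toward zero
```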
/imagernn/generic_batch_generator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import code
3 | from imagernn.utils import merge_init_structs, initw, accumNpDicts
4 | from imagernn.lstm_generator import LSTMGenerator
5 | from imagernn.rnn_generator import RNNGenerator
6 |
7 | def decodeGenerator(generator):
8 | if generator == 'lstm':
9 | return LSTMGenerator
10 | if generator == 'rnn':
11 | return RNNGenerator
12 | else:
13 | raise Exception('generator %s is not yet supported' % (generator,))
14 |
15 | class GenericBatchGenerator:
16 | """
17 | Base batch generator class.
18 | This class is aware of the fact that we are generating
19 | sentences from images.
20 | """
21 |
22 | @staticmethod
23 | def init(params, misc):
24 |
25 | # inputs
26 | image_encoding_size = params.get('image_encoding_size', 128)
27 | word_encoding_size = params.get('word_encoding_size', 128)
28 | hidden_size = params.get('hidden_size', 128)
29 | generator = params.get('generator', 'lstm')
30 | vocabulary_size = len(misc['wordtoix'])
31 | output_size = len(misc['ixtoword']) # these should match though
32 | image_size = 4096 # size of CNN vectors hardcoded here
33 |
34 | if generator == 'lstm':
35 | assert image_encoding_size == word_encoding_size, 'this implementation does not support different sizes for these parameters'
36 |
37 | # initialize the encoder models
38 | model = {}
39 | model['We'] = initw(image_size, image_encoding_size) # image encoder
40 | model['be'] = np.zeros((1,image_encoding_size))
41 | model['Ws'] = initw(vocabulary_size, word_encoding_size) # word encoder
42 | update = ['We', 'be', 'Ws']
43 | regularize = ['We', 'Ws']
44 | init_struct = { 'model' : model, 'update' : update, 'regularize' : regularize}
45 |
46 | # descend into the specific Generator and initialize it
47 | Generator = decodeGenerator(generator)
48 | generator_init_struct = Generator.init(word_encoding_size, hidden_size, output_size)
49 | merge_init_structs(init_struct, generator_init_struct)
50 | return init_struct
51 |
52 | @staticmethod
53 | def forward(batch, model, params, misc, predict_mode = False):
54 | """ iterates over items in the batch and calls generators on them """
55 | # we do the encoding here across all images/words in batch in single matrix
56 | # multiplies to gain efficiency. The RNNs are then called individually
57 | # in for loop on per-image-sentence pair and all they are concerned about is
58 | # taking single matrix of vectors and doing the forward/backward pass without
59 | # knowing anything about images, sentences or anything of that sort.
60 |
61 | # encode all images
62 | # concatenate as rows. If N is number of image-sentence pairs,
63 | # F will be N x image_size
64 | F = np.row_stack([x['image']['feat'] for x in batch])
65 | We = model['We']
66 | be = model['be']
67 | Xe = F.dot(We) + be # Xe becomes N x image_encoding_size
68 |
69 | # decode the generator we wish to use
70 | generator_str = params.get('generator', 'lstm')
71 | Generator = decodeGenerator(generator_str)
72 |
73 | # encode all words in all sentences (which exist in our vocab)
74 | wordtoix = misc['wordtoix']
75 | Ws = model['Ws']
76 | gen_caches = []
77 | Ys = [] # outputs
78 | for i,x in enumerate(batch):
79 | # take all words in this sentence and pluck out their word vectors
80 | # from Ws. Then arrange them in a single matrix Xs
81 | # Note that we are setting the start token as first vector
82 | # and then all the words afterwards. And start token is the first row of Ws
83 | ix = [0] + [ wordtoix[w] for w in x['sentence']['tokens'] if w in wordtoix ]
84 | Xs = np.row_stack( [Ws[j, :] for j in ix] )
85 | Xi = Xe[i,:]
86 |
87 | # forward prop through the RNN
88 | gen_Y, gen_cache = Generator.forward(Xi, Xs, model, params, predict_mode = predict_mode)
89 | gen_caches.append((ix, gen_cache))
90 | Ys.append(gen_Y)
91 |
92 | # back up information we need for efficient backprop
93 | cache = {}
94 | if not predict_mode:
95 | # ok we need cache as well because we'll do backward pass
96 | cache['gen_caches'] = gen_caches
97 | cache['Xe'] = Xe
98 | cache['Ws_shape'] = Ws.shape
99 | cache['F'] = F
100 | cache['generator_str'] = generator_str
101 |
102 | return Ys, cache
103 |
104 | @staticmethod
105 | def backward(dY, cache):
106 | Xe = cache['Xe']
107 | generator_str = cache['generator_str']
108 | dWs = np.zeros(cache['Ws_shape'])
109 | gen_caches = cache['gen_caches']
110 | F = cache['F']
111 | dXe = np.zeros(Xe.shape)
112 |
113 | Generator = decodeGenerator(generator_str)
114 |
115 | # backprop each item in the batch
116 | grads = {}
117 | for i in range(len(gen_caches)):
118 | ix, gen_cache = gen_caches[i] # unpack
119 | local_grads = Generator.backward(dY[i], gen_cache)
120 | dXs = local_grads['dXs'] # intercept the gradients wrt Xi and Xs
121 | del local_grads['dXs']
122 | dXi = local_grads['dXi']
123 | del local_grads['dXi']
124 | accumNpDicts(grads, local_grads) # add up the gradients wrt model parameters
125 |
126 | # now backprop from dXs to the image vector and word vectors
127 | dXe[i,:] += dXi # image vector
128 | for n,j in enumerate(ix): # and now all the other words
129 | dWs[j,:] += dXs[n,:]
130 |
131 | # finally backprop into the image encoder
132 | dWe = F.transpose().dot(dXe)
133 | dbe = np.sum(dXe, axis=0, keepdims = True)
134 |
135 | accumNpDicts(grads, { 'We':dWe, 'be':dbe, 'Ws':dWs })
136 | return grads
137 |
138 | @staticmethod
139 | def predict(batch, model, params, **kwparams):
140 | """ some code duplication here with forward pass, but I think we want the freedom in future """
141 | F = np.row_stack([x['image']['feat'] for x in batch])
142 | We = model['We']
143 | be = model['be']
144 | Xe = F.dot(We) + be # Xe becomes N x image_encoding_size
145 | generator_str = params['generator']
146 | Generator = decodeGenerator(generator_str)
147 | Ys = []
148 | for i,x in enumerate(batch):
149 | gen_Y = Generator.predict(Xe[i, :], model, model['Ws'], params, **kwparams)
150 | Ys.append(gen_Y)
151 | return Ys
152 |
153 |
154 |
--------------------------------------------------------------------------------
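To make the expected data layout concrete, here is a hedged sketch of one init/forward round trip. It assumes `imagernn/utils.py` (whose content is not shown in this dump) provides the helpers the module imports, and the 4096-d random vector stands in for a real CNN feature:

```python
import numpy as np
from imagernn.generic_batch_generator import GenericBatchGenerator

# Tiny vocabulary; index 0 is reserved for the START/END token.
misc = {'wordtoix': {'#START#': 0, 'a': 1, 'dog': 2},
        'ixtoword': {0: '#START#', 1: 'a', 2: 'dog'}}
params = {'generator': 'lstm', 'image_encoding_size': 128,
          'word_encoding_size': 128, 'hidden_size': 128}

model = GenericBatchGenerator.init(params, misc)['model']

# One image-sentence pair, shaped the way forward() expects.
batch = [{'image': {'feat': np.random.randn(4096)},
          'sentence': {'tokens': ['a', 'dog']}}]
Ys, cache = GenericBatchGenerator.forward(batch, model, params, misc)
print(Ys[0].shape)  # (3, 3): start token + 2 words, one column per vocab entry
```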
/server.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 |
4 | from PIL import Image
5 | from elasticsearch import Elasticsearch
6 | from elasticsearch.helpers import bulk
7 | from flask import Flask, render_template, request, Response
8 | from werkzeug.utils import secure_filename
9 | import json
10 |
11 | from capgen import CaptionGenerator
12 |
13 | os.environ['CUDA_VISIBLE_DEVICES'] = ''
14 | os.environ['KMP_DUPLICATE_LIB_OK']='True'
15 | es = Elasticsearch()
16 | gencap = CaptionGenerator()
17 |
18 |
19 | def description_search(query):
20 | global es
21 | results = es.search(
22 | index="desearch",
23 | body={
24 | "size": 20,
25 | "query": {
26 | "match": {"description": query}
27 | }
28 | })
29 |
30 | hitCount = results['hits']['total']
31 |
32 | if hitCount > 0:
33 | if hitCount == 1:
34 | print(str(hitCount), ' result')
35 | else:
36 | print(str(hitCount), 'results')
37 | answers = []
38 | max_score = results['hits']['max_score']
39 |
40 | if max_score >= 0.35:
41 | for hit in results['hits']['hits']:
42 | if hit['_score'] > 0.5 * max_score:
43 | desc = hit['_source']['description']
44 | imgurl = hit['_source']['imgurl']
45 | answers.append([imgurl, desc])
46 | else:
47 | answers = []
48 | return answers
49 |
50 |
51 | app = Flask(__name__)
52 | app.config['UPLOAD_FOLDER'] = os.path.join('static', 'database')
53 | app.config['TEMP_UPLOAD_FOLDER'] = os.path.join('static', 'uploads')
54 | app.config['ALLOWED_EXTENSIONS'] = set(['jpg', 'jpeg', 'png'])
55 |
56 |
57 | def allowed_file(filename):
58 | return '.' in filename and \
59 | filename.rsplit('.', 1)[1].lower() in app.config['ALLOWED_EXTENSIONS']
60 |
61 |
62 | @app.route('/')
63 | def index():
64 | return render_template('home.html')
65 |
66 |
67 | @app.route('/search', methods=['GET', 'POST'])
68 | def search():
69 | global gencap
70 | if request.method == 'POST':
71 | if 'query_img' not in request.files or request.files['query_img'].filename == '' or not allowed_file(
72 | request.files['query_img'].filename):
73 | return render_template('search.html')
74 | file = request.files['query_img']
75 | img = Image.open(file.stream) # PIL image
76 | uploaded_img_path = os.path.join(app.config['TEMP_UPLOAD_FOLDER'], secure_filename(file.filename))
77 | img.save(uploaded_img_path)
78 | query = gencap.get_caption(uploaded_img_path)
79 | answers = description_search(query)
80 |
81 | return render_template('search.html',
82 | query_path=uploaded_img_path,
83 | answers=answers)
84 | else:
85 | return render_template('search.html')
86 |
87 |
88 | @app.route('/api/search', methods=['POST'])
89 | def api_search():
90 | global gencap
91 | if 'query_img' not in request.files or request.files['query_img'].filename == '' or not allowed_file(
92 | request.files['query_img'].filename):
93 | return Response(response=json.dumps({'success': False, 'message': 'Uploaded image is invalid or not allowed'}),
94 | status=400, mimetype="application/json")
95 | file = request.files['query_img']
96 | img = Image.open(file.stream) # PIL image
97 | uploaded_img_path = os.path.join(app.config['TEMP_UPLOAD_FOLDER'], secure_filename(file.filename))
98 | img.save(uploaded_img_path)
99 | query = gencap.get_caption(uploaded_img_path)
100 | answers = description_search(query)
101 |
102 | return Response(response=json.dumps({'success': True, 'answers': answers}),
103 | status=200, mimetype="application/json")
104 |
105 |
106 | @app.route('/database')
107 | def database():
108 | images = glob.glob(os.path.join(app.config['UPLOAD_FOLDER'], '*'))
109 | return render_template('database.html', database_images=images)
110 |
111 |
112 | @app.route('/upload', methods=['GET', 'POST'])
113 | def upload():
114 | if request.method == 'POST':
115 | if 'photos' not in request.files:
116 | return render_template('database.html')
117 | actions = []
118 | for file in request.files.getlist('photos'):
119 | if file and allowed_file(file.filename):
120 | filename = secure_filename(file.filename)
121 | file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
122 | file.save(file_path)
123 | cap = gencap.get_caption(file_path)
124 | doc = {'imgurl': file_path, 'description': cap}
125 | actions.append(doc)
126 | bulk(es, actions, index="desearch", doc_type="json")
127 | return render_template('database.html')
128 |
129 |
130 | @app.route('/caption', methods=['GET', 'POST'])
131 | def caption():
132 | if request.method == 'POST':
133 | if 'query_img' not in request.files or request.files['query_img'].filename == '' or not allowed_file(
134 | request.files['query_img'].filename):
135 | return render_template('caption.html')
136 | file = request.files['query_img']
137 | img = Image.open(file.stream) # PIL image
138 | uploaded_img_path = os.path.join(app.config['TEMP_UPLOAD_FOLDER'], secure_filename(file.filename))
139 | img.save(uploaded_img_path)
140 | cap = gencap.get_caption(uploaded_img_path)
141 | return render_template('caption.html', caption=cap, query_path=uploaded_img_path)
142 | else:
143 | return render_template('caption.html')
144 |
145 |
146 | @app.route('/api/caption', methods=['POST'])
147 | def caption_api():
148 | if 'query_img' not in request.files or request.files['query_img'].filename == '' or not allowed_file(
149 | request.files['query_img'].filename):
150 | return Response(response=json.dumps({'success': False, 'message': 'Uploaded image is invalid or not allowed'}),
151 | status=400, mimetype="application/json")
152 | file = request.files['query_img']
153 | img = Image.open(file.stream) # PIL image
155 | uploaded_img_path = os.path.join(app.config['TEMP_UPLOAD_FOLDER'], secure_filename(file.filename))
155 | img.save(uploaded_img_path)
156 | cap = gencap.get_caption(uploaded_img_path)
157 | return Response(response=json.dumps({'success': True, 'caption': cap}),
158 | status=200, mimetype="application/json")
159 |
160 |
161 | if __name__ == "__main__":
162 | app.run(host="0.0.0.0", port=5000)
163 |
--------------------------------------------------------------------------------
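The two JSON endpoints above can be exercised from any HTTP client. A sketch using the third-party `requests` package (not listed in `requirements.txt`, so an extra install is assumed):

```python
import requests

# Caption one of the bundled sample images via the running Flask server.
with open('static/database/img1.jpg', 'rb') as f:
    resp = requests.post('http://localhost:5000/api/caption',
                         files={'query_img': ('img1.jpg', f, 'image/jpeg')})
print(resp.json())  # e.g. {'success': True, 'caption': '...'}
```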
/imagernn/rnn_generator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import code
3 |
4 | from imagernn.utils import initw
5 |
6 | class RNNGenerator:
7 | """
8 | An RNN generator.
9 | This class is as stupid as possible. It gets some conditioning vector,
10 | a sequence of input vectors, and produces a sequence of output vectors
11 | """
12 |
13 | @staticmethod
14 | def init(input_size, hidden_size, output_size):
15 |
16 | model = {}
17 | # connections to x_t
18 | model['Wxh'] = initw(input_size, hidden_size)
19 | model['bxh'] = np.zeros((1, hidden_size))
20 | # connections to h_{t-1}
21 | model['Whh'] = initw(hidden_size, hidden_size)
22 | model['bhh'] = np.zeros((1, hidden_size))
23 | # Decoder weights (e.g. mapping to vocabulary)
24 | model['Wd'] = initw(hidden_size, output_size) * 0.1 # decoder
25 | model['bd'] = np.zeros((1, output_size))
26 |
27 | update = ['Whh', 'bhh', 'Wxh', 'bxh', 'Wd', 'bd']
28 | regularize = ['Whh', 'Wxh', 'Wd']
29 | return { 'model' : model, 'update' : update, 'regularize' : regularize }
30 |
31 | @staticmethod
32 | def forward(Xi, Xs, model, params, **kwargs):
33 | """
34 | Xi is 1-d array of size D1 (containing the image representation)
35 | Xs is N x D2 (N time steps, rows are data containing word representations), and
36 | it is assumed that the first row is already filled in as the start token. So a
37 | sentence with 10 words will be of size 11xD2 in Xs.
38 | """
39 | predict_mode = kwargs.get('predict_mode', False)
40 |
41 | # options
42 | drop_prob_encoder = params.get('drop_prob_encoder', 0.0)
43 | drop_prob_decoder = params.get('drop_prob_decoder', 0.0)
44 | relu_encoders = params.get('rnn_relu_encoders', 0)
45 | rnn_feed_once = params.get('rnn_feed_once', 0)
46 |
47 | if drop_prob_encoder > 0: # if we want dropout on the encoder
48 | # inverted version of dropout here. Suppose the drop_prob is 0.5, then during training
49 | # we are going to drop half of the units. In this inverted version we also boost the activations
50 | # of the remaining 50% by 2.0 (scale). The nice property of this is that during prediction time
51 | we don't have to do any scaling, since all 100% of units will be active, but at their base
52 | firing rate, giving 100% of the "energy". So the neurons later in the pipeline don't change
53 | # their expected firing rate magnitudes
54 | if not predict_mode: # and we are in training mode
55 | scale = 1.0 / (1.0 - drop_prob_encoder)
56 | Us = (np.random.rand(*(Xs.shape)) < (1 - drop_prob_encoder)) * scale # generate scaled mask
57 | Xs *= Us # drop!
58 | Ui = (np.random.rand(*(Xi.shape)) < (1 - drop_prob_encoder)) * scale
59 | Xi *= Ui # drop!
60 |
61 | # encode input vectors
62 | Wxh = model['Wxh']
63 | bxh = model['bxh']
64 | Xsh = Xs.dot(Wxh) + bxh
65 |
66 | if relu_encoders:
67 | Xsh = np.maximum(Xsh, 0)
68 | Xi = np.maximum(Xi, 0)
69 |
70 | # recurrence iteration for the Multimodal RNN similar to one described in Karpathy et al.
71 | d = model['Wd'].shape[0] # size of hidden layer
72 | n = Xs.shape[0]
73 | H = np.zeros((n, d)) # hidden layer representation
74 | Whh = model['Whh']
75 | bhh = model['bhh']
76 | for t in range(n):
77 |
78 | prev = np.zeros(d) if t == 0 else H[t-1]
79 | if not rnn_feed_once or t == 0:
80 | # feed the image in if rnn_feed_once is false. And if it is true, then
81 | # only feed the image in if it's the first iteration
82 | H[t] = np.maximum(Xi + Xsh[t] + prev.dot(Whh) + bhh, 0) # also ReLU
83 | else:
84 | H[t] = np.maximum(Xsh[t] + prev.dot(Whh) + bhh, 0) # also ReLU
85 |
86 | if drop_prob_decoder > 0: # if we want dropout on the decoder
87 | if not predict_mode: # and we are in training mode
88 | scale2 = 1.0 / (1.0 - drop_prob_decoder)
89 | U2 = (np.random.rand(*(H.shape)) < (1 - drop_prob_decoder)) * scale2 # generate scaled mask
90 | H *= U2 # drop!
91 |
92 | # decoder at the end
93 | Wd = model['Wd']
94 | bd = model['bd']
95 | Y = H.dot(Wd) + bd
96 |
97 | cache = {}
98 | if not predict_mode:
99 | # we can expect to do a backward pass
100 | cache['Whh'] = Whh
101 | cache['H'] = H
102 | cache['Wd'] = Wd
103 | cache['Xs'] = Xs
104 | cache['Xsh'] = Xsh
105 | cache['Wxh'] = Wxh
106 | cache['Xi'] = Xi
107 | cache['relu_encoders'] = relu_encoders
108 | cache['drop_prob_encoder'] = drop_prob_encoder
109 | cache['drop_prob_decoder'] = drop_prob_decoder
110 | cache['rnn_feed_once'] = rnn_feed_once
111 | if drop_prob_encoder > 0:
112 | cache['Us'] = Us # keep the dropout masks around for backprop
113 | cache['Ui'] = Ui
114 | if drop_prob_decoder > 0: cache['U2'] = U2
115 |
116 | return Y, cache
117 |
118 | @staticmethod
119 | def backward(dY, cache):
120 |
121 | Wd = cache['Wd']
122 | H = cache['H']
123 | Xs = cache['Xs']
124 | Xsh = cache['Xsh']
125 | Whh = cache['Whh']
126 | Wxh = cache['Wxh']
127 | Xi = cache['Xi']
128 | drop_prob_encoder = cache['drop_prob_encoder']
129 | drop_prob_decoder = cache['drop_prob_decoder']
130 | relu_encoders = cache['relu_encoders']
131 | rnn_feed_once = cache['rnn_feed_once']
132 | n,d = H.shape
133 |
134 | # backprop the decoder
135 | dWd = H.transpose().dot(dY)
136 | dbd = np.sum(dY, axis=0, keepdims = True)
137 | dH = dY.dot(Wd.transpose())
138 |
139 | # backprop dropout, if it was applied
140 | if drop_prob_decoder > 0:
141 | dH *= cache['U2']
142 |
143 | # backprop the recurrent connections
144 | dXsh = np.zeros(Xsh.shape)
145 | dXi = np.zeros(d)
146 | dWhh = np.zeros(Whh.shape)
147 | dbhh = np.zeros((1,d))
148 | for t in reversed(range(n)):
149 | dht = (H[t] > 0) * dH[t] # backprop ReLU
150 |
151 | if not rnn_feed_once or t == 0:
152 | dXi += dht # backprop to Xi
153 |
154 | dXsh[t] += dht # backprop to word encodings
155 | dbhh[0] += dht # backprop to bias
156 |
157 | if t > 0:
158 | dH[t-1] += dht.dot(Whh.transpose())
159 | dWhh += np.outer(H[t-1], dht)
160 |
161 | if relu_encoders:
162 | # backprop relu
163 | dXsh[Xsh <= 0] = 0
164 | dXi[Xi <= 0] = 0
165 |
166 | # backprop the word encoder
167 | dWxh = Xs.transpose().dot(dXsh)
168 | dbxh = np.sum(dXsh, axis=0, keepdims = True)
169 | dXs = dXsh.dot(Wxh.transpose())
170 |
171 | if drop_prob_encoder > 0: # backprop encoder dropout
172 | dXi *= cache['Ui']
173 | dXs *= cache['Us']
174 |
175 | return { 'Whh': dWhh, 'bhh': dbhh, 'Wd': dWd, 'bd': dbd, 'Wxh':dWxh, 'bxh':dbxh, 'dXs' : dXs, 'dXi': dXi }
176 |
177 | @staticmethod
178 | def predict(Xi, model, Ws, params, **kwargs):
179 |
180 | beam_size = kwargs.get('beam_size', 1)
181 | relu_encoders = params.get('rnn_relu_encoders', 0)
182 | rnn_feed_once = params.get('rnn_feed_once', 0)
183 |
184 | d = model['Wd'].shape[0] # size of hidden layer
185 | Whh = model['Whh']
186 | bhh = model['bhh']
187 | Wd = model['Wd']
188 | bd = model['bd']
189 | Wxh = model['Wxh']
190 | bxh = model['bxh']
191 |
192 | if relu_encoders:
193 | Xi = np.maximum(Xi, 0)
194 |
195 | if beam_size > 1:
196 | # perform beam search
197 | # NOTE: code duplication here with lstm_generator
198 | # ideally the beam search would be abstracted away nicely and would take
199 | # a TICK function or something, but for now lets save time & copy code around. Sorry ;\
200 | beams = [(0.0, [], np.zeros(d))]
201 | nsteps = 0
202 | while True:
203 | beam_candidates = []
204 | for b in beams:
205 | ixprev = b[1][-1] if b[1] else 0
206 | if ixprev == 0 and b[1]:
207 | # this beam predicted end token. Keep in the candidates but don't expand it out any more
208 | beam_candidates.append(b)
209 | continue
210 | # tick the RNN for this beam
211 | Xsh = Ws[ixprev].dot(Wxh) + bxh
212 | if relu_encoders:
213 | Xsh = np.maximum(Xsh, 0)
214 |
215 | if (not rnn_feed_once) or (not b[1]):
216 | h1 = np.maximum(Xi + Xsh + b[2].dot(Whh) + bhh, 0)
217 | else:
218 | h1 = np.maximum(Xsh + b[2].dot(Whh) + bhh, 0)
219 |
220 | y1 = h1.dot(Wd) + bd
221 |
222 | # compute new candidates that expand out from this beam
223 | y1 = y1.ravel() # make into 1D vector
224 | maxy1 = np.amax(y1)
225 | e1 = np.exp(y1 - maxy1) # for numerical stability shift into good numerical range
226 | p1 = e1 / np.sum(e1)
227 | y1 = np.log(1e-20 + p1) # and back to log domain
228 | top_indices = np.argsort(-y1) # we do -y because we want decreasing order
229 | for i in range(beam_size):
230 | wordix = top_indices[i]
231 | beam_candidates.append((b[0] + y1[wordix], b[1] + [wordix], h1))
232 |
233 | beam_candidates.sort(reverse = True) # decreasing order
234 | beams = beam_candidates[:beam_size] # truncate to get new beams
235 | nsteps += 1
236 | if nsteps >= 20: # bad things are probably happening, break out
237 | break
238 | # strip the intermediates
239 | predictions = [(b[0], b[1]) for b in beams]
240 |
241 | else:
242 | ixprev = 0 # start out on start token
243 | nsteps = 0
244 | predix = []
245 | predlogprob = 0.0
246 | hprev = np.zeros((1, d)) # hidden layer representation
247 | xsprev = Ws[0] # start token
248 | while True:
249 | Xsh = Ws[ixprev].dot(Wxh) + bxh
250 | if relu_encoders:
251 | Xsh = np.maximum(Xsh, 0)
252 |
253 | if (not rnn_feed_once) or (nsteps == 0):
254 | ht = np.maximum(Xi + Xsh + hprev.dot(Whh) + bhh, 0)
255 | else:
256 | ht = np.maximum(Xsh + hprev.dot(Whh) + bhh, 0)
257 |
258 | Y = ht.dot(Wd) + bd
259 | hprev = ht
260 |
261 | ixprev, ixlogprob = ymax(Y)
262 | predix.append(ixprev)
263 | predlogprob += ixlogprob
264 |
265 | nsteps += 1
266 | if ixprev == 0 or nsteps >= 20:
267 | break
268 | predictions = [(predlogprob, predix)]
269 | return predictions
270 |
271 |
272 | def ymax(y):
273 | """ simple helper function here that takes unnormalized logprobs """
274 | y1 = y.ravel() # make sure 1d
275 | maxy1 = np.amax(y1)
276 | e1 = np.exp(y1 - maxy1) # for numerical stability shift into good numerical range
277 | p1 = e1 / np.sum(e1)
278 | y1 = np.log(1e-20 + p1) # guard against zero probabilities just in case
279 | ix = np.argmax(y1)
280 | return (ix, y1[ix])
281 |
--------------------------------------------------------------------------------
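A tiny worked example of the `ymax` helper above, with values chosen by hand to be easy to check:

```python
import numpy as np
from imagernn.rnn_generator import ymax

y = np.array([2.0, 1.0, 0.1])            # unnormalized scores
ix, logprob = ymax(y)
print(ix)                                # 0, the highest-scoring index
print(round(float(np.exp(logprob)), 2))  # ~0.66, its softmax probability
```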
/imagernn/lstm_generator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import code
3 |
4 | from imagernn.utils import initw
5 |
6 | class LSTMGenerator:
7 | """
8 | A multimodal long short-term memory (LSTM) generator
9 | """
10 |
11 | @staticmethod
12 | def init(input_size, hidden_size, output_size):
13 |
14 | model = {}
15 | # Recurrent weights: take x_t, h_{t-1}, and bias unit
16 | # and produce the 3 gates and the input to cell signal
17 | model['WLSTM'] = initw(input_size + hidden_size + 1, 4 * hidden_size)
18 | # Decoder weights (e.g. mapping to vocabulary)
19 | model['Wd'] = initw(hidden_size, output_size) # decoder
20 | model['bd'] = np.zeros((1, output_size))
21 |
22 | update = ['WLSTM', 'Wd', 'bd']
23 | regularize = ['WLSTM', 'Wd']
24 | return { 'model' : model, 'update' : update, 'regularize' : regularize }
25 |
26 | @staticmethod
27 | def forward(Xi, Xs, model, params, **kwargs):
28 | """
29 | Xi is 1-d array of size D (containing the image representation)
30 | Xs is N x D (N time steps, rows are data containing word representations), and
31 | it is assumed that the first row is already filled in as the start token. So a
32 | sentence with 10 words will be of size 11xD in Xs.
33 | """
34 | predict_mode = kwargs.get('predict_mode', False)
35 |
36 | # Google paper concatenates the image to the word vectors as the first word vector
37 | X = np.row_stack([Xi, Xs])
38 |
39 | # options
40 | # use the version of LSTM with tanh? Otherwise don't use tanh (Google style)
41 | # following http://arxiv.org/abs/1409.3215
42 | tanhC_version = params.get('tanhC_version', 0)
43 | drop_prob_encoder = params.get('drop_prob_encoder', 0.0)
44 | drop_prob_decoder = params.get('drop_prob_decoder', 0.0)
45 |
46 | if drop_prob_encoder > 0: # if we want dropout on the encoder
47 | # inverted version of dropout here. Suppose the drop_prob is 0.5, then during training
48 | # we are going to drop half of the units. In this inverted version we also boost the activations
49 | # of the remaining 50% by 2.0 (scale). The nice property of this is that during prediction time
50 | we don't have to do any scaling, since all 100% of units will be active, but at their base
51 | firing rate, giving 100% of the "energy". So the neurons later in the pipeline don't change
52 | # their expected firing rate magnitudes
53 | if not predict_mode: # and we are in training mode
54 | scale = 1.0 / (1.0 - drop_prob_encoder)
55 | U = (np.random.rand(*(X.shape)) < (1 - drop_prob_encoder)) * scale # generate scaled mask
56 | X *= U # drop!
57 |
58 | # follows http://arxiv.org/pdf/1409.2329.pdf
59 | WLSTM = model['WLSTM']
60 | n = X.shape[0]
61 | d = model['Wd'].shape[0] # size of hidden layer
62 | Hin = np.zeros((n, WLSTM.shape[0])) # xt, ht-1, bias
63 | Hout = np.zeros((n, d))
64 | IFOG = np.zeros((n, d * 4))
65 | IFOGf = np.zeros((n, d * 4)) # after nonlinearity
66 | C = np.zeros((n, d))
67 | for t in range(n):
68 | # set input
69 | prev = np.zeros(d) if t == 0 else Hout[t-1]
70 | Hin[t,0] = 1
71 | Hin[t,1:1+d] = X[t]
72 | Hin[t,1+d:] = prev
73 |
74 | # compute all gate activations. dots:
75 | IFOG[t] = Hin[t].dot(WLSTM)
76 |
77 | # non-linearities
78 | IFOGf[t,:3*d] = 1.0/(1.0+np.exp(-IFOG[t,:3*d])) # sigmoids; these are the gates
79 | IFOGf[t,3*d:] = np.tanh(IFOG[t, 3*d:]) # tanh
80 |
81 | # compute the cell activation
82 | C[t] = IFOGf[t,:d] * IFOGf[t, 3*d:]
83 | if t > 0: C[t] += IFOGf[t,d:2*d] * C[t-1]
84 | if tanhC_version:
85 | Hout[t] = IFOGf[t,2*d:3*d] * np.tanh(C[t])
86 | else:
87 | Hout[t] = IFOGf[t,2*d:3*d] * C[t]
88 |
89 | if drop_prob_decoder > 0: # if we want dropout on the decoder
90 | if not predict_mode: # and we are in training mode
91 | scale2 = 1.0 / (1.0 - drop_prob_decoder)
92 | U2 = (np.random.rand(*(Hout.shape)) < (1 - drop_prob_decoder)) * scale2 # generate scaled mask
93 | Hout *= U2 # drop!
94 |
95 | # decoder at the end
96 | Wd = model['Wd']
97 | bd = model['bd']
98 | # NOTE1: we are leaving out the first prediction, which was made for the image
99 | # and is meaningless.
100 | Y = Hout[1:, :].dot(Wd) + bd
101 |
102 | cache = {}
103 | if not predict_mode:
104 | # we can expect to do a backward pass
105 | cache['WLSTM'] = WLSTM
106 | cache['Hout'] = Hout
107 | cache['Wd'] = Wd
108 | cache['IFOGf'] = IFOGf
109 | cache['IFOG'] = IFOG
110 | cache['C'] = C
111 | cache['X'] = X
112 | cache['Hin'] = Hin
113 | cache['tanhC_version'] = tanhC_version
114 | cache['drop_prob_encoder'] = drop_prob_encoder
115 | cache['drop_prob_decoder'] = drop_prob_decoder
116 | if drop_prob_encoder > 0: cache['U'] = U # keep the dropout masks around for backprop
117 | if drop_prob_decoder > 0: cache['U2'] = U2
118 |
119 | return Y, cache
120 |
121 | @staticmethod
122 | def backward(dY, cache):
123 |
124 | Wd = cache['Wd']
125 | Hout = cache['Hout']
126 | IFOG = cache['IFOG']
127 | IFOGf = cache['IFOGf']
128 | C = cache['C']
129 | Hin = cache['Hin']
130 | WLSTM = cache['WLSTM']
131 | X = cache['X']
132 | tanhC_version = cache['tanhC_version']
133 | drop_prob_encoder = cache['drop_prob_encoder']
134 | drop_prob_decoder = cache['drop_prob_decoder']
135 | n,d = Hout.shape
136 |
137 | # we have to add back a row of zeros, since in the forward pass
138 | # this information was not used. See NOTE1 above.
139 | dY = np.row_stack([np.zeros(dY.shape[1]), dY])
140 |
141 | # backprop the decoder
142 | dWd = Hout.transpose().dot(dY)
143 | dbd = np.sum(dY, axis=0, keepdims = True)
144 | dHout = dY.dot(Wd.transpose())
145 |
146 | # backprop dropout, if it was applied
147 | if drop_prob_decoder > 0:
148 | dHout *= cache['U2']
149 |
150 | # backprop the LSTM
151 | dIFOG = np.zeros(IFOG.shape)
152 | dIFOGf = np.zeros(IFOGf.shape)
153 | dWLSTM = np.zeros(WLSTM.shape)
154 | dHin = np.zeros(Hin.shape)
155 | dC = np.zeros(C.shape)
156 | dX = np.zeros(X.shape)
157 | for t in reversed(range(n)):
158 |
159 | if tanhC_version:
160 | tanhCt = np.tanh(C[t]) # recompute this here
161 | dIFOGf[t,2*d:3*d] = tanhCt * dHout[t]
162 | # backprop tanh non-linearity first then continue backprop
163 | dC[t] += (1-tanhCt**2) * (IFOGf[t,2*d:3*d] * dHout[t])
164 | else:
165 | dIFOGf[t,2*d:3*d] = C[t] * dHout[t]
166 | dC[t] += IFOGf[t,2*d:3*d] * dHout[t]
167 |
168 | if t > 0:
169 | dIFOGf[t,d:2*d] = C[t-1] * dC[t]
170 | dC[t-1] += IFOGf[t,d:2*d] * dC[t]
171 | dIFOGf[t,:d] = IFOGf[t, 3*d:] * dC[t]
172 | dIFOGf[t, 3*d:] = IFOGf[t,:d] * dC[t]
173 |
174 | # backprop activation functions
175 | dIFOG[t,3*d:] = (1 - IFOGf[t, 3*d:] ** 2) * dIFOGf[t,3*d:]
176 | y = IFOGf[t,:3*d]
177 | dIFOG[t,:3*d] = (y*(1.0-y)) * dIFOGf[t,:3*d]
178 |
179 | # backprop matrix multiply
180 | dWLSTM += np.outer(Hin[t], dIFOG[t])
181 | dHin[t] = dIFOG[t].dot(WLSTM.transpose())
182 |
183 | # backprop the identity transforms into Hin
184 | dX[t] = dHin[t,1:1+d]
185 | if t > 0:
186 | dHout[t-1] += dHin[t,1+d:]
187 |
188 | if drop_prob_encoder > 0: # backprop encoder dropout
189 | dX *= cache['U']
190 |
191 | return { 'WLSTM': dWLSTM, 'Wd': dWd, 'bd': dbd, 'dXi': dX[0,:], 'dXs': dX[1:,:] }
192 |
193 | @staticmethod
194 | def predict(Xi, model, Ws, params, **kwargs):
195 | """
196 | Run in prediction mode with beam search. The input is the vector Xi, which
197 | should be a 1-D array that contains the encoded image vector. We go from there.
198 | Ws should be NxD array where N is size of vocabulary + 1. So there should be exactly
199 | as many rows in Ws as there are outputs in the decoder Y. We are passing in Ws like
200 | this because we may not want it to be exactly model['Ws']. For example it could be
201 | fixed word vectors from somewhere else.
202 | """
203 | tanhC_version = params['tanhC_version']
204 | beam_size = kwargs.get('beam_size', 1)
205 |
206 | WLSTM = model['WLSTM']
207 | d = model['Wd'].shape[0] # size of hidden layer
208 | Wd = model['Wd']
209 | bd = model['bd']
210 |
211 | # lets define a helper function that does a single LSTM tick
212 | def LSTMtick(x, h_prev, c_prev):
213 | t = 0
214 |
215 | # setup the input vector
216 | Hin = np.zeros((1,WLSTM.shape[0])) # xt, ht-1, bias
217 | Hin[t,0] = 1
218 | Hin[t,1:1+d] = x
219 | Hin[t,1+d:] = h_prev
220 |
221 | # LSTM tick forward
222 | IFOG = np.zeros((1, d * 4))
223 | IFOGf = np.zeros((1, d * 4))
224 | C = np.zeros((1, d))
225 | Hout = np.zeros((1, d))
226 | IFOG[t] = Hin[t].dot(WLSTM)
227 | IFOGf[t,:3*d] = 1.0/(1.0+np.exp(-IFOG[t,:3*d]))
228 | IFOGf[t,3*d:] = np.tanh(IFOG[t, 3*d:])
229 | C[t] = IFOGf[t,:d] * IFOGf[t, 3*d:] + IFOGf[t,d:2*d] * c_prev
230 | if tanhC_version:
231 | Hout[t] = IFOGf[t,2*d:3*d] * np.tanh(C[t])
232 | else:
233 | Hout[t] = IFOGf[t,2*d:3*d] * C[t]
234 | Y = Hout.dot(Wd) + bd
235 | return (Y, Hout, C) # return output, new hidden, new cell
236 |
237 | # forward prop the image
238 | (y0, h, c) = LSTMtick(Xi, np.zeros(d), np.zeros(d))
239 |
240 | # perform BEAM search. NOTE: I am not very confident in this implementation since I don't have
241 | # a lot of experience with these models. This implements my current understanding but I'm not
242 | # sure how to handle beams that predict END tokens. TODO: research this more.
243 | if beam_size > 1:
244 | # log probability, indices of words predicted in this beam so far, and the hidden and cell states
245 | beams = [(0.0, [], h, c)]
246 | nsteps = 0
247 | while True:
248 | beam_candidates = []
249 | for b in beams:
250 | ixprev = b[1][-1] if b[1] else 0 # start off with the word where this beam left off
251 | if ixprev == 0 and b[1]:
252 | # this beam predicted end token. Keep in the candidates but don't expand it out any more
253 | beam_candidates.append(b)
254 | continue
255 | (y1, h1, c1) = LSTMtick(Ws[ixprev], b[2], b[3])
256 | y1 = y1.ravel() # make into 1D vector
257 | maxy1 = np.amax(y1)
258 | e1 = np.exp(y1 - maxy1) # for numerical stability shift into good numerical range
259 | p1 = e1 / np.sum(e1)
260 | y1 = np.log(1e-20 + p1) # and back to log domain
261 | top_indices = np.argsort(-y1) # we do -y because we want decreasing order
262 | for i in range(beam_size):
263 | wordix = top_indices[i]
264 | beam_candidates.append((b[0] + y1[wordix], b[1] + [wordix], h1, c1))
265 | beam_candidates.sort(reverse = True) # decreasing order
266 | beams = beam_candidates[:beam_size] # truncate to get new beams
267 | nsteps += 1
268 | if nsteps >= 20: # bad things are probably happening, break out
269 | break
270 | # strip the intermediates
271 | predictions = [(b[0], b[1]) for b in beams]
272 | else:
273 | # greedy inference. lets write it up independently, should be bit faster and simpler
274 | ixprev = 0
275 | nsteps = 0
276 | predix = []
277 | predlogprob = 0.0
278 | while True:
279 | (y1, h, c) = LSTMtick(Ws[ixprev], h, c)
280 | ixprev, ixlogprob = ymax(y1)
281 | predix.append(ixprev)
282 | predlogprob += ixlogprob
283 | nsteps += 1
284 | if ixprev == 0 or nsteps >= 20:
285 | break
286 | predictions = [(predlogprob, predix)]
287 |
288 | return predictions
289 |
290 | def ymax(y):
291 | """ simple helper function here that takes unnormalized logprobs """
292 | y1 = y.ravel() # make sure 1d
293 | maxy1 = np.amax(y1)
294 | e1 = np.exp(y1 - maxy1) # for numerical stability shift into good numerical range
295 | p1 = e1 / np.sum(e1)
296 | y1 = np.log(1e-20 + p1) # guard against zero probabilities just in case
297 | ix = np.argmax(y1)
298 | return (ix, y1[ix])
299 |
--------------------------------------------------------------------------------
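Finally, a sketch of driving `LSTMGenerator.predict` end to end with random weights: a toy vocabulary and stand-in vectors, so the decoded indices are meaningless and only the calling convention matters. Note that this code effectively requires `input_size == hidden_size`, since the image and word vectors share the `x` slot of the LSTM input:

```python
import numpy as np
from imagernn.lstm_generator import LSTMGenerator

hidden, vocab = 64, 10
model = LSTMGenerator.init(input_size=hidden, hidden_size=hidden,
                           output_size=vocab)['model']
Ws = np.random.randn(vocab, hidden)  # stand-in word vectors; row 0 = START token
Xi = np.random.randn(hidden)         # stand-in encoded image vector
params = {'tanhC_version': 1}

predictions = LSTMGenerator.predict(Xi, model, Ws, params, beam_size=1)
logprob, word_indices = predictions[0]
print(word_indices)  # greedy decode; stops at index 0 or after 20 steps
```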