├── README.md
├── constants.py
├── discrete_DPPO.py
└── scene_loader.py

/README.md:
--------------------------------------------------------------------------------
# Target-Driven-Visual-Navigation-with-Distributed-PPO

This repository uses the AI2-THOR scene dataset.

The original problem can be found in this repository: https://github.com/zfw1226/icra2017-visual-navigation

That problem was solved with A3C agents, as described in the paper: https://arxiv.org/abs/1609.05143

Here, I use Distributed PPO (DPPO) to solve the same target-driven navigation task on the AI2-THOR dataset.

The DPPO code was originally taken from https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow

--------------------------------------------------------------------------------
/constants.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

LOCAL_T_MAX = 5                  # repeat step size
RMSP_ALPHA = 0.99                # decay parameter for RMSProp
RMSP_EPSILON = 0.1               # epsilon parameter for RMSProp
CHECKPOINT_DIR = 'checkpoints'
LOG_FILE = 'logs'
INITIAL_ALPHA_LOW = 1e-4         # log_uniform low limit for learning rate
INITIAL_ALPHA_HIGH = 1e-2        # log_uniform high limit for learning rate

PARALLEL_SIZE = 20               # parallel thread size
ACTION_SIZE = 4                  # action size

INITIAL_ALPHA_LOG_RATE = 0.4226  # log_uniform interpolation rate for learning rate (around 7 * 10^-4)
GAMMA = 0.99                     # discount factor for rewards
ENTROPY_BETA = 0.01              # entropy regularization constant
MAX_TIME_STEP = 10.0 * 10**6     # 10 million frames
GRAD_NORM_CLIP = 40.0            # gradient norm clipping
USE_GPU = True                   # set True to use the GPU
VERBOSE = True

SCREEN_WIDTH = 84
SCREEN_HEIGHT = 84
HISTORY_LENGTH = 4

NUM_EVAL_EPISODES = 100          # number of episodes for evaluation

TASK_TYPE = 'navigation'         # no need to change
# keys are scene names, and values are lists of location ids (navigation targets)

TASK_LIST = {
    'bathroom_02': ['26']  # , '37', '43', '53', '69']
}
'''
TASK_LIST = {
    'bathroom_02'   : ['26', '37', '43', '53', '69'],
    'bedroom_04'    : ['134', '264', '320', '384', '387'],
    'kitchen_02'    : ['90', '136', '157', '207', '329'],
    'living_room_08': ['92', '135', '193', '228', '254']
}
'''
--------------------------------------------------------------------------------
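Note: INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH and INITIAL_ALPHA_LOG_RATE are not imported by discrete_DPPO.py (which hard-codes A_LR and C_LR); they describe a log-uniform learning-rate sample in the style of the A3C baseline this repository builds on. A minimal sketch of that interpolation, assuming the conventional log-space definition (the helper below is illustrative, not part of this repo):

import math

def log_uniform(lo, hi, rate):
    # interpolate between lo and hi in log space: rate=0 gives lo, rate=1 gives hi
    return math.exp(math.log(lo) * (1.0 - rate) + math.log(hi) * rate)

# log_uniform(1e-4, 1e-2, 0.4226) is approximately 7e-4,
# matching the comment on INITIAL_ALPHA_LOG_RATE above.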
/discrete_DPPO.py:
--------------------------------------------------------------------------------
"""
A simple version of OpenAI's Proximal Policy Optimization (PPO). [https://arxiv.org/abs/1707.06347]

Workers are distributed in parallel to collect data; their roll-outs are then stopped while PPO is
trained on the collected data. Workers are restarted once PPO has been updated.

The global PPO updating rule is adopted from DeepMind's DPPO paper:
Emergence of Locomotion Behaviours in Rich Environments (Google DeepMind): [https://arxiv.org/abs/1707.02286]

View more on my tutorial website: https://morvanzhou.github.io/tutorials

Dependencies:
tensorflow 1.8.0
gym 0.9.2
"""

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import gym, threading, queue

from scene_loader import THORDiscreteEnvironment as Environment

import pdb

EP_MAX = 1000
EP_LEN = 500
N_WORKER = 8             # parallel workers
GAMMA = 0.9              # reward discount factor
A_LR = 0.0001            # learning rate for actor
C_LR = 0.0001            # learning rate for critic
MIN_BATCH_SIZE = 64      # minimum batch size for updating PPO
UPDATE_STEP = 15         # number of gradient steps per PPO update
EPSILON = 0.2            # for clipping the surrogate objective
GAME = 'CartPole-v0'

# env = gym.make(GAME)
# S_DIM = env.observation_space.shape[0]
# A_DIM = env.action_space.n

from constants import TASK_TYPE
from constants import TASK_LIST


class PPONet(object):
    def __init__(self):
        self.sess = tf.Session()
        # self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        # current observation and navigation target: 2048-d ResNet features x 4 history frames
        self.tfs_S = tf.placeholder(tf.float32, [None, 2048, 4], 'state_new')
        self.tfs_T = tf.placeholder(tf.float32, [None, 2048, 4], 'target_new')

        self.tfs_S_N = tf.reshape(self.tfs_S, [-1, 8192])
        self.tfs_T_N = tf.reshape(self.tfs_T, [-1, 8192])

        # critic (original gym version, kept for reference)
        # w_init = tf.random_normal_initializer(0., .1)
        # lc = tf.layers.dense(self.tfs, 200, tf.nn.relu, kernel_initializer=w_init, name='lc')
        # self.v = tf.layers.dense(lc, 1)
        # self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
        # self.advantage = self.tfdc_r - self.v
        # self.closs = tf.reduce_mean(tf.square(self.advantage))
        # self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)

        # actor
        # self.pi, pi_params, self.pi_new, self.v_new = self._build_anet('pi', trainable=True)
        pi_params, self.pi_new, self.v_new = self._build_anet('pi', trainable=True)

        # oldpi, oldpi_params, oldpi_new, oldv_new = self._build_anet('oldpi', trainable=False)
        oldpi_params, oldpi_new, oldv_new = self._build_anet('oldpi', trainable=False)

        self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

        self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
        self.advantage = self.tfdc_r - self.v_new
        self.closs = tf.reduce_mean(tf.square(self.advantage))

        self.tfa = tf.placeholder(tf.int32, [None, ], 'action')
        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')

        a_indices = tf.stack([tf.range(tf.shape(self.tfa)[0], dtype=tf.int32), self.tfa], axis=1)
        pi_prob = tf.gather_nd(params=self.pi_new, indices=a_indices)   # shape=(None, )
        oldpi_prob = tf.gather_nd(params=oldpi_new, indices=a_indices)  # shape=(None, )
        ratio = pi_prob / oldpi_prob
        surr = ratio * self.tfadv                                       # surrogate loss

        self.aloss = -tf.reduce_mean(tf.minimum(                        # clipped surrogate objective
            surr,
            tf.clip_by_value(ratio, 1. - EPSILON, 1. + EPSILON) * self.tfadv))

        self.total_loss = self.aloss + 0.5 * self.closs

        self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.total_loss)
        self.sess.run(tf.global_variables_initializer())

    def update(self):
        global GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            if GLOBAL_EP < EP_MAX:
                UPDATE_EVENT.wait()                                  # wait until a batch of data is ready
                self.sess.run(self.update_oldpi_op)                  # copy pi to oldpi
                data = [QUEUE.get() for _ in range(QUEUE.qsize())]   # collect data from all workers

                data = np.vstack(data)

                # s, a, r = data[:, :S_DIM], data[:, S_DIM: S_DIM + 1].ravel(), data[:, -1:]

                # columns: state (8192), target (8192), action (1), discounted return (1)
                s, t, a, r = data[:, :8192], data[:, 8192:16384], data[:, 16384:16384 + 1].ravel(), data[:, -1:]

                s = np.reshape(s, [s.shape[0], 2048, 4])
                t = np.reshape(t, [t.shape[0], 2048, 4])

                adv = self.sess.run(self.advantage, {self.tfs_S: s, self.tfs_T: t, self.tfdc_r: r})

                loss = self.sess.run(self.closs, {self.tfs_S: s, self.tfs_T: t, self.tfdc_r: r, self.tfa: a, self.tfadv: adv})

                # update actor and critic in an update loop
                [self.sess.run(self.atrain_op, {self.tfs_S: s, self.tfs_T: t, self.tfdc_r: r, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
                # [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]

                UPDATE_EVENT.clear()         # updating finished
                GLOBAL_UPDATE_COUNTER = 0    # reset counter
                ROLLING_EVENT.set()          # make roll-outs available again

    def _build_anet(self, name, trainable):

        with tf.variable_scope(name):
            # shared Siamese embedding for the current observation and the target
            with tf.variable_scope("Siamese", reuse=tf.AUTO_REUSE):
                self.siamese_s = self.construct_Siamese(self.tfs_S_N, trainable)
                self.siamese_t = self.construct_Siamese(self.tfs_T_N, trainable)
                self.concat = tf.concat(values=[self.siamese_s, self.siamese_t], axis=1)
                # self.obs = self.fusion_layer(self.concat, trainable)

            self.obs = self.fusion_layer(self.concat, trainable)

            # l_a = tf.layers.dense(self.tfs, 200, tf.nn.relu, trainable=trainable)
            # a_prob = tf.layers.dense(l_a, A_DIM, tf.nn.softmax, trainable=trainable)

            # newly added: policy and value heads
            l_a_new = tf.layers.dense(self.obs, 512, tf.nn.relu, trainable=trainable)
            a_prob_new = tf.layers.dense(l_a_new, 4, tf.nn.softmax, trainable=trainable)  # 4 = ACTION_SIZE navigation actions

            v_new = tf.layers.dense(l_a_new, 1, trainable=trainable)

        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)

        # return a_prob, params, a_prob_new, v_new
        return params, a_prob_new, v_new

    def construct_Siamese(self, input, trainable):
        layer_1 = tf.layers.dense(inputs=input, units=512, activation=tf.nn.leaky_relu, name='Siamese_layer_1', trainable=trainable)
        # layer_2 = tf.layers.dropout(layer_1, rate=0.5, noise_shape=None, seed=None, training=True, name='Drop_out_1')
        # layer_3 = tf.layers.dense(inputs=layer_2, units=128, activation=tf.nn.leaky_relu, name='Siamese_layer_2')
        return layer_1

    def fusion_layer(self, input, trainable):    # this is also a shared fusion layer
        fuse_layer_1 = tf.layers.dense(inputs=input, units=512, activation=tf.nn.leaky_relu, name='Fuse_layer', trainable=trainable)
        # fuse_layer_2 = tf.layers.dropout(fuse_layer_1, rate=0.5, noise_shape=None, seed=None, training=True, name='Drop_out_2')    # added dropout
        # fuse_layer_3 = tf.layers.dense(inputs=fuse_layer_1, units=128, activation=tf.nn.leaky_relu, name='Fuse_layer_3')
        return fuse_layer_1

    def choose_action(self, s_new, t_new):    # run by a local worker
        # prob_weights = self.sess.run(self.pi, feed_dict={self.tfs: s[None, :]})

        prob_weights_new = self.sess.run(self.pi_new, feed_dict={self.tfs_S: [s_new], self.tfs_T: [t_new]})
        # action = np.random.choice(range(prob_weights.shape[1]),
        #                           p=prob_weights.ravel())    # select action w.r.t. the action probabilities

        action_new = np.random.choice(range(prob_weights_new.shape[1]),
                                      p=prob_weights_new.ravel())    # select action w.r.t. the action probabilities

        # return action, action_new
        return action_new

    # def get_v(self, s):    # legacy gym version; self.tfs and self.v no longer exist
    #     if s.ndim < 2: s = s[np.newaxis, :]
    #     return self.sess.run(self.v, {self.tfs: s})[0, 0]

    def get_v_new(self, S, T):
        # if s.ndim < 2: s = s[np.newaxis, :]
        return self.sess.run(self.v_new, {self.tfs_S: [S], self.tfs_T: [T]})[0, 0]


class Worker(object):
    def __init__(self, wid, target_id):
        self.wid = wid
        # self.env = gym.make(GAME).unwrapped
        self.env_new = Environment({'scene_name': 'bathroom_02', 'terminal_state_id': int(target_id)})
        self.ppo = GLOBAL_PPO
        self.task_scope = target_id

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        global GLOBAL_EP_new, GLOBAL_RUNNING_R_new, GLOBAL_UPDATE_COUNTER_new
        while not COORD.should_stop():
            # s = self.env.reset()
            self.env_new.reset()

            ep_r = 0
            ep_r_new = 0
            buffer_s, buffer_a, buffer_r = [], [], []

            buffer_s_new, buffer_t_new, buffer_a_new, buffer_r_new = [], [], [], []

            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():    # while the global PPO is updating
                    ROLLING_EVENT.wait()          # wait until PPO has been updated
                    buffer_s, buffer_a, buffer_r = [], [], []    # clear history buffers, use the new policy to collect data
                    buffer_s_new, buffer_t_new, buffer_a_new, buffer_r_new = [], [], [], []
                a_new = self.ppo.choose_action(self.env_new.s_t, self.env_new.target)    # sample an action from the current policy

                # step the environment
                self.env_new.step(a_new)

                # s_, r, done, _ = self.env.step(a)

                # r_new = self.env_new.reward    # env reward is overridden by the shaped reward below
                done_new = self.env_new.terminal
                r_new = 1 if done_new else -0.01

                # if done: r = -10
                # buffer_s.append(s)
                # buffer_a.append(a)
                # buffer_r.append(r-1)    # 0 for not down, -11 for down. Reward engineering

                buffer_s_new.append(self.env_new.s_t)
                buffer_t_new.append(self.env_new.target)
                buffer_a_new.append(a_new)
                buffer_r_new.append(r_new)

                self.env_new.update()

                # s = s_
                s_new = self.env_new.s_t
                target = self.env_new.target
                # ep_r += r
                ep_r_new += r_new

                GLOBAL_UPDATE_COUNTER += 1    # count towards the minimum batch size, no need to wait for other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done_new:
                    if done_new:
                        # v_s_ = 0
                        v_s_new = 0    # end of episode
                    else:
                        # v_s_ = self.ppo.get_v(s_)
                        v_s_new = self.ppo.get_v_new(s_new, target)

                    # discounted_r = []    # compute discounted reward
                    # for r in buffer_r[::-1]:
                    #     v_s_ = r + GAMMA * v_s_
                    #     discounted_r.append(v_s_)
                    # discounted_r.reverse()

                    discounted_r_new = []    # compute discounted returns
                    for r in buffer_r_new[::-1]:
                        v_s_new = r + GAMMA * v_s_new
                        discounted_r_new.append(v_s_new)
                    discounted_r_new.reverse()

                    # bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, None]
                    # buffer_s, buffer_a, buffer_r = [], [], []
                    # QUEUE.put(np.hstack((bs, ba, br)))    # put data in the queue

                    bs_new, bt_new, ba_new, br_new = np.vstack([buffer_s_new]), np.vstack([buffer_t_new]), np.vstack(buffer_a_new), np.array(discounted_r_new)[:, None]
                    bs_new = np.reshape(bs_new, [bs_new.shape[0], -1])
                    bt_new = np.reshape(bt_new, [bt_new.shape[0], -1])

                    buffer_s_new, buffer_t_new, buffer_a_new, buffer_r_new = [], [], [], []
                    QUEUE.put(np.hstack((bs_new, bt_new, ba_new, br_new)))    # put data in the queue

                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()    # stop collecting data
                        UPDATE_EVENT.set()       # trigger the global PPO update

                    if GLOBAL_EP >= EP_MAX:      # stop training
                        COORD.request_stop()
                        break

                    if done_new: break

            # record reward changes, plot later
            # if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
            # else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9+ep_r*0.1)
            # GLOBAL_EP += 1
            # print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r,)

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r_new)
            else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r_new * 0.1)
            GLOBAL_EP += 1
            print('{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r_new)


if __name__ == '__main__':
    GLOBAL_PPO = PPONet()
    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    UPDATE_EVENT.clear()    # no update at the start
    ROLLING_EVENT.set()     # start rolling out

    network_scope = TASK_TYPE
    list_of_tasks = TASK_LIST
    scene_scopes = list_of_tasks.keys()

    branches = []
    for scene in scene_scopes:
        for task in list_of_tasks[scene]:
            branches.append((scene, task))

    NUM_TASKS = len(branches)

    workers = []
    for i in range(N_WORKER):    # this is the parallel size
        scene, task = branches[i % NUM_TASKS]
        training_thread = Worker(wid=i, target_id=task)
        workers.append(training_thread)

    # workers = [Worker(wid=i, target_id=i) for i in range(N_WORKER)]

    GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
    GLOBAL_RUNNING_R = []
    GLOBAL_RUNNING_R_new = []
    COORD = tf.train.Coordinator()
    QUEUE = queue.Queue()    # workers put data in this queue
    threads = []
    for worker in workers:   # worker threads
        t = threading.Thread(target=worker.work, args=())
        t.start()            # training
        threads.append(t)

    # add a PPO updating thread
    threads.append(threading.Thread(target=GLOBAL_PPO.update,))
    threads[-1].start()
    COORD.join(threads)

    # plot reward changes and test
    plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
    plt.xlabel('Episode'); plt.ylabel('Moving reward'); plt.ion(); plt.show()
    # env = gym.make('CartPole-v0')

    env = Environment({'scene_name': 'bathroom_02', 'terminal_state_id': int(26)})
    while True:
        env.reset()
        for t in range(1000):
            env.step(GLOBAL_PPO.choose_action(env.s_t, env.target))
            env.update()
            print(t)
            if env.terminal:
                break

--------------------------------------------------------------------------------
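The actor objective assembled in PPONet.__init__ is the standard PPO clipped surrogate. As a plain-NumPy illustration of what that part of the graph computes for one batch (an illustrative sketch, not code used by this repository):

import numpy as np

def clipped_surrogate_loss(pi_prob, oldpi_prob, adv, epsilon=0.2):
    # pi_prob, oldpi_prob: probabilities of the taken actions under the new and old policies
    # adv: advantage estimates; epsilon matches EPSILON in discrete_DPPO.py
    ratio = pi_prob / oldpi_prob
    surrogate = ratio * adv
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv
    return -np.mean(np.minimum(surrogate, clipped))

The training op then minimizes this together with 0.5 times the squared advantage (the critic loss), as in total_loss above.

--------------------------------------------------------------------------------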
/scene_loader.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
import h5py
import json
import numpy as np
import random
import skimage.io
from skimage.transform import resize
from constants import ACTION_SIZE
from constants import SCREEN_WIDTH
from constants import SCREEN_HEIGHT
from constants import HISTORY_LENGTH

class THORDiscreteEnvironment(object):

    def __init__(self, config=dict()):

        # configurations
        self.scene_name = config.get('scene_name', 'bedroom_04')
        self.random_start = config.get('random_start', True)
        self.n_feat_per_location = config.get('n_feat_per_location', 1)    # 1 for no sampling
        self.terminal_state_id = config.get('terminal_state_id', 0)

        self.h5_file_path = config.get('h5_file_path', 'data/%s.h5' % self.scene_name)
        self.h5_file = h5py.File(self.h5_file_path, 'r')

        self.locations = self.h5_file['location'][()]
        self.rotations = self.h5_file['rotation'][()]
        self.n_locations = self.locations.shape[0]

        self.terminals = np.zeros(self.n_locations)
        self.terminals[self.terminal_state_id] = 1
        self.terminal_states, = np.where(self.terminals)

        self.transition_graph = self.h5_file['graph'][()]
        self.shortest_path_distances = self.h5_file['shortest_path_distance'][()]

        self.history_length = HISTORY_LENGTH
        self.screen_height = SCREEN_HEIGHT
        self.screen_width = SCREEN_WIDTH

        # we use pre-computed 2048-d ResNet-50 features instead of raw frames
        # self.s_t = np.zeros([self.screen_height, self.screen_width, self.history_length])
        self.s_t = np.zeros([2048, self.history_length])
        self.s_t1 = np.zeros_like(self.s_t)
        self.s_target = self._tiled_state(self.terminal_state_id)

        self.reset()

    # public methods

    def reset(self):
        # randomize the initial state
        while True:
            k = random.randrange(self.n_locations)
            min_d = np.inf
            # check if the target is reachable
            for t_state in self.terminal_states:
                dist = self.shortest_path_distances[k][t_state]
                min_d = min(min_d, dist)
            # min_d = 0 if k is a terminal state
            # min_d = -1 if no terminal state is reachable from k
            if min_d > 0: break

        # reset parameters
        self.current_state_id = k
        self.s_t = self._tiled_state(self.current_state_id)

        self.reward = 0
        self.collided = False
        self.terminal = False

    def step(self, action):
        assert not self.terminal, 'step() called in terminal state'
        k = self.current_state_id
        if self.transition_graph[k][action] != -1:
            self.current_state_id = self.transition_graph[k][action]
            if self.terminals[self.current_state_id]:
                self.terminal = True
                self.collided = False
            else:
                self.terminal = False
                self.collided = False
        else:
            self.terminal = False
            self.collided = True

        self.reward = self._reward(self.terminal, self.collided)
        self.s_t1 = np.append(self.s_t[:, 1:], self.state, axis=1)

    def update(self):
        self.s_t = self.s_t1

    # private methods

    def _tiled_state(self, state_id):
        k = random.randrange(self.n_feat_per_location)
        f = self.h5_file['resnet_feature'][state_id][k][:, np.newaxis]
        return np.tile(f, (1, self.history_length))

    def _reward(self, terminal, collided):
        # positive reward upon task completion
        if terminal: return 10.0
        # collision penalty or time penalty
        return -0.1 if collided else -0.01

    # properties

    @property
    def action_size(self):
        # move forward/backward, turn left/right for navigation
        return ACTION_SIZE

    @property
    def action_definitions(self):
        action_vocab = ["MoveForward", "RotateRight", "RotateLeft", "MoveBackward"]
        return action_vocab[:ACTION_SIZE]

    @property
    def observation(self):
        return self.h5_file['observation'][self.current_state_id]

    @property
    def state(self):
        # read from the hdf5 cache
        k = random.randrange(self.n_feat_per_location)
        return self.h5_file['resnet_feature'][self.current_state_id][k][:, np.newaxis]

    @property
    def target(self):
        return self.s_target

    @property
    def x(self):
        return self.locations[self.current_state_id][0]

    @property
    def z(self):
        return self.locations[self.current_state_id][1]

    @property
    def r(self):
        return self.rotations[self.current_state_id]


if __name__ == "__main__":
    scene_name = 'bedroom_04'

    env = THORDiscreteEnvironment({
        'random_start': True,
        'scene_name': scene_name,
        'h5_file_path': 'data/%s.h5' % scene_name
    })
--------------------------------------------------------------------------------
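To sanity-check a scene dump before training, the environment above can be driven with random actions. A minimal sketch, assuming the corresponding data/bathroom_02.h5 file is available (the HDF5 scene dumps are not included in this repository):

import random
from scene_loader import THORDiscreteEnvironment

env = THORDiscreteEnvironment({'scene_name': 'bathroom_02', 'terminal_state_id': 26})
env.reset()
for t in range(100):
    env.step(random.randrange(env.action_size))  # pick a random action
    env.update()                                 # advance the 4-frame feature history
    if env.terminal:
        print('reached the target in %d steps' % (t + 1))
        break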