├── README.md
├── constants.py
├── discrete_DPPO.py
└── scene_loader.py

/README.md:
--------------------------------------------------------------------------------
# Target-Driven-Visual-Navigation-with-Distributed-PPO

This repository uses the AI2-THOR scene dataset.

The original problem can be found in this repository: https://github.com/zfw1226/icra2017-visual-navigation

That problem was solved with A3C agents, as described in the paper: https://arxiv.org/abs/1609.05143

Here, I use Distributed PPO (DPPO) to solve the same target-driven navigation task on the AI2-THOR dataset.

The DPPO code was originally taken from https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow

--------------------------------------------------------------------------------
/constants.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

LOCAL_T_MAX = 5                  # repeat step size
RMSP_ALPHA = 0.99                # decay parameter for RMSProp
RMSP_EPSILON = 0.1               # epsilon parameter for RMSProp
CHECKPOINT_DIR = 'checkpoints'
LOG_FILE = 'logs'
INITIAL_ALPHA_LOW = 1e-4         # log_uniform low limit for learning rate
INITIAL_ALPHA_HIGH = 1e-2        # log_uniform high limit for learning rate

PARALLEL_SIZE = 20               # parallel thread size
ACTION_SIZE = 4                  # action size

INITIAL_ALPHA_LOG_RATE = 0.4226  # log_uniform interpolation rate for learning rate (around 7 * 10^-4)
GAMMA = 0.99                     # discount factor for rewards
ENTROPY_BETA = 0.01              # entropy regularization constant
MAX_TIME_STEP = 10.0 * 10**6     # 10 million frames
GRAD_NORM_CLIP = 40.0            # gradient norm clipping
USE_GPU = True                   # set True to use the GPU
VERBOSE = True

SCREEN_WIDTH = 84
SCREEN_HEIGHT = 84
HISTORY_LENGTH = 4

NUM_EVAL_EPISODES = 100          # number of episodes for evaluation

TASK_TYPE = 'navigation'         # no need to change
# keys are scene names, and values are lists of location ids (navigation targets)

TASK_LIST = {
    'bathroom_02': ['26']  # , '37', '43', '53', '69']
}
'''
TASK_LIST = {
    'bathroom_02'   : ['26', '37', '43', '53', '69'],
    'bedroom_04'    : ['134', '264', '320', '384', '387'],
    'kitchen_02'    : ['90', '136', '157', '207', '329'],
    'living_room_08': ['92', '135', '193', '228', '254']
}
'''
--------------------------------------------------------------------------------
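Note: INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH and INITIAL_ALPHA_LOG_RATE are not imported by discrete_DPPO.py (which hard-codes A_LR and C_LR); they describe a log-uniform learning-rate sample in the style of the A3C baseline this repository builds on. A minimal sketch of that interpolation, assuming the conventional log-space definition (the helper below is illustrative, not part of this repo):

import math

def log_uniform(lo, hi, rate):
    # interpolate between lo and hi in log space: rate=0 gives lo, rate=1 gives hi
    return math.exp(math.log(lo) * (1.0 - rate) + math.log(hi) * rate)

# log_uniform(1e-4, 1e-2, 0.4226) is approximately 7e-4,
# matching the comment on INITIAL_ALPHA_LOG_RATE above.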
/discrete_DPPO.py:
--------------------------------------------------------------------------------
"""
A simple version of OpenAI's Proximal Policy Optimization (PPO). [https://arxiv.org/abs/1707.06347]

Workers are distributed in parallel to collect data; their roll-outs are then stopped while PPO is
trained on the collected data. Workers are restarted once PPO has been updated.

The global PPO updating rule is adopted from DeepMind's DPPO paper:
Emergence of Locomotion Behaviours in Rich Environments (Google DeepMind): [https://arxiv.org/abs/1707.02286]

View more on my tutorial website: https://morvanzhou.github.io/tutorials

Dependencies:
tensorflow 1.8.0
gym 0.9.2
"""

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import gym, threading, queue

from scene_loader import THORDiscreteEnvironment as Environment

import pdb

EP_MAX = 1000
EP_LEN = 500
N_WORKER = 8             # parallel workers
GAMMA = 0.9              # reward discount factor
A_LR = 0.0001            # learning rate for actor
C_LR = 0.0001            # learning rate for critic
MIN_BATCH_SIZE = 64      # minimum batch size for updating PPO
UPDATE_STEP = 15         # number of gradient steps per PPO update
EPSILON = 0.2            # for clipping the surrogate objective
GAME = 'CartPole-v0'

# env = gym.make(GAME)
# S_DIM = env.observation_space.shape[0]
# A_DIM = env.action_space.n

from constants import TASK_TYPE
from constants import TASK_LIST


class PPONet(object):
    def __init__(self):
        self.sess = tf.Session()
        # self.tfs = tf.placeholder(tf.float32, [None, S_DIM], 'state')

        # current observation and navigation target: 2048-d ResNet features x 4 history frames
        self.tfs_S = tf.placeholder(tf.float32, [None, 2048, 4], 'state_new')
        self.tfs_T = tf.placeholder(tf.float32, [None, 2048, 4], 'target_new')

        self.tfs_S_N = tf.reshape(self.tfs_S, [-1, 8192])
        self.tfs_T_N = tf.reshape(self.tfs_T, [-1, 8192])

        # critic (original gym version, kept for reference)
        # w_init = tf.random_normal_initializer(0., .1)
        # lc = tf.layers.dense(self.tfs, 200, tf.nn.relu, kernel_initializer=w_init, name='lc')
        # self.v = tf.layers.dense(lc, 1)
        # self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
        # self.advantage = self.tfdc_r - self.v
        # self.closs = tf.reduce_mean(tf.square(self.advantage))
        # self.ctrain_op = tf.train.AdamOptimizer(C_LR).minimize(self.closs)

        # actor
        # self.pi, pi_params, self.pi_new, self.v_new = self._build_anet('pi', trainable=True)
        pi_params, self.pi_new, self.v_new = self._build_anet('pi', trainable=True)

        # oldpi, oldpi_params, oldpi_new, oldv_new = self._build_anet('oldpi', trainable=False)
        oldpi_params, oldpi_new, oldv_new = self._build_anet('oldpi', trainable=False)

        self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)]

        self.tfdc_r = tf.placeholder(tf.float32, [None, 1], 'discounted_r')
        self.advantage = self.tfdc_r - self.v_new
        self.closs = tf.reduce_mean(tf.square(self.advantage))

        self.tfa = tf.placeholder(tf.int32, [None, ], 'action')
        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')

        a_indices = tf.stack([tf.range(tf.shape(self.tfa)[0], dtype=tf.int32), self.tfa], axis=1)
        pi_prob = tf.gather_nd(params=self.pi_new, indices=a_indices)   # shape=(None, )
        oldpi_prob = tf.gather_nd(params=oldpi_new, indices=a_indices)  # shape=(None, )
        ratio = pi_prob / oldpi_prob
        surr = ratio * self.tfadv                                       # surrogate loss

        self.aloss = -tf.reduce_mean(tf.minimum(                        # clipped surrogate objective
            surr,
            tf.clip_by_value(ratio, 1. - EPSILON, 1. + EPSILON) * self.tfadv))

        self.total_loss = self.aloss + 0.5 * self.closs

        self.atrain_op = tf.train.AdamOptimizer(A_LR).minimize(self.total_loss)
        self.sess.run(tf.global_variables_initializer())

    def update(self):
        global GLOBAL_UPDATE_COUNTER
        while not COORD.should_stop():
            if GLOBAL_EP < EP_MAX:
                UPDATE_EVENT.wait()                                  # wait until a batch of data is ready
                self.sess.run(self.update_oldpi_op)                  # copy pi to oldpi
                data = [QUEUE.get() for _ in range(QUEUE.qsize())]   # collect data from all workers

                data = np.vstack(data)

                # s, a, r = data[:, :S_DIM], data[:, S_DIM: S_DIM + 1].ravel(), data[:, -1:]

                # columns: state (8192), target (8192), action (1), discounted return (1)
                s, t, a, r = data[:, :8192], data[:, 8192:16384], data[:, 16384:16384 + 1].ravel(), data[:, -1:]

                s = np.reshape(s, [s.shape[0], 2048, 4])
                t = np.reshape(t, [t.shape[0], 2048, 4])

                adv = self.sess.run(self.advantage, {self.tfs_S: s, self.tfs_T: t, self.tfdc_r: r})

                loss = self.sess.run(self.closs, {self.tfs_S: s, self.tfs_T: t, self.tfdc_r: r, self.tfa: a, self.tfadv: adv})

                # update actor and critic in an update loop
                [self.sess.run(self.atrain_op, {self.tfs_S: s, self.tfs_T: t, self.tfdc_r: r, self.tfa: a, self.tfadv: adv}) for _ in range(UPDATE_STEP)]
                # [self.sess.run(self.ctrain_op, {self.tfs: s, self.tfdc_r: r}) for _ in range(UPDATE_STEP)]

                UPDATE_EVENT.clear()         # updating finished
                GLOBAL_UPDATE_COUNTER = 0    # reset counter
                ROLLING_EVENT.set()          # make roll-outs available again

    def _build_anet(self, name, trainable):

        with tf.variable_scope(name):
            # shared Siamese embedding for the current observation and the target
            with tf.variable_scope("Siamese", reuse=tf.AUTO_REUSE):
                self.siamese_s = self.construct_Siamese(self.tfs_S_N, trainable)
                self.siamese_t = self.construct_Siamese(self.tfs_T_N, trainable)
                self.concat = tf.concat(values=[self.siamese_s, self.siamese_t], axis=1)
                # self.obs = self.fusion_layer(self.concat, trainable)

            self.obs = self.fusion_layer(self.concat, trainable)

            # l_a = tf.layers.dense(self.tfs, 200, tf.nn.relu, trainable=trainable)
            # a_prob = tf.layers.dense(l_a, A_DIM, tf.nn.softmax, trainable=trainable)

            # newly added: policy and value heads
            l_a_new = tf.layers.dense(self.obs, 512, tf.nn.relu, trainable=trainable)
            a_prob_new = tf.layers.dense(l_a_new, 4, tf.nn.softmax, trainable=trainable)  # 4 = ACTION_SIZE navigation actions

            v_new = tf.layers.dense(l_a_new, 1, trainable=trainable)

        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)

        # return a_prob, params, a_prob_new, v_new
        return params, a_prob_new, v_new

    def construct_Siamese(self, input, trainable):
        layer_1 = tf.layers.dense(inputs=input, units=512, activation=tf.nn.leaky_relu, name='Siamese_layer_1', trainable=trainable)
        # layer_2 = tf.layers.dropout(layer_1, rate=0.5, noise_shape=None, seed=None, training=True, name='Drop_out_1')
        # layer_3 = tf.layers.dense(inputs=layer_2, units=128, activation=tf.nn.leaky_relu, name='Siamese_layer_2')
        return layer_1

    def fusion_layer(self, input, trainable):    # this is also a shared fusion layer
        fuse_layer_1 = tf.layers.dense(inputs=input, units=512, activation=tf.nn.leaky_relu, name='Fuse_layer', trainable=trainable)
        # fuse_layer_2 = tf.layers.dropout(fuse_layer_1, rate=0.5, noise_shape=None, seed=None, training=True, name='Drop_out_2')    # added dropout
        # fuse_layer_3 = tf.layers.dense(inputs=fuse_layer_1, units=128, activation=tf.nn.leaky_relu, name='Fuse_layer_3')
        return fuse_layer_1

    def choose_action(self, s_new, t_new):    # run by a local worker
        # prob_weights = self.sess.run(self.pi, feed_dict={self.tfs: s[None, :]})

        prob_weights_new = self.sess.run(self.pi_new, feed_dict={self.tfs_S: [s_new], self.tfs_T: [t_new]})
        # action = np.random.choice(range(prob_weights.shape[1]),
        #                           p=prob_weights.ravel())    # select action w.r.t. the action probabilities

        action_new = np.random.choice(range(prob_weights_new.shape[1]),
                                      p=prob_weights_new.ravel())    # select action w.r.t. the action probabilities

        # return action, action_new
        return action_new

    # def get_v(self, s):    # legacy gym version; self.tfs and self.v no longer exist
    #     if s.ndim < 2: s = s[np.newaxis, :]
    #     return self.sess.run(self.v, {self.tfs: s})[0, 0]

    def get_v_new(self, S, T):
        # if s.ndim < 2: s = s[np.newaxis, :]
        return self.sess.run(self.v_new, {self.tfs_S: [S], self.tfs_T: [T]})[0, 0]


class Worker(object):
    def __init__(self, wid, target_id):
        self.wid = wid
        # self.env = gym.make(GAME).unwrapped
        self.env_new = Environment({'scene_name': 'bathroom_02', 'terminal_state_id': int(target_id)})
        self.ppo = GLOBAL_PPO
        self.task_scope = target_id

    def work(self):
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        global GLOBAL_EP_new, GLOBAL_RUNNING_R_new, GLOBAL_UPDATE_COUNTER_new
        while not COORD.should_stop():
            # s = self.env.reset()
            self.env_new.reset()

            ep_r = 0
            ep_r_new = 0
            buffer_s, buffer_a, buffer_r = [], [], []

            buffer_s_new, buffer_t_new, buffer_a_new, buffer_r_new = [], [], [], []

            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():    # while the global PPO is updating
                    ROLLING_EVENT.wait()          # wait until PPO has been updated
                    buffer_s, buffer_a, buffer_r = [], [], []    # clear history buffers, use the new policy to collect data
                    buffer_s_new, buffer_t_new, buffer_a_new, buffer_r_new = [], [], [], []
                a_new = self.ppo.choose_action(self.env_new.s_t, self.env_new.target)    # sample an action from the current policy

                # step the environment
                self.env_new.step(a_new)

                # s_, r, done, _ = self.env.step(a)

                # r_new = self.env_new.reward    # env reward is overridden by the shaped reward below
                done_new = self.env_new.terminal
                r_new = 1 if done_new else -0.01

                # if done: r = -10
                # buffer_s.append(s)
                # buffer_a.append(a)
                # buffer_r.append(r-1)    # 0 for not down, -11 for down. Reward engineering

                buffer_s_new.append(self.env_new.s_t)
                buffer_t_new.append(self.env_new.target)
                buffer_a_new.append(a_new)
                buffer_r_new.append(r_new)

                self.env_new.update()

                # s = s_
                s_new = self.env_new.s_t
                target = self.env_new.target
                # ep_r += r
                ep_r_new += r_new

                GLOBAL_UPDATE_COUNTER += 1    # count towards the minimum batch size, no need to wait for other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE or done_new:
                    if done_new:
                        # v_s_ = 0
                        v_s_new = 0    # end of episode
                    else:
                        # v_s_ = self.ppo.get_v(s_)
                        v_s_new = self.ppo.get_v_new(s_new, target)

                    # discounted_r = []    # compute discounted reward
                    # for r in buffer_r[::-1]:
                    #     v_s_ = r + GAMMA * v_s_
                    #     discounted_r.append(v_s_)
                    # discounted_r.reverse()

                    discounted_r_new = []    # compute discounted returns
                    for r in buffer_r_new[::-1]:
                        v_s_new = r + GAMMA * v_s_new
                        discounted_r_new.append(v_s_new)
                    discounted_r_new.reverse()

                    # bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, None]
                    # buffer_s, buffer_a, buffer_r = [], [], []
                    # QUEUE.put(np.hstack((bs, ba, br)))    # put data in the queue

                    bs_new, bt_new, ba_new, br_new = np.vstack([buffer_s_new]), np.vstack([buffer_t_new]), np.vstack(buffer_a_new), np.array(discounted_r_new)[:, None]
                    bs_new = np.reshape(bs_new, [bs_new.shape[0], -1])
                    bt_new = np.reshape(bt_new, [bt_new.shape[0], -1])

                    buffer_s_new, buffer_t_new, buffer_a_new, buffer_r_new = [], [], [], []
                    QUEUE.put(np.hstack((bs_new, bt_new, ba_new, br_new)))    # put data in the queue

                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()    # stop collecting data
                        UPDATE_EVENT.set()       # trigger the global PPO update

                    if GLOBAL_EP >= EP_MAX:      # stop training
                        COORD.request_stop()
                        break

                    if done_new: break

            # record reward changes, plot later
            # if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r)
            # else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1]*0.9+ep_r*0.1)
            # GLOBAL_EP += 1
            # print('{0:.1f}%'.format(GLOBAL_EP/EP_MAX*100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r,)

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0: GLOBAL_RUNNING_R.append(ep_r_new)
            else: GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r_new * 0.1)
            GLOBAL_EP += 1
            print('{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100), '|W%i' % self.wid, '|Ep_r: %.2f' % ep_r_new)


if __name__ == '__main__':
    GLOBAL_PPO = PPONet()
    UPDATE_EVENT, ROLLING_EVENT = threading.Event(), threading.Event()
    UPDATE_EVENT.clear()    # no update at the start
    ROLLING_EVENT.set()     # start rolling out

    network_scope = TASK_TYPE
    list_of_tasks = TASK_LIST
    scene_scopes = list_of_tasks.keys()

    branches = []
    for scene in scene_scopes:
        for task in list_of_tasks[scene]:
            branches.append((scene, task))

    NUM_TASKS = len(branches)

    workers = []
    for i in range(N_WORKER):    # this is the parallel size
        scene, task = branches[i % NUM_TASKS]
        training_thread = Worker(wid=i, target_id=task)
        workers.append(training_thread)

    # workers = [Worker(wid=i, target_id=i) for i in range(N_WORKER)]

    GLOBAL_UPDATE_COUNTER, GLOBAL_EP = 0, 0
    GLOBAL_RUNNING_R = []
    GLOBAL_RUNNING_R_new = []
    COORD = tf.train.Coordinator()
    QUEUE = queue.Queue()    # workers put data in this queue
    threads = []
    for worker in workers:   # worker threads
        t = threading.Thread(target=worker.work, args=())
        t.start()            # training
        threads.append(t)

    # add a PPO updating thread
    threads.append(threading.Thread(target=GLOBAL_PPO.update,))
    threads[-1].start()
    COORD.join(threads)

    # plot reward changes and test
    plt.plot(np.arange(len(GLOBAL_RUNNING_R)), GLOBAL_RUNNING_R)
    plt.xlabel('Episode'); plt.ylabel('Moving reward'); plt.ion(); plt.show()
    # env = gym.make('CartPole-v0')

    env = Environment({'scene_name': 'bathroom_02', 'terminal_state_id': int(26)})
    while True:
        env.reset()
        for t in range(1000):
            env.step(GLOBAL_PPO.choose_action(env.s_t, env.target))
            env.update()
            print(t)
            if env.terminal:
                break

--------------------------------------------------------------------------------
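The actor objective assembled in PPONet.__init__ is the standard PPO clipped surrogate. As a plain-NumPy illustration of what that part of the graph computes for one batch (an illustrative sketch, not code used by this repository):

import numpy as np

def clipped_surrogate_loss(pi_prob, oldpi_prob, adv, epsilon=0.2):
    # pi_prob, oldpi_prob: probabilities of the taken actions under the new and old policies
    # adv: advantage estimates; epsilon matches EPSILON in discrete_DPPO.py
    ratio = pi_prob / oldpi_prob
    surrogate = ratio * adv
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv
    return -np.mean(np.minimum(surrogate, clipped))

The training op then minimizes this together with 0.5 times the squared advantage (the critic loss), as in total_loss above.

--------------------------------------------------------------------------------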
/scene_loader.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import sys
import h5py
import json
import numpy as np
import random
import skimage.io
from skimage.transform import resize
from constants import ACTION_SIZE
from constants import SCREEN_WIDTH
from constants import SCREEN_HEIGHT
from constants import HISTORY_LENGTH

class THORDiscreteEnvironment(object):

    def __init__(self, config=dict()):

        # configurations
        self.scene_name = config.get('scene_name', 'bedroom_04')
        self.random_start = config.get('random_start', True)
        self.n_feat_per_location = config.get('n_feat_per_location', 1)    # 1 for no sampling
        self.terminal_state_id = config.get('terminal_state_id', 0)

        self.h5_file_path = config.get('h5_file_path', 'data/%s.h5' % self.scene_name)
        self.h5_file = h5py.File(self.h5_file_path, 'r')

        self.locations = self.h5_file['location'][()]
        self.rotations = self.h5_file['rotation'][()]
        self.n_locations = self.locations.shape[0]

        self.terminals = np.zeros(self.n_locations)
        self.terminals[self.terminal_state_id] = 1
        self.terminal_states, = np.where(self.terminals)

        self.transition_graph = self.h5_file['graph'][()]
        self.shortest_path_distances = self.h5_file['shortest_path_distance'][()]

        self.history_length = HISTORY_LENGTH
        self.screen_height = SCREEN_HEIGHT
        self.screen_width = SCREEN_WIDTH

        # we use pre-computed 2048-d ResNet-50 features instead of raw frames
        # self.s_t = np.zeros([self.screen_height, self.screen_width, self.history_length])
        self.s_t = np.zeros([2048, self.history_length])
        self.s_t1 = np.zeros_like(self.s_t)
        self.s_target = self._tiled_state(self.terminal_state_id)

        self.reset()

    # public methods

    def reset(self):
        # randomize the initial state
        while True:
            k = random.randrange(self.n_locations)
            min_d = np.inf
            # check if the target is reachable
            for t_state in self.terminal_states:
                dist = self.shortest_path_distances[k][t_state]
                min_d = min(min_d, dist)
            # min_d = 0 if k is a terminal state
            # min_d = -1 if no terminal state is reachable from k
            if min_d > 0: break

        # reset parameters
        self.current_state_id = k
        self.s_t = self._tiled_state(self.current_state_id)

        self.reward = 0
        self.collided = False
        self.terminal = False

    def step(self, action):
        assert not self.terminal, 'step() called in terminal state'
        k = self.current_state_id
        if self.transition_graph[k][action] != -1:
            self.current_state_id = self.transition_graph[k][action]
            if self.terminals[self.current_state_id]:
                self.terminal = True
                self.collided = False
            else:
                self.terminal = False
                self.collided = False
        else:
            self.terminal = False
            self.collided = True

        self.reward = self._reward(self.terminal, self.collided)
        self.s_t1 = np.append(self.s_t[:, 1:], self.state, axis=1)

    def update(self):
        self.s_t = self.s_t1

    # private methods

    def _tiled_state(self, state_id):
        k = random.randrange(self.n_feat_per_location)
        f = self.h5_file['resnet_feature'][state_id][k][:, np.newaxis]
        return np.tile(f, (1, self.history_length))

    def _reward(self, terminal, collided):
        # positive reward upon task completion
        if terminal: return 10.0
        # collision penalty or time penalty
        return -0.1 if collided else -0.01

    # properties

    @property
    def action_size(self):
        # move forward/backward, turn left/right for navigation
        return ACTION_SIZE

    @property
    def action_definitions(self):
        action_vocab = ["MoveForward", "RotateRight", "RotateLeft", "MoveBackward"]
        return action_vocab[:ACTION_SIZE]

    @property
    def observation(self):
        return self.h5_file['observation'][self.current_state_id]

    @property
    def state(self):
        # read from the hdf5 cache
        k = random.randrange(self.n_feat_per_location)
        return self.h5_file['resnet_feature'][self.current_state_id][k][:, np.newaxis]

    @property
    def target(self):
        return self.s_target

    @property
    def x(self):
        return self.locations[self.current_state_id][0]

    @property
    def z(self):
        return self.locations[self.current_state_id][1]

    @property
    def r(self):
        return self.rotations[self.current_state_id]


if __name__ == "__main__":
    scene_name = 'bedroom_04'

    env = THORDiscreteEnvironment({
        'random_start': True,
        'scene_name': scene_name,
        'h5_file_path': 'data/%s.h5' % scene_name
    })
--------------------------------------------------------------------------------
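To sanity-check a scene dump before training, the environment above can be driven with random actions. A minimal sketch, assuming the corresponding data/bathroom_02.h5 file is available (the HDF5 scene dumps are not included in this repository):

import random
from scene_loader import THORDiscreteEnvironment

env = THORDiscreteEnvironment({'scene_name': 'bathroom_02', 'terminal_state_id': 26})
env.reset()
for t in range(100):
    env.step(random.randrange(env.action_size))  # pick a random action
    env.update()                                 # advance the 4-frame feature history
    if env.terminal:
        print('reached the target in %d steps' % (t + 1))
        break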