├── README.md
├── dqn.py
├── hierarchical_dqn.py
├── replay_buffer.py
└── train_dqn.py

/README.md:
--------------------------------------------------------------------------------
# Hierarchical-DQN

Hierarchical DQN (h-DQN, Kulkarni et al. 2016) and a baseline DQN in TensorFlow, with a training script for OpenAI Gym's MountainCar-v0.
--------------------------------------------------------------------------------
/dqn.py:
--------------------------------------------------------------------------------
import random

import numpy as np
import tensorflow as tf

from replay_buffer import ReplayBuffer


class DqnAgent(object):

    # Discount factor for future rewards.
    DISCOUNT = 0.99
    # Max size of the replay buffer.
    REPLAY_MEMORY_SIZE = 500000
    # Batch size for updates from the replay buffer.
    BATCH_SIZE = 32
    # Initial size of replay memory prior to beginning sampling batches.
    REPLAY_MEMORY_INIT_SIZE = 5000
    # Update the target network every TARGET_UPDATE timesteps.
    TARGET_UPDATE = 1000  # 10000

    def __init__(self, sess=None, learning_rate=0.00025, state_dims=[], num_actions=0,
                 epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=50000,
                 replay_memory_init_size=None, target_update=None):

        self._learning_rate = learning_rate
        self._state_dims = state_dims
        self._num_actions = num_actions

        # Linearly annealed exploration schedule.
        self._epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
        self._epsilon_decay_steps = epsilon_decay_steps

        if replay_memory_init_size is not None:
            self.REPLAY_MEMORY_INIT_SIZE = replay_memory_init_size

        if target_update is not None:
            self.TARGET_UPDATE = target_update

        self._replay_buffer = ReplayBuffer(
            self.REPLAY_MEMORY_SIZE,
            self.REPLAY_MEMORY_INIT_SIZE,
            self.BATCH_SIZE)

        self._current_time_step = 0

        with tf.Graph().as_default():
            self._construct_graph()
            self._saver = tf.train.Saver()
            if sess is None:
                self.sess = tf.Session()
            else:
                self.sess = sess
            self.sess.run(tf.global_variables_initializer())

    def _q_network(self, state):
        layer1 = tf.contrib.layers.fully_connected(state, 64, activation_fn=tf.nn.relu)
        q_values = tf.contrib.layers.fully_connected(layer1, self._num_actions, activation_fn=None)
        return q_values

    def _construct_graph(self):
        shape = [None]
        for dim in self._state_dims:
            shape.append(dim)
        self._state = tf.placeholder(shape=shape, dtype=tf.float32)

        with tf.variable_scope('q_network'):
            self._q_values = self._q_network(self._state)
        with tf.variable_scope('target_q_network'):
            self._target_q_values = self._q_network(self._state)

        with tf.variable_scope('q_network_update'):
            self._picked_actions = tf.placeholder(shape=[None, 2], dtype=tf.int32)
            self._td_targets = tf.placeholder(shape=[None], dtype=tf.float32)
            self._q_values_pred = tf.gather_nd(self._q_values, self._picked_actions)
            self._losses = clipped_error(self._q_values_pred - self._td_targets)
            self._loss = tf.reduce_mean(self._losses)

            self.optimizer = tf.train.RMSPropOptimizer(self._learning_rate)

            grads_and_vars = self.optimizer.compute_gradients(self._loss, tf.trainable_variables())
            grads = [gv[0] for gv in grads_and_vars]
            params = [gv[1] for gv in grads_and_vars]
            # Clip gradients by global norm to stabilize training.
            grads = tf.clip_by_global_norm(grads, 5.0)[0]

            clipped_grads_and_vars = list(zip(grads, params))
            self.train_op = self.optimizer.apply_gradients(
                clipped_grads_and_vars,
                global_step=tf.contrib.framework.get_global_step())

        with tf.name_scope('target_network_update'):
            q_network_params = [t for t in tf.trainable_variables()
                                if t.name.startswith('q_network')]
            q_network_params = sorted(q_network_params, key=lambda v: v.name)

            target_q_network_params = [t for t in tf.trainable_variables()
                                       if t.name.startswith('target_q_network')]
            target_q_network_params = sorted(target_q_network_params, key=lambda v: v.name)

            # Ops that copy the online network's weights into the target network.
            self.target_update_ops = []
            for e1_v, e2_v in zip(q_network_params, target_q_network_params):
                op = e2_v.assign(e1_v)
                self.target_update_ops.append(op)

    def sample(self, state):
        """Returns an epsilon-greedy action for the given state."""
        self._current_time_step += 1
        q_values = self.sess.run(self._q_values, {self._state: state})

        epsilon = self._epsilons[min(self._current_time_step, self._epsilon_decay_steps - 1)]

        e = random.random()
        if e < epsilon:
            return random.randint(0, self._num_actions - 1)
        else:
            return np.argmax(q_values)

    def best_action(self, state):
        """Returns the greedy action for the given state."""
        q_values = self.sess.run(self._q_values, {self._state: state})
        return np.argmax(q_values)

    def store(self, state, action, reward, next_state, terminal, eval=False, curr_reward=False):
        if not eval:
            self._replay_buffer.add(state, action, reward, next_state, terminal)

    def update(self):
        states, actions, rewards, next_states, terminals = self._replay_buffer.sample()
        # Pair each action with its row index for tf.gather_nd.
        actions = list(zip(np.arange(len(actions)), actions))

        if len(states) > 0:
            next_states_q_values = self.sess.run(self._target_q_values, {self._state: next_states})
            next_states_max_q_values = np.max(next_states_q_values, axis=1)
            td_targets = rewards + (1 - terminals) * self.DISCOUNT * next_states_max_q_values

            feed_dict = {self._state: states,
                         self._picked_actions: actions,
                         self._td_targets: td_targets}

            _ = self.sess.run(self.train_op, feed_dict=feed_dict)

        # Update the target q-network.
        if not self._current_time_step % self.TARGET_UPDATE:
            self.sess.run(self.target_update_ops)


def clipped_error(x):
    # Huber loss: quadratic for |x| < 1, linear otherwise.
    return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)


def compute_gradients(tensor, var_list):
    grads = tf.gradients(tensor, var_list)
    return [grad if grad is not None else tf.zeros_like(var)
            for var, grad in zip(var_list, grads)]
--------------------------------------------------------------------------------
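The training script at the end of this listing (train_dqn.py) drives DqnAgent through its sample/store/update interface with extra experiment plumbing. For orientation, here is a minimal illustrative driver loop -- a sketch assuming the same TF1-era TensorFlow and the pre-0.26 gym API (env.step returning a 4-tuple) that the rest of the repository targets; the step budget and the two buffer hyperparameters are arbitrary choices, not values from the repository:

import gym
import numpy as np

from dqn import DqnAgent

env = gym.make('MountainCar-v0')
# MountainCar observations are 2-dimensional; the action space has 3 discrete actions.
agent = DqnAgent(state_dims=[2], num_actions=env.action_space.n,
                 replay_memory_init_size=500, target_update=100)

state = np.expand_dims(env.reset(), axis=0)
for _ in range(10000):
    action = agent.sample(state)                        # epsilon-greedy action
    next_state, reward, terminal, _ = env.step(action)
    next_state = np.expand_dims(next_state, axis=0)
    agent.store(state, action, reward, next_state, terminal)
    agent.update()                                      # no-op until the buffer reaches its init size
    state = np.expand_dims(env.reset(), axis=0) if terminal else next_state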
/hierarchical_dqn.py:
--------------------------------------------------------------------------------
"""
Hierarchical DQN implementation as described in Kulkarni et al.
https://arxiv.org/pdf/1604.06057.pdf
@author: Saurabh Kumar
"""

from collections import defaultdict
import sys

import numpy as np

from dqn import DqnAgent


class HierarchicalDqnAgent(object):
    INTRINSIC_STEP_COST = -1  # Step cost for the controller.

    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 subgoals=None,
                 num_subgoals=0,
                 num_primitive_actions=0,
                 meta_controller_state_fn=None,
                 check_subgoal_fn=None):
        """Initializes a hierarchical DQN agent.

        Args:
            learning_rates: learning rates of the meta-controller and controller agents.
            state_sizes: state sizes of the meta-controller and controller agents.
                State sizes are assumed to be 1-dimensional.
            subgoals: array of subgoals for the meta-controller.
            num_subgoals: the action space of the meta-controller.
            num_primitive_actions: the action space of the controller.
            meta_controller_state_fn: function that returns the state of the meta-controller.
            check_subgoal_fn: function that checks if the agent has satisfied a particular subgoal.
        """
        self._meta_controller = DqnAgent(state_dims=[state_sizes[0]],
                                         num_actions=num_subgoals,
                                         learning_rate=learning_rates[0],
                                         epsilon_end=0.01)

        # The controller sees the environment state concatenated with a one-hot subgoal.
        self._controller = DqnAgent(learning_rate=learning_rates[1],
                                    num_actions=num_primitive_actions,
                                    state_dims=[state_sizes[1] + num_subgoals],
                                    epsilon_end=0.01)

        self._subgoals = subgoals
        self._num_subgoals = num_subgoals

        self._meta_controller_state_fn = meta_controller_state_fn
        self._check_subgoal_fn = check_subgoal_fn

        self._meta_controller_state = None
        # Raw environment state passed through to meta_controller_state_fn when provided.
        self._original_state = None
        self._curr_subgoal = None
        self._meta_controller_reward = 0
        self._intrinsic_time_step = 0
        self._episode = 0

    def get_meta_controller_state(self, state):
        returned_state = state
        if self._meta_controller_state_fn:
            returned_state = self._meta_controller_state_fn(state, self._original_state)
        return np.copy(returned_state)

    def get_controller_state(self, state, subgoal_index):
        """Concatenates the environment state with the current subgoal."""
        # curr_subgoal is a 1-hot vector indicating the subgoal selected by the meta-controller.
        curr_subgoal = np.array(self._subgoals[subgoal_index])

        # Flatten any batch dimension, then concatenate the environment state with the subgoal.
        controller_state = np.squeeze(np.array(state))
        controller_state = np.concatenate((controller_state, curr_subgoal), axis=0)

        return np.copy(controller_state)

    def intrinsic_reward(self, state, subgoal_index):
        # Intrinsically rewards the controller - this is the critic in the h-DQN algorithm.
        if self.subgoal_completed(state, subgoal_index):
            return 1
        else:
            return self.INTRINSIC_STEP_COST

    def subgoal_completed(self, state, subgoal_index):
        # Checks whether the controller has completed the currently specified subgoal.
        if self._check_subgoal_fn is None:
            return state == self._subgoals[subgoal_index]
        else:
            return self._check_subgoal_fn(state, subgoal_index)

    def store(self, state, action, reward, next_state, terminal, eval=False):
        """Stores the current transition in replay memory.

        The transition is stored in the replay memory of the controller.
        If the transition culminates in a subgoal's completion or a terminal state, a
        transition for the meta-controller is constructed and stored in its replay buffer.

        Args:
            state: current state
            action: primitive action taken
            reward: reward received from the state-action pair
            next_state: next state
            terminal: extrinsic terminal (True or False)
            eval: whether the current episode is a train or eval episode.
        """
        # Compute the controller state, reward, next state, and terminal.
        intrinsic_state = np.copy(self.get_controller_state(state, self._curr_subgoal))
        intrinsic_next_state = np.copy(self.get_controller_state(next_state, self._curr_subgoal))
        intrinsic_reward = self.intrinsic_reward(next_state, self._curr_subgoal)
        subgoal_completed = self.subgoal_completed(next_state, self._curr_subgoal)
        intrinsic_terminal = subgoal_completed or terminal

        # Store the controller transition in memory.
        self._controller.store(intrinsic_state, action,
                               intrinsic_reward, intrinsic_next_state, intrinsic_terminal, eval)

        self._meta_controller_reward += reward

        if terminal and not eval:
            self._episode += 1

        if subgoal_completed or terminal:
            # Store the meta-controller transition in memory.
            meta_controller_state = np.copy(self._meta_controller_state)
            next_meta_controller_state = np.copy(self.get_meta_controller_state(next_state))

            self._meta_controller.store(meta_controller_state, self._curr_subgoal,
                                        self._meta_controller_reward, next_meta_controller_state,
                                        terminal, eval)

            # Reset the current meta-controller state and current subgoal to None
            # since the current subgoal is finished. Also reset the meta-controller's reward.
            self._meta_controller_state = None
            self._curr_subgoal = None
            self._meta_controller_reward = 0
            self._intrinsic_time_step = 0

    def sample(self, state):
        """Samples an action from the hierarchical DQN agent.

        Samples a subgoal if necessary from the meta-controller and samples a primitive action
        from the controller.

        Args:
            state: the current environment state.

        Returns:
            action: a sampled primitive action.
        """
        self._intrinsic_time_step += 1

        # If the meta-controller state is None, either this is a new episode
        # or a subgoal has just been completed.
        if self._meta_controller_state is None:
            self._meta_controller_state = self.get_meta_controller_state(state)
            self._curr_subgoal = self._meta_controller.sample([self._meta_controller_state])

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.sample([controller_state])

        return action

    def best_action(self, state):
        """Returns the greedy action from the hierarchical DQN agent.

        Gets the greedy subgoal if necessary from the meta-controller and gets
        the greedy primitive action from the controller.

        Args:
            state: the current environment state.

        Returns:
            action: the controller's greedy primitive action.
        """
        # If the meta-controller state is None, either this is a new episode
        # or a subgoal has just been completed.
        if self._meta_controller_state is None:
            self._meta_controller_state = self.get_meta_controller_state(state)
            self._curr_subgoal = self._meta_controller.best_action([self._meta_controller_state])

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.best_action([controller_state])
        return action

    def update(self):
        self._controller.update()
        # Only update the meta-controller right after a meta-controller transition has taken
        # place, which occurs only when either a subgoal has been completed or the agent has
        # reached a terminal state.
        if self._meta_controller_state is None:
            self._meta_controller.update()
--------------------------------------------------------------------------------
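HierarchicalDqnAgent delegates the two environment-specific decisions to callbacks: meta_controller_state_fn maps a raw observation to the meta-controller's 1-dimensional state, and check_subgoal_fn decides when a subgoal is satisfied; the subgoals array itself is expected to hold the one-hot encodings that get appended to the controller state. train_dqn.py builds these from a clustering module that is not part of this listing, so the sketch below is only a hypothetical example of the expected signatures, with made-up subgoal centres and an arbitrary distance threshold for MountainCar's (position, velocity) space:

import numpy as np

# Hypothetical subgoal "centres" in MountainCar's (position, velocity) space.
SUBGOALS = [np.array([-0.9, 0.0]), np.array([-0.3, 0.0]), np.array([0.5, 0.0])]

def meta_controller_state_fn(state, original_state):
    """Returns a one-hot vector over subgoal regions for the meta-controller."""
    dists = [np.linalg.norm(np.squeeze(state) - centre) for centre in SUBGOALS]
    one_hot = np.zeros(len(SUBGOALS))
    one_hot[int(np.argmin(dists))] = 1.0
    return one_hot

def check_subgoal_fn(state, subgoal_index):
    """A subgoal counts as completed when the agent is near its centre."""
    return np.linalg.norm(np.squeeze(state) - SUBGOALS[subgoal_index]) < 0.1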
/replay_buffer.py:
--------------------------------------------------------------------------------
import random

import numpy as np


class ReplayBuffer(object):

    def __init__(self, max_size, init_size, batch_size):
        self.max_size = max_size
        self.init_size = init_size
        self.batch_size = batch_size

        self.states = np.array([None] * self.max_size)
        self.actions = np.array([None] * self.max_size)
        self.rewards = np.array([None] * self.max_size)
        self.next_states = np.array([None] * self.max_size)
        self.terminals = np.array([None] * self.max_size)

        self.curr_pointer = 0
        self.curr_size = 0

    def add(self, state, action, reward, next_state, terminal):
        self.states[self.curr_pointer] = np.squeeze(state)
        self.actions[self.curr_pointer] = action
        self.rewards[self.curr_pointer] = reward
        self.next_states[self.curr_pointer] = np.squeeze(next_state)
        self.terminals[self.curr_pointer] = terminal

        self.curr_pointer += 1
        self.curr_size = min(self.max_size, self.curr_size + 1)
        # If the replay buffer is full, wrap the pointer back to the beginning of the buffer.
        if self.curr_pointer >= self.max_size:
            self.curr_pointer -= self.max_size

    def sample(self):
        # Return empty batches until the buffer has reached its initial fill size.
        if self.curr_size < self.init_size:
            return [], [], [], [], []
        sample_indices = []

        # Ensure that the most recent transition is in the returned batch.
        sample_indices.append(self.curr_pointer - 1)
        for _ in range(self.batch_size - 1):
            sample_indices.append(random.randint(0, self.curr_size - 1))

        returned_states = []
        returned_actions = []
        returned_rewards = []
        returned_next_states = []
        returned_terminals = []

        for index in sample_indices:
            returned_states.append(self.states[index])
            returned_actions.append(self.actions[index])
            returned_rewards.append(self.rewards[index])
            returned_next_states.append(self.next_states[index])
            returned_terminals.append(self.terminals[index])

        return (np.array(returned_states), np.array(returned_actions),
                np.array(returned_rewards), np.array(returned_next_states),
                np.array(returned_terminals))
        # Equivalent vectorized form:
        # return (self.states[sample_indices], self.actions[sample_indices],
        #         self.rewards[sample_indices], self.next_states[sample_indices],
        #         self.terminals[sample_indices])
--------------------------------------------------------------------------------
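Two behaviours of ReplayBuffer matter for DqnAgent: sample() returns five empty lists until init_size transitions have been added (so DqnAgent.update is a no-op during warm-up), and every non-empty batch always contains the most recently added transition in its first slot. A small illustrative check, with arbitrary transition values:

import numpy as np

from replay_buffer import ReplayBuffer

buf = ReplayBuffer(max_size=100, init_size=5, batch_size=4)

for t in range(6):
    state = np.array([float(t), 0.0])
    buf.add(state, action=0, reward=-1.0, next_state=state + 0.1, terminal=False)

states, actions, rewards, next_states, terminals = buf.sample()
print(states.shape)   # (4, 2): batch_size transitions of the 2-d states stored above
print(states[0])      # the first entry is always the most recent transition, [5. 0.]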
/train_dqn.py:
--------------------------------------------------------------------------------
"""
@author: Saurabh Kumar
"""

import os

import matplotlib
matplotlib.use('Agg')

import clustering  # Provides get_cluster_fn(); not included in this listing.
import dqn
import gym
from gym.wrappers import Monitor
import hierarchical_dqn
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import pickle

tf.flags.DEFINE_string('agent_type', 'h_dqn', 'RL agent type.')
tf.flags.DEFINE_string('logdir', 'experiment_logs/', 'Directory of logfile.')
tf.flags.DEFINE_string('experiment_dir', '', 'Directory of experiment files.')
tf.flags.DEFINE_string('logfile', 'log.txt', 'Name of the logfile.')
tf.flags.DEFINE_string('env_name', 'MountainCar-v0', 'Name of the environment.')

FLAGS = tf.flags.FLAGS


def log(logfile, iteration, rewards):
    """Logs the reward statistics obtained by the agent.

    Args:
        logfile: File to log reward statistics.
        iteration: The current iteration.
        rewards: Array of rewards obtained in the current iteration.
    """
    log_string = '{} {} {} {}'.format(
        iteration, np.min(rewards), np.mean(rewards), np.max(rewards))
    print(log_string)

    with open(logfile, 'a') as f:
        f.write(log_string + '\n')


def make_environment(env_name):
    return gym.make(env_name)


def make_agent(agent_type, env, num_clusters=4, use_extra_bit=False):
    """Constructs the requested agent.

    num_clusters and use_extra_bit only affect the 'h_dqn' agent; the defaults
    here are placeholders and should be tuned for the chosen environment.
    """
    if agent_type == 'dqn':
        # MountainCar's do-nothing action is stripped out in run(), so the agent
        # chooses between 2 actions rather than env.action_space.n.
        return dqn.DqnAgent(state_dims=[2],
                            num_actions=2)
    elif agent_type == 'h_dqn':
        meta_controller_state_fn, check_subgoal_fn, num_subgoals, subgoals = clustering.get_cluster_fn(
            n_clusters=num_clusters, extra_bit=use_extra_bit)

        # Only the options supported by HierarchicalDqnAgent (hierarchical_dqn.py) are passed.
        return hierarchical_dqn.HierarchicalDqnAgent(
            state_sizes=[num_subgoals, 2],
            subgoals=subgoals,
            num_subgoals=num_subgoals,
            num_primitive_actions=2,  # see the action remapping in run()
            meta_controller_state_fn=meta_controller_state_fn,
            check_subgoal_fn=check_subgoal_fn)


def run(env_name='MountainCar-v0',
        agent_type='dqn',
        num_iterations=1000,
        num_train_episodes=100,
        num_eval_episodes=100,
        logdir=None,
        experiment_dir=None,
        logfile=None):
    """Executes RL training and evaluation.

    Args:
        env_name: Name of the environment that the agent will interact with.
        agent_type: The type of RL agent that will be used for training.
        num_iterations: Number of iterations to train for.
        num_train_episodes: Number of training episodes per iteration.
        num_eval_episodes: Number of evaluation episodes per iteration.
        logdir: Directory for the log file.
        experiment_dir: Directory of experiment files.
        logfile: File to log the agent's performance over training.
    """
    experiment_dir += '_agent_type_' + agent_type
    experiment_dir = logdir + experiment_dir
    logfile = experiment_dir + '/' + logfile

    # Create the experiment directory if it does not already exist.
    if not os.path.exists(experiment_dir):
        os.mkdir(experiment_dir)

    env = make_environment(env_name)
    env_test = make_environment(env_name)
    # env_test = Monitor(env_test, directory='videos/', video_callable=lambda x: True, resume=True)
    print('Made environment!')
    agent = make_agent(agent_type, env)
    print('Made agent!')

    for it in range(num_iterations):

        # Run train episodes.
        for train_episode in range(num_train_episodes):
            # Reset the environment.
            state = env.reset()
            state = np.expand_dims(state, axis=0)

            episode_reward = 0

            # Run the episode.
            terminal = False

            while not terminal:
                action = agent.sample(state)
                # Remap onto the environment's action space: MountainCar's middle
                # action (1) is "do nothing", which the agent is not given.
                env_action = action
                if env_name == 'MountainCar-v0' and action == 1:
                    env_action = 2

                next_state, reward, terminal, _ = env.step(env_action)
                next_state = np.expand_dims(next_state, axis=0)

                agent.store(state, action, reward, next_state, terminal)
                agent.update()

                episode_reward += reward
                # Update the state.
                state = next_state

        eval_rewards = []

        # Run eval episodes.
        for eval_episode in range(num_eval_episodes):

            # Reset the environment.
            state = env_test.reset()
            state = np.expand_dims(state, axis=0)

            episode_reward = 0

            # Run the episode.
            terminal = False

            while not terminal:
                # Both agent types expose a greedy best_action(state). (The original
                # subgoal heat-map bookkeeping relied on an extended agent interface
                # that is not part of this listing and is omitted here.)
                action = agent.best_action(state)

                # Remove the do-nothing action.
                env_action = action
                if env_name == 'MountainCar-v0' and action == 1:
                    env_action = 2

                next_state, reward, terminal, _ = env_test.step(env_action)
                next_state = np.expand_dims(next_state, axis=0)
                # env_test.render()
                agent.store(state, action, reward, next_state, terminal, eval=True)
                if reward > 1:
                    reward = 1  # For the sake of comparison.

                episode_reward += reward

                state = next_state

            eval_rewards.append(episode_reward)

        with open(experiment_dir + '/eval_rewards_' + str(it), 'wb') as f:
            pickle.dump(eval_rewards, f)

        log(logfile, it, eval_rewards)


if __name__ == '__main__':
    run(env_name=FLAGS.env_name,
        agent_type=FLAGS.agent_type,
        logdir=FLAGS.logdir,
        experiment_dir=FLAGS.experiment_dir,
        logfile=FLAGS.logfile)
--------------------------------------------------------------------------------
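train_dqn.py is normally launched through its tf.flags entry point, for example: python train_dqn.py --agent_type=dqn --experiment_dir=run1. With the __main__ guard above, run() can also be called directly from Python. The sketch below is a small smoke-test configuration with illustrative argument values; it still requires the clustering module imported at the top of train_dqn.py to be importable, and the logdir directory to already exist:

from train_dqn import run

# A short smoke-test configuration; directory names and episode counts are illustrative.
run(env_name='MountainCar-v0',
    agent_type='dqn',
    num_iterations=2,
    num_train_episodes=5,
    num_eval_episodes=5,
    logdir='experiment_logs/',
    experiment_dir='smoke_test',
    logfile='log.txt')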