├── README.md
├── dqn.py
├── hierarchical_dqn.py
├── replay_buffer.py
└── train_dqn.py

/README.md:
--------------------------------------------------------------------------------
# Hierarchical-DQN

Hierarchical DQN (h-DQN, Kulkarni et al. 2016) and a baseline DQN in TensorFlow, with a training script for OpenAI Gym's MountainCar-v0.
--------------------------------------------------------------------------------
/dqn.py:
--------------------------------------------------------------------------------
import random

import numpy as np
import tensorflow as tf

from replay_buffer import ReplayBuffer


class DqnAgent(object):

    # Discount factor for future rewards.
    DISCOUNT = 0.99
    # Max size of the replay buffer.
    REPLAY_MEMORY_SIZE = 500000
    # Batch size for updates from the replay buffer.
    BATCH_SIZE = 32
    # Initial size of replay memory prior to beginning sampling batches.
    REPLAY_MEMORY_INIT_SIZE = 5000
    # Update the target network every TARGET_UPDATE timesteps.
    TARGET_UPDATE = 1000  # 10000

    def __init__(self, sess=None, learning_rate=0.00025, state_dims=[], num_actions=0,
                 epsilon_start=1.0, epsilon_end=0.1, epsilon_decay_steps=50000,
                 replay_memory_init_size=None, target_update=None):

        self._learning_rate = learning_rate
        self._state_dims = state_dims
        self._num_actions = num_actions

        # Linearly annealed exploration schedule.
        self._epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
        self._epsilon_decay_steps = epsilon_decay_steps

        if replay_memory_init_size is not None:
            self.REPLAY_MEMORY_INIT_SIZE = replay_memory_init_size

        if target_update is not None:
            self.TARGET_UPDATE = target_update

        self._replay_buffer = ReplayBuffer(
            self.REPLAY_MEMORY_SIZE,
            self.REPLAY_MEMORY_INIT_SIZE,
            self.BATCH_SIZE)

        self._current_time_step = 0

        with tf.Graph().as_default():
            self._construct_graph()
            self._saver = tf.train.Saver()
            if sess is None:
                self.sess = tf.Session()
            else:
                self.sess = sess
            self.sess.run(tf.global_variables_initializer())

    def _q_network(self, state):
        layer1 = tf.contrib.layers.fully_connected(state, 64, activation_fn=tf.nn.relu)
        q_values = tf.contrib.layers.fully_connected(layer1, self._num_actions, activation_fn=None)
        return q_values

    def _construct_graph(self):
        shape = [None]
        for dim in self._state_dims:
            shape.append(dim)
        self._state = tf.placeholder(shape=shape, dtype=tf.float32)

        with tf.variable_scope('q_network'):
            self._q_values = self._q_network(self._state)
        with tf.variable_scope('target_q_network'):
            self._target_q_values = self._q_network(self._state)

        with tf.variable_scope('q_network_update'):
            self._picked_actions = tf.placeholder(shape=[None, 2], dtype=tf.int32)
            self._td_targets = tf.placeholder(shape=[None], dtype=tf.float32)
            self._q_values_pred = tf.gather_nd(self._q_values, self._picked_actions)
            self._losses = clipped_error(self._q_values_pred - self._td_targets)
            self._loss = tf.reduce_mean(self._losses)

            self.optimizer = tf.train.RMSPropOptimizer(self._learning_rate)

            grads_and_vars = self.optimizer.compute_gradients(self._loss, tf.trainable_variables())
            grads = [gv[0] for gv in grads_and_vars]
            params = [gv[1] for gv in grads_and_vars]
            # Clip gradients by global norm to stabilize training.
            grads = tf.clip_by_global_norm(grads, 5.0)[0]

            clipped_grads_and_vars = list(zip(grads, params))
            self.train_op = self.optimizer.apply_gradients(
                clipped_grads_and_vars,
                global_step=tf.contrib.framework.get_global_step())

        with tf.name_scope('target_network_update'):
            q_network_params = [t for t in tf.trainable_variables()
                                if t.name.startswith('q_network')]
            q_network_params = sorted(q_network_params, key=lambda v: v.name)

            target_q_network_params = [t for t in tf.trainable_variables()
                                       if t.name.startswith('target_q_network')]
            target_q_network_params = sorted(target_q_network_params, key=lambda v: v.name)

            # Ops that copy the online network's weights into the target network.
            self.target_update_ops = []
            for e1_v, e2_v in zip(q_network_params, target_q_network_params):
                op = e2_v.assign(e1_v)
                self.target_update_ops.append(op)

    def sample(self, state):
        """Returns an epsilon-greedy action for the given state."""
        self._current_time_step += 1
        q_values = self.sess.run(self._q_values, {self._state: state})

        epsilon = self._epsilons[min(self._current_time_step, self._epsilon_decay_steps - 1)]

        e = random.random()
        if e < epsilon:
            return random.randint(0, self._num_actions - 1)
        else:
            return np.argmax(q_values)

    def best_action(self, state):
        """Returns the greedy action for the given state."""
        q_values = self.sess.run(self._q_values, {self._state: state})
        return np.argmax(q_values)

    def store(self, state, action, reward, next_state, terminal, eval=False, curr_reward=False):
        if not eval:
            self._replay_buffer.add(state, action, reward, next_state, terminal)

    def update(self):
        states, actions, rewards, next_states, terminals = self._replay_buffer.sample()
        # Pair each action with its row index for tf.gather_nd.
        actions = list(zip(np.arange(len(actions)), actions))

        if len(states) > 0:
            next_states_q_values = self.sess.run(self._target_q_values, {self._state: next_states})
            next_states_max_q_values = np.max(next_states_q_values, axis=1)
            td_targets = rewards + (1 - terminals) * self.DISCOUNT * next_states_max_q_values

            feed_dict = {self._state: states,
                         self._picked_actions: actions,
                         self._td_targets: td_targets}

            _ = self.sess.run(self.train_op, feed_dict=feed_dict)

        # Update the target q-network.
        if not self._current_time_step % self.TARGET_UPDATE:
            self.sess.run(self.target_update_ops)


def clipped_error(x):
    # Huber loss: quadratic for |x| < 1, linear otherwise.
    return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)


def compute_gradients(tensor, var_list):
    grads = tf.gradients(tensor, var_list)
    return [grad if grad is not None else tf.zeros_like(var)
            for var, grad in zip(var_list, grads)]
--------------------------------------------------------------------------------
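The training script at the end of this listing (train_dqn.py) drives DqnAgent through its sample/store/update interface with extra experiment plumbing. For orientation, here is a minimal illustrative driver loop -- a sketch assuming the same TF1-era TensorFlow and the pre-0.26 gym API (env.step returning a 4-tuple) that the rest of the repository targets; the step budget and the two buffer hyperparameters are arbitrary choices, not values from the repository:

import gym
import numpy as np

from dqn import DqnAgent

env = gym.make('MountainCar-v0')
# MountainCar observations are 2-dimensional; the action space has 3 discrete actions.
agent = DqnAgent(state_dims=[2], num_actions=env.action_space.n,
                 replay_memory_init_size=500, target_update=100)

state = np.expand_dims(env.reset(), axis=0)
for _ in range(10000):
    action = agent.sample(state)                        # epsilon-greedy action
    next_state, reward, terminal, _ = env.step(action)
    next_state = np.expand_dims(next_state, axis=0)
    agent.store(state, action, reward, next_state, terminal)
    agent.update()                                      # no-op until the buffer reaches its init size
    state = np.expand_dims(env.reset(), axis=0) if terminal else next_state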
/hierarchical_dqn.py:
--------------------------------------------------------------------------------
"""
Hierarchical DQN implementation as described in Kulkarni et al.
https://arxiv.org/pdf/1604.06057.pdf
@author: Saurabh Kumar
"""

from collections import defaultdict
import sys

import numpy as np

from dqn import DqnAgent


class HierarchicalDqnAgent(object):
    INTRINSIC_STEP_COST = -1  # Step cost for the controller.

    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 subgoals=None,
                 num_subgoals=0,
                 num_primitive_actions=0,
                 meta_controller_state_fn=None,
                 check_subgoal_fn=None):
        """Initializes a hierarchical DQN agent.

        Args:
            learning_rates: learning rates of the meta-controller and controller agents.
            state_sizes: state sizes of the meta-controller and controller agents.
                State sizes are assumed to be 1-dimensional.
            subgoals: array of subgoals for the meta-controller.
            num_subgoals: the action space of the meta-controller.
            num_primitive_actions: the action space of the controller.
            meta_controller_state_fn: function that returns the state of the meta-controller.
            check_subgoal_fn: function that checks if the agent has satisfied a particular subgoal.
        """
        self._meta_controller = DqnAgent(state_dims=[state_sizes[0]],
                                         num_actions=num_subgoals,
                                         learning_rate=learning_rates[0],
                                         epsilon_end=0.01)

        # The controller sees the environment state concatenated with a one-hot subgoal.
        self._controller = DqnAgent(learning_rate=learning_rates[1],
                                    num_actions=num_primitive_actions,
                                    state_dims=[state_sizes[1] + num_subgoals],
                                    epsilon_end=0.01)

        self._subgoals = subgoals
        self._num_subgoals = num_subgoals

        self._meta_controller_state_fn = meta_controller_state_fn
        self._check_subgoal_fn = check_subgoal_fn

        self._meta_controller_state = None
        # Raw environment state passed through to meta_controller_state_fn when provided.
        self._original_state = None
        self._curr_subgoal = None
        self._meta_controller_reward = 0
        self._intrinsic_time_step = 0
        self._episode = 0

    def get_meta_controller_state(self, state):
        returned_state = state
        if self._meta_controller_state_fn:
            returned_state = self._meta_controller_state_fn(state, self._original_state)
        return np.copy(returned_state)

    def get_controller_state(self, state, subgoal_index):
        """Concatenates the environment state with the current subgoal."""
        # curr_subgoal is a 1-hot vector indicating the subgoal selected by the meta-controller.
        curr_subgoal = np.array(self._subgoals[subgoal_index])

        # Flatten any batch dimension, then concatenate the environment state with the subgoal.
        controller_state = np.squeeze(np.array(state))
        controller_state = np.concatenate((controller_state, curr_subgoal), axis=0)

        return np.copy(controller_state)

    def intrinsic_reward(self, state, subgoal_index):
        # Intrinsically rewards the controller - this is the critic in the h-DQN algorithm.
        if self.subgoal_completed(state, subgoal_index):
            return 1
        else:
            return self.INTRINSIC_STEP_COST

    def subgoal_completed(self, state, subgoal_index):
        # Checks whether the controller has completed the currently specified subgoal.
        if self._check_subgoal_fn is None:
            return state == self._subgoals[subgoal_index]
        else:
            return self._check_subgoal_fn(state, subgoal_index)

    def store(self, state, action, reward, next_state, terminal, eval=False):
        """Stores the current transition in replay memory.

        The transition is stored in the replay memory of the controller.
        If the transition culminates in a subgoal's completion or a terminal state, a
        transition for the meta-controller is constructed and stored in its replay buffer.

        Args:
            state: current state
            action: primitive action taken
            reward: reward received from the state-action pair
            next_state: next state
            terminal: extrinsic terminal (True or False)
            eval: whether the current episode is a train or eval episode.
        """
        # Compute the controller state, reward, next state, and terminal.
        intrinsic_state = np.copy(self.get_controller_state(state, self._curr_subgoal))
        intrinsic_next_state = np.copy(self.get_controller_state(next_state, self._curr_subgoal))
        intrinsic_reward = self.intrinsic_reward(next_state, self._curr_subgoal)
        subgoal_completed = self.subgoal_completed(next_state, self._curr_subgoal)
        intrinsic_terminal = subgoal_completed or terminal

        # Store the controller transition in memory.
        self._controller.store(intrinsic_state, action,
                               intrinsic_reward, intrinsic_next_state, intrinsic_terminal, eval)

        self._meta_controller_reward += reward

        if terminal and not eval:
            self._episode += 1

        if subgoal_completed or terminal:
            # Store the meta-controller transition in memory.
            meta_controller_state = np.copy(self._meta_controller_state)
            next_meta_controller_state = np.copy(self.get_meta_controller_state(next_state))

            self._meta_controller.store(meta_controller_state, self._curr_subgoal,
                                        self._meta_controller_reward, next_meta_controller_state,
                                        terminal, eval)

            # Reset the current meta-controller state and current subgoal to None
            # since the current subgoal is finished. Also reset the meta-controller's reward.
            self._meta_controller_state = None
            self._curr_subgoal = None
            self._meta_controller_reward = 0
            self._intrinsic_time_step = 0

    def sample(self, state):
        """Samples an action from the hierarchical DQN agent.

        Samples a subgoal if necessary from the meta-controller and samples a primitive action
        from the controller.

        Args:
            state: the current environment state.

        Returns:
            action: a sampled primitive action.
        """
        self._intrinsic_time_step += 1

        # If the meta-controller state is None, either this is a new episode
        # or a subgoal has just been completed.
        if self._meta_controller_state is None:
            self._meta_controller_state = self.get_meta_controller_state(state)
            self._curr_subgoal = self._meta_controller.sample([self._meta_controller_state])

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.sample([controller_state])

        return action

    def best_action(self, state):
        """Returns the greedy action from the hierarchical DQN agent.

        Gets the greedy subgoal if necessary from the meta-controller and gets
        the greedy primitive action from the controller.

        Args:
            state: the current environment state.

        Returns:
            action: the controller's greedy primitive action.
        """
        # If the meta-controller state is None, either this is a new episode
        # or a subgoal has just been completed.
        if self._meta_controller_state is None:
            self._meta_controller_state = self.get_meta_controller_state(state)
            self._curr_subgoal = self._meta_controller.best_action([self._meta_controller_state])

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.best_action([controller_state])
        return action

    def update(self):
        self._controller.update()
        # Only update the meta-controller right after a meta-controller transition has taken
        # place, which occurs only when either a subgoal has been completed or the agent has
        # reached a terminal state.
        if self._meta_controller_state is None:
            self._meta_controller.update()
--------------------------------------------------------------------------------
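HierarchicalDqnAgent delegates the two environment-specific decisions to callbacks: meta_controller_state_fn maps a raw observation to the meta-controller's 1-dimensional state, and check_subgoal_fn decides when a subgoal is satisfied; the subgoals array itself is expected to hold the one-hot encodings that get appended to the controller state. train_dqn.py builds these from a clustering module that is not part of this listing, so the sketch below is only a hypothetical example of the expected signatures, with made-up subgoal centres and an arbitrary distance threshold for MountainCar's (position, velocity) space:

import numpy as np

# Hypothetical subgoal "centres" in MountainCar's (position, velocity) space.
SUBGOALS = [np.array([-0.9, 0.0]), np.array([-0.3, 0.0]), np.array([0.5, 0.0])]

def meta_controller_state_fn(state, original_state):
    """Returns a one-hot vector over subgoal regions for the meta-controller."""
    dists = [np.linalg.norm(np.squeeze(state) - centre) for centre in SUBGOALS]
    one_hot = np.zeros(len(SUBGOALS))
    one_hot[int(np.argmin(dists))] = 1.0
    return one_hot

def check_subgoal_fn(state, subgoal_index):
    """A subgoal counts as completed when the agent is near its centre."""
    return np.linalg.norm(np.squeeze(state) - SUBGOALS[subgoal_index]) < 0.1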
/replay_buffer.py:
--------------------------------------------------------------------------------
import random

import numpy as np


class ReplayBuffer(object):

    def __init__(self, max_size, init_size, batch_size):
        self.max_size = max_size
        self.init_size = init_size
        self.batch_size = batch_size

        self.states = np.array([None] * self.max_size)
        self.actions = np.array([None] * self.max_size)
        self.rewards = np.array([None] * self.max_size)
        self.next_states = np.array([None] * self.max_size)
        self.terminals = np.array([None] * self.max_size)

        self.curr_pointer = 0
        self.curr_size = 0

    def add(self, state, action, reward, next_state, terminal):
        self.states[self.curr_pointer] = np.squeeze(state)
        self.actions[self.curr_pointer] = action
        self.rewards[self.curr_pointer] = reward
        self.next_states[self.curr_pointer] = np.squeeze(next_state)
        self.terminals[self.curr_pointer] = terminal

        self.curr_pointer += 1
        self.curr_size = min(self.max_size, self.curr_size + 1)
        # If the replay buffer is full, wrap the pointer back to the beginning of the buffer.
        if self.curr_pointer >= self.max_size:
            self.curr_pointer -= self.max_size

    def sample(self):
        # Return empty batches until the buffer has reached its initial fill size.
        if self.curr_size < self.init_size:
            return [], [], [], [], []
        sample_indices = []

        # Ensure that the most recent transition is in the returned batch.
        sample_indices.append(self.curr_pointer - 1)
        for _ in range(self.batch_size - 1):
            sample_indices.append(random.randint(0, self.curr_size - 1))

        returned_states = []
        returned_actions = []
        returned_rewards = []
        returned_next_states = []
        returned_terminals = []

        for index in sample_indices:
            returned_states.append(self.states[index])
            returned_actions.append(self.actions[index])
            returned_rewards.append(self.rewards[index])
            returned_next_states.append(self.next_states[index])
            returned_terminals.append(self.terminals[index])

        return (np.array(returned_states), np.array(returned_actions),
                np.array(returned_rewards), np.array(returned_next_states),
                np.array(returned_terminals))
        # Equivalent vectorized form:
        # return (self.states[sample_indices], self.actions[sample_indices],
        #         self.rewards[sample_indices], self.next_states[sample_indices],
        #         self.terminals[sample_indices])
--------------------------------------------------------------------------------
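Two behaviours of ReplayBuffer matter for DqnAgent: sample() returns five empty lists until init_size transitions have been added (so DqnAgent.update is a no-op during warm-up), and every non-empty batch always contains the most recently added transition in its first slot. A small illustrative check, with arbitrary transition values:

import numpy as np

from replay_buffer import ReplayBuffer

buf = ReplayBuffer(max_size=100, init_size=5, batch_size=4)

for t in range(6):
    state = np.array([float(t), 0.0])
    buf.add(state, action=0, reward=-1.0, next_state=state + 0.1, terminal=False)

states, actions, rewards, next_states, terminals = buf.sample()
print(states.shape)   # (4, 2): batch_size transitions of the 2-d states stored above
print(states[0])      # the first entry is always the most recent transition, [5. 0.]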
/train_dqn.py:
--------------------------------------------------------------------------------
"""
@author: Saurabh Kumar
"""

import os

import matplotlib
matplotlib.use('Agg')

import clustering  # Provides get_cluster_fn(); not included in this listing.
import dqn
import gym
from gym.wrappers import Monitor
import hierarchical_dqn
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import pickle

tf.flags.DEFINE_string('agent_type', 'h_dqn', 'RL agent type.')
tf.flags.DEFINE_string('logdir', 'experiment_logs/', 'Directory of logfile.')
tf.flags.DEFINE_string('experiment_dir', '', 'Directory of experiment files.')
tf.flags.DEFINE_string('logfile', 'log.txt', 'Name of the logfile.')
tf.flags.DEFINE_string('env_name', 'MountainCar-v0', 'Name of the environment.')

FLAGS = tf.flags.FLAGS


def log(logfile, iteration, rewards):
    """Logs the reward statistics obtained by the agent.

    Args:
        logfile: File to log reward statistics.
        iteration: The current iteration.
        rewards: Array of rewards obtained in the current iteration.
    """
    log_string = '{} {} {} {}'.format(
        iteration, np.min(rewards), np.mean(rewards), np.max(rewards))
    print(log_string)

    with open(logfile, 'a') as f:
        f.write(log_string + '\n')


def make_environment(env_name):
    return gym.make(env_name)


def make_agent(agent_type, env, num_clusters=4, use_extra_bit=False):
    """Constructs the requested agent.

    num_clusters and use_extra_bit only affect the 'h_dqn' agent; the defaults
    here are placeholders and should be tuned for the chosen environment.
    """
    if agent_type == 'dqn':
        # MountainCar's do-nothing action is stripped out in run(), so the agent
        # chooses between 2 actions rather than env.action_space.n.
        return dqn.DqnAgent(state_dims=[2],
                            num_actions=2)
    elif agent_type == 'h_dqn':
        meta_controller_state_fn, check_subgoal_fn, num_subgoals, subgoals = clustering.get_cluster_fn(
            n_clusters=num_clusters, extra_bit=use_extra_bit)

        # Only the options supported by HierarchicalDqnAgent (hierarchical_dqn.py) are passed.
        return hierarchical_dqn.HierarchicalDqnAgent(
            state_sizes=[num_subgoals, 2],
            subgoals=subgoals,
            num_subgoals=num_subgoals,
            num_primitive_actions=2,  # see the action remapping in run()
            meta_controller_state_fn=meta_controller_state_fn,
            check_subgoal_fn=check_subgoal_fn)


def run(env_name='MountainCar-v0',
        agent_type='dqn',
        num_iterations=1000,
        num_train_episodes=100,
        num_eval_episodes=100,
        logdir=None,
        experiment_dir=None,
        logfile=None):
    """Executes RL training and evaluation.

    Args:
        env_name: Name of the environment that the agent will interact with.
        agent_type: The type of RL agent that will be used for training.
        num_iterations: Number of iterations to train for.
        num_train_episodes: Number of training episodes per iteration.
        num_eval_episodes: Number of evaluation episodes per iteration.
        logdir: Directory for the log file.
        experiment_dir: Directory of experiment files.
        logfile: File to log the agent's performance over training.
    """
    experiment_dir += '_agent_type_' + agent_type
    experiment_dir = logdir + experiment_dir
    logfile = experiment_dir + '/' + logfile

    # Create the experiment directory if it does not already exist.
    if not os.path.exists(experiment_dir):
        os.mkdir(experiment_dir)

    env = make_environment(env_name)
    env_test = make_environment(env_name)
    # env_test = Monitor(env_test, directory='videos/', video_callable=lambda x: True, resume=True)
    print('Made environment!')
    agent = make_agent(agent_type, env)
    print('Made agent!')

    for it in range(num_iterations):

        # Run train episodes.
        for train_episode in range(num_train_episodes):
            # Reset the environment.
            state = env.reset()
            state = np.expand_dims(state, axis=0)

            episode_reward = 0

            # Run the episode.
            terminal = False

            while not terminal:
                action = agent.sample(state)
                # Remap onto the environment's action space: MountainCar's middle
                # action (1) is "do nothing", which the agent is not given.
                env_action = action
                if env_name == 'MountainCar-v0' and action == 1:
                    env_action = 2

                next_state, reward, terminal, _ = env.step(env_action)
                next_state = np.expand_dims(next_state, axis=0)

                agent.store(state, action, reward, next_state, terminal)
                agent.update()

                episode_reward += reward
                # Update the state.
                state = next_state

        eval_rewards = []

        # Run eval episodes.
        for eval_episode in range(num_eval_episodes):

            # Reset the environment.
            state = env_test.reset()
            state = np.expand_dims(state, axis=0)

            episode_reward = 0

            # Run the episode.
            terminal = False

            while not terminal:
                # Both agent types expose a greedy best_action(state). (The original
                # subgoal heat-map bookkeeping relied on an extended agent interface
                # that is not part of this listing and is omitted here.)
                action = agent.best_action(state)

                # Remove the do-nothing action.
                env_action = action
                if env_name == 'MountainCar-v0' and action == 1:
                    env_action = 2

                next_state, reward, terminal, _ = env_test.step(env_action)
                next_state = np.expand_dims(next_state, axis=0)
                # env_test.render()
                agent.store(state, action, reward, next_state, terminal, eval=True)
                if reward > 1:
                    reward = 1  # For the sake of comparison.

                episode_reward += reward

                state = next_state

            eval_rewards.append(episode_reward)

        with open(experiment_dir + '/eval_rewards_' + str(it), 'wb') as f:
            pickle.dump(eval_rewards, f)

        log(logfile, it, eval_rewards)


if __name__ == '__main__':
    run(env_name=FLAGS.env_name,
        agent_type=FLAGS.agent_type,
        logdir=FLAGS.logdir,
        experiment_dir=FLAGS.experiment_dir,
        logfile=FLAGS.logfile)
--------------------------------------------------------------------------------
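train_dqn.py is normally launched through its tf.flags entry point, for example: python train_dqn.py --agent_type=dqn --experiment_dir=run1. With the __main__ guard above, run() can also be called directly from Python. The sketch below is a small smoke-test configuration with illustrative argument values; it still requires the clustering module imported at the top of train_dqn.py to be importable, and the logdir directory to already exist:

from train_dqn import run

# A short smoke-test configuration; directory names and episode counts are illustrative.
run(env_name='MountainCar-v0',
    agent_type='dqn',
    num_iterations=2,
    num_train_episodes=5,
    num_eval_episodes=5,
    logdir='experiment_logs/',
    experiment_dir='smoke_test',
    logfile='log.txt')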