├── .gitignore
├── __init__.py
├── requirements.txt
├── rl
│   ├── __init__.py
│   ├── agent.py
│   ├── memory.py
│   ├── util.py
│   ├── a3c_model.py
│   ├── dqn_2.py
│   ├── a3c.py
│   └── dqn.py
├── README.md
├── example_dqn.py
└── example_a3c.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | out/
3 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from .rl import *
2 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow
2 | keras
3 | gym
4 | 
--------------------------------------------------------------------------------
/rl/__init__.py:
--------------------------------------------------------------------------------
1 | from .util import *
2 | from .a3c import *
3 | from .dqn import *
4 | 
--------------------------------------------------------------------------------
/rl/agent.py:
--------------------------------------------------------------------------------
1 | """
2 | General interface for an agent.
3 | """
4 | class Agent:
5 |     """
6 |     Represents an agent.
7 |     """
8 |     def compile(self, sess):
9 |         """
10 |         Compiles the agent, setting up all the models and ops.
11 |         """
12 |         pass
13 | 
14 |     def train(self, sess, env_builder):
15 |         """
16 |         Trains the agent on an environment.
17 |         """
18 |         pass
19 | 
20 |     def run(self, sess, env):
21 |         """
22 |         Runs the agent in an environment.
23 |         """
24 |         pass
25 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning Library
2 | This repository aims to contain the latest reinforcement learning algorithms
3 | implemented using TensorFlow, Keras and OpenAI Gym.
4 | 
5 | Currently, A3C has been implemented. A DQN implementation is a work in progress.
6 | 
7 | ## Requirements
8 | - Python 3.5
9 | 
10 | ```
11 | pip install -r requirements.txt
12 | ```
13 | 
14 | ## Usage
15 | ```
16 | agent = A3CAgent(model_builder)
17 | agent.compile(sess)
18 | agent.train(sess, env_builder)
19 | ```
20 | `model_builder` is a function that returns a Keras model with outputs
21 | `[policy, value]`, and `env_builder` is a function that returns a Gym
22 | environment (see `example_a3c.py` for a complete script).
23 | 
24 | TensorBoard logging:
25 | ```
26 | tensorboard --logdir=out --reload_interval=2
27 | ```
28 | 
29 | Sources:
30 | - https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2
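31 | 
32 | ## Full Example
33 | A complete, runnable sketch that mirrors `example_a3c.py`. CartPole-v0 and the
34 | layer sizes are only a toy setup; any Gym environment and any Keras model whose
35 | outputs are `[policy, value]` should work the same way.
36 | 
37 | ```
38 | import gym
39 | import tensorflow as tf
40 | from rl import A3CAgent
41 | from keras.layers import Dense, Input
42 | from keras.models import Model
43 | 
44 | num_actions = 2
45 | 
46 | def make_model():
47 |     # A3C expects the Keras model to output [policy, value]
48 |     i = Input((4,))
49 |     x = Dense(128, activation='relu')(i)
50 |     policy = Dense(num_actions, activation='softmax')(x)
51 |     value = Dense(1, activation='linear')(x)
52 |     return Model([i], [policy, value])
53 | 
54 | with tf.Session() as sess, tf.device('/cpu:0'):
55 |     agent = A3CAgent(make_model)
56 |     agent.compile(sess)
57 |     # train() returns a tf.train.Coordinator; join() blocks until workers stop
58 |     agent.train(sess, lambda: gym.make('CartPole-v0')).join()
59 | ```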
3 | """ 4 | import gym 5 | import tensorflow as tf 6 | from rl import DQNAgent 7 | from keras.layers import Dense, Input, merge, Activation, Flatten 8 | from keras.models import Model 9 | 10 | env_name = 'CartPole-v0' 11 | num_actions = 2 12 | 13 | def make_model(): 14 | i = Input((4,)) 15 | x = i 16 | x = Dense(128, activation='relu')(x) 17 | policy = Dense(num_actions, activation='softmax')(x) 18 | value = Dense(1, activation='linear')(x) 19 | return Model([i], [value]) 20 | 21 | with tf.Session() as sess, tf.device('/cpu:0'): 22 | agent = DQNAgent(make_model) 23 | agent.compile(sess) 24 | agent.train(sess, lambda: gym.make('CartPole-v0')) 25 | -------------------------------------------------------------------------------- /example_a3c.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple example to run the A3C algorithm on a toy example. 3 | """ 4 | import gym 5 | import tensorflow as tf 6 | from rl import A3CAgent 7 | from keras.layers import Dense, Input, merge, Activation, Flatten 8 | from keras.models import Model 9 | 10 | env_name = 'CartPole-v0' 11 | num_actions = 2 12 | 13 | def make_model(): 14 | i = Input((4,)) 15 | x = i 16 | x = Dense(128, activation='relu')(x) 17 | policy = Dense(num_actions, activation='softmax')(x) 18 | value = Dense(1, activation='linear')(x) 19 | return Model([i], [policy, value]) 20 | 21 | with tf.Session() as sess, tf.device('/cpu:0'): 22 | agent = A3CAgent(make_model) 23 | agent.compile(sess) 24 | agent.train(sess, lambda: gym.make('CartPole-v0')).join() 25 | -------------------------------------------------------------------------------- /rl/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | 4 | class Memory: 5 | """ 6 | Represents the memory of the agent. 7 | The agent by default stores only the current time step, but is capable 8 | of holding memory of previos time steps for training RNNs. 
9 | """ 10 | 11 | def __init__(self, time_steps): 12 | self.time_steps = time_steps 13 | 14 | def reset(self, init_state=None): 15 | self._memory = [] 16 | 17 | if init_state is not None: 18 | # Handle non-tuple states 19 | if not isinstance(init_state, tuple): 20 | self.is_tuple = False 21 | init_state = (init_state,) 22 | else: 23 | self.is_tuple = True 24 | 25 | for input_state in init_state: 26 | # lookback buffer 27 | temporal_memory = deque(maxlen=max(self.time_steps, 1)) 28 | # Fill temporal memory with zeros 29 | while len(temporal_memory) < self.time_steps - 1: 30 | temporal_memory.appendleft(np.zeros_like(input_state)) 31 | 32 | temporal_memory.append(input_state) 33 | self._memory.append(temporal_memory) 34 | 35 | def remember(self, state): 36 | if not self.is_tuple: 37 | state = (state,) 38 | 39 | for i, input_state in enumerate(state): 40 | self._memory[i].append(input_state) 41 | 42 | def to_states(self): 43 | """ Returns a state per input """ 44 | if self.time_steps == 0: 45 | # No time_steps = not recurrent 46 | return [m[0] for m in self._memory] 47 | else: 48 | return [list(m) for m in self._memory] 49 | 50 | def build_single_feed(self, inputs): 51 | if self.time_steps == 0: 52 | # No time_steps = not recurrent 53 | return {i: list(m) for i, m in zip(inputs, self._memory)} 54 | else: 55 | return {i: [list(m)] for i, m in zip(inputs, self._memory)} 56 | -------------------------------------------------------------------------------- /rl/util.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | import time 3 | import numpy as np 4 | import tensorflow as tf 5 | from gym import spaces 6 | 7 | def discount(rewards, discount, current=0): 8 | """ Takes an array of rewards and compute array of discounted reward """ 9 | discounted_r = np.zeros_like(rewards) 10 | 11 | for t in reversed(range(len(rewards))): 12 | current = current * discount + rewards[t] 13 | discounted_r[t] = current 14 | 15 | return discounted_r 16 | 17 | def make_summary(data, prefix=''): 18 | if prefix != '': 19 | prefix += '/' 20 | 21 | summary = tf.Summary() 22 | for name, value in data.items(): 23 | summary.value.add(tag=prefix + name, simple_value=float(value)) 24 | 25 | return summary 26 | 27 | def save_worker(sess, coord, agent): 28 | while not coord.should_stop(): 29 | time.sleep(60) 30 | agent.save(sess) 31 | 32 | def update_target_graph(from_scope, to_scope): 33 | """ 34 | Copies one set of variables to another. 35 | Used to set worker network parameters to those of global network. 36 | """ 37 | from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) 38 | to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) 39 | 40 | op_holder = [] 41 | for from_var, to_var in zip(from_vars, to_vars): 42 | op_holder.append(to_var.assign(from_var)) 43 | return op_holder 44 | 45 | def track(env): 46 | """ 47 | Wraps a Gym environment to keep track of the results of step calls visited. 
48 | """ 49 | step = env.step 50 | def step_override(*args, **kwargs): 51 | result = step(*args, **kwargs) 52 | env.step_cache.append(result) 53 | env.total_reward += result[1] 54 | return result 55 | env.step = step_override 56 | 57 | reset = env.reset 58 | def reset_override(*args, **kwargs): 59 | env.total_reward = 0 60 | env.step_cache = [] 61 | return reset(*args, **kwargs) 62 | env.reset = reset_override 63 | 64 | return env 65 | -------------------------------------------------------------------------------- /rl/a3c_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import keras.backend as K 3 | 4 | class ACModel: 5 | """ 6 | Holds the AC model and Keras model that has been passed in. 7 | Compiles the policy and value loss functions. 8 | """ 9 | 10 | def __init__(self, model_builder, beta): 11 | # Entropy weight 12 | self.beta = beta 13 | 14 | self.model = model_builder() 15 | # Output layers for policy and value estimations 16 | self.policies = self.model.outputs[:-1] 17 | self.value = self.model.outputs[-1] 18 | 19 | # The Keras learning phase tensor. Needs to be the same in all threads. 20 | # Boolean tensor that becomes true while training 21 | self.isTrain = K.learning_phase() 22 | 23 | def compile(self, optimizer, grad_clip): 24 | # Only the worker network need ops for loss functions and gradient 25 | # updating. 26 | self.target_v = tf.placeholder( 27 | tf.float32, [None], name='target_values') 28 | self.advantages = tf.placeholder( 29 | tf.float32, [None], name='advantages') 30 | 31 | # Action chosen for every single policy output 32 | self.actions = [] 33 | policy_losses = [] 34 | entropies = [] 35 | 36 | # Every policy output 37 | for policy in self.policies: 38 | num_actions = policy.get_shape()[1] 39 | action = tf.placeholder(tf.int32, [None]) 40 | actions_hot = tf.one_hot(action, num_actions) 41 | self.actions.append(action) 42 | 43 | responsible_outputs = tf.reduce_sum(policy * actions_hot, [1]) 44 | # Entropy regularization 45 | # TODO: Clipping should be configurable 46 | entropies.append(-tf.reduce_sum(policy * 47 | tf.log(tf.clip_by_value(policy, 1e-20, 1.0)))) 48 | # Policy loss 49 | policy_losses.append(-tf.reduce_sum(tf.log(responsible_outputs) 50 | * self.advantages)) 51 | 52 | # Compute average policy and entropy loss 53 | self.policy_loss = tf.reduce_sum(policy_losses) 54 | self.entropy = tf.reduce_sum(entropies) 55 | 56 | # Value loss (Mean squared error) 57 | self.value_loss = tf.reduce_mean( 58 | tf.square(self.target_v - tf.reshape(self.value, [-1]))) 59 | # Learning rate for Critic is half of Actor's, so multiply by 0.5 60 | self.loss = 0.5 * self.value_loss + self.policy_loss - self.beta * self.entropy 61 | 62 | # Get gradients from local network using local losses 63 | local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) 64 | self.gradients = tf.gradients(self.loss, local_vars) 65 | self.var_norms = tf.global_norm(local_vars) 66 | # Clip norm of gradients 67 | if grad_clip > 0: 68 | grads, self.grad_norms = tf.clip_by_global_norm( 69 | self.gradients, grad_clip) 70 | else: 71 | grads = self.gradients 72 | 73 | # Apply local gradients to global network 74 | global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) 75 | self.train = optimizer.apply_gradients(zip(grads, global_vars)) 76 | -------------------------------------------------------------------------------- /rl/dqn_2.py: -------------------------------------------------------------------------------- 1 | 
""" 2 | TODO: Incomplete WIP 3 | """ 4 | import gym 5 | import random 6 | from collections import deque 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | class DQN: 11 | REPLAY_MEMORY_SIZE = 10000 12 | RANDOM_ACTION_PROB = 0.5 13 | RANDOM_ACTION_DECAY = 0.99 14 | HIDDEN1_SIZE = 128 15 | HIDDEN2_SIZE = 128 16 | NUM_EPISODES = 3000 17 | LEARNING_RATE = 1e-4 18 | MINIBATCH_SIZE = 10 19 | DISCOUNT_FACTOR = 0.99 20 | TARGET_UPDATE_FREQ = 100 21 | REG_FACTOR = 0.001 22 | LOG_DIR = 'out/dqn' 23 | 24 | def __init__(self, env): 25 | self.env = gym.make(env) 26 | assert len(self.env.observation_space.shape) == 1 27 | self.input_size = self.env.observation_space.shape[0] 28 | self.output_size = self.env.action_space.n 29 | 30 | def init_network(self): 31 | # Inference 32 | self.x = tf.placeholder(tf.float32, [None, self.input_size]) 33 | with tf.name_scope('hidden1'): 34 | W1 = tf.Variable( 35 | tf.truncated_normal([self.input_size, self.HIDDEN1_SIZE], 36 | stddev=0.01), name='W1') 37 | b1 = tf.Variable(tf.zeros(self.HIDDEN1_SIZE), name='b1') 38 | h1 = tf.nn.tanh(tf.matmul(self.x, W1) + b1) 39 | with tf.name_scope('hidden2'): 40 | W2 = tf.Variable( 41 | tf.truncated_normal([self.HIDDEN1_SIZE, self.HIDDEN2_SIZE], 42 | stddev=0.01), name='W2') 43 | b2 = tf.Variable(tf.zeros(self.HIDDEN2_SIZE), name='b2') 44 | h2 = tf.nn.tanh(tf.matmul(h1, W2) + b2) 45 | with tf.name_scope('output'): 46 | W3 = tf.Variable( 47 | tf.truncated_normal([self.HIDDEN2_SIZE, self.output_size], 48 | stddev=0.01), name='W3') 49 | b3 = tf.Variable(tf.zeros(self.output_size), name='b3') 50 | self.Q = tf.matmul(h2, W3) + b3 51 | self.weights = [W1, b1, W2, b2, W3, b3] 52 | 53 | # Loss 54 | self.targetQ = tf.placeholder(tf.float32, [None]) 55 | self.targetActionMask = tf.placeholder(tf.float32, [None, self.output_size]) 56 | # TODO: Optimize this 57 | q_values = tf.reduce_sum(tf.mul(self.Q, self.targetActionMask), 58 | reduction_indices=[1]) 59 | self.loss = tf.reduce_mean(tf.square(tf.sub(q_values, self.targetQ))) 60 | 61 | # Reguralization 62 | for w in [W1, W2, W3]: 63 | self.loss += self.REG_FACTOR * tf.reduce_sum(tf.square(w)) 64 | 65 | # Training 66 | optimizer = tf.train.GradientDescentOptimizer(self.LEARNING_RATE) 67 | global_step = tf.Variable(0, name='global_step', trainable=False) 68 | self.train_op = optimizer.minimize(self.loss, global_step=global_step) 69 | 70 | self.reward = tf.Variable(0.0, [None]) 71 | 72 | def train(self, num_episodes=NUM_EPISODES): 73 | replay_memory = deque(maxlen=self.REPLAY_MEMORY_SIZE) 74 | 75 | self.session = tf.Session() 76 | 77 | # Summary for TensorBoard 78 | tf.scalar_summary('loss', self.loss) 79 | tf.scalar_summary('reward', self.reward) 80 | self.summary = tf.merge_all_summaries() 81 | self.summary_writer = tf.train.SummaryWriter(self.LOG_DIR, self.session.graph) 82 | 83 | self.session.run(tf.initialize_all_variables()) 84 | total_steps = 0 85 | 86 | for episode in range(num_episodes): 87 | print("Training: Episode = %d, Global step = %d" % (episode, total_steps)) 88 | state = self.env.reset() 89 | target_weights = self.session.run(self.weights) 90 | total_reward = 0 91 | done = False 92 | 93 | while not done: 94 | # Pick the next action and execute it 95 | action = None 96 | if random.random() < self.RANDOM_ACTION_PROB: 97 | action = self.env.action_space.sample() 98 | else: 99 | q_values = self.session.run(self.Q, feed_dict={self.x: [state]}) 100 | action = q_values.argmax() 101 | self.RANDOM_ACTION_PROB *= self.RANDOM_ACTION_DECAY 102 | obs, reward, done, _ = 
self.env.step(action) 103 | total_reward += reward 104 | # Update replay memory 105 | if done: 106 | # TODO: Seems to matter a lot... 107 | reward -= 100 108 | self.session.run(self.reward.assign(total_reward)) 109 | 110 | replay_memory.append((state, action, reward, obs, done)) 111 | 112 | state = obs 113 | 114 | # Sample a random minibatch and fetch max Q at s' 115 | if len(replay_memory) >= self.MINIBATCH_SIZE: 116 | minibatch = random.sample(replay_memory, self.MINIBATCH_SIZE) 117 | next_states = [m[3] for m in minibatch] 118 | # TODO: Optimize to skip terminal states 119 | feed_dict = {self.x: next_states} 120 | feed_dict.update(zip(self.weights, target_weights)) 121 | q_values = self.session.run(self.Q, feed_dict) 122 | max_q_values = q_values.max(axis=1) 123 | 124 | # Compute target Q values 125 | target_q = np.zeros(self.MINIBATCH_SIZE) 126 | target_action_mask = np.zeros((self.MINIBATCH_SIZE, self.output_size), dtype=int) 127 | for i in range(self.MINIBATCH_SIZE): 128 | _, action, reward, _, terminal = minibatch[i] 129 | target_q[i] = reward 130 | if not terminal: 131 | target_q[i] += self.DISCOUNT_FACTOR * max_q_values[i] 132 | target_action_mask[i][action] = 1 133 | 134 | # Gradient descent 135 | states = [m[0] for m in minibatch] 136 | feed_dict = { 137 | self.x: states, 138 | self.targetQ: target_q, 139 | self.targetActionMask: target_action_mask, 140 | } 141 | _, summary = self.session.run([self.train_op, self.summary], 142 | feed_dict=feed_dict) 143 | 144 | # Write summary for TensorBoard 145 | self.summary_writer.add_summary(summary, total_steps) 146 | 147 | # Update target weights 148 | if total_steps % self.TARGET_UPDATE_FREQ == 0: 149 | target_weights = self.session.run(self.weights) 150 | 151 | total_steps += 1 152 | if done: 153 | break 154 | 155 | def play(self): 156 | state = self.env.reset() 157 | done = False 158 | steps = 0 159 | while not done and steps < 200: 160 | self.env.render() 161 | q_values = self.session.run(self.Q, feed_dict={self.x: [state]}) 162 | action = q_values.argmax() 163 | state, _, done, _ = self.env.step(action) 164 | steps += 1 165 | return steps 166 | 167 | if __name__ == '__main__': 168 | dqn = DQN('CartPole-v0') 169 | dqn.init_network() 170 | 171 | #dqn.env.monitor.start('/tmp/cartpole') 172 | dqn.train() 173 | #dqn.env.monitor.close() 174 | 175 | res = [] 176 | for i in range(100): 177 | steps = dqn.play() 178 | print("Test steps = ", steps) 179 | res.append(steps) 180 | print("Mean steps = ", sum(res) / len(res)) 181 | -------------------------------------------------------------------------------- /rl/a3c.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import os 4 | import threading 5 | import multiprocessing 6 | import time 7 | 8 | from .a3c_model import ACModel 9 | from .util import * 10 | from .memory import Memory 11 | from .agent import Agent 12 | 13 | class ACAgentRunner(Agent): 14 | 15 | def __init__(self, model, memory, preprocess, batch_size): 16 | self.model = model 17 | self.memory = memory 18 | self.preprocess = preprocess 19 | self.batch_size = batch_size 20 | 21 | def perform(self, sess, env): 22 | """ 23 | Perform action according to policy pi(a | s) 24 | """ 25 | *probs, value = sess.run( 26 | self.model.model.outputs, 27 | { 28 | **self.memory.build_single_feed(self.model.model.inputs), 29 | self.model.isTrain: False 30 | } 31 | ) 32 | 33 | # Remove batch dimension 34 | value = value[0][0] 35 | 36 | # Sample an action from an 
action probability distribution output 37 | action = [] 38 | 39 | for p in probs: 40 | p = p[0] 41 | action.append(np.random.choice(len(p), p=p)) 42 | 43 | flatten_action = action[0] if len(action) == 1 else action 44 | next_state, reward, terminal, info = env.step(flatten_action) 45 | next_state = self.preprocess(env, next_state) 46 | 47 | return value, action, next_state, reward, terminal 48 | 49 | def train(self, sess, coord, env_builder, writer, gamma): 50 | try: 51 | # Thread setup 52 | env = env_builder() 53 | 54 | episode_count = 0 55 | 56 | # Reset per-episode vars 57 | terminal = False 58 | total_reward = 0 59 | step_count = 0 60 | 61 | # Each memory corresponds to one input. 62 | self.memory.reset(self.preprocess(env, env.reset())) 63 | 64 | print("Training ACAgentRunner...") 65 | 66 | while not coord.should_stop(): 67 | # Run a training batch 68 | t = 0 69 | t_start = t 70 | 71 | # Batched based variables 72 | state_batches = [[] for _ in self.model.model.inputs] 73 | actions = [] 74 | rewards = [] 75 | values = [] 76 | 77 | while not (terminal or ((t - t_start) == self.batch_size)): 78 | value, action, next_state, reward, terminal = self.perform( 79 | sess, env 80 | ) 81 | 82 | # Bookkeeping 83 | for i, state in enumerate(self.memory.to_states()): 84 | state_batches[i].append(state) 85 | 86 | self.memory.remember(next_state) 87 | actions.append(action) 88 | values.append(value) 89 | rewards.append(reward) 90 | 91 | total_reward += reward 92 | step_count += 1 93 | t += 1 94 | 95 | if terminal: 96 | reward = 0 97 | else: 98 | # Bootstrap from last state 99 | reward = sess.run( 100 | self.model.value, 101 | { 102 | **self.memory.build_single_feed(self.model.model.inputs), 103 | self.model.isTrain: False 104 | } 105 | )[0][0] 106 | 107 | # Here we take the rewards and values from the exp, and use them to 108 | # generate the advantage and discounted returns. 109 | # The advantage function uses "Generalized Advantage 110 | # Estimation" 111 | discounted_rewards = discount(rewards, gamma, reward) 112 | value_plus = np.array(values + [reward]) 113 | advantages = discount( 114 | rewards + gamma * value_plus[1:] - value_plus[:-1], gamma) 115 | 116 | # Train network 117 | v_l, p_l, e_l, g_n, v_n, _ = sess.run([ 118 | self.model.value_loss, 119 | self.model.policy_loss, 120 | self.model.entropy, 121 | self.model.grad_norms, 122 | self.model.var_norms, 123 | self.model.train 124 | ], 125 | { 126 | **dict(zip(self.model.model.inputs, state_batches)), 127 | **dict(zip(self.model.actions, zip(*actions))), 128 | ** 129 | { 130 | self.model.target_v: discounted_rewards, 131 | self.model.advantages: advantages, 132 | self.model.isTrain: True 133 | }, 134 | } 135 | ) 136 | 137 | if terminal: 138 | # Record metrics 139 | writer.add_summary( 140 | make_summary({ 141 | 'rewards': total_reward, 142 | 'lengths': step_count, 143 | 'value_loss': v_l, 144 | 'policy_loss': p_l, 145 | 'entropy_loss': e_l, 146 | 'grad_norm': g_n, 147 | 'value_norm': v_n, 148 | 'mean_values': np.mean(values) 149 | }), 150 | episode_count 151 | ) 152 | 153 | episode_count += 1 154 | 155 | # Reset per-episode counters 156 | terminal = False 157 | total_reward = 0 158 | step_count = 0 159 | # Each memory corresponds to one input. 160 | self.memory.reset(self.preprocess(env, env.reset())) 161 | except Exception as e: 162 | # Report exceptions to the coordinator. 
163 | coord.request_stop(e) 164 | 165 | def run(self, sess, env): 166 | self.memory.reset(self.preprocess(env, env.reset())) 167 | total_reward = 0 168 | terminal = False 169 | 170 | while not terminal: 171 | value,\ 172 | action,\ 173 | next_state,\ 174 | reward,\ 175 | terminal = self.perform(sess, env) 176 | 177 | total_reward += reward 178 | self.memory.remember(next_state) 179 | 180 | # TODO: Refactor to async coordinator? 181 | class A3CAgent(Agent): 182 | # TODO: Refactor these hyperparameters to one object 183 | 184 | def __init__(self, 185 | model_builder, 186 | time_steps=0, 187 | preprocess=lambda e, x: x, 188 | model_path='out/model', 189 | num_workers=multiprocessing.cpu_count(), 190 | entropy_factor=1e-2, 191 | batch_size=32): 192 | self.model_builder = model_builder 193 | self.time_steps = time_steps 194 | self.model_path = model_path 195 | self.entropy_factor = entropy_factor 196 | self.preprocess = preprocess 197 | self.batch_size = batch_size 198 | self.save_count = 0 199 | self.model = ACModel(model_builder, entropy_factor) 200 | self.saver = tf.train.Saver(max_to_keep=5) 201 | 202 | # Create agents 203 | self.agents = [] 204 | 205 | for i in range(num_workers): 206 | self.add_agent() 207 | 208 | def add_agent(self, Agent=ACAgentRunner): 209 | self.agents.append(Agent( 210 | self.model, 211 | Memory(self.time_steps), 212 | self.preprocess, 213 | self.batch_size 214 | )) 215 | 216 | # TODO: Not SRP. Agent shouldn't handle model saving. 217 | def load(self, sess): 218 | self.model.model.load_weights(self.model_path + '/model.h5') 219 | 220 | def save(self, sess): 221 | if not os.path.exists(self.model_path): 222 | os.makedirs(self.model_path) 223 | self.model.model.save_weights(self.model_path + '/model_' + str(self.save_count) + '.h5') 224 | self.save_count += 1 225 | 226 | def compile(self, 227 | sess, 228 | grad_clip=50., 229 | optimizer=tf.train.AdamOptimizer()): 230 | self.model.compile(optimizer, grad_clip) 231 | print(self.model.model.summary()) 232 | 233 | # Initialize variables 234 | sess.run(tf.global_variables_initializer()) 235 | 236 | def train(self, 237 | sess, 238 | env_builder, 239 | summary_path='out/summary/', 240 | discount=.99): 241 | """ 242 | Starts training. 243 | Return: The coordinator for all the threads 244 | """ 245 | print('Training model') 246 | 247 | coord = tf.train.Coordinator() 248 | 249 | for i, agent in enumerate(self.agents): 250 | name = 'worker_' + str(i) 251 | writer = tf.summary.FileWriter( 252 | summary_path + name, sess.graph, flush_secs=2) 253 | 254 | t = threading.Thread( 255 | target=agent.train, 256 | args=( 257 | sess, 258 | coord, 259 | env_builder, 260 | writer, 261 | discount 262 | ) 263 | ) 264 | 265 | t.start() 266 | coord.register_thread(t) 267 | 268 | # Stagger threads to decorrelate experience 269 | time.sleep(1) 270 | 271 | # Create thread that auto-saves 272 | t = threading.Thread(target=save_worker, args=(sess, coord, self)) 273 | t.start() 274 | coord.register_thread(t) 275 | 276 | return coord 277 | 278 | def run(self, sess, env): 279 | # Pick the first agent to run the environment 280 | self.agents[0].run(sess, env) 281 | -------------------------------------------------------------------------------- /rl/dqn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Deep Q Network. 
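Intended usage (a sketch only, since this module is still a work in progress):

    # make_model must return a Keras Model with a single output of Q-values,
    # one per action (see example_dqn.py).
    agent = DQNAgent(make_model)
    agent.compile(sess)
    agent.train(sess, lambda: gym.make('CartPole-v0'))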
3 | TODO: Incomplete WIP 4 | """ 5 | import os 6 | import gym 7 | import random 8 | import numpy as np 9 | import tensorflow as tf 10 | from collections import deque 11 | from keras.models import Model 12 | 13 | from .agent import Agent 14 | from .memory import Memory 15 | 16 | NUM_EPISODES = 100000 # Number of episodes the agent plays 17 | GAMMA = 0.99 # Discount factor 18 | # Number of steps to populate the replay memory before training starts 19 | INITIAL_REPLAY_SIZE = 1000 20 | NUM_REPLAY_MEMORY = 10000 # Number of replay memory the agent uses for training 21 | BATCH_SIZE = 16 # Mini batch size 22 | # The frequency with which the target network is updated 23 | TARGET_UPDATE_INTERVAL = 100 24 | TRAIN_INTERVAL = 1 # The agent selects 4 actions between successive updates 25 | # Constant added to the squared gradient in the denominator of the RMSProp 26 | # update 27 | MIN_GRAD = 0.01 28 | SAVE_SUMMARY_PATH = 'out/summary/' 29 | 30 | # TODO: Depend only on Keras? 31 | # TODO: Timestep support 32 | # TODO: Multi-input support 33 | class DQNAgent(Agent): 34 | 35 | def __init__(self, 36 | model_builder, 37 | initial_epsilon=1, 38 | final_epsilon=0.1, 39 | explore_steps=1000000, 40 | preprocess=lambda x: x): 41 | """ 42 | Args 43 | model_builder: A function that create a new model for the network 44 | initial_epsilon: Starting epsilon 45 | final_epsilon: Ending epsilon 46 | explore_steps: Number of steps over which the initial value of 47 | epsilon is linearly annealed to its final value 48 | preprocess: Function called to preprocess observations 49 | """ 50 | self.epsilon = initial_epsilon 51 | self.final_epsilon = final_epsilon 52 | self.epsilon_step = (initial_epsilon - final_epsilon) / explore_steps 53 | self.explore_steps = explore_steps 54 | self.t = 0 55 | 56 | # Parameters used for summary 57 | self.total_reward = 0 58 | self.total_q_max = 0 59 | self.total_loss = 0 60 | self.duration = 0 61 | self.episode = 0 62 | 63 | # Create replay memory 64 | self.memory = deque() 65 | 66 | # Misc 67 | self.model_builder = model_builder 68 | self.preprocess = preprocess 69 | 70 | def compile(self, sess, optimizer=tf.train.AdamOptimizer(learning_rate=1e-4)): 71 | self.sess = sess 72 | 73 | # Create q network 74 | self.q_model = self.model_builder() 75 | self.num_actions = self.q_model.outputs[0].get_shape()[1] 76 | q_weights = self.q_model.trainable_weights 77 | 78 | # Create target network 79 | self.t_model = self.model_builder() 80 | t_weights = self.t_model.trainable_weights 81 | 82 | # Syncs the target Q network's weight with the Q network's weights 83 | self.sync = [t_weights[i].assign(q_weights[i]) 84 | for i in range(len(t_weights))] 85 | 86 | # Define loss and gradient update operation 87 | self.a = tf.placeholder(tf.int64, [None]) 88 | self.y = tf.placeholder(tf.float32, [None]) 89 | 90 | # Convert action to one hot vector 91 | a_one_hot = tf.one_hot(self.a, self.num_actions, 1.0, 0.0) 92 | q_value = tf.reduce_sum( 93 | tf.mul(self.q_model.outputs[0], a_one_hot), reduction_indices=1 94 | ) 95 | 96 | # Clip the error, the loss is quadratic when the error is in (-1, 1), 97 | # and linear outside of that region 98 | error = tf.abs(self.y - q_value) 99 | quadratic_part = tf.clip_by_value(error, 0.0, 1.0) 100 | linear_part = error - quadratic_part 101 | # Define loss and gradient update operation 102 | self.loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part) 103 | self.train_op = optimizer.minimize(self.loss, var_list=q_weights) 104 | 105 | # Setup metrics 106 | 
self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary() 107 | self.summary_writer = tf.summary.FileWriter( 108 | SAVE_SUMMARY_PATH, self.sess.graph 109 | ) 110 | 111 | # Init vars 112 | self.sess.run(tf.global_variables_initializer()) 113 | 114 | # Initialize target network 115 | self.sess.run(self.sync) 116 | 117 | def train(self, sess, env_builder): 118 | env = env_builder() 119 | 120 | for _ in range(NUM_EPISODES): 121 | terminal = False 122 | state = self.preprocess(env.reset()) 123 | while not terminal: 124 | action = self.get_action(state) 125 | next_state, reward, terminal, _ = env.step(action) 126 | next_state = self.preprocess(next_state) 127 | self.run(state, action, reward, terminal, next_state) 128 | state = next_state 129 | 130 | def get_action(self, state): 131 | """ 132 | Picks an action given a state based on epsilon greedy policy. 133 | """ 134 | if self.epsilon >= random.random() or self.t < INITIAL_REPLAY_SIZE: 135 | action = random.randrange(self.num_actions) 136 | else: 137 | action = np.argmax(self.q_model.predict(np.array([state]))[0]) 138 | 139 | # Anneal epsilon linearly over time 140 | if self.epsilon > self.final_epsilon and self.t >= INITIAL_REPLAY_SIZE: 141 | self.epsilon -= self.epsilon_step 142 | return action 143 | 144 | def run(self, state, action, reward, terminal, next_state): 145 | # Clip all positive rewards at 1 and all negative rewards at -1, 146 | # leaving 0 rewards unchanged 147 | reward = np.clip(reward, -1, 1) 148 | 149 | # Store transition in replay memory 150 | self.memory.append((state, action, reward, next_state, terminal)) 151 | 152 | if len(self.memory) > NUM_REPLAY_MEMORY: 153 | self.memory.popleft() 154 | 155 | if self.t >= INITIAL_REPLAY_SIZE: 156 | # Train network 157 | if self.t % TRAIN_INTERVAL == 0: 158 | self.learn() 159 | 160 | # Update target network 161 | if self.t % TARGET_UPDATE_INTERVAL == 0: 162 | self.sess.run(self.sync) 163 | 164 | self.total_reward += reward 165 | self.total_q_max += np.amax(self.q_model.predict(np.array([state]))[0]) 166 | self.duration += 1 167 | 168 | if terminal: 169 | # Write summary 170 | if self.t >= INITIAL_REPLAY_SIZE: 171 | stats = [self.total_reward, self.total_q_max / float(self.duration), 172 | self.duration, self.total_loss / (float(self.duration) / float(TRAIN_INTERVAL))] 173 | for i in range(len(stats)): 174 | self.sess.run(self.update_ops[i], feed_dict={ 175 | self.summary_placeholders[i]: float(stats[i]) 176 | }) 177 | summary_str = self.sess.run(self.summary_op) 178 | self.summary_writer.add_summary(summary_str, self.episode + 1) 179 | 180 | # Debug 181 | """ 182 | if self.t < INITIAL_REPLAY_SIZE: 183 | mode = 'random' 184 | elif INITIAL_REPLAY_SIZE <= self.t < INITIAL_REPLAY_SIZE + self.explore_steps: 185 | mode = 'explore' 186 | else: 187 | mode = 'exploit' 188 | print('EPISODE: {0:6d} / TIMESTEP: {1:8d} / DURATION: {2:5d} / EPSILON: {3:.5f} / TOTAL_REWARD: {4:3.0f} / AVG_MAX_Q: {5:2.4f} / AVG_LOSS: {6:.5f} / MODE: {7}'.format( 189 | self.episode + 1, self.t, self.duration, self.epsilon, 190 | self.total_reward, self.total_q_max / float(self.duration), 191 | self.total_loss / (float(self.duration) / float(TRAIN_INTERVAL)), mode)) 192 | """ 193 | 194 | self.total_reward = 0 195 | self.total_q_max = 0 196 | self.total_loss = 0 197 | self.duration = 0 198 | self.episode += 1 199 | 200 | self.t += 1 201 | 202 | return next_state 203 | 204 | def learn(self): 205 | state_batch = [] 206 | action_batch = [] 207 | reward_batch = [] 208 | next_state_batch = [] 209 | 
terminal_batch = [] 210 | y_batch = [] 211 | 212 | # Sample random minibatch of transition from replay memory 213 | minibatch = random.sample(self.memory, BATCH_SIZE) 214 | for data in minibatch: 215 | state_batch.append(data[0]) 216 | action_batch.append(data[1]) 217 | reward_batch.append(data[2]) 218 | next_state_batch.append(data[3]) 219 | terminal_batch.append(data[4]) 220 | 221 | # Convert True to 1, False to 0 222 | terminal_batch = np.array(terminal_batch) + 0 223 | 224 | target_q_values_batch = self.t_model.predict(np.array(next_state_batch)) 225 | y_batch = reward_batch + (1 - terminal_batch) * \ 226 | GAMMA * np.amax(target_q_values_batch, axis=1) 227 | 228 | loss, _ = self.sess.run([self.loss, self.train_op], { 229 | self.q_model.inputs[0]: np.array(state_batch), 230 | self.a: action_batch, 231 | self.y: y_batch 232 | }) 233 | 234 | self.total_loss += loss 235 | 236 | def setup_summary(self): 237 | episode_total_reward = tf.Variable(0.) 238 | tf.scalar_summary('/Total Reward/Episode', 239 | episode_total_reward) 240 | episode_avg_max_q = tf.Variable(0.) 241 | tf.scalar_summary( 242 | '/Average Max Q/Episode', episode_avg_max_q) 243 | episode_duration = tf.Variable(0.) 244 | tf.scalar_summary('/Duration/Episode', episode_duration) 245 | episode_avg_loss = tf.Variable(0.) 246 | tf.scalar_summary('/Average Loss/Episode', episode_avg_loss) 247 | summary_vars = [episode_total_reward, 248 | episode_avg_max_q, episode_duration, episode_avg_loss] 249 | summary_placeholders = [tf.placeholder( 250 | tf.float32) for _ in range(len(summary_vars))] 251 | update_ops = [summary_vars[i].assign( 252 | summary_placeholders[i]) for i in range(len(summary_vars))] 253 | summary_op = tf.summary.merge_all() 254 | return summary_placeholders, update_ops, summary_op 255 | --------------------------------------------------------------------------------
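Note on the Q-learning target in rl/dqn.py: learn() builds the batched Bellman
target with the same formula shown below. A small worked sketch (the numbers are
arbitrary and purely illustrative):

    import numpy as np
    GAMMA = 0.99
    reward_batch = np.array([1.0, 1.0])
    terminal_batch = np.array([0, 1])               # the second transition ends its episode
    target_q_values_batch = np.array([[0.5, 2.0],   # Q from the target network at s'
                                      [3.0, 1.0]])
    y_batch = reward_batch + (1 - terminal_batch) * GAMMA * np.amax(target_q_values_batch, axis=1)
    # y_batch -> [2.98, 1.0]: bootstrapped for non-terminal, raw reward for terminal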