├── .gitignore
├── __init__.py
├── requirements.txt
├── rl
│   ├── __init__.py
│   ├── agent.py
│   ├── memory.py
│   ├── util.py
│   ├── a3c_model.py
│   ├── dqn_2.py
│   ├── a3c.py
│   └── dqn.py
├── README.md
├── example_dqn.py
└── example_a3c.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | out/
3 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from .rl import *
2 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow
2 | keras
3 | gym
4 | 
--------------------------------------------------------------------------------
/rl/__init__.py:
--------------------------------------------------------------------------------
1 | from .util import *
2 | from .a3c import *
3 | from .dqn import *
4 | 
--------------------------------------------------------------------------------
/rl/agent.py:
--------------------------------------------------------------------------------
1 | """
2 | General interface for an agent.
3 | """
4 | class Agent:
5 |     """
6 |     Represents an agent.
7 |     """
8 |     def compile(self, sess):
9 |         """
10 |         Compiles the agent, setting up all the models and ops.
11 |         """
12 |         pass
13 | 
14 |     def train(self, sess, env_builder):
15 |         """
16 |         Trains the agent on an environment.
17 |         """
18 |         pass
19 | 
20 |     def run(self, sess, env):
21 |         """
22 |         Runs the agent in an environment.
23 |         """
24 |         pass
25 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning Library
2 | This repository aims to contain the latest reinforcement learning algorithms
3 | implemented using TensorFlow, Keras and OpenAI Gym.
4 | 
5 | Currently, A3C has been implemented. A DQN implementation is a work in progress.
6 | 
7 | ## Requirements
8 | - Python 3.5
9 | 
10 | ```
11 | pip install -r requirements.txt
12 | ```
13 | 
14 | ## Usage
15 | ```
16 | agent = A3CAgent(model_builder)
17 | agent.compile(sess)
18 | agent.train(sess, env_builder)
19 | ```
20 | `model_builder` is a function that returns a Keras model with outputs
21 | `[policy, value]`, and `env_builder` is a function that returns a Gym
22 | environment (see `example_a3c.py` for a complete script).
23 | 
24 | TensorBoard logging:
25 | ```
26 | tensorboard --logdir=out --reload_interval=2
27 | ```
28 | 
29 | Sources:
30 | - https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2
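31 | 
32 | ## Full Example
33 | A complete, runnable sketch that mirrors `example_a3c.py`. CartPole-v0 and the
34 | layer sizes are only a toy setup; any Gym environment and any Keras model whose
35 | outputs are `[policy, value]` should work the same way.
36 | 
37 | ```
38 | import gym
39 | import tensorflow as tf
40 | from rl import A3CAgent
41 | from keras.layers import Dense, Input
42 | from keras.models import Model
43 | 
44 | num_actions = 2
45 | 
46 | def make_model():
47 |     # A3C expects the Keras model to output [policy, value]
48 |     i = Input((4,))
49 |     x = Dense(128, activation='relu')(i)
50 |     policy = Dense(num_actions, activation='softmax')(x)
51 |     value = Dense(1, activation='linear')(x)
52 |     return Model([i], [policy, value])
53 | 
54 | with tf.Session() as sess, tf.device('/cpu:0'):
55 |     agent = A3CAgent(make_model)
56 |     agent.compile(sess)
57 |     # train() returns a tf.train.Coordinator; join() blocks until workers stop
58 |     agent.train(sess, lambda: gym.make('CartPole-v0')).join()
59 | ```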
3 | """ 4 | import gym 5 | import tensorflow as tf 6 | from rl import DQNAgent 7 | from keras.layers import Dense, Input, merge, Activation, Flatten 8 | from keras.models import Model 9 | 10 | env_name = 'CartPole-v0' 11 | num_actions = 2 12 | 13 | def make_model(): 14 | i = Input((4,)) 15 | x = i 16 | x = Dense(128, activation='relu')(x) 17 | policy = Dense(num_actions, activation='softmax')(x) 18 | value = Dense(1, activation='linear')(x) 19 | return Model([i], [value]) 20 | 21 | with tf.Session() as sess, tf.device('/cpu:0'): 22 | agent = DQNAgent(make_model) 23 | agent.compile(sess) 24 | agent.train(sess, lambda: gym.make('CartPole-v0')) 25 | -------------------------------------------------------------------------------- /example_a3c.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple example to run the A3C algorithm on a toy example. 3 | """ 4 | import gym 5 | import tensorflow as tf 6 | from rl import A3CAgent 7 | from keras.layers import Dense, Input, merge, Activation, Flatten 8 | from keras.models import Model 9 | 10 | env_name = 'CartPole-v0' 11 | num_actions = 2 12 | 13 | def make_model(): 14 | i = Input((4,)) 15 | x = i 16 | x = Dense(128, activation='relu')(x) 17 | policy = Dense(num_actions, activation='softmax')(x) 18 | value = Dense(1, activation='linear')(x) 19 | return Model([i], [policy, value]) 20 | 21 | with tf.Session() as sess, tf.device('/cpu:0'): 22 | agent = A3CAgent(make_model) 23 | agent.compile(sess) 24 | agent.train(sess, lambda: gym.make('CartPole-v0')).join() 25 | -------------------------------------------------------------------------------- /rl/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | 4 | class Memory: 5 | """ 6 | Represents the memory of the agent. 7 | The agent by default stores only the current time step, but is capable 8 | of holding memory of previos time steps for training RNNs. 
9 | """ 10 | 11 | def __init__(self, time_steps): 12 | self.time_steps = time_steps 13 | 14 | def reset(self, init_state=None): 15 | self._memory = [] 16 | 17 | if init_state is not None: 18 | # Handle non-tuple states 19 | if not isinstance(init_state, tuple): 20 | self.is_tuple = False 21 | init_state = (init_state,) 22 | else: 23 | self.is_tuple = True 24 | 25 | for input_state in init_state: 26 | # lookback buffer 27 | temporal_memory = deque(maxlen=max(self.time_steps, 1)) 28 | # Fill temporal memory with zeros 29 | while len(temporal_memory) < self.time_steps - 1: 30 | temporal_memory.appendleft(np.zeros_like(input_state)) 31 | 32 | temporal_memory.append(input_state) 33 | self._memory.append(temporal_memory) 34 | 35 | def remember(self, state): 36 | if not self.is_tuple: 37 | state = (state,) 38 | 39 | for i, input_state in enumerate(state): 40 | self._memory[i].append(input_state) 41 | 42 | def to_states(self): 43 | """ Returns a state per input """ 44 | if self.time_steps == 0: 45 | # No time_steps = not recurrent 46 | return [m[0] for m in self._memory] 47 | else: 48 | return [list(m) for m in self._memory] 49 | 50 | def build_single_feed(self, inputs): 51 | if self.time_steps == 0: 52 | # No time_steps = not recurrent 53 | return {i: list(m) for i, m in zip(inputs, self._memory)} 54 | else: 55 | return {i: [list(m)] for i, m in zip(inputs, self._memory)} 56 | -------------------------------------------------------------------------------- /rl/util.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | import time 3 | import numpy as np 4 | import tensorflow as tf 5 | from gym import spaces 6 | 7 | def discount(rewards, discount, current=0): 8 | """ Takes an array of rewards and compute array of discounted reward """ 9 | discounted_r = np.zeros_like(rewards) 10 | 11 | for t in reversed(range(len(rewards))): 12 | current = current * discount + rewards[t] 13 | discounted_r[t] = current 14 | 15 | return discounted_r 16 | 17 | def make_summary(data, prefix=''): 18 | if prefix != '': 19 | prefix += '/' 20 | 21 | summary = tf.Summary() 22 | for name, value in data.items(): 23 | summary.value.add(tag=prefix + name, simple_value=float(value)) 24 | 25 | return summary 26 | 27 | def save_worker(sess, coord, agent): 28 | while not coord.should_stop(): 29 | time.sleep(60) 30 | agent.save(sess) 31 | 32 | def update_target_graph(from_scope, to_scope): 33 | """ 34 | Copies one set of variables to another. 35 | Used to set worker network parameters to those of global network. 36 | """ 37 | from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) 38 | to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) 39 | 40 | op_holder = [] 41 | for from_var, to_var in zip(from_vars, to_vars): 42 | op_holder.append(to_var.assign(from_var)) 43 | return op_holder 44 | 45 | def track(env): 46 | """ 47 | Wraps a Gym environment to keep track of the results of step calls visited. 
48 | """ 49 | step = env.step 50 | def step_override(*args, **kwargs): 51 | result = step(*args, **kwargs) 52 | env.step_cache.append(result) 53 | env.total_reward += result[1] 54 | return result 55 | env.step = step_override 56 | 57 | reset = env.reset 58 | def reset_override(*args, **kwargs): 59 | env.total_reward = 0 60 | env.step_cache = [] 61 | return reset(*args, **kwargs) 62 | env.reset = reset_override 63 | 64 | return env 65 | -------------------------------------------------------------------------------- /rl/a3c_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import keras.backend as K 3 | 4 | class ACModel: 5 | """ 6 | Holds the AC model and Keras model that has been passed in. 7 | Compiles the policy and value loss functions. 8 | """ 9 | 10 | def __init__(self, model_builder, beta): 11 | # Entropy weight 12 | self.beta = beta 13 | 14 | self.model = model_builder() 15 | # Output layers for policy and value estimations 16 | self.policies = self.model.outputs[:-1] 17 | self.value = self.model.outputs[-1] 18 | 19 | # The Keras learning phase tensor. Needs to be the same in all threads. 20 | # Boolean tensor that becomes true while training 21 | self.isTrain = K.learning_phase() 22 | 23 | def compile(self, optimizer, grad_clip): 24 | # Only the worker network need ops for loss functions and gradient 25 | # updating. 26 | self.target_v = tf.placeholder( 27 | tf.float32, [None], name='target_values') 28 | self.advantages = tf.placeholder( 29 | tf.float32, [None], name='advantages') 30 | 31 | # Action chosen for every single policy output 32 | self.actions = [] 33 | policy_losses = [] 34 | entropies = [] 35 | 36 | # Every policy output 37 | for policy in self.policies: 38 | num_actions = policy.get_shape()[1] 39 | action = tf.placeholder(tf.int32, [None]) 40 | actions_hot = tf.one_hot(action, num_actions) 41 | self.actions.append(action) 42 | 43 | responsible_outputs = tf.reduce_sum(policy * actions_hot, [1]) 44 | # Entropy regularization 45 | # TODO: Clipping should be configurable 46 | entropies.append(-tf.reduce_sum(policy * 47 | tf.log(tf.clip_by_value(policy, 1e-20, 1.0)))) 48 | # Policy loss 49 | policy_losses.append(-tf.reduce_sum(tf.log(responsible_outputs) 50 | * self.advantages)) 51 | 52 | # Compute average policy and entropy loss 53 | self.policy_loss = tf.reduce_sum(policy_losses) 54 | self.entropy = tf.reduce_sum(entropies) 55 | 56 | # Value loss (Mean squared error) 57 | self.value_loss = tf.reduce_mean( 58 | tf.square(self.target_v - tf.reshape(self.value, [-1]))) 59 | # Learning rate for Critic is half of Actor's, so multiply by 0.5 60 | self.loss = 0.5 * self.value_loss + self.policy_loss - self.beta * self.entropy 61 | 62 | # Get gradients from local network using local losses 63 | local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) 64 | self.gradients = tf.gradients(self.loss, local_vars) 65 | self.var_norms = tf.global_norm(local_vars) 66 | # Clip norm of gradients 67 | if grad_clip > 0: 68 | grads, self.grad_norms = tf.clip_by_global_norm( 69 | self.gradients, grad_clip) 70 | else: 71 | grads = self.gradients 72 | 73 | # Apply local gradients to global network 74 | global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) 75 | self.train = optimizer.apply_gradients(zip(grads, global_vars)) 76 | -------------------------------------------------------------------------------- /rl/dqn_2.py: -------------------------------------------------------------------------------- 1 | 
""" 2 | TODO: Incomplete WIP 3 | """ 4 | import gym 5 | import random 6 | from collections import deque 7 | import numpy as np 8 | import tensorflow as tf 9 | 10 | class DQN: 11 | REPLAY_MEMORY_SIZE = 10000 12 | RANDOM_ACTION_PROB = 0.5 13 | RANDOM_ACTION_DECAY = 0.99 14 | HIDDEN1_SIZE = 128 15 | HIDDEN2_SIZE = 128 16 | NUM_EPISODES = 3000 17 | LEARNING_RATE = 1e-4 18 | MINIBATCH_SIZE = 10 19 | DISCOUNT_FACTOR = 0.99 20 | TARGET_UPDATE_FREQ = 100 21 | REG_FACTOR = 0.001 22 | LOG_DIR = 'out/dqn' 23 | 24 | def __init__(self, env): 25 | self.env = gym.make(env) 26 | assert len(self.env.observation_space.shape) == 1 27 | self.input_size = self.env.observation_space.shape[0] 28 | self.output_size = self.env.action_space.n 29 | 30 | def init_network(self): 31 | # Inference 32 | self.x = tf.placeholder(tf.float32, [None, self.input_size]) 33 | with tf.name_scope('hidden1'): 34 | W1 = tf.Variable( 35 | tf.truncated_normal([self.input_size, self.HIDDEN1_SIZE], 36 | stddev=0.01), name='W1') 37 | b1 = tf.Variable(tf.zeros(self.HIDDEN1_SIZE), name='b1') 38 | h1 = tf.nn.tanh(tf.matmul(self.x, W1) + b1) 39 | with tf.name_scope('hidden2'): 40 | W2 = tf.Variable( 41 | tf.truncated_normal([self.HIDDEN1_SIZE, self.HIDDEN2_SIZE], 42 | stddev=0.01), name='W2') 43 | b2 = tf.Variable(tf.zeros(self.HIDDEN2_SIZE), name='b2') 44 | h2 = tf.nn.tanh(tf.matmul(h1, W2) + b2) 45 | with tf.name_scope('output'): 46 | W3 = tf.Variable( 47 | tf.truncated_normal([self.HIDDEN2_SIZE, self.output_size], 48 | stddev=0.01), name='W3') 49 | b3 = tf.Variable(tf.zeros(self.output_size), name='b3') 50 | self.Q = tf.matmul(h2, W3) + b3 51 | self.weights = [W1, b1, W2, b2, W3, b3] 52 | 53 | # Loss 54 | self.targetQ = tf.placeholder(tf.float32, [None]) 55 | self.targetActionMask = tf.placeholder(tf.float32, [None, self.output_size]) 56 | # TODO: Optimize this 57 | q_values = tf.reduce_sum(tf.mul(self.Q, self.targetActionMask), 58 | reduction_indices=[1]) 59 | self.loss = tf.reduce_mean(tf.square(tf.sub(q_values, self.targetQ))) 60 | 61 | # Reguralization 62 | for w in [W1, W2, W3]: 63 | self.loss += self.REG_FACTOR * tf.reduce_sum(tf.square(w)) 64 | 65 | # Training 66 | optimizer = tf.train.GradientDescentOptimizer(self.LEARNING_RATE) 67 | global_step = tf.Variable(0, name='global_step', trainable=False) 68 | self.train_op = optimizer.minimize(self.loss, global_step=global_step) 69 | 70 | self.reward = tf.Variable(0.0, [None]) 71 | 72 | def train(self, num_episodes=NUM_EPISODES): 73 | replay_memory = deque(maxlen=self.REPLAY_MEMORY_SIZE) 74 | 75 | self.session = tf.Session() 76 | 77 | # Summary for TensorBoard 78 | tf.scalar_summary('loss', self.loss) 79 | tf.scalar_summary('reward', self.reward) 80 | self.summary = tf.merge_all_summaries() 81 | self.summary_writer = tf.train.SummaryWriter(self.LOG_DIR, self.session.graph) 82 | 83 | self.session.run(tf.initialize_all_variables()) 84 | total_steps = 0 85 | 86 | for episode in range(num_episodes): 87 | print("Training: Episode = %d, Global step = %d" % (episode, total_steps)) 88 | state = self.env.reset() 89 | target_weights = self.session.run(self.weights) 90 | total_reward = 0 91 | done = False 92 | 93 | while not done: 94 | # Pick the next action and execute it 95 | action = None 96 | if random.random() < self.RANDOM_ACTION_PROB: 97 | action = self.env.action_space.sample() 98 | else: 99 | q_values = self.session.run(self.Q, feed_dict={self.x: [state]}) 100 | action = q_values.argmax() 101 | self.RANDOM_ACTION_PROB *= self.RANDOM_ACTION_DECAY 102 | obs, reward, done, _ = 
self.env.step(action) 103 | total_reward += reward 104 | # Update replay memory 105 | if done: 106 | # TODO: Seems to matter a lot... 107 | reward -= 100 108 | self.session.run(self.reward.assign(total_reward)) 109 | 110 | replay_memory.append((state, action, reward, obs, done)) 111 | 112 | state = obs 113 | 114 | # Sample a random minibatch and fetch max Q at s' 115 | if len(replay_memory) >= self.MINIBATCH_SIZE: 116 | minibatch = random.sample(replay_memory, self.MINIBATCH_SIZE) 117 | next_states = [m[3] for m in minibatch] 118 | # TODO: Optimize to skip terminal states 119 | feed_dict = {self.x: next_states} 120 | feed_dict.update(zip(self.weights, target_weights)) 121 | q_values = self.session.run(self.Q, feed_dict) 122 | max_q_values = q_values.max(axis=1) 123 | 124 | # Compute target Q values 125 | target_q = np.zeros(self.MINIBATCH_SIZE) 126 | target_action_mask = np.zeros((self.MINIBATCH_SIZE, self.output_size), dtype=int) 127 | for i in range(self.MINIBATCH_SIZE): 128 | _, action, reward, _, terminal = minibatch[i] 129 | target_q[i] = reward 130 | if not terminal: 131 | target_q[i] += self.DISCOUNT_FACTOR * max_q_values[i] 132 | target_action_mask[i][action] = 1 133 | 134 | # Gradient descent 135 | states = [m[0] for m in minibatch] 136 | feed_dict = { 137 | self.x: states, 138 | self.targetQ: target_q, 139 | self.targetActionMask: target_action_mask, 140 | } 141 | _, summary = self.session.run([self.train_op, self.summary], 142 | feed_dict=feed_dict) 143 | 144 | # Write summary for TensorBoard 145 | self.summary_writer.add_summary(summary, total_steps) 146 | 147 | # Update target weights 148 | if total_steps % self.TARGET_UPDATE_FREQ == 0: 149 | target_weights = self.session.run(self.weights) 150 | 151 | total_steps += 1 152 | if done: 153 | break 154 | 155 | def play(self): 156 | state = self.env.reset() 157 | done = False 158 | steps = 0 159 | while not done and steps < 200: 160 | self.env.render() 161 | q_values = self.session.run(self.Q, feed_dict={self.x: [state]}) 162 | action = q_values.argmax() 163 | state, _, done, _ = self.env.step(action) 164 | steps += 1 165 | return steps 166 | 167 | if __name__ == '__main__': 168 | dqn = DQN('CartPole-v0') 169 | dqn.init_network() 170 | 171 | #dqn.env.monitor.start('/tmp/cartpole') 172 | dqn.train() 173 | #dqn.env.monitor.close() 174 | 175 | res = [] 176 | for i in range(100): 177 | steps = dqn.play() 178 | print("Test steps = ", steps) 179 | res.append(steps) 180 | print("Mean steps = ", sum(res) / len(res)) 181 | -------------------------------------------------------------------------------- /rl/a3c.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import os 4 | import threading 5 | import multiprocessing 6 | import time 7 | 8 | from .a3c_model import ACModel 9 | from .util import * 10 | from .memory import Memory 11 | from .agent import Agent 12 | 13 | class ACAgentRunner(Agent): 14 | 15 | def __init__(self, model, memory, preprocess, batch_size): 16 | self.model = model 17 | self.memory = memory 18 | self.preprocess = preprocess 19 | self.batch_size = batch_size 20 | 21 | def perform(self, sess, env): 22 | """ 23 | Perform action according to policy pi(a | s) 24 | """ 25 | *probs, value = sess.run( 26 | self.model.model.outputs, 27 | { 28 | **self.memory.build_single_feed(self.model.model.inputs), 29 | self.model.isTrain: False 30 | } 31 | ) 32 | 33 | # Remove batch dimension 34 | value = value[0][0] 35 | 36 | # Sample an action from an 
action probability distribution output 37 | action = [] 38 | 39 | for p in probs: 40 | p = p[0] 41 | action.append(np.random.choice(len(p), p=p)) 42 | 43 | flatten_action = action[0] if len(action) == 1 else action 44 | next_state, reward, terminal, info = env.step(flatten_action) 45 | next_state = self.preprocess(env, next_state) 46 | 47 | return value, action, next_state, reward, terminal 48 | 49 | def train(self, sess, coord, env_builder, writer, gamma): 50 | try: 51 | # Thread setup 52 | env = env_builder() 53 | 54 | episode_count = 0 55 | 56 | # Reset per-episode vars 57 | terminal = False 58 | total_reward = 0 59 | step_count = 0 60 | 61 | # Each memory corresponds to one input. 62 | self.memory.reset(self.preprocess(env, env.reset())) 63 | 64 | print("Training ACAgentRunner...") 65 | 66 | while not coord.should_stop(): 67 | # Run a training batch 68 | t = 0 69 | t_start = t 70 | 71 | # Batched based variables 72 | state_batches = [[] for _ in self.model.model.inputs] 73 | actions = [] 74 | rewards = [] 75 | values = [] 76 | 77 | while not (terminal or ((t - t_start) == self.batch_size)): 78 | value, action, next_state, reward, terminal = self.perform( 79 | sess, env 80 | ) 81 | 82 | # Bookkeeping 83 | for i, state in enumerate(self.memory.to_states()): 84 | state_batches[i].append(state) 85 | 86 | self.memory.remember(next_state) 87 | actions.append(action) 88 | values.append(value) 89 | rewards.append(reward) 90 | 91 | total_reward += reward 92 | step_count += 1 93 | t += 1 94 | 95 | if terminal: 96 | reward = 0 97 | else: 98 | # Bootstrap from last state 99 | reward = sess.run( 100 | self.model.value, 101 | { 102 | **self.memory.build_single_feed(self.model.model.inputs), 103 | self.model.isTrain: False 104 | } 105 | )[0][0] 106 | 107 | # Here we take the rewards and values from the exp, and use them to 108 | # generate the advantage and discounted returns. 109 | # The advantage function uses "Generalized Advantage 110 | # Estimation" 111 | discounted_rewards = discount(rewards, gamma, reward) 112 | value_plus = np.array(values + [reward]) 113 | advantages = discount( 114 | rewards + gamma * value_plus[1:] - value_plus[:-1], gamma) 115 | 116 | # Train network 117 | v_l, p_l, e_l, g_n, v_n, _ = sess.run([ 118 | self.model.value_loss, 119 | self.model.policy_loss, 120 | self.model.entropy, 121 | self.model.grad_norms, 122 | self.model.var_norms, 123 | self.model.train 124 | ], 125 | { 126 | **dict(zip(self.model.model.inputs, state_batches)), 127 | **dict(zip(self.model.actions, zip(*actions))), 128 | ** 129 | { 130 | self.model.target_v: discounted_rewards, 131 | self.model.advantages: advantages, 132 | self.model.isTrain: True 133 | }, 134 | } 135 | ) 136 | 137 | if terminal: 138 | # Record metrics 139 | writer.add_summary( 140 | make_summary({ 141 | 'rewards': total_reward, 142 | 'lengths': step_count, 143 | 'value_loss': v_l, 144 | 'policy_loss': p_l, 145 | 'entropy_loss': e_l, 146 | 'grad_norm': g_n, 147 | 'value_norm': v_n, 148 | 'mean_values': np.mean(values) 149 | }), 150 | episode_count 151 | ) 152 | 153 | episode_count += 1 154 | 155 | # Reset per-episode counters 156 | terminal = False 157 | total_reward = 0 158 | step_count = 0 159 | # Each memory corresponds to one input. 160 | self.memory.reset(self.preprocess(env, env.reset())) 161 | except Exception as e: 162 | # Report exceptions to the coordinator. 
163 | coord.request_stop(e) 164 | 165 | def run(self, sess, env): 166 | self.memory.reset(self.preprocess(env, env.reset())) 167 | total_reward = 0 168 | terminal = False 169 | 170 | while not terminal: 171 | value,\ 172 | action,\ 173 | next_state,\ 174 | reward,\ 175 | terminal = self.perform(sess, env) 176 | 177 | total_reward += reward 178 | self.memory.remember(next_state) 179 | 180 | # TODO: Refactor to async coordinator? 181 | class A3CAgent(Agent): 182 | # TODO: Refactor these hyperparameters to one object 183 | 184 | def __init__(self, 185 | model_builder, 186 | time_steps=0, 187 | preprocess=lambda e, x: x, 188 | model_path='out/model', 189 | num_workers=multiprocessing.cpu_count(), 190 | entropy_factor=1e-2, 191 | batch_size=32): 192 | self.model_builder = model_builder 193 | self.time_steps = time_steps 194 | self.model_path = model_path 195 | self.entropy_factor = entropy_factor 196 | self.preprocess = preprocess 197 | self.batch_size = batch_size 198 | self.save_count = 0 199 | self.model = ACModel(model_builder, entropy_factor) 200 | self.saver = tf.train.Saver(max_to_keep=5) 201 | 202 | # Create agents 203 | self.agents = [] 204 | 205 | for i in range(num_workers): 206 | self.add_agent() 207 | 208 | def add_agent(self, Agent=ACAgentRunner): 209 | self.agents.append(Agent( 210 | self.model, 211 | Memory(self.time_steps), 212 | self.preprocess, 213 | self.batch_size 214 | )) 215 | 216 | # TODO: Not SRP. Agent shouldn't handle model saving. 217 | def load(self, sess): 218 | self.model.model.load_weights(self.model_path + '/model.h5') 219 | 220 | def save(self, sess): 221 | if not os.path.exists(self.model_path): 222 | os.makedirs(self.model_path) 223 | self.model.model.save_weights(self.model_path + '/model_' + str(self.save_count) + '.h5') 224 | self.save_count += 1 225 | 226 | def compile(self, 227 | sess, 228 | grad_clip=50., 229 | optimizer=tf.train.AdamOptimizer()): 230 | self.model.compile(optimizer, grad_clip) 231 | print(self.model.model.summary()) 232 | 233 | # Initialize variables 234 | sess.run(tf.global_variables_initializer()) 235 | 236 | def train(self, 237 | sess, 238 | env_builder, 239 | summary_path='out/summary/', 240 | discount=.99): 241 | """ 242 | Starts training. 243 | Return: The coordinator for all the threads 244 | """ 245 | print('Training model') 246 | 247 | coord = tf.train.Coordinator() 248 | 249 | for i, agent in enumerate(self.agents): 250 | name = 'worker_' + str(i) 251 | writer = tf.summary.FileWriter( 252 | summary_path + name, sess.graph, flush_secs=2) 253 | 254 | t = threading.Thread( 255 | target=agent.train, 256 | args=( 257 | sess, 258 | coord, 259 | env_builder, 260 | writer, 261 | discount 262 | ) 263 | ) 264 | 265 | t.start() 266 | coord.register_thread(t) 267 | 268 | # Stagger threads to decorrelate experience 269 | time.sleep(1) 270 | 271 | # Create thread that auto-saves 272 | t = threading.Thread(target=save_worker, args=(sess, coord, self)) 273 | t.start() 274 | coord.register_thread(t) 275 | 276 | return coord 277 | 278 | def run(self, sess, env): 279 | # Pick the first agent to run the environment 280 | self.agents[0].run(sess, env) 281 | -------------------------------------------------------------------------------- /rl/dqn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Deep Q Network. 
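Intended usage (a sketch only, since this module is still a work in progress):

    # make_model must return a Keras Model with a single output of Q-values,
    # one per action (see example_dqn.py).
    agent = DQNAgent(make_model)
    agent.compile(sess)
    agent.train(sess, lambda: gym.make('CartPole-v0'))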
3 | TODO: Incomplete WIP 4 | """ 5 | import os 6 | import gym 7 | import random 8 | import numpy as np 9 | import tensorflow as tf 10 | from collections import deque 11 | from keras.models import Model 12 | 13 | from .agent import Agent 14 | from .memory import Memory 15 | 16 | NUM_EPISODES = 100000 # Number of episodes the agent plays 17 | GAMMA = 0.99 # Discount factor 18 | # Number of steps to populate the replay memory before training starts 19 | INITIAL_REPLAY_SIZE = 1000 20 | NUM_REPLAY_MEMORY = 10000 # Number of replay memory the agent uses for training 21 | BATCH_SIZE = 16 # Mini batch size 22 | # The frequency with which the target network is updated 23 | TARGET_UPDATE_INTERVAL = 100 24 | TRAIN_INTERVAL = 1 # The agent selects 4 actions between successive updates 25 | # Constant added to the squared gradient in the denominator of the RMSProp 26 | # update 27 | MIN_GRAD = 0.01 28 | SAVE_SUMMARY_PATH = 'out/summary/' 29 | 30 | # TODO: Depend only on Keras? 31 | # TODO: Timestep support 32 | # TODO: Multi-input support 33 | class DQNAgent(Agent): 34 | 35 | def __init__(self, 36 | model_builder, 37 | initial_epsilon=1, 38 | final_epsilon=0.1, 39 | explore_steps=1000000, 40 | preprocess=lambda x: x): 41 | """ 42 | Args 43 | model_builder: A function that create a new model for the network 44 | initial_epsilon: Starting epsilon 45 | final_epsilon: Ending epsilon 46 | explore_steps: Number of steps over which the initial value of 47 | epsilon is linearly annealed to its final value 48 | preprocess: Function called to preprocess observations 49 | """ 50 | self.epsilon = initial_epsilon 51 | self.final_epsilon = final_epsilon 52 | self.epsilon_step = (initial_epsilon - final_epsilon) / explore_steps 53 | self.explore_steps = explore_steps 54 | self.t = 0 55 | 56 | # Parameters used for summary 57 | self.total_reward = 0 58 | self.total_q_max = 0 59 | self.total_loss = 0 60 | self.duration = 0 61 | self.episode = 0 62 | 63 | # Create replay memory 64 | self.memory = deque() 65 | 66 | # Misc 67 | self.model_builder = model_builder 68 | self.preprocess = preprocess 69 | 70 | def compile(self, sess, optimizer=tf.train.AdamOptimizer(learning_rate=1e-4)): 71 | self.sess = sess 72 | 73 | # Create q network 74 | self.q_model = self.model_builder() 75 | self.num_actions = self.q_model.outputs[0].get_shape()[1] 76 | q_weights = self.q_model.trainable_weights 77 | 78 | # Create target network 79 | self.t_model = self.model_builder() 80 | t_weights = self.t_model.trainable_weights 81 | 82 | # Syncs the target Q network's weight with the Q network's weights 83 | self.sync = [t_weights[i].assign(q_weights[i]) 84 | for i in range(len(t_weights))] 85 | 86 | # Define loss and gradient update operation 87 | self.a = tf.placeholder(tf.int64, [None]) 88 | self.y = tf.placeholder(tf.float32, [None]) 89 | 90 | # Convert action to one hot vector 91 | a_one_hot = tf.one_hot(self.a, self.num_actions, 1.0, 0.0) 92 | q_value = tf.reduce_sum( 93 | tf.mul(self.q_model.outputs[0], a_one_hot), reduction_indices=1 94 | ) 95 | 96 | # Clip the error, the loss is quadratic when the error is in (-1, 1), 97 | # and linear outside of that region 98 | error = tf.abs(self.y - q_value) 99 | quadratic_part = tf.clip_by_value(error, 0.0, 1.0) 100 | linear_part = error - quadratic_part 101 | # Define loss and gradient update operation 102 | self.loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part) 103 | self.train_op = optimizer.minimize(self.loss, var_list=q_weights) 104 | 105 | # Setup metrics 106 | 
self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary() 107 | self.summary_writer = tf.summary.FileWriter( 108 | SAVE_SUMMARY_PATH, self.sess.graph 109 | ) 110 | 111 | # Init vars 112 | self.sess.run(tf.global_variables_initializer()) 113 | 114 | # Initialize target network 115 | self.sess.run(self.sync) 116 | 117 | def train(self, sess, env_builder): 118 | env = env_builder() 119 | 120 | for _ in range(NUM_EPISODES): 121 | terminal = False 122 | state = self.preprocess(env.reset()) 123 | while not terminal: 124 | action = self.get_action(state) 125 | next_state, reward, terminal, _ = env.step(action) 126 | next_state = self.preprocess(next_state) 127 | self.run(state, action, reward, terminal, next_state) 128 | state = next_state 129 | 130 | def get_action(self, state): 131 | """ 132 | Picks an action given a state based on epsilon greedy policy. 133 | """ 134 | if self.epsilon >= random.random() or self.t < INITIAL_REPLAY_SIZE: 135 | action = random.randrange(self.num_actions) 136 | else: 137 | action = np.argmax(self.q_model.predict(np.array([state]))[0]) 138 | 139 | # Anneal epsilon linearly over time 140 | if self.epsilon > self.final_epsilon and self.t >= INITIAL_REPLAY_SIZE: 141 | self.epsilon -= self.epsilon_step 142 | return action 143 | 144 | def run(self, state, action, reward, terminal, next_state): 145 | # Clip all positive rewards at 1 and all negative rewards at -1, 146 | # leaving 0 rewards unchanged 147 | reward = np.clip(reward, -1, 1) 148 | 149 | # Store transition in replay memory 150 | self.memory.append((state, action, reward, next_state, terminal)) 151 | 152 | if len(self.memory) > NUM_REPLAY_MEMORY: 153 | self.memory.popleft() 154 | 155 | if self.t >= INITIAL_REPLAY_SIZE: 156 | # Train network 157 | if self.t % TRAIN_INTERVAL == 0: 158 | self.learn() 159 | 160 | # Update target network 161 | if self.t % TARGET_UPDATE_INTERVAL == 0: 162 | self.sess.run(self.sync) 163 | 164 | self.total_reward += reward 165 | self.total_q_max += np.amax(self.q_model.predict(np.array([state]))[0]) 166 | self.duration += 1 167 | 168 | if terminal: 169 | # Write summary 170 | if self.t >= INITIAL_REPLAY_SIZE: 171 | stats = [self.total_reward, self.total_q_max / float(self.duration), 172 | self.duration, self.total_loss / (float(self.duration) / float(TRAIN_INTERVAL))] 173 | for i in range(len(stats)): 174 | self.sess.run(self.update_ops[i], feed_dict={ 175 | self.summary_placeholders[i]: float(stats[i]) 176 | }) 177 | summary_str = self.sess.run(self.summary_op) 178 | self.summary_writer.add_summary(summary_str, self.episode + 1) 179 | 180 | # Debug 181 | """ 182 | if self.t < INITIAL_REPLAY_SIZE: 183 | mode = 'random' 184 | elif INITIAL_REPLAY_SIZE <= self.t < INITIAL_REPLAY_SIZE + self.explore_steps: 185 | mode = 'explore' 186 | else: 187 | mode = 'exploit' 188 | print('EPISODE: {0:6d} / TIMESTEP: {1:8d} / DURATION: {2:5d} / EPSILON: {3:.5f} / TOTAL_REWARD: {4:3.0f} / AVG_MAX_Q: {5:2.4f} / AVG_LOSS: {6:.5f} / MODE: {7}'.format( 189 | self.episode + 1, self.t, self.duration, self.epsilon, 190 | self.total_reward, self.total_q_max / float(self.duration), 191 | self.total_loss / (float(self.duration) / float(TRAIN_INTERVAL)), mode)) 192 | """ 193 | 194 | self.total_reward = 0 195 | self.total_q_max = 0 196 | self.total_loss = 0 197 | self.duration = 0 198 | self.episode += 1 199 | 200 | self.t += 1 201 | 202 | return next_state 203 | 204 | def learn(self): 205 | state_batch = [] 206 | action_batch = [] 207 | reward_batch = [] 208 | next_state_batch = [] 209 | 
terminal_batch = [] 210 | y_batch = [] 211 | 212 | # Sample random minibatch of transition from replay memory 213 | minibatch = random.sample(self.memory, BATCH_SIZE) 214 | for data in minibatch: 215 | state_batch.append(data[0]) 216 | action_batch.append(data[1]) 217 | reward_batch.append(data[2]) 218 | next_state_batch.append(data[3]) 219 | terminal_batch.append(data[4]) 220 | 221 | # Convert True to 1, False to 0 222 | terminal_batch = np.array(terminal_batch) + 0 223 | 224 | target_q_values_batch = self.t_model.predict(np.array(next_state_batch)) 225 | y_batch = reward_batch + (1 - terminal_batch) * \ 226 | GAMMA * np.amax(target_q_values_batch, axis=1) 227 | 228 | loss, _ = self.sess.run([self.loss, self.train_op], { 229 | self.q_model.inputs[0]: np.array(state_batch), 230 | self.a: action_batch, 231 | self.y: y_batch 232 | }) 233 | 234 | self.total_loss += loss 235 | 236 | def setup_summary(self): 237 | episode_total_reward = tf.Variable(0.) 238 | tf.scalar_summary('/Total Reward/Episode', 239 | episode_total_reward) 240 | episode_avg_max_q = tf.Variable(0.) 241 | tf.scalar_summary( 242 | '/Average Max Q/Episode', episode_avg_max_q) 243 | episode_duration = tf.Variable(0.) 244 | tf.scalar_summary('/Duration/Episode', episode_duration) 245 | episode_avg_loss = tf.Variable(0.) 246 | tf.scalar_summary('/Average Loss/Episode', episode_avg_loss) 247 | summary_vars = [episode_total_reward, 248 | episode_avg_max_q, episode_duration, episode_avg_loss] 249 | summary_placeholders = [tf.placeholder( 250 | tf.float32) for _ in range(len(summary_vars))] 251 | update_ops = [summary_vars[i].assign( 252 | summary_placeholders[i]) for i in range(len(summary_vars))] 253 | summary_op = tf.summary.merge_all() 254 | return summary_placeholders, update_ops, summary_op 255 | --------------------------------------------------------------------------------
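Note on the Q-learning target in rl/dqn.py: learn() builds the batched Bellman
target with the same formula shown below. A small worked sketch (the numbers are
arbitrary and purely illustrative):

    import numpy as np
    GAMMA = 0.99
    reward_batch = np.array([1.0, 1.0])
    terminal_batch = np.array([0, 1])               # the second transition ends its episode
    target_q_values_batch = np.array([[0.5, 2.0],   # Q from the target network at s'
                                      [3.0, 1.0]])
    y_batch = reward_batch + (1 - terminal_batch) * GAMMA * np.amax(target_q_values_batch, axis=1)
    # y_batch -> [2.98, 1.0]: bootstrapped for non-terminal, raw reward for terminal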