├── README.md
├── gym_experiment.py
├── policy_gradient.py
├── test_pg.py
└── tf_util.py
/README.md:
--------------------------------------------------------------------------------
1 | # tensorflow-policy-gradient
2 | 
3 | Still under construction...
4 | 
5 | ## Dependencies
6 | - Python 2.7
7 | - TensorFlow >= 0.8.0
8 | - NumPy >= 1.10.0
9 | - OpenAI Gym
10 | - matplotlib
11 | 
12 | ## Quick try
13 | Run
14 | ```bash
15 | python gym_experiment.py
16 | ```
17 | to train a softmax policy (without a bias term) using vanilla policy gradient on the [CartPole task](https://gym.openai.com/envs/CartPole-v0). The return increases stochastically until it reaches the maximum episode return of 200.
18 | 
--------------------------------------------------------------------------------
/gym_experiment.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | import sys
5 | import time
6 | 
7 | import gym
8 | import tensorflow as tf
9 | import numpy as np
10 | import policy_gradient as pg
11 | import matplotlib.pyplot as plt
12 | 
13 | 
14 | def main():
15 |     parser = argparse.ArgumentParser()
16 | 
17 |     parser.add_argument('--env', type=str,
18 |                         default='CartPole-v0',
19 |                         help='Environment name.')
20 | 
21 |     parser.add_argument('--save_path', type=str,
22 |                         default='',
23 |                         help='Path to save experiments.')
24 | 
25 |     parser.add_argument('--config_file', type=str,
26 |                         default='',
27 |                         help='JSON file containing configurations.')
28 | 
29 |     parser.add_argument('--n_iters', type=int,
30 |                         default=200,
31 |                         help='Number of iterations.')
32 | 
33 |     parser.add_argument('--evaluate_freq', type=int,
34 |                         default=10,
35 |                         help='How often to evaluate on a game.')
36 | 
37 |     parser.add_argument('--max_steps', type=int,
38 |                         default=200,
39 |                         help='Upper limit of episode length.')
40 | 
41 | 
42 |     # Parameters for unittesting the implementation.
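    # --record (below) wraps the environment with gym's monitor (env.monitor.start),
    # which writes episode results under save_path for later inspection or upload.
    # A --config_file, when given, is a JSON object whose keys mirror NNAgent's
    # keyword arguments (a sketch, not a fixed schema), e.g.
    #   {"learning_rate": 100.0, "discount": 0.98,
    #    "use_softmax_bias": false, "use_rnn": false}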
43 | parser.add_argument('--record', dest='record', action='store_true', 44 | help='Whether to record and save this experiment.') 45 | parser.set_defaults(record=False) 46 | 47 | args = parser.parse_args() 48 | 49 | if not args.save_path: 50 | args.save_path = os.path.join('/tmp/', args.env + '-tmp-experiment') 51 | 52 | env = gym.make(args.env) 53 | 54 | if args.record: 55 | env.monitor.start(args.save_path, force=True) 56 | 57 | if args.config_file: 58 | with open(args.config_file, 'r') as f: 59 | config = json.load(f) 60 | agent = pg.NNAgent(env.action_space, env.observation_space, 61 | max_steps=args.max_steps, **config) 62 | else: 63 | agent = pg.NNAgent(env.action_space, env.observation_space, 64 | max_steps=args.max_steps, 65 | learning_rate=100.0, discount=0.98, 66 | use_softmax_bias=False, 67 | use_rnn=False) 68 | 69 | n_iters = args.n_iters 70 | iter_num = range(n_iters) 71 | returns = [] 72 | t1 = time.time() 73 | for i in xrange(n_iters): 74 | returns.append(agent.train_batch(env, total_steps=2000, batch_size=None)[:2]) 75 | # print agent.session.run(agent.train_graph.learning_rate) 76 | m_return = returns[-1][0] 77 | m_ep_len = returns[-1][1] 78 | print "Iteration %s:" % i 79 | print " average return {}\n average episode length {}".format(m_return, m_ep_len) 80 | 81 | if i % args.evaluate_freq == 0: 82 | evaluate(env, agent, 5, args.max_steps) 83 | 84 | t2 = time.time() 85 | print '{} sec used, {} sec per iteration.'.format(t2 - t1, (t2 - t1) / n_iters) 86 | 87 | if args.record: 88 | env.monitor.close() 89 | 90 | plt.plot(iter_num, [r[0] for r in returns]) 91 | plt.xlabel('Number of iterations') 92 | plt.ylabel('Average return') 93 | plt.show() 94 | plt.plot(iter_num, [r[1] for r in returns]) 95 | plt.ylabel('Average episode length') 96 | plt.show() 97 | 98 | 99 | def evaluate(env, agent, n_eps, max_steps): 100 | for i_episode in range(n_eps): 101 | observation = env.reset() 102 | for t in range(max_steps): 103 | env.render() 104 | action = agent.get_action(observation) 105 | observation, reward, done, info = env.step(action) 106 | if done: 107 | print("Episode finished after {} timesteps".format(t+1)) 108 | break 109 | else: 110 | print("Episode reached maximum length after {} timesteps".format(t+1)) 111 | 112 | 113 | if __name__ == '__main__': 114 | main() 115 | -------------------------------------------------------------------------------- /policy_gradient.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tensorflow as tf 3 | import numpy as np 4 | import tf_util 5 | 6 | 7 | class NNAgent(object): 8 | # An reinforcement learning agent using vanilla policy gradient. 
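    # Vanilla policy gradient (REINFORCE): sample trajectories with the current
    # policy, then follow the return-weighted score function
    #   grad J(theta) ~ mean_t [ grad log pi_theta(a_t | s_t) * R_t ],
    # where R_t is the discounted return from step t onward.  NNGraph below
    # realizes this by minimizing the return-weighted negative log-likelihood
    # of the sampled actions.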
9 | def __init__(self, action_space, observation_space, 10 | use_rnn=False, use_fnn=False, 11 | max_steps=100, discount=0.9, learning_rate=0.01, 12 | use_softmax_bias=True, 13 | rnn_model='rnn', rnn_hidden_size=32, rnn_num_layers=1, 14 | fnn_hidden_sizes=[32, 32], 15 | fnn_activation_fns=[tf.nn.relu, tf.nn.relu], 16 | fnn_l2_scale=0.0): 17 | self.graph = tf.Graph() 18 | with self.graph.as_default(): 19 | with tf.name_scope('Training'): 20 | self.train_graph = NNGraph(action_space, observation_space, 21 | learning_rate=learning_rate, 22 | use_softmax_bias=use_softmax_bias, 23 | rnn_model=rnn_model, 24 | rnn_hidden_size=rnn_hidden_size, 25 | rnn_num_layers=rnn_num_layers, 26 | fnn_hidden_sizes=fnn_hidden_sizes, 27 | fnn_activation_fns=fnn_activation_fns, 28 | fnn_l2_scale=fnn_l2_scale, 29 | use_rnn=use_rnn, use_fnn=use_fnn) 30 | self.inference_graph = self.train_graph 31 | saver = tf.train.Saver(name='checkpoint_saver') 32 | init_op = tf.initialize_all_variables() 33 | self.session = tf.Session(graph=self.graph) 34 | self.session.run(init_op) 35 | self.n_actions = self.inference_graph.n_actions 36 | self.use_rnn = use_rnn 37 | self.max_steps = max_steps 38 | self.discount = discount 39 | if self.use_rnn: 40 | self.last_state = None 41 | self.need_reset = False 42 | 43 | def reset(self): 44 | if self.use_rnn: 45 | self.need_reset = True 46 | 47 | def get_actions(self, obs): 48 | "Given a batch of observations, produce a batch of actions." 49 | if self.use_rnn: 50 | if self.need_reset: 51 | self.last_state = self.session.run( 52 | self.inference_graph.zero_state, 53 | feed_dict={self.inference_graph.obs: obs}) 54 | self.need_reset = False 55 | 56 | probs, self.last_state = self.session.run( 57 | [self.inference_graph.probs, 58 | self.inference_graph.final_state], 59 | feed_dict={self.inference_graph.obs: obs, 60 | self.inference_graph.initial_state: 61 | self.last_state, 62 | self.inference_graph.seq_lens: [1] * obs.shape[1]}) 63 | else: 64 | probs = self.session.run(self.inference_graph.probs, 65 | feed_dict={self.inference_graph.obs: 66 | obs}) 67 | 68 | actions = [] 69 | for prob in probs: 70 | actions.append(np.random.choice(self.n_actions, 1, p=prob)[0]) 71 | return actions 72 | 73 | def get_action(self, ob): 74 | "Given one observation, produce one action." 75 | return self.get_actions(np.array([[ob]]))[0] 76 | 77 | def get_batch(self, env, batch_size=None, 78 | total_steps=2000): 79 | paths = [] 80 | if batch_size is None: 81 | batch_size = np.inf 82 | if total_steps is None: 83 | total_steps = np.inf 84 | if ((batch_size is None) and 85 | (total_steps is None)): 86 | raise ValueError("batch_size and total_steps can't all be None.") 87 | 88 | steps = 0 89 | i = 0 90 | # for _ in xrange(batch_size): 91 | while True: 92 | obs = [] 93 | actions = [] 94 | rewards = [] 95 | paddings = [] 96 | ob = env.reset() 97 | self.reset() 98 | for _ in xrange(self.max_steps): 99 | if isinstance(ob, np.ndarray): 100 | ob = np.reshape(ob, [-1]) 101 | action = self.get_action(ob) 102 | next_ob, reward, done, _ = env.step(action) 103 | obs.append(ob) 104 | actions.append(action) 105 | rewards.append(reward) 106 | ob = next_ob 107 | if done: 108 | break 109 | # We need to compute the empirical return for each 110 | # time step along the trajectory. 
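            # The backward pass implements
            #   returns[t] = rewards[t] + discount * returns[t + 1].
            # E.g. rewards [1, 1, 1] with discount 0.9 give return_so_far values
            # 1.0, 1.9, 2.71, which become returns = [2.71, 1.9, 1.0] after the
            # reversal below.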
111 | returns = [] 112 | return_so_far = 0.0 113 | for t in xrange(len(rewards) - 1, -1, -1): 114 | return_so_far = rewards[t] + self.discount * return_so_far 115 | returns.append(return_so_far) 116 | # The returns are stored backwards in time, so we need to revert it. 117 | returns = returns[::-1] 118 | 119 | steps += len(actions) 120 | i += 1 121 | if ((steps > total_steps) or 122 | (i > batch_size)): 123 | break 124 | 125 | paths.append(dict( 126 | observations=np.array(obs), 127 | actions=np.array(actions), 128 | rewards=np.array(rewards), 129 | returns=np.array(returns), 130 | ep_len=len(actions))) 131 | 132 | mean_return=np.mean([np.sum(path['rewards']) for path in paths]) 133 | mean_ep_len=np.mean([path['ep_len'] for path in paths]) 134 | return paths, mean_return, mean_ep_len 135 | 136 | def train_batch(self, env, batch_size=None, 137 | total_steps=2000): 138 | paths, mean_return, mean_ep_len = self.get_batch(env, batch_size=batch_size, 139 | total_steps=total_steps) 140 | obs_list = [path['observations'] for path in paths] 141 | actions_list = [path['actions'] for path in paths] 142 | returns_list = [path['returns'] for path in paths] 143 | 144 | if self.use_rnn: 145 | seq_lens = [path['ep_len'] for path in paths] 146 | max_ep_len = np.max(seq_lens) 147 | obs = pad_batch(obs_list, max_ep_len) 148 | actions = pad_batch(actions_list, max_ep_len) 149 | returns = pad_batch(returns_list, max_ep_len) 150 | 151 | self.last_state = self.session.run( 152 | self.inference_graph.zero_state, 153 | feed_dict={self.inference_graph.obs: obs}) 154 | 155 | # print actions.shape 156 | # print obs.shape 157 | # print self.train_graph.actions.get_shape() 158 | _, outputs = self.session.run( 159 | [self.train_graph.train_op, self.train_graph.outputs], 160 | feed_dict={self.train_graph.obs: obs, 161 | self.train_graph.initial_state: 162 | self.last_state, 163 | self.train_graph.seq_lens: seq_lens, 164 | self.train_graph.returns: returns, 165 | self.train_graph.actions: actions}) 166 | # print outputs.shape 167 | else: 168 | # If not useing RNN, just concatenate every 169 | # steps into one large list. 170 | obs = np.array([np.concatenate(obs_list)]) 171 | actions = np.array([np.concatenate(actions_list)]) 172 | returns = np.array([np.concatenate(returns_list)]) 173 | 174 | feed_dict = {self.train_graph.actions: actions, 175 | self.train_graph.returns: returns, 176 | self.train_graph.obs: obs} 177 | 178 | self.session.run([self.train_graph.train_op], 179 | feed_dict=feed_dict) 180 | 181 | return mean_return, mean_ep_len 182 | 183 | 184 | def pad_batch(batch, max_ep_len): 185 | num_dim = len(batch[0].shape) 186 | new_batch = [] 187 | for ep in batch: 188 | # the first dimension, number of steps in the 189 | # episode is padded to be the same as max_ep_len, 190 | # the rest dimensions are not touched. 
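        # E.g. an observation array of shape (ep_len, ob_dim) = (3, 4) with
        # max_ep_len = 5 is zero-padded to (5, 4); the swapaxes call at the end
        # of this function then stacks the episodes into a time-major
        # (max_ep_len, batch, ...) array.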
191 | padded_ep = np.pad(ep, ([(0, max_ep_len - ep.shape[0])] + 192 | [(0, 0)] * (num_dim - 1)), 193 | 'constant', constant_values=0) 194 | new_batch.append(padded_ep) 195 | new_batch = np.array(new_batch) 196 | time_major_batch = np.swapaxes(new_batch, 0, 1) 197 | return time_major_batch 198 | 199 | 200 | class NNGraph(object): 201 | def __init__(self, action_space, observation_space, 202 | learning_rate=0.001, use_rnn=False, use_fnn=False, 203 | max_grad_norm=5.0, rnn_model='lstm', 204 | rnn_hidden_size=128, rnn_num_layers=2, 205 | fnn_hidden_sizes=[128, 128], 206 | fnn_activation_fns=[tf.nn.relu, tf.nn.relu], 207 | fnn_l2_scale=0.0, 208 | use_softmax_bias=True, 209 | is_training=True): 210 | 211 | self.n_actions = action_space.n 212 | 213 | try: 214 | # observation is an instance of Box. 215 | self.ob_dim = np.product(observation_space.shape) 216 | self.is_discrete_ob = False 217 | except AttributeError: 218 | # observation space is an instance of Discrete. 219 | self.ob_dim = observation_space.n 220 | self.is_discrete_ob = True 221 | 222 | self.global_step = tf.get_variable( 223 | 'global_step', [], 224 | initializer=tf.constant_initializer(0.0), 225 | trainable=False) 226 | 227 | if use_rnn: 228 | shape = [None, None] 229 | else: 230 | shape = [1, None] 231 | 232 | # Placeholder to feed in observations, actions and returns. 233 | if self.is_discrete_ob: 234 | # if observation_space is an instance of Discrete, then 235 | # should use embeddings to expand it. 236 | self.obs = tf.placeholder(tf.int64, shape, 237 | name='Observation') 238 | # Embeddings layers. 239 | with tf.name_scope('Embeddings'): 240 | self.embedding = tf.constant(np.eye(self.ob_dim), dtype=tf.float32) 241 | self.inputs = tf.nn.embedding_lookup(self.embedding, self.obs) 242 | input_size = self.ob_dim 243 | else: 244 | # if observation_space is an instance of Box, 245 | # then just use itself. 246 | self.obs = tf.placeholder(tf.float32, 247 | shape + [self.ob_dim], # list(observation_space.shape), 248 | name='Observation') 249 | 250 | self.inputs = self.obs # tf.reshape(self.obs, [-1, ]) 251 | input_size = self.ob_dim 252 | 253 | if use_fnn: 254 | self.processed_inputs = tf_util.create_fnn_ops(self.inputs, input_size, 255 | hidden_sizes=fnn_hidden_sizes, 256 | activation_fns=fnn_activation_fns, 257 | l2_scale=fnn_l2_scale) 258 | self.processed_input_size = fnn_hidden_sizes[-1] 259 | else: 260 | self.processed_inputs = self.inputs 261 | self.processed_input_size = self.ob_dim 262 | 263 | if use_rnn: 264 | with tf.name_scope('Dynamic_RNN'): 265 | in_ops, out_ops = tf_util.create_rnn_ops(self.processed_inputs, 266 | self.processed_input_size, 267 | rnn_model=rnn_model, 268 | hidden_size=rnn_hidden_size, 269 | num_layers=rnn_num_layers) 270 | self.zero_state = in_ops[0] 271 | self.initial_state = in_ops[1] 272 | self.seq_lens = in_ops[2] 273 | self.outputs, self.final_state = out_ops 274 | output_dim = rnn_hidden_size 275 | else: 276 | self.outputs = self.processed_inputs 277 | output_dim = self.processed_input_size 278 | 279 | flat_outputs = tf.reshape(self.outputs, [-1, output_dim]) 280 | 281 | self.logits, self.probs = tf_util.create_softmax_ops(flat_outputs, 282 | output_dim, self.n_actions, 283 | use_softmax_bias=use_softmax_bias) 284 | 285 | with tf.name_scope('Training'): 286 | # actions and returns. 
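            # Both placeholders follow the observation layout: [time, batch]
            # when use_rnn is True, [1, n_steps] otherwise.  They are flattened
            # below so the same softmax / cross-entropy ops serve both cases,
            # with the returns acting as per-step example weights.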
287 | self.actions = tf.placeholder(tf.int64, 288 | [None, None], 289 | name='actions') 290 | 291 | self.returns = tf.placeholder(tf.float32, 292 | [None, None], 293 | name='returns') 294 | 295 | flat_actions = tf.reshape(self.actions, [-1]) 296 | flat_returns = tf.reshape(self.returns, [-1]) 297 | 298 | if use_rnn: 299 | self.mean_weighted_neg_ll = tf_util.rnn_weighted_neg_ll( 300 | self.logits, flat_actions, flat_returns, self.seq_lens) 301 | else: 302 | self.mean_weighted_neg_ll = tf_util.weighted_neg_ll( 303 | self.logits, flat_actions, flat_returns) 304 | 305 | with tf.name_scope('Optimization'): 306 | # self.learning_rate = tf.constant(learning_rate) 307 | self.learning_rate = tf.train.exponential_decay( 308 | learning_rate, self.global_step, 100, 1.0, staircase=True) 309 | 310 | tvars = tf.trainable_variables() 311 | 312 | # print [tvar.name for tvar in tvars] 313 | self.model_size = np.sum([np.product(tvar.get_shape().as_list()) 314 | for tvar in tvars]) 315 | print('model size is %s' % self.model_size) 316 | grads = tf.gradients(self.mean_weighted_neg_ll, tvars) 317 | self.grads = grads 318 | 319 | if use_rnn: 320 | grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) 321 | 322 | # self.grad_1 = grads[0] 323 | # self.grad_2 = grads[1] 324 | 325 | optimizer = tf.train.GradientDescentOptimizer(self.learning_rate) 326 | 327 | # optimizer = tf.train.RMSPropOptimizer(learning_rate, decay_rate) 328 | # optimizer = tf.train.AdamOptimizer(self.learning_rate) 329 | 330 | self.train_op = optimizer.apply_gradients(zip(grads, tvars), 331 | global_step=self.global_step) 332 | 333 | 334 | -------------------------------------------------------------------------------- /test_pg.py: -------------------------------------------------------------------------------- 1 | import json 2 | import unittest 3 | import numpy as np 4 | import gym 5 | 6 | import policy_gradient as pg 7 | 8 | class TestPG(unittest.TestCase): 9 | def setUp(self): 10 | self.env = gym.make('FrozenLake-v0') 11 | self.agent = pg.NNAgent(self.env.action_space, 12 | self.env.observation_space, 13 | max_steps=100, learning_rate=100.0, 14 | discount=0.98) 15 | 16 | def test_get_batch(self): 17 | paths, mean_return, mean_ep_len = self.agent.get_batch( 18 | self.env, total_steps=2000) 19 | self.assertTrue(np.sum([path['ep_len'] for path in paths]) <= 2000) 20 | 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def create_rnn_ops(inputs, input_size, rnn_model='lstm', 6 | hidden_size=128, num_layers=2, use_dropout=False, 7 | dropout_rate=0.0, time_major=True): 8 | "Utility function to create multi-layer RNN." 9 | if rnn_model == 'rnn': 10 | cell_fn = tf.nn.rnn_cell.BasicRNNCell 11 | elif rnn_model == 'lstm': 12 | cell_fn = tf.nn.rnn_cell.BasicLSTMCell 13 | elif rnn_model == 'gru': 14 | cell_fn = tf.nn.rnn_cell.GRUCell 15 | 16 | params = {'input_size': input_size} 17 | if rnn_model == 'lstm': 18 | # add bias to forget gate in lstm. 19 | params['forget_bias'] = 0.0 20 | 21 | # Create multilayer cell. 
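    # The first layer consumes input_size-dimensional inputs; each higher layer
    # consumes the hidden_size-dimensional output of the layer below, which is
    # why params['input_size'] is rebound before the remaining cells are built.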
22 | cell = cell_fn(hidden_size, 23 | **params) 24 | cells = [cell] 25 | params['input_size'] = hidden_size 26 | # more explicit way to create cells for MultiRNNCell than 27 | # [higher_layer_cell] * (self.num_layers - 1) 28 | for i in range(num_layers-1): 29 | higher_layer_cell = cell_fn(hidden_size, 30 | **params) 31 | cells.append(higher_layer_cell) 32 | 33 | if use_dropout and (dropout_rate > 0.0): 34 | # dropout_rate = tf.placeholder(tf.float32, [], 'dropout_rate') 35 | cells = [tf.nn.rnn_cell.DropoutWrapper( 36 | cell, output_keep_prob=1.0-dropout_rate) 37 | for cell in cells] 38 | 39 | multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells) 40 | 41 | # batch_size = tf.placeholder(tf.int32, 42 | # name='batch_size') 43 | 44 | batch_size = tf.shape(inputs)[1] 45 | 46 | with tf.name_scope('initial_state'): 47 | # zero_state is used to compute the intial state for cell. 48 | zero_state = multi_cell.zero_state(batch_size, tf.float32) 49 | # Placeholder to feed in initial state. 50 | initial_state = tf.placeholder(tf.float32, 51 | [None, multi_cell.state_size], 52 | 'initial_state') 53 | 54 | seq_lens = tf.placeholder(tf.int64, None, 'sequence_lengths') 55 | 56 | outputs, final_state = tf.nn.dynamic_rnn(multi_cell, inputs, seq_lens, 57 | initial_state=initial_state, 58 | time_major=time_major) 59 | 60 | return ((zero_state, initial_state, seq_lens), 61 | (outputs, final_state)) 62 | 63 | 64 | def create_fnn_ops(inputs, input_dim, 65 | hidden_sizes, activation_fns, 66 | l2_scale=0.0): 67 | "Utility function to create multi-layer FNN with l2 regularization." 68 | x_dim = input_dim 69 | x = inputs 70 | for i, h in enumerate(hidden_sizes): 71 | if activation_fns[i] == tf.nn.relu: 72 | init_b = 0.1 73 | else: 74 | init_b = 0.0 75 | 76 | a = tf.contrib.layers.fully_connected( 77 | x, h, activation_fn=activation_fns[i], 78 | weight_init=tf.truncated_normal_initializer(mean=0.0, stddev=0.1), 79 | bias_init=tf.constant_initializer(value=init_b), 80 | weight_regularizer=tf.contrib.layers.l2_regularizer(l2_scale)) 81 | x = a 82 | outputs = a 83 | return outputs 84 | 85 | 86 | def create_softmax_ops(inputs, input_dim, n_classes, use_softmax_bias=True): 87 | "Uitlity function to create softmax operations." 88 | with tf.name_scope('Softmax'): 89 | softmax_w = tf.get_variable("weights", #[output_dim, self.n_actions], 90 | initializer=tf.zeros_initializer( 91 | [input_dim, n_classes])) 92 | 93 | if use_softmax_bias: 94 | softmax_b = tf.get_variable( 95 | "bias", #[1, self.n_actions], 96 | initializer=tf.zeros_initializer([n_classes])) 97 | logits = tf.matmul(inputs, softmax_w) + softmax_b 98 | else: 99 | logits = tf.matmul(inputs, softmax_w) 100 | 101 | probs = tf.nn.softmax(logits) 102 | 103 | return logits, probs 104 | 105 | 106 | def weighted_neg_ll(logits, labels, example_weights): 107 | with tf.name_scope('weighted_neg_ll'): 108 | # Compute mean cross entropy loss for each output. 109 | neg_log_likelihood = tf.nn.sparse_softmax_cross_entropy_with_logits( 110 | logits, labels) 111 | mean_weighted_neg_ll = tf.reduce_mean( 112 | neg_log_likelihood * example_weights) 113 | return mean_weighted_neg_ll 114 | 115 | 116 | def rnn_weighted_neg_ll(logits, labels, example_weights, seq_lens): 117 | with tf.name_scope('rnn_weighted_neg_ll'): 118 | # Compute mean cross entropy loss for each output. 
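        # Unlike weighted_neg_ll above, the loss here is normalized by the
        # number of real (unpadded) steps, sum(seq_lens); padded positions have
        # zero example_weights, so they contribute nothing to the numerator.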
119 | neg_log_likelihood = tf.nn.sparse_softmax_cross_entropy_with_logits( 120 | logits, labels) 121 | 122 | mean_weighted_neg_ll = (tf.reduce_sum(neg_log_likelihood * example_weights) / 123 | tf.to_float(tf.reduce_sum(seq_lens))) 124 | 125 | return mean_weighted_neg_ll 126 | --------------------------------------------------------------------------------
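For reference, a minimal sketch of driving the agent directly (outside `gym_experiment.py`): it assumes the legacy Python 2 / gym / TensorFlow versions listed in the README and simply reuses the defaults that `gym_experiment.py` passes to `NNAgent`.

```python
import gym
import policy_gradient as pg

env = gym.make('CartPole-v0')

# Softmax policy without a bias term, trained with vanilla policy gradient.
agent = pg.NNAgent(env.action_space, env.observation_space,
                   max_steps=200, learning_rate=100.0, discount=0.98,
                   use_softmax_bias=False, use_rnn=False)

for i in xrange(100):
    # Collect roughly 2000 environment steps, then take one gradient step.
    mean_return, mean_ep_len = agent.train_batch(env, total_steps=2000)
    print 'iter %d: return %.1f, episode length %.1f' % (i, mean_return, mean_ep_len)
```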