├── .gitignore ├── A3C ├── README.md ├── ac_net.py ├── acrobot_a3c.py ├── cartpole_a3c.py ├── imgs │ ├── a3c_acrobot.png │ ├── a3c_cartpole_el0.png │ ├── a3c_cartpole_el001.png │ ├── mountaincar_el1.png │ └── mountaincar_tmax15_el1.png ├── mountaincar_a3c.py ├── tf_utils.py └── worker.py ├── DP ├── __init__.py ├── policy_iteration.py ├── test_policy_iteration.py ├── test_value_iteration.py └── value_iteration.py ├── DQN ├── README.md ├── cartpole_dqn.py ├── dqn.py ├── exp_replay.py ├── imgs │ └── dqn_cartpole_training.png ├── test_exp_replay.py └── tf_utils.py ├── LICENSE ├── README.md ├── TD ├── __init__.py ├── cartpole_qlearning.py ├── qlearning.py └── test_qlearning.py ├── ddpg ├── README.md ├── actor.py ├── critic.py ├── ddpg.py ├── exp_replay.py ├── imgs │ └── ddpg_plot.png ├── mountaincar_ddpg.py ├── ou.py ├── pendulum_ddpg.py └── tf_utils.py ├── envs ├── __init__.py ├── env.py ├── gridworld.py ├── mdp.py └── test_gridworld.py ├── imgs └── breakout10.gif ├── monte_carlo ├── monte_carlo.py └── test_monte_carlo.py ├── papers ├── AlphaGoNaturePaper.pdf ├── GAN.pdf ├── Learning2learn_by_GD_by_GD.pdf ├── a3c.pdf ├── browne_mcts_survey_ieee12.pdf ├── ddpg.pdf ├── ddqn.pdf ├── dpg_silver14.pdf ├── dqn.pdf ├── dqn_nature.pdf ├── drl_bench_mark2016.pdf ├── dueling_dqn.pdf └── learn2rl.pdf ├── policy_gradient ├── README.md ├── cartpole_reinforce.py ├── cartpole_reinforce_baseline.py ├── imgs │ ├── cartpole_reinforce.png │ └── cartpole_reinforce_w_baseline.png ├── reinforce.py ├── reinforce_w_baseline.py └── tf_utils.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .coverage 3 | *.pyc 4 | run_tests.sh 5 | run_format.sh 6 | push.sh 7 | *.png 8 | *.jpg 9 | visualize_history.py 10 | log.txt 11 | nohup.out 12 | *log.txt 13 | *log*.txt 14 | tmp/* 15 | *logs/ -------------------------------------------------------------------------------- /A3C/README.md: -------------------------------------------------------------------------------- 1 | ## Asynchronized Advantage Actor-Critic 2 | 3 | Following paper: Asynchronous Methods for Deep Reinforcement Learning [(https://arxiv.org/pdf/1602.01783.pdf)](https://arxiv.org/pdf/1602.01783.pdf) 4 | 5 | #### Cartpole-v0 result 6 | 7 | `$ python cartpole_a3c.py --device=cpu --episodes=1000 --workers=4 --log_dir=cartpole_logs` 8 | 9 | The following graph shows the episode rewards (# workers: 4, entropy loss: 0.2) 10 | 11 | Tensorboard: 12 | 13 | `$ tensorboard --logdir=cartpole_logs/` 14 | 15 | ![A3C training](imgs/a3c_cartpole_el001.png "A3C training") 16 | 17 | 21 | 22 | #### Acrobot-v1 result 23 | 24 | `$ python acrobot_a3c.py --device=cpu --episodes=500 --workers=4 --log_dir=acrobot_logs` 25 | 26 | The following graph shows the episode rewards (# workers: 4, entropy loss: 0.2) 27 | 28 | ![A3C training](imgs/a3c_acrobot.png "A3C training") 29 | 30 | #### MountainCar-v0 result 31 | 32 | `$ python mountaincar_a3c.py --device=cpu --episodes=20000 --workers=8 --log_dir=mc_logs` 33 | 34 | The following graph shows the episode rewards (# workers: 8, entropy loss: 1.0, tmax=5) 35 | 36 | ![A3C training](imgs/mountaincar_tmax15_el1.png "A3C training") 37 | 38 | 39 | #### References 40 | 41 | - Openai's A3C implementation ([https://github.com/openai/universe-starter-agent](https://github.com/openai/universe-starter-agent)) 42 | - Arthur Juliani's blog post 
([https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2)) 43 | -------------------------------------------------------------------------------- /A3C/ac_net.py: -------------------------------------------------------------------------------- 1 | '''Actor-critic network class for a3c''' 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | import tf_utils 6 | 7 | 8 | class AC_Net(object): 9 | '''Actor-critic network class for a3c''' 10 | 11 | def __init__(self, state_size, action_size, lr, 12 | name, n_h1=400, n_h2=300, global_name='global'): 13 | 14 | self.state_size = state_size 15 | self.action_size = action_size 16 | self.name = name 17 | self.n_h1 = n_h1 18 | self.n_h2 = n_h2 19 | 20 | self.optimizer = tf.train.AdamOptimizer(lr) 21 | self.input_s, self.input_a, self.advantage, self.target_v, self.policy, self.value, self.action_est, self.model_variables = self._build_network( 22 | name) 23 | 24 | # 0.5, 0.2, 1.0 25 | self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1]))) 26 | self.entropy_loss = 1.0 * tf.reduce_sum(self.policy * tf.log(self.policy)) 27 | self.policy_loss = 1.0 * tf.reduce_sum(-tf.log(self.action_est) * self.advantage) 28 | self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.model_variables]) 29 | # self.loss = 0.5 * self.value_loss + self.policy_loss + 0.2 * self.entropy_loss 30 | self.loss = self.value_loss + self.policy_loss + self.entropy_loss 31 | self.gradients = tf.gradients(self.loss, self.model_variables) 32 | if name != global_name: 33 | self.var_norms = tf.global_norm(self.model_variables) 34 | global_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, global_name) 35 | self.apply_gradients = self.optimizer.apply_gradients(zip(self.gradients, global_variables)) 36 | 37 | def _build_network(self, name): 38 | input_s = tf.placeholder(tf.float32, [None, self.state_size]) 39 | input_a = tf.placeholder(tf.int32, [None]) 40 | advantage = tf.placeholder(tf.float32, [None]) 41 | target_v = tf.placeholder(tf.float32, [None]) 42 | 43 | with tf.variable_scope(name): 44 | layer_1 = tf_utils.fc( 45 | input_s, 46 | self.n_h1, 47 | scope="fc1", 48 | activation_fn=tf.nn.relu, 49 | initializer=tf.contrib.layers.variance_scaling_initializer( 50 | mode="FAN_IN")) 51 | layer_2 = tf_utils.fc( 52 | layer_1, 53 | self.n_h2, 54 | scope="fc2", 55 | activation_fn=tf.nn.relu, 56 | initializer=tf.contrib.layers.variance_scaling_initializer( 57 | mode="FAN_IN")) 58 | policy = tf_utils.fc( 59 | layer_2, 60 | self.action_size, 61 | activation_fn=tf.nn.softmax, 62 | scope="policy", 63 | initializer=tf_utils.normalized_columns_initializer(0.01)) 64 | value = tf_utils.fc(layer_2, 1, activation_fn=None, 65 | scope="value", initializer=tf_utils.normalized_columns_initializer(1.0)) 66 | 67 | action_mask = tf.one_hot(input_a, self.action_size, 1.0, 0.0) 68 | action_est = tf.reduce_sum(policy * action_mask, 1) 69 | 70 | model_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) 71 | return input_s, input_a, advantage, target_v, policy, value, action_est, model_variables 72 | 73 | def get_action(self, state, sess): 74 | state = np.reshape(state, [-1, self.state_size]) 75 | policy = sess.run(self.policy, feed_dict={self.input_s: state}) 76 | return np.random.choice(range(self.action_size), 
p=policy[0]) 77 | 78 | def predict_policy(self, state, sess): 79 | state = np.reshape(state, [-1, self.state_size]) 80 | policy = sess.run(self.policy, feed_dict={self.input_s: state}) 81 | return policy[0] 82 | 83 | def predict_value(self, state, sess): 84 | state = np.reshape(state, [-1, self.state_size]) 85 | return sess.run(self.value, feed_dict={self.input_s: state}) 86 | -------------------------------------------------------------------------------- /A3C/acrobot_a3c.py: -------------------------------------------------------------------------------- 1 | '''Example of A3C running on Acrobot environment 2 | ''' 3 | import argparse 4 | import time 5 | import threading 6 | import tensorflow as tf 7 | import gym 8 | # import multiprocessing 9 | 10 | import ac_net 11 | import worker 12 | 13 | PARSER = argparse.ArgumentParser(description=None) 14 | PARSER.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 15 | PARSER.add_argument('-e', '--episodes', default=500, type=int, help='number of episodes') 16 | PARSER.add_argument('-w', '--workers', default=4, type=int, help='number of workers') 17 | PARSER.add_argument('-l', '--log_dir', default='acrobot_logs', type=str, help='log directory') 18 | ARGS = PARSER.parse_args() 19 | print ARGS 20 | 21 | DEVICE = ARGS.device 22 | STATE_SIZE = 6 23 | ACTION_SIZE = 3 24 | LEARNING_RATE = 0.0001 25 | GAMMA = 0.99 26 | T_MAX = 5 27 | # NUM_WORKERS = multiprocessing.cpu_count() 28 | NUM_WORKERS = ARGS.workers 29 | NUM_EPISODES = ARGS.episodes 30 | LOG_DIR = ARGS.log_dir 31 | 32 | N_H1 = 300 33 | N_H2 = 300 34 | 35 | 36 | def main(): 37 | '''Example of A3C running on Acrobot environment''' 38 | tf.reset_default_graph() 39 | 40 | history = [] 41 | 42 | with tf.device('/{}:0'.format(DEVICE)): 43 | sess = tf.Session() 44 | global_model = ac_net.AC_Net( 45 | STATE_SIZE, 46 | ACTION_SIZE, 47 | LEARNING_RATE, 48 | 'global', 49 | n_h1=N_H1, 50 | n_h2=N_H2) 51 | workers = [] 52 | for i in xrange(NUM_WORKERS): 53 | env = gym.make('Acrobot-v1') 54 | env._max_episode_steps = 3000 55 | workers.append(worker.Worker(env, 56 | state_size=STATE_SIZE, action_size=ACTION_SIZE, 57 | worker_name='worker_{}'.format(i), global_name='global', 58 | lr=LEARNING_RATE, gamma=GAMMA, t_max=T_MAX, sess=sess, 59 | history=history, n_h1=N_H1, n_h2=N_H2, logdir=LOG_DIR)) 60 | 61 | sess.run(tf.global_variables_initializer()) 62 | 63 | for workeri in workers: 64 | worker_work = lambda: workeri.work(NUM_EPISODES) 65 | thread = threading.Thread(target=worker_work) 66 | thread.start() 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /A3C/cartpole_a3c.py: -------------------------------------------------------------------------------- 1 | '''Example of A3C running on Cartpole environment''' 2 | import argparse 3 | import time 4 | import threading 5 | import tensorflow as tf 6 | import gym 7 | # import multiprocessing 8 | 9 | import ac_net 10 | import worker 11 | 12 | PARSER = argparse.ArgumentParser(description=None) 13 | PARSER.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 14 | PARSER.add_argument('-e', '--episodes', default=1000, type=int, help='number of episodes') 15 | PARSER.add_argument('-w', '--workers', default=4, type=int, help='number of workers') 16 | PARSER.add_argument('-l', '--log_dir', default='cartpole_logs', type=str, help='log directory') 17 | ARGS = PARSER.parse_args() 18 | print ARGS 19 | 20 | 21 | DEVICE = 
ARGS.device 22 | STATE_SIZE = 4 23 | ACTION_SIZE = 2 24 | LEARNING_RATE = 0.0001 25 | GAMMA = 0.99 26 | T_MAX = 5 27 | # NUM_WORKERS = multiprocessing.cpu_count() 28 | NUM_WORKERS = ARGS.workers 29 | NUM_EPISODES = ARGS.episodes 30 | LOG_DIR = ARGS.log_dir 31 | 32 | 33 | N_H1 = 300 34 | N_H2 = 300 35 | 36 | 37 | def main(): 38 | '''Example of A3C running on Cartpole environment''' 39 | tf.reset_default_graph() 40 | 41 | history = [] 42 | 43 | with tf.device('/{}:0'.format(DEVICE)): 44 | sess = tf.Session() 45 | global_model = ac_net.AC_Net( 46 | STATE_SIZE, 47 | ACTION_SIZE, 48 | LEARNING_RATE, 49 | 'global', 50 | n_h1=N_H1, 51 | n_h2=N_H2) 52 | workers = [] 53 | for i in xrange(NUM_WORKERS): 54 | env = gym.make('CartPole-v0') 55 | env._max_episode_steps = 200 56 | workers.append(worker.Worker(env, 57 | state_size=STATE_SIZE, action_size=ACTION_SIZE, 58 | worker_name='worker_{}'.format(i), global_name='global', 59 | lr=LEARNING_RATE, gamma=GAMMA, t_max=T_MAX, sess=sess, 60 | history=history, n_h1=N_H1, n_h2=N_H2, logdir=LOG_DIR)) 61 | 62 | sess.run(tf.global_variables_initializer()) 63 | 64 | for workeri in workers: 65 | worker_work = lambda: workeri.work(NUM_EPISODES) 66 | thread = threading.Thread(target=worker_work) 67 | thread.start() 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /A3C/imgs/a3c_acrobot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/A3C/imgs/a3c_acrobot.png -------------------------------------------------------------------------------- /A3C/imgs/a3c_cartpole_el0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/A3C/imgs/a3c_cartpole_el0.png -------------------------------------------------------------------------------- /A3C/imgs/a3c_cartpole_el001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/A3C/imgs/a3c_cartpole_el001.png -------------------------------------------------------------------------------- /A3C/imgs/mountaincar_el1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/A3C/imgs/mountaincar_el1.png -------------------------------------------------------------------------------- /A3C/imgs/mountaincar_tmax15_el1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/A3C/imgs/mountaincar_tmax15_el1.png -------------------------------------------------------------------------------- /A3C/mountaincar_a3c.py: -------------------------------------------------------------------------------- 1 | '''Example of A3C running on MountainCar environment''' 2 | import argparse 3 | import time 4 | import threading 5 | import tensorflow as tf 6 | import gym 7 | # import multiprocessing 8 | 9 | import ac_net 10 | import worker 11 | 12 | 13 | PARSER = argparse.ArgumentParser(description=None) 14 | PARSER.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 15 | 
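# NOTE: MountainCar-v0 yields a reward of -1 on every step until the goal is reached,
# so this script's default episode budget is much larger than in the CartPole/Acrobot scripts.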
PARSER.add_argument('-e', '--episodes', default=20000, type=int, help='number of episodes') 16 | PARSER.add_argument('-w', '--workers', default=8, type=int, help='number of workers') 17 | PARSER.add_argument('-l', '--log_dir', default='mountaincar_logs', type=str, help='log directory') 18 | ARGS = PARSER.parse_args() 19 | print ARGS 20 | 21 | 22 | DEVICE = ARGS.device 23 | ENV_NAME = 'MountainCar-v0' 24 | ENV = gym.make('MountainCar-v0') 25 | STATE_SIZE = ENV.observation_space.shape[0] # 2 26 | ACTION_SIZE = ENV.action_space.n # 3 27 | LEARNING_RATE = 0.0001 28 | GAMMA = 0.99 29 | T_MAX = 5 30 | # NUM_WORKERS = multiprocessing.cpu_count() 31 | NUM_WORKERS = ARGS.workers 32 | NUM_EPISODES = ARGS.episodes 33 | MAX_STEPS = 10000 34 | LOG_DIR = ARGS.log_dir 35 | 36 | 37 | N_H1 = 300 38 | N_H2 = 300 39 | 40 | 41 | def main(): 42 | '''Example of A3C running on MountainCar environment''' 43 | tf.reset_default_graph() 44 | 45 | history = [] 46 | 47 | with tf.device('/{}:0'.format(DEVICE)): 48 | sess = tf.Session() 49 | global_model = ac_net.AC_Net( 50 | STATE_SIZE, 51 | ACTION_SIZE, 52 | LEARNING_RATE, 53 | 'global', 54 | n_h1=N_H1, 55 | n_h2=N_H2) 56 | workers = [] 57 | for i in xrange(NUM_WORKERS): 58 | env = gym.make(ENV_NAME) 59 | env._max_episode_steps = MAX_STEPS 60 | workers.append(worker.Worker(env, 61 | state_size=STATE_SIZE, action_size=ACTION_SIZE, 62 | worker_name='worker_{}'.format(i), global_name='global', 63 | lr=LEARNING_RATE, gamma=GAMMA, t_max=T_MAX, sess=sess, 64 | history=history, n_h1=N_H1, n_h2=N_H2, logdir=LOG_DIR)) 65 | 66 | sess.run(tf.global_variables_initializer()) 67 | 68 | for workeri in workers: 69 | worker_work = lambda: workeri.work(NUM_EPISODES) 70 | thread = threading.Thread(target=worker_work) 71 | thread.start() 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /A3C/tf_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for tensorflow""" 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | 6 | def max_pool(x, k_sz=[2, 2]): 7 | """max pooling layer wrapper 8 | Args 9 | x: 4d tensor [batch, height, width, channels] 10 | k_sz: The size of the window for each dimension of the input tensor 11 | Returns 12 | a max pooling layer 13 | """ 14 | return tf.nn.max_pool( 15 | x, ksize=[ 16 | 1, k_sz[0], k_sz[1], 1], strides=[ 17 | 1, k_sz[0], k_sz[1], 1], padding='SAME') 18 | 19 | 20 | def conv2d(x, n_kernel, k_sz, stride=1): 21 | """convolutional layer with relu activation wrapper 22 | Args: 23 | x: 4d tensor [batch, height, width, channels] 24 | n_kernel: number of kernels (output size) 25 | k_sz: 2d array, kernel size. e.g. 
[8,8] 26 | stride: stride 27 | Returns 28 | a conv2d layer 29 | """ 30 | W = tf.Variable(tf.random_normal([k_sz[0], k_sz[1], int(x.get_shape()[3]), n_kernel])) 31 | b = tf.Variable(tf.random_normal([n_kernel])) 32 | # - strides[0] and strides[1] must be 1 33 | # - padding can be 'VALID'(without padding) or 'SAME'(zero padding) 34 | # - http://stackoverflow.com/questions/37674306/what-is-the-difference-between-same-and-valid-padding-in-tf-nn-max-pool-of-t 35 | conv = tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME') 36 | conv = tf.nn.bias_add(conv, b) # add bias term 37 | # rectified linear unit: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 38 | return tf.nn.relu(conv) 39 | 40 | 41 | def fc(x, n_output, scope="fc", activation_fn=None, initializer=None): 42 | """fully connected layer with relu activation wrapper 43 | Args 44 | x: 2d tensor [batch, n_input] 45 | n_output output size 46 | """ 47 | with tf.variable_scope(scope): 48 | if initializer is None: 49 | # default initialization 50 | W = tf.Variable(tf.random_normal([int(x.get_shape()[1]), n_output])) 51 | b = tf.Variable(tf.random_normal([n_output])) 52 | else: 53 | W = tf.get_variable("W", shape=[int(x.get_shape()[1]), n_output], initializer=initializer) 54 | b = tf.get_variable("b", shape=[n_output], 55 | initializer=tf.constant_initializer(.0, dtype=tf.float32)) 56 | fc1 = tf.add(tf.matmul(x, W), b) 57 | if not activation_fn is None: 58 | fc1 = activation_fn(fc1) 59 | return fc1 60 | 61 | 62 | def flatten(x): 63 | """flatten a 4d tensor into 2d 64 | Args 65 | x: 4d tensor [batch, height, width, channels] 66 | Returns a flattened 2d tensor 67 | """ 68 | return tf.reshape(x, [-1, int(x.get_shape()[1] * x.get_shape()[2] * x.get_shape()[3])]) 69 | 70 | 71 | def update_target_graph(from_scope, to_scope): 72 | from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) 73 | to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) 74 | 75 | op_holder = [] 76 | for from_var, to_var in zip(from_vars, to_vars): 77 | op_holder.append(to_var.assign(from_var)) 78 | return op_holder 79 | 80 | 81 | # Used to initialize weights for policy and value output layers 82 | def normalized_columns_initializer(std=1.0): 83 | def _initializer(shape, dtype=None, partition_info=None): 84 | out = np.random.randn(*shape).astype(np.float32) 85 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 86 | return tf.constant(out) 87 | return _initializer 88 | -------------------------------------------------------------------------------- /A3C/worker.py: -------------------------------------------------------------------------------- 1 | '''Worker class for A3C''' 2 | from collections import namedtuple 3 | import random 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | import ac_net 8 | import tf_utils 9 | 10 | MAX_STEPS = 10000 11 | 12 | 13 | Step = namedtuple('Step', 'cur_step action next_step reward done') 14 | 15 | 16 | class Worker(object): 17 | '''Worker class for A3C''' 18 | 19 | def __init__(self, env, state_size, action_size, 20 | worker_name, global_name, lr, gamma, t_max, sess, 21 | history, n_h1=400, n_h2=300, logdir='logs'): 22 | self.env = env 23 | self.name = worker_name 24 | self.gamma = gamma 25 | self.sess = sess 26 | self.t_max = t_max 27 | self.history = history 28 | 29 | self.local_model = ac_net.AC_Net(state_size, action_size, lr, 30 | worker_name, n_h1=n_h1, n_h2=n_h2, global_name=global_name) 31 | self.copy_to_local_op = tf_utils.update_target_graph(global_name, 
worker_name) 32 | 33 | self.summary_writer = tf.summary.FileWriter("{}/train_{}".format(logdir, worker_name)) 34 | 35 | def _copy_to_local(self): 36 | self.sess.run(self.copy_to_local_op) 37 | 38 | def work(self, n_episodes): 39 | episode_i = 0 40 | episode_len = 1 41 | cur_state = self.env.reset() 42 | count = 1 43 | cum_reward = 0 44 | while episode_i < n_episodes: 45 | # 1) sync from global model to local model 46 | self._copy_to_local() 47 | # 2) collect t_max steps (if terminated then i++) 48 | steps = [] 49 | for _ in xrange(self.t_max): 50 | action = self.local_model.get_action(cur_state, self.sess) 51 | next_state, reward, done, info = self.env.step(action) 52 | cum_reward += reward 53 | episode_len = episode_len + 1 54 | steps.append( 55 | Step( 56 | cur_step=cur_state, 57 | action=action, 58 | next_step=next_state, 59 | reward=reward, 60 | done=done)) 61 | if done or episode_len >= MAX_STEPS: 62 | cur_state = self.env.reset() 63 | self.history.append(episode_len) 64 | summary = tf.Summary() 65 | summary.value.add(tag='Perf/episode_len', simple_value=float(episode_len)) 66 | summary.value.add(tag='Perf/episode_reward', simple_value=float(cum_reward)) 67 | self.summary_writer.add_summary(summary, episode_i) 68 | print 'worker {}: episode {} finished in {} steps, cumulative reward: {}'.format(self.name, episode_i, episode_len, cum_reward) 69 | print action 70 | print self.local_model.predict_policy(cur_state, self.sess) 71 | cum_reward = 0 72 | episode_i = episode_i + 1 73 | episode_len = 0 74 | break 75 | cur_state = next_state 76 | # 3) convert the t_max steps into a batch 77 | if steps[-1].done: 78 | R = 0 79 | else: 80 | R = self.local_model.predict_value(cur_state, self.sess) 81 | R_batch = np.zeros(len(steps)) 82 | advantage_batch = np.zeros(len(steps)) 83 | target_v_batch = np.zeros(len(steps)) 84 | for i in reversed(xrange(len(steps))): 85 | step = steps[i] 86 | R = step.reward + self.gamma * R 87 | R_batch[i] = R 88 | cur_state_batch = [step.cur_step for step in steps] 89 | pred_v_batch = self.local_model.predict_value(cur_state_batch, self.sess) 90 | action_batch = [step.action for step in steps] 91 | advantage_batch = [R_batch[i] - pred_v_batch[i] for i in xrange(len(steps))] 92 | # 4) compute the gradient and update the global model 93 | action_batch = np.reshape(action_batch, [-1]) 94 | advantage_batch = np.reshape(advantage_batch, [-1]) 95 | R_batch = np.reshape(R_batch, [-1]) 96 | feed_dict = { 97 | self.local_model.input_s: cur_state_batch, 98 | self.local_model.input_a: action_batch, 99 | self.local_model.advantage: advantage_batch, 100 | self.local_model.target_v: R_batch, 101 | } 102 | v_l, p_l, e_l, loss, _, _, v_n = self.sess.run( 103 | [self.local_model.value_loss, 104 | self.local_model.policy_loss, 105 | self.local_model.entropy_loss, 106 | self.local_model.loss, 107 | self.local_model.gradients, 108 | self.local_model.apply_gradients, 109 | self.local_model.var_norms], 110 | feed_dict) 111 | 112 | mean_reward = np.mean([step.reward for step in steps]) 113 | mean_value = np.mean(R_batch) 114 | 115 | summary = tf.Summary() 116 | summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward)) 117 | summary.value.add(tag='Perf/Value', simple_value=float(mean_value)) 118 | summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l)) 119 | summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l)) 120 | summary.value.add(tag='Losses/Entropy', simple_value=float(e_l)) 121 | summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n)) 122 
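# One summary point is written per t_max-step update (indexed by `count`); the per-episode summaries above are indexed by episode.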
| self.summary_writer.add_summary(summary, count) 123 | count += 1 124 | -------------------------------------------------------------------------------- /DP/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/DP/__init__.py -------------------------------------------------------------------------------- /DP/policy_iteration.py: -------------------------------------------------------------------------------- 1 | # Policy iteration agent 2 | # Model-based learning which requires mdp. 3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | 10 | import math 11 | 12 | 13 | class PolicyIterationAgent(object): 14 | 15 | def __init__(self, mdp, gamma, iterations=100): 16 | """ 17 | The constructor performs policy iteration on mdp using dynamic programming 18 | --- 19 | args 20 | mdp: markov decision process that is required by value iteration agent 21 | gamma: discount factor 22 | """ 23 | self.mdp = mdp 24 | self.gamma = gamma 25 | states = mdp.get_states() 26 | # init values 27 | self.values = {} 28 | # policy is a map from state to action 29 | self.policy = {} 30 | 31 | for s in states: 32 | if mdp.is_terminal(s): 33 | self.values[s] = mdp.get_reward(s) 34 | else: 35 | self.values[s] = 0 36 | self.policy[s] = 0 37 | 38 | # estimate values 39 | for i in range(iterations): 40 | values_tmp = self.values.copy() 41 | policy_tmp = self.policy.copy() 42 | 43 | for s in states: 44 | # policy iteration 45 | if mdp.is_terminal(s): 46 | continue 47 | 48 | self.values[s] = sum([P_s1_s_a * (self.mdp.get_reward_sas(s, policy_tmp[s], s1) + self.gamma * values_tmp[s1]) 49 | for s1, P_s1_s_a in self.mdp.get_transition_states_and_probs(s, policy_tmp[s])]) 50 | 51 | # policy improvement 52 | actions = mdp.get_actions(s) 53 | v_a = [sum([P_s1_s_a * (self.mdp.get_reward_sas(s, policy_tmp[s], s1) + self.gamma * values_tmp[s1]) 54 | for s1, P_s1_s_a in self.mdp.get_transition_states_and_probs(s, a)]) 55 | for a in actions] 56 | self.policy[s] = actions[v_a.index(max(v_a))] 57 | 58 | def get_values(self): 59 | """ 60 | returns 61 | a dictionary {} 62 | """ 63 | return self.values 64 | 65 | def get_optimal_policy(self): 66 | """ 67 | returns 68 | a dictionary {} 69 | """ 70 | states = self.mdp.get_states() 71 | policy = {} 72 | for s in states: 73 | policy[s] = [(self.get_action(s), 1)] 74 | return policy 75 | 76 | def get_action(self, state): 77 | """ 78 | args 79 | state current state 80 | returns 81 | an action to take given the state 82 | """ 83 | return self.policy[state] 84 | -------------------------------------------------------------------------------- /DP/test_policy_iteration.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | if "../" not in sys.path: 4 | sys.path.append("../") 5 | from envs import gridworld 6 | import policy_iteration 7 | 8 | 9 | class PolicyIterationAgentTest(unittest.TestCase): 10 | """ 11 | Unit test for policy iteration agent 12 | """ 13 | 14 | def setUp(self): 15 | grid = [['0', '0', '0', '1'], 16 | ['0', 'x', '0', '-1'], 17 | ['0', '0', '0', '0']] 18 | 19 | self.grid = grid 20 | self.gw_non_deterministic = gridworld.GridWorld( 21 | grid, {(0, 3), (1, 3)}, 0.8) 22 | 23 | self.agent = policy_iteration.PolicyIterationAgent( 24 | self.gw_non_deterministic, 0.9, 20) 25 | 26 | def test_show_policy(self): 27 | 
print 'Show policy learned by policy iteration:' 28 | self.gw_non_deterministic.display_policy_grid( 29 | self.agent.get_optimal_policy()) 30 | 31 | def test_values(self): 32 | print 'Show policy iteration results:' 33 | self.gw_non_deterministic.display_value_grid(self.agent.values) 34 | 35 | 36 | if __name__ == '__main__': 37 | unittest.main() 38 | -------------------------------------------------------------------------------- /DP/test_value_iteration.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | if "../" not in sys.path: 4 | sys.path.append("../") 5 | from envs import gridworld 6 | import value_iteration 7 | 8 | 9 | class ValueIterationAgentTest(unittest.TestCase): 10 | """ 11 | Unit test for value iteration agent 12 | """ 13 | 14 | def setUp(self): 15 | grid = [['0', '0', '0', '1'], 16 | ['0', 'x', '0', '-1'], 17 | ['0', '0', '0', '0']] 18 | 19 | self.grid = grid 20 | self.gw_non_deterministic = gridworld.GridWorld( 21 | grid, {(0, 3), (1, 3)}, 0.8) 22 | 23 | self.agent = value_iteration.ValueIterationAgent( 24 | self.gw_non_deterministic, 0.9, 100) 25 | 26 | def test_eval_policy(self): 27 | print 'Show evaluation of the optimal policy:' 28 | self.gw_non_deterministic.display_value_grid( 29 | self.agent.eval_policy_dist(self.agent.get_policy_dist())) 30 | 31 | def test_show_policy(self): 32 | print 'Show policy learned by value iteration:' 33 | self.gw_non_deterministic.display_policy_grid( 34 | self.agent.get_optimal_policy()) 35 | 36 | def test_values(self): 37 | print 'Show value iteration results:' 38 | self.gw_non_deterministic.display_value_grid(self.agent.values) 39 | 40 | 41 | if __name__ == '__main__': 42 | unittest.main() 43 | -------------------------------------------------------------------------------- /DP/value_iteration.py: -------------------------------------------------------------------------------- 1 | # Value iteration agent 2 | # Model-based learning which requires mdp.
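# The backup applied in the constructor below is the Bellman optimality update,
#   V(s) <- max_a sum_{s'} P(s'|s,a) [R(s) + gamma * V(s')],
# swept over all non-terminal states for a fixed number of iterations.
#
# Minimal usage sketch (mirrors DP/test_value_iteration.py; the GridWorld
# arguments are illustrative):
#
#   from envs import gridworld
#   gw = gridworld.GridWorld(grid, {(0, 3), (1, 3)}, 0.8)
#   agent = ValueIterationAgent(gw, 0.9, iterations=100)
#   policy = agent.get_optimal_policy()   # {state: [(action, 1)]}
#   values = agent.get_values()           # {state: V(s)}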
3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | 10 | import math 11 | 12 | 13 | class ValueIterationAgent(object): 14 | 15 | def __init__(self, mdp, gamma, iterations=100): 16 | """ 17 | The constructor build a value model from mdp using dynamic programming 18 | --- 19 | args 20 | mdp: markov decision process that is required by value iteration agent 21 | gamma: discount factor 22 | """ 23 | self.mdp = mdp 24 | self.gamma = gamma 25 | states = mdp.get_states() 26 | # init values 27 | self.values = {} 28 | 29 | for s in states: 30 | if mdp.is_terminal(s): 31 | self.values[s] = mdp.get_reward(s) 32 | else: 33 | self.values[s] = 0 34 | 35 | # estimate values 36 | for i in range(iterations): 37 | values_tmp = self.values.copy() 38 | 39 | for s in states: 40 | if mdp.is_terminal(s): 41 | continue 42 | 43 | actions = mdp.get_actions(s) 44 | v_s = [] 45 | for a in actions: 46 | P_s1sa = mdp.get_transition_states_and_probs(s, a) 47 | R_sas1 = [mdp.get_reward(s1) for s1 in [p[0] for p in P_s1sa]] 48 | v_s.append(sum([P_s1sa[s1_id][1] * (mdp.get_reward(s) + gamma * \ 49 | values_tmp[P_s1sa[s1_id][0]]) for s1_id in range(len(P_s1sa))])) 50 | # V(s) = max_{a} \sum_{s'} P(s'| s, a) (R(s,a,s') + \gamma V(s')) 51 | self.values[s] = max(v_s) 52 | 53 | def get_values(self): 54 | """ 55 | returns 56 | a dictionary {} 57 | """ 58 | return self.values 59 | 60 | def get_q_values(self, state, action): 61 | """ 62 | returns qvalue of (state, action) 63 | """ 64 | return sum([P_s1_s_a*(self.mdp.get_reward_sas(s, a, s1) + self.gamma*self.values[s1]) 65 | for s1, P_s1_s_a in self.mdp.get_transition_states_and_probs(state, action)]) 66 | 67 | 68 | def eval_policy_dist(self, policy, iterations=100): 69 | """ 70 | evaluate a policy distribution 71 | returns 72 | a map {} 73 | """ 74 | values = {} 75 | states = self.mdp.get_states() 76 | for s in states: 77 | if self.mdp.is_terminal(s): 78 | values[s] = self.mdp.get_reward(s) 79 | else: 80 | values[s] = 0 81 | 82 | for i in range(iterations): 83 | values_tmp = values.copy() 84 | 85 | for s in states: 86 | if self.mdp.is_terminal(s): 87 | continue 88 | actions = self.mdp.get_actions(s) 89 | # v(s) = \sum_{a\in A} \pi(a|s) (R(s,a,s') + \gamma \sum_{s'\in S} 90 | # P(s'| s, a) v(s')) 91 | values[s] = sum([policy[s][i][1] * (self.mdp.get_reward(s) + self.gamma * sum([s1_p * values_tmp[s1] 92 | for s1, s1_p in self.mdp.get_transition_states_and_probs(s, actions[i])])) 93 | for i in range(len(actions))]) 94 | return values 95 | 96 | def get_optimal_policy(self): 97 | """ 98 | returns 99 | a dictionary {} 100 | """ 101 | states = self.mdp.get_states() 102 | policy = {} 103 | for s in states: 104 | policy[s] = [(self.get_action(s), 1)] 105 | return policy 106 | 107 | def get_policy_dist(self): 108 | """ 109 | returns 110 | a dictionary {} 111 | """ 112 | states = self.mdp.get_states() 113 | policy = {} 114 | for s in states: 115 | policy[s] = self.get_action_dist(s) 116 | return policy 117 | 118 | def get_action_dist(self, state): 119 | """ 120 | args 121 | state current state 122 | returns 123 | a list of {} pairs representing the action distribution on state 124 | """ 125 | actions = self.mdp.get_actions(state) 126 | # \sum_{s'} P(s'|s,a)*(R(s,a,s') + \gamma v(s')) 127 | v_a = [sum([s1_p * (self.mdp.get_reward_sas(state, a, s1) + self.gamma * self.values[s1]) 128 | for s1, s1_p in self.mdp.get_transition_states_and_probs(state, a)]) 129 | for a in actions] 130 | 131 | # I exponentiated the v_s^a's to 
make them positive 132 | v_a = [math.exp(v) for v in v_a] 133 | return [(actions[i], v_a[i] / sum(v_a)) for i in range(len(actions))] 134 | 135 | def get_action(self, state): 136 | """ 137 | args 138 | state current state 139 | returns 140 | an action to take given the state 141 | """ 142 | actions = self.mdp.get_actions(state) 143 | v_s = [] 144 | for a in actions: 145 | P_s1sa = self.mdp.get_transition_states_and_probs(state, a) 146 | R_sas1 = [self.mdp.get_reward(s1) for s1 in [p[0] for p in P_s1sa]] 147 | v_s.append(sum([P_s1sa[s1_id][1] * 148 | (self.mdp.get_reward(state) + 149 | self.gamma * 150 | self.values[P_s1sa[s1_id][0]]) for s1_id in range(len(P_s1sa))])) 151 | a_id = v_s.index(max(v_s)) 152 | return actions[a_id] 153 | -------------------------------------------------------------------------------- /DQN/README.md: -------------------------------------------------------------------------------- 1 | ## Deep Q-Learning 2 | 3 | Well-tuned DQN for low dimensional control tasks. 4 | 5 | #### Run Code 6 | 7 | `$ python cartpole_dqn.py --device=cpu --episodes=150 --model_dir=cartpole-model` to run code. 8 | 9 | `$ python cartpole_dqn.py -h` for help messages. 10 | 11 | #### Cartpole-v0 Result 12 | 13 | ![cartpole training](imgs/dqn_cartpole_training.png "cartpole training") 14 | -------------------------------------------------------------------------------- /DQN/cartpole_dqn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import numpy as np 4 | import sys 5 | import tensorflow as tf 6 | import dqn 7 | import exp_replay 8 | from exp_replay import Step 9 | import matplotlib.pyplot as plt 10 | import os 11 | import pickle 12 | 13 | 14 | PARSER = argparse.ArgumentParser(description=None) 15 | PARSER.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 16 | PARSER.add_argument('-e', '--episodes', default=150, type=int, help='number of episodes') 17 | PARSER.add_argument('-m', '--model_dir', default='cartpole-model/', type=str, help='model directory') 18 | PARSER.add_argument('-t', '--train', default=False, type=str, help='train for [number of episodes] IF MODEL EXISTS') 19 | ARGS = PARSER.parse_args() 20 | print ARGS 21 | 22 | 23 | DEVICE = ARGS.device 24 | NUM_EPISODES = ARGS.episodes 25 | ACTIONS = {0:0, 1:1} 26 | MAX_STEPS = 300 27 | FAIL_PENALTY = 0 28 | EPSILON = 1 29 | EPSILON_DECAY = 0.01 30 | END_EPSILON = 0.1 31 | LEARNING_RATE = 0.001 32 | DISCOUNT_FACTOR = 0.9 33 | BATCH_SIZE = 32 34 | MEM_SIZE = 1e4 35 | START_MEM = 1e2 36 | STATE_SIZE = [4] 37 | EPOCH_SIZE = 100 38 | 39 | TRAIN = ARGS.train 40 | 41 | MODEL_DIR = ARGS.model_dir 42 | MODEL_PATH = MODEL_DIR + 'model' 43 | MEMORY_PATH = MODEL_DIR + 'memory.p' 44 | 45 | 46 | def train(agent, exprep, env): 47 | for i in xrange(NUM_EPISODES): 48 | cur_state = env.reset() 49 | for t in xrange(MAX_STEPS): 50 | action = agent.get_action(cur_state) 51 | next_state, reward, done, info = env.step(action) 52 | if done: 53 | reward = FAIL_PENALTY 54 | exprep.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 55 | print("Episode {} finished after {} timesteps".format(i, t + 1)) 56 | yield t + 1 57 | break 58 | exprep.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 59 | cur_state = next_state 60 | if t == MAX_STEPS - 1: 61 | print("Episode {} finished after {} timesteps".format(i, t + 1)) 62 | yield t + 1 63 | agent.epsilon_decay() 64 | 
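    # One training epoch per episode: EPOCH_SIZE minibatch updates, each on a batch
    # sampled uniformly at random from the replay memory (epsilon was annealed above).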
agent.learn_epoch(exprep, EPOCH_SIZE) 65 | print 'epsilon: {}'.format(agent.epsilon) 66 | 67 | 68 | env = gym.make('CartPole-v0') 69 | exprep = exp_replay.ExpReplay(mem_size=MEM_SIZE, start_mem=START_MEM, state_size=STATE_SIZE, kth=-1, batch_size=BATCH_SIZE) 70 | 71 | 72 | 73 | sess = tf.Session() 74 | with tf.device('/{}:0'.format(DEVICE)): 75 | agent = dqn.DQNAgent(session=sess, epsilon=EPSILON, epsilon_anneal=EPSILON_DECAY, end_epsilon=END_EPSILON, 76 | lr=LEARNING_RATE, gamma=DISCOUNT_FACTOR, state_size=4, 77 | action_size=len(ACTIONS), n_hidden_1=10, n_hidden_2=10) 78 | 79 | sess.run(tf.initialize_all_variables()) 80 | saver = tf.train.Saver() 81 | if os.path.isdir(MODEL_DIR): 82 | saver.restore(sess, MODEL_PATH) 83 | agent.epsilon = agent.end_epsilon 84 | print 'restored model' 85 | if TRAIN: 86 | exprep = pickle.load(open(MEMORY_PATH,"rb")) 87 | history = [e_length for e_length in train(agent, exprep, env)] 88 | saver.save(sess, MODEL_PATH) 89 | pickle.dump(exprep, open(MEMORY_PATH, "wb")) 90 | print 'saved model' 91 | # plot 92 | import matplotlib.pyplot as plt 93 | avg_reward = [np.mean(history[i*10:(i+1)*10]) for i in xrange(int(len(history)/10))] 94 | f_reward = plt.figure(1) 95 | plt.plot(np.linspace(0, len(history), len(avg_reward)), avg_reward) 96 | plt.ylabel('Episode length') 97 | plt.xlabel('Training episodes') 98 | f_reward.show() 99 | print 'press enter to continue' 100 | raw_input() 101 | plt.close() 102 | 103 | else: 104 | os.makedirs(MODEL_DIR) 105 | history = [e_length for e_length in train(agent, exprep, env)] 106 | saver.save(sess, MODEL_PATH) 107 | pickle.dump(exprep, open(MEMORY_PATH, "wb")) 108 | print 'saved model' 109 | # plot 110 | import matplotlib.pyplot as plt 111 | avg_reward = [np.mean(history[i*10:(i+1)*10]) for i in xrange(int(len(history)/10))] 112 | f_reward = plt.figure(1) 113 | plt.plot(np.linspace(0, len(history), len(avg_reward)), avg_reward) 114 | plt.ylabel('Episode length') 115 | plt.xlabel('Training episodes') 116 | f_reward.show() 117 | print 'press enter to continue' 118 | raw_input() 119 | plt.close() 120 | 121 | 122 | # Display: 123 | print 'press ctrl-c to stop' 124 | while True: 125 | cur_state = env.reset() 126 | done = False 127 | t = 0 128 | while not done: 129 | env.render() 130 | t = t+1 131 | action = agent.get_optimal_action(cur_state) 132 | next_state, reward, done, info = env.step(action) 133 | cur_state = next_state 134 | if done: 135 | print("Episode finished after {} timesteps".format(t+1)) 136 | break 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /DQN/dqn.py: -------------------------------------------------------------------------------- 1 | # Deep Q-learning agent with q-value approximation 2 | # Following paper: Playing Atari with Deep Reinforcement Learning 3 | # https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | 12 | import gym 13 | import numpy as np 14 | import random 15 | import tensorflow as tf 16 | import tf_utils 17 | 18 | 19 | class DQNAgent(): 20 | """ 21 | DQN Agent with a 2-hidden-layer fully-connected q-network that acts epsilon-greedily. 
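  The q-network is trained in learn_batch() to minimize the squared TD error
  against the target
      y = r + gamma * max_a' Q(s', a')      (y = r when the transition is terminal).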
22 | """ 23 | 24 | def __init__(self, 25 | session, 26 | epsilon=0.5, 27 | epsilon_anneal = 0.01, 28 | end_epsilon=0.1, 29 | lr=0.5, 30 | gamma=0.99, 31 | state_size=4, 32 | action_size=2, 33 | scope="dqn", 34 | n_hidden_1=20, 35 | n_hidden_2=20, 36 | ): 37 | """ 38 | args 39 | epsilon exploration rate 40 | epsilon_anneal linear decay rate per call of epsilon_decay() function 41 | end_epsilon lowest exploration rate 42 | lr learning rate 43 | gamma discount factor 44 | state_size network input size 45 | action_size network output size 46 | """ 47 | self.epsilon = epsilon 48 | self.epsilon_anneal = epsilon_anneal 49 | self.end_epsilon = end_epsilon 50 | self.lr = lr 51 | self.gamma = gamma 52 | self.state_size = state_size 53 | self.action_size = action_size 54 | self.scope = scope 55 | self.n_hidden_1 = n_hidden_1 56 | self.n_hidden_2 = n_hidden_2 57 | self._build_qnet() 58 | self.sess = session 59 | 60 | def _build_qnet(self): 61 | """ 62 | Build q-network 63 | """ 64 | with tf.variable_scope(self.scope): 65 | self.state_input = tf.placeholder(tf.float32, [None, self.state_size]) 66 | self.action = tf.placeholder(tf.int32, [None]) 67 | self.target_q = tf.placeholder(tf.float32, [None]) 68 | 69 | fc1 = tf_utils.fc(self.state_input, n_output=self.n_hidden_1, activation_fn=tf.nn.relu) 70 | fc2 = tf_utils.fc(fc1, n_output=self.n_hidden_2, activation_fn=tf.nn.relu) 71 | self.q_values = tf_utils.fc(fc2, self.action_size, activation_fn=None) 72 | 73 | action_mask = tf.one_hot(self.action, self.action_size, 1.0, 0.0) 74 | q_value_pred = tf.reduce_sum(self.q_values * action_mask, 1) 75 | 76 | self.loss = tf.reduce_mean(tf.square(tf.subtract(self.target_q, q_value_pred))) 77 | self.optimizer = tf.train.AdamOptimizer(self.lr) 78 | self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) 79 | 80 | def get_action_values(self, state): 81 | actions = self.sess.run(self.q_values, feed_dict={self.state_input: [state]}) 82 | return actions 83 | 84 | def get_optimal_action(self, state): 85 | actions = self.sess.run(self.q_values, feed_dict={self.state_input: [state]}) 86 | return actions.argmax() 87 | 88 | def get_action(self, state): 89 | """ 90 | Epsilon-greedy action 91 | 92 | args 93 | state current state 94 | returns 95 | an action to take given the state 96 | """ 97 | if np.random.random() < self.epsilon: 98 | # act randomly 99 | return np.random.randint(0, self.action_size) 100 | else: 101 | return self.get_optimal_action(state) 102 | 103 | def epsilon_decay(self): 104 | if self.epsilon > self.end_epsilon: 105 | self.epsilon = self.epsilon - self.epsilon_anneal 106 | 107 | def learn_epoch(self, exprep, num_steps): 108 | """ 109 | Deep Q-learing: train qnetwork for num_steps, for each step, sample a batch from exprep 110 | 111 | Args 112 | exprep: experience replay 113 | num_steps: num of steps 114 | """ 115 | for i in xrange(num_steps): 116 | self.learn_batch(exprep.sample()) 117 | 118 | def learn_batch(self, batch_steps): 119 | """ 120 | Deep Q-learing: train qnetwork with the input batch 121 | Args 122 | batch_steps: a batch of sampled namedtuple Step, where Step.cur_step and 123 | Step.next_step are of shape {self.state_size} 124 | sess: tf session 125 | Returns 126 | batch loss (-1 if input is empty) 127 | """ 128 | if len(batch_steps) == 0: 129 | return -1 130 | 131 | next_state_batch = [s.next_step for s in batch_steps] 132 | q_values = self.sess.run(self.q_values, feed_dict={self.state_input: next_state_batch}) 133 | 134 | max_q_values = 
q_values.max(axis=1) 135 | # compute target q value 136 | target_q = np.array([s.reward + self.gamma*max_q_values[i]*(1-s.done) for i,s in enumerate(batch_steps)]) 137 | target_q = target_q.reshape([len(batch_steps)]) 138 | 139 | # minimize the TD-error 140 | cur_state_batch = [s.cur_step for s in batch_steps] 141 | actions = [s.action for s in batch_steps] 142 | l, _, = self.sess.run([self.loss, self.train_op], feed_dict={ self.state_input: cur_state_batch, 143 | self.target_q: target_q, 144 | self.action: actions }) 145 | return l 146 | 147 | -------------------------------------------------------------------------------- /DQN/exp_replay.py: -------------------------------------------------------------------------------- 1 | # Experience Replay 2 | # Following paper: Playing Atari with Deep Reinforcement Learning 3 | # https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | 12 | import numpy as np 13 | import random 14 | from collections import namedtuple 15 | 16 | 17 | Step = namedtuple('Step','cur_step action next_step reward done') 18 | 19 | 20 | class ExpReplay(): 21 | """Experience replay""" 22 | 23 | 24 | def __init__(self, mem_size, start_mem=None, state_size=[84, 84], kth=4, drop_rate=0.2, batch_size=32): 25 | # k = -1 for sending raw state 26 | self.state_size = state_size 27 | self.drop_rate = drop_rate 28 | self.mem_size = mem_size 29 | self.start_mem = start_mem 30 | if start_mem == None: 31 | self.start_mem = mem_size/20 32 | self.kth = kth 33 | self.batch_size = batch_size 34 | self.mem = [] 35 | self.total_steps = 0 36 | 37 | 38 | def add_step(self, step): 39 | """ 40 | Store episode to memory and check if it reaches the mem_size. 
41 | If so, drop [self.drop_rate] of the oldest memory 42 | 43 | args 44 | step namedtuple Step, where step.cur_step and step.next_step are of size {state_size} 45 | """ 46 | self.mem.append(step) 47 | self.total_steps = self.total_steps + 1 48 | while len(self.mem) > self.mem_size: 49 | self.mem = self.mem[int(len(self.mem)*self.drop_rate):] 50 | 51 | 52 | def get_last_state(self): 53 | if len(self.mem) > abs(self.kth): 54 | if self.kth == -1: 55 | return self.mem[-1].cur_step 56 | if len(self.state_size) == 1: 57 | return [s.cur_step for s in self.mem[-abs(self.kth):]] 58 | last_state = np.stack([s.cur_step for s in self.mem[-abs(self.kth):]], axis=len(self.state_size)) 59 | return np.stack([s.cur_step for s in self.mem[-abs(self.kth):]], axis=len(self.state_size)) 60 | return [] 61 | 62 | 63 | def sample(self, num=None): 64 | """Randomly draw [num] samples""" 65 | if num == None: 66 | num = self.batch_size 67 | if len(self.mem) < self.start_mem: 68 | return [] 69 | sampled_idx = random.sample(range(abs(self.kth),len(self.mem)), num) 70 | samples = [] 71 | for idx in sampled_idx: 72 | steps = self.mem[idx-abs(self.kth):idx] 73 | cur_state = np.stack([s.cur_step for s in steps], axis=len(self.state_size)) 74 | next_state = np.stack([s.next_step for s in steps], axis=len(self.state_size)) 75 | # handle special cases 76 | if self.kth == -1: 77 | cur_state = steps[0].cur_step 78 | next_state = steps[0].next_step 79 | elif len(self.state_size) == 1: 80 | cur_state = [steps[0].cur_step] 81 | next_state = [steps[0].next_step] 82 | reward = steps[-1].reward 83 | action = steps[-1].action 84 | done = steps[-1].done 85 | samples.append(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 86 | return samples 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /DQN/imgs/dqn_cartpole_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/DQN/imgs/dqn_cartpole_training.png -------------------------------------------------------------------------------- /DQN/test_exp_replay.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import exp_replay 3 | from exp_replay import Step 4 | import numpy as np 5 | 6 | 7 | class ExpReplayTest(unittest.TestCase): 8 | """ 9 | Unit test for ExpReplay class 10 | """ 11 | 12 | 13 | def test1(self): 14 | exprep = exp_replay.ExpReplay(mem_size=100, state_size=[1], kth=1) 15 | for i in xrange(120): 16 | exprep.add_step(Step(cur_step=i, action=0, next_step=i+1, reward=0, done=False)) 17 | self.assertEqual(len(exprep.mem), 100) 18 | self.assertEqual(exprep.mem[-1:][0].cur_step, 119) 19 | 20 | 21 | def test2(self): 22 | exprep = exp_replay.ExpReplay(mem_size=100, state_size=[1], kth=4) 23 | for i in xrange(120): 24 | exprep.add_step(Step(cur_step=i, action=0, next_step=i+1, reward=0, done=False)) 25 | self.assertEqual(len(exprep.mem), 100) 26 | self.assertEqual(exprep.mem[-1:][0].cur_step, 119) 27 | self.assertEqual(exprep.get_last_state(), [116,117,118,119]) 28 | 29 | 30 | def test3(self): 31 | exprep = exp_replay.ExpReplay(mem_size=100, state_size=[2,2], kth=4) 32 | for i in xrange(120): 33 | exprep.add_step(Step(cur_step=[[i,i],[i,i]], action=0, next_step=[[i+1,i+1],[i+1,i+1]], reward=0, done=False)) 34 | self.assertEqual(len(exprep.mem), 100) 35 | 
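    # With kth=4 and 2x2 states, get_last_state()/sample() stack the 4 most recent
    # frames along a new trailing axis, so returned states have shape (2, 2, 4).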
self.assertEqual(exprep.mem[-1:][0].cur_step, [[119,119],[119,119]]) 36 | last_state = exprep.get_last_state() 37 | 38 | self.assertEqual(np.shape(last_state),(2,2,4)) 39 | self.assertTrue(np.array_equal(last_state[:,:,0], [[116,116],[116,116]])) 40 | self.assertTrue(np.array_equal(last_state[:,:,1], [[117,117],[117,117]])) 41 | self.assertTrue(np.array_equal(last_state[:,:,2], [[118,118],[118,118]])) 42 | self.assertTrue(np.array_equal(last_state[:,:,3], [[119,119],[119,119]])) 43 | 44 | sample = exprep.sample(5) 45 | self.assertEqual(len(sample), 5) 46 | self.assertEqual(np.shape(sample[0].cur_step), (2,2,4)) 47 | self.assertEqual(np.shape(sample[0].next_step), (2,2,4)) 48 | 49 | 50 | def test4(self): 51 | # -1 for sending raw state 52 | exprep = exp_replay.ExpReplay(mem_size=100, state_size=[4], kth=-1) 53 | for i in xrange(120): 54 | exprep.add_step(Step(cur_step=[i,i,i,i], action=0, next_step=[i+1,i+1,i+1,i+1], reward=0, done=False)) 55 | last_state = exprep.get_last_state() 56 | self.assertEqual(np.shape(last_state),(4,)) 57 | self.assertTrue(np.array_equal(last_state, [119,119,119,119])) 58 | 59 | sample = exprep.sample(5) 60 | self.assertEqual(len(sample), 5) 61 | self.assertEqual(np.shape(sample[0].cur_step), (4,)) 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /DQN/tf_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for tensorflow""" 2 | import tensorflow as tf 3 | 4 | 5 | def max_pool(x, k_sz=[2,2]): 6 | """max pooling layer wrapper 7 | Args 8 | x: 4d tensor [batch, height, width, channels] 9 | k_sz: The size of the window for each dimension of the input tensor 10 | Returns 11 | a max pooling layer 12 | """ 13 | return tf.nn.max_pool(x, ksize=[1, k_sz[0], k_sz[1], 1], strides=[1, k_sz[0], k_sz[1], 1], padding='SAME') 14 | 15 | def conv2d(x, n_kernel, k_sz, stride=1): 16 | """convolutional layer with relu activation wrapper 17 | Args: 18 | x: 4d tensor [batch, height, width, channels] 19 | n_kernel: number of kernels (output size) 20 | k_sz: 2d array, kernel size. e.g. 
[8,8] 21 | stride: stride 22 | Returns 23 | a conv2d layer 24 | """ 25 | W = tf.Variable(tf.random_normal([k_sz[0], k_sz[1], int(x.get_shape()[3]), n_kernel])) 26 | b = tf.Variable(tf.random_normal([n_kernel])) 27 | # - strides[0] and strides[1] must be 1 28 | # - padding can be 'VALID'(without padding) or 'SAME'(zero padding) 29 | # - http://stackoverflow.com/questions/37674306/what-is-the-difference-between-same-and-valid-padding-in-tf-nn-max-pool-of-t 30 | conv = tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME') 31 | conv = tf.nn.bias_add(conv, b) # add bias term 32 | return tf.nn.relu(conv) # rectified linear unit: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 33 | 34 | 35 | def fc(x, n_output, activation_fn=None): 36 | """fully connected layer with relu activation wrapper 37 | Args 38 | x: 2d tensor [batch, n_input] 39 | n_output output size 40 | """ 41 | W=tf.Variable(tf.random_normal([int(x.get_shape()[1]), n_output])) 42 | b=tf.Variable(tf.random_normal([n_output])) 43 | fc1 = tf.add(tf.matmul(x, W), b) 44 | if not activation_fn == None: 45 | fc1 = activation_fn(fc1) 46 | return fc1 47 | 48 | 49 | def flatten(x): 50 | """flatten a 4d tensor into 2d 51 | Args 52 | x: 4d tensor [batch, height, width, channels] 53 | Returns a flattened 2d tensor 54 | """ 55 | return tf.reshape(x, [-1, int(x.get_shape()[1]*x.get_shape()[2]*x.get_shape()[3])]) 56 | 57 | 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Yiren Lu 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Implementation of Reinforcement Learning Algorithms in Python 2 | 3 | Implementation of selected reinforcement learning algorithms with tensorflow. 
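
For example, the DQN agent can be trained on CartPole by running the following from the `DQN/` directory (each algorithm's subdirectory README lists the corresponding command and result plots):

`$ python cartpole_dqn.py --device=cpu --episodes=150 --model_dir=cartpole-model`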
4 | 17 | 18 | ### Implemented Algorithms 19 | 20 | (Click into the links for more details) 21 | 22 | ##### Advanced 23 | 24 | - [Asynchronized Advantage Actor-Critic (A3C)](A3C/) 25 | - [Deep Deterministic Policy Gradient (DDPG)](ddpg/) 26 | 27 | ##### Policy Gradient Methods 28 | 29 | - [REINFORCE with policy function approximation](policy_gradient/) 30 | - [REINFORCE with baseline](policy_gradient/reinforce_w_baseline.py) 31 | 32 | ##### Temporal Difference Learning 33 | 34 | - [Standard epsilon greedy Q-learning](TD/qlearning.py) 35 | - [Deep Q-learning](DQN/) 36 | 37 | ##### Monte Carlo Methods 38 | 39 | - [Monte Carlo (MC) estimation of action values](monte_carlo/monte_carlo.py) 40 | 41 | ##### Dynamic Programming MDP Solver 42 | 43 | - [Value iteration](DP/value_iteration.py) 44 | - [Policy iteration - policy evaluation & policy improvement](DP/policy_iteration.py) 45 | 46 | ### Environments 47 | 48 | - `envs/gridworld.py`: minimium gridworld implementation for testings 49 | 50 | ### Dependencies 51 | 52 | - Python 2.7 53 | - Numpy 54 | - Tensorflow 0.12.1 55 | - OpenAI Gym (with Atari) 0.8.0 56 | - matplotlib (optional) 57 | 58 | ### Tests 59 | 60 | - Files: `test_*.py` 61 | - Run unit test for [class]: 62 | 63 | `python test_[class].py` 64 | 69 | ### MIT License 70 | 71 | -------------------------------------------------------------------------------- /TD/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/TD/__init__.py -------------------------------------------------------------------------------- /TD/cartpole_qlearning.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import qlearning 4 | import numpy 5 | import matplotlib.pyplot as plt 6 | 7 | NUM_EPISODES = 2000 8 | N_BINS = [8, 8, 8, 8] 9 | MAX_STEPS = 200 10 | FAIL_PENALTY = -100 11 | EPSILON = 0.5 12 | EPSILON_DECAY = 0.99 13 | LEARNING_RATE = 0.05 14 | DISCOUNT_FACTOR = 0.9 15 | 16 | RECORD = False 17 | 18 | MIN_VALUES = [-0.5, -2.0, -0.5, -3.0] 19 | MAX_VALUES = [0.5, 2.0, 0.5, 3.0] 20 | BINS = [numpy.linspace(MIN_VALUES[i], MAX_VALUES[i], N_BINS[i]) 21 | for i in xrange(4)] 22 | 23 | 24 | def discretize(obs): 25 | return tuple([int(numpy.digitize(obs[i], BINS[i])) for i in xrange(4)]) 26 | 27 | 28 | def train(agent, env, history, num_episodes=NUM_EPISODES): 29 | for i in xrange(NUM_EPISODES): 30 | if i % 100: 31 | print "Episode {}".format(i + 1) 32 | obs = env.reset() 33 | cur_state = discretize(obs) 34 | 35 | for t in xrange(MAX_STEPS): 36 | action = agent.get_action(cur_state) 37 | observation, reward, done, info = env.step(action) 38 | next_state = discretize(observation) 39 | if done: 40 | reward = FAIL_PENALTY 41 | agent.learn(cur_state, action, next_state, reward, done) 42 | print("Episode finished after {} timesteps".format(t + 1)) 43 | history.append(t + 1) 44 | break 45 | agent.learn(cur_state, action, next_state, reward, done) 46 | cur_state = next_state 47 | if t == MAX_STEPS - 1: 48 | history.append(t + 1) 49 | print("Episode finished after {} timesteps".format(t + 1)) 50 | return agent, history 51 | 52 | 53 | env = gym.make('CartPole-v0') 54 | if RECORD: 55 | env = wrappers.Monitor(env, '/tmp/cartpole-experiment-1', force=True) 56 | def get_actions(state): 57 | return [0, 1] 58 | 59 | 60 | agent = qlearning.QLearningAgent(get_actions, 61 | epsilon=EPSILON, 62 | alpha=LEARNING_RATE, 
63 | gamma=DISCOUNT_FACTOR, 64 | epsilon_decay=EPSILON_DECAY) 65 | 66 | history = [] 67 | 68 | agent, history = train(agent, env, history) 69 | 70 | if RECORD: 71 | env.monitor.close() 72 | 73 | avg_reward = [numpy.mean(history[i*100:(i+1)*100]) for i in xrange(int(len(history)/100))] 74 | f_reward = plt.figure(1) 75 | plt.plot(numpy.linspace(0, len(history), len(avg_reward)), avg_reward) 76 | plt.ylabel('Rewards') 77 | f_reward.show() 78 | print 'press enter to continue' 79 | raw_input() 80 | plt.close() 81 | 82 | 83 | # Display: 84 | print 'press ctrl-c to stop' 85 | while True: 86 | obs = env.reset() 87 | cur_state = discretize(obs) 88 | done = False 89 | 90 | t = 0 91 | while not done: 92 | env.render() 93 | t = t+1 94 | action = agent.get_action(cur_state) 95 | observation, reward, done, info = env.step(action) 96 | next_state = discretize(observation) 97 | if done: 98 | reward = FAIL_PENALTY 99 | agent.learn(cur_state, action, next_state, reward, done) 100 | print("Episode finished after {} timesteps".format(t+1)) 101 | history.append(t+1) 102 | break 103 | agent.learn(cur_state, action, next_state, reward, done) 104 | cur_state = next_state 105 | -------------------------------------------------------------------------------- /TD/qlearning.py: -------------------------------------------------------------------------------- 1 | # Q-learning Agent 2 | # Model-free Temporal Difference learning 3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | 10 | import numpy 11 | 12 | class QLearningAgent(object): 13 | 14 | 15 | def __init__(self, legal_actions_fn, epsilon=0.5, alpha=0.5, gamma=0.9, epsilon_decay=1): 16 | """ 17 | args 18 | legal_actions_fn takes a state and returns a list of legal actions 19 | alpha learning rate 20 | epsilon exploration rate 21 | gamma discount factor 22 | """ 23 | self.epsilon = epsilon 24 | self.alpha = alpha 25 | self.gamma = gamma 26 | self.epsilon_decay=epsilon_decay 27 | self.legal_actions_fn = legal_actions_fn 28 | 29 | # map: {(state, action): q-value} 30 | self.q_values = {} 31 | # map: {state: action} 32 | self.policy = {} 33 | 34 | 35 | def get_value(self, s): 36 | a = self.get_optimal_action(s) 37 | return self.get_qvalue(s, a) 38 | 39 | 40 | def get_qvalue(self, s, a): 41 | if (s,a) in self.q_values: 42 | return self.q_values[(s,a)] 43 | else: 44 | # set to 0 45 | self.q_values[(s,a)] = 0 46 | return 0 47 | 48 | def _set_qvalue(self, s, a, v): 49 | self.q_values[(s,a)] = v 50 | 51 | 52 | def get_optimal_action(self, state): 53 | legal_actions = self.legal_actions_fn(state) 54 | assert len(legal_actions) > 0, "no legal actions" 55 | if state in self.policy: 56 | return self.policy[state] 57 | else: 58 | # randomly select an action as default and return 59 | self.policy[state] = legal_actions[numpy.random.randint(0, len(legal_actions))] 60 | return self.policy[state] 61 | 62 | def get_action(self, state): 63 | """ 64 | Epsilon-greedy action 65 | args 66 | state current state 67 | returns 68 | an action to take given the state 69 | """ 70 | legal_actions = self.legal_actions_fn(state) 71 | 72 | assert len(legal_actions) > 0, "no legal actions on state {}".format(state) 73 | 74 | if numpy.random.random() < self.epsilon: 75 | # act randomly 76 | return legal_actions[numpy.random.randint(0, len(legal_actions))] 77 | else: 78 | if state in self.policy: 79 | return self.policy[state] 80 | else: 81 | # set the first action in the list to default and return 82 | self.policy[state] = 
legal_actions[0] 83 | return legal_actions[0] 84 | 85 | 86 | def learn(self, s, a, s1, r, is_done): 87 | """ 88 | Updates self.q_values[(s,a)] and self.policy[s] 89 | args 90 | s current state 91 | a action taken 92 | s1 next state 93 | r reward 94 | is_done True if the episode concludes 95 | """ 96 | # update q value 97 | if is_done: 98 | sample = r 99 | else: 100 | sample = r + self.gamma*max([self.get_qvalue(s1,a1) for a1 in self.legal_actions_fn(s1)]) 101 | 102 | q_s_a = self.get_qvalue(s,a) 103 | q_s_a = q_s_a + self.alpha*(sample - q_s_a) 104 | self._set_qvalue(s,a,q_s_a) 105 | 106 | # policy improvement 107 | legal_actions = self.legal_actions_fn(s) 108 | s_q_values = [self.get_qvalue(s,a) for a in legal_actions] 109 | self.policy[s] = legal_actions[s_q_values.index(max(s_q_values))] 110 | 111 | self.epsilon = self.epsilon*self.epsilon_decay -------------------------------------------------------------------------------- /TD/test_qlearning.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | if "../" not in sys.path: 4 | sys.path.append("../") 5 | from envs import gridworld 6 | import qlearning 7 | 8 | 9 | class QLearningAgentTest(unittest.TestCase): 10 | """ 11 | Unit test for q-learning agent 12 | """ 13 | 14 | def test2(self): 15 | print 'Test 1 -- Bridge Crossing Analysis' 16 | grid = [['x', '-100', '-100', '-100', 'x'], 17 | ['1', '0', '0', '0', '10'], 18 | ['x', '-100', '-100', '-100', 'x']] 19 | 20 | gw = gridworld.GridWorld( 21 | grid, {(1,0), (1,4), 22 | (0,1), (0,2), (0,3), 23 | (2,1), (2,2), (2,3)}, 0.9) 24 | 25 | agent = qlearning.QLearningAgent(gw.get_actions, 26 | epsilon=0.1, alpha=0.5, gamma=0.9) 27 | 28 | # Training 29 | episodes = 5000 30 | for i in range(episodes): 31 | gw.reset((1,1)) 32 | cur_s = gw.get_current_state() 33 | is_done = False 34 | while not is_done: 35 | a = agent.get_action(cur_s) 36 | last_state, action, next_state, reward, is_done = gw.step(a) 37 | agent.learn(last_state, action, next_state, reward, is_done) 38 | cur_s = next_state 39 | # show optimal policy 40 | opt_policy = gw.get_optimal_policy(agent) 41 | gw.display_policy_grid(opt_policy) 42 | gw.display_value_grid(gw.get_values(agent)) 43 | gw.display_qvalue_grid(gw.get_qvalues(agent)) 44 | 45 | def test1(self): 46 | print 'Test 1 -- Regular Case' 47 | grid = [['0', '0', '0', '1'], 48 | ['0', 'x', '0', '-1'], 49 | ['0', '0', '0', '0']] 50 | 51 | gw = gridworld.GridWorld( 52 | grid, {(0, 3), (1, 3)}, 0.8) 53 | 54 | agent = qlearning.QLearningAgent(gw.get_actions, 55 | epsilon=0.2, alpha=0.5, gamma=0.9) 56 | 57 | # Training 58 | episodes = 5000 59 | for i in range(episodes): 60 | gw.reset((2,0)) 61 | cur_s = gw.get_current_state() 62 | is_done = False 63 | while not is_done: 64 | a = agent.get_action(cur_s) 65 | last_state, action, next_state, reward, is_done = gw.step(a) 66 | agent.learn(last_state, action, next_state, reward, is_done) 67 | cur_s = next_state 68 | 69 | # show optimal policy 70 | opt_policy = gw.get_optimal_policy(agent) 71 | gw.display_policy_grid(opt_policy) 72 | gw.display_value_grid(gw.get_values(agent)) 73 | gw.display_qvalue_grid(gw.get_qvalues(agent)) 74 | 75 | if __name__ == '__main__': 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /ddpg/README.md: -------------------------------------------------------------------------------- 1 | ## Deep Deterministic Policy Gradient 2 | 3 | Following paper: Continuous control with deep reinforcement 
learning [(https://arxiv.org/abs/1509.02971)](https://arxiv.org/abs/1509.02971) 4 | 5 | Tested on pendulum-v0: [openai submission page](https://gym.openai.com/evaluations/eval_9kvdhHSCTMqU8mYTaPWFrQ) 6 | 7 | #### Run code 8 | 9 | `$ python pendulum_ddpg.py --device=cpu --episodes=300` 10 | 11 | #### Pendulum-v0 result 12 | 13 | ![ddpg training](imgs/ddpg_plot.png "ddpg training") 14 | -------------------------------------------------------------------------------- /ddpg/actor.py: -------------------------------------------------------------------------------- 1 | # Deep Deterministic Policy Gradient 2 | # following paper: Continuous control with deep reinforcement learning 3 | # (https://arxiv.org/pdf/1509.02971.pdf) 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | import tensorflow as tf 12 | import tf_utils 13 | 14 | 15 | 16 | class ActorNetwork(object): 17 | 18 | 19 | def __init__(self, state_size, action_size, lr, n_h1=400, n_h2=300, tau=0.001): 20 | self.state_size = state_size 21 | self.action_size = action_size 22 | self.optimizer = tf.train.AdamOptimizer(lr) 23 | self.tau = tau 24 | 25 | self.n_h1 = n_h1 26 | self.n_h2 = n_h2 27 | 28 | self.input_s, self.actor_variables, self.action_values = self._build_network("actor") 29 | self.input_s_target, self.actor_variables_target, self.action_values_target = self._build_network("actor_target") 30 | 31 | self.action_gradients = tf.placeholder(tf.float32, [None, self.action_size]) 32 | self.actor_gradients = tf.gradients(self.action_values, self.actor_variables, -self.action_gradients) 33 | self.update_target_op = [self.actor_variables_target[i].assign(tf.multiply(self.actor_variables[i], self.tau) + tf.multiply(self.actor_variables_target[i], 1 - self.tau)) 34 | for i in range(len(self.actor_variables))] 35 | self.optimize = self.optimizer.apply_gradients(zip(self.actor_gradients, self.actor_variables)) 36 | 37 | 38 | def _build_network(self, name): 39 | input_s = tf.placeholder(tf.float32, [None, self.state_size]) 40 | with tf.variable_scope(name): 41 | layer_1 = tf_utils.fc(input_s, self.n_h1, scope="fc1", activation_fn=tf.nn.relu, 42 | initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN")) 43 | layer_2 = tf_utils.fc(layer_1, self.n_h2, scope="fc2", activation_fn=tf.nn.relu, 44 | initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN")) 45 | action_values = tf_utils.fc(layer_2, self.action_size, scope="out", activation_fn=tf.nn.tanh, 46 | initializer=tf.random_uniform_initializer(-3e-3, 3e-3)) 47 | actor_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) 48 | return input_s, actor_variables, action_values 49 | 50 | 51 | def get_action(self, state, sess): 52 | return sess.run(self.action_values, feed_dict={self.input_s: state}) 53 | 54 | 55 | def get_action_target(self, state, sess): 56 | return sess.run(self.action_values_target, feed_dict={self.input_s_target: state}) 57 | 58 | 59 | def train(self, state, action_gradients, sess): 60 | sess.run(self.optimize, feed_dict={ 61 | self.input_s: state, 62 | self.action_gradients: action_gradients 63 | }) 64 | 65 | 66 | def update_target(self, sess): 67 | sess.run(self.update_target_op) -------------------------------------------------------------------------------- /ddpg/critic.py: -------------------------------------------------------------------------------- 1 | # Deep Deterministic Policy Gradient 2 | # following paper: Continuous control with deep 
reinforcement learning 3 | # (https://arxiv.org/pdf/1509.02971.pdf) 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | import tensorflow as tf 12 | import tf_utils 13 | 14 | 15 | class CriticNetwork(object): 16 | 17 | 18 | def __init__(self, state_size, action_size, lr, n_h1=400, n_h2=300, tau=0.001): 19 | self.state_size = state_size 20 | self.action_size = action_size 21 | self.optimizer = tf.train.AdamOptimizer(lr) 22 | self.tau = tau 23 | 24 | self.n_h1 = n_h1 25 | self.n_h2 = n_h2 26 | 27 | self.input_s, self.action, self.critic_variables, self.q_value = self._build_network("critic") 28 | self.input_s_target, self.action_target, self.critic_variables_target, self.q_value_target = self._build_network("critic_target") 29 | 30 | self.target = tf.placeholder(tf.float32, [None]) 31 | self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.critic_variables]) 32 | self.loss = tf.reduce_mean(tf.square(self.target - self.q_value)) + 0.01*self.l2_loss 33 | self.optimize = self.optimizer.minimize(self.loss) 34 | self.update_target_op = [self.critic_variables_target[i].assign(tf.multiply(self.critic_variables[i], self.tau) + tf.multiply(self.critic_variables_target[i], 1 - self.tau)) for i in range(len(self.critic_variables))] 35 | self.action_gradients = tf.gradients(self.q_value, self.action) 36 | 37 | 38 | def _build_network(self, name): 39 | input_s = tf.placeholder(tf.float32, [None, self.state_size]) 40 | action = tf.placeholder(tf.float32, [None, self.action_size]) 41 | with tf.variable_scope(name): 42 | layer_1 = tf_utils.fc(input_s, self.n_h1, scope="fc1", activation_fn=tf.nn.relu, 43 | initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN")) 44 | # tf.concat((layer_1, action), 1) 45 | layer_2 = tf_utils.fc(tf.concat((layer_1, action), 1), self.n_h2, scope="fc2", activation_fn=tf.nn.relu, 46 | initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN")) 47 | q_value = tf_utils.fc(layer_2, 1, scope="out", initializer=tf.random_uniform_initializer(-3e-3, 3e-3)) 48 | critic_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) 49 | return input_s, action, critic_variables, tf.squeeze(q_value) 50 | 51 | 52 | def get_qvalue_target(self, state, action, sess): 53 | return sess.run(self.q_value_target, feed_dict={ 54 | self.input_s_target: state, 55 | self.action_target: action 56 | }) 57 | 58 | 59 | def get_gradients(self, state, action, sess): 60 | return sess.run(self.action_gradients, feed_dict={ 61 | self.input_s: state, 62 | self.action: action 63 | }) 64 | 65 | 66 | def train(self, state, action, target, sess): 67 | _, loss = sess.run([self.optimize, self.loss], feed_dict={ 68 | self.input_s: state, 69 | self.action: action, 70 | self.target: target 71 | }) 72 | return loss 73 | 74 | 75 | def update_target(self, sess): 76 | sess.run(self.update_target_op) 77 | -------------------------------------------------------------------------------- /ddpg/ddpg.py: -------------------------------------------------------------------------------- 1 | # Deep Deterministic Policy Gradient 2 | # following paper: Continuous control with deep reinforcement learning 3 | # (https://arxiv.org/pdf/1509.02971.pdf) 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | 15 | class DDPG(object): 16 | 17 | 18 | def __init__(self, actor, critic, exprep, noise, 
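               # gamma: discount factor (note that the constructor below hard-codes
               # self.gamma = 0.99, so this argument is effectively ignored);
               # action_bound: rescales the actor's tanh output to the env's action range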
gamma=0.99, action_bound=1): 19 | self.actor = actor 20 | self.critic = critic 21 | self.exprep = exprep 22 | self.noise = noise 23 | self.total_steps = 0 24 | self.gamma = 0.99 25 | self.action_bound = action_bound 26 | 27 | 28 | def add_step(self, step): 29 | self.total_steps = self.total_steps + 1 30 | self.exprep.add_step(step) 31 | 32 | 33 | def get_action(self, state, sess): 34 | state = np.reshape(state,[-1, self.actor.state_size]) 35 | action = self.actor.get_action(state, sess) * self.action_bound 36 | return action 37 | 38 | 39 | def get_action_noise(self, state, sess, rate=1): 40 | state = np.reshape(state,[-1, self.actor.state_size]) 41 | action = self.actor.get_action(state, sess) * self.action_bound 42 | action = action + self.noise.noise() * rate 43 | return action 44 | 45 | 46 | def learn_batch(self, sess): 47 | # sample a random minibatch of N tranistions 48 | batch = self.exprep.sample() 49 | if len(batch)==0: 50 | return 51 | 52 | # compute y_i (target q) 53 | next_s = [s.next_step for s in batch] 54 | next_a_target = self.actor.get_action_target(next_s, sess) 55 | next_q_target = self.critic.get_qvalue_target(next_s, next_a_target, sess) 56 | y = np.array([s.reward + self.gamma*next_q_target[i]*(1-s.done) for i,s in enumerate(batch)]) 57 | y = y.reshape([len(batch)]) 58 | 59 | # update ciritc by minimizing l2 loss 60 | cur_s = [s.cur_step for s in batch] 61 | a = [s.action for s in batch] 62 | l = self.critic.train(cur_s, a, y, sess) 63 | 64 | # update actor policy with sampled gradient 65 | cur_a_pred = self.actor.get_action(cur_s, sess) 66 | a_gradients = self.critic.get_gradients(cur_s, cur_a_pred, sess) 67 | self.actor.train(cur_s, a_gradients[0], sess) 68 | 69 | # update target network: 70 | self.actor.update_target(sess) 71 | self.critic.update_target(sess) 72 | return l 73 | 74 | 75 | -------------------------------------------------------------------------------- /ddpg/exp_replay.py: -------------------------------------------------------------------------------- 1 | # Experience Replay 2 | # Following paper: Playing Atari with Deep Reinforcement Learning 3 | # https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | 12 | import numpy as np 13 | import random 14 | from collections import namedtuple 15 | 16 | 17 | Step = namedtuple('Step','cur_step action next_step reward done') 18 | 19 | 20 | class ExpReplay(): 21 | """Experience replay""" 22 | 23 | 24 | def __init__(self, mem_size, start_mem=None, state_size=[84, 84], kth=4, drop_rate=0.2, batch_size=32): 25 | # k = -1 for sending raw state 26 | self.state_size = state_size 27 | self.drop_rate = drop_rate 28 | self.mem_size = mem_size 29 | self.start_mem = start_mem 30 | if start_mem == None: 31 | self.start_mem = mem_size/20 32 | self.kth = kth 33 | self.batch_size = batch_size 34 | self.mem = [] 35 | self.total_steps = 0 36 | 37 | 38 | def add_step(self, step): 39 | """ 40 | Store episode to memory and check if it reaches the mem_size. 
41 | If so, drop [self.drop_rate] of the oldest memory 42 | 43 | args 44 | step namedtuple Step, where step.cur_step and step.next_step are of size {state_size} 45 | """ 46 | self.mem.append(step) 47 | self.total_steps = self.total_steps + 1 48 | while len(self.mem) > self.mem_size: 49 | self.mem = self.mem[int(len(self.mem)*self.drop_rate):] 50 | 51 | 52 | def get_last_state(self): 53 | if len(self.mem) > abs(self.kth): 54 | if self.kth == -1: 55 | return self.mem[-1].cur_step 56 | if len(self.state_size) == 1: 57 | return [s.cur_step for s in self.mem[-abs(self.kth):]] 58 | last_state = np.stack([s.cur_step for s in self.mem[-abs(self.kth):]], axis=len(self.state_size)) 59 | return np.stack([s.cur_step for s in self.mem[-abs(self.kth):]], axis=len(self.state_size)) 60 | return [] 61 | 62 | 63 | def sample(self, num=None): 64 | """Randomly draw [num] samples""" 65 | if num == None: 66 | num = self.batch_size 67 | if len(self.mem) < self.start_mem: 68 | return [] 69 | sampled_idx = random.sample(range(abs(self.kth),len(self.mem)), num) 70 | samples = [] 71 | for idx in sampled_idx: 72 | steps = self.mem[idx-abs(self.kth):idx] 73 | cur_state = np.stack([s.cur_step for s in steps], axis=len(self.state_size)) 74 | next_state = np.stack([s.next_step for s in steps], axis=len(self.state_size)) 75 | # handle special cases 76 | if self.kth == -1: 77 | cur_state = steps[0].cur_step 78 | next_state = steps[0].next_step 79 | elif len(self.state_size) == 1: 80 | cur_state = [steps[0].cur_step] 81 | next_state = [steps[0].next_step] 82 | reward = steps[-1].reward 83 | action = steps[-1].action 84 | done = steps[-1].done 85 | samples.append(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 86 | return samples 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /ddpg/imgs/ddpg_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/ddpg/imgs/ddpg_plot.png -------------------------------------------------------------------------------- /ddpg/mountaincar_ddpg.py: -------------------------------------------------------------------------------- 1 | # DDPG Pendulum-v0 example 2 | # --- 3 | # @author Yiren Lu 4 | # @email luyiren [at] seas [dot] upenn [dot] edu 5 | # 6 | # MIT License 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | import argparse 11 | from ddpg import DDPG 12 | from actor import ActorNetwork 13 | from critic import CriticNetwork 14 | from exp_replay import ExpReplay 15 | from exp_replay import Step 16 | from ou import OUProcess 17 | import matplotlib.pyplot as plt 18 | import sys 19 | import gym 20 | from gym import wrappers 21 | 22 | # env = gym.make('MountainCarContinuous-v0') 23 | # print(env.observation_space) 24 | # print(env.action_space) 25 | # print(env.action_space.low) 26 | # print(env.action_space.high) 27 | 28 | parser = argparse.ArgumentParser(description=None) 29 | parser.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 30 | parser.add_argument('-e', '--episodes', default=50, type=int, help='number of episodes') 31 | parser.add_argument('-l', '--log_dir', default='/tmp/mountaincar-log-0', type=str, help='log directory') 32 | args = parser.parse_args() 33 | print(args) 34 | 35 | 36 | DEVICE = args.device 37 | NUM_EPISODES = args.episodes 38 | LOG_DIR=args.log_dir 39 | 40 | ACTOR_LEARNING_RATE = 0.0001 
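# These hyperparameters follow the DDPG paper's defaults: actor lr 1e-4, critic lr 1e-3,
# soft-update rate TAU = 0.001 (target <- TAU*online + (1-TAU)*target, applied by
# ActorNetwork/CriticNetwork.update_target_op) and a replay buffer of 1e6 transitions.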
41 | CRITIC_LEARNING_RATE = 0.001 42 | GAMMA = 0.99 43 | TAU = 0.001 44 | MEM_SIZE = 1000000 45 | 46 | 47 | STATE_SIZE = 2 48 | ACTION_SIZE = 1 49 | BATCH_SIZE = 64 50 | MAX_STEPS = 10000 51 | FAIL_PENALTY = 0 52 | ACTION_RANGE = 1 53 | EVALUATE_EVERY = 10 54 | 55 | 56 | def summarize(cum_reward, i, summary_writer): 57 | summary = tf.Summary() 58 | summary.value.add(tag="cumulative reward", simple_value=cum_reward) 59 | summary_writer.add_summary(summary, i) 60 | summary_writer.flush() 61 | 62 | 63 | def train(agent, env, sess): 64 | for i in xrange(NUM_EPISODES): 65 | cur_state = env.reset() 66 | cum_reward = 0 67 | # tensorboard summary 68 | summary_writer = tf.summary.FileWriter(LOG_DIR+'/train', graph=tf.get_default_graph()) 69 | 70 | if (i % EVALUATE_EVERY) == 0: 71 | print '====evaluation====' 72 | for t in xrange(MAX_STEPS): 73 | if t % 500 == 0: 74 | print 'step {}'.format(t) 75 | if (i % EVALUATE_EVERY) == 0: 76 | env.render() 77 | action = agent.get_action(cur_state, sess)[0] 78 | else: 79 | # decaying noise 80 | action = agent.get_action_noise(cur_state, sess, rate=(NUM_EPISODES-i)/NUM_EPISODES)[0] 81 | # action = agent.get_action_noise(cur_state, sess, rate=0.01)[0] 82 | next_state, reward, done, info = env.step(action) 83 | if done: 84 | cum_reward += reward 85 | agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 86 | print("Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward)) 87 | summarize(cum_reward, i, summary_writer) 88 | break 89 | cum_reward += reward 90 | agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 91 | cur_state = next_state 92 | if t == MAX_STEPS - 1: 93 | print("Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward)) 94 | print action 95 | summarize(cum_reward, i, summary_writer) 96 | agent.learn_batch(sess) 97 | 98 | 99 | env = gym.make('MountainCarContinuous-v0') 100 | env._max_episode_steps = MAX_STEPS 101 | # env = wrappers.Monitor(env, '/tmp/pendulum-experiment-0', force=True) 102 | 103 | actor = ActorNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=ACTOR_LEARNING_RATE, tau=TAU) 104 | critic = CriticNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=CRITIC_LEARNING_RATE, tau=TAU) 105 | noise = OUProcess(ACTION_SIZE) 106 | exprep = ExpReplay(mem_size=MEM_SIZE, start_mem=10000, state_size=[STATE_SIZE], kth=-1, batch_size=BATCH_SIZE) 107 | 108 | sess = tf.Session() 109 | with tf.device('/{}:0'.format(DEVICE)): 110 | agent = DDPG(actor=actor, critic=critic, exprep=exprep, noise=noise, action_bound=env.action_space.high) 111 | sess.run(tf.initialize_all_variables()) 112 | 113 | train(agent, env, sess) 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /ddpg/ou.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class OUProcess(object): 5 | """Ornstein-Uhlenbeck process""" 6 | 7 | 8 | def __init__(self, x_size, mu=0, theta=0.15, sigma=0.3): 9 | self.x = np.ones(x_size) * mu 10 | self.x_size = x_size 11 | self.mu = mu 12 | self.theta = theta 13 | self.sigma = sigma 14 | 15 | 16 | def noise(self): 17 | dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(self.x_size) 18 | self.x = self.x + dx 19 | return self.x 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /ddpg/pendulum_ddpg.py: 
-------------------------------------------------------------------------------- 1 | # DDPG Pendulum-v0 example 2 | # --- 3 | # @author Yiren Lu 4 | # @email luyiren [at] seas [dot] upenn [dot] edu 5 | # 6 | # MIT License 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | import argparse 11 | from ddpg import DDPG 12 | from actor import ActorNetwork 13 | from critic import CriticNetwork 14 | from exp_replay import ExpReplay 15 | from exp_replay import Step 16 | from ou import OUProcess 17 | import matplotlib.pyplot as plt 18 | import sys 19 | import gym 20 | from gym import wrappers 21 | 22 | 23 | parser = argparse.ArgumentParser(description=None) 24 | parser.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 25 | parser.add_argument('-e', '--episodes', default=300, type=int, help='number of episodes') 26 | parser.add_argument('-l', '--log_dir', default='/tmp/pendulum-log-0', type=str, help='log directory') 27 | args = parser.parse_args() 28 | print(args) 29 | 30 | 31 | DEVICE = args.device 32 | NUM_EPISODES = args.episodes 33 | LOG_DIR=args.log_dir 34 | 35 | ACTOR_LEARNING_RATE = 0.0001 36 | CRITIC_LEARNING_RATE = 0.001 37 | GAMMA = 0.99 38 | TAU = 0.001 39 | MEM_SIZE = 1000000 40 | 41 | 42 | STATE_SIZE = 3 43 | ACTION_SIZE = 1 44 | BATCH_SIZE = 64 45 | MAX_STEPS = 200 46 | FAIL_PENALTY = 0 47 | ACTION_RANGE = 1 48 | EVALUATE_EVERY = 10 49 | 50 | 51 | def summarize(cum_reward, i, summary_writer): 52 | summary = tf.Summary() 53 | summary.value.add(tag="cumulative reward", simple_value=cum_reward) 54 | summary_writer.add_summary(summary, i) 55 | summary_writer.flush() 56 | 57 | 58 | def train(agent, env, sess): 59 | for i in xrange(NUM_EPISODES): 60 | cur_state = env.reset() 61 | cum_reward = 0 62 | # tensorboard summary 63 | summary_writer = tf.summary.FileWriter(LOG_DIR+'/train', graph=tf.get_default_graph()) 64 | 65 | if (i % EVALUATE_EVERY) == 0: 66 | print '====evaluation====' 67 | for t in xrange(MAX_STEPS): 68 | if (i % EVALUATE_EVERY) == 0: 69 | env.render() 70 | action = agent.get_action(cur_state, sess)[0] 71 | else: 72 | # decaying noise 73 | action = agent.get_action_noise(cur_state, sess, rate=(NUM_EPISODES-i)/NUM_EPISODES)[0] 74 | next_state, reward, done, info = env.step(action) 75 | if done: 76 | cum_reward += reward 77 | agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 78 | print("Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward)) 79 | summarize(cum_reward, i, summary_writer) 80 | break 81 | cum_reward += reward 82 | agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 83 | cur_state = next_state 84 | if t == MAX_STEPS - 1: 85 | print("Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward)) 86 | print action 87 | summarize(cum_reward, i, summary_writer) 88 | agent.learn_batch(sess) 89 | 90 | 91 | env = gym.make('Pendulum-v0') 92 | # env = wrappers.Monitor(env, '/tmp/pendulum-experiment-0', force=True) 93 | 94 | actor = ActorNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=ACTOR_LEARNING_RATE, tau=TAU) 95 | critic = CriticNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=CRITIC_LEARNING_RATE, tau=TAU) 96 | noise = OUProcess(ACTION_SIZE) 97 | exprep = ExpReplay(mem_size=MEM_SIZE, start_mem=10000, state_size=[STATE_SIZE], kth=-1, batch_size=BATCH_SIZE) 98 | 99 | sess = tf.Session() 100 | with tf.device('/{}:0'.format(DEVICE)): 101 | agent = 
DDPG(actor=actor, critic=critic, exprep=exprep, noise=noise, action_bound=env.action_space.high) 102 | sess.run(tf.initialize_all_variables()) 103 | 104 | train(agent, env, sess) 105 | -------------------------------------------------------------------------------- /ddpg/tf_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for tensorflow""" 2 | import tensorflow as tf 3 | 4 | 5 | def max_pool(x, k_sz=[2,2]): 6 | """max pooling layer wrapper 7 | Args 8 | x: 4d tensor [batch, height, width, channels] 9 | k_sz: The size of the window for each dimension of the input tensor 10 | Returns 11 | a max pooling layer 12 | """ 13 | return tf.nn.max_pool(x, ksize=[1, k_sz[0], k_sz[1], 1], strides=[1, k_sz[0], k_sz[1], 1], padding='SAME') 14 | 15 | def conv2d(x, n_kernel, k_sz, stride=1): 16 | """convolutional layer with relu activation wrapper 17 | Args: 18 | x: 4d tensor [batch, height, width, channels] 19 | n_kernel: number of kernels (output size) 20 | k_sz: 2d array, kernel size. e.g. [8,8] 21 | stride: stride 22 | Returns 23 | a conv2d layer 24 | """ 25 | W = tf.Variable(tf.random_normal([k_sz[0], k_sz[1], int(x.get_shape()[3]), n_kernel])) 26 | b = tf.Variable(tf.random_normal([n_kernel])) 27 | # - strides[0] and strides[1] must be 1 28 | # - padding can be 'VALID'(without padding) or 'SAME'(zero padding) 29 | # - http://stackoverflow.com/questions/37674306/what-is-the-difference-between-same-and-valid-padding-in-tf-nn-max-pool-of-t 30 | conv = tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME') 31 | conv = tf.nn.bias_add(conv, b) # add bias term 32 | return tf.nn.relu(conv) # rectified linear unit: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 33 | 34 | 35 | def fc(x, n_output, scope="fc", activation_fn=None, initializer=None): 36 | """fully connected layer with relu activation wrapper 37 | Args 38 | x: 2d tensor [batch, n_input] 39 | n_output output size 40 | """ 41 | with tf.variable_scope(scope): 42 | if initializer is None: 43 | # default initialization 44 | W = tf.Variable(tf.random_normal([int(x.get_shape()[1]), n_output])) 45 | b = tf.Variable(tf.random_normal([n_output])) 46 | else: 47 | W = tf.get_variable("W", shape=[int(x.get_shape()[1]), n_output], initializer=initializer) 48 | b = tf.get_variable("b", shape=[n_output], initializer=tf.constant_initializer(.0, dtype=tf.float32)) 49 | fc1 = tf.add(tf.matmul(x, W), b) 50 | if not activation_fn is None: 51 | fc1 = activation_fn(fc1) 52 | return fc1 53 | 54 | 55 | def flatten(x): 56 | """flatten a 4d tensor into 2d 57 | Args 58 | x: 4d tensor [batch, height, width, channels] 59 | Returns a flattened 2d tensor 60 | """ 61 | return tf.reshape(x, [-1, int(x.get_shape()[1]*x.get_shape()[2]*x.get_shape()[3])]) 62 | 63 | 64 | -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/envs/__init__.py -------------------------------------------------------------------------------- /envs/env.py: -------------------------------------------------------------------------------- 1 | # Environment Abstract Class 2 | # --- 3 | # @author Yiren Lu 4 | # @email luyiren [at] seas [dot] upenn [dot] edu 5 | # 6 | # MIT License 7 | 8 | 9 | class Env: 10 | 11 | def reset(self, start_state): 12 | """ 13 | Reset the gridworld for model-free learning. 
It assumes only 1 agent in the gridworld. 14 | """ 15 | abstract 16 | 17 | 18 | def get_current_state(self): 19 | abstract 20 | 21 | 22 | def step(self, action): 23 | abstract -------------------------------------------------------------------------------- /envs/gridworld.py: -------------------------------------------------------------------------------- 1 | # Gridworld environment based on mdp.py 2 | # Gridworld provides a basic environment for RL agents to interact with 3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | 10 | import mdp 11 | import env 12 | import numpy as np 13 | import unittest 14 | 15 | 16 | class GridWorld(mdp.MDP, env.Env): 17 | """ 18 | Grid world environment 19 | """ 20 | 21 | def __init__(self, grid, terminals, trans_prob=1): 22 | """ 23 | input: 24 | grid 2-d list of the grid including the reward 25 | terminals a set of all the terminal states 26 | trans_prob transition probability when given a certain action 27 | """ 28 | self.height = len(grid) 29 | self.width = len(grid[0]) 30 | self.terminals = terminals 31 | self.grid = grid 32 | self.neighbors = [(0, 1), (0, -1), (1, 0), (-1, 0), (0, 0)] 33 | self.actions = [0, 1, 2, 3, 4] 34 | self.dirs = {0: 'r', 1: 'l', 2: 'd', 3: 'u', 4: 's'} 35 | # right, left, down, up , stay 36 | # self.action_nei = {0: (0,1), 1:(0,-1), 2:(1,0), 3:(-1,0)} 37 | 38 | # If the mdp is deterministic, the transition probability of taken a certain action should be 1 39 | # otherwise < 1, the rest of the probability are equally spreaded onto 40 | # other neighboring states. 41 | self.trans_prob = trans_prob 42 | 43 | def show_grid(self): 44 | for i in range(len(self.grid)): 45 | print self.grid[i] 46 | 47 | def get_grid(self): 48 | return self.grid 49 | 50 | def get_states(self): 51 | """ 52 | returns 53 | a list of all states 54 | """ 55 | return filter( 56 | lambda x: self.grid[x[0]][x[1]] != 'x', 57 | [(i, j) for i in range(self.height) for j in range(self.width)]) 58 | 59 | def get_actions(self, state): 60 | """ 61 | get all the actions that can be takens on the current state 62 | returns 63 | a list of actions 64 | """ 65 | if self.grid[state[0]][state[1]] == 'x': 66 | return [4] 67 | 68 | actions = [] 69 | for i in range(len(self.actions)-1): 70 | inc = self.neighbors[i] 71 | a = self.actions[i] 72 | nei_s = (state[0] + inc[0], state[1] + inc[1]) 73 | if nei_s[0] >= 0 and nei_s[0] < self.height and nei_s[1] >= 0 and nei_s[ 74 | 1] < self.width and self.grid[nei_s[0]][nei_s[1]] != 'x': 75 | actions.append(a) 76 | return actions 77 | 78 | def __get_action_states(self, state): 79 | """ 80 | get all the actions that can be takens on the current state 81 | returns 82 | a list of (action, state) pairs 83 | """ 84 | a_s = [] 85 | for i in range(len(self.actions)): 86 | inc = self.neighbors[i] 87 | a = self.actions[i] 88 | nei_s = (state[0] + inc[0], state[1] + inc[1]) 89 | if nei_s[0] >= 0 and nei_s[0] < self.height and nei_s[1] >= 0 and nei_s[ 90 | 1] < self.width and self.grid[nei_s[0]][nei_s[1]] != 'x': 91 | a_s.append((a, nei_s)) 92 | return a_s 93 | 94 | def get_reward_sas(self, state, action, state1): 95 | """ 96 | args 97 | state current state 98 | action action 99 | state1 next state 100 | returns 101 | the reward on current state 102 | """ 103 | if not self.grid[state[0]][state[1]] == 'x': 104 | return float(self.grid[state[0]][state[1]]) 105 | else: 106 | return 0 107 | 108 | def get_reward(self, state): 109 | """ 110 | returns 111 | the reward on current state 
112 | """ 113 | if not self.grid[state[0]][state[1]] == 'x': 114 | return float(self.grid[state[0]][state[1]]) 115 | else: 116 | return 0 117 | 118 | def get_transition_states_and_probs(self, state, action): 119 | """ 120 | get all the possible transition states and their probabilities with [action] on [state] 121 | args 122 | state (y, x) 123 | action int 124 | returns 125 | a list of (state, probability) pair 126 | """ 127 | if self.trans_prob == 1: 128 | inc = self.neighbors[action] 129 | nei_s = (state[0] + inc[0], state[1] + inc[1]) 130 | if nei_s[0] >= 0 and nei_s[0] < self.height and nei_s[ 131 | 1] >= 0 and nei_s[1] < self.width and self.grid[nei_s[0]][nei_s[1]] != 'x': 132 | return [(nei_s, 1)] 133 | else: 134 | # if the state is invalid, stay in the current state 135 | return [(state, 1)] 136 | else: 137 | action_states = self.__get_action_states(state) 138 | inc = self.neighbors[action] 139 | nei_s = (state[0] + inc[0], state[1] + inc[1]) 140 | res = [] 141 | 142 | if nei_s[0] >= 0 and nei_s[0] < self.height and nei_s[ 143 | 1] >= 0 and nei_s[1] < self.width and self.grid[nei_s[0]][nei_s[1]] != 'x': 144 | for i in range(len(action_states)): 145 | if action_states[i][0] == action: 146 | res.append((action_states[i][1], self.trans_prob)) 147 | else: 148 | res.append( 149 | (action_states[i][1], (1 - self.trans_prob) / (len(action_states) - 1))) 150 | else: 151 | # if the action is not valid, then return uniform distribution of the valid moves. 152 | for i in range(len(action_states)): 153 | res.append((action_states[i][1], 1.0 / len(action_states))) 154 | return res 155 | 156 | def is_terminal(self, state): 157 | """ 158 | returns 159 | True if the [state] is terminal 160 | """ 161 | if state in self.terminals: 162 | return True 163 | else: 164 | return False 165 | 166 | ############################################## 167 | # Stateful Functions For Model-Free Leanring # 168 | ############################################## 169 | 170 | def reset(self, start_pos): 171 | """ 172 | Reset the gridworld for model-free learning. It assumes only 1 agent in the gridworld. 
173 | args 174 | start_pos (i,j) pair of the start location 175 | """ 176 | self._cur_state = start_pos 177 | 178 | 179 | def get_current_state(self): 180 | return self._cur_state 181 | 182 | def step(self, action): 183 | """ 184 | Step function for the agent to interact with gridworld 185 | args 186 | action action taken by the agent 187 | returns 188 | current_state current state 189 | action input action 190 | next_state next_state 191 | reward reward on the next state 192 | is_done True/False - if the episode terminates on the next_state 193 | """ 194 | if self.is_terminal(self._cur_state): 195 | self._is_done = True 196 | return self._cur_state, action, self._cur_state, self.get_reward(self._cur_state), True 197 | 198 | st_prob = self.get_transition_states_and_probs(self._cur_state, action) 199 | 200 | sampled_idx = np.random.choice(np.arange(0,len(st_prob)), p=[prob for st, prob in st_prob]) 201 | last_state = self._cur_state 202 | next_state = st_prob[sampled_idx][0] 203 | reward = self.get_reward(last_state) 204 | self._cur_state = next_state 205 | return last_state, action, next_state, reward, False 206 | 207 | ########################################### 208 | # Policy Evaluation for Model-free Agents # 209 | ########################################### 210 | 211 | def get_optimal_policy(self, agent): 212 | states = self.get_states() 213 | policy = {} 214 | for s in states: 215 | policy[s] = [(agent.get_optimal_action(s), 1)] 216 | return policy 217 | 218 | def get_values(self, agent): 219 | states = self.get_states() 220 | values = {} 221 | for s in states: 222 | values[s] = agent.get_value(s) 223 | return values 224 | 225 | 226 | def get_qvalues(self, agent): 227 | states = self.get_states() 228 | q_values = {} 229 | for s in states: 230 | for a in self.get_actions(s): 231 | q_values[(s,a)] = agent.get_qvalue(s,a) 232 | return q_values 233 | 234 | ############### 235 | # For Display # 236 | ############### 237 | 238 | def display_qvalue_grid(self, qvalues): 239 | print "==Display q-value grid==" 240 | 241 | qvalues_grid = np.empty((len(self.grid), len(self.grid[0])), dtype=object) 242 | for s in self.get_states(): 243 | if self.grid[s[0]][s[1]] == 'x': 244 | qvalues_grid[s[0]][s[1]] = '-' 245 | else: 246 | tmp_str = "" 247 | for a in self.get_actions(s): 248 | tmp_str = tmp_str + self.dirs[a] 249 | tmp_str = tmp_str + str(' {:.2f} '.format(qvalues[(s,a)])) 250 | # print tmp_str 251 | qvalues_grid[s[0]][s[1]] = tmp_str 252 | 253 | row_format = '{:>40}' * (len(self.grid[0])) 254 | for row in qvalues_grid: 255 | print row_format.format(*row) 256 | 257 | 258 | def display_value_grid(self, values): 259 | """ 260 | Prints a nice table of the values in grid 261 | """ 262 | print "==Display value grid==" 263 | 264 | value_grid = np.zeros((len(self.grid), len(self.grid[0]))) 265 | for k in values: 266 | value_grid[k[0]][k[1]] = float(values[k]) 267 | 268 | row_format = '{:>20.4}' * (len(self.grid[0])) 269 | for row in value_grid: 270 | print row_format.format(*row) 271 | 272 | def display_policy_grid(self, policy): 273 | """ 274 | prints a nice table of the policy in grid 275 | input: 276 | policy a dictionary of the optimal policy {} 277 | """ 278 | print "==Display policy grid==" 279 | 280 | policy_grid = np.chararray((len(self.grid), len(self.grid[0]))) 281 | for k in self.get_states(): 282 | if self.is_terminal((k[0], k[1])) or self.grid[k[0]][k[1]] == 'x': 283 | policy_grid[k[0]][k[1]] = '-' 284 | else: 285 | # policy_grid[k[0]][k[1]] = self.dirs[agent.get_action((k[0], k[1]))] 286 | 
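          # policy maps each state to a list of (action, probability) pairs
          # (see get_optimal_policy above), so [0][0] picks the action of the first pair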
policy_grid[k[0]][k[1]] = self.dirs[policy[(k[0], k[1])][0][0]] 287 | 288 | row_format = '{:>20}' * (len(self.grid[0])) 289 | for row in policy_grid: 290 | print row_format.format(*row) 291 | -------------------------------------------------------------------------------- /envs/mdp.py: -------------------------------------------------------------------------------- 1 | # Markov Decision Process 2 | # --- 3 | # @author Yiren Lu 4 | # @email luyiren [at] seas [dot] upenn [dot] edu 5 | # 6 | # MIT License 7 | 8 | 9 | class MDP: 10 | 11 | def get_states(self): 12 | """ 13 | get a list of all states 14 | """ 15 | abstract 16 | 17 | def get_actions(self, state): 18 | """ 19 | get all the actions that can be takens on the current state 20 | """ 21 | abstract 22 | 23 | def get_reward(self, state): 24 | """ 25 | return the reward on current state 26 | """ 27 | abstract 28 | 29 | def get_transition_states_and_probs(self, state, action): 30 | """ 31 | get all the possible transition states and their probabilities with [action] on [state] 32 | """ 33 | abstract 34 | 35 | def is_terminal(self, state): 36 | """ 37 | return True is the [state] is terminal 38 | """ 39 | abstract 40 | -------------------------------------------------------------------------------- /envs/test_gridworld.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import gridworld 3 | 4 | 5 | class GridWorldTest(unittest.TestCase): 6 | """ 7 | Unit test for grid world 8 | """ 9 | 10 | def setUp(self): 11 | grid = [['0', '0', '0', '0', '10'], 12 | ['0', 'x', '0', '0', '-10'], 13 | ['0', '0', '0', '0', '0']] 14 | 15 | self.grid = grid 16 | self.gw_deterministic = gridworld.GridWorld(grid, {(0, 4), (1, 4)}, 1) 17 | self.gw_non_deterministic = gridworld.GridWorld( 18 | grid, {(0, 4), (1, 4)}, 0.8) 19 | 20 | def test_grid_dims(self): 21 | self.assertEqual(len(self.gw_deterministic.get_grid()), 3) 22 | self.assertEqual(len(self.gw_deterministic.get_grid()[0]), 5) 23 | 24 | def test_grid_values(self): 25 | grid_tmp = self.gw_deterministic.get_grid() 26 | for i in range(len(grid_tmp)): 27 | for j in range(len(grid_tmp[0])): 28 | self.assertEqual(self.grid[i][j], grid_tmp[i][j]) 29 | 30 | def test_get_states(self): 31 | self.assertEqual(len(self.gw_deterministic.get_states()), 14) 32 | 33 | def test_get_actions(self): 34 | self.assertEqual(len(self.gw_deterministic.get_actions((0, 0))), 2) 35 | self.assertEqual(len(self.gw_deterministic.get_actions((2, 0))), 2) 36 | self.assertEqual(len(self.gw_deterministic.get_actions((2, 4))), 2) 37 | self.assertEqual(len(self.gw_deterministic.get_actions((0, 4))), 2) 38 | self.assertEqual(len(self.gw_deterministic.get_actions((1, 0))), 2) 39 | 40 | def test_get_reward(self): 41 | self.assertEqual(self.gw_deterministic.get_reward((0, 0)), 0) 42 | self.assertEqual(self.gw_deterministic.get_reward((0, 4)), 10.0) 43 | self.assertEqual(self.gw_deterministic.get_reward((1, 4)), -10.0) 44 | 45 | def test_trans_prob_deter(self): 46 | self.assertEqual( 47 | len( 48 | self.gw_deterministic.get_transition_states_and_probs( 49 | (0, 0), 0)), 1) 50 | self.assertEqual( 51 | self.gw_deterministic.get_transition_states_and_probs( 52 | (0, 0), 0)[0][0], (0, 1)) 53 | self.assertEqual( 54 | self.gw_deterministic.get_transition_states_and_probs( 55 | (0, 0), 0)[0][1], 1) 56 | 57 | self.assertEqual( 58 | len( 59 | self.gw_deterministic.get_transition_states_and_probs( 60 | (0, 0), 1)), 1) 61 | self.assertEqual( 62 | self.gw_deterministic.get_transition_states_and_probs( 
63 | (0, 0), 1)[0][0], (0, 0)) 64 | self.assertEqual( 65 | self.gw_deterministic.get_transition_states_and_probs( 66 | (0, 0), 1)[0][1], 1) 67 | 68 | def test_trans_prob_non_deter(self): 69 | self.assertEqual( 70 | len( 71 | self.gw_non_deterministic.get_transition_states_and_probs( 72 | (0, 0), 0)), 3) 73 | self.assertEqual( 74 | self.gw_non_deterministic.get_transition_states_and_probs( 75 | (0, 0), 0)[0][0], (0, 1)) 76 | self.assertEqual( 77 | self.gw_non_deterministic.get_transition_states_and_probs( 78 | (0, 0), 0)[0][1], 0.8) 79 | 80 | self.assertTrue( 81 | self.gw_non_deterministic.get_transition_states_and_probs( 82 | (0, 0), 0)[1][1] - 0.1 < 1e-5) 83 | self.assertTrue( 84 | self.gw_non_deterministic.get_transition_states_and_probs( 85 | (0, 0), 0)[2][1] - 0.1 < 1e-5) 86 | 87 | self.assertEqual( 88 | len( 89 | self.gw_non_deterministic.get_transition_states_and_probs( 90 | (1, 0), 0)), 3) 91 | 92 | def test_terminals(self): 93 | self.assertTrue(self.gw_deterministic.is_terminal((0, 4))) 94 | self.assertTrue(self.gw_deterministic.is_terminal((1, 4))) 95 | 96 | if __name__ == '__main__': 97 | unittest.main() 98 | -------------------------------------------------------------------------------- /imgs/breakout10.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/imgs/breakout10.gif -------------------------------------------------------------------------------- /monte_carlo/monte_carlo.py: -------------------------------------------------------------------------------- 1 | # Monte Carlo Agent 2 | # Epsilon-greedy monte carlo agent 3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | 10 | import sys 11 | if "../" not in sys.path: 12 | sys.path.append("../") 13 | from TD import qlearning 14 | import numpy 15 | 16 | class Counter: 17 | """ 18 | Counter class 19 | """ 20 | 21 | def __init__(self): 22 | self.counter = {} 23 | 24 | def add(self, key): 25 | if key in self.counter: 26 | self.counter[key] = self.counter[key] + 1 27 | else: 28 | self.counter[key] = 1 29 | 30 | def get(self, key): 31 | if key in self.counter: 32 | return self.counter[key] 33 | else: 34 | return 0 35 | 36 | 37 | class MonteCarloAgent(qlearning.QLearningAgent): 38 | 39 | def __init__(self, legal_actions_fn, epsilon=0.5, alpha=0.5, gamma=0.9, epsilon_decay=1): 40 | self.n_s_a = Counter() 41 | super(MonteCarloAgent, self).__init__(legal_actions_fn, epsilon, alpha, gamma, epsilon_decay) 42 | 43 | 44 | @staticmethod 45 | def compute_G_t(rewards, gamma): 46 | """ 47 | args 48 | a list of rewards 49 | returns 50 | a list of cummulated rewards G_t = R_{t+1} + gamma*R_{t+2} + gamma^2*R_{t+3} + .. 
+ gamma^{T-t-1}*R_{T} 51 | """ 52 | G_t = [0]*len(rewards) 53 | 54 | for i in xrange(0,len(rewards)): 55 | G_t[0] = G_t[0] + rewards[i]*(gamma**i) 56 | 57 | for i in xrange(1,len(rewards)): 58 | G_t[i] = (G_t[i-1] - rewards[i-1])/gamma 59 | 60 | return G_t 61 | 62 | 63 | def learn(self, episode): 64 | """ 65 | args 66 | episode a list of (current state, action, next state, reward) 67 | """ 68 | q_values = self.q_values.copy() 69 | 70 | rewards = [r for c, a, n, r in episode] 71 | G_t = MonteCarloAgent.compute_G_t(rewards, self.gamma) 72 | for i in xrange(len(episode)): 73 | c, a, n, r = episode[i] 74 | # q-state count++ 75 | self.n_s_a.add((c,a)) 76 | # update q-value 77 | # notices here I took the max of the weights and self.alpha to ensure it actually 78 | # learns some thing from each episode of experience 79 | q_values[(c,a)] = self.get_qvalue(c,a) + max(1/self.n_s_a.get((c,a)), self.alpha) * (G_t[i] - self.get_qvalue(c,a)) 80 | 81 | self.q_values = q_values 82 | 83 | # policy improvement 84 | policy = self.policy.copy() 85 | for c, a, n, r in episode: 86 | legal_actions = self.legal_actions_fn(c) 87 | s_q_values = [self.get_qvalue(c,a) for a in legal_actions] 88 | policy[c] = legal_actions[s_q_values.index(max(s_q_values))] 89 | self.policy = policy 90 | 91 | self.epsilon = self.epsilon*self.epsilon_decay 92 | 93 | -------------------------------------------------------------------------------- /monte_carlo/test_monte_carlo.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | if "../" not in sys.path: 4 | sys.path.append("../") 5 | from envs import gridworld 6 | import monte_carlo 7 | 8 | 9 | class MonteCarloAgentTest(unittest.TestCase): 10 | """ 11 | Unit test for monte carlo agent 12 | """ 13 | 14 | 15 | def test2(self): 16 | print 'Test 2 -- Gridworld test' 17 | grid = [['0', '0', '0', '1'], 18 | ['0', 'x', '0', '-1'], 19 | ['0', '0', '0', '0']] 20 | 21 | gw = gridworld.GridWorld( 22 | grid, {(0, 3), (1, 3)}, 0.8) 23 | 24 | agent = monte_carlo.MonteCarloAgent(gw.get_actions, 25 | epsilon=0.4, gamma=0.9, alpha=0.01, epsilon_decay=1) 26 | # Training 27 | episodes = 1000 28 | for i in range(episodes): 29 | episode = [] 30 | gw.reset((2,0)) 31 | cur_s = gw.get_current_state() 32 | is_done = False 33 | while not is_done: 34 | a = agent.get_action(cur_s) 35 | last_state, action, next_state, reward, is_done = gw.step(a) 36 | episode.append((last_state, action, next_state, reward)) 37 | # agent.learn(last_state, action, next_state, reward, is_done) 38 | cur_s = next_state 39 | if is_done: 40 | agent.learn(episode) 41 | 42 | opt_policy = gw.get_optimal_policy(agent) 43 | gw.display_policy_grid(opt_policy) 44 | gw.display_value_grid(gw.get_values(agent)) 45 | gw.display_qvalue_grid(gw.get_qvalues(agent)) 46 | 47 | 48 | def test1(self): 49 | print 'Test 1 -- test G_t' 50 | G_t = monte_carlo.MonteCarloAgent.compute_G_t([1,2,3,4], 0.5) 51 | self.assertEqual(G_t, [3.25,4.5,5,4]) 52 | 53 | if __name__ == '__main__': 54 | unittest.main() -------------------------------------------------------------------------------- /papers/AlphaGoNaturePaper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/AlphaGoNaturePaper.pdf -------------------------------------------------------------------------------- /papers/GAN.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/GAN.pdf -------------------------------------------------------------------------------- /papers/Learning2learn_by_GD_by_GD.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/Learning2learn_by_GD_by_GD.pdf -------------------------------------------------------------------------------- /papers/a3c.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/a3c.pdf -------------------------------------------------------------------------------- /papers/browne_mcts_survey_ieee12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/browne_mcts_survey_ieee12.pdf -------------------------------------------------------------------------------- /papers/ddpg.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/ddpg.pdf -------------------------------------------------------------------------------- /papers/ddqn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/ddqn.pdf -------------------------------------------------------------------------------- /papers/dpg_silver14.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/dpg_silver14.pdf -------------------------------------------------------------------------------- /papers/dqn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/dqn.pdf -------------------------------------------------------------------------------- /papers/dqn_nature.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/dqn_nature.pdf -------------------------------------------------------------------------------- /papers/drl_bench_mark2016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/drl_bench_mark2016.pdf -------------------------------------------------------------------------------- /papers/dueling_dqn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/dueling_dqn.pdf -------------------------------------------------------------------------------- /papers/learn2rl.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/learn2rl.pdf -------------------------------------------------------------------------------- /policy_gradient/README.md: -------------------------------------------------------------------------------- 1 | ## Policy Gradient Methods 2 | 3 | ### REINFORCE 4 | 5 | The policy function is approximated by a 4-layer fully connected network with l2 regularization. The algorithm [solved cartpole-v0 after 632 episodes](https://gym.openai.com/evaluations/eval_0qE4YdUoQMi60hslLEGg) 6 | 7 | - `reinforce.py`: REINFORCE with policy function approximation 8 | - `cartpole_reinforce.py`: working example on cartpole-v0 9 | 10 | #### Run Code 11 | 12 | `$ python cartpole_reinforce.py` 13 | 14 | #### Cartpole-v0 Result 15 | 16 | ![cartpole training](imgs/cartpole_reinforce.png "cartpole training") 17 | 18 | ### REINFORCE with Baseline 19 | 20 | Here the code shows REINFORCE algorithm with baseline. The policy and value function share the same network regularized by l2. Have not been tuning the hyperparameters too much. Sometimes the model quickly converges to a local optimal (degenerate policy) due to random initialization, but a few attempts (<5) should be sufficient. 21 | 22 | - `reinforce_w_baseline.py`: REINFORCE with baseline 23 | - `cartpole_reinforce_baseline.py`: working example on cartpole-v0 24 | 25 | #### Run Code 26 | 27 | `$ python cartpole_reinforce_baseline.py` 28 | 29 | #### Cartpole-v0 Result 30 | 31 | ![cartpole training](imgs/cartpole_reinforce_w_baseline.png "cartpole training") -------------------------------------------------------------------------------- /policy_gradient/cartpole_reinforce.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import reinforce 4 | import numpy 5 | import matplotlib.pyplot as plt 6 | import tensorflow as tf 7 | 8 | NUM_EPISODES = 400 9 | MAX_STEPS = 300 10 | FAIL_PENALTY = -100 11 | # LEARNING_RATE = 0.0001 # hidden layer 10/20 12 | LEARNING_RATE = 0.002 # hidden layer 5 13 | # LEARNING_RATE = 0.1 # hidden layer 3 14 | DISCOUNT_FACTOR = 0.9 15 | TRAIN_EVERY_NUM_EPISODES = 1 16 | EPOCH_SIZE = 1 17 | MEM_SIZE = 100 18 | 19 | RECORD = False 20 | 21 | 22 | def train(agent, env, sess, num_episodes=NUM_EPISODES): 23 | history = [] 24 | for i in xrange(NUM_EPISODES): 25 | if i % 100: 26 | print "Episode {}".format(i + 1) 27 | cur_state = env.reset() 28 | episode = [] 29 | for t in xrange(MAX_STEPS): 30 | action = agent.get_action(cur_state, sess) 31 | next_state, reward, done, info = env.step(action) 32 | if done: 33 | reward = FAIL_PENALTY 34 | episode.append([cur_state, action, next_state, reward, done]) 35 | print("Episode finished after {} timesteps".format(t + 1)) 36 | print agent.get_policy(cur_state, sess) 37 | history.append(t + 1) 38 | break 39 | episode.append([cur_state, action, next_state, 1, done]) 40 | cur_state = next_state 41 | if t == MAX_STEPS - 1: 42 | history.append(t + 1) 43 | print("Episode finished after {} timesteps".format(t + 1)) 44 | # agent.add_episode(episode) 45 | if i % TRAIN_EVERY_NUM_EPISODES == 0: 46 | print 'train at episode {}'.format(i) 47 | agent.learn(episode, sess, EPOCH_SIZE) 48 | return agent, history 49 | 50 | 51 | agent = reinforce.PolicyGradientNNAgent(lr=LEARNING_RATE, 52 | gamma=DISCOUNT_FACTOR, 53 | state_size=4, 54 | action_size=2, 55 | n_hidden_1=5, 56 | n_hidden_2=5) 57 | 58 | 59 | env = gym.make('CartPole-v0') 60 | if 
RECORD: 61 | env = wrappers.Monitor(env, '/tmp/cartpole-experiment-2', force=True) 62 | 63 | 64 | with tf.Session() as sess: 65 | sess.run(tf.global_variables_initializer()) 66 | agent, history = train(agent, env, sess) 67 | 68 | 69 | if RECORD: 70 | env.close() # the Monitor wrapper is closed via env.close() 71 | 72 | window = 10 73 | avg_reward = [numpy.mean(history[i*window:(i+1)*window]) for i in xrange(int(len(history)/window))] 74 | f_reward = plt.figure(1) 75 | plt.plot(numpy.linspace(0, len(history), len(avg_reward)), avg_reward) 76 | plt.ylabel('Rewards') 77 | f_reward.show() 78 | print 'press enter to continue' 79 | raw_input() 80 | 81 | -------------------------------------------------------------------------------- /policy_gradient/cartpole_reinforce_baseline.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import reinforce_w_baseline 4 | import numpy 5 | import matplotlib.pyplot as plt 6 | import tensorflow as tf 7 | 8 | NUM_EPISODES = 200 9 | MAX_STEPS = 300 10 | FAIL_PENALTY = -100 11 | LEARNING_RATE = 0.002 12 | DISCOUNT_FACTOR = 0.9 13 | TRAIN_EVERY_NUM_EPISODES = 1 14 | EPOCH_SIZE = 1 15 | MEM_SIZE = 100 16 | 17 | RECORD = False 18 | 19 | 20 | def train(agent, env, sess, num_episodes=NUM_EPISODES): 21 | history = [] 22 | for i in xrange(NUM_EPISODES): 23 | if i % 100 == 0: 24 | print "Episode {}".format(i + 1) 25 | cur_state = env.reset() 26 | episode = [] 27 | for t in xrange(MAX_STEPS): 28 | action = agent.get_action(cur_state, sess) 29 | next_state, reward, done, info = env.step(action) 30 | if done: 31 | reward = FAIL_PENALTY 32 | episode.append([cur_state, action, next_state, reward, done]) 33 | print("Episode finished after {} timesteps".format(t + 1)) 34 | print agent.get_policy(cur_state, sess) 35 | history.append(t + 1) 36 | break 37 | episode.append([cur_state, action, next_state, 1, done]) 38 | cur_state = next_state 39 | if t == MAX_STEPS - 1: 40 | history.append(t + 1) 41 | print("Episode finished after {} timesteps".format(t + 1)) 42 | if i % TRAIN_EVERY_NUM_EPISODES == 0: 43 | print 'train at episode {}'.format(i) 44 | agent.learn(episode, sess, EPOCH_SIZE) 45 | return agent, history 46 | 47 | 48 | agent = reinforce_w_baseline.PolicyGradientNNAgent(lr=LEARNING_RATE, 49 | gamma=DISCOUNT_FACTOR, 50 | state_size=4, 51 | action_size=2, 52 | n_hidden_1=10, 53 | n_hidden_2=10) 54 | 55 | 56 | env = gym.make('CartPole-v0') 57 | 58 | 59 | with tf.Session() as sess: 60 | sess.run(tf.global_variables_initializer()) 61 | agent, history = train(agent, env, sess) 62 | 63 | 64 | window = 10 65 | avg_reward = [numpy.mean(history[i*window:(i+1)*window]) for i in xrange(int(len(history)/window))] 66 | f_reward = plt.figure(1) 67 | plt.plot(numpy.linspace(0, len(history), len(avg_reward)), avg_reward) 68 | plt.ylabel('Rewards') 69 | plt.xlabel('Episodes') 70 | f_reward.show() 71 | print 'press enter to continue' 72 | raw_input() 73 | 74 | -------------------------------------------------------------------------------- /policy_gradient/imgs/cartpole_reinforce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/policy_gradient/imgs/cartpole_reinforce.png -------------------------------------------------------------------------------- /policy_gradient/imgs/cartpole_reinforce_w_baseline.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/policy_gradient/imgs/cartpole_reinforce_w_baseline.png -------------------------------------------------------------------------------- /policy_gradient/reinforce.py: -------------------------------------------------------------------------------- 1 | # Policy Gradient Agent 2 | # - policy approximation with fully connected neural network 3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | import gym 10 | import numpy as np 11 | import random 12 | import tensorflow as tf 13 | import tensorflow.contrib.slim as slim 14 | import tf_utils 15 | 16 | 17 | class PolicyGradientNNAgent(): 18 | 19 | def __init__(self, 20 | lr=0.5, 21 | gamma=0.99, 22 | state_size=4, 23 | action_size=2, 24 | n_hidden_1=20, 25 | n_hidden_2=20, 26 | scope="pg" 27 | ): 28 | """ 29 | args 30 | epsilon exploration rate 31 | epsilon_anneal linear decay rate per call of learn() function (iteration) 32 | end_epsilon lowest exploration rate 33 | lr learning rate 34 | gamma discount factor 35 | state_size network input size 36 | action_size network output size 37 | """ 38 | self.lr = lr 39 | self.gamma = gamma 40 | self.state_size = state_size 41 | self.action_size = action_size 42 | self.total_steps = 0 43 | self.n_hidden_1 = n_hidden_1 44 | self.n_hidden_2 = n_hidden_2 45 | self.scope = scope 46 | 47 | self._build_policy_net() 48 | 49 | 50 | def _build_policy_net(self): 51 | """Build policy network""" 52 | with tf.variable_scope(self.scope): 53 | self.state_input = tf.placeholder(tf.float32, [None, self.state_size]) 54 | self.action = tf.placeholder(tf.int32, [None]) 55 | self.target = tf.placeholder(tf.float32, [None]) 56 | 57 | layer_1 = tf_utils.fc(self.state_input, self.n_hidden_1, tf.nn.relu) 58 | layer_2 = tf_utils.fc(layer_1, self.n_hidden_2, tf.nn.relu) 59 | 60 | self.action_values = tf_utils.fc(layer_2, self.action_size) 61 | action_mask = tf.one_hot(self.action, self.action_size, 1.0, 0.0) 62 | self.action_prob = tf.nn.softmax(self.action_values) 63 | self.action_value_pred = tf.reduce_sum(self.action_prob * action_mask, 1) 64 | 65 | # l2 regularization 66 | self.l2_loss = tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() ]) 67 | self.pg_loss = tf.reduce_mean(-tf.log(self.action_value_pred) * self.target) 68 | 69 | self.loss = self.pg_loss + 0.002 * self.l2_loss 70 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 71 | self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) 72 | 73 | 74 | def get_action(self, state, sess): 75 | """Returns stochastic policy""" 76 | pi = self.get_policy(state, sess) 77 | return np.random.choice(range(self.action_size), p=pi) 78 | 79 | 80 | def get_policy(self, state, sess): 81 | """returns policy as probability distribution of actions""" 82 | pi = sess.run(self.action_prob, feed_dict={self.state_input: [state]}) 83 | return pi[0] 84 | 85 | 86 | def learn(self, episode, sess, train_epoch = 1): 87 | for t in xrange(len(episode)): 88 | self.total_steps = self.total_steps + 1 89 | target = sum([self.gamma**i * r for i, (s, a, s1, r, d) in enumerate(episode[t:])]) 90 | state, action, next_state, reward, done = episode[t] 91 | feed_dict = { self.state_input: [state], self.target: [target], self.action: [action] } 92 | _, loss = sess.run([self.train_op, self.loss], feed_dict) 
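Not part of the repository, but as a minimal illustration of what `learn()` in `reinforce.py` above computes: each step's training target is the discounted Monte Carlo return G_t = sum_i gamma^i * r_{t+i}, and `pg_loss` is the REINFORCE objective -log pi(a_t|s_t) * G_t. A standalone sketch, with a hypothetical helper name `discounted_returns`:

```python
import numpy as np

def discounted_returns(rewards, gamma=0.9):
    """Return [G_0, G_1, ...] where G_t = sum_{i>=0} gamma**i * rewards[t+i]."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running  # G_t = r_t + gamma * G_{t+1}
        returns[t] = running
    return returns

# A 3-step episode as the cartpole scripts record it: reward 1 per step,
# FAIL_PENALTY (-100) on the terminal step.
print(discounted_returns([1.0, 1.0, -100.0], gamma=0.9))
# -> roughly [-79.1, -89.0, -100.0]
```

The single backward pass above is equivalent to the per-step sums in `learn()`, just O(T) rather than O(T^2) per episode.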
-------------------------------------------------------------------------------- /policy_gradient/reinforce_w_baseline.py: -------------------------------------------------------------------------------- 1 | # Policy Gradient Agent 2 | # - REINFORCE algorithm with baseline 3 | # - Policy/value function approximation 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | import gym 11 | import numpy as np 12 | import random 13 | import tensorflow as tf 14 | import tensorflow.contrib.slim as slim 15 | import tf_utils 16 | 17 | 18 | class PolicyGradientNNAgent(): 19 | 20 | def __init__(self, 21 | lr=0.5, 22 | gamma=0.99, 23 | state_size=4, 24 | action_size=2, 25 | n_hidden_1=20, 26 | n_hidden_2=20, 27 | scope="pg" 28 | ): 29 | """ 30 | args 31 | epsilon exploration rate 32 | epsilon_anneal linear decay rate per call of learn() function (iteration) 33 | end_epsilon lowest exploration rate 34 | lr learning rate 35 | gamma discount factor 36 | state_size network input size 37 | action_size network output size 38 | """ 39 | self.lr = lr 40 | self.gamma = gamma 41 | self.state_size = state_size 42 | self.action_size = action_size 43 | self.total_steps = 0 44 | self.n_hidden_1 = n_hidden_1 45 | self.n_hidden_2 = n_hidden_2 46 | self.scope = scope 47 | 48 | self._build_policy_net() 49 | 50 | 51 | 52 | def _build_policy_net(self): 53 | """Build policy network""" 54 | with tf.variable_scope(self.scope): 55 | self.state_input = tf.placeholder(tf.float32, [None, self.state_size]) 56 | self.action = tf.placeholder(tf.int32, [None]) 57 | self.target = tf.placeholder(tf.float32, [None]) 58 | 59 | layer_1 = tf_utils.fc(self.state_input, self.n_hidden_1, tf.nn.relu) 60 | layer_2 = tf_utils.fc(layer_1, self.n_hidden_2, tf.nn.relu) 61 | 62 | self.value = tf_utils.fc(layer_2, 1) 63 | 64 | self.action_values = tf_utils.fc(layer_2, self.action_size) 65 | action_mask = tf.one_hot(self.action, self.action_size, 1.0, 0.0) 66 | self.action_value_pred = tf.reduce_sum(tf.nn.softmax(self.action_values) * action_mask, 1) 67 | 68 | self.action_probs = tf.nn.softmax(self.action_values) 69 | self.value_loss = tf.reduce_mean(tf.square(self.target - self.value)) 70 | self.pg_loss = tf.reduce_mean(-tf.log(self.action_value_pred) * (self.target - self.value)) 71 | self.l2_loss = tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() ]) 72 | self.loss = self.pg_loss + 5*self.value_loss + 0.002 * self.l2_loss 73 | 74 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 75 | self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) 76 | 77 | 78 | def get_action(self, state, sess): 79 | """Returns stochastic policy""" 80 | pi = self.get_policy(state, sess) 81 | return np.random.choice(range(self.action_size), p=pi) 82 | 83 | 84 | def get_policy(self, state, sess): 85 | """returns policy as probability distribution of actions""" 86 | pi = sess.run(self.action_probs, feed_dict={self.state_input: [state]}) 87 | return pi[0] 88 | 89 | 90 | def learn(self, episode, sess, train_epoch = 1): 91 | for t in xrange(len(episode)): 92 | self.total_steps = self.total_steps + 1 93 | target = sum([self.gamma**i * r for i, (s, a, s1, r, d) in enumerate(episode[t:])]) 94 | state, action, next_state, reward, done = episode[t] 95 | feed_dict = { self.state_input: [state], self.target: [target], self.action: [action] } 96 | _, loss, v, pg_loss, v_a = sess.run([self.train_op, self.loss, self.value, self.pg_loss, 
self.action_value_pred], feed_dict) 97 | # print target, v 98 | # print pg_loss, v, v_a, target, -np.log(v_a) * target -------------------------------------------------------------------------------- /policy_gradient/tf_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for tensorflow""" 2 | import tensorflow as tf 3 | 4 | 5 | def max_pool(x, k_sz=[2,2]): 6 | """max pooling layer wrapper 7 | Args 8 | x: 4d tensor [batch, height, width, channels] 9 | k_sz: The size of the window for each dimension of the input tensor 10 | Returns 11 | a max pooling layer 12 | """ 13 | return tf.nn.max_pool(x, ksize=[1, k_sz[0], k_sz[1], 1], strides=[1, k_sz[0], k_sz[1], 1], padding='SAME') 14 | 15 | def conv2d(x, n_kernel, k_sz, stride=1): 16 | """convolutional layer with relu activation wrapper 17 | Args: 18 | x: 4d tensor [batch, height, width, channels] 19 | n_kernel: number of kernels (output size) 20 | k_sz: 2d array, kernel size. e.g. [8,8] 21 | stride: stride 22 | Returns 23 | a conv2d layer 24 | """ 25 | W = tf.Variable(tf.random_normal([k_sz[0], k_sz[1], int(x.get_shape()[3]), n_kernel])) 26 | b = tf.Variable(tf.random_normal([n_kernel])) 27 | # - strides[0] and strides[1] must be 1 28 | # - padding can be 'VALID'(without padding) or 'SAME'(zero padding) 29 | # - http://stackoverflow.com/questions/37674306/what-is-the-difference-between-same-and-valid-padding-in-tf-nn-max-pool-of-t 30 | conv = tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME') 31 | conv = tf.nn.bias_add(conv, b) # add bias term 32 | return tf.nn.relu(conv) # rectified linear unit: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 33 | 34 | 35 | def fc(x, n_output, activation_fn=None): 36 | """fully connected layer with relu activation wrapper 37 | Args 38 | x: 2d tensor [batch, n_input] 39 | n_output output size 40 | """ 41 | W=tf.Variable(tf.random_normal([int(x.get_shape()[1]), n_output])) 42 | b=tf.Variable(tf.random_normal([n_output])) 43 | fc1 = tf.add(tf.matmul(x, W), b) 44 | if not activation_fn == None: 45 | fc1 = activation_fn(fc1) 46 | return fc1 47 | 48 | 49 | def flatten(x): 50 | """flatten a 4d tensor into 2d 51 | Args 52 | x: 4d tensor [batch, height, width, channels] 53 | Returns a flattened 2d tensor 54 | """ 55 | return tf.reshape(x, [-1, int(x.get_shape()[1]*x.get_shape()[2]*x.get_shape()[3])]) 56 | 57 | 58 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu 2 | gym[all] 3 | scikit-image 4 | scipy 5 | numpy 6 | --------------------------------------------------------------------------------
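As a closing sketch (again not repository code) of the difference between `reinforce.py` and `reinforce_w_baseline.py`: the baseline variant weights -log pi(a_t|s_t) by the advantage G_t - V(s_t) instead of the raw return G_t, which reduces the variance of the gradient estimate without changing its expectation. A minimal sketch, with a hypothetical helper name `policy_loss_weights`:

```python
import numpy as np

def policy_loss_weights(returns, value_estimates=None):
    """Per-step weights multiplying -log pi(a_t|s_t) in the policy loss."""
    returns = np.asarray(returns, dtype=np.float64)
    if value_estimates is None:
        return returns                      # plain REINFORCE: weight is G_t
    values = np.asarray(value_estimates, dtype=np.float64)
    return returns - values                 # with baseline: weight is G_t - V(s_t)

# Same returns as in the earlier sketch; a critic that already predicts values
# close to them shrinks the weights, so the update reacts mainly to the surprise.
print(policy_loss_weights([-79.1, -89.0, -100.0]))                         # roughly [-79.1, -89.0, -100.0]
print(policy_loss_weights([-79.1, -89.0, -100.0], [-80.0, -85.0, -95.0]))  # roughly [0.9, -4.0, -5.0]
```

In `reinforce_w_baseline.py` the value estimates come from the shared network's `value` head, which `value_loss` trains toward the same Monte Carlo returns.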