├── .gitignore ├── A3C ├── README.md ├── ac_net.py ├── acrobot_a3c.py ├── cartpole_a3c.py ├── imgs │ ├── a3c_acrobot.png │ ├── a3c_cartpole_el0.png │ ├── a3c_cartpole_el001.png │ ├── mountaincar_el1.png │ └── mountaincar_tmax15_el1.png ├── mountaincar_a3c.py ├── tf_utils.py └── worker.py ├── DP ├── __init__.py ├── policy_iteration.py ├── test_policy_iteration.py ├── test_value_iteration.py └── value_iteration.py ├── DQN ├── README.md ├── cartpole_dqn.py ├── dqn.py ├── exp_replay.py ├── imgs │ └── dqn_cartpole_training.png ├── test_exp_replay.py └── tf_utils.py ├── LICENSE ├── README.md ├── TD ├── __init__.py ├── cartpole_qlearning.py ├── qlearning.py └── test_qlearning.py ├── ddpg ├── README.md ├── actor.py ├── critic.py ├── ddpg.py ├── exp_replay.py ├── imgs │ └── ddpg_plot.png ├── mountaincar_ddpg.py ├── ou.py ├── pendulum_ddpg.py └── tf_utils.py ├── envs ├── __init__.py ├── env.py ├── gridworld.py ├── mdp.py └── test_gridworld.py ├── imgs └── breakout10.gif ├── monte_carlo ├── monte_carlo.py └── test_monte_carlo.py ├── papers ├── AlphaGoNaturePaper.pdf ├── GAN.pdf ├── Learning2learn_by_GD_by_GD.pdf ├── a3c.pdf ├── browne_mcts_survey_ieee12.pdf ├── ddpg.pdf ├── ddqn.pdf ├── dpg_silver14.pdf ├── dqn.pdf ├── dqn_nature.pdf ├── drl_bench_mark2016.pdf ├── dueling_dqn.pdf └── learn2rl.pdf ├── policy_gradient ├── README.md ├── cartpole_reinforce.py ├── cartpole_reinforce_baseline.py ├── imgs │ ├── cartpole_reinforce.png │ └── cartpole_reinforce_w_baseline.png ├── reinforce.py ├── reinforce_w_baseline.py └── tf_utils.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .coverage 3 | *.pyc 4 | run_tests.sh 5 | run_format.sh 6 | push.sh 7 | *.png 8 | *.jpg 9 | visualize_history.py 10 | log.txt 11 | nohup.out 12 | *log.txt 13 | *log*.txt 14 | tmp/* 15 | *logs/ -------------------------------------------------------------------------------- /A3C/README.md: -------------------------------------------------------------------------------- 1 | ## Asynchronized Advantage Actor-Critic 2 | 3 | Following paper: Asynchronous Methods for Deep Reinforcement Learning [(https://arxiv.org/pdf/1602.01783.pdf)](https://arxiv.org/pdf/1602.01783.pdf) 4 | 5 | #### Cartpole-v0 result 6 | 7 | `$ python cartpole_a3c.py --device=cpu --episodes=1000 --workers=4 --log_dir=cartpole_logs` 8 | 9 | The following graph shows the episode rewards (# workers: 4, entropy loss: 0.2) 10 | 11 | Tensorboard: 12 | 13 | `$ tensorboard --logdir=cartpole_logs/` 14 | 15 | ![A3C training](imgs/a3c_cartpole_el001.png "A3C training") 16 | 17 | 21 | 22 | #### Acrobot-v1 result 23 | 24 | `$ python acrobot_a3c.py --device=cpu --episodes=500 --workers=4 --log_dir=acrobot_logs` 25 | 26 | The following graph shows the episode rewards (# workers: 4, entropy loss: 0.2) 27 | 28 | ![A3C training](imgs/a3c_acrobot.png "A3C training") 29 | 30 | #### MountainCar-v0 result 31 | 32 | `$ python mountaincar_a3c.py --device=cpu --episodes=20000 --workers=8 --log_dir=mc_logs` 33 | 34 | The following graph shows the episode rewards (# workers: 8, entropy loss: 1.0, tmax=5) 35 | 36 | ![A3C training](imgs/mountaincar_tmax15_el1.png "A3C training") 37 | 38 | 39 | #### References 40 | 41 | - Openai's A3C implementation ([https://github.com/openai/universe-starter-agent](https://github.com/openai/universe-starter-agent)) 42 | - Arthur Juliani's blog post 
([https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2)) 43 | -------------------------------------------------------------------------------- /A3C/ac_net.py: -------------------------------------------------------------------------------- 1 | '''Actor-critic network class for a3c''' 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | import tf_utils 6 | 7 | 8 | class AC_Net(object): 9 | '''Actor-critic network class for a3c''' 10 | 11 | def __init__(self, state_size, action_size, lr, 12 | name, n_h1=400, n_h2=300, global_name='global'): 13 | 14 | self.state_size = state_size 15 | self.action_size = action_size 16 | self.name = name 17 | self.n_h1 = n_h1 18 | self.n_h2 = n_h2 19 | 20 | self.optimizer = tf.train.AdamOptimizer(lr) 21 | self.input_s, self.input_a, self.advantage, self.target_v, self.policy, self.value, self.action_est, self.model_variables = self._build_network( 22 | name) 23 | 24 | # 0.5, 0.2, 1.0 25 | self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value, [-1]))) 26 | self.entropy_loss = 1.0 * tf.reduce_sum(self.policy * tf.log(self.policy)) 27 | self.policy_loss = 1.0 * tf.reduce_sum(-tf.log(self.action_est) * self.advantage) 28 | self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.model_variables]) 29 | # self.loss = 0.5 * self.value_loss + self.policy_loss + 0.2 * self.entropy_loss 30 | self.loss = self.value_loss + self.policy_loss + self.entropy_loss 31 | self.gradients = tf.gradients(self.loss, self.model_variables) 32 | if name != global_name: 33 | self.var_norms = tf.global_norm(self.model_variables) 34 | global_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, global_name) 35 | self.apply_gradients = self.optimizer.apply_gradients(zip(self.gradients, global_variables)) 36 | 37 | def _build_network(self, name): 38 | input_s = tf.placeholder(tf.float32, [None, self.state_size]) 39 | input_a = tf.placeholder(tf.int32, [None]) 40 | advantage = tf.placeholder(tf.float32, [None]) 41 | target_v = tf.placeholder(tf.float32, [None]) 42 | 43 | with tf.variable_scope(name): 44 | layer_1 = tf_utils.fc( 45 | input_s, 46 | self.n_h1, 47 | scope="fc1", 48 | activation_fn=tf.nn.relu, 49 | initializer=tf.contrib.layers.variance_scaling_initializer( 50 | mode="FAN_IN")) 51 | layer_2 = tf_utils.fc( 52 | layer_1, 53 | self.n_h2, 54 | scope="fc2", 55 | activation_fn=tf.nn.relu, 56 | initializer=tf.contrib.layers.variance_scaling_initializer( 57 | mode="FAN_IN")) 58 | policy = tf_utils.fc( 59 | layer_2, 60 | self.action_size, 61 | activation_fn=tf.nn.softmax, 62 | scope="policy", 63 | initializer=tf_utils.normalized_columns_initializer(0.01)) 64 | value = tf_utils.fc(layer_2, 1, activation_fn=None, 65 | scope="value", initializer=tf_utils.normalized_columns_initializer(1.0)) 66 | 67 | action_mask = tf.one_hot(input_a, self.action_size, 1.0, 0.0) 68 | action_est = tf.reduce_sum(policy * action_mask, 1) 69 | 70 | model_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) 71 | return input_s, input_a, advantage, target_v, policy, value, action_est, model_variables 72 | 73 | def get_action(self, state, sess): 74 | state = np.reshape(state, [-1, self.state_size]) 75 | policy = sess.run(self.policy, feed_dict={self.input_s: state}) 76 | return np.random.choice(range(self.action_size), 
p=policy[0]) 77 | 78 | def predict_policy(self, state, sess): 79 | state = np.reshape(state, [-1, self.state_size]) 80 | policy = sess.run(self.policy, feed_dict={self.input_s: state}) 81 | return policy[0] 82 | 83 | def predict_value(self, state, sess): 84 | state = np.reshape(state, [-1, self.state_size]) 85 | return sess.run(self.value, feed_dict={self.input_s: state}) 86 | -------------------------------------------------------------------------------- /A3C/acrobot_a3c.py: -------------------------------------------------------------------------------- 1 | '''Example of A3C running on Acrobot environment 2 | ''' 3 | import argparse 4 | import time 5 | import threading 6 | import tensorflow as tf 7 | import gym 8 | # import multiprocessing 9 | 10 | import ac_net 11 | import worker 12 | 13 | PARSER = argparse.ArgumentParser(description=None) 14 | PARSER.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 15 | PARSER.add_argument('-e', '--episodes', default=500, type=int, help='number of episodes') 16 | PARSER.add_argument('-w', '--workers', default=4, type=int, help='number of workers') 17 | PARSER.add_argument('-l', '--log_dir', default='acrobot_logs', type=str, help='log directory') 18 | ARGS = PARSER.parse_args() 19 | print ARGS 20 | 21 | DEVICE = ARGS.device 22 | STATE_SIZE = 6 23 | ACTION_SIZE = 3 24 | LEARNING_RATE = 0.0001 25 | GAMMA = 0.99 26 | T_MAX = 5 27 | # NUM_WORKERS = multiprocessing.cpu_count() 28 | NUM_WORKERS = ARGS.workers 29 | NUM_EPISODES = ARGS.episodes 30 | LOG_DIR = ARGS.log_dir 31 | 32 | N_H1 = 300 33 | N_H2 = 300 34 | 35 | 36 | def main(): 37 | '''Example of A3C running on Acrobot environment''' 38 | tf.reset_default_graph() 39 | 40 | history = [] 41 | 42 | with tf.device('/{}:0'.format(DEVICE)): 43 | sess = tf.Session() 44 | global_model = ac_net.AC_Net( 45 | STATE_SIZE, 46 | ACTION_SIZE, 47 | LEARNING_RATE, 48 | 'global', 49 | n_h1=N_H1, 50 | n_h2=N_H2) 51 | workers = [] 52 | for i in xrange(NUM_WORKERS): 53 | env = gym.make('Acrobot-v1') 54 | env._max_episode_steps = 3000 55 | workers.append(worker.Worker(env, 56 | state_size=STATE_SIZE, action_size=ACTION_SIZE, 57 | worker_name='worker_{}'.format(i), global_name='global', 58 | lr=LEARNING_RATE, gamma=GAMMA, t_max=T_MAX, sess=sess, 59 | history=history, n_h1=N_H1, n_h2=N_H2, logdir=LOG_DIR)) 60 | 61 | sess.run(tf.global_variables_initializer()) 62 | 63 | for workeri in workers: 64 | worker_work = lambda: workeri.work(NUM_EPISODES) 65 | thread = threading.Thread(target=worker_work) 66 | thread.start() 67 | 68 | 69 | if __name__ == "__main__": 70 | main() 71 | -------------------------------------------------------------------------------- /A3C/cartpole_a3c.py: -------------------------------------------------------------------------------- 1 | '''Example of A3C running on Cartpole environment''' 2 | import argparse 3 | import time 4 | import threading 5 | import tensorflow as tf 6 | import gym 7 | # import multiprocessing 8 | 9 | import ac_net 10 | import worker 11 | 12 | PARSER = argparse.ArgumentParser(description=None) 13 | PARSER.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 14 | PARSER.add_argument('-e', '--episodes', default=1000, type=int, help='number of episodes') 15 | PARSER.add_argument('-w', '--workers', default=4, type=int, help='number of workers') 16 | PARSER.add_argument('-l', '--log_dir', default='cartpole_logs', type=str, help='log directory') 17 | ARGS = PARSER.parse_args() 18 | print ARGS 19 | 20 | 21 | DEVICE = 
ARGS.device 22 | STATE_SIZE = 4 23 | ACTION_SIZE = 2 24 | LEARNING_RATE = 0.0001 25 | GAMMA = 0.99 26 | T_MAX = 5 27 | # NUM_WORKERS = multiprocessing.cpu_count() 28 | NUM_WORKERS = ARGS.workers 29 | NUM_EPISODES = ARGS.episodes 30 | LOG_DIR = ARGS.log_dir 31 | 32 | 33 | N_H1 = 300 34 | N_H2 = 300 35 | 36 | 37 | def main(): 38 | '''Example of A3C running on Cartpole environment''' 39 | tf.reset_default_graph() 40 | 41 | history = [] 42 | 43 | with tf.device('/{}:0'.format(DEVICE)): 44 | sess = tf.Session() 45 | global_model = ac_net.AC_Net( 46 | STATE_SIZE, 47 | ACTION_SIZE, 48 | LEARNING_RATE, 49 | 'global', 50 | n_h1=N_H1, 51 | n_h2=N_H2) 52 | workers = [] 53 | for i in xrange(NUM_WORKERS): 54 | env = gym.make('CartPole-v0') 55 | env._max_episode_steps = 200 56 | workers.append(worker.Worker(env, 57 | state_size=STATE_SIZE, action_size=ACTION_SIZE, 58 | worker_name='worker_{}'.format(i), global_name='global', 59 | lr=LEARNING_RATE, gamma=GAMMA, t_max=T_MAX, sess=sess, 60 | history=history, n_h1=N_H1, n_h2=N_H2, logdir=LOG_DIR)) 61 | 62 | sess.run(tf.global_variables_initializer()) 63 | 64 | for workeri in workers: 65 | worker_work = lambda: workeri.work(NUM_EPISODES) 66 | thread = threading.Thread(target=worker_work) 67 | thread.start() 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /A3C/imgs/a3c_acrobot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/A3C/imgs/a3c_acrobot.png -------------------------------------------------------------------------------- /A3C/imgs/a3c_cartpole_el0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/A3C/imgs/a3c_cartpole_el0.png -------------------------------------------------------------------------------- /A3C/imgs/a3c_cartpole_el001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/A3C/imgs/a3c_cartpole_el001.png -------------------------------------------------------------------------------- /A3C/imgs/mountaincar_el1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/A3C/imgs/mountaincar_el1.png -------------------------------------------------------------------------------- /A3C/imgs/mountaincar_tmax15_el1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/A3C/imgs/mountaincar_tmax15_el1.png -------------------------------------------------------------------------------- /A3C/mountaincar_a3c.py: -------------------------------------------------------------------------------- 1 | '''Example of A3C running on MountainCar environment''' 2 | import argparse 3 | import time 4 | import threading 5 | import tensorflow as tf 6 | import gym 7 | # import multiprocessing 8 | 9 | import ac_net 10 | import worker 11 | 12 | 13 | PARSER = argparse.ArgumentParser(description=None) 14 | PARSER.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 15 | 
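# NOTE: MountainCar-v0 yields a reward of -1 on every step until the goal is reached,
# so this script's default episode budget is much larger than in the CartPole/Acrobot scripts.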
PARSER.add_argument('-e', '--episodes', default=20000, type=int, help='number of episodes') 16 | PARSER.add_argument('-w', '--workers', default=8, type=int, help='number of workers') 17 | PARSER.add_argument('-l', '--log_dir', default='mountaincar_logs', type=str, help='log directory') 18 | ARGS = PARSER.parse_args() 19 | print ARGS 20 | 21 | 22 | DEVICE = ARGS.device 23 | ENV_NAME = 'MountainCar-v0' 24 | ENV = gym.make('MountainCar-v0') 25 | STATE_SIZE = ENV.observation_space.shape[0] # 2 26 | ACTION_SIZE = ENV.action_space.n # 3 27 | LEARNING_RATE = 0.0001 28 | GAMMA = 0.99 29 | T_MAX = 5 30 | # NUM_WORKERS = multiprocessing.cpu_count() 31 | NUM_WORKERS = ARGS.workers 32 | NUM_EPISODES = ARGS.episodes 33 | MAX_STEPS = 10000 34 | LOG_DIR = ARGS.log_dir 35 | 36 | 37 | N_H1 = 300 38 | N_H2 = 300 39 | 40 | 41 | def main(): 42 | '''Example of A3C running on MountainCar environment''' 43 | tf.reset_default_graph() 44 | 45 | history = [] 46 | 47 | with tf.device('/{}:0'.format(DEVICE)): 48 | sess = tf.Session() 49 | global_model = ac_net.AC_Net( 50 | STATE_SIZE, 51 | ACTION_SIZE, 52 | LEARNING_RATE, 53 | 'global', 54 | n_h1=N_H1, 55 | n_h2=N_H2) 56 | workers = [] 57 | for i in xrange(NUM_WORKERS): 58 | env = gym.make(ENV_NAME) 59 | env._max_episode_steps = MAX_STEPS 60 | workers.append(worker.Worker(env, 61 | state_size=STATE_SIZE, action_size=ACTION_SIZE, 62 | worker_name='worker_{}'.format(i), global_name='global', 63 | lr=LEARNING_RATE, gamma=GAMMA, t_max=T_MAX, sess=sess, 64 | history=history, n_h1=N_H1, n_h2=N_H2, logdir=LOG_DIR)) 65 | 66 | sess.run(tf.global_variables_initializer()) 67 | 68 | for workeri in workers: 69 | worker_work = lambda: workeri.work(NUM_EPISODES) 70 | thread = threading.Thread(target=worker_work) 71 | thread.start() 72 | 73 | 74 | if __name__ == "__main__": 75 | main() 76 | -------------------------------------------------------------------------------- /A3C/tf_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for tensorflow""" 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | 6 | def max_pool(x, k_sz=[2, 2]): 7 | """max pooling layer wrapper 8 | Args 9 | x: 4d tensor [batch, height, width, channels] 10 | k_sz: The size of the window for each dimension of the input tensor 11 | Returns 12 | a max pooling layer 13 | """ 14 | return tf.nn.max_pool( 15 | x, ksize=[ 16 | 1, k_sz[0], k_sz[1], 1], strides=[ 17 | 1, k_sz[0], k_sz[1], 1], padding='SAME') 18 | 19 | 20 | def conv2d(x, n_kernel, k_sz, stride=1): 21 | """convolutional layer with relu activation wrapper 22 | Args: 23 | x: 4d tensor [batch, height, width, channels] 24 | n_kernel: number of kernels (output size) 25 | k_sz: 2d array, kernel size. e.g. 
[8,8] 26 | stride: stride 27 | Returns 28 | a conv2d layer 29 | """ 30 | W = tf.Variable(tf.random_normal([k_sz[0], k_sz[1], int(x.get_shape()[3]), n_kernel])) 31 | b = tf.Variable(tf.random_normal([n_kernel])) 32 | # - strides[0] and strides[1] must be 1 33 | # - padding can be 'VALID'(without padding) or 'SAME'(zero padding) 34 | # - http://stackoverflow.com/questions/37674306/what-is-the-difference-between-same-and-valid-padding-in-tf-nn-max-pool-of-t 35 | conv = tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME') 36 | conv = tf.nn.bias_add(conv, b) # add bias term 37 | # rectified linear unit: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 38 | return tf.nn.relu(conv) 39 | 40 | 41 | def fc(x, n_output, scope="fc", activation_fn=None, initializer=None): 42 | """fully connected layer with relu activation wrapper 43 | Args 44 | x: 2d tensor [batch, n_input] 45 | n_output output size 46 | """ 47 | with tf.variable_scope(scope): 48 | if initializer is None: 49 | # default initialization 50 | W = tf.Variable(tf.random_normal([int(x.get_shape()[1]), n_output])) 51 | b = tf.Variable(tf.random_normal([n_output])) 52 | else: 53 | W = tf.get_variable("W", shape=[int(x.get_shape()[1]), n_output], initializer=initializer) 54 | b = tf.get_variable("b", shape=[n_output], 55 | initializer=tf.constant_initializer(.0, dtype=tf.float32)) 56 | fc1 = tf.add(tf.matmul(x, W), b) 57 | if not activation_fn is None: 58 | fc1 = activation_fn(fc1) 59 | return fc1 60 | 61 | 62 | def flatten(x): 63 | """flatten a 4d tensor into 2d 64 | Args 65 | x: 4d tensor [batch, height, width, channels] 66 | Returns a flattened 2d tensor 67 | """ 68 | return tf.reshape(x, [-1, int(x.get_shape()[1] * x.get_shape()[2] * x.get_shape()[3])]) 69 | 70 | 71 | def update_target_graph(from_scope, to_scope): 72 | from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) 73 | to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) 74 | 75 | op_holder = [] 76 | for from_var, to_var in zip(from_vars, to_vars): 77 | op_holder.append(to_var.assign(from_var)) 78 | return op_holder 79 | 80 | 81 | # Used to initialize weights for policy and value output layers 82 | def normalized_columns_initializer(std=1.0): 83 | def _initializer(shape, dtype=None, partition_info=None): 84 | out = np.random.randn(*shape).astype(np.float32) 85 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 86 | return tf.constant(out) 87 | return _initializer 88 | -------------------------------------------------------------------------------- /A3C/worker.py: -------------------------------------------------------------------------------- 1 | '''Worker class for A3C''' 2 | from collections import namedtuple 3 | import random 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | import ac_net 8 | import tf_utils 9 | 10 | MAX_STEPS = 10000 11 | 12 | 13 | Step = namedtuple('Step', 'cur_step action next_step reward done') 14 | 15 | 16 | class Worker(object): 17 | '''Worker class for A3C''' 18 | 19 | def __init__(self, env, state_size, action_size, 20 | worker_name, global_name, lr, gamma, t_max, sess, 21 | history, n_h1=400, n_h2=300, logdir='logs'): 22 | self.env = env 23 | self.name = worker_name 24 | self.gamma = gamma 25 | self.sess = sess 26 | self.t_max = t_max 27 | self.history = history 28 | 29 | self.local_model = ac_net.AC_Net(state_size, action_size, lr, 30 | worker_name, n_h1=n_h1, n_h2=n_h2, global_name=global_name) 31 | self.copy_to_local_op = tf_utils.update_target_graph(global_name, 
worker_name) 32 | 33 | self.summary_writer = tf.summary.FileWriter("{}/train_{}".format(logdir, worker_name)) 34 | 35 | def _copy_to_local(self): 36 | self.sess.run(self.copy_to_local_op) 37 | 38 | def work(self, n_episodes): 39 | episode_i = 0 40 | episode_len = 1 41 | cur_state = self.env.reset() 42 | count = 1 43 | cum_reward = 0 44 | while episode_i < n_episodes: 45 | # 1) sync from global model to local model 46 | self._copy_to_local() 47 | # 2) collect t_max steps (if terminated then i++) 48 | steps = [] 49 | for _ in xrange(self.t_max): 50 | action = self.local_model.get_action(cur_state, self.sess) 51 | next_state, reward, done, info = self.env.step(action) 52 | cum_reward += reward 53 | episode_len = episode_len + 1 54 | steps.append( 55 | Step( 56 | cur_step=cur_state, 57 | action=action, 58 | next_step=next_state, 59 | reward=reward, 60 | done=done)) 61 | if done or episode_len >= MAX_STEPS: 62 | cur_state = self.env.reset() 63 | self.history.append(episode_len) 64 | summary = tf.Summary() 65 | summary.value.add(tag='Perf/episode_len', simple_value=float(episode_len)) 66 | summary.value.add(tag='Perf/episode_reward', simple_value=float(cum_reward)) 67 | self.summary_writer.add_summary(summary, episode_i) 68 | print 'worker {}: episode {} finished in {} steps, cumulative reward: {}'.format(self.name, episode_i, episode_len, cum_reward) 69 | print action 70 | print self.local_model.predict_policy(cur_state, self.sess) 71 | cum_reward = 0 72 | episode_i = episode_i + 1 73 | episode_len = 0 74 | break 75 | cur_state = next_state 76 | # 3) convert the t_max steps into a batch 77 | if steps[-1].done: 78 | R = 0 79 | else: 80 | R = self.local_model.predict_value(cur_state, self.sess) 81 | R_batch = np.zeros(len(steps)) 82 | advantage_batch = np.zeros(len(steps)) 83 | target_v_batch = np.zeros(len(steps)) 84 | for i in reversed(xrange(len(steps))): 85 | step = steps[i] 86 | R = step.reward + self.gamma * R 87 | R_batch[i] = R 88 | cur_state_batch = [step.cur_step for step in steps] 89 | pred_v_batch = self.local_model.predict_value(cur_state_batch, self.sess) 90 | action_batch = [step.action for step in steps] 91 | advantage_batch = [R_batch[i] - pred_v_batch[i] for i in xrange(len(steps))] 92 | # 4) compute the gradient and update the global model 93 | action_batch = np.reshape(action_batch, [-1]) 94 | advantage_batch = np.reshape(advantage_batch, [-1]) 95 | R_batch = np.reshape(R_batch, [-1]) 96 | feed_dict = { 97 | self.local_model.input_s: cur_state_batch, 98 | self.local_model.input_a: action_batch, 99 | self.local_model.advantage: advantage_batch, 100 | self.local_model.target_v: R_batch, 101 | } 102 | v_l, p_l, e_l, loss, _, _, v_n = self.sess.run( 103 | [self.local_model.value_loss, 104 | self.local_model.policy_loss, 105 | self.local_model.entropy_loss, 106 | self.local_model.loss, 107 | self.local_model.gradients, 108 | self.local_model.apply_gradients, 109 | self.local_model.var_norms], 110 | feed_dict) 111 | 112 | mean_reward = np.mean([step.reward for step in steps]) 113 | mean_value = np.mean(R_batch) 114 | 115 | summary = tf.Summary() 116 | summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward)) 117 | summary.value.add(tag='Perf/Value', simple_value=float(mean_value)) 118 | summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l)) 119 | summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l)) 120 | summary.value.add(tag='Losses/Entropy', simple_value=float(e_l)) 121 | summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n)) 122 
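# One summary point is written per t_max-step update (indexed by `count`); the per-episode summaries above are indexed by episode.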
| self.summary_writer.add_summary(summary, count) 123 | count += 1 124 | -------------------------------------------------------------------------------- /DP/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/DP/__init__.py -------------------------------------------------------------------------------- /DP/policy_iteration.py: -------------------------------------------------------------------------------- 1 | # Policy iteration agent 2 | # Model-based learning which requires mdp. 3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | 10 | import math 11 | 12 | 13 | class PolicyIterationAgent(object): 14 | 15 | def __init__(self, mdp, gamma, iterations=100): 16 | """ 17 | The constructor performs policy iteration on mdp using dynamic programming 18 | --- 19 | args 20 | mdp: markov decision process that is required by value iteration agent 21 | gamma: discount factor 22 | """ 23 | self.mdp = mdp 24 | self.gamma = gamma 25 | states = mdp.get_states() 26 | # init values 27 | self.values = {} 28 | # policy is a map from state to action 29 | self.policy = {} 30 | 31 | for s in states: 32 | if mdp.is_terminal(s): 33 | self.values[s] = mdp.get_reward(s) 34 | else: 35 | self.values[s] = 0 36 | self.policy[s] = 0 37 | 38 | # estimate values 39 | for i in range(iterations): 40 | values_tmp = self.values.copy() 41 | policy_tmp = self.policy.copy() 42 | 43 | for s in states: 44 | # policy iteration 45 | if mdp.is_terminal(s): 46 | continue 47 | 48 | self.values[s] = sum([P_s1_s_a * (self.mdp.get_reward_sas(s, policy_tmp[s], s1) + self.gamma * values_tmp[s1]) 49 | for s1, P_s1_s_a in self.mdp.get_transition_states_and_probs(s, policy_tmp[s])]) 50 | 51 | # policy improvement 52 | actions = mdp.get_actions(s) 53 | v_a = [sum([P_s1_s_a * (self.mdp.get_reward_sas(s, policy_tmp[s], s1) + self.gamma * values_tmp[s1]) 54 | for s1, P_s1_s_a in self.mdp.get_transition_states_and_probs(s, a)]) 55 | for a in actions] 56 | self.policy[s] = actions[v_a.index(max(v_a))] 57 | 58 | def get_values(self): 59 | """ 60 | returns 61 | a dictionary {} 62 | """ 63 | return self.values 64 | 65 | def get_optimal_policy(self): 66 | """ 67 | returns 68 | a dictionary {} 69 | """ 70 | states = self.mdp.get_states() 71 | policy = {} 72 | for s in states: 73 | policy[s] = [(self.get_action(s), 1)] 74 | return policy 75 | 76 | def get_action(self, state): 77 | """ 78 | args 79 | state current state 80 | returns 81 | an action to take given the state 82 | """ 83 | return self.policy[state] 84 | -------------------------------------------------------------------------------- /DP/test_policy_iteration.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | if "../" not in sys.path: 4 | sys.path.append("../") 5 | from envs import gridworld 6 | import policy_iteration 7 | 8 | 9 | class PolicyIterationAgentTest(unittest.TestCase): 10 | """ 11 | Unit test for policy iteration agent 12 | """ 13 | 14 | def setUp(self): 15 | grid = [['0', '0', '0', '1'], 16 | ['0', 'x', '0', '-1'], 17 | ['0', '0', '0', '0']] 18 | 19 | self.grid = grid 20 | self.gw_non_deterministic = gridworld.GridWorld( 21 | grid, {(0, 3), (1, 3)}, 0.8) 22 | 23 | self.agent = policy_iteration.PolicyIterationAgent( 24 | self.gw_non_deterministic, 0.9, 20) 25 | 26 | def test_show_policy(self): 27 | 
print 'Show policy learned by policy iteration:' 28 | self.gw_non_deterministic.display_policy_grid( 29 | self.agent.get_optimal_policy()) 30 | 31 | def test_values(self): 32 | print 'Show policy iteration results:' 33 | self.gw_non_deterministic.display_value_grid(self.agent.values) 34 | 35 | 36 | if __name__ == '__main__': 37 | unittest.main() 38 | -------------------------------------------------------------------------------- /DP/test_value_iteration.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | if "../" not in sys.path: 4 | sys.path.append("../") 5 | from envs import gridworld 6 | import value_iteration 7 | 8 | 9 | class ValueIterationAgentTest(unittest.TestCase): 10 | """ 11 | Unit test for value iteration agent 12 | """ 13 | 14 | def setUp(self): 15 | grid = [['0', '0', '0', '1'], 16 | ['0', 'x', '0', '-1'], 17 | ['0', '0', '0', '0']] 18 | 19 | self.grid = grid 20 | self.gw_non_deterministic = gridworld.GridWorld( 21 | grid, {(0, 3), (1, 3)}, 0.8) 22 | 23 | self.agent = value_iteration.ValueIterationAgent( 24 | self.gw_non_deterministic, 0.9, 100) 25 | 26 | def test_eval_policy(self): 27 | print 'Show evaluation of the optimal policy:' 28 | self.gw_non_deterministic.display_value_grid( 29 | self.agent.eval_policy_dist(self.agent.get_policy_dist())) 30 | 31 | def test_show_policy(self): 32 | print 'Show policy learned by value iteration:' 33 | self.gw_non_deterministic.display_policy_grid( 34 | self.agent.get_optimal_policy()) 35 | 36 | def test_values(self): 37 | print 'Show value iteration results:' 38 | self.gw_non_deterministic.display_value_grid(self.agent.values) 39 | 40 | 41 | if __name__ == '__main__': 42 | unittest.main() 43 | -------------------------------------------------------------------------------- /DP/value_iteration.py: -------------------------------------------------------------------------------- 1 | # Value iteration agent 2 | # Model-based learning which requires mdp.
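# The backup applied in the constructor below is the Bellman optimality update,
#   V(s) <- max_a sum_{s'} P(s'|s,a) [R(s) + gamma * V(s')],
# swept over all non-terminal states for a fixed number of iterations.
#
# Minimal usage sketch (mirrors DP/test_value_iteration.py; the GridWorld
# arguments are illustrative):
#
#   from envs import gridworld
#   gw = gridworld.GridWorld(grid, {(0, 3), (1, 3)}, 0.8)
#   agent = ValueIterationAgent(gw, 0.9, iterations=100)
#   policy = agent.get_optimal_policy()   # {state: [(action, 1)]}
#   values = agent.get_values()           # {state: V(s)}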
3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | 10 | import math 11 | 12 | 13 | class ValueIterationAgent(object): 14 | 15 | def __init__(self, mdp, gamma, iterations=100): 16 | """ 17 | The constructor build a value model from mdp using dynamic programming 18 | --- 19 | args 20 | mdp: markov decision process that is required by value iteration agent 21 | gamma: discount factor 22 | """ 23 | self.mdp = mdp 24 | self.gamma = gamma 25 | states = mdp.get_states() 26 | # init values 27 | self.values = {} 28 | 29 | for s in states: 30 | if mdp.is_terminal(s): 31 | self.values[s] = mdp.get_reward(s) 32 | else: 33 | self.values[s] = 0 34 | 35 | # estimate values 36 | for i in range(iterations): 37 | values_tmp = self.values.copy() 38 | 39 | for s in states: 40 | if mdp.is_terminal(s): 41 | continue 42 | 43 | actions = mdp.get_actions(s) 44 | v_s = [] 45 | for a in actions: 46 | P_s1sa = mdp.get_transition_states_and_probs(s, a) 47 | R_sas1 = [mdp.get_reward(s1) for s1 in [p[0] for p in P_s1sa]] 48 | v_s.append(sum([P_s1sa[s1_id][1] * (mdp.get_reward(s) + gamma * \ 49 | values_tmp[P_s1sa[s1_id][0]]) for s1_id in range(len(P_s1sa))])) 50 | # V(s) = max_{a} \sum_{s'} P(s'| s, a) (R(s,a,s') + \gamma V(s')) 51 | self.values[s] = max(v_s) 52 | 53 | def get_values(self): 54 | """ 55 | returns 56 | a dictionary {} 57 | """ 58 | return self.values 59 | 60 | def get_q_values(self, state, action): 61 | """ 62 | returns qvalue of (state, action) 63 | """ 64 | return sum([P_s1_s_a*(self.mdp.get_reward_sas(s, a, s1) + self.gamma*self.values[s1]) 65 | for s1, P_s1_s_a in self.mdp.get_transition_states_and_probs(state, action)]) 66 | 67 | 68 | def eval_policy_dist(self, policy, iterations=100): 69 | """ 70 | evaluate a policy distribution 71 | returns 72 | a map {} 73 | """ 74 | values = {} 75 | states = self.mdp.get_states() 76 | for s in states: 77 | if self.mdp.is_terminal(s): 78 | values[s] = self.mdp.get_reward(s) 79 | else: 80 | values[s] = 0 81 | 82 | for i in range(iterations): 83 | values_tmp = values.copy() 84 | 85 | for s in states: 86 | if self.mdp.is_terminal(s): 87 | continue 88 | actions = self.mdp.get_actions(s) 89 | # v(s) = \sum_{a\in A} \pi(a|s) (R(s,a,s') + \gamma \sum_{s'\in S} 90 | # P(s'| s, a) v(s')) 91 | values[s] = sum([policy[s][i][1] * (self.mdp.get_reward(s) + self.gamma * sum([s1_p * values_tmp[s1] 92 | for s1, s1_p in self.mdp.get_transition_states_and_probs(s, actions[i])])) 93 | for i in range(len(actions))]) 94 | return values 95 | 96 | def get_optimal_policy(self): 97 | """ 98 | returns 99 | a dictionary {} 100 | """ 101 | states = self.mdp.get_states() 102 | policy = {} 103 | for s in states: 104 | policy[s] = [(self.get_action(s), 1)] 105 | return policy 106 | 107 | def get_policy_dist(self): 108 | """ 109 | returns 110 | a dictionary {} 111 | """ 112 | states = self.mdp.get_states() 113 | policy = {} 114 | for s in states: 115 | policy[s] = self.get_action_dist(s) 116 | return policy 117 | 118 | def get_action_dist(self, state): 119 | """ 120 | args 121 | state current state 122 | returns 123 | a list of {} pairs representing the action distribution on state 124 | """ 125 | actions = self.mdp.get_actions(state) 126 | # \sum_{s'} P(s'|s,a)*(R(s,a,s') + \gamma v(s')) 127 | v_a = [sum([s1_p * (self.mdp.get_reward_sas(state, a, s1) + self.gamma * self.values[s1]) 128 | for s1, s1_p in self.mdp.get_transition_states_and_probs(state, a)]) 129 | for a in actions] 130 | 131 | # I exponentiated the v_s^a's to 
make them positive 132 | v_a = [math.exp(v) for v in v_a] 133 | return [(actions[i], v_a[i] / sum(v_a)) for i in range(len(actions))] 134 | 135 | def get_action(self, state): 136 | """ 137 | args 138 | state current state 139 | returns 140 | an action to take given the state 141 | """ 142 | actions = self.mdp.get_actions(state) 143 | v_s = [] 144 | for a in actions: 145 | P_s1sa = self.mdp.get_transition_states_and_probs(state, a) 146 | R_sas1 = [self.mdp.get_reward(s1) for s1 in [p[0] for p in P_s1sa]] 147 | v_s.append(sum([P_s1sa[s1_id][1] * 148 | (self.mdp.get_reward(state) + 149 | self.gamma * 150 | self.values[P_s1sa[s1_id][0]]) for s1_id in range(len(P_s1sa))])) 151 | a_id = v_s.index(max(v_s)) 152 | return actions[a_id] 153 | -------------------------------------------------------------------------------- /DQN/README.md: -------------------------------------------------------------------------------- 1 | ## Deep Q-Learning 2 | 3 | Well-tuned DQN for low dimensional control tasks. 4 | 5 | #### Run Code 6 | 7 | `$ python cartpole_dqn.py --device=cpu --episodes=150 --model_dir=cartpole-model` to run code. 8 | 9 | `$ python cartpole_dqn.py -h` for help messages. 10 | 11 | #### Cartpole-v0 Result 12 | 13 | ![cartpole training](imgs/dqn_cartpole_training.png "cartpole training") 14 | -------------------------------------------------------------------------------- /DQN/cartpole_dqn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | import numpy as np 4 | import sys 5 | import tensorflow as tf 6 | import dqn 7 | import exp_replay 8 | from exp_replay import Step 9 | import matplotlib.pyplot as plt 10 | import os 11 | import pickle 12 | 13 | 14 | PARSER = argparse.ArgumentParser(description=None) 15 | PARSER.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 16 | PARSER.add_argument('-e', '--episodes', default=150, type=int, help='number of episodes') 17 | PARSER.add_argument('-m', '--model_dir', default='cartpole-model/', type=str, help='model directory') 18 | PARSER.add_argument('-t', '--train', default=False, type=str, help='train for [number of episodes] IF MODEL EXISTS') 19 | ARGS = PARSER.parse_args() 20 | print ARGS 21 | 22 | 23 | DEVICE = ARGS.device 24 | NUM_EPISODES = ARGS.episodes 25 | ACTIONS = {0:0, 1:1} 26 | MAX_STEPS = 300 27 | FAIL_PENALTY = 0 28 | EPSILON = 1 29 | EPSILON_DECAY = 0.01 30 | END_EPSILON = 0.1 31 | LEARNING_RATE = 0.001 32 | DISCOUNT_FACTOR = 0.9 33 | BATCH_SIZE = 32 34 | MEM_SIZE = 1e4 35 | START_MEM = 1e2 36 | STATE_SIZE = [4] 37 | EPOCH_SIZE = 100 38 | 39 | TRAIN = ARGS.train 40 | 41 | MODEL_DIR = ARGS.model_dir 42 | MODEL_PATH = MODEL_DIR + 'model' 43 | MEMORY_PATH = MODEL_DIR + 'memory.p' 44 | 45 | 46 | def train(agent, exprep, env): 47 | for i in xrange(NUM_EPISODES): 48 | cur_state = env.reset() 49 | for t in xrange(MAX_STEPS): 50 | action = agent.get_action(cur_state) 51 | next_state, reward, done, info = env.step(action) 52 | if done: 53 | reward = FAIL_PENALTY 54 | exprep.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 55 | print("Episode {} finished after {} timesteps".format(i, t + 1)) 56 | yield t + 1 57 | break 58 | exprep.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 59 | cur_state = next_state 60 | if t == MAX_STEPS - 1: 61 | print("Episode {} finished after {} timesteps".format(i, t + 1)) 62 | yield t + 1 63 | agent.epsilon_decay() 64 | 
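    # One training epoch per episode: EPOCH_SIZE minibatch updates, each on a batch
    # sampled uniformly at random from the replay memory (epsilon was annealed above).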
agent.learn_epoch(exprep, EPOCH_SIZE) 65 | print 'epsilon: {}'.format(agent.epsilon) 66 | 67 | 68 | env = gym.make('CartPole-v0') 69 | exprep = exp_replay.ExpReplay(mem_size=MEM_SIZE, start_mem=START_MEM, state_size=STATE_SIZE, kth=-1, batch_size=BATCH_SIZE) 70 | 71 | 72 | 73 | sess = tf.Session() 74 | with tf.device('/{}:0'.format(DEVICE)): 75 | agent = dqn.DQNAgent(session=sess, epsilon=EPSILON, epsilon_anneal=EPSILON_DECAY, end_epsilon=END_EPSILON, 76 | lr=LEARNING_RATE, gamma=DISCOUNT_FACTOR, state_size=4, 77 | action_size=len(ACTIONS), n_hidden_1=10, n_hidden_2=10) 78 | 79 | sess.run(tf.initialize_all_variables()) 80 | saver = tf.train.Saver() 81 | if os.path.isdir(MODEL_DIR): 82 | saver.restore(sess, MODEL_PATH) 83 | agent.epsilon = agent.end_epsilon 84 | print 'restored model' 85 | if TRAIN: 86 | exprep = pickle.load(open(MEMORY_PATH,"rb")) 87 | history = [e_length for e_length in train(agent, exprep, env)] 88 | saver.save(sess, MODEL_PATH) 89 | pickle.dump(exprep, open(MEMORY_PATH, "wb")) 90 | print 'saved model' 91 | # plot 92 | import matplotlib.pyplot as plt 93 | avg_reward = [np.mean(history[i*10:(i+1)*10]) for i in xrange(int(len(history)/10))] 94 | f_reward = plt.figure(1) 95 | plt.plot(np.linspace(0, len(history), len(avg_reward)), avg_reward) 96 | plt.ylabel('Episode length') 97 | plt.xlabel('Training episodes') 98 | f_reward.show() 99 | print 'press enter to continue' 100 | raw_input() 101 | plt.close() 102 | 103 | else: 104 | os.makedirs(MODEL_DIR) 105 | history = [e_length for e_length in train(agent, exprep, env)] 106 | saver.save(sess, MODEL_PATH) 107 | pickle.dump(exprep, open(MEMORY_PATH, "wb")) 108 | print 'saved model' 109 | # plot 110 | import matplotlib.pyplot as plt 111 | avg_reward = [np.mean(history[i*10:(i+1)*10]) for i in xrange(int(len(history)/10))] 112 | f_reward = plt.figure(1) 113 | plt.plot(np.linspace(0, len(history), len(avg_reward)), avg_reward) 114 | plt.ylabel('Episode length') 115 | plt.xlabel('Training episodes') 116 | f_reward.show() 117 | print 'press enter to continue' 118 | raw_input() 119 | plt.close() 120 | 121 | 122 | # Display: 123 | print 'press ctrl-c to stop' 124 | while True: 125 | cur_state = env.reset() 126 | done = False 127 | t = 0 128 | while not done: 129 | env.render() 130 | t = t+1 131 | action = agent.get_optimal_action(cur_state) 132 | next_state, reward, done, info = env.step(action) 133 | cur_state = next_state 134 | if done: 135 | print("Episode finished after {} timesteps".format(t+1)) 136 | break 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /DQN/dqn.py: -------------------------------------------------------------------------------- 1 | # Deep Q-learning agent with q-value approximation 2 | # Following paper: Playing Atari with Deep Reinforcement Learning 3 | # https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | 12 | import gym 13 | import numpy as np 14 | import random 15 | import tensorflow as tf 16 | import tf_utils 17 | 18 | 19 | class DQNAgent(): 20 | """ 21 | DQN Agent with a 2-hidden-layer fully-connected q-network that acts epsilon-greedily. 
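  The q-network is trained in learn_batch() to minimize the squared TD error
  against the target
      y = r + gamma * max_a' Q(s', a')      (y = r when the transition is terminal).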
22 | """ 23 | 24 | def __init__(self, 25 | session, 26 | epsilon=0.5, 27 | epsilon_anneal = 0.01, 28 | end_epsilon=0.1, 29 | lr=0.5, 30 | gamma=0.99, 31 | state_size=4, 32 | action_size=2, 33 | scope="dqn", 34 | n_hidden_1=20, 35 | n_hidden_2=20, 36 | ): 37 | """ 38 | args 39 | epsilon exploration rate 40 | epsilon_anneal linear decay rate per call of epsilon_decay() function 41 | end_epsilon lowest exploration rate 42 | lr learning rate 43 | gamma discount factor 44 | state_size network input size 45 | action_size network output size 46 | """ 47 | self.epsilon = epsilon 48 | self.epsilon_anneal = epsilon_anneal 49 | self.end_epsilon = end_epsilon 50 | self.lr = lr 51 | self.gamma = gamma 52 | self.state_size = state_size 53 | self.action_size = action_size 54 | self.scope = scope 55 | self.n_hidden_1 = n_hidden_1 56 | self.n_hidden_2 = n_hidden_2 57 | self._build_qnet() 58 | self.sess = session 59 | 60 | def _build_qnet(self): 61 | """ 62 | Build q-network 63 | """ 64 | with tf.variable_scope(self.scope): 65 | self.state_input = tf.placeholder(tf.float32, [None, self.state_size]) 66 | self.action = tf.placeholder(tf.int32, [None]) 67 | self.target_q = tf.placeholder(tf.float32, [None]) 68 | 69 | fc1 = tf_utils.fc(self.state_input, n_output=self.n_hidden_1, activation_fn=tf.nn.relu) 70 | fc2 = tf_utils.fc(fc1, n_output=self.n_hidden_2, activation_fn=tf.nn.relu) 71 | self.q_values = tf_utils.fc(fc2, self.action_size, activation_fn=None) 72 | 73 | action_mask = tf.one_hot(self.action, self.action_size, 1.0, 0.0) 74 | q_value_pred = tf.reduce_sum(self.q_values * action_mask, 1) 75 | 76 | self.loss = tf.reduce_mean(tf.square(tf.subtract(self.target_q, q_value_pred))) 77 | self.optimizer = tf.train.AdamOptimizer(self.lr) 78 | self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) 79 | 80 | def get_action_values(self, state): 81 | actions = self.sess.run(self.q_values, feed_dict={self.state_input: [state]}) 82 | return actions 83 | 84 | def get_optimal_action(self, state): 85 | actions = self.sess.run(self.q_values, feed_dict={self.state_input: [state]}) 86 | return actions.argmax() 87 | 88 | def get_action(self, state): 89 | """ 90 | Epsilon-greedy action 91 | 92 | args 93 | state current state 94 | returns 95 | an action to take given the state 96 | """ 97 | if np.random.random() < self.epsilon: 98 | # act randomly 99 | return np.random.randint(0, self.action_size) 100 | else: 101 | return self.get_optimal_action(state) 102 | 103 | def epsilon_decay(self): 104 | if self.epsilon > self.end_epsilon: 105 | self.epsilon = self.epsilon - self.epsilon_anneal 106 | 107 | def learn_epoch(self, exprep, num_steps): 108 | """ 109 | Deep Q-learing: train qnetwork for num_steps, for each step, sample a batch from exprep 110 | 111 | Args 112 | exprep: experience replay 113 | num_steps: num of steps 114 | """ 115 | for i in xrange(num_steps): 116 | self.learn_batch(exprep.sample()) 117 | 118 | def learn_batch(self, batch_steps): 119 | """ 120 | Deep Q-learing: train qnetwork with the input batch 121 | Args 122 | batch_steps: a batch of sampled namedtuple Step, where Step.cur_step and 123 | Step.next_step are of shape {self.state_size} 124 | sess: tf session 125 | Returns 126 | batch loss (-1 if input is empty) 127 | """ 128 | if len(batch_steps) == 0: 129 | return -1 130 | 131 | next_state_batch = [s.next_step for s in batch_steps] 132 | q_values = self.sess.run(self.q_values, feed_dict={self.state_input: next_state_batch}) 133 | 134 | max_q_values = 
q_values.max(axis=1) 135 | # compute target q value 136 | target_q = np.array([s.reward + self.gamma*max_q_values[i]*(1-s.done) for i,s in enumerate(batch_steps)]) 137 | target_q = target_q.reshape([len(batch_steps)]) 138 | 139 | # minimize the TD-error 140 | cur_state_batch = [s.cur_step for s in batch_steps] 141 | actions = [s.action for s in batch_steps] 142 | l, _, = self.sess.run([self.loss, self.train_op], feed_dict={ self.state_input: cur_state_batch, 143 | self.target_q: target_q, 144 | self.action: actions }) 145 | return l 146 | 147 | -------------------------------------------------------------------------------- /DQN/exp_replay.py: -------------------------------------------------------------------------------- 1 | # Experience Replay 2 | # Following paper: Playing Atari with Deep Reinforcement Learning 3 | # https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | 12 | import numpy as np 13 | import random 14 | from collections import namedtuple 15 | 16 | 17 | Step = namedtuple('Step','cur_step action next_step reward done') 18 | 19 | 20 | class ExpReplay(): 21 | """Experience replay""" 22 | 23 | 24 | def __init__(self, mem_size, start_mem=None, state_size=[84, 84], kth=4, drop_rate=0.2, batch_size=32): 25 | # k = -1 for sending raw state 26 | self.state_size = state_size 27 | self.drop_rate = drop_rate 28 | self.mem_size = mem_size 29 | self.start_mem = start_mem 30 | if start_mem == None: 31 | self.start_mem = mem_size/20 32 | self.kth = kth 33 | self.batch_size = batch_size 34 | self.mem = [] 35 | self.total_steps = 0 36 | 37 | 38 | def add_step(self, step): 39 | """ 40 | Store episode to memory and check if it reaches the mem_size. 
41 | If so, drop [self.drop_rate] of the oldest memory 42 | 43 | args 44 | step namedtuple Step, where step.cur_step and step.next_step are of size {state_size} 45 | """ 46 | self.mem.append(step) 47 | self.total_steps = self.total_steps + 1 48 | while len(self.mem) > self.mem_size: 49 | self.mem = self.mem[int(len(self.mem)*self.drop_rate):] 50 | 51 | 52 | def get_last_state(self): 53 | if len(self.mem) > abs(self.kth): 54 | if self.kth == -1: 55 | return self.mem[-1].cur_step 56 | if len(self.state_size) == 1: 57 | return [s.cur_step for s in self.mem[-abs(self.kth):]] 58 | last_state = np.stack([s.cur_step for s in self.mem[-abs(self.kth):]], axis=len(self.state_size)) 59 | return np.stack([s.cur_step for s in self.mem[-abs(self.kth):]], axis=len(self.state_size)) 60 | return [] 61 | 62 | 63 | def sample(self, num=None): 64 | """Randomly draw [num] samples""" 65 | if num == None: 66 | num = self.batch_size 67 | if len(self.mem) < self.start_mem: 68 | return [] 69 | sampled_idx = random.sample(range(abs(self.kth),len(self.mem)), num) 70 | samples = [] 71 | for idx in sampled_idx: 72 | steps = self.mem[idx-abs(self.kth):idx] 73 | cur_state = np.stack([s.cur_step for s in steps], axis=len(self.state_size)) 74 | next_state = np.stack([s.next_step for s in steps], axis=len(self.state_size)) 75 | # handle special cases 76 | if self.kth == -1: 77 | cur_state = steps[0].cur_step 78 | next_state = steps[0].next_step 79 | elif len(self.state_size) == 1: 80 | cur_state = [steps[0].cur_step] 81 | next_state = [steps[0].next_step] 82 | reward = steps[-1].reward 83 | action = steps[-1].action 84 | done = steps[-1].done 85 | samples.append(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 86 | return samples 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /DQN/imgs/dqn_cartpole_training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/DQN/imgs/dqn_cartpole_training.png -------------------------------------------------------------------------------- /DQN/test_exp_replay.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import exp_replay 3 | from exp_replay import Step 4 | import numpy as np 5 | 6 | 7 | class ExpReplayTest(unittest.TestCase): 8 | """ 9 | Unit test for ExpReplay class 10 | """ 11 | 12 | 13 | def test1(self): 14 | exprep = exp_replay.ExpReplay(mem_size=100, state_size=[1], kth=1) 15 | for i in xrange(120): 16 | exprep.add_step(Step(cur_step=i, action=0, next_step=i+1, reward=0, done=False)) 17 | self.assertEqual(len(exprep.mem), 100) 18 | self.assertEqual(exprep.mem[-1:][0].cur_step, 119) 19 | 20 | 21 | def test2(self): 22 | exprep = exp_replay.ExpReplay(mem_size=100, state_size=[1], kth=4) 23 | for i in xrange(120): 24 | exprep.add_step(Step(cur_step=i, action=0, next_step=i+1, reward=0, done=False)) 25 | self.assertEqual(len(exprep.mem), 100) 26 | self.assertEqual(exprep.mem[-1:][0].cur_step, 119) 27 | self.assertEqual(exprep.get_last_state(), [116,117,118,119]) 28 | 29 | 30 | def test3(self): 31 | exprep = exp_replay.ExpReplay(mem_size=100, state_size=[2,2], kth=4) 32 | for i in xrange(120): 33 | exprep.add_step(Step(cur_step=[[i,i],[i,i]], action=0, next_step=[[i+1,i+1],[i+1,i+1]], reward=0, done=False)) 34 | self.assertEqual(len(exprep.mem), 100) 35 | 
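    # With kth=4 and 2x2 states, get_last_state()/sample() stack the 4 most recent
    # frames along a new trailing axis, so returned states have shape (2, 2, 4).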
self.assertEqual(exprep.mem[-1:][0].cur_step, [[119,119],[119,119]]) 36 | last_state = exprep.get_last_state() 37 | 38 | self.assertEqual(np.shape(last_state),(2,2,4)) 39 | self.assertTrue(np.array_equal(last_state[:,:,0], [[116,116],[116,116]])) 40 | self.assertTrue(np.array_equal(last_state[:,:,1], [[117,117],[117,117]])) 41 | self.assertTrue(np.array_equal(last_state[:,:,2], [[118,118],[118,118]])) 42 | self.assertTrue(np.array_equal(last_state[:,:,3], [[119,119],[119,119]])) 43 | 44 | sample = exprep.sample(5) 45 | self.assertEqual(len(sample), 5) 46 | self.assertEqual(np.shape(sample[0].cur_step), (2,2,4)) 47 | self.assertEqual(np.shape(sample[0].next_step), (2,2,4)) 48 | 49 | 50 | def test4(self): 51 | # -1 for sending raw state 52 | exprep = exp_replay.ExpReplay(mem_size=100, state_size=[4], kth=-1) 53 | for i in xrange(120): 54 | exprep.add_step(Step(cur_step=[i,i,i,i], action=0, next_step=[i+1,i+1,i+1,i+1], reward=0, done=False)) 55 | last_state = exprep.get_last_state() 56 | self.assertEqual(np.shape(last_state),(4,)) 57 | self.assertTrue(np.array_equal(last_state, [119,119,119,119])) 58 | 59 | sample = exprep.sample(5) 60 | self.assertEqual(len(sample), 5) 61 | self.assertEqual(np.shape(sample[0].cur_step), (4,)) 62 | 63 | if __name__ == '__main__': 64 | unittest.main() 65 | -------------------------------------------------------------------------------- /DQN/tf_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for tensorflow""" 2 | import tensorflow as tf 3 | 4 | 5 | def max_pool(x, k_sz=[2,2]): 6 | """max pooling layer wrapper 7 | Args 8 | x: 4d tensor [batch, height, width, channels] 9 | k_sz: The size of the window for each dimension of the input tensor 10 | Returns 11 | a max pooling layer 12 | """ 13 | return tf.nn.max_pool(x, ksize=[1, k_sz[0], k_sz[1], 1], strides=[1, k_sz[0], k_sz[1], 1], padding='SAME') 14 | 15 | def conv2d(x, n_kernel, k_sz, stride=1): 16 | """convolutional layer with relu activation wrapper 17 | Args: 18 | x: 4d tensor [batch, height, width, channels] 19 | n_kernel: number of kernels (output size) 20 | k_sz: 2d array, kernel size. e.g. 
[8,8] 21 | stride: stride 22 | Returns 23 | a conv2d layer 24 | """ 25 | W = tf.Variable(tf.random_normal([k_sz[0], k_sz[1], int(x.get_shape()[3]), n_kernel])) 26 | b = tf.Variable(tf.random_normal([n_kernel])) 27 | # - strides[0] and strides[1] must be 1 28 | # - padding can be 'VALID'(without padding) or 'SAME'(zero padding) 29 | # - http://stackoverflow.com/questions/37674306/what-is-the-difference-between-same-and-valid-padding-in-tf-nn-max-pool-of-t 30 | conv = tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME') 31 | conv = tf.nn.bias_add(conv, b) # add bias term 32 | return tf.nn.relu(conv) # rectified linear unit: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 33 | 34 | 35 | def fc(x, n_output, activation_fn=None): 36 | """fully connected layer with relu activation wrapper 37 | Args 38 | x: 2d tensor [batch, n_input] 39 | n_output output size 40 | """ 41 | W=tf.Variable(tf.random_normal([int(x.get_shape()[1]), n_output])) 42 | b=tf.Variable(tf.random_normal([n_output])) 43 | fc1 = tf.add(tf.matmul(x, W), b) 44 | if not activation_fn == None: 45 | fc1 = activation_fn(fc1) 46 | return fc1 47 | 48 | 49 | def flatten(x): 50 | """flatten a 4d tensor into 2d 51 | Args 52 | x: 4d tensor [batch, height, width, channels] 53 | Returns a flattened 2d tensor 54 | """ 55 | return tf.reshape(x, [-1, int(x.get_shape()[1]*x.get_shape()[2]*x.get_shape()[3])]) 56 | 57 | 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Yiren Lu 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Implementation of Reinforcement Learning Algorithms in Python 2 | 3 | Implementation of selected reinforcement learning algorithms with tensorflow. 
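
For example, the DQN agent can be trained on CartPole by running the following from the `DQN/` directory (each algorithm's subdirectory README lists the corresponding command and result plots):

`$ python cartpole_dqn.py --device=cpu --episodes=150 --model_dir=cartpole-model`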
4 | 17 | 18 | ### Implemented Algorithms 19 | 20 | (Click into the links for more details) 21 | 22 | ##### Advanced 23 | 24 | - [Asynchronized Advantage Actor-Critic (A3C)](A3C/) 25 | - [Deep Deterministic Policy Gradient (DDPG)](ddpg/) 26 | 27 | ##### Policy Gradient Methods 28 | 29 | - [REINFORCE with policy function approximation](policy_gradient/) 30 | - [REINFORCE with baseline](policy_gradient/reinforce_w_baseline.py) 31 | 32 | ##### Temporal Difference Learning 33 | 34 | - [Standard epsilon greedy Q-learning](TD/qlearning.py) 35 | - [Deep Q-learning](DQN/) 36 | 37 | ##### Monte Carlo Methods 38 | 39 | - [Monte Carlo (MC) estimation of action values](monte_carlo/monte_carlo.py) 40 | 41 | ##### Dynamic Programming MDP Solver 42 | 43 | - [Value iteration](DP/value_iteration.py) 44 | - [Policy iteration - policy evaluation & policy improvement](DP/policy_iteration.py) 45 | 46 | ### Environments 47 | 48 | - `envs/gridworld.py`: minimium gridworld implementation for testings 49 | 50 | ### Dependencies 51 | 52 | - Python 2.7 53 | - Numpy 54 | - Tensorflow 0.12.1 55 | - OpenAI Gym (with Atari) 0.8.0 56 | - matplotlib (optional) 57 | 58 | ### Tests 59 | 60 | - Files: `test_*.py` 61 | - Run unit test for [class]: 62 | 63 | `python test_[class].py` 64 | 69 | ### MIT License 70 | 71 | -------------------------------------------------------------------------------- /TD/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/TD/__init__.py -------------------------------------------------------------------------------- /TD/cartpole_qlearning.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import qlearning 4 | import numpy 5 | import matplotlib.pyplot as plt 6 | 7 | NUM_EPISODES = 2000 8 | N_BINS = [8, 8, 8, 8] 9 | MAX_STEPS = 200 10 | FAIL_PENALTY = -100 11 | EPSILON = 0.5 12 | EPSILON_DECAY = 0.99 13 | LEARNING_RATE = 0.05 14 | DISCOUNT_FACTOR = 0.9 15 | 16 | RECORD = False 17 | 18 | MIN_VALUES = [-0.5, -2.0, -0.5, -3.0] 19 | MAX_VALUES = [0.5, 2.0, 0.5, 3.0] 20 | BINS = [numpy.linspace(MIN_VALUES[i], MAX_VALUES[i], N_BINS[i]) 21 | for i in xrange(4)] 22 | 23 | 24 | def discretize(obs): 25 | return tuple([int(numpy.digitize(obs[i], BINS[i])) for i in xrange(4)]) 26 | 27 | 28 | def train(agent, env, history, num_episodes=NUM_EPISODES): 29 | for i in xrange(NUM_EPISODES): 30 | if i % 100: 31 | print "Episode {}".format(i + 1) 32 | obs = env.reset() 33 | cur_state = discretize(obs) 34 | 35 | for t in xrange(MAX_STEPS): 36 | action = agent.get_action(cur_state) 37 | observation, reward, done, info = env.step(action) 38 | next_state = discretize(observation) 39 | if done: 40 | reward = FAIL_PENALTY 41 | agent.learn(cur_state, action, next_state, reward, done) 42 | print("Episode finished after {} timesteps".format(t + 1)) 43 | history.append(t + 1) 44 | break 45 | agent.learn(cur_state, action, next_state, reward, done) 46 | cur_state = next_state 47 | if t == MAX_STEPS - 1: 48 | history.append(t + 1) 49 | print("Episode finished after {} timesteps".format(t + 1)) 50 | return agent, history 51 | 52 | 53 | env = gym.make('CartPole-v0') 54 | if RECORD: 55 | env = wrappers.Monitor(env, '/tmp/cartpole-experiment-1', force=True) 56 | def get_actions(state): 57 | return [0, 1] 58 | 59 | 60 | agent = qlearning.QLearningAgent(get_actions, 61 | epsilon=EPSILON, 62 | alpha=LEARNING_RATE, 
63 | gamma=DISCOUNT_FACTOR, 64 | epsilon_decay=EPSILON_DECAY) 65 | 66 | history = [] 67 | 68 | agent, history = train(agent, env, history) 69 | 70 | if RECORD: 71 | env.monitor.close() 72 | 73 | avg_reward = [numpy.mean(history[i*100:(i+1)*100]) for i in xrange(int(len(history)/100))] 74 | f_reward = plt.figure(1) 75 | plt.plot(numpy.linspace(0, len(history), len(avg_reward)), avg_reward) 76 | plt.ylabel('Rewards') 77 | f_reward.show() 78 | print 'press enter to continue' 79 | raw_input() 80 | plt.close() 81 | 82 | 83 | # Display: 84 | print 'press ctrl-c to stop' 85 | while True: 86 | obs = env.reset() 87 | cur_state = discretize(obs) 88 | done = False 89 | 90 | t = 0 91 | while not done: 92 | env.render() 93 | t = t+1 94 | action = agent.get_action(cur_state) 95 | observation, reward, done, info = env.step(action) 96 | next_state = discretize(observation) 97 | if done: 98 | reward = FAIL_PENALTY 99 | agent.learn(cur_state, action, next_state, reward, done) 100 | print("Episode finished after {} timesteps".format(t+1)) 101 | history.append(t+1) 102 | break 103 | agent.learn(cur_state, action, next_state, reward, done) 104 | cur_state = next_state 105 | -------------------------------------------------------------------------------- /TD/qlearning.py: -------------------------------------------------------------------------------- 1 | # Q-learning Agent 2 | # Model-free Temporal Difference learning 3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | 10 | import numpy 11 | 12 | class QLearningAgent(object): 13 | 14 | 15 | def __init__(self, legal_actions_fn, epsilon=0.5, alpha=0.5, gamma=0.9, epsilon_decay=1): 16 | """ 17 | args 18 | legal_actions_fn takes a state and returns a list of legal actions 19 | alpha learning rate 20 | epsilon exploration rate 21 | gamma discount factor 22 | """ 23 | self.epsilon = epsilon 24 | self.alpha = alpha 25 | self.gamma = gamma 26 | self.epsilon_decay=epsilon_decay 27 | self.legal_actions_fn = legal_actions_fn 28 | 29 | # map: {(state, action): q-value} 30 | self.q_values = {} 31 | # map: {state: action} 32 | self.policy = {} 33 | 34 | 35 | def get_value(self, s): 36 | a = self.get_optimal_action(s) 37 | return self.get_qvalue(s, a) 38 | 39 | 40 | def get_qvalue(self, s, a): 41 | if (s,a) in self.q_values: 42 | return self.q_values[(s,a)] 43 | else: 44 | # set to 0 45 | self.q_values[(s,a)] = 0 46 | return 0 47 | 48 | def _set_qvalue(self, s, a, v): 49 | self.q_values[(s,a)] = v 50 | 51 | 52 | def get_optimal_action(self, state): 53 | legal_actions = self.legal_actions_fn(state) 54 | assert len(legal_actions) > 0, "no legal actions" 55 | if state in self.policy: 56 | return self.policy[state] 57 | else: 58 | # randomly select an action as default and return 59 | self.policy[state] = legal_actions[numpy.random.randint(0, len(legal_actions))] 60 | return self.policy[state] 61 | 62 | def get_action(self, state): 63 | """ 64 | Epsilon-greedy action 65 | args 66 | state current state 67 | returns 68 | an action to take given the state 69 | """ 70 | legal_actions = self.legal_actions_fn(state) 71 | 72 | assert len(legal_actions) > 0, "no legal actions on state {}".format(state) 73 | 74 | if numpy.random.random() < self.epsilon: 75 | # act randomly 76 | return legal_actions[numpy.random.randint(0, len(legal_actions))] 77 | else: 78 | if state in self.policy: 79 | return self.policy[state] 80 | else: 81 | # set the first action in the list to default and return 82 | self.policy[state] = 
legal_actions[0] 83 | return legal_actions[0] 84 | 85 | 86 | def learn(self, s, a, s1, r, is_done): 87 | """ 88 | Updates self.q_values[(s,a)] and self.policy[s] 89 | args 90 | s current state 91 | a action taken 92 | s1 next state 93 | r reward 94 | is_done True if the episode concludes 95 | """ 96 | # update q value 97 | if is_done: 98 | sample = r 99 | else: 100 | sample = r + self.gamma*max([self.get_qvalue(s1,a1) for a1 in self.legal_actions_fn(s1)]) 101 | 102 | q_s_a = self.get_qvalue(s,a) 103 | q_s_a = q_s_a + self.alpha*(sample - q_s_a) 104 | self._set_qvalue(s,a,q_s_a) 105 | 106 | # policy improvement 107 | legal_actions = self.legal_actions_fn(s) 108 | s_q_values = [self.get_qvalue(s,a) for a in legal_actions] 109 | self.policy[s] = legal_actions[s_q_values.index(max(s_q_values))] 110 | 111 | self.epsilon = self.epsilon*self.epsilon_decay -------------------------------------------------------------------------------- /TD/test_qlearning.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | if "../" not in sys.path: 4 | sys.path.append("../") 5 | from envs import gridworld 6 | import qlearning 7 | 8 | 9 | class QLearningAgentTest(unittest.TestCase): 10 | """ 11 | Unit test for q-learning agent 12 | """ 13 | 14 | def test2(self): 15 | print 'Test 1 -- Bridge Crossing Analysis' 16 | grid = [['x', '-100', '-100', '-100', 'x'], 17 | ['1', '0', '0', '0', '10'], 18 | ['x', '-100', '-100', '-100', 'x']] 19 | 20 | gw = gridworld.GridWorld( 21 | grid, {(1,0), (1,4), 22 | (0,1), (0,2), (0,3), 23 | (2,1), (2,2), (2,3)}, 0.9) 24 | 25 | agent = qlearning.QLearningAgent(gw.get_actions, 26 | epsilon=0.1, alpha=0.5, gamma=0.9) 27 | 28 | # Training 29 | episodes = 5000 30 | for i in range(episodes): 31 | gw.reset((1,1)) 32 | cur_s = gw.get_current_state() 33 | is_done = False 34 | while not is_done: 35 | a = agent.get_action(cur_s) 36 | last_state, action, next_state, reward, is_done = gw.step(a) 37 | agent.learn(last_state, action, next_state, reward, is_done) 38 | cur_s = next_state 39 | # show optimal policy 40 | opt_policy = gw.get_optimal_policy(agent) 41 | gw.display_policy_grid(opt_policy) 42 | gw.display_value_grid(gw.get_values(agent)) 43 | gw.display_qvalue_grid(gw.get_qvalues(agent)) 44 | 45 | def test1(self): 46 | print 'Test 1 -- Regular Case' 47 | grid = [['0', '0', '0', '1'], 48 | ['0', 'x', '0', '-1'], 49 | ['0', '0', '0', '0']] 50 | 51 | gw = gridworld.GridWorld( 52 | grid, {(0, 3), (1, 3)}, 0.8) 53 | 54 | agent = qlearning.QLearningAgent(gw.get_actions, 55 | epsilon=0.2, alpha=0.5, gamma=0.9) 56 | 57 | # Training 58 | episodes = 5000 59 | for i in range(episodes): 60 | gw.reset((2,0)) 61 | cur_s = gw.get_current_state() 62 | is_done = False 63 | while not is_done: 64 | a = agent.get_action(cur_s) 65 | last_state, action, next_state, reward, is_done = gw.step(a) 66 | agent.learn(last_state, action, next_state, reward, is_done) 67 | cur_s = next_state 68 | 69 | # show optimal policy 70 | opt_policy = gw.get_optimal_policy(agent) 71 | gw.display_policy_grid(opt_policy) 72 | gw.display_value_grid(gw.get_values(agent)) 73 | gw.display_qvalue_grid(gw.get_qvalues(agent)) 74 | 75 | if __name__ == '__main__': 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /ddpg/README.md: -------------------------------------------------------------------------------- 1 | ## Deep Deterministic Policy Gradient 2 | 3 | Following paper: Continuous control with deep reinforcement 
learning [(https://arxiv.org/abs/1509.02971)](https://arxiv.org/abs/1509.02971) 4 | 5 | Tested on pendulum-v0: [openai submission page](https://gym.openai.com/evaluations/eval_9kvdhHSCTMqU8mYTaPWFrQ) 6 | 7 | #### Run code 8 | 9 | `$ python pendulum_ddpg.py --device=cpu --episodes=300` 10 | 11 | #### Pendulum-v0 result 12 | 13 | ![ddpg training](imgs/ddpg_plot.png "ddpg training") 14 | -------------------------------------------------------------------------------- /ddpg/actor.py: -------------------------------------------------------------------------------- 1 | # Deep Deterministic Policy Gradient 2 | # following paper: Continuous control with deep reinforcement learning 3 | # (https://arxiv.org/pdf/1509.02971.pdf) 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | import tensorflow as tf 12 | import tf_utils 13 | 14 | 15 | 16 | class ActorNetwork(object): 17 | 18 | 19 | def __init__(self, state_size, action_size, lr, n_h1=400, n_h2=300, tau=0.001): 20 | self.state_size = state_size 21 | self.action_size = action_size 22 | self.optimizer = tf.train.AdamOptimizer(lr) 23 | self.tau = tau 24 | 25 | self.n_h1 = n_h1 26 | self.n_h2 = n_h2 27 | 28 | self.input_s, self.actor_variables, self.action_values = self._build_network("actor") 29 | self.input_s_target, self.actor_variables_target, self.action_values_target = self._build_network("actor_target") 30 | 31 | self.action_gradients = tf.placeholder(tf.float32, [None, self.action_size]) 32 | self.actor_gradients = tf.gradients(self.action_values, self.actor_variables, -self.action_gradients) 33 | self.update_target_op = [self.actor_variables_target[i].assign(tf.multiply(self.actor_variables[i], self.tau) + tf.multiply(self.actor_variables_target[i], 1 - self.tau)) 34 | for i in range(len(self.actor_variables))] 35 | self.optimize = self.optimizer.apply_gradients(zip(self.actor_gradients, self.actor_variables)) 36 | 37 | 38 | def _build_network(self, name): 39 | input_s = tf.placeholder(tf.float32, [None, self.state_size]) 40 | with tf.variable_scope(name): 41 | layer_1 = tf_utils.fc(input_s, self.n_h1, scope="fc1", activation_fn=tf.nn.relu, 42 | initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN")) 43 | layer_2 = tf_utils.fc(layer_1, self.n_h2, scope="fc2", activation_fn=tf.nn.relu, 44 | initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN")) 45 | action_values = tf_utils.fc(layer_2, self.action_size, scope="out", activation_fn=tf.nn.tanh, 46 | initializer=tf.random_uniform_initializer(-3e-3, 3e-3)) 47 | actor_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) 48 | return input_s, actor_variables, action_values 49 | 50 | 51 | def get_action(self, state, sess): 52 | return sess.run(self.action_values, feed_dict={self.input_s: state}) 53 | 54 | 55 | def get_action_target(self, state, sess): 56 | return sess.run(self.action_values_target, feed_dict={self.input_s_target: state}) 57 | 58 | 59 | def train(self, state, action_gradients, sess): 60 | sess.run(self.optimize, feed_dict={ 61 | self.input_s: state, 62 | self.action_gradients: action_gradients 63 | }) 64 | 65 | 66 | def update_target(self, sess): 67 | sess.run(self.update_target_op) -------------------------------------------------------------------------------- /ddpg/critic.py: -------------------------------------------------------------------------------- 1 | # Deep Deterministic Policy Gradient 2 | # following paper: Continuous control with deep 
reinforcement learning 3 | # (https://arxiv.org/pdf/1509.02971.pdf) 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | import tensorflow as tf 12 | import tf_utils 13 | 14 | 15 | class CriticNetwork(object): 16 | 17 | 18 | def __init__(self, state_size, action_size, lr, n_h1=400, n_h2=300, tau=0.001): 19 | self.state_size = state_size 20 | self.action_size = action_size 21 | self.optimizer = tf.train.AdamOptimizer(lr) 22 | self.tau = tau 23 | 24 | self.n_h1 = n_h1 25 | self.n_h2 = n_h2 26 | 27 | self.input_s, self.action, self.critic_variables, self.q_value = self._build_network("critic") 28 | self.input_s_target, self.action_target, self.critic_variables_target, self.q_value_target = self._build_network("critic_target") 29 | 30 | self.target = tf.placeholder(tf.float32, [None]) 31 | self.l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.critic_variables]) 32 | self.loss = tf.reduce_mean(tf.square(self.target - self.q_value)) + 0.01*self.l2_loss 33 | self.optimize = self.optimizer.minimize(self.loss) 34 | self.update_target_op = [self.critic_variables_target[i].assign(tf.multiply(self.critic_variables[i], self.tau) + tf.multiply(self.critic_variables_target[i], 1 - self.tau)) for i in range(len(self.critic_variables))] 35 | self.action_gradients = tf.gradients(self.q_value, self.action) 36 | 37 | 38 | def _build_network(self, name): 39 | input_s = tf.placeholder(tf.float32, [None, self.state_size]) 40 | action = tf.placeholder(tf.float32, [None, self.action_size]) 41 | with tf.variable_scope(name): 42 | layer_1 = tf_utils.fc(input_s, self.n_h1, scope="fc1", activation_fn=tf.nn.relu, 43 | initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN")) 44 | # tf.concat((layer_1, action), 1) 45 | layer_2 = tf_utils.fc(tf.concat((layer_1, action), 1), self.n_h2, scope="fc2", activation_fn=tf.nn.relu, 46 | initializer=tf.contrib.layers.variance_scaling_initializer(mode="FAN_IN")) 47 | q_value = tf_utils.fc(layer_2, 1, scope="out", initializer=tf.random_uniform_initializer(-3e-3, 3e-3)) 48 | critic_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) 49 | return input_s, action, critic_variables, tf.squeeze(q_value) 50 | 51 | 52 | def get_qvalue_target(self, state, action, sess): 53 | return sess.run(self.q_value_target, feed_dict={ 54 | self.input_s_target: state, 55 | self.action_target: action 56 | }) 57 | 58 | 59 | def get_gradients(self, state, action, sess): 60 | return sess.run(self.action_gradients, feed_dict={ 61 | self.input_s: state, 62 | self.action: action 63 | }) 64 | 65 | 66 | def train(self, state, action, target, sess): 67 | _, loss = sess.run([self.optimize, self.loss], feed_dict={ 68 | self.input_s: state, 69 | self.action: action, 70 | self.target: target 71 | }) 72 | return loss 73 | 74 | 75 | def update_target(self, sess): 76 | sess.run(self.update_target_op) 77 | -------------------------------------------------------------------------------- /ddpg/ddpg.py: -------------------------------------------------------------------------------- 1 | # Deep Deterministic Policy Gradient 2 | # following paper: Continuous control with deep reinforcement learning 3 | # (https://arxiv.org/pdf/1509.02971.pdf) 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | 15 | class DDPG(object): 16 | 17 | 18 | def __init__(self, actor, critic, exprep, noise, 
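               # gamma: discount factor (note that the constructor below hard-codes
               # self.gamma = 0.99, so this argument is effectively ignored);
               # action_bound: rescales the actor's tanh output to the env's action range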
gamma=0.99, action_bound=1): 19 | self.actor = actor 20 | self.critic = critic 21 | self.exprep = exprep 22 | self.noise = noise 23 | self.total_steps = 0 24 | self.gamma = 0.99 25 | self.action_bound = action_bound 26 | 27 | 28 | def add_step(self, step): 29 | self.total_steps = self.total_steps + 1 30 | self.exprep.add_step(step) 31 | 32 | 33 | def get_action(self, state, sess): 34 | state = np.reshape(state,[-1, self.actor.state_size]) 35 | action = self.actor.get_action(state, sess) * self.action_bound 36 | return action 37 | 38 | 39 | def get_action_noise(self, state, sess, rate=1): 40 | state = np.reshape(state,[-1, self.actor.state_size]) 41 | action = self.actor.get_action(state, sess) * self.action_bound 42 | action = action + self.noise.noise() * rate 43 | return action 44 | 45 | 46 | def learn_batch(self, sess): 47 | # sample a random minibatch of N tranistions 48 | batch = self.exprep.sample() 49 | if len(batch)==0: 50 | return 51 | 52 | # compute y_i (target q) 53 | next_s = [s.next_step for s in batch] 54 | next_a_target = self.actor.get_action_target(next_s, sess) 55 | next_q_target = self.critic.get_qvalue_target(next_s, next_a_target, sess) 56 | y = np.array([s.reward + self.gamma*next_q_target[i]*(1-s.done) for i,s in enumerate(batch)]) 57 | y = y.reshape([len(batch)]) 58 | 59 | # update ciritc by minimizing l2 loss 60 | cur_s = [s.cur_step for s in batch] 61 | a = [s.action for s in batch] 62 | l = self.critic.train(cur_s, a, y, sess) 63 | 64 | # update actor policy with sampled gradient 65 | cur_a_pred = self.actor.get_action(cur_s, sess) 66 | a_gradients = self.critic.get_gradients(cur_s, cur_a_pred, sess) 67 | self.actor.train(cur_s, a_gradients[0], sess) 68 | 69 | # update target network: 70 | self.actor.update_target(sess) 71 | self.critic.update_target(sess) 72 | return l 73 | 74 | 75 | -------------------------------------------------------------------------------- /ddpg/exp_replay.py: -------------------------------------------------------------------------------- 1 | # Experience Replay 2 | # Following paper: Playing Atari with Deep Reinforcement Learning 3 | # https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | 11 | 12 | import numpy as np 13 | import random 14 | from collections import namedtuple 15 | 16 | 17 | Step = namedtuple('Step','cur_step action next_step reward done') 18 | 19 | 20 | class ExpReplay(): 21 | """Experience replay""" 22 | 23 | 24 | def __init__(self, mem_size, start_mem=None, state_size=[84, 84], kth=4, drop_rate=0.2, batch_size=32): 25 | # k = -1 for sending raw state 26 | self.state_size = state_size 27 | self.drop_rate = drop_rate 28 | self.mem_size = mem_size 29 | self.start_mem = start_mem 30 | if start_mem == None: 31 | self.start_mem = mem_size/20 32 | self.kth = kth 33 | self.batch_size = batch_size 34 | self.mem = [] 35 | self.total_steps = 0 36 | 37 | 38 | def add_step(self, step): 39 | """ 40 | Store episode to memory and check if it reaches the mem_size. 
41 | If so, drop [self.drop_rate] of the oldest memory 42 | 43 | args 44 | step namedtuple Step, where step.cur_step and step.next_step are of size {state_size} 45 | """ 46 | self.mem.append(step) 47 | self.total_steps = self.total_steps + 1 48 | while len(self.mem) > self.mem_size: 49 | self.mem = self.mem[int(len(self.mem)*self.drop_rate):] 50 | 51 | 52 | def get_last_state(self): 53 | if len(self.mem) > abs(self.kth): 54 | if self.kth == -1: 55 | return self.mem[-1].cur_step 56 | if len(self.state_size) == 1: 57 | return [s.cur_step for s in self.mem[-abs(self.kth):]] 58 | last_state = np.stack([s.cur_step for s in self.mem[-abs(self.kth):]], axis=len(self.state_size)) 59 | return np.stack([s.cur_step for s in self.mem[-abs(self.kth):]], axis=len(self.state_size)) 60 | return [] 61 | 62 | 63 | def sample(self, num=None): 64 | """Randomly draw [num] samples""" 65 | if num == None: 66 | num = self.batch_size 67 | if len(self.mem) < self.start_mem: 68 | return [] 69 | sampled_idx = random.sample(range(abs(self.kth),len(self.mem)), num) 70 | samples = [] 71 | for idx in sampled_idx: 72 | steps = self.mem[idx-abs(self.kth):idx] 73 | cur_state = np.stack([s.cur_step for s in steps], axis=len(self.state_size)) 74 | next_state = np.stack([s.next_step for s in steps], axis=len(self.state_size)) 75 | # handle special cases 76 | if self.kth == -1: 77 | cur_state = steps[0].cur_step 78 | next_state = steps[0].next_step 79 | elif len(self.state_size) == 1: 80 | cur_state = [steps[0].cur_step] 81 | next_state = [steps[0].next_step] 82 | reward = steps[-1].reward 83 | action = steps[-1].action 84 | done = steps[-1].done 85 | samples.append(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 86 | return samples 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /ddpg/imgs/ddpg_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/ddpg/imgs/ddpg_plot.png -------------------------------------------------------------------------------- /ddpg/mountaincar_ddpg.py: -------------------------------------------------------------------------------- 1 | # DDPG Pendulum-v0 example 2 | # --- 3 | # @author Yiren Lu 4 | # @email luyiren [at] seas [dot] upenn [dot] edu 5 | # 6 | # MIT License 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | import argparse 11 | from ddpg import DDPG 12 | from actor import ActorNetwork 13 | from critic import CriticNetwork 14 | from exp_replay import ExpReplay 15 | from exp_replay import Step 16 | from ou import OUProcess 17 | import matplotlib.pyplot as plt 18 | import sys 19 | import gym 20 | from gym import wrappers 21 | 22 | # env = gym.make('MountainCarContinuous-v0') 23 | # print(env.observation_space) 24 | # print(env.action_space) 25 | # print(env.action_space.low) 26 | # print(env.action_space.high) 27 | 28 | parser = argparse.ArgumentParser(description=None) 29 | parser.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 30 | parser.add_argument('-e', '--episodes', default=50, type=int, help='number of episodes') 31 | parser.add_argument('-l', '--log_dir', default='/tmp/mountaincar-log-0', type=str, help='log directory') 32 | args = parser.parse_args() 33 | print(args) 34 | 35 | 36 | DEVICE = args.device 37 | NUM_EPISODES = args.episodes 38 | LOG_DIR=args.log_dir 39 | 40 | ACTOR_LEARNING_RATE = 0.0001 
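# These hyperparameters follow the DDPG paper's defaults: actor lr 1e-4, critic lr 1e-3,
# soft-update rate TAU = 0.001 (target <- TAU*online + (1-TAU)*target, applied by
# ActorNetwork/CriticNetwork.update_target_op) and a replay buffer of 1e6 transitions.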
41 | CRITIC_LEARNING_RATE = 0.001 42 | GAMMA = 0.99 43 | TAU = 0.001 44 | MEM_SIZE = 1000000 45 | 46 | 47 | STATE_SIZE = 2 48 | ACTION_SIZE = 1 49 | BATCH_SIZE = 64 50 | MAX_STEPS = 10000 51 | FAIL_PENALTY = 0 52 | ACTION_RANGE = 1 53 | EVALUATE_EVERY = 10 54 | 55 | 56 | def summarize(cum_reward, i, summary_writer): 57 | summary = tf.Summary() 58 | summary.value.add(tag="cumulative reward", simple_value=cum_reward) 59 | summary_writer.add_summary(summary, i) 60 | summary_writer.flush() 61 | 62 | 63 | def train(agent, env, sess): 64 | for i in xrange(NUM_EPISODES): 65 | cur_state = env.reset() 66 | cum_reward = 0 67 | # tensorboard summary 68 | summary_writer = tf.summary.FileWriter(LOG_DIR+'/train', graph=tf.get_default_graph()) 69 | 70 | if (i % EVALUATE_EVERY) == 0: 71 | print '====evaluation====' 72 | for t in xrange(MAX_STEPS): 73 | if t % 500 == 0: 74 | print 'step {}'.format(t) 75 | if (i % EVALUATE_EVERY) == 0: 76 | env.render() 77 | action = agent.get_action(cur_state, sess)[0] 78 | else: 79 | # decaying noise 80 | action = agent.get_action_noise(cur_state, sess, rate=(NUM_EPISODES-i)/NUM_EPISODES)[0] 81 | # action = agent.get_action_noise(cur_state, sess, rate=0.01)[0] 82 | next_state, reward, done, info = env.step(action) 83 | if done: 84 | cum_reward += reward 85 | agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 86 | print("Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward)) 87 | summarize(cum_reward, i, summary_writer) 88 | break 89 | cum_reward += reward 90 | agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 91 | cur_state = next_state 92 | if t == MAX_STEPS - 1: 93 | print("Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward)) 94 | print action 95 | summarize(cum_reward, i, summary_writer) 96 | agent.learn_batch(sess) 97 | 98 | 99 | env = gym.make('MountainCarContinuous-v0') 100 | env._max_episode_steps = MAX_STEPS 101 | # env = wrappers.Monitor(env, '/tmp/pendulum-experiment-0', force=True) 102 | 103 | actor = ActorNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=ACTOR_LEARNING_RATE, tau=TAU) 104 | critic = CriticNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=CRITIC_LEARNING_RATE, tau=TAU) 105 | noise = OUProcess(ACTION_SIZE) 106 | exprep = ExpReplay(mem_size=MEM_SIZE, start_mem=10000, state_size=[STATE_SIZE], kth=-1, batch_size=BATCH_SIZE) 107 | 108 | sess = tf.Session() 109 | with tf.device('/{}:0'.format(DEVICE)): 110 | agent = DDPG(actor=actor, critic=critic, exprep=exprep, noise=noise, action_bound=env.action_space.high) 111 | sess.run(tf.initialize_all_variables()) 112 | 113 | train(agent, env, sess) 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /ddpg/ou.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class OUProcess(object): 5 | """Ornstein-Uhlenbeck process""" 6 | 7 | 8 | def __init__(self, x_size, mu=0, theta=0.15, sigma=0.3): 9 | self.x = np.ones(x_size) * mu 10 | self.x_size = x_size 11 | self.mu = mu 12 | self.theta = theta 13 | self.sigma = sigma 14 | 15 | 16 | def noise(self): 17 | dx = self.theta * (self.mu - self.x) + self.sigma * np.random.randn(self.x_size) 18 | self.x = self.x + dx 19 | return self.x 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /ddpg/pendulum_ddpg.py: 
-------------------------------------------------------------------------------- 1 | # DDPG Pendulum-v0 example 2 | # --- 3 | # @author Yiren Lu 4 | # @email luyiren [at] seas [dot] upenn [dot] edu 5 | # 6 | # MIT License 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | import argparse 11 | from ddpg import DDPG 12 | from actor import ActorNetwork 13 | from critic import CriticNetwork 14 | from exp_replay import ExpReplay 15 | from exp_replay import Step 16 | from ou import OUProcess 17 | import matplotlib.pyplot as plt 18 | import sys 19 | import gym 20 | from gym import wrappers 21 | 22 | 23 | parser = argparse.ArgumentParser(description=None) 24 | parser.add_argument('-d', '--device', default='cpu', type=str, help='choose device: cpu/gpu') 25 | parser.add_argument('-e', '--episodes', default=300, type=int, help='number of episodes') 26 | parser.add_argument('-l', '--log_dir', default='/tmp/pendulum-log-0', type=str, help='log directory') 27 | args = parser.parse_args() 28 | print(args) 29 | 30 | 31 | DEVICE = args.device 32 | NUM_EPISODES = args.episodes 33 | LOG_DIR=args.log_dir 34 | 35 | ACTOR_LEARNING_RATE = 0.0001 36 | CRITIC_LEARNING_RATE = 0.001 37 | GAMMA = 0.99 38 | TAU = 0.001 39 | MEM_SIZE = 1000000 40 | 41 | 42 | STATE_SIZE = 3 43 | ACTION_SIZE = 1 44 | BATCH_SIZE = 64 45 | MAX_STEPS = 200 46 | FAIL_PENALTY = 0 47 | ACTION_RANGE = 1 48 | EVALUATE_EVERY = 10 49 | 50 | 51 | def summarize(cum_reward, i, summary_writer): 52 | summary = tf.Summary() 53 | summary.value.add(tag="cumulative reward", simple_value=cum_reward) 54 | summary_writer.add_summary(summary, i) 55 | summary_writer.flush() 56 | 57 | 58 | def train(agent, env, sess): 59 | for i in xrange(NUM_EPISODES): 60 | cur_state = env.reset() 61 | cum_reward = 0 62 | # tensorboard summary 63 | summary_writer = tf.summary.FileWriter(LOG_DIR+'/train', graph=tf.get_default_graph()) 64 | 65 | if (i % EVALUATE_EVERY) == 0: 66 | print '====evaluation====' 67 | for t in xrange(MAX_STEPS): 68 | if (i % EVALUATE_EVERY) == 0: 69 | env.render() 70 | action = agent.get_action(cur_state, sess)[0] 71 | else: 72 | # decaying noise 73 | action = agent.get_action_noise(cur_state, sess, rate=(NUM_EPISODES-i)/NUM_EPISODES)[0] 74 | next_state, reward, done, info = env.step(action) 75 | if done: 76 | cum_reward += reward 77 | agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 78 | print("Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward)) 79 | summarize(cum_reward, i, summary_writer) 80 | break 81 | cum_reward += reward 82 | agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done)) 83 | cur_state = next_state 84 | if t == MAX_STEPS - 1: 85 | print("Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward)) 86 | print action 87 | summarize(cum_reward, i, summary_writer) 88 | agent.learn_batch(sess) 89 | 90 | 91 | env = gym.make('Pendulum-v0') 92 | # env = wrappers.Monitor(env, '/tmp/pendulum-experiment-0', force=True) 93 | 94 | actor = ActorNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=ACTOR_LEARNING_RATE, tau=TAU) 95 | critic = CriticNetwork(state_size=STATE_SIZE, action_size=ACTION_SIZE, lr=CRITIC_LEARNING_RATE, tau=TAU) 96 | noise = OUProcess(ACTION_SIZE) 97 | exprep = ExpReplay(mem_size=MEM_SIZE, start_mem=10000, state_size=[STATE_SIZE], kth=-1, batch_size=BATCH_SIZE) 98 | 99 | sess = tf.Session() 100 | with tf.device('/{}:0'.format(DEVICE)): 101 | agent = 
DDPG(actor=actor, critic=critic, exprep=exprep, noise=noise, action_bound=env.action_space.high) 102 | sess.run(tf.initialize_all_variables()) 103 | 104 | train(agent, env, sess) 105 | -------------------------------------------------------------------------------- /ddpg/tf_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for tensorflow""" 2 | import tensorflow as tf 3 | 4 | 5 | def max_pool(x, k_sz=[2,2]): 6 | """max pooling layer wrapper 7 | Args 8 | x: 4d tensor [batch, height, width, channels] 9 | k_sz: The size of the window for each dimension of the input tensor 10 | Returns 11 | a max pooling layer 12 | """ 13 | return tf.nn.max_pool(x, ksize=[1, k_sz[0], k_sz[1], 1], strides=[1, k_sz[0], k_sz[1], 1], padding='SAME') 14 | 15 | def conv2d(x, n_kernel, k_sz, stride=1): 16 | """convolutional layer with relu activation wrapper 17 | Args: 18 | x: 4d tensor [batch, height, width, channels] 19 | n_kernel: number of kernels (output size) 20 | k_sz: 2d array, kernel size. e.g. [8,8] 21 | stride: stride 22 | Returns 23 | a conv2d layer 24 | """ 25 | W = tf.Variable(tf.random_normal([k_sz[0], k_sz[1], int(x.get_shape()[3]), n_kernel])) 26 | b = tf.Variable(tf.random_normal([n_kernel])) 27 | # - strides[0] and strides[1] must be 1 28 | # - padding can be 'VALID'(without padding) or 'SAME'(zero padding) 29 | # - http://stackoverflow.com/questions/37674306/what-is-the-difference-between-same-and-valid-padding-in-tf-nn-max-pool-of-t 30 | conv = tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME') 31 | conv = tf.nn.bias_add(conv, b) # add bias term 32 | return tf.nn.relu(conv) # rectified linear unit: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 33 | 34 | 35 | def fc(x, n_output, scope="fc", activation_fn=None, initializer=None): 36 | """fully connected layer with relu activation wrapper 37 | Args 38 | x: 2d tensor [batch, n_input] 39 | n_output output size 40 | """ 41 | with tf.variable_scope(scope): 42 | if initializer is None: 43 | # default initialization 44 | W = tf.Variable(tf.random_normal([int(x.get_shape()[1]), n_output])) 45 | b = tf.Variable(tf.random_normal([n_output])) 46 | else: 47 | W = tf.get_variable("W", shape=[int(x.get_shape()[1]), n_output], initializer=initializer) 48 | b = tf.get_variable("b", shape=[n_output], initializer=tf.constant_initializer(.0, dtype=tf.float32)) 49 | fc1 = tf.add(tf.matmul(x, W), b) 50 | if not activation_fn is None: 51 | fc1 = activation_fn(fc1) 52 | return fc1 53 | 54 | 55 | def flatten(x): 56 | """flatten a 4d tensor into 2d 57 | Args 58 | x: 4d tensor [batch, height, width, channels] 59 | Returns a flattened 2d tensor 60 | """ 61 | return tf.reshape(x, [-1, int(x.get_shape()[1]*x.get_shape()[2]*x.get_shape()[3])]) 62 | 63 | 64 | -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/envs/__init__.py -------------------------------------------------------------------------------- /envs/env.py: -------------------------------------------------------------------------------- 1 | # Environment Abstract Class 2 | # --- 3 | # @author Yiren Lu 4 | # @email luyiren [at] seas [dot] upenn [dot] edu 5 | # 6 | # MIT License 7 | 8 | 9 | class Env: 10 | 11 | def reset(self, start_state): 12 | """ 13 | Reset the gridworld for model-free learning. 
It assumes only 1 agent in the gridworld. 14 | """ 15 | abstract 16 | 17 | 18 | def get_current_state(self): 19 | abstract 20 | 21 | 22 | def step(self, action): 23 | abstract -------------------------------------------------------------------------------- /envs/gridworld.py: -------------------------------------------------------------------------------- 1 | # Gridworld environment based on mdp.py 2 | # Gridworld provides a basic environment for RL agents to interact with 3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | 10 | import mdp 11 | import env 12 | import numpy as np 13 | import unittest 14 | 15 | 16 | class GridWorld(mdp.MDP, env.Env): 17 | """ 18 | Grid world environment 19 | """ 20 | 21 | def __init__(self, grid, terminals, trans_prob=1): 22 | """ 23 | input: 24 | grid 2-d list of the grid including the reward 25 | terminals a set of all the terminal states 26 | trans_prob transition probability when given a certain action 27 | """ 28 | self.height = len(grid) 29 | self.width = len(grid[0]) 30 | self.terminals = terminals 31 | self.grid = grid 32 | self.neighbors = [(0, 1), (0, -1), (1, 0), (-1, 0), (0, 0)] 33 | self.actions = [0, 1, 2, 3, 4] 34 | self.dirs = {0: 'r', 1: 'l', 2: 'd', 3: 'u', 4: 's'} 35 | # right, left, down, up , stay 36 | # self.action_nei = {0: (0,1), 1:(0,-1), 2:(1,0), 3:(-1,0)} 37 | 38 | # If the mdp is deterministic, the transition probability of taken a certain action should be 1 39 | # otherwise < 1, the rest of the probability are equally spreaded onto 40 | # other neighboring states. 41 | self.trans_prob = trans_prob 42 | 43 | def show_grid(self): 44 | for i in range(len(self.grid)): 45 | print self.grid[i] 46 | 47 | def get_grid(self): 48 | return self.grid 49 | 50 | def get_states(self): 51 | """ 52 | returns 53 | a list of all states 54 | """ 55 | return filter( 56 | lambda x: self.grid[x[0]][x[1]] != 'x', 57 | [(i, j) for i in range(self.height) for j in range(self.width)]) 58 | 59 | def get_actions(self, state): 60 | """ 61 | get all the actions that can be takens on the current state 62 | returns 63 | a list of actions 64 | """ 65 | if self.grid[state[0]][state[1]] == 'x': 66 | return [4] 67 | 68 | actions = [] 69 | for i in range(len(self.actions)-1): 70 | inc = self.neighbors[i] 71 | a = self.actions[i] 72 | nei_s = (state[0] + inc[0], state[1] + inc[1]) 73 | if nei_s[0] >= 0 and nei_s[0] < self.height and nei_s[1] >= 0 and nei_s[ 74 | 1] < self.width and self.grid[nei_s[0]][nei_s[1]] != 'x': 75 | actions.append(a) 76 | return actions 77 | 78 | def __get_action_states(self, state): 79 | """ 80 | get all the actions that can be takens on the current state 81 | returns 82 | a list of (action, state) pairs 83 | """ 84 | a_s = [] 85 | for i in range(len(self.actions)): 86 | inc = self.neighbors[i] 87 | a = self.actions[i] 88 | nei_s = (state[0] + inc[0], state[1] + inc[1]) 89 | if nei_s[0] >= 0 and nei_s[0] < self.height and nei_s[1] >= 0 and nei_s[ 90 | 1] < self.width and self.grid[nei_s[0]][nei_s[1]] != 'x': 91 | a_s.append((a, nei_s)) 92 | return a_s 93 | 94 | def get_reward_sas(self, state, action, state1): 95 | """ 96 | args 97 | state current state 98 | action action 99 | state1 next state 100 | returns 101 | the reward on current state 102 | """ 103 | if not self.grid[state[0]][state[1]] == 'x': 104 | return float(self.grid[state[0]][state[1]]) 105 | else: 106 | return 0 107 | 108 | def get_reward(self, state): 109 | """ 110 | returns 111 | the reward on current state 
112 | """ 113 | if not self.grid[state[0]][state[1]] == 'x': 114 | return float(self.grid[state[0]][state[1]]) 115 | else: 116 | return 0 117 | 118 | def get_transition_states_and_probs(self, state, action): 119 | """ 120 | get all the possible transition states and their probabilities with [action] on [state] 121 | args 122 | state (y, x) 123 | action int 124 | returns 125 | a list of (state, probability) pair 126 | """ 127 | if self.trans_prob == 1: 128 | inc = self.neighbors[action] 129 | nei_s = (state[0] + inc[0], state[1] + inc[1]) 130 | if nei_s[0] >= 0 and nei_s[0] < self.height and nei_s[ 131 | 1] >= 0 and nei_s[1] < self.width and self.grid[nei_s[0]][nei_s[1]] != 'x': 132 | return [(nei_s, 1)] 133 | else: 134 | # if the state is invalid, stay in the current state 135 | return [(state, 1)] 136 | else: 137 | action_states = self.__get_action_states(state) 138 | inc = self.neighbors[action] 139 | nei_s = (state[0] + inc[0], state[1] + inc[1]) 140 | res = [] 141 | 142 | if nei_s[0] >= 0 and nei_s[0] < self.height and nei_s[ 143 | 1] >= 0 and nei_s[1] < self.width and self.grid[nei_s[0]][nei_s[1]] != 'x': 144 | for i in range(len(action_states)): 145 | if action_states[i][0] == action: 146 | res.append((action_states[i][1], self.trans_prob)) 147 | else: 148 | res.append( 149 | (action_states[i][1], (1 - self.trans_prob) / (len(action_states) - 1))) 150 | else: 151 | # if the action is not valid, then return uniform distribution of the valid moves. 152 | for i in range(len(action_states)): 153 | res.append((action_states[i][1], 1.0 / len(action_states))) 154 | return res 155 | 156 | def is_terminal(self, state): 157 | """ 158 | returns 159 | True if the [state] is terminal 160 | """ 161 | if state in self.terminals: 162 | return True 163 | else: 164 | return False 165 | 166 | ############################################## 167 | # Stateful Functions For Model-Free Leanring # 168 | ############################################## 169 | 170 | def reset(self, start_pos): 171 | """ 172 | Reset the gridworld for model-free learning. It assumes only 1 agent in the gridworld. 
173 | args 174 | start_pos (i,j) pair of the start location 175 | """ 176 | self._cur_state = start_pos 177 | 178 | 179 | def get_current_state(self): 180 | return self._cur_state 181 | 182 | def step(self, action): 183 | """ 184 | Step function for the agent to interact with gridworld 185 | args 186 | action action taken by the agent 187 | returns 188 | current_state current state 189 | action input action 190 | next_state next_state 191 | reward reward on the next state 192 | is_done True/False - if the episode terminates on the next_state 193 | """ 194 | if self.is_terminal(self._cur_state): 195 | self._is_done = True 196 | return self._cur_state, action, self._cur_state, self.get_reward(self._cur_state), True 197 | 198 | st_prob = self.get_transition_states_and_probs(self._cur_state, action) 199 | 200 | sampled_idx = np.random.choice(np.arange(0,len(st_prob)), p=[prob for st, prob in st_prob]) 201 | last_state = self._cur_state 202 | next_state = st_prob[sampled_idx][0] 203 | reward = self.get_reward(last_state) 204 | self._cur_state = next_state 205 | return last_state, action, next_state, reward, False 206 | 207 | ########################################### 208 | # Policy Evaluation for Model-free Agents # 209 | ########################################### 210 | 211 | def get_optimal_policy(self, agent): 212 | states = self.get_states() 213 | policy = {} 214 | for s in states: 215 | policy[s] = [(agent.get_optimal_action(s), 1)] 216 | return policy 217 | 218 | def get_values(self, agent): 219 | states = self.get_states() 220 | values = {} 221 | for s in states: 222 | values[s] = agent.get_value(s) 223 | return values 224 | 225 | 226 | def get_qvalues(self, agent): 227 | states = self.get_states() 228 | q_values = {} 229 | for s in states: 230 | for a in self.get_actions(s): 231 | q_values[(s,a)] = agent.get_qvalue(s,a) 232 | return q_values 233 | 234 | ############### 235 | # For Display # 236 | ############### 237 | 238 | def display_qvalue_grid(self, qvalues): 239 | print "==Display q-value grid==" 240 | 241 | qvalues_grid = np.empty((len(self.grid), len(self.grid[0])), dtype=object) 242 | for s in self.get_states(): 243 | if self.grid[s[0]][s[1]] == 'x': 244 | qvalues_grid[s[0]][s[1]] = '-' 245 | else: 246 | tmp_str = "" 247 | for a in self.get_actions(s): 248 | tmp_str = tmp_str + self.dirs[a] 249 | tmp_str = tmp_str + str(' {:.2f} '.format(qvalues[(s,a)])) 250 | # print tmp_str 251 | qvalues_grid[s[0]][s[1]] = tmp_str 252 | 253 | row_format = '{:>40}' * (len(self.grid[0])) 254 | for row in qvalues_grid: 255 | print row_format.format(*row) 256 | 257 | 258 | def display_value_grid(self, values): 259 | """ 260 | Prints a nice table of the values in grid 261 | """ 262 | print "==Display value grid==" 263 | 264 | value_grid = np.zeros((len(self.grid), len(self.grid[0]))) 265 | for k in values: 266 | value_grid[k[0]][k[1]] = float(values[k]) 267 | 268 | row_format = '{:>20.4}' * (len(self.grid[0])) 269 | for row in value_grid: 270 | print row_format.format(*row) 271 | 272 | def display_policy_grid(self, policy): 273 | """ 274 | prints a nice table of the policy in grid 275 | input: 276 | policy a dictionary of the optimal policy {} 277 | """ 278 | print "==Display policy grid==" 279 | 280 | policy_grid = np.chararray((len(self.grid), len(self.grid[0]))) 281 | for k in self.get_states(): 282 | if self.is_terminal((k[0], k[1])) or self.grid[k[0]][k[1]] == 'x': 283 | policy_grid[k[0]][k[1]] = '-' 284 | else: 285 | # policy_grid[k[0]][k[1]] = self.dirs[agent.get_action((k[0], k[1]))] 286 | 
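          # policy maps each state to a list of (action, probability) pairs
          # (see get_optimal_policy above), so [0][0] picks the action of the first pair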
policy_grid[k[0]][k[1]] = self.dirs[policy[(k[0], k[1])][0][0]] 287 | 288 | row_format = '{:>20}' * (len(self.grid[0])) 289 | for row in policy_grid: 290 | print row_format.format(*row) 291 | -------------------------------------------------------------------------------- /envs/mdp.py: -------------------------------------------------------------------------------- 1 | # Markov Decision Process 2 | # --- 3 | # @author Yiren Lu 4 | # @email luyiren [at] seas [dot] upenn [dot] edu 5 | # 6 | # MIT License 7 | 8 | 9 | class MDP: 10 | 11 | def get_states(self): 12 | """ 13 | get a list of all states 14 | """ 15 | abstract 16 | 17 | def get_actions(self, state): 18 | """ 19 | get all the actions that can be takens on the current state 20 | """ 21 | abstract 22 | 23 | def get_reward(self, state): 24 | """ 25 | return the reward on current state 26 | """ 27 | abstract 28 | 29 | def get_transition_states_and_probs(self, state, action): 30 | """ 31 | get all the possible transition states and their probabilities with [action] on [state] 32 | """ 33 | abstract 34 | 35 | def is_terminal(self, state): 36 | """ 37 | return True is the [state] is terminal 38 | """ 39 | abstract 40 | -------------------------------------------------------------------------------- /envs/test_gridworld.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import gridworld 3 | 4 | 5 | class GridWorldTest(unittest.TestCase): 6 | """ 7 | Unit test for grid world 8 | """ 9 | 10 | def setUp(self): 11 | grid = [['0', '0', '0', '0', '10'], 12 | ['0', 'x', '0', '0', '-10'], 13 | ['0', '0', '0', '0', '0']] 14 | 15 | self.grid = grid 16 | self.gw_deterministic = gridworld.GridWorld(grid, {(0, 4), (1, 4)}, 1) 17 | self.gw_non_deterministic = gridworld.GridWorld( 18 | grid, {(0, 4), (1, 4)}, 0.8) 19 | 20 | def test_grid_dims(self): 21 | self.assertEqual(len(self.gw_deterministic.get_grid()), 3) 22 | self.assertEqual(len(self.gw_deterministic.get_grid()[0]), 5) 23 | 24 | def test_grid_values(self): 25 | grid_tmp = self.gw_deterministic.get_grid() 26 | for i in range(len(grid_tmp)): 27 | for j in range(len(grid_tmp[0])): 28 | self.assertEqual(self.grid[i][j], grid_tmp[i][j]) 29 | 30 | def test_get_states(self): 31 | self.assertEqual(len(self.gw_deterministic.get_states()), 14) 32 | 33 | def test_get_actions(self): 34 | self.assertEqual(len(self.gw_deterministic.get_actions((0, 0))), 2) 35 | self.assertEqual(len(self.gw_deterministic.get_actions((2, 0))), 2) 36 | self.assertEqual(len(self.gw_deterministic.get_actions((2, 4))), 2) 37 | self.assertEqual(len(self.gw_deterministic.get_actions((0, 4))), 2) 38 | self.assertEqual(len(self.gw_deterministic.get_actions((1, 0))), 2) 39 | 40 | def test_get_reward(self): 41 | self.assertEqual(self.gw_deterministic.get_reward((0, 0)), 0) 42 | self.assertEqual(self.gw_deterministic.get_reward((0, 4)), 10.0) 43 | self.assertEqual(self.gw_deterministic.get_reward((1, 4)), -10.0) 44 | 45 | def test_trans_prob_deter(self): 46 | self.assertEqual( 47 | len( 48 | self.gw_deterministic.get_transition_states_and_probs( 49 | (0, 0), 0)), 1) 50 | self.assertEqual( 51 | self.gw_deterministic.get_transition_states_and_probs( 52 | (0, 0), 0)[0][0], (0, 1)) 53 | self.assertEqual( 54 | self.gw_deterministic.get_transition_states_and_probs( 55 | (0, 0), 0)[0][1], 1) 56 | 57 | self.assertEqual( 58 | len( 59 | self.gw_deterministic.get_transition_states_and_probs( 60 | (0, 0), 1)), 1) 61 | self.assertEqual( 62 | self.gw_deterministic.get_transition_states_and_probs( 
63 | (0, 0), 1)[0][0], (0, 0)) 64 | self.assertEqual( 65 | self.gw_deterministic.get_transition_states_and_probs( 66 | (0, 0), 1)[0][1], 1) 67 | 68 | def test_trans_prob_non_deter(self): 69 | self.assertEqual( 70 | len( 71 | self.gw_non_deterministic.get_transition_states_and_probs( 72 | (0, 0), 0)), 3) 73 | self.assertEqual( 74 | self.gw_non_deterministic.get_transition_states_and_probs( 75 | (0, 0), 0)[0][0], (0, 1)) 76 | self.assertEqual( 77 | self.gw_non_deterministic.get_transition_states_and_probs( 78 | (0, 0), 0)[0][1], 0.8) 79 | 80 | self.assertTrue( 81 | self.gw_non_deterministic.get_transition_states_and_probs( 82 | (0, 0), 0)[1][1] - 0.1 < 1e-5) 83 | self.assertTrue( 84 | self.gw_non_deterministic.get_transition_states_and_probs( 85 | (0, 0), 0)[2][1] - 0.1 < 1e-5) 86 | 87 | self.assertEqual( 88 | len( 89 | self.gw_non_deterministic.get_transition_states_and_probs( 90 | (1, 0), 0)), 3) 91 | 92 | def test_terminals(self): 93 | self.assertTrue(self.gw_deterministic.is_terminal((0, 4))) 94 | self.assertTrue(self.gw_deterministic.is_terminal((1, 4))) 95 | 96 | if __name__ == '__main__': 97 | unittest.main() 98 | -------------------------------------------------------------------------------- /imgs/breakout10.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/imgs/breakout10.gif -------------------------------------------------------------------------------- /monte_carlo/monte_carlo.py: -------------------------------------------------------------------------------- 1 | # Monte Carlo Agent 2 | # Epsilon-greedy monte carlo agent 3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | 10 | import sys 11 | if "../" not in sys.path: 12 | sys.path.append("../") 13 | from TD import qlearning 14 | import numpy 15 | 16 | class Counter: 17 | """ 18 | Counter class 19 | """ 20 | 21 | def __init__(self): 22 | self.counter = {} 23 | 24 | def add(self, key): 25 | if key in self.counter: 26 | self.counter[key] = self.counter[key] + 1 27 | else: 28 | self.counter[key] = 1 29 | 30 | def get(self, key): 31 | if key in self.counter: 32 | return self.counter[key] 33 | else: 34 | return 0 35 | 36 | 37 | class MonteCarloAgent(qlearning.QLearningAgent): 38 | 39 | def __init__(self, legal_actions_fn, epsilon=0.5, alpha=0.5, gamma=0.9, epsilon_decay=1): 40 | self.n_s_a = Counter() 41 | super(MonteCarloAgent, self).__init__(legal_actions_fn, epsilon, alpha, gamma, epsilon_decay) 42 | 43 | 44 | @staticmethod 45 | def compute_G_t(rewards, gamma): 46 | """ 47 | args 48 | a list of rewards 49 | returns 50 | a list of cummulated rewards G_t = R_{t+1} + gamma*R_{t+2} + gamma^2*R_{t+3} + .. 
+ gamma^{T-t-1}*R_{T} 51 | """ 52 | G_t = [0]*len(rewards) 53 | 54 | for i in xrange(0,len(rewards)): 55 | G_t[0] = G_t[0] + rewards[i]*(gamma**i) 56 | 57 | for i in xrange(1,len(rewards)): 58 | G_t[i] = (G_t[i-1] - rewards[i-1])/gamma 59 | 60 | return G_t 61 | 62 | 63 | def learn(self, episode): 64 | """ 65 | args 66 | episode a list of (current state, action, next state, reward) 67 | """ 68 | q_values = self.q_values.copy() 69 | 70 | rewards = [r for c, a, n, r in episode] 71 | G_t = MonteCarloAgent.compute_G_t(rewards, self.gamma) 72 | for i in xrange(len(episode)): 73 | c, a, n, r = episode[i] 74 | # q-state count++ 75 | self.n_s_a.add((c,a)) 76 | # update q-value 77 | # notices here I took the max of the weights and self.alpha to ensure it actually 78 | # learns some thing from each episode of experience 79 | q_values[(c,a)] = self.get_qvalue(c,a) + max(1/self.n_s_a.get((c,a)), self.alpha) * (G_t[i] - self.get_qvalue(c,a)) 80 | 81 | self.q_values = q_values 82 | 83 | # policy improvement 84 | policy = self.policy.copy() 85 | for c, a, n, r in episode: 86 | legal_actions = self.legal_actions_fn(c) 87 | s_q_values = [self.get_qvalue(c,a) for a in legal_actions] 88 | policy[c] = legal_actions[s_q_values.index(max(s_q_values))] 89 | self.policy = policy 90 | 91 | self.epsilon = self.epsilon*self.epsilon_decay 92 | 93 | -------------------------------------------------------------------------------- /monte_carlo/test_monte_carlo.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | if "../" not in sys.path: 4 | sys.path.append("../") 5 | from envs import gridworld 6 | import monte_carlo 7 | 8 | 9 | class MonteCarloAgentTest(unittest.TestCase): 10 | """ 11 | Unit test for monte carlo agent 12 | """ 13 | 14 | 15 | def test2(self): 16 | print 'Test 2 -- Gridworld test' 17 | grid = [['0', '0', '0', '1'], 18 | ['0', 'x', '0', '-1'], 19 | ['0', '0', '0', '0']] 20 | 21 | gw = gridworld.GridWorld( 22 | grid, {(0, 3), (1, 3)}, 0.8) 23 | 24 | agent = monte_carlo.MonteCarloAgent(gw.get_actions, 25 | epsilon=0.4, gamma=0.9, alpha=0.01, epsilon_decay=1) 26 | # Training 27 | episodes = 1000 28 | for i in range(episodes): 29 | episode = [] 30 | gw.reset((2,0)) 31 | cur_s = gw.get_current_state() 32 | is_done = False 33 | while not is_done: 34 | a = agent.get_action(cur_s) 35 | last_state, action, next_state, reward, is_done = gw.step(a) 36 | episode.append((last_state, action, next_state, reward)) 37 | # agent.learn(last_state, action, next_state, reward, is_done) 38 | cur_s = next_state 39 | if is_done: 40 | agent.learn(episode) 41 | 42 | opt_policy = gw.get_optimal_policy(agent) 43 | gw.display_policy_grid(opt_policy) 44 | gw.display_value_grid(gw.get_values(agent)) 45 | gw.display_qvalue_grid(gw.get_qvalues(agent)) 46 | 47 | 48 | def test1(self): 49 | print 'Test 1 -- test G_t' 50 | G_t = monte_carlo.MonteCarloAgent.compute_G_t([1,2,3,4], 0.5) 51 | self.assertEqual(G_t, [3.25,4.5,5,4]) 52 | 53 | if __name__ == '__main__': 54 | unittest.main() -------------------------------------------------------------------------------- /papers/AlphaGoNaturePaper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/AlphaGoNaturePaper.pdf -------------------------------------------------------------------------------- /papers/GAN.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/GAN.pdf -------------------------------------------------------------------------------- /papers/Learning2learn_by_GD_by_GD.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/Learning2learn_by_GD_by_GD.pdf -------------------------------------------------------------------------------- /papers/a3c.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/a3c.pdf -------------------------------------------------------------------------------- /papers/browne_mcts_survey_ieee12.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/browne_mcts_survey_ieee12.pdf -------------------------------------------------------------------------------- /papers/ddpg.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/ddpg.pdf -------------------------------------------------------------------------------- /papers/ddqn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/ddqn.pdf -------------------------------------------------------------------------------- /papers/dpg_silver14.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/dpg_silver14.pdf -------------------------------------------------------------------------------- /papers/dqn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/dqn.pdf -------------------------------------------------------------------------------- /papers/dqn_nature.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/dqn_nature.pdf -------------------------------------------------------------------------------- /papers/drl_bench_mark2016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/drl_bench_mark2016.pdf -------------------------------------------------------------------------------- /papers/dueling_dqn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/dueling_dqn.pdf -------------------------------------------------------------------------------- /papers/learn2rl.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/papers/learn2rl.pdf -------------------------------------------------------------------------------- /policy_gradient/README.md: -------------------------------------------------------------------------------- 1 | ## Policy Gradient Methods 2 | 3 | ### REINFORCE 4 | 5 | The policy function is approximated by a 4-layer fully connected network with l2 regularization. The algorithm [solved cartpole-v0 after 632 episodes](https://gym.openai.com/evaluations/eval_0qE4YdUoQMi60hslLEGg) 6 | 7 | - `reinforce.py`: REINFORCE with policy function approximation 8 | - `cartpole_reinforce.py`: working example on cartpole-v0 9 | 10 | #### Run Code 11 | 12 | `$ python cartpole_reinforce.py` 13 | 14 | #### Cartpole-v0 Result 15 | 16 | ![cartpole training](imgs/cartpole_reinforce.png "cartpole training") 17 | 18 | ### REINFORCE with Baseline 19 | 20 | Here the code shows REINFORCE algorithm with baseline. The policy and value function share the same network regularized by l2. Have not been tuning the hyperparameters too much. Sometimes the model quickly converges to a local optimal (degenerate policy) due to random initialization, but a few attempts (<5) should be sufficient. 21 | 22 | - `reinforce_w_baseline.py`: REINFORCE with baseline 23 | - `cartpole_reinforce_baseline.py`: working example on cartpole-v0 24 | 25 | #### Run Code 26 | 27 | `$ python cartpole_reinforce_baseline.py` 28 | 29 | #### Cartpole-v0 Result 30 | 31 | ![cartpole training](imgs/cartpole_reinforce_w_baseline.png "cartpole training") -------------------------------------------------------------------------------- /policy_gradient/cartpole_reinforce.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import reinforce 4 | import numpy 5 | import matplotlib.pyplot as plt 6 | import tensorflow as tf 7 | 8 | NUM_EPISODES = 400 9 | MAX_STEPS = 300 10 | FAIL_PENALTY = -100 11 | # LEARNING_RATE = 0.0001 # hidden layer 10/20 12 | LEARNING_RATE = 0.002 # hidden layer 5 13 | # LEARNING_RATE = 0.1 # hidden layer 3 14 | DISCOUNT_FACTOR = 0.9 15 | TRAIN_EVERY_NUM_EPISODES = 1 16 | EPOCH_SIZE = 1 17 | MEM_SIZE = 100 18 | 19 | RECORD = False 20 | 21 | 22 | def train(agent, env, sess, num_episodes=NUM_EPISODES): 23 | history = [] 24 | for i in xrange(NUM_EPISODES): 25 | if i % 100: 26 | print "Episode {}".format(i + 1) 27 | cur_state = env.reset() 28 | episode = [] 29 | for t in xrange(MAX_STEPS): 30 | action = agent.get_action(cur_state, sess) 31 | next_state, reward, done, info = env.step(action) 32 | if done: 33 | reward = FAIL_PENALTY 34 | episode.append([cur_state, action, next_state, reward, done]) 35 | print("Episode finished after {} timesteps".format(t + 1)) 36 | print agent.get_policy(cur_state, sess) 37 | history.append(t + 1) 38 | break 39 | episode.append([cur_state, action, next_state, 1, done]) 40 | cur_state = next_state 41 | if t == MAX_STEPS - 1: 42 | history.append(t + 1) 43 | print("Episode finished after {} timesteps".format(t + 1)) 44 | # agent.add_episode(episode) 45 | if i % TRAIN_EVERY_NUM_EPISODES == 0: 46 | print 'train at episode {}'.format(i) 47 | agent.learn(episode, sess, EPOCH_SIZE) 48 | return agent, history 49 | 50 | 51 | agent = reinforce.PolicyGradientNNAgent(lr=LEARNING_RATE, 52 | gamma=DISCOUNT_FACTOR, 53 | state_size=4, 54 | action_size=2, 55 | n_hidden_1=5, 56 | n_hidden_2=5) 57 | 58 | 59 | env = gym.make('CartPole-v0') 60 | if 
RECORD: 61 | env = wrappers.Monitor(env, '/tmp/cartpole-experiment-2', force=True) 62 | 63 | 64 | with tf.Session() as sess: 65 | sess.run(tf.global_variables_initializer()) 66 | agent, history = train(agent, env, sess) 67 | 68 | 69 | if RECORD: 70 | env.close() # the Monitor wrapper is closed via env.close() 71 | 72 | window = 10 73 | avg_reward = [numpy.mean(history[i*window:(i+1)*window]) for i in xrange(int(len(history)/window))] 74 | f_reward = plt.figure(1) 75 | plt.plot(numpy.linspace(0, len(history), len(avg_reward)), avg_reward) 76 | plt.ylabel('Rewards') 77 | f_reward.show() 78 | print 'press enter to continue' 79 | raw_input() 80 | 81 | -------------------------------------------------------------------------------- /policy_gradient/cartpole_reinforce_baseline.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import reinforce_w_baseline 4 | import numpy 5 | import matplotlib.pyplot as plt 6 | import tensorflow as tf 7 | 8 | NUM_EPISODES = 200 9 | MAX_STEPS = 300 10 | FAIL_PENALTY = -100 11 | LEARNING_RATE = 0.002 12 | DISCOUNT_FACTOR = 0.9 13 | TRAIN_EVERY_NUM_EPISODES = 1 14 | EPOCH_SIZE = 1 15 | MEM_SIZE = 100 16 | 17 | RECORD = False 18 | 19 | 20 | def train(agent, env, sess, num_episodes=NUM_EPISODES): 21 | history = [] 22 | for i in xrange(NUM_EPISODES): 23 | if i % 100 == 0: 24 | print "Episode {}".format(i + 1) 25 | cur_state = env.reset() 26 | episode = [] 27 | for t in xrange(MAX_STEPS): 28 | action = agent.get_action(cur_state, sess) 29 | next_state, reward, done, info = env.step(action) 30 | if done: 31 | reward = FAIL_PENALTY 32 | episode.append([cur_state, action, next_state, reward, done]) 33 | print("Episode finished after {} timesteps".format(t + 1)) 34 | print agent.get_policy(cur_state, sess) 35 | history.append(t + 1) 36 | break 37 | episode.append([cur_state, action, next_state, 1, done]) 38 | cur_state = next_state 39 | if t == MAX_STEPS - 1: 40 | history.append(t + 1) 41 | print("Episode finished after {} timesteps".format(t + 1)) 42 | if i % TRAIN_EVERY_NUM_EPISODES == 0: 43 | print 'train at episode {}'.format(i) 44 | agent.learn(episode, sess, EPOCH_SIZE) 45 | return agent, history 46 | 47 | 48 | agent = reinforce_w_baseline.PolicyGradientNNAgent(lr=LEARNING_RATE, 49 | gamma=DISCOUNT_FACTOR, 50 | state_size=4, 51 | action_size=2, 52 | n_hidden_1=10, 53 | n_hidden_2=10) 54 | 55 | 56 | env = gym.make('CartPole-v0') 57 | 58 | 59 | with tf.Session() as sess: 60 | sess.run(tf.global_variables_initializer()) 61 | agent, history = train(agent, env, sess) 62 | 63 | 64 | window = 10 65 | avg_reward = [numpy.mean(history[i*window:(i+1)*window]) for i in xrange(int(len(history)/window))] 66 | f_reward = plt.figure(1) 67 | plt.plot(numpy.linspace(0, len(history), len(avg_reward)), avg_reward) 68 | plt.ylabel('Rewards') 69 | plt.xlabel('Episodes') 70 | f_reward.show() 71 | print 'press enter to continue' 72 | raw_input() 73 | 74 | -------------------------------------------------------------------------------- /policy_gradient/imgs/cartpole_reinforce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/policy_gradient/imgs/cartpole_reinforce.png -------------------------------------------------------------------------------- /policy_gradient/imgs/cartpole_reinforce_w_baseline.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/yrlu/reinforcement_learning/aadf27632191dce6d3055b56f18df87cbf97f172/policy_gradient/imgs/cartpole_reinforce_w_baseline.png -------------------------------------------------------------------------------- /policy_gradient/reinforce.py: -------------------------------------------------------------------------------- 1 | # Policy Gradient Agent 2 | # - policy approximation with fully connected neural network 3 | # 4 | # --- 5 | # @author Yiren Lu 6 | # @email luyiren [at] seas [dot] upenn [dot] edu 7 | # 8 | # MIT License 9 | import gym 10 | import numpy as np 11 | import random 12 | import tensorflow as tf 13 | import tensorflow.contrib.slim as slim 14 | import tf_utils 15 | 16 | 17 | class PolicyGradientNNAgent(): 18 | 19 | def __init__(self, 20 | lr=0.5, 21 | gamma=0.99, 22 | state_size=4, 23 | action_size=2, 24 | n_hidden_1=20, 25 | n_hidden_2=20, 26 | scope="pg" 27 | ): 28 | """ 29 | args 30 | epsilon exploration rate 31 | epsilon_anneal linear decay rate per call of learn() function (iteration) 32 | end_epsilon lowest exploration rate 33 | lr learning rate 34 | gamma discount factor 35 | state_size network input size 36 | action_size network output size 37 | """ 38 | self.lr = lr 39 | self.gamma = gamma 40 | self.state_size = state_size 41 | self.action_size = action_size 42 | self.total_steps = 0 43 | self.n_hidden_1 = n_hidden_1 44 | self.n_hidden_2 = n_hidden_2 45 | self.scope = scope 46 | 47 | self._build_policy_net() 48 | 49 | 50 | def _build_policy_net(self): 51 | """Build policy network""" 52 | with tf.variable_scope(self.scope): 53 | self.state_input = tf.placeholder(tf.float32, [None, self.state_size]) 54 | self.action = tf.placeholder(tf.int32, [None]) 55 | self.target = tf.placeholder(tf.float32, [None]) 56 | 57 | layer_1 = tf_utils.fc(self.state_input, self.n_hidden_1, tf.nn.relu) 58 | layer_2 = tf_utils.fc(layer_1, self.n_hidden_2, tf.nn.relu) 59 | 60 | self.action_values = tf_utils.fc(layer_2, self.action_size) 61 | action_mask = tf.one_hot(self.action, self.action_size, 1.0, 0.0) 62 | self.action_prob = tf.nn.softmax(self.action_values) 63 | self.action_value_pred = tf.reduce_sum(self.action_prob * action_mask, 1) 64 | 65 | # l2 regularization 66 | self.l2_loss = tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() ]) 67 | self.pg_loss = tf.reduce_mean(-tf.log(self.action_value_pred) * self.target) 68 | 69 | self.loss = self.pg_loss + 0.002 * self.l2_loss 70 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 71 | self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) 72 | 73 | 74 | def get_action(self, state, sess): 75 | """Returns stochastic policy""" 76 | pi = self.get_policy(state, sess) 77 | return np.random.choice(range(self.action_size), p=pi) 78 | 79 | 80 | def get_policy(self, state, sess): 81 | """returns policy as probability distribution of actions""" 82 | pi = sess.run(self.action_prob, feed_dict={self.state_input: [state]}) 83 | return pi[0] 84 | 85 | 86 | def learn(self, episode, sess, train_epoch = 1): 87 | for t in xrange(len(episode)): 88 | self.total_steps = self.total_steps + 1 89 | target = sum([self.gamma**i * r for i, (s, a, s1, r, d) in enumerate(episode[t:])]) 90 | state, action, next_state, reward, done = episode[t] 91 | feed_dict = { self.state_input: [state], self.target: [target], self.action: [action] } 92 | _, loss = sess.run([self.train_op, self.loss], feed_dict) 
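Not part of the repository, but as a minimal illustration of what `learn()` in `reinforce.py` above computes: each step's training target is the discounted Monte Carlo return G_t = sum_i gamma^i * r_{t+i}, and `pg_loss` is the REINFORCE objective -log pi(a_t|s_t) * G_t. A standalone sketch, with a hypothetical helper name `discounted_returns`:

```python
import numpy as np

def discounted_returns(rewards, gamma=0.9):
    """Return [G_0, G_1, ...] where G_t = sum_{i>=0} gamma**i * rewards[t+i]."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running  # G_t = r_t + gamma * G_{t+1}
        returns[t] = running
    return returns

# A 3-step episode as the cartpole scripts record it: reward 1 per step,
# FAIL_PENALTY (-100) on the terminal step.
print(discounted_returns([1.0, 1.0, -100.0], gamma=0.9))
# -> roughly [-79.1, -89.0, -100.0]
```

The single backward pass above is equivalent to the per-step sums in `learn()`, just O(T) rather than O(T^2) per episode.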
-------------------------------------------------------------------------------- /policy_gradient/reinforce_w_baseline.py: -------------------------------------------------------------------------------- 1 | # Policy Gradient Agent 2 | # - REINFORCE algorithm with baseline 3 | # - Policy/value function approximation 4 | # 5 | # --- 6 | # @author Yiren Lu 7 | # @email luyiren [at] seas [dot] upenn [dot] edu 8 | # 9 | # MIT License 10 | import gym 11 | import numpy as np 12 | import random 13 | import tensorflow as tf 14 | import tensorflow.contrib.slim as slim 15 | import tf_utils 16 | 17 | 18 | class PolicyGradientNNAgent(): 19 | 20 | def __init__(self, 21 | lr=0.5, 22 | gamma=0.99, 23 | state_size=4, 24 | action_size=2, 25 | n_hidden_1=20, 26 | n_hidden_2=20, 27 | scope="pg" 28 | ): 29 | """ 30 | args 31 | epsilon exploration rate 32 | epsilon_anneal linear decay rate per call of learn() function (iteration) 33 | end_epsilon lowest exploration rate 34 | lr learning rate 35 | gamma discount factor 36 | state_size network input size 37 | action_size network output size 38 | """ 39 | self.lr = lr 40 | self.gamma = gamma 41 | self.state_size = state_size 42 | self.action_size = action_size 43 | self.total_steps = 0 44 | self.n_hidden_1 = n_hidden_1 45 | self.n_hidden_2 = n_hidden_2 46 | self.scope = scope 47 | 48 | self._build_policy_net() 49 | 50 | 51 | 52 | def _build_policy_net(self): 53 | """Build policy network""" 54 | with tf.variable_scope(self.scope): 55 | self.state_input = tf.placeholder(tf.float32, [None, self.state_size]) 56 | self.action = tf.placeholder(tf.int32, [None]) 57 | self.target = tf.placeholder(tf.float32, [None]) 58 | 59 | layer_1 = tf_utils.fc(self.state_input, self.n_hidden_1, tf.nn.relu) 60 | layer_2 = tf_utils.fc(layer_1, self.n_hidden_2, tf.nn.relu) 61 | 62 | self.value = tf_utils.fc(layer_2, 1) 63 | 64 | self.action_values = tf_utils.fc(layer_2, self.action_size) 65 | action_mask = tf.one_hot(self.action, self.action_size, 1.0, 0.0) 66 | self.action_value_pred = tf.reduce_sum(tf.nn.softmax(self.action_values) * action_mask, 1) 67 | 68 | self.action_probs = tf.nn.softmax(self.action_values) 69 | self.value_loss = tf.reduce_mean(tf.square(self.target - self.value)) 70 | self.pg_loss = tf.reduce_mean(-tf.log(self.action_value_pred) * (self.target - self.value)) 71 | self.l2_loss = tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() ]) 72 | self.loss = self.pg_loss + 5*self.value_loss + 0.002 * self.l2_loss 73 | 74 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 75 | self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step()) 76 | 77 | 78 | def get_action(self, state, sess): 79 | """Returns stochastic policy""" 80 | pi = self.get_policy(state, sess) 81 | return np.random.choice(range(self.action_size), p=pi) 82 | 83 | 84 | def get_policy(self, state, sess): 85 | """returns policy as probability distribution of actions""" 86 | pi = sess.run(self.action_probs, feed_dict={self.state_input: [state]}) 87 | return pi[0] 88 | 89 | 90 | def learn(self, episode, sess, train_epoch = 1): 91 | for t in xrange(len(episode)): 92 | self.total_steps = self.total_steps + 1 93 | target = sum([self.gamma**i * r for i, (s, a, s1, r, d) in enumerate(episode[t:])]) 94 | state, action, next_state, reward, done = episode[t] 95 | feed_dict = { self.state_input: [state], self.target: [target], self.action: [action] } 96 | _, loss, v, pg_loss, v_a = sess.run([self.train_op, self.loss, self.value, self.pg_loss, 
self.action_value_pred], feed_dict) 97 | # print target, v 98 | # print pg_loss, v, v_a, target, -np.log(v_a) * target -------------------------------------------------------------------------------- /policy_gradient/tf_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for tensorflow""" 2 | import tensorflow as tf 3 | 4 | 5 | def max_pool(x, k_sz=[2,2]): 6 | """max pooling layer wrapper 7 | Args 8 | x: 4d tensor [batch, height, width, channels] 9 | k_sz: The size of the window for each dimension of the input tensor 10 | Returns 11 | a max pooling layer 12 | """ 13 | return tf.nn.max_pool(x, ksize=[1, k_sz[0], k_sz[1], 1], strides=[1, k_sz[0], k_sz[1], 1], padding='SAME') 14 | 15 | def conv2d(x, n_kernel, k_sz, stride=1): 16 | """convolutional layer with relu activation wrapper 17 | Args: 18 | x: 4d tensor [batch, height, width, channels] 19 | n_kernel: number of kernels (output size) 20 | k_sz: 2d array, kernel size. e.g. [8,8] 21 | stride: stride 22 | Returns 23 | a conv2d layer 24 | """ 25 | W = tf.Variable(tf.random_normal([k_sz[0], k_sz[1], int(x.get_shape()[3]), n_kernel])) 26 | b = tf.Variable(tf.random_normal([n_kernel])) 27 | # - strides[0] and strides[1] must be 1 28 | # - padding can be 'VALID'(without padding) or 'SAME'(zero padding) 29 | # - http://stackoverflow.com/questions/37674306/what-is-the-difference-between-same-and-valid-padding-in-tf-nn-max-pool-of-t 30 | conv = tf.nn.conv2d(x, W, strides=[1, stride, stride, 1], padding='SAME') 31 | conv = tf.nn.bias_add(conv, b) # add bias term 32 | return tf.nn.relu(conv) # rectified linear unit: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 33 | 34 | 35 | def fc(x, n_output, activation_fn=None): 36 | """fully connected layer with relu activation wrapper 37 | Args 38 | x: 2d tensor [batch, n_input] 39 | n_output output size 40 | """ 41 | W=tf.Variable(tf.random_normal([int(x.get_shape()[1]), n_output])) 42 | b=tf.Variable(tf.random_normal([n_output])) 43 | fc1 = tf.add(tf.matmul(x, W), b) 44 | if not activation_fn == None: 45 | fc1 = activation_fn(fc1) 46 | return fc1 47 | 48 | 49 | def flatten(x): 50 | """flatten a 4d tensor into 2d 51 | Args 52 | x: 4d tensor [batch, height, width, channels] 53 | Returns a flattened 2d tensor 54 | """ 55 | return tf.reshape(x, [-1, int(x.get_shape()[1]*x.get_shape()[2]*x.get_shape()[3])]) 56 | 57 | 58 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu 2 | gym[all] 3 | scikit-image 4 | scipy 5 | numpy 6 | --------------------------------------------------------------------------------
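As a closing sketch (again not repository code) of the difference between `reinforce.py` and `reinforce_w_baseline.py`: the baseline variant weights -log pi(a_t|s_t) by the advantage G_t - V(s_t) instead of the raw return G_t, which reduces the variance of the gradient estimate without changing its expectation. A minimal sketch, with a hypothetical helper name `policy_loss_weights`:

```python
import numpy as np

def policy_loss_weights(returns, value_estimates=None):
    """Per-step weights multiplying -log pi(a_t|s_t) in the policy loss."""
    returns = np.asarray(returns, dtype=np.float64)
    if value_estimates is None:
        return returns                      # plain REINFORCE: weight is G_t
    values = np.asarray(value_estimates, dtype=np.float64)
    return returns - values                 # with baseline: weight is G_t - V(s_t)

# Same returns as in the earlier sketch; a critic that already predicts values
# close to them shrinks the weights, so the update reacts mainly to the surprise.
print(policy_loss_weights([-79.1, -89.0, -100.0]))                         # roughly [-79.1, -89.0, -100.0]
print(policy_loss_weights([-79.1, -89.0, -100.0], [-80.0, -85.0, -95.0]))  # roughly [0.9, -4.0, -5.0]
```

In `reinforce_w_baseline.py` the value estimates come from the shared network's `value` head, which `value_loss` trains toward the same Monte Carlo returns.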