├── .gitignore ├── 01_0_play_frozenlake_det.py ├── 01_1_play_frozenlake_det.py ├── 02_random_frozenkake_det.py ├── 03_0_q_table_frozenlake_det.py ├── 03_1_q_table_frozenlake_det.py ├── 03_2_q_table_frozenlake_det.py ├── 04_play_frozenlake.py ├── 05_0_q_table_frozenlake.py ├── 05_q_table_frozenlake.py ├── 06_q_net_frozenlake.py ├── 07_0_random_cartpole.py ├── 07_1_q_net_cartpole.py ├── 07_2_dqn_2013_cartpole.py ├── 07_3_dqn_2015_cartpole.py ├── 08_1_pg_cartpole.py ├── 08_2_softmax_pg_cartpole.py ├── 08_3_softmax_pg_pacman.py ├── 08_4_softmax_pg_pong.py ├── 08_4_softmax_pg_pong_y.py ├── 09_2_cross_entropy.py ├── 10_1_Actor_Critic.ipynb ├── 10_2_A3C_threads.py ├── README.md ├── assets ├── actor_critic.png └── openai_user.jpg ├── dqn.py ├── gym.ini ├── gym_uploader.py ├── mini_pacman.py ├── requirements.txt ├── tests ├── __init__.py └── test_DQN.py └── utils ├── __init__.py └── prints.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | gym-results/ 4 | pacman_log 5 | -------------------------------------------------------------------------------- /01_0_play_frozenlake_det.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.envs.registration import register 3 | 4 | # http://stackoverflow.com/questions/510357/python-read-a-single-character-from-the-user 5 | import readchar # pip3 install readchar 6 | 7 | # MACROS 8 | LEFT = 0 9 | DOWN = 1 10 | RIGHT = 2 11 | UP = 3 12 | 13 | # Key mapping 14 | arrow_keys = { 15 | '\x1b[A': UP, 16 | '\x1b[B': DOWN, 17 | '\x1b[C': RIGHT, 18 | '\x1b[D': LEFT} 19 | 20 | # Register FrozenLake with is_slippery False 21 | register( 22 | id='FrozenLake-v3', 23 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 24 | kwargs={'map_name': '4x4', 'is_slippery': False} 25 | ) 26 | 27 | env = gym.make('FrozenLake-v3') 28 | env.render() # Show the initial board 29 | 30 | while True: 31 | # Choose an action from keyboard 32 | key = readchar.readkey() 33 | if key not in arrow_keys.keys(): 34 | print("Game aborted!") 35 | break 36 | 37 | action = arrow_keys[key] 38 | state, reward, done, info = env.step(action) 39 | env.render() # Show the board after action 40 | print("State: ", state, "Action: ", action, 41 | "Reward: ", reward, "Info: ", info) 42 | 43 | if done: 44 | print("Finished with reward", reward) 45 | break 46 | -------------------------------------------------------------------------------- /01_1_play_frozenlake_det.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.envs.registration import register 3 | import sys 4 | import tty 5 | import termios 6 | 7 | 8 | class _Getch: 9 | 10 | def __call__(self): 11 | fd = sys.stdin.fileno() 12 | old_settings = termios.tcgetattr(fd) 13 | try: 14 | tty.setraw(sys.stdin.fileno()) 15 | ch = sys.stdin.read(3) 16 | finally: 17 | termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) 18 | return ch 19 | 20 | inkey = _Getch() 21 | 22 | # MACROS 23 | LEFT = 0 24 | DOWN = 1 25 | RIGHT = 2 26 | UP = 3 27 | 28 | # Key mapping 29 | arrow_keys = { 30 | '\x1b[A': UP, 31 | '\x1b[B': DOWN, 32 | '\x1b[C': RIGHT, 33 | '\x1b[D': LEFT} 34 | 35 | # Register FrozenLake with is_slippery False 36 | register( 37 | id='FrozenLake-v3', 38 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 39 | kwargs={'map_name': '4x4', 'is_slippery': False} 40 | ) 41 | 42 | env = gym.make('FrozenLake-v3') 43 | env.render() # Show the initial board 44 | 45 | while True: 46 | # Choose 
an action from keyboard 47 | key = inkey() 48 | if key not in arrow_keys.keys(): 49 | print("Game aborted!") 50 | break 51 | 52 | action = arrow_keys[key] 53 | state, reward, done, info = env.step(action) 54 | env.render() # Show the board after action 55 | print("State: ", state, "Action: ", action, 56 | "Reward: ", reward, "Info: ", info) 57 | 58 | if done: 59 | print("Finished with reward", reward) 60 | break 61 | -------------------------------------------------------------------------------- /02_random_frozenkake_det.py: -------------------------------------------------------------------------------- 1 | # https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 2 | import gym 3 | import random 4 | from gym.envs.registration import register 5 | import matplotlib.pyplot as plt 6 | 7 | register( 8 | id='FrozenLake-v3', 9 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 10 | kwargs={'map_name': '4x4', 11 | 'is_slippery': False} 12 | ) 13 | 14 | env = gym.make('FrozenLake-v0') 15 | env.render() 16 | 17 | num_episodes = 2000 18 | 19 | rList = [] 20 | for i in range(num_episodes): 21 | # Reset environment and get first new observation 22 | env.reset() 23 | rAll = 0 24 | done = False 25 | 26 | while not done: 27 | # Random action 28 | action = random.randint(0, env.action_space.n - 1) 29 | 30 | # Get new state and reward from environment 31 | _state, reward, done, _info = env.step(action) 32 | 33 | # rAll will be 1 if success, o otherwise 34 | rAll += reward 35 | 36 | rList.append(rAll) 37 | 38 | print("Success rate: " + str(sum(rList) / num_episodes)) 39 | plt.plot(rList) 40 | plt.show() 41 | -------------------------------------------------------------------------------- /03_0_q_table_frozenlake_det.py: -------------------------------------------------------------------------------- 1 | # https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 2 | 3 | import gym 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from gym.envs.registration import register 7 | import random as pr 8 | 9 | # https://gist.github.com/stober/1943451 10 | 11 | 12 | def rargmax(vector): 13 | """ Argmax that chooses randomly among eligible maximum indices. 
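    e.g. rargmax(np.array([1, 3, 3, 0])) returns index 1 or 2, each with equal probability.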
""" 14 | m = np.amax(vector) 15 | indices = np.nonzero(vector == m)[0] 16 | return pr.choice(indices) 17 | 18 | 19 | register( 20 | id='FrozenLake-v3', 21 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 22 | kwargs={'map_name': '4x4', 23 | 'is_slippery': False} 24 | ) 25 | env = gym.make('FrozenLake-v3') 26 | 27 | # Initialize table with all zeros 28 | Q = np.zeros([env.observation_space.n, env.action_space.n]) 29 | # Set learning parameters 30 | num_episodes = 2000 31 | 32 | # create lists to contain total rewards and steps per episode 33 | rList = [] 34 | for i in range(num_episodes): 35 | # Reset environment and get first new observation 36 | state = env.reset() 37 | rAll = 0 38 | done = False 39 | 40 | # The Q-Table learning algorithm 41 | while not done: 42 | action = rargmax(Q[state, :]) 43 | 44 | # Get new state and reward from environment 45 | new_state, reward, done, _ = env.step(action) 46 | 47 | # Update Q-Table with new knowledge using learning rate 48 | Q[state, action] = reward + np.max(Q[new_state, :]) 49 | 50 | rAll += reward 51 | state = new_state 52 | 53 | rList.append(rAll) 54 | 55 | print("Success rate: " + str(sum(rList) / num_episodes)) 56 | print("Final Q-Table Values") 57 | print("LEFT DOWN RIGHT UP") 58 | print(Q) 59 | plt.bar(range(len(rList)), rList, color="blue") 60 | plt.show() 61 | -------------------------------------------------------------------------------- /03_1_q_table_frozenlake_det.py: -------------------------------------------------------------------------------- 1 | # https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from gym.envs.registration import register 6 | 7 | register( 8 | id='FrozenLake-v3', 9 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 10 | kwargs={'map_name': '4x4', 11 | 'is_slippery': False} 12 | ) 13 | 14 | env = gym.make('FrozenLake-v3') 15 | 16 | # Initialize table with all zeros 17 | Q = np.zeros([env.observation_space.n, env.action_space.n]) 18 | # Discount factor 19 | dis = .99 20 | num_episodes = 2000 21 | 22 | # create lists to contain total rewards and steps per episode 23 | rList = [] 24 | 25 | for i in range(num_episodes): 26 | # Reset environment and get first new observation 27 | state = env.reset() 28 | rAll = 0 29 | done = False 30 | 31 | # The Q-Table learning algorithm 32 | while not done: 33 | # Choose an action by greedily (with noise) picking from Q table 34 | action = np.argmax(Q[state, :] + np.random.randn(1, 35 | env.action_space.n) / (i + 1)) 36 | 37 | # Get new state and reward from environment 38 | new_state, reward, done, _ = env.step(action) 39 | 40 | # Update Q-Table with new knowledge using decay rate 41 | Q[state, action] = reward + dis * np.max(Q[new_state, :]) 42 | 43 | rAll += reward 44 | state = new_state 45 | 46 | rList.append(rAll) 47 | 48 | print("Success rate: " + str(sum(rList) / num_episodes)) 49 | print("Final Q-Table Values") 50 | print(Q) 51 | plt.bar(range(len(rList)), rList, color="blue") 52 | plt.show() 53 | -------------------------------------------------------------------------------- /03_2_q_table_frozenlake_det.py: -------------------------------------------------------------------------------- 1 | # https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 2 | import gym 3 | import numpy as np 4 | import 
matplotlib.pyplot as plt 5 | from gym.envs.registration import register 6 | import random as pr 7 | 8 | register( 9 | id='FrozenLake-v3', 10 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 11 | kwargs={'map_name': '4x4', 12 | 'is_slippery': False} 13 | ) 14 | 15 | env = gym.make('FrozenLake-v3') 16 | 17 | # Initialize table with all zeros 18 | Q = np.zeros([env.observation_space.n, env.action_space.n]) 19 | # Set learning parameters 20 | dis = .99 21 | num_episodes = 2000 22 | 23 | # create lists to contain total rewards and steps per episode 24 | rList = [] 25 | for i in range(num_episodes): 26 | # Reset environment and get first new observation 27 | state = env.reset() 28 | rAll = 0 29 | done = False 30 | 31 | e = 1. / ((i // 100) + 1) # Python2&3 32 | 33 | # The Q-Table learning algorithm 34 | while not done: 35 | # Choose an action by e greedy 36 | if np.random.rand(1) < e: 37 | action = env.action_space.sample() 38 | else: 39 | action = np.argmax(Q[state, :]) 40 | 41 | # Get new state and reward from environment 42 | new_state, reward, done, _ = env.step(action) 43 | 44 | # Update Q-Table with new knowledge using learning rate 45 | Q[state, action] = reward + dis * np.max(Q[new_state, :]) 46 | 47 | rAll += reward 48 | state = new_state 49 | 50 | rList.append(rAll) 51 | 52 | print("Success rate: " + str(sum(rList) / num_episodes)) 53 | print("Final Q-Table Values") 54 | print(Q) 55 | plt.bar(range(len(rList)), rList, color="blue") 56 | plt.show() 57 | -------------------------------------------------------------------------------- /04_play_frozenlake.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import readchar 3 | 4 | import utils.prints as print_utils 5 | 6 | # MACROS 7 | LEFT = 0 8 | DOWN = 1 9 | RIGHT = 2 10 | UP = 3 11 | 12 | # Key mapping 13 | arrow_keys = {'\x1b[A': UP, '\x1b[B': DOWN, '\x1b[C': RIGHT, '\x1b[D': LEFT} 14 | 15 | # is_slippery True 16 | env = gym.make('FrozenLake-v0') 17 | 18 | env.reset() 19 | 20 | print_utils.clear_screen() 21 | env.render() # Show the initial board 22 | 23 | while True: 24 | # Choose an action from keyboard 25 | key = readchar.readkey() 26 | 27 | if key not in arrow_keys.keys(): 28 | print("Game aborted!") 29 | break 30 | 31 | action = arrow_keys[key] 32 | state, reward, done, info = env.step(action) 33 | 34 | # Show the board after action 35 | print_utils.clear_screen() 36 | env.render() 37 | 38 | print("State: {} Action: {} Reward: {} Info: {}".format( 39 | state, action, reward, info)) 40 | 41 | if done: 42 | print_utils.print_result(reward) 43 | -------------------------------------------------------------------------------- /05_0_q_table_frozenlake.py: -------------------------------------------------------------------------------- 1 | # https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 2 | 3 | import gym 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | env = gym.make('FrozenLake-v0') 8 | 9 | # Initialize table with all zeros 10 | Q = np.zeros([env.observation_space.n, env.action_space.n]) 11 | 12 | # Set learning parameters 13 | learning_rate = .85 14 | dis = .99 15 | num_episodes = 2000 16 | 17 | # create lists to contain total rewards and steps per episode 18 | rList = [] 19 | for i in range(num_episodes): 20 | # Reset environment and get first new observation 21 | state = env.reset() 22 | rAll = 0 23 | done = False 24 | 25 | # The Q-Table learning algorithm 
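    # Exploration note: the Gaussian noise added to Q below is scaled by 1 / (i + 1),
    # so action selection is close to random in early episodes and approaches a
    # plain greedy argmax as training progresses.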
26 | while not done: 27 | # Choose an action by greedily (with noise) picking from Q table 28 | action = np.argmax(Q[state, :] + np.random.randn(1, 29 | env.action_space.n) / (i + 1)) 30 | 31 | # Get new state and reward from environment 32 | new_state, reward, done, _ = env.step(action) 33 | 34 | # Update Q-Table with new knowledge using learning rate 35 | Q[state, action] = reward + dis * np.max(Q[new_state, :]) 36 | state = new_state 37 | 38 | rAll += reward 39 | 40 | rList.append(rAll) 41 | 42 | print("Score over time: " + str(sum(rList) / num_episodes)) 43 | print("Final Q-Table Values") 44 | print(Q) 45 | plt.bar(range(len(rList)), rList, color="blue") 46 | plt.show() 47 | -------------------------------------------------------------------------------- /05_q_table_frozenlake.py: -------------------------------------------------------------------------------- 1 | """ 2 | FrozenLake solver using Q-table 3 | https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 4 | """ 5 | 6 | import time 7 | 8 | import gym 9 | import numpy as np 10 | 11 | import utils.prints as print_utils 12 | 13 | N_ACTIONS = 4 14 | N_STATES = 16 15 | 16 | LEARNING_RATE = .5 17 | DISCOUNT_RATE = .98 18 | 19 | N_EPISODES = 2000 20 | 21 | def main(): 22 | """Main""" 23 | frozone_lake_env = gym.make("FrozenLake-v0") 24 | 25 | # Initialize table with all zeros 26 | Q = np.zeros([N_STATES, N_ACTIONS]) 27 | 28 | # Set learning parameters 29 | 30 | # create lists to contain total rewards and steps per episode 31 | rewards = [] 32 | 33 | for i in range(N_EPISODES): 34 | # Reset environment and get first new observation 35 | state = frozone_lake_env.reset() 36 | episode_reward = 0 37 | done = False 38 | 39 | # The Q-Table learning algorithm 40 | while not done: 41 | # Choose an action by greedily (with noise) picking from Q table 42 | noise = np.random.randn(1, N_ACTIONS) / (i + 1) 43 | action = np.argmax(Q[state, :] + noise) 44 | 45 | # Get new state and reward from environment 46 | new_state, reward, done, _ = frozone_lake_env.step(action) 47 | 48 | reward = -1 if done and reward < 1 else reward 49 | 50 | # Update Q-Table with new knowledge using learning rate 51 | Q[state, action] = ( 52 | 1 - LEARNING_RATE) * Q[state, action] + LEARNING_RATE * ( 53 | reward + DISCOUNT_RATE * np.max(Q[new_state, :])) 54 | 55 | episode_reward += reward 56 | state = new_state 57 | 58 | rewards.append(episode_reward) 59 | 60 | print("Score over time: " + str(sum(rewards) / N_EPISODES)) 61 | print("Final Q-Table Values") 62 | 63 | for i in range(10): 64 | # Reset environment and get first new observation 65 | state = frozone_lake_env.reset() 66 | episode_reward = 0 67 | done = False 68 | 69 | # The Q-Table learning algorithm 70 | while not done: 71 | # Choose an action by greedily (with noise) picking from Q table 72 | action = np.argmax(Q[state, :]) 73 | 74 | # Get new state and reward from environment 75 | new_state, reward, done, _ = frozone_lake_env.step(action) 76 | print_utils.clear_screen() 77 | frozone_lake_env.render() 78 | time.sleep(.1) 79 | 80 | episode_reward += reward 81 | state = new_state 82 | 83 | if done: 84 | print("Episode Reward: {}".format(episode_reward)) 85 | print_utils.print_result(episode_reward) 86 | 87 | rewards.append(episode_reward) 88 | 89 | frozone_lake_env.close() 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- 
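A minimal standalone sketch of the interpolated tabular update used in 05_q_table_frozenlake.py above (the helper name `q_update` and its defaults are illustrative, not part of the repo):

import numpy as np

def q_update(Q, state, action, reward, next_state, lr=0.5, gamma=0.98):
    # One tabular Q-learning step:
    # Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + gamma * max_a' Q(s', a'))
    target = reward + gamma * np.max(Q[next_state, :])
    Q[state, action] = (1 - lr) * Q[state, action] + lr * target
    return Q

# Example: with Q = np.zeros([16, 4]), q_update(Q, 0, 2, 1.0, 1) sets Q[0, 2] to 0.5.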
/06_q_net_frozenlake.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on 3 | https://github.com/hunkim/DeepRL-Agents 4 | ''' 5 | import gym 6 | import numpy as np 7 | import tensorflow as tf 8 | import matplotlib.pyplot as plt 9 | 10 | env = gym.make('FrozenLake-v0') 11 | 12 | # Input and output size based on the Env 13 | input_size = env.observation_space.n 14 | output_size = env.action_space.n 15 | learning_rate = 0.1 16 | 17 | # These lines establish the feed-forward part of the network used to 18 | # choose actions 19 | X = tf.placeholder(shape=[1, input_size], dtype=tf.float32) # state input 20 | W = tf.Variable(tf.random_uniform( 21 | [input_size, output_size], 0, 0.01)) # weight 22 | 23 | Qpred = tf.matmul(X, W) # Out Q prediction 24 | Y = tf.placeholder(shape=[1, output_size], dtype=tf.float32) # Y label 25 | 26 | loss = tf.reduce_sum(tf.square(Y - Qpred)) 27 | train = tf.train.GradientDescentOptimizer( 28 | learning_rate=learning_rate).minimize(loss) 29 | 30 | # Set Q-learning related parameters 31 | dis = .99 32 | num_episodes = 2000 33 | 34 | # Create lists to contain total rewards and steps per episode 35 | rList = [] 36 | 37 | 38 | def one_hot(x): 39 | return np.identity(16)[x:x + 1] 40 | 41 | init = tf.global_variables_initializer() 42 | with tf.Session() as sess: 43 | sess.run(init) 44 | for i in range(num_episodes): 45 | # Reset environment and get first new observation 46 | s = env.reset() 47 | e = 1. / ((i / 50) + 10) 48 | rAll = 0 49 | done = False 50 | local_loss = [] 51 | 52 | # The Q-Network training 53 | while not done: 54 | # Choose an action by greedily (with e chance of random action) 55 | # from the Q-network 56 | Qs = sess.run(Qpred, feed_dict={X: one_hot(s)}) 57 | if np.random.rand(1) < e: 58 | a = env.action_space.sample() 59 | else: 60 | a = np.argmax(Qs) 61 | 62 | # Get new state and reward from environment 63 | s1, reward, done, _ = env.step(a) 64 | if done: 65 | # Update Q, and no Qs+1, since it's a terminal state 66 | Qs[0, a] = reward 67 | else: 68 | # Obtain the Q_s1 values by feeding the new state through our 69 | # network 70 | Qs1 = sess.run(Qpred, feed_dict={X: one_hot(s1)}) 71 | # Update Q 72 | Qs[0, a] = reward + dis * np.max(Qs1) 73 | 74 | # Train our network using target (Y) and predicted Q (Qpred) values 75 | sess.run(train, feed_dict={X: one_hot(s), Y: Qs}) 76 | 77 | rAll += reward 78 | s = s1 79 | rList.append(rAll) 80 | 81 | print("Percent of successful episodes: " + 82 | str(sum(rList) / num_episodes) + "%") 83 | plt.bar(range(len(rList)), rList, color="blue") 84 | plt.show() 85 | -------------------------------------------------------------------------------- /07_0_random_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | env = gym.make('CartPole-v0') 4 | env.reset() 5 | random_episodes = 0 6 | reward_sum = 0 7 | while random_episodes < 10: 8 | env.render() 9 | action = env.action_space.sample() 10 | observation, reward, done, _ = env.step(action) 11 | print(observation, reward, done) 12 | reward_sum += reward 13 | if done: 14 | random_episodes += 1 15 | print("Reward for this episode was:", reward_sum) 16 | reward_sum = 0 17 | env.reset() 18 | -------------------------------------------------------------------------------- /07_1_q_net_cartpole.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on 3 | https://github.com/hunkim/DeepRL-Agents 4 | 
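Q-Network for CartPole-v0: a single linear layer maps the 4-dimensional observation to Q-values for the 2 actions and is trained by regressing onto bootstrapped Q targets with a squared-error loss.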
''' 5 | import numpy as np 6 | import tensorflow as tf 7 | from collections import deque 8 | 9 | import gym 10 | env = gym.make('CartPole-v0') 11 | 12 | # Constants defining our neural network 13 | learning_rate = 1e-1 14 | input_size = env.observation_space.shape[0] 15 | output_size = env.action_space.n 16 | 17 | X = tf.placeholder(tf.float32, [None, input_size], name="input_x") 18 | 19 | # First layer of weights 20 | W1 = tf.get_variable("W1", shape=[input_size, output_size], 21 | initializer=tf.contrib.layers.xavier_initializer()) 22 | Qpred = tf.matmul(X, W1) 23 | 24 | # We need to define the parts of the network needed for learning a policy 25 | Y = tf.placeholder(shape=[None, output_size], dtype=tf.float32) 26 | 27 | # Loss function 28 | loss = tf.reduce_sum(tf.square(Y - Qpred)) 29 | # Learning 30 | train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) 31 | 32 | # Values for q learning 33 | max_episodes = 5000 34 | dis = 0.9 35 | step_history = [] 36 | 37 | 38 | # Setting up our environment 39 | init = tf.global_variables_initializer() 40 | sess = tf.Session() 41 | sess.run(init) 42 | 43 | for episode in range(max_episodes): 44 | e = 1. / ((episode / 10) + 1) 45 | step_count = 0 46 | state = env.reset() 47 | done = False 48 | 49 | # The Q-Network training 50 | while not done: 51 | step_count += 1 52 | x = np.reshape(state, [1, input_size]) 53 | # Choose an action by greedily (with e chance of random action) from 54 | # the Q-network 55 | Q = sess.run(Qpred, feed_dict={X: x}) 56 | if np.random.rand(1) < e: 57 | action = env.action_space.sample() 58 | else: 59 | action = np.argmax(Q) 60 | 61 | # Get new state and reward from environment 62 | next_state, reward, done, _ = env.step(action) 63 | if done: 64 | Q[0, action] = -100 65 | else: 66 | x_next = np.reshape(next_state, [1, input_size]) 67 | # Obtain the Q' values by feeding the new state through our network 68 | Q_next = sess.run(Qpred, feed_dict={X: x_next}) 69 | Q[0, action] = reward + dis * np.max(Q_next) 70 | 71 | # Train our network using target and predicted Q values on each episode 72 | sess.run(train, feed_dict={X: x, Y: Q}) 73 | state = next_state 74 | 75 | step_history.append(step_count) 76 | print("Episode: {} steps: {}".format(episode, step_count)) 77 | # If last 10's avg steps are 500, it's good enough 78 | if len(step_history) > 10 and np.mean(step_history[-10:]) > 500: 79 | break 80 | 81 | # See our trained network in action 82 | observation = env.reset() 83 | reward_sum = 0 84 | while True: 85 | env.render() 86 | 87 | x = np.reshape(observation, [1, input_size]) 88 | Q = sess.run(Qpred, feed_dict={X: x}) 89 | action = np.argmax(Q) 90 | 91 | observation, reward, done, _ = env.step(action) 92 | reward_sum += reward 93 | if done: 94 | print("Total score: {}".format(reward_sum)) 95 | break 96 | -------------------------------------------------------------------------------- /07_2_dqn_2013_cartpole.py: -------------------------------------------------------------------------------- 1 | """ 2 | DQN (NIPS 2013) 3 | 4 | Playing Atari with Deep Reinforcement Learning 5 | https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 6 | """ 7 | import numpy as np 8 | import tensorflow as tf 9 | import random 10 | import dqn 11 | import gym 12 | from collections import deque 13 | 14 | env = gym.make('CartPole-v0') 15 | env = gym.wrappers.Monitor(env, 'gym-results/', force=True) 16 | INPUT_SIZE = env.observation_space.shape[0] 17 | OUTPUT_SIZE = env.action_space.n 18 | 19 | DISCOUNT_RATE = 0.99 20 | REPLAY_MEMORY = 50000 
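# Replay buffer capacity: a deque that evicts the oldest (s, a, r, s', done) tuples once 50,000 are stored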
21 | MAX_EPISODE = 5000 22 | BATCH_SIZE = 64 23 | 24 | # minimum epsilon for epsilon greedy 25 | MIN_E = 0.0 26 | # epsilon will be `MIN_E` at `EPSILON_DECAYING_EPISODE` 27 | EPSILON_DECAYING_EPISODE = MAX_EPISODE * 0.01 28 | 29 | 30 | def bot_play(mainDQN: dqn.DQN) -> None: 31 | """Runs a single episode with rendering and prints a reward 32 | 33 | Args: 34 | mainDQN (dqn.DQN): DQN Agent 35 | """ 36 | state = env.reset() 37 | total_reward = 0 38 | 39 | while True: 40 | env.render() 41 | action = np.argmax(mainDQN.predict(state)) 42 | state, reward, done, _ = env.step(action) 43 | total_reward += reward 44 | if done: 45 | print("Total score: {}".format(total_reward)) 46 | break 47 | 48 | 49 | def train_minibatch(DQN: dqn.DQN, train_batch: list) -> float: 50 | """Prepare X_batch, y_batch and train them 51 | 52 | Recall our loss function is 53 | target = reward + discount * max Q(s',a) 54 | or reward if done early 55 | 56 | Loss function: [target - Q(s, a)]^2 57 | 58 | Hence, 59 | 60 | X_batch is a state list 61 | y_batch is reward + discount * max Q 62 | or reward if terminated early 63 | 64 | Args: 65 | DQN (dqn.DQN): DQN Agent to train & run 66 | train_batch (list): Minibatch of Replay memory 67 | Eeach element is a tuple of (s, a, r, s', done) 68 | 69 | Returns: 70 | loss: Returns a loss 71 | 72 | """ 73 | state_array = np.vstack([x[0] for x in train_batch]) 74 | action_array = np.array([x[1] for x in train_batch]) 75 | reward_array = np.array([x[2] for x in train_batch]) 76 | next_state_array = np.vstack([x[3] for x in train_batch]) 77 | done_array = np.array([x[4] for x in train_batch]) 78 | 79 | X_batch = state_array 80 | y_batch = DQN.predict(state_array) 81 | 82 | Q_target = reward_array + DISCOUNT_RATE * np.max(DQN.predict(next_state_array), axis=1) * ~done_array 83 | y_batch[np.arange(len(X_batch)), action_array] = Q_target 84 | 85 | # Train our network using target and predicted Q values on each episode 86 | loss, _ = DQN.update(X_batch, y_batch) 87 | 88 | return loss 89 | 90 | 91 | def annealing_epsilon(episode: int, min_e: float, max_e: float, target_episode: int) -> float: 92 | """Return an linearly annealed epsilon 93 | 94 | Epsilon will decrease over time until it reaches `target_episode` 95 | 96 | (epsilon) 97 | | 98 | max_e ---|\ 99 | | \ 100 | | \ 101 | | \ 102 | min_e ---|____\_______________(episode) 103 | | 104 | target_episode 105 | 106 | slope = (min_e - max_e) / (target_episode) 107 | intercept = max_e 108 | 109 | e = slope * episode + intercept 110 | 111 | Args: 112 | episode (int): Current episode 113 | min_e (float): Minimum epsilon 114 | max_e (float): Maximum epsilon 115 | target_episode (int): epsilon becomes the `min_e` at `target_episode` 116 | 117 | Returns: 118 | float: epsilon between `min_e` and `max_e` 119 | """ 120 | 121 | slope = (min_e - max_e) / (target_episode) 122 | intercept = max_e 123 | 124 | return max(min_e, slope * episode + intercept) 125 | 126 | 127 | def main(): 128 | # store the previous observations in replay memory 129 | replay_buffer = deque(maxlen=REPLAY_MEMORY) 130 | last_100_game_reward = deque(maxlen=100) 131 | 132 | with tf.Session() as sess: 133 | mainDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE) 134 | init = tf.global_variables_initializer() 135 | sess.run(init) 136 | 137 | for episode in range(MAX_EPISODE): 138 | e = annealing_epsilon(episode, MIN_E, 1.0, EPSILON_DECAYING_EPISODE) 139 | done = False 140 | state = env.reset() 141 | 142 | step_count = 0 143 | while not done: 144 | 145 | if np.random.rand() < e: 146 | action = 
env.action_space.sample() 147 | else: 148 | action = np.argmax(mainDQN.predict(state)) 149 | 150 | next_state, reward, done, _ = env.step(action) 151 | 152 | if done: 153 | reward = -1 154 | 155 | replay_buffer.append((state, action, reward, next_state, done)) 156 | 157 | state = next_state 158 | step_count += 1 159 | 160 | if len(replay_buffer) > BATCH_SIZE: 161 | minibatch = random.sample(replay_buffer, BATCH_SIZE) 162 | train_minibatch(mainDQN, minibatch) 163 | 164 | print("[Episode {:>5}] steps: {:>5} e: {:>5.2f}".format(episode, step_count, e)) 165 | 166 | # CartPole-v0 Game Clear Logic 167 | last_100_game_reward.append(step_count) 168 | if len(last_100_game_reward) == last_100_game_reward.maxlen: 169 | avg_reward = np.mean(last_100_game_reward) 170 | if avg_reward > 199.0: 171 | print("Game Cleared within {} episodes with avg reward {}".format(episode, avg_reward)) 172 | break 173 | 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /07_3_dqn_2015_cartpole.py: -------------------------------------------------------------------------------- 1 | """ 2 | Double DQN (Nature 2015) 3 | http://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf 4 | 5 | Notes: 6 | The difference is that now there are two DQNs (DQN & Target DQN) 7 | 8 | y_i = r_i + 𝛾 * max(Q(next_state, action; 𝜃_target)) 9 | 10 | Loss: (y_i - Q(state, action; 𝜃))^2 11 | 12 | Every C step, 𝜃_target <- 𝜃 13 | 14 | """ 15 | import numpy as np 16 | import tensorflow as tf 17 | import random 18 | from collections import deque 19 | import dqn 20 | 21 | import gym 22 | from typing import List 23 | 24 | env = gym.make('CartPole-v0') 25 | env = gym.wrappers.Monitor(env, directory="gym-results/", force=True) 26 | 27 | # Constants defining our neural network 28 | INPUT_SIZE = env.observation_space.shape[0] 29 | OUTPUT_SIZE = env.action_space.n 30 | 31 | DISCOUNT_RATE = 0.99 32 | REPLAY_MEMORY = 50000 33 | BATCH_SIZE = 64 34 | TARGET_UPDATE_FREQUENCY = 5 35 | MAX_EPISODES = 5000 36 | 37 | 38 | def replay_train(mainDQN: dqn.DQN, targetDQN: dqn.DQN, train_batch: list) -> float: 39 | """Trains `mainDQN` with target Q values given by `targetDQN` 40 | 41 | Args: 42 | mainDQN (dqn.DQN): Main DQN that will be trained 43 | targetDQN (dqn.DQN): Target DQN that will predict Q_target 44 | train_batch (list): Minibatch of replay memory 45 | Each element is (s, a, r, s', done) 46 | [(state, action, reward, next_state, done), ...] 
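            `done` is a bool; for terminal transitions the target reduces to the immediate reward (no bootstrap term).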
47 | 48 | Returns: 49 | float: After updating `mainDQN`, it returns a `loss` 50 | """ 51 | states = np.vstack([x[0] for x in train_batch]) 52 | actions = np.array([x[1] for x in train_batch]) 53 | rewards = np.array([x[2] for x in train_batch]) 54 | next_states = np.vstack([x[3] for x in train_batch]) 55 | done = np.array([x[4] for x in train_batch]) 56 | 57 | X = states 58 | 59 | Q_target = rewards + DISCOUNT_RATE * np.max(targetDQN.predict(next_states), axis=1) * ~done 60 | 61 | y = mainDQN.predict(states) 62 | y[np.arange(len(X)), actions] = Q_target 63 | 64 | # Train our network using target and predicted Q values on each episode 65 | return mainDQN.update(X, y) 66 | 67 | 68 | def get_copy_var_ops(*, dest_scope_name: str, src_scope_name: str) -> List[tf.Operation]: 69 | """Creates TF operations that copy weights from `src_scope` to `dest_scope` 70 | 71 | Args: 72 | dest_scope_name (str): Destination weights (copy to) 73 | src_scope_name (str): Source weight (copy from) 74 | 75 | Returns: 76 | List[tf.Operation]: Update operations are created and returned 77 | """ 78 | # Copy variables src_scope to dest_scope 79 | op_holder = [] 80 | 81 | src_vars = tf.get_collection( 82 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name) 83 | dest_vars = tf.get_collection( 84 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name) 85 | 86 | for src_var, dest_var in zip(src_vars, dest_vars): 87 | op_holder.append(dest_var.assign(src_var.value())) 88 | 89 | return op_holder 90 | 91 | 92 | def bot_play(mainDQN: dqn.DQN, env: gym.Env) -> None: 93 | """Test runs with rendering and prints the total score 94 | 95 | Args: 96 | mainDQN (dqn.DQN): DQN agent to run a test 97 | env (gym.Env): Gym Environment 98 | """ 99 | state = env.reset() 100 | reward_sum = 0 101 | 102 | while True: 103 | 104 | env.render() 105 | action = np.argmax(mainDQN.predict(state)) 106 | state, reward, done, _ = env.step(action) 107 | reward_sum += reward 108 | 109 | if done: 110 | print("Total score: {}".format(reward_sum)) 111 | break 112 | 113 | 114 | def main(): 115 | # store the previous observations in replay memory 116 | replay_buffer = deque(maxlen=REPLAY_MEMORY) 117 | 118 | last_100_game_reward = deque(maxlen=100) 119 | 120 | with tf.Session() as sess: 121 | mainDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="main") 122 | targetDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="target") 123 | sess.run(tf.global_variables_initializer()) 124 | 125 | # initial copy q_net -> target_net 126 | copy_ops = get_copy_var_ops(dest_scope_name="target", 127 | src_scope_name="main") 128 | sess.run(copy_ops) 129 | 130 | for episode in range(MAX_EPISODES): 131 | e = 1. 
/ ((episode / 10) + 1) 132 | done = False 133 | step_count = 0 134 | state = env.reset() 135 | 136 | while not done: 137 | if np.random.rand() < e: 138 | action = env.action_space.sample() 139 | else: 140 | # Choose an action by greedily from the Q-network 141 | action = np.argmax(mainDQN.predict(state)) 142 | 143 | # Get new state and reward from environment 144 | next_state, reward, done, _ = env.step(action) 145 | 146 | if done: # Penalty 147 | reward = -1 148 | 149 | # Save the experience to our buffer 150 | replay_buffer.append((state, action, reward, next_state, done)) 151 | 152 | if len(replay_buffer) > BATCH_SIZE: 153 | minibatch = random.sample(replay_buffer, BATCH_SIZE) 154 | loss, _ = replay_train(mainDQN, targetDQN, minibatch) 155 | 156 | if step_count % TARGET_UPDATE_FREQUENCY == 0: 157 | sess.run(copy_ops) 158 | 159 | state = next_state 160 | step_count += 1 161 | 162 | print("Episode: {} steps: {}".format(episode, step_count)) 163 | 164 | # CartPole-v0 Game Clear Checking Logic 165 | last_100_game_reward.append(step_count) 166 | 167 | if len(last_100_game_reward) == last_100_game_reward.maxlen: 168 | avg_reward = np.mean(last_100_game_reward) 169 | 170 | if avg_reward > 199: 171 | print(f"Game Cleared in {episode} episodes with avg reward {avg_reward}") 172 | break 173 | 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /08_1_pg_cartpole.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on: 3 | https://github.com/hunkim/DeepRL-Agents 4 | http://karpathy.github.io/2016/05/31/rl/ 5 | ''' 6 | import numpy as np 7 | import tensorflow as tf 8 | import gym 9 | 10 | env = gym.make('CartPole-v0') 11 | 12 | hidden_layer_neurons = 24 13 | learning_rate = 1e-2 14 | 15 | # Constants defining our neural network 16 | input_size = env.observation_space.shape[0] 17 | output_size = 1 # logistic regression, one p output 18 | 19 | X = tf.placeholder(tf.float32, [None, input_size], name="input_x") 20 | 21 | # First layer of weights 22 | W1 = tf.get_variable("W1", shape=[input_size, hidden_layer_neurons], 23 | initializer=tf.contrib.layers.xavier_initializer()) 24 | layer1 = tf.nn.relu(tf.matmul(X, W1)) 25 | 26 | # Second layer of weights 27 | W2 = tf.get_variable("W2", shape=[hidden_layer_neurons, output_size], 28 | initializer=tf.contrib.layers.xavier_initializer()) 29 | action_pred = tf.nn.sigmoid(tf.matmul(layer1, W2)) 30 | 31 | # Y (fake) and advantages (rewards) 32 | Y = tf.placeholder(tf.float32, [None, output_size], name="input_y") 33 | advantages = tf.placeholder(tf.float32, name="reward_signal") 34 | 35 | # Loss function: log_likelihood * advantages 36 | #log_lik = -tf.log(Y * action_pred + (1 - Y) * (1 - action_pred)) # using author(awjuliani)'s original cost function (maybe log_likelihood) 37 | log_lik = -Y*tf.log(action_pred) - (1 - Y)*tf.log(1 - action_pred) # using logistic regression cost function 38 | loss = tf.reduce_sum(log_lik * advantages) 39 | 40 | # Learning 41 | train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) 42 | 43 | 44 | def discount_rewards(r, gamma=0.99): 45 | """ take 1D float array of rewards and compute discounted reward """ 46 | discounted_r = np.zeros_like(r, dtype=np.float32) 47 | running_add = 0 48 | for t in reversed(range(len(r))): 49 | running_add = running_add * gamma + r[t] 50 | discounted_r[t] = running_add 51 | 52 | return discounted_r 53 | 54 | # Testing Code 55 | # It's 
always recommended to test your code 56 | input = [1, 1, 1] 57 | output = discount_rewards(input) 58 | expect = [1 + 0.99 + 0.99**2, 1 + 0.99, 1] 59 | np.testing.assert_almost_equal(output, expect) 60 | 61 | 62 | 63 | # Setting up our environment 64 | sess = tf.Session() 65 | sess.run(tf.global_variables_initializer()) 66 | 67 | max_num_episodes = 500 68 | 69 | # This list will contain episode rewards from the most recent 100 games 70 | # Clear Condition: Average reward per episode >= 195.0 over 100 games 71 | EPISODE_100_REWARD_LIST = [] 72 | for step in range(max_num_episodes): 73 | # Initialize x stack, y stack, and rewards 74 | xs = np.empty(shape=[0, input_size]) 75 | ys = np.empty(shape=[0, 1]) 76 | rewards = np.empty(shape=[0, 1]) 77 | 78 | reward_sum = 0 79 | observation = env.reset() 80 | 81 | while True: 82 | x = np.reshape(observation, [1, input_size]) 83 | 84 | # Run the neural net to determine output 85 | action_prob = sess.run(action_pred, feed_dict={X: x}) 86 | 87 | # Determine the output based on our net, allowing for some randomness 88 | action = 0 if action_prob < np.random.uniform() else 1 89 | 90 | # Append the observations and outputs for learning 91 | xs = np.vstack([xs, x]) 92 | ys = np.vstack([ys, action]) # Fake action 93 | 94 | # Determine the outcome of our action 95 | observation, reward, done, _ = env.step(action) 96 | rewards = np.vstack([rewards, reward]) 97 | reward_sum += reward 98 | 99 | if done: 100 | # Determine standardized rewards 101 | discounted_rewards = discount_rewards(rewards) 102 | # Normalization 103 | discounted_rewards = (discounted_rewards - discounted_rewards.mean())/(discounted_rewards.std() + 1e-7) 104 | l, _ = sess.run([loss, train], 105 | feed_dict={X: xs, Y: ys, advantages: discounted_rewards}) 106 | 107 | EPISODE_100_REWARD_LIST.append(reward_sum) 108 | if len(EPISODE_100_REWARD_LIST) > 100: 109 | EPISODE_100_REWARD_LIST = EPISODE_100_REWARD_LIST[1:] 110 | break 111 | 112 | # Print status 113 | print(f"[Episode {step:>5d}] Reward: {reward_sum:>4} Loss: {l:>10.5f}") 114 | 115 | if np.mean(EPISODE_100_REWARD_LIST) >= 195: 116 | print(f"Game Cleared within {step} steps with the average reward: {np.mean(EPISODE_100_REWARD_LIST)}") 117 | break 118 | 119 | # See our trained bot in action 120 | observation = env.reset() 121 | reward_sum = 0 122 | 123 | while True: 124 | env.render() 125 | x = np.reshape(observation, [1, input_size]) 126 | action_prob = sess.run(action_pred, feed_dict={X: x}) 127 | action = 0 if action_prob < 0.5 else 1 # No randomness 128 | observation, reward, done, _ = env.step(action) 129 | reward_sum += reward 130 | if done: 131 | print("Total score: {}".format(reward_sum)) 132 | break 133 | 134 | sess.close() -------------------------------------------------------------------------------- /08_2_softmax_pg_cartpole.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on: 3 | https://github.com/hunkim/DeepRL-Agents 4 | http://karpathy.github.io/2016/05/31/rl/ 5 | ''' 6 | import numpy as np 7 | import tensorflow as tf 8 | import gym 9 | 10 | env = gym.make('CartPole-v0') 11 | 12 | hidden_layer_neurons = 24 13 | learning_rate = 1e-2 14 | gamma = .99 15 | 16 | # Constants defining our neural network 17 | input_size = env.observation_space.shape[0] 18 | output_size = env.action_space.n 19 | 20 | X = tf.placeholder(tf.float32, [None, input_size], name="input_x") 21 | 22 | # First layer of weights 23 | W1 = tf.get_variable("W1", shape=[input_size, 
hidden_layer_neurons], 24 | initializer=tf.contrib.layers.xavier_initializer()) 25 | layer1 = tf.nn.relu(tf.matmul(X, W1)) 26 | 27 | # Second layer of weights 28 | W2 = tf.get_variable("W2", shape=[hidden_layer_neurons, output_size], 29 | initializer=tf.contrib.layers.xavier_initializer()) 30 | action_pred = tf.nn.softmax(tf.matmul(layer1, W2)) 31 | 32 | # We need to define the parts of the network needed for learning a policy 33 | Y = tf.placeholder(tf.float32, [None, output_size], name="input_y") 34 | advantages = tf.placeholder(tf.float32, name="reward_signal") 35 | 36 | print(Y, action_pred) 37 | # Loss function, ∑ Ai*logp(yi∣xi), but we need fake lable Y due to autodiff 38 | log_lik = -Y * tf.log(action_pred) 39 | log_lik_adv = log_lik * advantages 40 | loss = tf.reduce_mean(tf.reduce_sum(log_lik_adv, axis=1)) 41 | 42 | # Learning 43 | train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) 44 | 45 | 46 | def discount_rewards(r, gamma=0.99): 47 | """ take 1D float array of rewards and compute discounted reward """ 48 | discounted_r = np.zeros_like(r, dtype=np.float32) 49 | running_add = 0 50 | for t in reversed(range(len(r))): 51 | running_add = running_add * gamma + r[t] 52 | discounted_r[t] = running_add 53 | 54 | return discounted_r 55 | 56 | 57 | # Setting up our environment 58 | sess = tf.Session() 59 | sess.run(tf.global_variables_initializer()) 60 | 61 | num_episodes = 1000 62 | # This list will contain episode rewards from the most recent 100 games 63 | # Clear Condition: Average reward per episode >= 195.0 over 100 games 64 | EPISODE_100_REWARD_LIST = [] 65 | for i in range(num_episodes): 66 | 67 | # Clear out game variables 68 | xs = np.empty(shape=[0, input_size]) 69 | ys = np.empty(shape=[0, output_size]) 70 | rewards = np.empty(shape=[0, 1]) 71 | 72 | reward_sum = 0 73 | state = env.reset() 74 | 75 | while True: 76 | # Append the observations to our batch 77 | x = np.reshape(state, [1, input_size]) 78 | 79 | # Run the neural net to determine output 80 | action_prob = sess.run(action_pred, feed_dict={X: x}) 81 | action = np.random.choice(np.arange(output_size), p=action_prob[0]) 82 | 83 | # Append the observations and outputs for learning 84 | xs = np.vstack([xs, x]) 85 | y = np.zeros(output_size) 86 | y[action] = 1 87 | 88 | ys = np.vstack([ys, y]) 89 | 90 | # Determine the outcome of our action 91 | state, reward, done, _ = env.step(action) 92 | reward_sum += reward 93 | rewards = np.vstack([rewards, reward]) 94 | 95 | if done: 96 | # Determine standardized rewards 97 | discounted_rewards = discount_rewards(rewards, gamma) 98 | # Normalization 99 | discounted_rewards = (discounted_rewards - discounted_rewards.mean())/(discounted_rewards.std() + 1e-7) 100 | ll, la, l, _ = sess.run([log_lik, log_lik_adv, loss, train], feed_dict={X: xs, 101 | Y: ys, 102 | advantages: discounted_rewards}) 103 | # print values for debugging 104 | # print(1, ll, la) 105 | EPISODE_100_REWARD_LIST.append(reward_sum) 106 | if len(EPISODE_100_REWARD_LIST) > 100: 107 | EPISODE_100_REWARD_LIST = EPISODE_100_REWARD_LIST[1:] 108 | break 109 | 110 | 111 | # Print status 112 | print(f"[Episode {i:>}] Reward: {reward_sum:>4} Loss: {l:>5.5}") 113 | 114 | if np.mean(EPISODE_100_REWARD_LIST) >= 195.0: 115 | print(f"Game Cleared within {i} steps with the average reward: {np.mean(EPISODE_100_REWARD_LIST)}") 116 | break 117 | 118 | 119 | 120 | state = env.reset() 121 | reward_sum = 0 122 | 123 | while True: 124 | env.render() 125 | 126 | x = np.reshape(state, [1, input_size]) 127 | 
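    # Greedy evaluation: take the argmax action rather than sampling from the policy as during training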
action_prob = sess.run(action_pred, feed_dict={X: x}) 128 | action = np.argmax(action_prob) 129 | state, reward, done, _ = env.step(action) 130 | reward_sum += reward 131 | if done: 132 | print("Total score: {}".format(reward_sum)) 133 | break 134 | 135 | sess.close() -------------------------------------------------------------------------------- /08_3_softmax_pg_pacman.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on: 3 | https://github.com/hunkim/DeepRL-Agents 4 | http://karpathy.github.io/2016/05/31/rl/ 5 | ''' 6 | import numpy as np 7 | import tensorflow as tf 8 | import os 9 | # by Jin Kim (golbin) https://github.com/golbin/TensorFlow-Tutorials 10 | import mini_pacman 11 | 12 | env = mini_pacman.Gym() 13 | 14 | hidden_layer_neurons = 64 15 | learning_rate = 1e-3 16 | gamma = .99 17 | 18 | LOG_DIR = './pacman_log' 19 | 20 | # Constants defining our neural network 21 | input_size = 240 22 | output_size = 3 23 | 24 | X = tf.placeholder(tf.float32, [None, input_size], name="input_x") 25 | x_image = tf.reshape(X, [-1, 6, 10, 1]) 26 | tf.summary.image('input', x_image, 8) 27 | 28 | # First layer of weights 29 | with tf.name_scope("layer1"): 30 | W1 = tf.get_variable("W1", shape=[input_size, hidden_layer_neurons], 31 | initializer=tf.contrib.layers.xavier_initializer()) 32 | B1 = tf.Variable(tf.zeros([hidden_layer_neurons])) 33 | layer1 = tf.matmul(X, W1) + B1 34 | layer1_act = tf.nn.tanh(layer1) 35 | tf.summary.histogram("X", X) 36 | tf.summary.histogram("weights", W1) 37 | tf.summary.histogram("bias", B1) 38 | tf.summary.histogram("layer", layer1) 39 | tf.summary.histogram("activations", layer1_act) 40 | 41 | 42 | # Second layer of weights 43 | with tf.name_scope("layer2"): 44 | W2 = tf.get_variable("W2", shape=[hidden_layer_neurons, output_size], 45 | initializer=tf.contrib.layers.xavier_initializer()) 46 | B2 = tf.Variable(tf.zeros([output_size])) 47 | layer2 = tf.matmul(layer1_act, W2) + B2 48 | action_pred = tf.nn.softmax(layer2) 49 | tf.summary.histogram("weights", W2) 50 | tf.summary.histogram("bias", B2) 51 | tf.summary.histogram("layer", layer2) 52 | tf.summary.histogram("action_pred", action_pred) 53 | 54 | # We need to define the parts of the network needed for learning a policy 55 | Y = tf.placeholder(tf.float32, [None, output_size], name="input_y") 56 | advantages = tf.placeholder(tf.float32, name="reward_signal") 57 | 58 | # Loss function 59 | # Sum (Ai*logp(yi|xi)) 60 | log_lik = -Y * tf.log(action_pred) 61 | log_like_adv = log_lik * advantages 62 | loss = tf.reduce_mean(tf.reduce_sum(log_like_adv)) 63 | tf.summary.scalar("Q", tf.reduce_mean(action_pred)) 64 | tf.summary.scalar("Y", tf.reduce_mean(Y)) 65 | tf.summary.scalar("log_likelihood", tf.reduce_mean(log_lik)) 66 | tf.summary.scalar("loss", loss) 67 | 68 | # Learning 69 | train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) 70 | 71 | # Some place holders for summary 72 | summary_reward = tf.placeholder(tf.float32, shape=(), name="reward") 73 | tf.summary.scalar("reward", summary_reward) 74 | 75 | # Summary 76 | summary = tf.summary.merge_all() 77 | 78 | 79 | def discount_rewards(r, gamma=0.99): 80 | """ take 1D float array of rewards and compute discounted reward """ 81 | discounted_r = np.zeros_like(r) 82 | running_add = 0 83 | for t in reversed(range(0, r.size)): 84 | if r[t] != 0: 85 | # reset the sum, since this was a game boundary (pong specific!) 
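                # (as written, any nonzero reward resets the running discounted sum before it is carried backwards)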
86 | running_add = 0 87 | running_add = running_add * gamma + r[t] 88 | discounted_r[t] = running_add 89 | 90 | # compute the discounted reward backwards through time 91 | # standardize the rewards to be unit normal (helps control the gradient 92 | # estimator variance) 93 | discounted_r -= np.mean(discounted_r) 94 | discounted_r /= np.std(discounted_r) 95 | 96 | return discounted_r 97 | 98 | # Setting up our environment 99 | sess = tf.Session() 100 | rendering = False 101 | sess.run(tf.global_variables_initializer()) 102 | 103 | # TensorBoard 104 | writer = tf.summary.FileWriter(LOG_DIR) 105 | writer.add_graph(sess.graph) 106 | 107 | # Savor and Restore 108 | saver = tf.train.Saver() 109 | checkpoint = tf.train.get_checkpoint_state(LOG_DIR) 110 | if checkpoint and checkpoint.model_checkpoint_path: 111 | try: 112 | saver.restore(sess, checkpoint.model_checkpoint_path) 113 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 114 | except: 115 | print("Error on loading old network weights") 116 | else: 117 | print("Could not find old network weights") 118 | 119 | global_step = 0 120 | while True: 121 | global_step += 1 122 | 123 | # Clear out game variables 124 | xs = np.empty(0).reshape(0, input_size) 125 | ys = np.empty(0).reshape(0, output_size) 126 | rewards = np.empty(0).reshape(0, 1) 127 | 128 | reward_sum = 0 129 | state = env.reset() 130 | 131 | # Initial 4 frame data 132 | s_t = np.array([state, state, state, state]) 133 | 134 | while True: 135 | # Append the observations to our batch 136 | x = np.reshape(s_t, [1, input_size]) 137 | 138 | # Run the neural net to determine output 139 | action_prob = sess.run(action_pred, feed_dict={X: x}) 140 | action_prob = np.squeeze(action_prob) # shape (?, 2) -> 2 141 | random_noise = np.random.uniform(0, 1, output_size) 142 | action = np.argmax(action_prob + random_noise) 143 | print("Action prediction: ", np.argmax(action_prob), " action taken:", action, 144 | np.argmax(action_prob) == action) 145 | 146 | # Append the observations and outputs for learning 147 | xs = np.vstack([xs, x]) 148 | y = np.eye(output_size)[action:action + 1] # One hot encoding 149 | ys = np.vstack([ys, y]) 150 | 151 | # Determine the outcome of our action 152 | state, reward, done, _ = env.step(action) 153 | 154 | s_t = np.array([state, s_t[0], s_t[1], s_t[2]]) 155 | reward_sum += reward 156 | rewards = np.vstack([rewards, reward]) 157 | 158 | if done: 159 | # Determine standardized rewards 160 | rewards = discount_rewards(rewards) 161 | reward_mean = np.mean(rewards) 162 | ll, la, l, s, _ = sess.run([log_lik, log_like_adv, loss, summary, train], 163 | feed_dict={X: xs, 164 | Y: ys, 165 | advantages: rewards, 166 | summary_reward: reward_mean}) 167 | #print(ll, la) 168 | writer.add_summary(s, global_step) 169 | break 170 | 171 | # Print status 172 | print("Average reward for episode {}: {}. 
Loss: {}".format( 173 | global_step, reward_sum, l)) 174 | 175 | if global_step % 100 == 0: 176 | print("Saving network...") 177 | if not os.path.exists(LOG_DIR): 178 | os.makedirs(LOG_DIR) 179 | saver.save(sess, os.path.join(LOG_DIR, "model.ckpt"), 180 | global_step=global_step) 181 | -------------------------------------------------------------------------------- /08_4_softmax_pg_pong.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on: 3 | https://github.com/hunkim/DeepRL-Agents 4 | http://karpathy.github.io/2016/05/31/rl/ 5 | ''' 6 | import numpy as np 7 | import tensorflow as tf 8 | import gym 9 | import os 10 | 11 | env = gym.make("Pong-v0") 12 | 13 | gamma = .99 14 | 15 | SUMMARY_DIR = './tensorboard/pong' 16 | CHECK_POINT_DIR = SUMMARY_DIR 17 | 18 | # Constants defining our neural network 19 | input_size = 80 * 80 * 4 20 | action_space = env.action_space.n 21 | print("Pong Action space", action_space) 22 | 23 | with tf.name_scope("cnn"): 24 | X = tf.placeholder(tf.float32, [None, input_size], name="input_x") 25 | x_image = tf.reshape(X, [-1, 80, 80, 4]) 26 | tf.summary.image('input', x_image, 8) 27 | 28 | # Build a convolutional layer random initialization 29 | W_conv1 = tf.get_variable("W_conv1", shape = [5, 5, 4, 32], initializer=tf.contrib.layers.xavier_initializer()) 30 | # W is [row, col, channel, feature] 31 | b_conv1 = tf.Variable(tf.zeros([32]), name="b_conv1") 32 | h_conv1 = tf.nn.relu(tf.nn.conv2d(x_image, W_conv1, strides=[1, 2, 2, 1], padding='VALID') + b_conv1, name="h_conv1") 33 | 34 | W_conv2 = tf.get_variable("W_conv2", shape = [5, 5, 32, 64], initializer=tf.contrib.layers.xavier_initializer()) 35 | b_conv2 = tf.Variable(tf.zeros([64]), name="b_conv2") 36 | h_conv2 = tf.nn.relu(tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding='VALID') + b_conv2, name="h_conv2") 37 | 38 | W_conv3 = tf.get_variable("W_conv3", shape = [5, 5, 64, 64], initializer=tf.contrib.layers.xavier_initializer()) 39 | b_conv3 = tf.Variable(tf.zeros([64]), name="b_conv3") 40 | h_conv3 = tf.nn.relu(tf.nn.conv2d(h_conv2, W_conv3, strides=[1, 2, 2, 1], padding='VALID') + b_conv3, name="h_conv3") 41 | 42 | # Build a fully connected layer with softmax 43 | h_conv3_flat = tf.reshape(h_conv3, [-1, 7*7*64], name="h_pool2_flat") 44 | W_fc1 = tf.get_variable("W_fc1", shape = [7*7*64, action_space], initializer=tf.contrib.layers.xavier_initializer()) 45 | b_fc1 = tf.Variable(tf.zeros([action_space]), name = 'b_fc1') 46 | action_pred = tf.nn.softmax(tf.matmul(h_conv3_flat, W_fc1) + b_fc1, name="h_fc1") 47 | 48 | tf.summary.histogram("action_pred", action_pred) 49 | 50 | # We need to define the parts of the network needed for learning a policy 51 | Y = tf.placeholder(tf.float32, [None, action_space], name="input_y") 52 | advantages = tf.placeholder(tf.float32, [None, 1], name="reward_signal") 53 | 54 | # Loss function 55 | # Sum (Ai*logp(yi|xi)) 56 | log_lik = -Y * (tf.log(tf.clip_by_value(action_pred, 1e-10 , 1.0))) 57 | loss = tf.reduce_mean(tf.reduce_sum(log_lik * advantages, axis=1)) 58 | tf.summary.scalar("A_pred", tf.reduce_mean(action_pred)) 59 | tf.summary.scalar("Y", tf.reduce_mean(Y)) 60 | tf.summary.scalar("log_likelihood", tf.reduce_mean(log_lik)) 61 | tf.summary.scalar("loss", loss) 62 | 63 | # Learning 64 | train = tf.train.AdamOptimizer().minimize(loss) 65 | 66 | # Some place holders for summary 67 | summary_reward = tf.placeholder(tf.float32, shape=(), name="reward") 68 | tf.summary.scalar("reward", summary_reward) 69 
| 70 | # Summary 71 | summary = tf.summary.merge_all() 72 | 73 | 74 | def discount_rewards(r, gamma=0.99): 75 | """ take 1D float array of rewards and compute discounted reward 76 | http://karpathy.github.io/2016/05/31/rl/ """ 77 | discounted_r = np.zeros_like(r) 78 | running_add = 0 79 | for t in reversed(range(0, r.size)): 80 | if r[t] != 0: 81 | # reset the sum, since this was a game boundary (pong specific!) 82 | running_add = 0 83 | running_add = running_add * gamma + r[t] 84 | discounted_r[t] = running_add 85 | 86 | # compute the discounted reward backwards through time 87 | # standardize the rewards to be unit normal (helps control the gradient 88 | # estimator variance) 89 | discounted_r -= np.mean(discounted_r) 90 | discounted_r /= np.std(discounted_r) 91 | return discounted_r 92 | 93 | 94 | def prepro(I): 95 | """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector 96 | http://karpathy.github.io/2016/05/31/rl/ """ 97 | I = I[35:195] # crop 98 | I = I[::2, ::2, 0] # downsample by factor of 2 99 | I[I == 144] = 0 # erase background (background type 1) 100 | I[I == 109] = 0 # erase background (background type 2) 101 | I[I != 0] = 1 # everything else (paddles, ball) just set to 1 102 | return I.astype(np.float).ravel() 103 | 104 | 105 | # Setting up our environment 106 | sess = tf.Session() 107 | sess.run(tf.global_variables_initializer()) 108 | writer = tf.summary.FileWriter(SUMMARY_DIR) 109 | writer.add_graph(sess.graph) 110 | 111 | # Savor and Restore 112 | saver = tf.train.Saver() 113 | checkpoint = tf.train.get_checkpoint_state(CHECK_POINT_DIR) 114 | if checkpoint and checkpoint.model_checkpoint_path: 115 | try: 116 | saver.restore(sess, checkpoint.model_checkpoint_path) 117 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 118 | except: 119 | print("Error on loading old network weights") 120 | else: 121 | print("Could not find old network weights") 122 | 123 | global_step = 0 124 | while True: 125 | global_step += 1 126 | 127 | xs_list = [] 128 | ys_list = [] 129 | rewards = np.empty(0).reshape(0, 1) 130 | ep_rewards_list = [] 131 | 132 | reward_sum = 0 133 | state = env.reset() 134 | state = prepro(state) 135 | 136 | # Initial 4 frame data 137 | s_t = np.array([state, state, state, state]) 138 | 139 | while True: 140 | # Append the observations to our batch 141 | x = np.reshape(s_t, [1, input_size]) 142 | 143 | # Run the neural net to determine output 144 | action_prob = sess.run(action_pred, feed_dict={X: x}) 145 | action_prob = np.squeeze(action_prob) # shape (?, n) -> n 146 | action = np.random.choice(action_space, size=1, p=action_prob)[0] 147 | 148 | #random_noise = np.random.uniform(0, 1, output_size) 149 | #action = np.argmax(action_prob + random_noise) 150 | # print("Action prediction: ", np.argmax(action_prob), " action taken:", action, 151 | # np.argmax(action_prob) == action) 152 | 153 | # Append the observations and outputs for learning 154 | xs_list.append(x) 155 | y = np.eye(action_space)[action:action + 1] # One hot encoding 156 | ys_list.append(y) 157 | 158 | state, reward, done, _ = env.step(action) 159 | # env.render() 160 | state = prepro(state) 161 | s_t = np.array([state, s_t[0], s_t[1], s_t[2]]) # s_t[4] out! 
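        # s_t stacks the 4 most recent preprocessed frames, newest first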
162 | reward_sum += reward 163 | 164 | ep_rewards_list.append(reward) 165 | 166 | # Discount rewards on every single game 167 | if reward == 1 or reward == -1: 168 | ep_rewards = np.vstack(ep_rewards_list) 169 | discounted_rewards = discount_rewards(ep_rewards, gamma) 170 | rewards = np.vstack([rewards, discounted_rewards]) 171 | ep_rewards_list = [] 172 | # print(ep_rewards, discounted_rewards) 173 | print("Ep reward {}".format(reward)) 174 | if done: 175 | xs = np.vstack(xs_list) 176 | ys = np.vstack(ys_list) 177 | 178 | l, s, _ = sess.run([loss, summary, train], 179 | feed_dict={X: xs, 180 | Y: ys, 181 | advantages: rewards, 182 | summary_reward: reward_sum}) 183 | writer.add_summary(s, global_step) 184 | break 185 | 186 | # Print status 187 | print("Average reward for episode {}: {}. Loss: {}".format( 188 | global_step, reward_sum, l)) 189 | 190 | if global_step % 100 == 0: 191 | print("Saving network...") 192 | if not os.path.exists(CHECK_POINT_DIR): 193 | os.makedirs(CHECK_POINT_DIR) 194 | saver.save(sess, CHECK_POINT_DIR + "/pong", global_step=global_step) 195 | -------------------------------------------------------------------------------- /08_4_softmax_pg_pong_y.py: -------------------------------------------------------------------------------- 1 | """ 2 | Yet Another Pong 3 | 4 | Vanilla Policy Gradient implementation 5 | 6 | (1) Pong's state is (210, 160, 3) 7 | (2) After `pipeline(image)`, it becomes (80, 80, 1) 8 | (3) The model uses an input of `state_diff` = `new_state` - `old_state` 9 | (4) It assumes there exists 2 actions. 10 | 11 | Pong's original action space is the following: 12 | 0, 1 : do nothing 13 | 2, 4 : move up 14 | 3, 5 : move down 15 | 16 | In this file, it uses {2: move up, 3: move down} only 17 | 18 | It gets rid of unnecessary complexity. 19 | """ 20 | import gym 21 | import matplotlib.pyplot as plt 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from functools import partial 26 | from scipy.misc import imresize 27 | 28 | import os 29 | 30 | 31 | def plot_image(image): 32 | """Plot an image 33 | 34 | If an image is a grayscale image, 35 | plot in `gray` cmap. 36 | Otherwise, regular RGB plot. 
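    For example, an (80, 80) input is drawn with the gray colormap, while an (H, W, 3) input is drawn as a regular RGB image.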
37 | 38 | Args: 39 | image (2-D or 3-D array): (H, W) or (H, W, C) 40 | """ 41 | image = np.squeeze(image) 42 | shape = image.shape 43 | 44 | if len(shape) == 2: 45 | plt.imshow(image, cmap="gray") 46 | 47 | else: 48 | plt.imshow(image) 49 | 50 | plt.show() 51 | 52 | 53 | def pipeline(image, new_HW, height_range=(35, 193), bg=(144, 72, 17)): 54 | """Returns a preprocessed image 55 | 56 | (1) Crop image (top and bottom) 57 | (2) Remove background & grayscale 58 | (3) Reszie to smaller image 59 | 60 | Args: 61 | image (3-D array): (H, W, C) 62 | new_HW (tuple): New image size (height, width) 63 | height_range (tuple): Height range (H_begin, H_end) else cropped 64 | bg (tuple): Background RGB Color (R, G, B) 65 | 66 | Returns: 67 | image (3-D array): (H, W, 1) 68 | """ 69 | image = crop_image(image, height_range) 70 | image = resize_image(image, new_HW) 71 | image = kill_background_grayscale(image, bg) 72 | 73 | image = np.expand_dims(image, axis=2) 74 | return image 75 | 76 | 77 | def resize_image(image, new_HW): 78 | """Returns a resized image 79 | 80 | Args: 81 | image (3-D array): Numpy array (H, W, C) 82 | new_HW (tuple): Target size (height, width) 83 | 84 | Returns: 85 | image (3-D array): Resized image (height, width, C) 86 | """ 87 | return imresize(image, new_HW, interp="nearest") 88 | 89 | 90 | def crop_image(image, height_range=(35, 195)): 91 | """Crops top and bottom 92 | 93 | Args: 94 | image (3-D array): Numpy image (H, W, C) 95 | height_range (tuple): Height range between (min_height, max_height) 96 | will be kept 97 | 98 | Returns: 99 | image (3-D array): Numpy image (max_H - min_H, W, C) 100 | """ 101 | h_beg, h_end = height_range 102 | return image[h_beg:h_end, ...] 103 | 104 | 105 | def kill_background_grayscale(image, bg): 106 | """Make the background 0 107 | 108 | Args: 109 | image (3-D array): Numpy array (H, W, C) 110 | bg (tuple): RGB code of background (R, G, B) 111 | 112 | Returns: 113 | image (2-D array): Binarized image of shape (H, W) 114 | The background is 0 and everything else is 1 115 | """ 116 | H, W, _ = image.shape 117 | 118 | R = image[..., 0] 119 | G = image[..., 1] 120 | B = image[..., 2] 121 | 122 | cond = (R == bg[0]) & (G == bg[1]) & (B == bg[2]) 123 | 124 | image = np.zeros((H, W)) 125 | image[~cond] = 1 126 | 127 | return image 128 | 129 | 130 | class Agent(object): 131 | 132 | def __init__(self, input_dim, output_dim, logdir="logdir", checkpoint_dir="checkpoints"): 133 | """Agent class 134 | 135 | Args: 136 | input_dim (tuple): The input shape (H, W, C) 137 | output_dim (int): Number of actions 138 | logdir (str): Directory to save `summary` 139 | checkpoint_dir (str): Directory to save `model.ckpt` 140 | 141 | Notes: 142 | 143 | It has two methods. 
144 | 145 | `choose_action(state)` 146 | Will return an action given the state 147 | 148 | `train(state, action, reward)` 149 | Will train on given `states`, `actions`, `rewards` 150 | 151 | Private methods has two underscore prefixes 152 | """ 153 | self.input_dim = list(input_dim) 154 | self.output_dim = output_dim 155 | self.gamma = 0.99 156 | self.entropy_coefficient = 0.01 157 | self.RMSPropdecay = 0.99 158 | self.learning_rate = 0.001 159 | 160 | self.checkpoint_dir = checkpoint_dir 161 | self.__build_network(self.input_dim, self.output_dim) 162 | 163 | if logdir is not None: 164 | self.__build_summary_op(logdir) 165 | else: 166 | self.summary_op = None 167 | 168 | if checkpoint_dir is not None: 169 | self.saver = tf.train.Saver() 170 | 171 | maybe_path = os.path.join(self.checkpoint_dir, "model.ckpt") 172 | if os.path.exists(self.checkpoint_dir) and tf.train.checkpoint_exists(maybe_path): 173 | print("Restored {}".format(maybe_path)) 174 | sess = tf.get_default_session() 175 | self.saver.restore(sess, maybe_path) 176 | 177 | else: 178 | print("No model is found") 179 | os.makedirs(checkpoint_dir, exist_ok=True) 180 | 181 | def __build_network(self, input_dim, output_dim): 182 | 183 | self.global_step = tf.train.get_or_create_global_step() 184 | 185 | self.X = tf.placeholder(tf.float32, shape=[None, *input_dim], name='state') 186 | self.action = tf.placeholder(tf.uint8, shape=[None], name="action") 187 | action_onehot = tf.one_hot(self.action, output_dim, name="action_onehot") 188 | self.reward = tf.placeholder(tf.float32, shape=[None], name="reward") 189 | 190 | net = self.X 191 | 192 | with tf.variable_scope("layer1"): 193 | net = tf.layers.conv2d(net, 194 | filters=16, 195 | kernel_size=(8, 8), 196 | strides=(4, 4), 197 | name="conv") 198 | net = tf.nn.relu(net, name="relu") 199 | 200 | with tf.variable_scope("layer2"): 201 | net = tf.layers.conv2d(net, 202 | filters=32, 203 | kernel_size=(4, 4), 204 | strides=(2, 2), 205 | name="conv") 206 | net = tf.nn.relu(net, name="relu") 207 | 208 | with tf.variable_scope("fc1"): 209 | net = tf.contrib.layers.flatten(net) 210 | net = tf.layers.dense(net, 256, name='dense') 211 | net = tf.nn.relu(net, name='relu') 212 | 213 | with tf.variable_scope("fc2"): 214 | net = tf.layers.dense(net, output_dim, name='dense') 215 | 216 | self.action_prob = tf.nn.softmax(net, name="action_prob") 217 | 218 | log_action_prob = tf.reduce_sum(self.action_prob * action_onehot, axis=1) 219 | log_action_prob = tf.log(log_action_prob + 1e-7) 220 | 221 | entropy = - self.action_prob * tf.log(self.action_prob + 1e-7) 222 | self.entropy = tf.reduce_sum(entropy, axis=1) 223 | 224 | loss = -log_action_prob * self.reward - self.entropy * self.entropy_coefficient 225 | self.loss = tf.reduce_mean(loss) 226 | 227 | optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate, 228 | decay=self.RMSPropdecay) 229 | 230 | self.train_op = optimizer.minimize(loss, 231 | global_step=self.global_step) 232 | 233 | def __build_summary_op(self, logdir): 234 | tf.summary.histogram("Action Probability Histogram", self.action_prob) 235 | tf.summary.histogram("Entropy", self.entropy) 236 | tf.summary.scalar("Loss", self.loss) 237 | tf.summary.scalar("Mean Reward", tf.reduce_mean(self.reward)) 238 | 239 | self.summary_op = tf.summary.merge_all() 240 | self.summary_writer = tf.summary.FileWriter(logdir, tf.get_default_graph()) 241 | 242 | def choose_action(self, S): 243 | shape = S.shape 244 | 245 | if len(shape) == 3: 246 | S = np.expand_dims(S, axis=0) 247 | 248 | 
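        # Added note on shapes: a single state (H, W, C) is given a batch axis
        # above, so self.X is always fed a 4-D tensor (1, H, W, C).  The
        # softmax output below has shape (1, output_dim); after np.squeeze it
        # is a 1-D probability vector, and the sampled index is shifted by +2
        # so the returned value is a real Pong action in {2: move up,
        # 3: move down}.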
np.testing.assert_equal(S.shape[1:], self.input_dim) 249 | 250 | sess = tf.get_default_session() 251 | action_prob = sess.run(self.action_prob, 252 | feed_dict={self.X: S}) 253 | action_prob = np.squeeze(action_prob) 254 | return np.random.choice(np.arange(self.output_dim) + 2, p=action_prob) 255 | 256 | def train(self, S, A, R): 257 | S = np.array(S) 258 | A = np.array(A) 259 | R = np.array(R) 260 | np.testing.assert_equal(S.shape[1:], self.input_dim) 261 | assert len(A.shape) == 1, "A.shape = {}".format(A.shape) 262 | assert len(R.shape) == 1, "R.shape = {}".format(R.shape) 263 | 264 | R = discount_reward(R, gamma=self.gamma) 265 | R -= np.mean(R) 266 | R /= np.std(R) + 1e-7 267 | 268 | A = A - 2 269 | 270 | sess = tf.get_default_session() 271 | 272 | _, summary_op, global_step_value = sess.run([self.train_op, 273 | self.summary_op, 274 | self.global_step], 275 | feed_dict={self.X: S, 276 | self.action: A, 277 | self.reward: R}) 278 | 279 | if self.summary_op is not None: 280 | self.summary_writer.add_summary(summary_op, global_step_value) 281 | 282 | def save(self): 283 | sess = tf.get_default_session() 284 | path = os.path.join(self.checkpoint_dir, "model.ckpt") 285 | self.saver.save(sess, path) 286 | 287 | 288 | def discount_reward(rewards, gamma=0.99): 289 | """Returns discounted rewards 290 | 291 | Args: 292 | rewards (1-D array): Reward array 293 | gamma (float): Discounted rate 294 | 295 | Returns: 296 | discounted_rewards: same shape as `rewards` 297 | 298 | Notes: 299 | In Pong, when the reward can be {-1, 0, 1}. 300 | 301 | However, when the reward is either -1 or 1, 302 | it means the game has been reset. 303 | 304 | Therefore, it's necessaray to reset `running_add` to 0 305 | whenever the reward is nonzero 306 | """ 307 | discounted_r = np.zeros_like(rewards, dtype=np.float32) 308 | running_add = 0 309 | for t in reversed(range(len(rewards))): 310 | if rewards[t] != 0: 311 | running_add = 0 312 | running_add = running_add * gamma + rewards[t] 313 | discounted_r[t] = running_add 314 | 315 | return discounted_r 316 | 317 | 318 | def run_episode(env, agent, pipeline): 319 | """Runs one episode and returns a total reward 320 | 321 | Args: 322 | env (gym.env): Gym Environment 323 | agent (Agent): Agent Player 324 | pipeline (function): Preprocessing function. 325 | processed_image = pipeline(image) 326 | 327 | Returns: 328 | total_reward (int): Total reward earned in an episode. 
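    Notes:
        The network input is the difference between consecutive preprocessed
        frames (`state_diff`), not a raw frame.  A parameter update is run
        after roughly every 10 scored points (|reward| == 1) and again when
        the episode ends.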
329 | """ 330 | states = [] 331 | actions = [] 332 | rewards = [] 333 | 334 | old_s = env.reset() 335 | old_s = pipeline(old_s) 336 | 337 | done = False 338 | total_reward = 0 339 | step_counter = 0 340 | 341 | state_diff = old_s 342 | 343 | while not done: 344 | 345 | action = agent.choose_action(state_diff) 346 | new_s, r, done, info = env.step(action) 347 | total_reward += r 348 | 349 | states.append(state_diff) 350 | actions.append(action) 351 | rewards.append(r) 352 | 353 | new_s = pipeline(new_s) 354 | state_diff = new_s - old_s 355 | old_s = new_s 356 | 357 | if r == -1 or r == 1 or done: 358 | step_counter += 1 359 | 360 | if step_counter > 10 or done: 361 | step_counter = 0 362 | # Agent expects numpy array 363 | agent.train(states, actions, rewards) 364 | 365 | states, actions, rewards = [], [], [] 366 | 367 | return total_reward 368 | 369 | 370 | def main(): 371 | try: 372 | env = gym.make("Pong-v0") 373 | env = gym.wrappers.Monitor(env, "monitor", force=True) 374 | action_dim = 2 375 | 376 | tf.reset_default_graph() 377 | sess = tf.InteractiveSession() 378 | 379 | new_HW = [80, 80] 380 | repeat = 1 381 | pipeline_fn = partial(pipeline, new_HW=new_HW, height_range=(35, 195), bg=(144, 72, 17)) 382 | 383 | agent = Agent(new_HW + [repeat], 384 | output_dim=action_dim, 385 | logdir='logdir/train', 386 | checkpoint_dir="checkpoints") 387 | 388 | init = tf.global_variables_initializer() 389 | sess.run(init) 390 | 391 | episode = 1 392 | 393 | while True: 394 | episode_reward = run_episode(env, agent, pipeline_fn) 395 | print(episode, episode_reward) 396 | 397 | episode += 1 398 | 399 | finally: 400 | agent.save() 401 | 402 | env.close() 403 | sess.close() 404 | 405 | 406 | def debug_mode(): 407 | pipeline_fn = partial(pipeline, new_HW=(50, 50), height_range=(35, 195), bg=(144, 72, 17)) 408 | try: 409 | 410 | env = gym.make("Pong-v0") 411 | env.reset() 412 | 413 | for _ in range(50): 414 | 415 | s = env.step(env.action_space.sample())[0] 416 | 417 | plot_image(np.squeeze(pipeline_fn(s))) 418 | 419 | finally: 420 | 421 | env.close() 422 | 423 | 424 | if __name__ == '__main__': 425 | main() 426 | # debug_mode() 427 | -------------------------------------------------------------------------------- /09_2_cross_entropy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cross Entropy Method 3 | 4 | Cross Entropy Method is a simple and efficient method 5 | for solving a variety of estimation and optimization problems. 
6 | 7 | Psuedocode 8 | 9 | initialize mu, sd 10 | while not done: 11 | collect N samples of theta ~ N(mu, diag(sd)) 12 | perform one episode with each theta 13 | select top performing samples, called elite set 14 | obtain a new mu and sd 15 | end 16 | 17 | """ 18 | import numpy as np 19 | import gym 20 | 21 | 22 | env = gym.make("CartPole-v0") 23 | 24 | INPUT_SIZE = env.observation_space.shape[0] 25 | OUTPUT_SIZE = env.action_space.n 26 | 27 | 28 | def get_W_b(theta): 29 | """Get W and b 30 | 31 | Parameters 32 | ---------- 33 | theta : 1-d array 34 | Flatten theta 35 | 36 | Returns 37 | ---------- 38 | W : 2-d array 39 | b : 1-d array 40 | 41 | Examples 42 | ---------- 43 | >>> theta = np.random.randn(5) 44 | >>> W, b = get_W_b(theta) 45 | """ 46 | idx = INPUT_SIZE * OUTPUT_SIZE 47 | W = theta[:idx].reshape(INPUT_SIZE, OUTPUT_SIZE) 48 | b = theta[idx:].reshape(OUTPUT_SIZE) 49 | 50 | return W, b 51 | 52 | 53 | def choose_action(s, W, b): 54 | """Return an action (argmax) 55 | 56 | Parameters 57 | ---------- 58 | s : ndarray 59 | Observation (input_dim, ) 60 | 61 | W : ndarray, (input_dim, number_of_actions) 62 | b : ndarray, (number_of_actions) 63 | 64 | Returns 65 | ---------- 66 | action: int 67 | action index 68 | 69 | Examples 70 | ---------- 71 | >>> s = env.reset() 72 | >>> W, b = get_W_b(theta) 73 | >>> action = choose_action(s, W, b) 74 | """ 75 | 76 | action = np.dot(s, W) + b 77 | return np.argmax(action) 78 | 79 | 80 | def run_episode(env, theta, render=False): 81 | """ Run a single episode with theta 82 | 83 | Parameters 84 | ---------- 85 | env : gym environment 86 | theta : 1-d array 87 | render : bool, optional 88 | 89 | Returns 90 | ---------- 91 | reward : float 92 | Episode reward 93 | 94 | Examples 95 | ---------- 96 | >>> env = gym.make('CartPole-v0') 97 | >>> reward = run_episode(env, theta) 98 | """ 99 | W, b = get_W_b(theta) 100 | s = env.reset() 101 | done = False 102 | 103 | reward = 0 104 | 105 | while not done: 106 | if render: 107 | env.render() 108 | 109 | a = choose_action(s, W, b) 110 | s2, r, done, info = env.step(a) 111 | reward += r 112 | s = s2 113 | 114 | return reward 115 | 116 | 117 | def make_theta(theta_mean, theta_sd): 118 | """ Make a theta parameters with mean and sd 119 | 120 | Parameters 121 | ---------- 122 | theta_mean : ndarray 123 | A n-d array of means 124 | 125 | theta_sd : nd array 126 | A n-d array of standard deviations 127 | 128 | Returns 129 | ---------- 130 | theta : n-d array 131 | Shape (n, ) 132 | 133 | Examples 134 | ---------- 135 | >>> DIM = INPUT_SIZE * OUTPUT_SIZE + OUTPUT_SIZE 136 | >>> mu = np.zeros(DIM) 137 | >>> sd = np.ones(SD) 138 | >>> theta = make_theta(mu, sd) 139 | 140 | """ 141 | return np.random.multivariate_normal(mean=theta_mean, cov=np.diag(theta_sd),) 142 | 143 | 144 | def main(): 145 | """ Every magic happens here """ 146 | global env, INPUT_SIZE, OUTPUT_SIZE 147 | 148 | # Number of samples 149 | N = 32 150 | # Size of theta 151 | DIM = INPUT_SIZE * OUTPUT_SIZE + OUTPUT_SIZE 152 | 153 | # Initialize parameters 154 | theta_mean = np.zeros(DIM) 155 | theta_sd = np.ones(DIM) 156 | 157 | # Loop until clear the game 158 | # make population with mean & sd 159 | # choose elite groups 160 | # obtain new mean & sd 161 | for _ in range(100): 162 | population = [make_theta(theta_mean, theta_sd) for _ in range(N)] 163 | reward = [run_episode(env, p) for p in population] 164 | 165 | sorted_idx = np.argsort(reward)[-int(N * 0.20):] 166 | 167 | elite_population = [population[idx] for idx in sorted_idx] 168 | elite_reward = 
[reward[idx] for idx in sorted_idx] 169 | 170 | theta_mean = np.mean(elite_population, axis=0) 171 | theta_sd = np.std(elite_population, axis=0) 172 | 173 | avg_reward = np.mean(elite_reward) 174 | print("Reward: {}".format(avg_reward)) 175 | 176 | if avg_reward == 200: 177 | print("Game Cleared") 178 | break 179 | 180 | env = gym.wrappers.Monitor(env, "gym-results/", force=True) 181 | best_parm = elite_population[-1] 182 | 183 | for i in range(100): 184 | reward = run_episode(env, best_parm) 185 | print(reward) 186 | 187 | 188 | if __name__ == '__main__': 189 | main() 190 | -------------------------------------------------------------------------------- /10_1_Actor_Critic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# What is an actor critic network?\n", 11 | "\n", 12 | "- So far we have learned a single output network that produces Q-values(Value Iteration) or an action policy (Policy Iteration)\n", 13 | "- What if we can use both value functions and policy functions? That's how actor-critic methods were developed. It turns out if we use both, we can learn more complex systems. In this notebook, we will a simple policy gradient actor-critic methods\n", 14 | "\n", 15 | "# Structure of Actor Critic Networks\n", 16 | "- There are two networks: an actor network and a critic network\n", 17 | "![actor-critic](assets/actor_critic.png)\n", 18 | "- Actor network:\n", 19 | " * This network chooses an action!\n", 20 | " * It takes an input of game state and produce outputs an action policy (as in policy-gradient)\n", 21 | "\n", 22 | "- Critic network:\n", 23 | " * This network is simply a value network\n", 24 | " * It takes the same input as the actor network and produces a current state value" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "deletable": true, 31 | "editable": true 32 | }, 33 | "source": [ 34 | "# From pervious lectures\n", 35 | "\n", 36 | "* We used policy gradient methods that is to find a policy $\\pi$ that\n", 37 | "$$\n", 38 | "\\text{maximize } E\\left[\\ R\\ \\mid\\ \\pi\\ \\right]\n", 39 | "$$\n", 40 | "\n", 41 | "$$\\text{where }R = r_0 + r_1 + \\dots + r_{\\tau - 1}$$\n", 42 | "\n", 43 | "* We use an gradient estimator that is\n", 44 | "$$\n", 45 | "\\hat{g} = \\nabla_\\theta \\log \\pi \\left(a_t \\mid s_t; \\theta \\right) \\cdot R_t \n", 46 | "$$\n", 47 | "\n", 48 | "$$\\text{where }R_t = \\sum_{t'=t}^{T-1} \\left(discount\\ rate\\right)^{t'-t} \\cdot (reward)_t$$" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "deletable": true, 55 | "editable": true 56 | }, 57 | "source": [ 58 | "* The above gradient estimator simply means we boost the probability of an action that returns high rewards\n", 59 | "\n", 60 | "# Problems\n", 61 | "\n", 62 | "* The above method is however not stable because a step size of the gradients can be very large and once we overshoot, our agent will collect trajectories based on a bad policy" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": { 68 | "deletable": true, 69 | "editable": true 70 | }, 71 | "source": [ 72 | "# Solution\n", 73 | "* In order to solve high variance problems, we will use $A_t$ instead of $R_t$ and this is called an advantage function\n", 74 | "* What is an advantage function? We know a Q function and a Value function. 
\n", 75 | " * The $Q$ maps a state $s$ to an action $a$ value which is how good action $a$ is\n", 76 | " * The $V$ maps a state $s$ to a value that shows how good an input state $s$ is\n", 77 | " \n", 78 | "* Therefore, we can write two functions as following:\n", 79 | "$$ Q(s, a) = V(s) + A(a) $$\n", 80 | "\n", 81 | "* Therefore, \n", 82 | "\n", 83 | "$$ A(a) = Q(s, a) - V(s) $$\n", 84 | "\n", 85 | "* That's the definition of an advatage function. We are trying to find how good action $a$ is by subtracting a value function\n", 86 | "\n", 87 | "* Hence, we need to change the gradient estimator $\\hat{g}$ to the following\n", 88 | "$$\\hat{g} = \\nabla_\\theta \\log \\pi(a_t | s_t; \\theta) \\cdot A_t $$\n", 89 | "\n", 90 | "where\n", 91 | "\n", 92 | "\\begin{align*}\n", 93 | "A_t &= Q(s_t, a') - V(s_t) \\\\\n", 94 | " &= R_{t} - V(s_t)\n", 95 | "\\end{align*}\n", 96 | "\n", 97 | "# Notes\n", 98 | "- Its performance is still not great because it has a few flaws\n", 99 | " - We have to learn $V(s)$ first and learning $V(s)$ can be very difficult (requires careful reward enginneering)\n", 100 | " - Every trajectories is highly correlated\n", 101 | "- In order to deal with these problems, we will later discuss various methods such as TRPO(Trust Region Policy Optimization) or A3C(Asynchronous Actor Critic Networks)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 1, 107 | "metadata": { 108 | "collapsed": true, 109 | "deletable": true, 110 | "editable": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "import numpy as np\n", 115 | "import gym\n", 116 | "import tensorflow as tf" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 2, 122 | "metadata": { 123 | "collapsed": true, 124 | "deletable": true, 125 | "editable": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "slim = tf.contrib.slim" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 3, 135 | "metadata": { 136 | "collapsed": false, 137 | "deletable": true, 138 | "editable": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "class ActorCriticNetwork:\n", 143 | " \"\"\" Actor Critic Network\n", 144 | " \n", 145 | " - 3 placeholders for policy\n", 146 | " - S : state (shared)\n", 147 | " - A : action one hot\n", 148 | " - ADV : advantage value\n", 149 | " \n", 150 | " - 2 placeholders for value\n", 151 | " - S : state (shared)\n", 152 | " - R : reward\n", 153 | " \n", 154 | " - 2 outputs\n", 155 | " - P : action policy, p(a | s)\n", 156 | " - V : V(s)\n", 157 | " \n", 158 | " Examples\n", 159 | " ----------\n", 160 | " >>> input_shape = [None, 4]\n", 161 | " >>> action_n = 2\n", 162 | " >>> hidden_dims = [32, 32]\n", 163 | " >>> ac_network = ActorCriticNetwork(input_shape, action_n, hidden_dims)\n", 164 | " \"\"\"\n", 165 | " def __init__(self, input_shape, action_n, hidden_dims):\n", 166 | " # Policy Input\n", 167 | " self.S = tf.placeholder(tf.float32, shape=input_shape, name=\"state_input\")\n", 168 | " self.A = tf.placeholder(tf.float32, shape=[None, action_n], name=\"action_one_hot_input\")\n", 169 | " self.ADV = tf.placeholder(tf.float32, shape=[None], name=\"advantage_input\")\n", 170 | " \n", 171 | " # Value Input\n", 172 | " self.R = tf.placeholder(tf.float32, shape=[None], name=\"reward_input\")\n", 173 | " \n", 174 | " self._create_network(hidden_dims, action_n)\n", 175 | " \n", 176 | " def _create_network(self, hidden_dims, action_n):\n", 177 | " net = self.S\n", 178 | " \n", 179 | " for i, h_dim in 
enumerate(hidden_dims):\n", 180 | " net = slim.fully_connected(net, h_dim, activation_fn=None, scope=f\"fc-{i}\")\n", 181 | " net = tf.nn.relu(net)\n", 182 | " \n", 183 | " # Policy shape: [None, action_n]\n", 184 | " self.P = slim.fully_connected(net, action_n, activation_fn=tf.nn.softmax, scope=\"policy_output\")\n", 185 | "\n", 186 | " # Value shape: [None, 1] -> [None]\n", 187 | " _V = slim.fully_connected(net, 1, activation_fn=None, scope=\"value_output\")\n", 188 | " self.V = tf.squeeze(_V)\n", 189 | " \n", 190 | " self._create_op()\n", 191 | " \n", 192 | " def _create_op(self):\n", 193 | " # output shape: [None]\n", 194 | " policy_gain = tf.reduce_sum(self.P * self.A, 1)\n", 195 | "\n", 196 | " # output shape: [None]\n", 197 | " policy_gain = tf.log(policy_gain) * self.ADV\n", 198 | " policy_gain = tf.reduce_sum(policy_gain, name=\"policy_gain\")\n", 199 | "\n", 200 | " entropy = - tf.reduce_sum(self.P * tf.log(self.P), 1)\n", 201 | " entropy = tf.reduce_mean(entropy)\n", 202 | " \n", 203 | " value_loss = tf.losses.mean_squared_error(self.V, self.R, scope=\"value_loss\")\n", 204 | " \n", 205 | " # Becareful negative sign because we only can minimize\n", 206 | " # we want to maximize policy gain and entropy (for exploration)\n", 207 | " self.loss = - policy_gain + value_loss - entropy * 0.01\n", 208 | " self.optimizer = tf.train.AdamOptimizer()\n", 209 | " self.train_op = self.optimizer.minimize(self.loss)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 4, 215 | "metadata": { 216 | "collapsed": true, 217 | "deletable": true, 218 | "editable": true 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "class Agent:\n", 223 | " \"\"\" Agent class \"\"\"\n", 224 | " \n", 225 | " def __init__(self, env, network):\n", 226 | " \"\"\" Constructor\n", 227 | " \n", 228 | " Parameters\n", 229 | " ----------\n", 230 | " env\n", 231 | " Open ai gym environment \n", 232 | " network\n", 233 | " Actor Critic Network \n", 234 | " \"\"\"\n", 235 | " self.env = env\n", 236 | " self.model = network\n", 237 | " self.sess = tf.get_default_session()\n", 238 | " self.action_n = env.action_space.n\n", 239 | " \n", 240 | " \n", 241 | " def choose_an_action(self, state):\n", 242 | " \"\"\" Returns an action (int) \"\"\"\n", 243 | " \n", 244 | " feed = {\n", 245 | " self.model.S: state\n", 246 | " }\n", 247 | " \n", 248 | " action_prob = self.sess.run(self.model.P, feed_dict=feed)[0]\n", 249 | " \n", 250 | " return np.random.choice(np.arange(self.action_n), p=action_prob)\n", 251 | " \n", 252 | " def train(self, S, A, R):\n", 253 | " \"\"\" Train the actor critic networks\n", 254 | " \n", 255 | " (1) Compute discounted rewards R\n", 256 | " (2) Compute advantage values A = R - V\n", 257 | " (3) Perform gradients updates\n", 258 | " \n", 259 | " \"\"\"\n", 260 | " \n", 261 | " def discount_rewards(r, gamma=0.99):\n", 262 | " \"\"\" take 1D float array of rewards and compute discounted reward \"\"\"\n", 263 | " discounted_r = np.zeros_like(r, dtype=np.float32)\n", 264 | " running_add = 0\n", 265 | " \n", 266 | " for t in reversed(range(len(r))):\n", 267 | " running_add = running_add * gamma + r[t]\n", 268 | " discounted_r[t] = running_add\n", 269 | "\n", 270 | " return discounted_r\n", 271 | "\n", 272 | " # 1. Get discounted `R`s\n", 273 | " R = discount_rewards(R)\n", 274 | " \n", 275 | " # 2. Get `V`s\n", 276 | " feed = {\n", 277 | " self.model.S: S\n", 278 | " }\n", 279 | " V = self.sess.run(self.model.V, feed_dict=feed)\n", 280 | " \n", 281 | " # 3. 
Get Advantage values, A = R - V\n", 282 | " ADV = R - V \n", 283 | " ADV = (ADV - np.mean(ADV)) / (np.std(ADV) + 1e-8)\n", 284 | " \n", 285 | " # 4. Perform gradient descents\n", 286 | " feed = {\n", 287 | " self.model.S: S,\n", 288 | " self.model.A: A,\n", 289 | " self.model.ADV: ADV,\n", 290 | " self.model.R: R\n", 291 | " }\n", 292 | "\n", 293 | " self.sess.run(self.model.train_op, feed_dict=feed) " 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 5, 299 | "metadata": { 300 | "collapsed": false, 301 | "deletable": true, 302 | "editable": true 303 | }, 304 | "outputs": [ 305 | { 306 | "name": "stderr", 307 | "output_type": "stream", 308 | "text": [ 309 | "[2017-04-08 21:10:41,639] Making new env: CartPole-v0\n", 310 | "[2017-04-08 21:10:41,643] Clearing 26 monitor files from previous run (because force=True was provided)\n" 311 | ] 312 | }, 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "input_shape: [None, 4], action_n: 2\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "# Tensorflow Reset\n", 323 | "tf.reset_default_graph()\n", 324 | "sess = tf.InteractiveSession()\n", 325 | "\n", 326 | "# Gym Environment Setup\n", 327 | "env_name = \"CartPole-v0\"\n", 328 | "env = gym.make(env_name)\n", 329 | "env = gym.wrappers.Monitor(env, \"./gym-results/\", force=True)\n", 330 | "\n", 331 | "# Global parameters\n", 332 | "input_shape = [None, env.observation_space.shape[0]]\n", 333 | "action_n = env.action_space.n\n", 334 | "\n", 335 | "print(f\"input_shape: {input_shape}, action_n: {action_n}\")\n", 336 | "\n", 337 | "# Define A2C(Actor-Critic) and Agent\n", 338 | "ac_network = ActorCriticNetwork(input_shape, action_n, [32, 32])\n", 339 | "agent = Agent(env, ac_network)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 6, 345 | "metadata": { 346 | "collapsed": false, 347 | "deletable": true, 348 | "editable": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "def preprocess_state(state_list):\n", 353 | " \"\"\" Preprocess a state list\n", 354 | " \n", 355 | " Currently it's only used to reshape the value\n", 356 | " When a single state is given, its shape is 1-d array,\n", 357 | " which needs to be reshaped in 2-d array\n", 358 | " \"\"\"\n", 359 | " return np.reshape(state_list, [-1, *input_shape[1:]])\n", 360 | "\n", 361 | "def preprocess_action(action_list, n_actions):\n", 362 | " \"\"\"Action -> 1-hot \"\"\"\n", 363 | " N = len(action_list)\n", 364 | " one_hot = np.zeros(shape=(N, n_actions))\n", 365 | " one_hot[np.arange(N), action_list] = 1\n", 366 | " \n", 367 | " return one_hot\n", 368 | "\n", 369 | "# Test codes\n", 370 | "tmp = np.zeros((32, *input_shape[1:]))\n", 371 | "np.testing.assert_almost_equal(preprocess_state(tmp), np.zeros([32, *input_shape[1:]]))\n", 372 | "tmp = np.zeros(*input_shape[1:])\n", 373 | "np.testing.assert_almost_equal(preprocess_state(tmp), np.zeros([1, *input_shape[1:]]))\n", 374 | "tmp = [0, 1]\n", 375 | "np.testing.assert_almost_equal(preprocess_action(tmp, 2), np.eye(2))" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 7, 381 | "metadata": { 382 | "collapsed": false, 383 | "scrolled": false 384 | }, 385 | "outputs": [ 386 | { 387 | "name": "stderr", 388 | "output_type": "stream", 389 | "text": [ 390 | "[2017-04-08 21:10:42,119] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000000.mp4\n", 391 | "[2017-04-08 21:10:43,292] Starting new video recorder 
writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000001.mp4\n" 392 | ] 393 | }, 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "[Episode- 0] 16\r\n" 399 | ] 400 | }, 401 | { 402 | "name": "stderr", 403 | "output_type": "stream", 404 | "text": [ 405 | "[2017-04-08 21:10:43,998] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000008.mp4\n" 406 | ] 407 | }, 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "[Episode- 1] 16\r", 413 | "[Episode- 2] 22\r", 414 | "[Episode- 3] 10\r", 415 | "[Episode- 4] 45\r", 416 | "[Episode- 5] 17\r", 417 | "[Episode- 6] 16\r", 418 | "[Episode- 7] 13\r" 419 | ] 420 | }, 421 | { 422 | "name": "stderr", 423 | "output_type": "stream", 424 | "text": [ 425 | "[2017-04-08 21:10:45,072] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000027.mp4\n" 426 | ] 427 | }, 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "[Episode- 46] 22\r" 433 | ] 434 | }, 435 | { 436 | "name": "stderr", 437 | "output_type": "stream", 438 | "text": [ 439 | "[2017-04-08 21:10:46,212] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000064.mp4\n" 440 | ] 441 | }, 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "[Episode- 107] 10\r" 447 | ] 448 | }, 449 | { 450 | "name": "stderr", 451 | "output_type": "stream", 452 | "text": [ 453 | "[2017-04-08 21:10:47,241] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000125.mp4\n" 454 | ] 455 | }, 456 | { 457 | "name": "stdout", 458 | "output_type": "stream", 459 | "text": [ 460 | "[Episode- 209] 19\r" 461 | ] 462 | }, 463 | { 464 | "name": "stderr", 465 | "output_type": "stream", 466 | "text": [ 467 | "[2017-04-08 21:10:48,925] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000216.mp4\n" 468 | ] 469 | }, 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "[Episode- 337] 60\r" 475 | ] 476 | }, 477 | { 478 | "name": "stderr", 479 | "output_type": "stream", 480 | "text": [ 481 | "[2017-04-08 21:10:51,951] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000343.mp4\n" 482 | ] 483 | }, 484 | { 485 | "name": "stdout", 486 | "output_type": "stream", 487 | "text": [ 488 | "[Episode- 507] 31\r" 489 | ] 490 | }, 491 | { 492 | "name": "stderr", 493 | "output_type": "stream", 494 | "text": [ 495 | "[2017-04-08 21:11:00,967] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000512.mp4\n" 496 | ] 497 | }, 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "[Episode- 722] 104\r" 503 | ] 504 | }, 505 | { 506 | "name": "stderr", 507 | "output_type": "stream", 508 | "text": [ 509 | "[2017-04-08 21:11:09,900] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000729.mp4\n" 510 | ] 511 | }, 512 | { 513 | "name": "stdout", 514 | "output_type": "stream", 515 | "text": [ 516 | "[Episode- 993] 130\r" 517 | ] 518 | }, 519 | { 520 | "name": 
"stderr", 521 | "output_type": "stream", 522 | "text": [ 523 | "[2017-04-08 21:11:25,444] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video001000.mp4\n" 524 | ] 525 | }, 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "[Episode- 1000] 200\n", 531 | "[Episode- 1996] 26\r" 532 | ] 533 | }, 534 | { 535 | "name": "stderr", 536 | "output_type": "stream", 537 | "text": [ 538 | "[2017-04-08 21:12:26,066] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video002000.mp4\n" 539 | ] 540 | }, 541 | { 542 | "name": "stdout", 543 | "output_type": "stream", 544 | "text": [ 545 | "[Episode- 2000] 146\n", 546 | "[Episode- 2363] 200\n", 547 | "Game cleared in 2363, average rewards: 195.15\n" 548 | ] 549 | } 550 | ], 551 | "source": [ 552 | "init = tf.global_variables_initializer()\n", 553 | "sess.run(init)\n", 554 | "\n", 555 | "MAX_EPISODES = 5000\n", 556 | "\n", 557 | "# For checking if the game is cleared\n", 558 | "EPISODE_100_REWARDS = []\n", 559 | "CLEAR_REWARD = env.spec.reward_threshold\n", 560 | "CLEAR_REWARD = CLEAR_REWARD if CLEAR_REWARD else 9999\n", 561 | "\n", 562 | "for episode in range(MAX_EPISODES):\n", 563 | " s = env.reset() \n", 564 | " done = False\n", 565 | " \n", 566 | " s_list = []\n", 567 | " a_list = []\n", 568 | " r_list = []\n", 569 | " \n", 570 | " episode_r = 0\n", 571 | " \n", 572 | " while not done:\n", 573 | " \n", 574 | " s = preprocess_state(s)\n", 575 | " a = agent.choose_an_action(s)\n", 576 | "\n", 577 | " s2, r, done, info = env.step(a)\n", 578 | " \n", 579 | " s_list.append(s)\n", 580 | " a_list.append(a)\n", 581 | " r_list.append(r)\n", 582 | " \n", 583 | " s = s2\n", 584 | " \n", 585 | " episode_r += r\n", 586 | " \n", 587 | " a_list = preprocess_action(a_list, action_n)\n", 588 | " \n", 589 | " agent.train(np.vstack(s_list), a_list, r_list)\n", 590 | " \n", 591 | " print(f\"[Episode-{episode:>6}] {int(episode_r):>4}\", end=\"\\r\")\n", 592 | " \n", 593 | " # For line breaks\n", 594 | " if episode % (MAX_EPISODES // 5) == 0:\n", 595 | " print()\n", 596 | " \n", 597 | " EPISODE_100_REWARDS.append(episode_r)\n", 598 | " \n", 599 | " # Check if the game is cleared\n", 600 | " if len(EPISODE_100_REWARDS) > 100:\n", 601 | " EPISODE_100_REWARDS = EPISODE_100_REWARDS[1:]\n", 602 | " \n", 603 | " avg_rewards = np.mean(EPISODE_100_REWARDS)\n", 604 | " \n", 605 | " if avg_rewards > CLEAR_REWARD:\n", 606 | " print()\n", 607 | " print(f\"Game cleared in {episode}, average rewards: {avg_rewards}\")\n", 608 | " break" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": { 614 | "deletable": true, 615 | "editable": true 616 | }, 617 | "source": [ 618 | "# Test run\n" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": 8, 624 | "metadata": { 625 | "collapsed": false, 626 | "deletable": true, 627 | "editable": true 628 | }, 629 | "outputs": [ 630 | { 631 | "name": "stdout", 632 | "output_type": "stream", 633 | "text": [ 634 | "[Episode-0] 198\n", 635 | "[Episode-20] 200\n", 636 | "[Episode-40] 200\n", 637 | "[Episode-60] 200\n", 638 | "[Episode-80] 200\n", 639 | "[Episode-98] 200\r" 640 | ] 641 | }, 642 | { 643 | "name": "stderr", 644 | "output_type": "stream", 645 | "text": [ 646 | "[2017-04-08 21:13:16,119] Finished writing results. 
You can upload them to the scoreboard via gym.upload('/home/kkweon/github/ReinforcementZeroToAll/gym-results')\n" 647 | ] 648 | }, 649 | { 650 | "name": "stdout", 651 | "output_type": "stream", 652 | "text": [ 653 | "[Episode-99] 200\r" 654 | ] 655 | } 656 | ], 657 | "source": [ 658 | "for episode in range(100):\n", 659 | " s = env.reset() \n", 660 | " done = False\n", 661 | " \n", 662 | " episode_r = 0\n", 663 | " while not done:\n", 664 | " if episode % 20 == 0:\n", 665 | " env.render()\n", 666 | " s = preprocess_state(s)\n", 667 | " a = agent.choose_an_action(s)\n", 668 | " s2, r, done, info = env.step(a)\n", 669 | " \n", 670 | " s = s2\n", 671 | " episode_r += r \n", 672 | " \n", 673 | " print(f\"[Episode-{episode}] {int(episode_r)}\", end=\"\\r\")\n", 674 | " \n", 675 | " if episode % 20 == 0:\n", 676 | " print()\n", 677 | " \n", 678 | "env.close()" 679 | ] 680 | } 681 | ], 682 | "metadata": { 683 | "kernelspec": { 684 | "display_name": "Python 3", 685 | "language": "python", 686 | "name": "python3" 687 | }, 688 | "language_info": { 689 | "codemirror_mode": { 690 | "name": "ipython", 691 | "version": 3 692 | }, 693 | "file_extension": ".py", 694 | "mimetype": "text/x-python", 695 | "name": "python", 696 | "nbconvert_exporter": "python", 697 | "pygments_lexer": "ipython3", 698 | "version": "3.6.1" 699 | } 700 | }, 701 | "nbformat": 4, 702 | "nbformat_minor": 2 703 | } 704 | -------------------------------------------------------------------------------- /10_2_A3C_threads.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple Asynchronous Methods for Deep Reinforcement Learning (A3C) 3 | 4 | - It mimics A3C by using multi threads 5 | - Distributed Tensorflow is preferred because of Python's GIL 6 | 7 | """ 8 | import tensorflow as tf 9 | import numpy as np 10 | import threading 11 | import gym 12 | import os 13 | from scipy.misc import imresize 14 | 15 | 16 | def copy_src_to_dst(from_scope, to_scope): 17 | """Creates a copy variable weights operation 18 | 19 | Args: 20 | from_scope (str): The name of scope to copy from 21 | It should be "global" 22 | to_scope (str): The name of scope to copy to 23 | It should be "thread-{}" 24 | 25 | Returns: 26 | list: Each element is a copy operation 27 | """ 28 | from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) 29 | to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) 30 | 31 | op_holder = [] 32 | for from_var, to_var in zip(from_vars, to_vars): 33 | op_holder.append(to_var.assign(from_var)) 34 | return op_holder 35 | 36 | 37 | def pipeline(image, new_HW=(80, 80), height_range=(35, 193), bg=(144, 72, 17)): 38 | """Returns a preprocessed image 39 | 40 | (1) Crop image (top and bottom) 41 | (2) Remove background & grayscale 42 | (3) Reszie to smaller image 43 | 44 | Args: 45 | image (3-D array): (H, W, C) 46 | new_HW (tuple): New image size (height, width) 47 | height_range (tuple): Height range (H_begin, H_end) else cropped 48 | bg (tuple): Background RGB Color (R, G, B) 49 | 50 | Returns: 51 | image (3-D array): (H, W, 1) 52 | """ 53 | image = crop_image(image, height_range) 54 | image = resize_image(image, new_HW) 55 | image = kill_background_grayscale(image, bg) 56 | image = np.expand_dims(image, axis=2) 57 | 58 | return image 59 | 60 | 61 | def resize_image(image, new_HW): 62 | """Returns a resized image 63 | 64 | Args: 65 | image (3-D array): Numpy array (H, W, C) 66 | new_HW (tuple): Target size (height, width) 67 | 68 | Returns: 69 | image (3-D 
array): Resized image (height, width, C) 70 | """ 71 | return imresize(image, new_HW, interp="nearest") 72 | 73 | 74 | def crop_image(image, height_range=(35, 195)): 75 | """Crops top and bottom 76 | 77 | Args: 78 | image (3-D array): Numpy image (H, W, C) 79 | height_range (tuple): Image will be cropped out 80 | except the height range between (min_height, max_height) 81 | 82 | Returns: 83 | image (3-D array): Numpy image (max_H - min_H, W, C) 84 | """ 85 | h_beg, h_end = height_range 86 | return image[h_beg:h_end, ...] 87 | 88 | 89 | def kill_background_grayscale(image, bg): 90 | """Make the background 0 91 | 92 | Args: 93 | image (3-D array): Numpy array (H, W, C) 94 | bg (tuple): RGB code of background (R, G, B) 95 | 96 | Returns: 97 | image (2-D array): Binarized image of shape (H, W) 98 | The background is 0 and everything else is 1 99 | """ 100 | H, W, _ = image.shape 101 | 102 | R = image[..., 0] 103 | G = image[..., 1] 104 | B = image[..., 2] 105 | 106 | cond = (R == bg[0]) & (G == bg[1]) & (B == bg[2]) 107 | 108 | image = np.zeros((H, W)) 109 | image[~cond] = 1 110 | 111 | return image 112 | 113 | 114 | def discount_reward(rewards, gamma=0.99): 115 | """Returns discounted rewards 116 | 117 | Args: 118 | rewards (1-D array): Reward array 119 | gamma (float): Discounted rate 120 | 121 | Returns: 122 | discounted_rewards: same shape as `rewards` 123 | 124 | Notes: 125 | In Pong, when the reward can be {-1, 0, 1}. 126 | However, when the reward is either -1 or 1, 127 | it means the game has been reset. 128 | Therefore, it's necessaray to reset `running_add` to 0 129 | whenever the reward is nonzero 130 | """ 131 | discounted_r = np.zeros_like(rewards, dtype=np.float32) 132 | running_add = 0 133 | for t in reversed(range(len(rewards))): 134 | if rewards[t] != 0: 135 | running_add = 0 136 | running_add = running_add * gamma + rewards[t] 137 | discounted_r[t] = running_add 138 | 139 | return discounted_r 140 | 141 | 142 | class A3CNetwork(object): 143 | 144 | def __init__(self, name, input_shape, output_dim, logdir=None): 145 | """A3C Network tensors and operations are defined here 146 | 147 | Args: 148 | name (str): The name of scope 149 | input_shape (list): The shape of input image [H, W, C] 150 | output_dim (int): Number of actions 151 | logdir (str, optional): directory to save summaries 152 | 153 | Notes: 154 | You should be familiar with Policy Gradients. 
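            Each worker thread builds its own copy of this network under its
            own variable scope and periodically syncs it from the shared
            "global" copy; gradients computed on the local copy are applied
            to the global copy through `apply_gradients`.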
155 | The only difference between vanilla PG and A3C is that there is 156 | an operation to apply gradients manually 157 | """ 158 | with tf.variable_scope(name): 159 | self.states = tf.placeholder(tf.float32, shape=[None, *input_shape], name="states") 160 | self.actions = tf.placeholder(tf.uint8, shape=[None], name="actions") 161 | self.rewards = tf.placeholder(tf.float32, shape=[None], name="rewards") 162 | self.advantage = tf.placeholder(tf.float32, shape=[None], name="advantage") 163 | 164 | action_onehot = tf.one_hot(self.actions, output_dim, name="action_onehot") 165 | net = self.states 166 | 167 | with tf.variable_scope("layer1"): 168 | net = tf.layers.conv2d(net, 169 | filters=16, 170 | kernel_size=(8, 8), 171 | strides=(4, 4), 172 | name="conv") 173 | net = tf.nn.relu(net, name="relu") 174 | 175 | with tf.variable_scope("layer2"): 176 | net = tf.layers.conv2d(net, 177 | filters=32, 178 | kernel_size=(4, 4), 179 | strides=(2, 2), 180 | name="conv") 181 | net = tf.nn.relu(net, name="relu") 182 | 183 | with tf.variable_scope("fc1"): 184 | net = tf.contrib.layers.flatten(net) 185 | net = tf.layers.dense(net, 256, name='dense') 186 | net = tf.nn.relu(net, name='relu') 187 | 188 | # actor network 189 | actions = tf.layers.dense(net, output_dim, name="final_fc") 190 | self.action_prob = tf.nn.softmax(actions, name="action_prob") 191 | single_action_prob = tf.reduce_sum(self.action_prob * action_onehot, axis=1) 192 | 193 | entropy = - self.action_prob * tf.log(self.action_prob + 1e-7) 194 | entropy = tf.reduce_sum(entropy, axis=1) 195 | 196 | log_action_prob = tf.log(single_action_prob + 1e-7) 197 | maximize_objective = log_action_prob * self.advantage + entropy * 0.005 198 | self.actor_loss = - tf.reduce_mean(maximize_objective) 199 | 200 | # value network 201 | self.values = tf.squeeze(tf.layers.dense(net, 1, name="values")) 202 | self.value_loss = tf.losses.mean_squared_error(labels=self.rewards, 203 | predictions=self.values) 204 | 205 | self.total_loss = self.actor_loss + self.value_loss * .5 206 | self.optimizer = tf.train.AdamOptimizer() 207 | 208 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) 209 | self.gradients = self.optimizer.compute_gradients(self.total_loss, var_list) 210 | self.gradients_placeholders = [] 211 | 212 | for grad, var in self.gradients: 213 | placeholder = tf.placeholder(var.dtype, shape=var.get_shape()) 214 | placeholder = tf.clip_by_norm(placeholder, 40) 215 | self.gradients_placeholders.append((placeholder, var)) 216 | self.apply_gradients = self.optimizer.apply_gradients(self.gradients_placeholders) 217 | 218 | if logdir: 219 | loss_summary = tf.summary.scalar("total_loss", self.total_loss) 220 | value_summary = tf.summary.histogram("values", self.values) 221 | 222 | self.summary_op = tf.summary.merge([loss_summary, value_summary]) 223 | self.summary_writer = tf.summary.FileWriter(logdir) 224 | 225 | 226 | class Agent(threading.Thread): 227 | 228 | def __init__(self, session, env, coord, name, global_network, input_shape, output_dim, logdir=None): 229 | """Agent worker thread 230 | 231 | Args: 232 | session (tf.Session): Tensorflow session needs to be shared 233 | env (gym.Env): Gym environment (Pong-v0) 234 | coord (tf.train.Coordinator): Tensorflow Queue Coordinator 235 | name (str): Name of this worker 236 | global_network (A3CNetwork): Global network that needs to be updated 237 | input_shape (list): Required for local A3CNetwork, [H, W, C] 238 | output_dim (int): Number of actions 239 | logdir (str, optional): If logdir is 
given, will write summary 240 | 241 | Methods: 242 | print(reward): prints episode rewards 243 | play_episode(): a single episode logic is stored in here 244 | run(): override threading.Thread.run 245 | choose_action(state) 246 | train(states, actions, rewards) 247 | """ 248 | super(Agent, self).__init__() 249 | self.local = A3CNetwork(name, input_shape, output_dim, logdir) 250 | self.global_to_local = copy_src_to_dst("global", name) 251 | self.global_network = global_network 252 | 253 | self.input_shape = input_shape 254 | self.output_dim = output_dim 255 | self.env = env 256 | self.sess = session 257 | self.coord = coord 258 | self.name = name 259 | self.logdir = logdir 260 | 261 | def print(self, reward): 262 | message = "Agent(name={}, reward={})".format(self.name, reward) 263 | print(message) 264 | 265 | def play_episode(self): 266 | self.sess.run(self.global_to_local) 267 | 268 | states = [] 269 | actions = [] 270 | rewards = [] 271 | 272 | s = self.env.reset() 273 | s = pipeline(s) 274 | state_diff = s 275 | 276 | done = False 277 | total_reward = 0 278 | time_step = 0 279 | while not done: 280 | 281 | a = self.choose_action(state_diff) 282 | s2, r, done, _ = self.env.step(a) 283 | 284 | s2 = pipeline(s2) 285 | total_reward += r 286 | 287 | states.append(state_diff) 288 | actions.append(a) 289 | rewards.append(r) 290 | 291 | state_diff = s2 - s 292 | s = s2 293 | 294 | if r == -1 or r == 1 or done: 295 | time_step += 1 296 | 297 | if time_step >= 5 or done: 298 | self.train(states, actions, rewards) 299 | self.sess.run(self.global_to_local) 300 | states, actions, rewards = [], [], [] 301 | time_step = 0 302 | 303 | self.print(total_reward) 304 | 305 | def run(self): 306 | while not self.coord.should_stop(): 307 | self.play_episode() 308 | 309 | def choose_action(self, state): 310 | """ 311 | Args: 312 | state (2-D array): (N, H, W, 1) 313 | """ 314 | state = np.reshape(state, [-1, *self.input_shape]) 315 | feed = { 316 | self.local.states: state 317 | } 318 | 319 | action = self.sess.run(self.local.action_prob, feed) 320 | action = np.squeeze(action) 321 | 322 | return np.random.choice(np.arange(self.output_dim) + 1, p=action) 323 | 324 | def train(self, states, actions, rewards): 325 | states = np.array(states) 326 | actions = np.array(actions) - 1 327 | rewards = np.array(rewards) 328 | 329 | feed = { 330 | self.local.states: states 331 | } 332 | 333 | values = self.sess.run(self.local.values, feed) 334 | 335 | rewards = discount_reward(rewards, gamma=0.99) 336 | rewards -= np.mean(rewards) 337 | rewards /= np.std(rewards) 338 | 339 | advantage = rewards - values 340 | advantage -= np.mean(advantage) 341 | advantage /= np.std(advantage) + 1e-8 342 | 343 | feed = { 344 | self.local.states: states, 345 | self.local.actions: actions, 346 | self.local.rewards: rewards, 347 | self.local.advantage: advantage 348 | } 349 | 350 | gradients = self.sess.run(self.local.gradients, feed) 351 | 352 | feed = [] 353 | for (grad, _), (placeholder, _) in zip(gradients, self.global_network.gradients_placeholders): 354 | feed.append((placeholder, grad)) 355 | 356 | feed = dict(feed) 357 | self.sess.run(self.global_network.apply_gradients, feed) 358 | 359 | 360 | def main(): 361 | try: 362 | tf.reset_default_graph() 363 | sess = tf.InteractiveSession() 364 | coord = tf.train.Coordinator() 365 | 366 | checkpoint_dir = "checkpoint" 367 | monitor_dir = "monitors" 368 | save_path = os.path.join(checkpoint_dir, "model.ckpt") 369 | 370 | if not os.path.exists(checkpoint_dir): 371 | 
os.makedirs(checkpoint_dir) 372 | print("Directory {} was created".format(checkpoint_dir)) 373 | 374 | n_threads = 16 375 | input_shape = [80, 80, 1] 376 | output_dim = 3 # {1, 2, 3} 377 | global_network = A3CNetwork(name="global", 378 | input_shape=input_shape, 379 | output_dim=output_dim) 380 | 381 | thread_list = [] 382 | env_list = [] 383 | 384 | for id in range(n_threads): 385 | env = gym.make("PongDeterministic-v4") 386 | 387 | if id == 0: 388 | env = gym.wrappers.Monitor(env, monitor_dir, force=True) 389 | 390 | single_agent = Agent(env=env, 391 | session=sess, 392 | coord=coord, 393 | name="thread_{}".format(id), 394 | global_network=global_network, 395 | input_shape=input_shape, 396 | output_dim=output_dim) 397 | thread_list.append(single_agent) 398 | env_list.append(env) 399 | 400 | if tf.train.get_checkpoint_state(os.path.dirname(save_path)): 401 | var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "global") 402 | saver = tf.train.Saver(var_list=var_list) 403 | saver.restore(sess, save_path) 404 | print("Model restored to global") 405 | 406 | else: 407 | init = tf.global_variables_initializer() 408 | sess.run(init) 409 | print("No model is found") 410 | 411 | for t in thread_list: 412 | t.start() 413 | 414 | print("Ctrl + C to close") 415 | coord.wait_for_stop() 416 | 417 | except KeyboardInterrupt: 418 | var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "global") 419 | saver = tf.train.Saver(var_list=var_list) 420 | saver.save(sess, save_path) 421 | print('Checkpoint Saved to {}'.format(save_path)) 422 | 423 | print("Closing threads") 424 | coord.request_stop() 425 | coord.join(thread_list) 426 | 427 | print("Closing environments") 428 | for env in env_list: 429 | env.close() 430 | 431 | sess.close() 432 | 433 | 434 | if __name__ == '__main__': 435 | main() 436 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Zero to All 2 | 3 | This is work in progress and it may have bugs. 4 | However, we call for your comments and pull requests. 5 | 6 | We emphasize on the following: 7 | 8 | * **Readiability** over anything else 9 | - That's why we choose Python 10 | * **Pythonic code** 11 | - PEP8 12 | - Docstring 13 | * **Use High Level Tensorflow API** 14 | - Cleaner and easier to understand 15 | * **KISS** 16 | - [Keep It Simple Stupid](https://www.techopedia.com/definition/20262/keep-it-simple-stupid-principle-kiss-principle) 17 | 18 | ## Lecture videos 19 | - [Youtube](https://www.youtube.com/playlist?list=PLlMkM4tgfjnKsCWav-Z2F-MMFRx-2gMGG) 20 | 21 | ## File naming rule 22 | 23 | ``` 24 | 99_9_description.py 25 | ``` 26 | - First two digits indicates a category of algorithms 27 | - 07: DQN 28 | - 08: Policy Gradient 29 | - 09: Random Search Methods 30 | - 10: Actor Critic 31 | - A second digit indicates an id 32 | - Description shows what the file is about 33 | 34 | 35 | ## How to use uploader 36 | It makes the uploading process a little bit simpler 37 | 38 | 1. Go to https://gym.openai.com/ 39 | 2. Login with your github account 40 | * https://gym.openai.com/users/YOUR_GITHUB_ACCOUNT 41 | 3. Copy your OpenAI api key from the upper right corner of your profile page 42 | ![user](assets/openai_user.jpg) 43 | 4. Modify `gym.ini` 44 | 5. 
In console 45 | ```bash 46 | #python gym_uploader.py /path/to/gym_results 47 | python gym_uploader.py gym-results/ 48 | ``` 49 | 50 | ## Install requirements 51 | ```bash 52 | pip install -r requirements.txt 53 | ``` 54 | 55 | ## Run test and autopep8 56 | TODO: Need to add more test cases 57 | 58 | ```bash 59 | pytest 60 | ``` 61 | 62 | ```bash 63 | # pip install autopep8 # if you haven't install 64 | autopep8 . --recursive --in-place --pep8-passes 2000 --verbose --ignore E501 65 | ``` 66 | 67 | ## Contributions/Comments 68 | We always welcome your comments and pull requests. 69 | -------------------------------------------------------------------------------- /assets/actor_critic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hunkim/ReinforcementZeroToAll/276e950a95c006666f1a34362dfd40ef4264ffbb/assets/actor_critic.png -------------------------------------------------------------------------------- /assets/openai_user.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hunkim/ReinforcementZeroToAll/276e950a95c006666f1a34362dfd40ef4264ffbb/assets/openai_user.jpg -------------------------------------------------------------------------------- /dqn.py: -------------------------------------------------------------------------------- 1 | """DQN Class 2 | 3 | DQN(NIPS-2013) 4 | "Playing Atari with Deep Reinforcement Learning" 5 | https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 6 | 7 | DQN(Nature-2015) 8 | "Human-level control through deep reinforcement learning" 9 | http://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf 10 | """ 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | 15 | class DQN: 16 | 17 | def __init__(self, session: tf.Session, input_size: int, output_size: int, name: str="main") -> None: 18 | """DQN Agent can 19 | 20 | 1) Build network 21 | 2) Predict Q_value given state 22 | 3) Train parameters 23 | 24 | Args: 25 | session (tf.Session): Tensorflow session 26 | input_size (int): Input dimension 27 | output_size (int): Number of discrete actions 28 | name (str, optional): TF Graph will be built under this name scope 29 | """ 30 | self.session = session 31 | self.input_size = input_size 32 | self.output_size = output_size 33 | self.net_name = name 34 | 35 | self._build_network() 36 | 37 | def _build_network(self, h_size=16, l_rate=0.001) -> None: 38 | """DQN Network architecture (simple MLP) 39 | 40 | Args: 41 | h_size (int, optional): Hidden layer dimension 42 | l_rate (float, optional): Learning rate 43 | """ 44 | with tf.variable_scope(self.net_name): 45 | self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x") 46 | net = self._X 47 | 48 | net = tf.layers.dense(net, h_size, activation=tf.nn.relu) 49 | net = tf.layers.dense(net, self.output_size) 50 | self._Qpred = net 51 | 52 | self._Y = tf.placeholder(tf.float32, shape=[None, self.output_size]) 53 | self._loss = tf.losses.mean_squared_error(self._Y, self._Qpred) 54 | 55 | optimizer = tf.train.AdamOptimizer(learning_rate=l_rate) 56 | self._train = optimizer.minimize(self._loss) 57 | 58 | def predict(self, state: np.ndarray) -> np.ndarray: 59 | """Returns Q(s, a) 60 | 61 | Args: 62 | state (np.ndarray): State array, shape (n, input_dim) 63 | 64 | Returns: 65 | np.ndarray: Q value array, shape (n, output_dim) 66 | """ 67 | x = np.reshape(state, [-1, self.input_size]) 68 | return self.session.run(self._Qpred, 
feed_dict={self._X: x}) 69 | 70 | def update(self, x_stack: np.ndarray, y_stack: np.ndarray) -> list: 71 | """Performs updates on given X and y and returns a result 72 | 73 | Args: 74 | x_stack (np.ndarray): State array, shape (n, input_dim) 75 | y_stack (np.ndarray): Target Q array, shape (n, output_dim) 76 | 77 | Returns: 78 | list: First element is loss, second element is a result from train step 79 | """ 80 | feed = { 81 | self._X: x_stack, 82 | self._Y: y_stack 83 | } 84 | return self.session.run([self._loss, self._train], feed) 85 | -------------------------------------------------------------------------------- /gym.ini: -------------------------------------------------------------------------------- 1 | [default] 2 | # key is obtained from 3 | # https://gym.openai.com 4 | 5 | GYM_API_KEY=YOUR_API_KEY -------------------------------------------------------------------------------- /gym_uploader.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import configparser 3 | import gym 4 | 5 | 6 | def read_config(file='gym.ini'): 7 | parser = configparser.ConfigParser() 8 | parser.read(file) 9 | 10 | return parser 11 | 12 | 13 | def read_argparse(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('path', help='path to gym results') 16 | 17 | return parser.parse_args() 18 | 19 | 20 | def upload(gym_path, key): 21 | return gym.upload(gym_path, api_key=key) 22 | 23 | 24 | if __name__ == '__main__': 25 | config = read_config() 26 | args = read_argparse() 27 | key = config['default']['GYM_API_KEY'] 28 | 29 | if len(key) == 0 or key is None or key == "YOUR_API_KEY": 30 | print("Please enter the API key in gym.ini ") 31 | 32 | else: 33 | upload(args.path, key) 34 | -------------------------------------------------------------------------------- /mini_pacman.py: -------------------------------------------------------------------------------- 1 | # Simple pacman to avoid falling bombs 2 | # Original author: Jin Kim (golbin) https://github.com/golbin/TensorFlow-Tutorials 3 | import numpy as np 4 | import random 5 | 6 | import matplotlib.pyplot as plt 7 | import matplotlib.patches as patches 8 | 9 | 10 | class Gym: 11 | 12 | def __init__(self, screen_width=6, screen_height=10, show_game=True): 13 | self.screen_width = screen_width 14 | self.screen_height = screen_height 15 | self.road_width = (screen_width // 2) 16 | self.road_left = self.road_width // 2 + 1 17 | self.road_right = self.road_left + self.road_width - 1 18 | 19 | self.car = {"col": 0, "row": 2} 20 | self.block = [ 21 | {"col": 0, "row": 0, "speed": 1}, 22 | {"col": 0, "row": 0, "speed": 2}, 23 | ] 24 | 25 | self.total_reward = 0. 26 | self.current_reward = 0. 
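        # Added note: with the default 6x10 screen the playable road spans
        # columns road_left=2 .. road_right=4 (road_width=3), and
        # total_reward / total_game is shown as the running average reward
        # in draw_screen().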
27 | self.total_game = 0 28 | self.show_game = show_game 29 | 30 | if show_game: 31 | self.fig, self.axis = self.prepare_display() 32 | 33 | def prepare_display(self): 34 | fig, axis = plt.subplots(figsize=(4, 6)) 35 | fig.set_size_inches(4, 6) 36 | fig.canvas.mpl_connect('close_event', exit) 37 | plt.axis((0, self.screen_width, 0, self.screen_height)) 38 | plt.tick_params(top='off', right='off', 39 | left='off', labelleft='off', 40 | bottom='off', labelbottom='off') 41 | 42 | plt.draw() 43 | plt.ion() 44 | plt.show() 45 | 46 | return fig, axis 47 | 48 | def action_space_sample(self): 49 | return random.randint(0, 2) 50 | 51 | def get_state(self): 52 | state = np.zeros((self.screen_width, self.screen_height)) 53 | 54 | state[self.car["col"], self.car["row"]] = 1 55 | 56 | if self.block[0]["row"] < self.screen_height: 57 | state[self.block[0]["col"], self.block[0]["row"]] = 1 58 | 59 | if self.block[1]["row"] < self.screen_height: 60 | state[self.block[1]["col"], self.block[1]["row"]] = 1 61 | 62 | return state.reshape((-1, self.screen_width * self.screen_height)) 63 | 64 | def draw_screen(self): 65 | title = " Avg. Reward: %d Reward: %d Total Game: %d" % ( 66 | self.total_reward / self.total_game, 67 | self.current_reward, 68 | self.total_game) 69 | 70 | self.axis.clear() 71 | self.axis.set_title(title, fontsize=12) 72 | 73 | road = patches.Rectangle((self.road_left - 1, 0), self.road_width + 1, 74 | self.screen_height, linewidth=0, facecolor="#333333") 75 | 76 | if self._is_gameover(): 77 | car_color = "#FF0000" 78 | else: 79 | car_color = "#00FF00" 80 | car = patches.Wedge((self.car["col"] - 0.5, self.car["row"] - 0.5), 81 | 0.5, 20, 340, linewidth=0, facecolor=car_color) 82 | block1 = patches.Circle((self.block[0][ 83 | "col"] - 0.5, self.block[0]["row"]), 0.5, linewidth=0, facecolor="#0099FF") 84 | block2 = patches.Circle((self.block[1][ 85 | "col"] - 0.5, self.block[1]["row"]), 0.5, linewidth=0, facecolor="#EB70AA") 86 | 87 | self.axis.add_patch(road) 88 | self.axis.add_patch(car) 89 | self.axis.add_patch(block1) 90 | self.axis.add_patch(block2) 91 | 92 | self.fig.canvas.draw() 93 | plt.pause(0.0001) 94 | 95 | def reset(self): 96 | self.current_reward = 0 97 | self.total_game += 1 98 | 99 | self.car["col"] = int(self.screen_width / 2) 100 | 101 | self.block[0]["col"] = random.randrange( 102 | self.road_left, self.road_right + 1) 103 | self.block[0]["row"] = 0 104 | self.block[1]["col"] = random.randrange( 105 | self.road_left, self.road_right + 1) 106 | self.block[1]["row"] = 0 107 | 108 | self.update_block() 109 | return self.get_state() 110 | 111 | def update_car(self, move): 112 | self.car["col"] = max(self.road_left, self.car["col"] + move) 113 | self.car["col"] = min(self.car["col"], self.road_right) 114 | 115 | def update_block(self): 116 | reward = 0 117 | 118 | if self.block[0]["row"] > 0: 119 | self.block[0]["row"] -= self.block[0]["speed"] 120 | else: 121 | self.block[0]["col"] = random.randrange( 122 | self.road_left, self.road_right + 1) 123 | self.block[0]["row"] = self.screen_height 124 | reward += 1 125 | 126 | if self.block[1]["row"] > 0: 127 | self.block[1]["row"] -= self.block[1]["speed"] 128 | else: 129 | self.block[1]["col"] = random.randrange( 130 | self.road_left, self.road_right + 1) 131 | self.block[1]["row"] = self.screen_height 132 | reward += 1 133 | 134 | return reward 135 | 136 | def _is_gameover(self): 137 | if ((self.car["col"] == self.block[0]["col"] and 138 | self.car["row"] == self.block[0]["row"]) or 139 | (self.car["col"] == self.block[1]["col"] and 
140 | self.car["row"] == self.block[1]["row"])): 141 | 142 | self.total_reward += self.current_reward 143 | 144 | return True 145 | else: 146 | return False 147 | 148 | def step(self, action): 149 | # action: 0: left, 1: stay, 2: right 150 | self.update_car(action - 1) 151 | escape_reward = self.update_block() 152 | stable_reward = 1. / self.screen_height if action == 1 else 0 153 | gameover = self._is_gameover() 154 | stat = self.get_state() 155 | 156 | if gameover: 157 | reward = -2 158 | else: 159 | reward = escape_reward + stable_reward 160 | self.current_reward += reward 161 | 162 | if self.show_game: 163 | self.draw_screen() 164 | 165 | return stat, reward, gameover, None 166 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.7.3 2 | matplotlib==2.0.0 3 | numpy==1.12.0 4 | readchar==0.7 5 | tensorflow==1.0.0 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hunkim/ReinforcementZeroToAll/276e950a95c006666f1a34362dfd40ef4264ffbb/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_DQN.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from dqn import DQN 4 | 5 | 6 | class TestDQN: 7 | def setup_method(self, method): 8 | print("Session open") 9 | tf.reset_default_graph() 10 | self.sess = tf.Session() 11 | 12 | def teardown_method(self, method): 13 | print("Session close") 14 | self.sess.close() 15 | 16 | def test_one_agent(self): 17 | agent = DQN(self.sess, 4, 2) 18 | assert isinstance(agent, DQN) is True 19 | 20 | assert hasattr(agent, "_X") 21 | assert hasattr(agent, "_Y") 22 | assert hasattr(agent, "_loss") 23 | assert hasattr(agent, "_train") 24 | 25 | def run_init(self): 26 | init = tf.global_variables_initializer() 27 | self.sess.run(init) 28 | 29 | def test_agent_can_take_observation(self): 30 | obs = np.zeros([1, 4]) 31 | agent = DQN(self.sess, 4, 2) 32 | self.run_init() 33 | output = agent.predict(obs) 34 | np.testing.assert_almost_equal(output, [[0, 0]]) 35 | 36 | obs = np.zeros([4, ]) 37 | output = agent.predict(obs) 38 | np.testing.assert_almost_equal(output, [[0, 0]]) 39 | 40 | obs = np.zeros([32, 4]) 41 | output = agent.predict(obs) 42 | np.testing.assert_almost_equal(output, [[0, 0] for _ in range(32)]) 43 | 44 | def test_agent_can_run_update(self): 45 | x_stack = np.zeros([32, 4]) 46 | y_stack = np.zeros([32, 2]) 47 | 48 | agent = DQN(self.sess, 4, 2) 49 | self.run_init() 50 | 51 | output = agent.update(x_stack, y_stack) 52 | assert output[0] == 0 53 | 54 | x_stack = np.zeros([1, 4]) 55 | y_stack = np.zeros([1, 2]) 56 | 57 | output = agent.update(x_stack, y_stack) 58 | assert output[0] == 0 59 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hunkim/ReinforcementZeroToAll/276e950a95c006666f1a34362dfd40ef4264ffbb/utils/__init__.py -------------------------------------------------------------------------------- /utils/prints.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for console printing
3 | """ 4 | import os 5 | import time 6 | 7 | 8 | def clear_screen() -> None: 9 | """Clear terminal console""" 10 | os.system("cls" if os.name == "nt" else "clear") 11 | 12 | 13 | def print_result(score: float) -> None: 14 | """Prints GOAL if score is positive else DEAD""" 15 | message = "GOAL" if score > 0 else "DEAD" 16 | print("=" * 50) 17 | print("{:^50}".format(message)) 18 | print("=" * 50) 19 | time.sleep(3) 20 | --------------------------------------------------------------------------------