├── .gitignore ├── 01_0_play_frozenlake_det.py ├── 01_1_play_frozenlake_det.py ├── 02_random_frozenkake_det.py ├── 03_0_q_table_frozenlake_det.py ├── 03_1_q_table_frozenlake_det.py ├── 03_2_q_table_frozenlake_det.py ├── 04_play_frozenlake.py ├── 05_0_q_table_frozenlake.py ├── 05_q_table_frozenlake.py ├── 06_q_net_frozenlake.py ├── 07_0_random_cartpole.py ├── 07_1_q_net_cartpole.py ├── 07_2_dqn_2013_cartpole.py ├── 07_3_dqn_2015_cartpole.py ├── 08_1_pg_cartpole.py ├── 08_2_softmax_pg_cartpole.py ├── 08_3_softmax_pg_pacman.py ├── 08_4_softmax_pg_pong.py ├── 08_4_softmax_pg_pong_y.py ├── 09_2_cross_entropy.py ├── 10_1_Actor_Critic.ipynb ├── 10_2_A3C_threads.py ├── README.md ├── assets ├── actor_critic.png └── openai_user.jpg ├── dqn.py ├── gym.ini ├── gym_uploader.py ├── mini_pacman.py ├── requirements.txt ├── tests ├── __init__.py └── test_DQN.py └── utils ├── __init__.py └── prints.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | gym-results/ 4 | pacman_log 5 | -------------------------------------------------------------------------------- /01_0_play_frozenlake_det.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.envs.registration import register 3 | 4 | # http://stackoverflow.com/questions/510357/python-read-a-single-character-from-the-user 5 | import readchar # pip3 install readchar 6 | 7 | # MACROS 8 | LEFT = 0 9 | DOWN = 1 10 | RIGHT = 2 11 | UP = 3 12 | 13 | # Key mapping 14 | arrow_keys = { 15 | '\x1b[A': UP, 16 | '\x1b[B': DOWN, 17 | '\x1b[C': RIGHT, 18 | '\x1b[D': LEFT} 19 | 20 | # Register FrozenLake with is_slippery False 21 | register( 22 | id='FrozenLake-v3', 23 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 24 | kwargs={'map_name': '4x4', 'is_slippery': False} 25 | ) 26 | 27 | env = gym.make('FrozenLake-v3') 28 | env.render() # Show the initial board 29 | 30 | while True: 31 | # Choose an action from keyboard 32 | key = readchar.readkey() 33 | if key not in arrow_keys.keys(): 34 | print("Game aborted!") 35 | break 36 | 37 | action = arrow_keys[key] 38 | state, reward, done, info = env.step(action) 39 | env.render() # Show the board after action 40 | print("State: ", state, "Action: ", action, 41 | "Reward: ", reward, "Info: ", info) 42 | 43 | if done: 44 | print("Finished with reward", reward) 45 | break 46 | -------------------------------------------------------------------------------- /01_1_play_frozenlake_det.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.envs.registration import register 3 | import sys 4 | import tty 5 | import termios 6 | 7 | 8 | class _Getch: 9 | 10 | def __call__(self): 11 | fd = sys.stdin.fileno() 12 | old_settings = termios.tcgetattr(fd) 13 | try: 14 | tty.setraw(sys.stdin.fileno()) 15 | ch = sys.stdin.read(3) 16 | finally: 17 | termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) 18 | return ch 19 | 20 | inkey = _Getch() 21 | 22 | # MACROS 23 | LEFT = 0 24 | DOWN = 1 25 | RIGHT = 2 26 | UP = 3 27 | 28 | # Key mapping 29 | arrow_keys = { 30 | '\x1b[A': UP, 31 | '\x1b[B': DOWN, 32 | '\x1b[C': RIGHT, 33 | '\x1b[D': LEFT} 34 | 35 | # Register FrozenLake with is_slippery False 36 | register( 37 | id='FrozenLake-v3', 38 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 39 | kwargs={'map_name': '4x4', 'is_slippery': False} 40 | ) 41 | 42 | env = gym.make('FrozenLake-v3') 43 | env.render() # Show the initial board 44 | 45 | while True: 46 | # Choose 
an action from keyboard 47 | key = inkey() 48 | if key not in arrow_keys.keys(): 49 | print("Game aborted!") 50 | break 51 | 52 | action = arrow_keys[key] 53 | state, reward, done, info = env.step(action) 54 | env.render() # Show the board after action 55 | print("State: ", state, "Action: ", action, 56 | "Reward: ", reward, "Info: ", info) 57 | 58 | if done: 59 | print("Finished with reward", reward) 60 | break 61 | -------------------------------------------------------------------------------- /02_random_frozenkake_det.py: -------------------------------------------------------------------------------- 1 | # https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 2 | import gym 3 | import random 4 | from gym.envs.registration import register 5 | import matplotlib.pyplot as plt 6 | 7 | register( 8 | id='FrozenLake-v3', 9 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 10 | kwargs={'map_name': '4x4', 11 | 'is_slippery': False} 12 | ) 13 | 14 | env = gym.make('FrozenLake-v0') 15 | env.render() 16 | 17 | num_episodes = 2000 18 | 19 | rList = [] 20 | for i in range(num_episodes): 21 | # Reset environment and get first new observation 22 | env.reset() 23 | rAll = 0 24 | done = False 25 | 26 | while not done: 27 | # Random action 28 | action = random.randint(0, env.action_space.n - 1) 29 | 30 | # Get new state and reward from environment 31 | _state, reward, done, _info = env.step(action) 32 | 33 | # rAll will be 1 if success, o otherwise 34 | rAll += reward 35 | 36 | rList.append(rAll) 37 | 38 | print("Success rate: " + str(sum(rList) / num_episodes)) 39 | plt.plot(rList) 40 | plt.show() 41 | -------------------------------------------------------------------------------- /03_0_q_table_frozenlake_det.py: -------------------------------------------------------------------------------- 1 | # https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 2 | 3 | import gym 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from gym.envs.registration import register 7 | import random as pr 8 | 9 | # https://gist.github.com/stober/1943451 10 | 11 | 12 | def rargmax(vector): 13 | """ Argmax that chooses randomly among eligible maximum indices. 
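    e.g. rargmax(np.array([1, 3, 3, 0])) returns index 1 or 2, each with equal probability.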
""" 14 | m = np.amax(vector) 15 | indices = np.nonzero(vector == m)[0] 16 | return pr.choice(indices) 17 | 18 | 19 | register( 20 | id='FrozenLake-v3', 21 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 22 | kwargs={'map_name': '4x4', 23 | 'is_slippery': False} 24 | ) 25 | env = gym.make('FrozenLake-v3') 26 | 27 | # Initialize table with all zeros 28 | Q = np.zeros([env.observation_space.n, env.action_space.n]) 29 | # Set learning parameters 30 | num_episodes = 2000 31 | 32 | # create lists to contain total rewards and steps per episode 33 | rList = [] 34 | for i in range(num_episodes): 35 | # Reset environment and get first new observation 36 | state = env.reset() 37 | rAll = 0 38 | done = False 39 | 40 | # The Q-Table learning algorithm 41 | while not done: 42 | action = rargmax(Q[state, :]) 43 | 44 | # Get new state and reward from environment 45 | new_state, reward, done, _ = env.step(action) 46 | 47 | # Update Q-Table with new knowledge using learning rate 48 | Q[state, action] = reward + np.max(Q[new_state, :]) 49 | 50 | rAll += reward 51 | state = new_state 52 | 53 | rList.append(rAll) 54 | 55 | print("Success rate: " + str(sum(rList) / num_episodes)) 56 | print("Final Q-Table Values") 57 | print("LEFT DOWN RIGHT UP") 58 | print(Q) 59 | plt.bar(range(len(rList)), rList, color="blue") 60 | plt.show() 61 | -------------------------------------------------------------------------------- /03_1_q_table_frozenlake_det.py: -------------------------------------------------------------------------------- 1 | # https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 2 | import gym 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from gym.envs.registration import register 6 | 7 | register( 8 | id='FrozenLake-v3', 9 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 10 | kwargs={'map_name': '4x4', 11 | 'is_slippery': False} 12 | ) 13 | 14 | env = gym.make('FrozenLake-v3') 15 | 16 | # Initialize table with all zeros 17 | Q = np.zeros([env.observation_space.n, env.action_space.n]) 18 | # Discount factor 19 | dis = .99 20 | num_episodes = 2000 21 | 22 | # create lists to contain total rewards and steps per episode 23 | rList = [] 24 | 25 | for i in range(num_episodes): 26 | # Reset environment and get first new observation 27 | state = env.reset() 28 | rAll = 0 29 | done = False 30 | 31 | # The Q-Table learning algorithm 32 | while not done: 33 | # Choose an action by greedily (with noise) picking from Q table 34 | action = np.argmax(Q[state, :] + np.random.randn(1, 35 | env.action_space.n) / (i + 1)) 36 | 37 | # Get new state and reward from environment 38 | new_state, reward, done, _ = env.step(action) 39 | 40 | # Update Q-Table with new knowledge using decay rate 41 | Q[state, action] = reward + dis * np.max(Q[new_state, :]) 42 | 43 | rAll += reward 44 | state = new_state 45 | 46 | rList.append(rAll) 47 | 48 | print("Success rate: " + str(sum(rList) / num_episodes)) 49 | print("Final Q-Table Values") 50 | print(Q) 51 | plt.bar(range(len(rList)), rList, color="blue") 52 | plt.show() 53 | -------------------------------------------------------------------------------- /03_2_q_table_frozenlake_det.py: -------------------------------------------------------------------------------- 1 | # https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 2 | import gym 3 | import numpy as np 4 | import 
matplotlib.pyplot as plt 5 | from gym.envs.registration import register 6 | import random as pr 7 | 8 | register( 9 | id='FrozenLake-v3', 10 | entry_point='gym.envs.toy_text:FrozenLakeEnv', 11 | kwargs={'map_name': '4x4', 12 | 'is_slippery': False} 13 | ) 14 | 15 | env = gym.make('FrozenLake-v3') 16 | 17 | # Initialize table with all zeros 18 | Q = np.zeros([env.observation_space.n, env.action_space.n]) 19 | # Set learning parameters 20 | dis = .99 21 | num_episodes = 2000 22 | 23 | # create lists to contain total rewards and steps per episode 24 | rList = [] 25 | for i in range(num_episodes): 26 | # Reset environment and get first new observation 27 | state = env.reset() 28 | rAll = 0 29 | done = False 30 | 31 | e = 1. / ((i // 100) + 1) # Python2&3 32 | 33 | # The Q-Table learning algorithm 34 | while not done: 35 | # Choose an action by e greedy 36 | if np.random.rand(1) < e: 37 | action = env.action_space.sample() 38 | else: 39 | action = np.argmax(Q[state, :]) 40 | 41 | # Get new state and reward from environment 42 | new_state, reward, done, _ = env.step(action) 43 | 44 | # Update Q-Table with new knowledge using learning rate 45 | Q[state, action] = reward + dis * np.max(Q[new_state, :]) 46 | 47 | rAll += reward 48 | state = new_state 49 | 50 | rList.append(rAll) 51 | 52 | print("Success rate: " + str(sum(rList) / num_episodes)) 53 | print("Final Q-Table Values") 54 | print(Q) 55 | plt.bar(range(len(rList)), rList, color="blue") 56 | plt.show() 57 | -------------------------------------------------------------------------------- /04_play_frozenlake.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import readchar 3 | 4 | import utils.prints as print_utils 5 | 6 | # MACROS 7 | LEFT = 0 8 | DOWN = 1 9 | RIGHT = 2 10 | UP = 3 11 | 12 | # Key mapping 13 | arrow_keys = {'\x1b[A': UP, '\x1b[B': DOWN, '\x1b[C': RIGHT, '\x1b[D': LEFT} 14 | 15 | # is_slippery True 16 | env = gym.make('FrozenLake-v0') 17 | 18 | env.reset() 19 | 20 | print_utils.clear_screen() 21 | env.render() # Show the initial board 22 | 23 | while True: 24 | # Choose an action from keyboard 25 | key = readchar.readkey() 26 | 27 | if key not in arrow_keys.keys(): 28 | print("Game aborted!") 29 | break 30 | 31 | action = arrow_keys[key] 32 | state, reward, done, info = env.step(action) 33 | 34 | # Show the board after action 35 | print_utils.clear_screen() 36 | env.render() 37 | 38 | print("State: {} Action: {} Reward: {} Info: {}".format( 39 | state, action, reward, info)) 40 | 41 | if done: 42 | print_utils.print_result(reward) 43 | -------------------------------------------------------------------------------- /05_0_q_table_frozenlake.py: -------------------------------------------------------------------------------- 1 | # https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 2 | 3 | import gym 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | env = gym.make('FrozenLake-v0') 8 | 9 | # Initialize table with all zeros 10 | Q = np.zeros([env.observation_space.n, env.action_space.n]) 11 | 12 | # Set learning parameters 13 | learning_rate = .85 14 | dis = .99 15 | num_episodes = 2000 16 | 17 | # create lists to contain total rewards and steps per episode 18 | rList = [] 19 | for i in range(num_episodes): 20 | # Reset environment and get first new observation 21 | state = env.reset() 22 | rAll = 0 23 | done = False 24 | 25 | # The Q-Table learning algorithm 
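    # Exploration note: the Gaussian noise added to Q below is scaled by 1 / (i + 1),
    # so action selection is close to random in early episodes and approaches a
    # plain greedy argmax as training progresses.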
26 | while not done: 27 | # Choose an action by greedily (with noise) picking from Q table 28 | action = np.argmax(Q[state, :] + np.random.randn(1, 29 | env.action_space.n) / (i + 1)) 30 | 31 | # Get new state and reward from environment 32 | new_state, reward, done, _ = env.step(action) 33 | 34 | # Update Q-Table with new knowledge using learning rate 35 | Q[state, action] = reward + dis * np.max(Q[new_state, :]) 36 | state = new_state 37 | 38 | rAll += reward 39 | 40 | rList.append(rAll) 41 | 42 | print("Score over time: " + str(sum(rList) / num_episodes)) 43 | print("Final Q-Table Values") 44 | print(Q) 45 | plt.bar(range(len(rList)), rList, color="blue") 46 | plt.show() 47 | -------------------------------------------------------------------------------- /05_q_table_frozenlake.py: -------------------------------------------------------------------------------- 1 | """ 2 | FrozenLake solver using Q-table 3 | https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-0-q-learning-with-tables-and-neural-networks-d195264329d0#.pjz9g59ap 4 | """ 5 | 6 | import time 7 | 8 | import gym 9 | import numpy as np 10 | 11 | import utils.prints as print_utils 12 | 13 | N_ACTIONS = 4 14 | N_STATES = 16 15 | 16 | LEARNING_RATE = .5 17 | DISCOUNT_RATE = .98 18 | 19 | N_EPISODES = 2000 20 | 21 | def main(): 22 | """Main""" 23 | frozone_lake_env = gym.make("FrozenLake-v0") 24 | 25 | # Initialize table with all zeros 26 | Q = np.zeros([N_STATES, N_ACTIONS]) 27 | 28 | # Set learning parameters 29 | 30 | # create lists to contain total rewards and steps per episode 31 | rewards = [] 32 | 33 | for i in range(N_EPISODES): 34 | # Reset environment and get first new observation 35 | state = frozone_lake_env.reset() 36 | episode_reward = 0 37 | done = False 38 | 39 | # The Q-Table learning algorithm 40 | while not done: 41 | # Choose an action by greedily (with noise) picking from Q table 42 | noise = np.random.randn(1, N_ACTIONS) / (i + 1) 43 | action = np.argmax(Q[state, :] + noise) 44 | 45 | # Get new state and reward from environment 46 | new_state, reward, done, _ = frozone_lake_env.step(action) 47 | 48 | reward = -1 if done and reward < 1 else reward 49 | 50 | # Update Q-Table with new knowledge using learning rate 51 | Q[state, action] = ( 52 | 1 - LEARNING_RATE) * Q[state, action] + LEARNING_RATE * ( 53 | reward + DISCOUNT_RATE * np.max(Q[new_state, :])) 54 | 55 | episode_reward += reward 56 | state = new_state 57 | 58 | rewards.append(episode_reward) 59 | 60 | print("Score over time: " + str(sum(rewards) / N_EPISODES)) 61 | print("Final Q-Table Values") 62 | 63 | for i in range(10): 64 | # Reset environment and get first new observation 65 | state = frozone_lake_env.reset() 66 | episode_reward = 0 67 | done = False 68 | 69 | # The Q-Table learning algorithm 70 | while not done: 71 | # Choose an action by greedily (with noise) picking from Q table 72 | action = np.argmax(Q[state, :]) 73 | 74 | # Get new state and reward from environment 75 | new_state, reward, done, _ = frozone_lake_env.step(action) 76 | print_utils.clear_screen() 77 | frozone_lake_env.render() 78 | time.sleep(.1) 79 | 80 | episode_reward += reward 81 | state = new_state 82 | 83 | if done: 84 | print("Episode Reward: {}".format(episode_reward)) 85 | print_utils.print_result(episode_reward) 86 | 87 | rewards.append(episode_reward) 88 | 89 | frozone_lake_env.close() 90 | 91 | 92 | if __name__ == '__main__': 93 | main() 94 | -------------------------------------------------------------------------------- 
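A minimal standalone sketch of the interpolated tabular update used in 05_q_table_frozenlake.py above (the helper name `q_update` and its defaults are illustrative, not part of the repo):

import numpy as np

def q_update(Q, state, action, reward, next_state, lr=0.5, gamma=0.98):
    # One tabular Q-learning step:
    # Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + gamma * max_a' Q(s', a'))
    target = reward + gamma * np.max(Q[next_state, :])
    Q[state, action] = (1 - lr) * Q[state, action] + lr * target
    return Q

# Example: with Q = np.zeros([16, 4]), q_update(Q, 0, 2, 1.0, 1) sets Q[0, 2] to 0.5.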
/06_q_net_frozenlake.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on 3 | https://github.com/hunkim/DeepRL-Agents 4 | ''' 5 | import gym 6 | import numpy as np 7 | import tensorflow as tf 8 | import matplotlib.pyplot as plt 9 | 10 | env = gym.make('FrozenLake-v0') 11 | 12 | # Input and output size based on the Env 13 | input_size = env.observation_space.n 14 | output_size = env.action_space.n 15 | learning_rate = 0.1 16 | 17 | # These lines establish the feed-forward part of the network used to 18 | # choose actions 19 | X = tf.placeholder(shape=[1, input_size], dtype=tf.float32) # state input 20 | W = tf.Variable(tf.random_uniform( 21 | [input_size, output_size], 0, 0.01)) # weight 22 | 23 | Qpred = tf.matmul(X, W) # Out Q prediction 24 | Y = tf.placeholder(shape=[1, output_size], dtype=tf.float32) # Y label 25 | 26 | loss = tf.reduce_sum(tf.square(Y - Qpred)) 27 | train = tf.train.GradientDescentOptimizer( 28 | learning_rate=learning_rate).minimize(loss) 29 | 30 | # Set Q-learning related parameters 31 | dis = .99 32 | num_episodes = 2000 33 | 34 | # Create lists to contain total rewards and steps per episode 35 | rList = [] 36 | 37 | 38 | def one_hot(x): 39 | return np.identity(16)[x:x + 1] 40 | 41 | init = tf.global_variables_initializer() 42 | with tf.Session() as sess: 43 | sess.run(init) 44 | for i in range(num_episodes): 45 | # Reset environment and get first new observation 46 | s = env.reset() 47 | e = 1. / ((i / 50) + 10) 48 | rAll = 0 49 | done = False 50 | local_loss = [] 51 | 52 | # The Q-Network training 53 | while not done: 54 | # Choose an action by greedily (with e chance of random action) 55 | # from the Q-network 56 | Qs = sess.run(Qpred, feed_dict={X: one_hot(s)}) 57 | if np.random.rand(1) < e: 58 | a = env.action_space.sample() 59 | else: 60 | a = np.argmax(Qs) 61 | 62 | # Get new state and reward from environment 63 | s1, reward, done, _ = env.step(a) 64 | if done: 65 | # Update Q, and no Qs+1, since it's a terminal state 66 | Qs[0, a] = reward 67 | else: 68 | # Obtain the Q_s1 values by feeding the new state through our 69 | # network 70 | Qs1 = sess.run(Qpred, feed_dict={X: one_hot(s1)}) 71 | # Update Q 72 | Qs[0, a] = reward + dis * np.max(Qs1) 73 | 74 | # Train our network using target (Y) and predicted Q (Qpred) values 75 | sess.run(train, feed_dict={X: one_hot(s), Y: Qs}) 76 | 77 | rAll += reward 78 | s = s1 79 | rList.append(rAll) 80 | 81 | print("Percent of successful episodes: " + 82 | str(sum(rList) / num_episodes) + "%") 83 | plt.bar(range(len(rList)), rList, color="blue") 84 | plt.show() 85 | -------------------------------------------------------------------------------- /07_0_random_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | env = gym.make('CartPole-v0') 4 | env.reset() 5 | random_episodes = 0 6 | reward_sum = 0 7 | while random_episodes < 10: 8 | env.render() 9 | action = env.action_space.sample() 10 | observation, reward, done, _ = env.step(action) 11 | print(observation, reward, done) 12 | reward_sum += reward 13 | if done: 14 | random_episodes += 1 15 | print("Reward for this episode was:", reward_sum) 16 | reward_sum = 0 17 | env.reset() 18 | -------------------------------------------------------------------------------- /07_1_q_net_cartpole.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on 3 | https://github.com/hunkim/DeepRL-Agents 4 | 
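Q-Network for CartPole-v0: a single linear layer maps the 4-dimensional observation to Q-values for the 2 actions and is trained by regressing onto bootstrapped Q targets with a squared-error loss.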
''' 5 | import numpy as np 6 | import tensorflow as tf 7 | from collections import deque 8 | 9 | import gym 10 | env = gym.make('CartPole-v0') 11 | 12 | # Constants defining our neural network 13 | learning_rate = 1e-1 14 | input_size = env.observation_space.shape[0] 15 | output_size = env.action_space.n 16 | 17 | X = tf.placeholder(tf.float32, [None, input_size], name="input_x") 18 | 19 | # First layer of weights 20 | W1 = tf.get_variable("W1", shape=[input_size, output_size], 21 | initializer=tf.contrib.layers.xavier_initializer()) 22 | Qpred = tf.matmul(X, W1) 23 | 24 | # We need to define the parts of the network needed for learning a policy 25 | Y = tf.placeholder(shape=[None, output_size], dtype=tf.float32) 26 | 27 | # Loss function 28 | loss = tf.reduce_sum(tf.square(Y - Qpred)) 29 | # Learning 30 | train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) 31 | 32 | # Values for q learning 33 | max_episodes = 5000 34 | dis = 0.9 35 | step_history = [] 36 | 37 | 38 | # Setting up our environment 39 | init = tf.global_variables_initializer() 40 | sess = tf.Session() 41 | sess.run(init) 42 | 43 | for episode in range(max_episodes): 44 | e = 1. / ((episode / 10) + 1) 45 | step_count = 0 46 | state = env.reset() 47 | done = False 48 | 49 | # The Q-Network training 50 | while not done: 51 | step_count += 1 52 | x = np.reshape(state, [1, input_size]) 53 | # Choose an action by greedily (with e chance of random action) from 54 | # the Q-network 55 | Q = sess.run(Qpred, feed_dict={X: x}) 56 | if np.random.rand(1) < e: 57 | action = env.action_space.sample() 58 | else: 59 | action = np.argmax(Q) 60 | 61 | # Get new state and reward from environment 62 | next_state, reward, done, _ = env.step(action) 63 | if done: 64 | Q[0, action] = -100 65 | else: 66 | x_next = np.reshape(next_state, [1, input_size]) 67 | # Obtain the Q' values by feeding the new state through our network 68 | Q_next = sess.run(Qpred, feed_dict={X: x_next}) 69 | Q[0, action] = reward + dis * np.max(Q_next) 70 | 71 | # Train our network using target and predicted Q values on each episode 72 | sess.run(train, feed_dict={X: x, Y: Q}) 73 | state = next_state 74 | 75 | step_history.append(step_count) 76 | print("Episode: {} steps: {}".format(episode, step_count)) 77 | # If last 10's avg steps are 500, it's good enough 78 | if len(step_history) > 10 and np.mean(step_history[-10:]) > 500: 79 | break 80 | 81 | # See our trained network in action 82 | observation = env.reset() 83 | reward_sum = 0 84 | while True: 85 | env.render() 86 | 87 | x = np.reshape(observation, [1, input_size]) 88 | Q = sess.run(Qpred, feed_dict={X: x}) 89 | action = np.argmax(Q) 90 | 91 | observation, reward, done, _ = env.step(action) 92 | reward_sum += reward 93 | if done: 94 | print("Total score: {}".format(reward_sum)) 95 | break 96 | -------------------------------------------------------------------------------- /07_2_dqn_2013_cartpole.py: -------------------------------------------------------------------------------- 1 | """ 2 | DQN (NIPS 2013) 3 | 4 | Playing Atari with Deep Reinforcement Learning 5 | https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 6 | """ 7 | import numpy as np 8 | import tensorflow as tf 9 | import random 10 | import dqn 11 | import gym 12 | from collections import deque 13 | 14 | env = gym.make('CartPole-v0') 15 | env = gym.wrappers.Monitor(env, 'gym-results/', force=True) 16 | INPUT_SIZE = env.observation_space.shape[0] 17 | OUTPUT_SIZE = env.action_space.n 18 | 19 | DISCOUNT_RATE = 0.99 20 | REPLAY_MEMORY = 50000 
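# Replay buffer capacity: a deque that evicts the oldest (s, a, r, s', done) tuples once 50,000 are stored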
21 | MAX_EPISODE = 5000 22 | BATCH_SIZE = 64 23 | 24 | # minimum epsilon for epsilon greedy 25 | MIN_E = 0.0 26 | # epsilon will be `MIN_E` at `EPSILON_DECAYING_EPISODE` 27 | EPSILON_DECAYING_EPISODE = MAX_EPISODE * 0.01 28 | 29 | 30 | def bot_play(mainDQN: dqn.DQN) -> None: 31 | """Runs a single episode with rendering and prints a reward 32 | 33 | Args: 34 | mainDQN (dqn.DQN): DQN Agent 35 | """ 36 | state = env.reset() 37 | total_reward = 0 38 | 39 | while True: 40 | env.render() 41 | action = np.argmax(mainDQN.predict(state)) 42 | state, reward, done, _ = env.step(action) 43 | total_reward += reward 44 | if done: 45 | print("Total score: {}".format(total_reward)) 46 | break 47 | 48 | 49 | def train_minibatch(DQN: dqn.DQN, train_batch: list) -> float: 50 | """Prepare X_batch, y_batch and train them 51 | 52 | Recall our loss function is 53 | target = reward + discount * max Q(s',a) 54 | or reward if done early 55 | 56 | Loss function: [target - Q(s, a)]^2 57 | 58 | Hence, 59 | 60 | X_batch is a state list 61 | y_batch is reward + discount * max Q 62 | or reward if terminated early 63 | 64 | Args: 65 | DQN (dqn.DQN): DQN Agent to train & run 66 | train_batch (list): Minibatch of Replay memory 67 | Eeach element is a tuple of (s, a, r, s', done) 68 | 69 | Returns: 70 | loss: Returns a loss 71 | 72 | """ 73 | state_array = np.vstack([x[0] for x in train_batch]) 74 | action_array = np.array([x[1] for x in train_batch]) 75 | reward_array = np.array([x[2] for x in train_batch]) 76 | next_state_array = np.vstack([x[3] for x in train_batch]) 77 | done_array = np.array([x[4] for x in train_batch]) 78 | 79 | X_batch = state_array 80 | y_batch = DQN.predict(state_array) 81 | 82 | Q_target = reward_array + DISCOUNT_RATE * np.max(DQN.predict(next_state_array), axis=1) * ~done_array 83 | y_batch[np.arange(len(X_batch)), action_array] = Q_target 84 | 85 | # Train our network using target and predicted Q values on each episode 86 | loss, _ = DQN.update(X_batch, y_batch) 87 | 88 | return loss 89 | 90 | 91 | def annealing_epsilon(episode: int, min_e: float, max_e: float, target_episode: int) -> float: 92 | """Return an linearly annealed epsilon 93 | 94 | Epsilon will decrease over time until it reaches `target_episode` 95 | 96 | (epsilon) 97 | | 98 | max_e ---|\ 99 | | \ 100 | | \ 101 | | \ 102 | min_e ---|____\_______________(episode) 103 | | 104 | target_episode 105 | 106 | slope = (min_e - max_e) / (target_episode) 107 | intercept = max_e 108 | 109 | e = slope * episode + intercept 110 | 111 | Args: 112 | episode (int): Current episode 113 | min_e (float): Minimum epsilon 114 | max_e (float): Maximum epsilon 115 | target_episode (int): epsilon becomes the `min_e` at `target_episode` 116 | 117 | Returns: 118 | float: epsilon between `min_e` and `max_e` 119 | """ 120 | 121 | slope = (min_e - max_e) / (target_episode) 122 | intercept = max_e 123 | 124 | return max(min_e, slope * episode + intercept) 125 | 126 | 127 | def main(): 128 | # store the previous observations in replay memory 129 | replay_buffer = deque(maxlen=REPLAY_MEMORY) 130 | last_100_game_reward = deque(maxlen=100) 131 | 132 | with tf.Session() as sess: 133 | mainDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE) 134 | init = tf.global_variables_initializer() 135 | sess.run(init) 136 | 137 | for episode in range(MAX_EPISODE): 138 | e = annealing_epsilon(episode, MIN_E, 1.0, EPSILON_DECAYING_EPISODE) 139 | done = False 140 | state = env.reset() 141 | 142 | step_count = 0 143 | while not done: 144 | 145 | if np.random.rand() < e: 146 | action = 
env.action_space.sample() 147 | else: 148 | action = np.argmax(mainDQN.predict(state)) 149 | 150 | next_state, reward, done, _ = env.step(action) 151 | 152 | if done: 153 | reward = -1 154 | 155 | replay_buffer.append((state, action, reward, next_state, done)) 156 | 157 | state = next_state 158 | step_count += 1 159 | 160 | if len(replay_buffer) > BATCH_SIZE: 161 | minibatch = random.sample(replay_buffer, BATCH_SIZE) 162 | train_minibatch(mainDQN, minibatch) 163 | 164 | print("[Episode {:>5}] steps: {:>5} e: {:>5.2f}".format(episode, step_count, e)) 165 | 166 | # CartPole-v0 Game Clear Logic 167 | last_100_game_reward.append(step_count) 168 | if len(last_100_game_reward) == last_100_game_reward.maxlen: 169 | avg_reward = np.mean(last_100_game_reward) 170 | if avg_reward > 199.0: 171 | print("Game Cleared within {} episodes with avg reward {}".format(episode, avg_reward)) 172 | break 173 | 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /07_3_dqn_2015_cartpole.py: -------------------------------------------------------------------------------- 1 | """ 2 | Double DQN (Nature 2015) 3 | http://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf 4 | 5 | Notes: 6 | The difference is that now there are two DQNs (DQN & Target DQN) 7 | 8 | y_i = r_i + 𝛾 * max(Q(next_state, action; 𝜃_target)) 9 | 10 | Loss: (y_i - Q(state, action; 𝜃))^2 11 | 12 | Every C step, 𝜃_target <- 𝜃 13 | 14 | """ 15 | import numpy as np 16 | import tensorflow as tf 17 | import random 18 | from collections import deque 19 | import dqn 20 | 21 | import gym 22 | from typing import List 23 | 24 | env = gym.make('CartPole-v0') 25 | env = gym.wrappers.Monitor(env, directory="gym-results/", force=True) 26 | 27 | # Constants defining our neural network 28 | INPUT_SIZE = env.observation_space.shape[0] 29 | OUTPUT_SIZE = env.action_space.n 30 | 31 | DISCOUNT_RATE = 0.99 32 | REPLAY_MEMORY = 50000 33 | BATCH_SIZE = 64 34 | TARGET_UPDATE_FREQUENCY = 5 35 | MAX_EPISODES = 5000 36 | 37 | 38 | def replay_train(mainDQN: dqn.DQN, targetDQN: dqn.DQN, train_batch: list) -> float: 39 | """Trains `mainDQN` with target Q values given by `targetDQN` 40 | 41 | Args: 42 | mainDQN (dqn.DQN): Main DQN that will be trained 43 | targetDQN (dqn.DQN): Target DQN that will predict Q_target 44 | train_batch (list): Minibatch of replay memory 45 | Each element is (s, a, r, s', done) 46 | [(state, action, reward, next_state, done), ...] 
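            `done` is a bool; for terminal transitions the target reduces to the immediate reward (no bootstrap term).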
47 | 48 | Returns: 49 | float: After updating `mainDQN`, it returns a `loss` 50 | """ 51 | states = np.vstack([x[0] for x in train_batch]) 52 | actions = np.array([x[1] for x in train_batch]) 53 | rewards = np.array([x[2] for x in train_batch]) 54 | next_states = np.vstack([x[3] for x in train_batch]) 55 | done = np.array([x[4] for x in train_batch]) 56 | 57 | X = states 58 | 59 | Q_target = rewards + DISCOUNT_RATE * np.max(targetDQN.predict(next_states), axis=1) * ~done 60 | 61 | y = mainDQN.predict(states) 62 | y[np.arange(len(X)), actions] = Q_target 63 | 64 | # Train our network using target and predicted Q values on each episode 65 | return mainDQN.update(X, y) 66 | 67 | 68 | def get_copy_var_ops(*, dest_scope_name: str, src_scope_name: str) -> List[tf.Operation]: 69 | """Creates TF operations that copy weights from `src_scope` to `dest_scope` 70 | 71 | Args: 72 | dest_scope_name (str): Destination weights (copy to) 73 | src_scope_name (str): Source weight (copy from) 74 | 75 | Returns: 76 | List[tf.Operation]: Update operations are created and returned 77 | """ 78 | # Copy variables src_scope to dest_scope 79 | op_holder = [] 80 | 81 | src_vars = tf.get_collection( 82 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name) 83 | dest_vars = tf.get_collection( 84 | tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name) 85 | 86 | for src_var, dest_var in zip(src_vars, dest_vars): 87 | op_holder.append(dest_var.assign(src_var.value())) 88 | 89 | return op_holder 90 | 91 | 92 | def bot_play(mainDQN: dqn.DQN, env: gym.Env) -> None: 93 | """Test runs with rendering and prints the total score 94 | 95 | Args: 96 | mainDQN (dqn.DQN): DQN agent to run a test 97 | env (gym.Env): Gym Environment 98 | """ 99 | state = env.reset() 100 | reward_sum = 0 101 | 102 | while True: 103 | 104 | env.render() 105 | action = np.argmax(mainDQN.predict(state)) 106 | state, reward, done, _ = env.step(action) 107 | reward_sum += reward 108 | 109 | if done: 110 | print("Total score: {}".format(reward_sum)) 111 | break 112 | 113 | 114 | def main(): 115 | # store the previous observations in replay memory 116 | replay_buffer = deque(maxlen=REPLAY_MEMORY) 117 | 118 | last_100_game_reward = deque(maxlen=100) 119 | 120 | with tf.Session() as sess: 121 | mainDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="main") 122 | targetDQN = dqn.DQN(sess, INPUT_SIZE, OUTPUT_SIZE, name="target") 123 | sess.run(tf.global_variables_initializer()) 124 | 125 | # initial copy q_net -> target_net 126 | copy_ops = get_copy_var_ops(dest_scope_name="target", 127 | src_scope_name="main") 128 | sess.run(copy_ops) 129 | 130 | for episode in range(MAX_EPISODES): 131 | e = 1. 
/ ((episode / 10) + 1) 132 | done = False 133 | step_count = 0 134 | state = env.reset() 135 | 136 | while not done: 137 | if np.random.rand() < e: 138 | action = env.action_space.sample() 139 | else: 140 | # Choose an action by greedily from the Q-network 141 | action = np.argmax(mainDQN.predict(state)) 142 | 143 | # Get new state and reward from environment 144 | next_state, reward, done, _ = env.step(action) 145 | 146 | if done: # Penalty 147 | reward = -1 148 | 149 | # Save the experience to our buffer 150 | replay_buffer.append((state, action, reward, next_state, done)) 151 | 152 | if len(replay_buffer) > BATCH_SIZE: 153 | minibatch = random.sample(replay_buffer, BATCH_SIZE) 154 | loss, _ = replay_train(mainDQN, targetDQN, minibatch) 155 | 156 | if step_count % TARGET_UPDATE_FREQUENCY == 0: 157 | sess.run(copy_ops) 158 | 159 | state = next_state 160 | step_count += 1 161 | 162 | print("Episode: {} steps: {}".format(episode, step_count)) 163 | 164 | # CartPole-v0 Game Clear Checking Logic 165 | last_100_game_reward.append(step_count) 166 | 167 | if len(last_100_game_reward) == last_100_game_reward.maxlen: 168 | avg_reward = np.mean(last_100_game_reward) 169 | 170 | if avg_reward > 199: 171 | print(f"Game Cleared in {episode} episodes with avg reward {avg_reward}") 172 | break 173 | 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /08_1_pg_cartpole.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on: 3 | https://github.com/hunkim/DeepRL-Agents 4 | http://karpathy.github.io/2016/05/31/rl/ 5 | ''' 6 | import numpy as np 7 | import tensorflow as tf 8 | import gym 9 | 10 | env = gym.make('CartPole-v0') 11 | 12 | hidden_layer_neurons = 24 13 | learning_rate = 1e-2 14 | 15 | # Constants defining our neural network 16 | input_size = env.observation_space.shape[0] 17 | output_size = 1 # logistic regression, one p output 18 | 19 | X = tf.placeholder(tf.float32, [None, input_size], name="input_x") 20 | 21 | # First layer of weights 22 | W1 = tf.get_variable("W1", shape=[input_size, hidden_layer_neurons], 23 | initializer=tf.contrib.layers.xavier_initializer()) 24 | layer1 = tf.nn.relu(tf.matmul(X, W1)) 25 | 26 | # Second layer of weights 27 | W2 = tf.get_variable("W2", shape=[hidden_layer_neurons, output_size], 28 | initializer=tf.contrib.layers.xavier_initializer()) 29 | action_pred = tf.nn.sigmoid(tf.matmul(layer1, W2)) 30 | 31 | # Y (fake) and advantages (rewards) 32 | Y = tf.placeholder(tf.float32, [None, output_size], name="input_y") 33 | advantages = tf.placeholder(tf.float32, name="reward_signal") 34 | 35 | # Loss function: log_likelihood * advantages 36 | #log_lik = -tf.log(Y * action_pred + (1 - Y) * (1 - action_pred)) # using author(awjuliani)'s original cost function (maybe log_likelihood) 37 | log_lik = -Y*tf.log(action_pred) - (1 - Y)*tf.log(1 - action_pred) # using logistic regression cost function 38 | loss = tf.reduce_sum(log_lik * advantages) 39 | 40 | # Learning 41 | train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) 42 | 43 | 44 | def discount_rewards(r, gamma=0.99): 45 | """ take 1D float array of rewards and compute discounted reward """ 46 | discounted_r = np.zeros_like(r, dtype=np.float32) 47 | running_add = 0 48 | for t in reversed(range(len(r))): 49 | running_add = running_add * gamma + r[t] 50 | discounted_r[t] = running_add 51 | 52 | return discounted_r 53 | 54 | # Testing Code 55 | # It's 
always recommended to test your code 56 | input = [1, 1, 1] 57 | output = discount_rewards(input) 58 | expect = [1 + 0.99 + 0.99**2, 1 + 0.99, 1] 59 | np.testing.assert_almost_equal(output, expect) 60 | 61 | 62 | 63 | # Setting up our environment 64 | sess = tf.Session() 65 | sess.run(tf.global_variables_initializer()) 66 | 67 | max_num_episodes = 500 68 | 69 | # This list will contain episode rewards from the most recent 100 games 70 | # Clear Condition: Average reward per episode >= 195.0 over 100 games 71 | EPISODE_100_REWARD_LIST = [] 72 | for step in range(max_num_episodes): 73 | # Initialize x stack, y stack, and rewards 74 | xs = np.empty(shape=[0, input_size]) 75 | ys = np.empty(shape=[0, 1]) 76 | rewards = np.empty(shape=[0, 1]) 77 | 78 | reward_sum = 0 79 | observation = env.reset() 80 | 81 | while True: 82 | x = np.reshape(observation, [1, input_size]) 83 | 84 | # Run the neural net to determine output 85 | action_prob = sess.run(action_pred, feed_dict={X: x}) 86 | 87 | # Determine the output based on our net, allowing for some randomness 88 | action = 0 if action_prob < np.random.uniform() else 1 89 | 90 | # Append the observations and outputs for learning 91 | xs = np.vstack([xs, x]) 92 | ys = np.vstack([ys, action]) # Fake action 93 | 94 | # Determine the outcome of our action 95 | observation, reward, done, _ = env.step(action) 96 | rewards = np.vstack([rewards, reward]) 97 | reward_sum += reward 98 | 99 | if done: 100 | # Determine standardized rewards 101 | discounted_rewards = discount_rewards(rewards) 102 | # Normalization 103 | discounted_rewards = (discounted_rewards - discounted_rewards.mean())/(discounted_rewards.std() + 1e-7) 104 | l, _ = sess.run([loss, train], 105 | feed_dict={X: xs, Y: ys, advantages: discounted_rewards}) 106 | 107 | EPISODE_100_REWARD_LIST.append(reward_sum) 108 | if len(EPISODE_100_REWARD_LIST) > 100: 109 | EPISODE_100_REWARD_LIST = EPISODE_100_REWARD_LIST[1:] 110 | break 111 | 112 | # Print status 113 | print(f"[Episode {step:>5d}] Reward: {reward_sum:>4} Loss: {l:>10.5f}") 114 | 115 | if np.mean(EPISODE_100_REWARD_LIST) >= 195: 116 | print(f"Game Cleared within {step} steps with the average reward: {np.mean(EPISODE_100_REWARD_LIST)}") 117 | break 118 | 119 | # See our trained bot in action 120 | observation = env.reset() 121 | reward_sum = 0 122 | 123 | while True: 124 | env.render() 125 | x = np.reshape(observation, [1, input_size]) 126 | action_prob = sess.run(action_pred, feed_dict={X: x}) 127 | action = 0 if action_prob < 0.5 else 1 # No randomness 128 | observation, reward, done, _ = env.step(action) 129 | reward_sum += reward 130 | if done: 131 | print("Total score: {}".format(reward_sum)) 132 | break 133 | 134 | sess.close() -------------------------------------------------------------------------------- /08_2_softmax_pg_cartpole.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on: 3 | https://github.com/hunkim/DeepRL-Agents 4 | http://karpathy.github.io/2016/05/31/rl/ 5 | ''' 6 | import numpy as np 7 | import tensorflow as tf 8 | import gym 9 | 10 | env = gym.make('CartPole-v0') 11 | 12 | hidden_layer_neurons = 24 13 | learning_rate = 1e-2 14 | gamma = .99 15 | 16 | # Constants defining our neural network 17 | input_size = env.observation_space.shape[0] 18 | output_size = env.action_space.n 19 | 20 | X = tf.placeholder(tf.float32, [None, input_size], name="input_x") 21 | 22 | # First layer of weights 23 | W1 = tf.get_variable("W1", shape=[input_size, 
hidden_layer_neurons], 24 | initializer=tf.contrib.layers.xavier_initializer()) 25 | layer1 = tf.nn.relu(tf.matmul(X, W1)) 26 | 27 | # Second layer of weights 28 | W2 = tf.get_variable("W2", shape=[hidden_layer_neurons, output_size], 29 | initializer=tf.contrib.layers.xavier_initializer()) 30 | action_pred = tf.nn.softmax(tf.matmul(layer1, W2)) 31 | 32 | # We need to define the parts of the network needed for learning a policy 33 | Y = tf.placeholder(tf.float32, [None, output_size], name="input_y") 34 | advantages = tf.placeholder(tf.float32, name="reward_signal") 35 | 36 | print(Y, action_pred) 37 | # Loss function, ∑ Ai*logp(yi∣xi), but we need fake lable Y due to autodiff 38 | log_lik = -Y * tf.log(action_pred) 39 | log_lik_adv = log_lik * advantages 40 | loss = tf.reduce_mean(tf.reduce_sum(log_lik_adv, axis=1)) 41 | 42 | # Learning 43 | train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) 44 | 45 | 46 | def discount_rewards(r, gamma=0.99): 47 | """ take 1D float array of rewards and compute discounted reward """ 48 | discounted_r = np.zeros_like(r, dtype=np.float32) 49 | running_add = 0 50 | for t in reversed(range(len(r))): 51 | running_add = running_add * gamma + r[t] 52 | discounted_r[t] = running_add 53 | 54 | return discounted_r 55 | 56 | 57 | # Setting up our environment 58 | sess = tf.Session() 59 | sess.run(tf.global_variables_initializer()) 60 | 61 | num_episodes = 1000 62 | # This list will contain episode rewards from the most recent 100 games 63 | # Clear Condition: Average reward per episode >= 195.0 over 100 games 64 | EPISODE_100_REWARD_LIST = [] 65 | for i in range(num_episodes): 66 | 67 | # Clear out game variables 68 | xs = np.empty(shape=[0, input_size]) 69 | ys = np.empty(shape=[0, output_size]) 70 | rewards = np.empty(shape=[0, 1]) 71 | 72 | reward_sum = 0 73 | state = env.reset() 74 | 75 | while True: 76 | # Append the observations to our batch 77 | x = np.reshape(state, [1, input_size]) 78 | 79 | # Run the neural net to determine output 80 | action_prob = sess.run(action_pred, feed_dict={X: x}) 81 | action = np.random.choice(np.arange(output_size), p=action_prob[0]) 82 | 83 | # Append the observations and outputs for learning 84 | xs = np.vstack([xs, x]) 85 | y = np.zeros(output_size) 86 | y[action] = 1 87 | 88 | ys = np.vstack([ys, y]) 89 | 90 | # Determine the outcome of our action 91 | state, reward, done, _ = env.step(action) 92 | reward_sum += reward 93 | rewards = np.vstack([rewards, reward]) 94 | 95 | if done: 96 | # Determine standardized rewards 97 | discounted_rewards = discount_rewards(rewards, gamma) 98 | # Normalization 99 | discounted_rewards = (discounted_rewards - discounted_rewards.mean())/(discounted_rewards.std() + 1e-7) 100 | ll, la, l, _ = sess.run([log_lik, log_lik_adv, loss, train], feed_dict={X: xs, 101 | Y: ys, 102 | advantages: discounted_rewards}) 103 | # print values for debugging 104 | # print(1, ll, la) 105 | EPISODE_100_REWARD_LIST.append(reward_sum) 106 | if len(EPISODE_100_REWARD_LIST) > 100: 107 | EPISODE_100_REWARD_LIST = EPISODE_100_REWARD_LIST[1:] 108 | break 109 | 110 | 111 | # Print status 112 | print(f"[Episode {i:>}] Reward: {reward_sum:>4} Loss: {l:>5.5}") 113 | 114 | if np.mean(EPISODE_100_REWARD_LIST) >= 195.0: 115 | print(f"Game Cleared within {i} steps with the average reward: {np.mean(EPISODE_100_REWARD_LIST)}") 116 | break 117 | 118 | 119 | 120 | state = env.reset() 121 | reward_sum = 0 122 | 123 | while True: 124 | env.render() 125 | 126 | x = np.reshape(state, [1, input_size]) 127 | 
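    # Greedy evaluation: take the argmax action rather than sampling from the policy as during training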
action_prob = sess.run(action_pred, feed_dict={X: x}) 128 | action = np.argmax(action_prob) 129 | state, reward, done, _ = env.step(action) 130 | reward_sum += reward 131 | if done: 132 | print("Total score: {}".format(reward_sum)) 133 | break 134 | 135 | sess.close() -------------------------------------------------------------------------------- /08_3_softmax_pg_pacman.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on: 3 | https://github.com/hunkim/DeepRL-Agents 4 | http://karpathy.github.io/2016/05/31/rl/ 5 | ''' 6 | import numpy as np 7 | import tensorflow as tf 8 | import os 9 | # by Jin Kim (golbin) https://github.com/golbin/TensorFlow-Tutorials 10 | import mini_pacman 11 | 12 | env = mini_pacman.Gym() 13 | 14 | hidden_layer_neurons = 64 15 | learning_rate = 1e-3 16 | gamma = .99 17 | 18 | LOG_DIR = './pacman_log' 19 | 20 | # Constants defining our neural network 21 | input_size = 240 22 | output_size = 3 23 | 24 | X = tf.placeholder(tf.float32, [None, input_size], name="input_x") 25 | x_image = tf.reshape(X, [-1, 6, 10, 1]) 26 | tf.summary.image('input', x_image, 8) 27 | 28 | # First layer of weights 29 | with tf.name_scope("layer1"): 30 | W1 = tf.get_variable("W1", shape=[input_size, hidden_layer_neurons], 31 | initializer=tf.contrib.layers.xavier_initializer()) 32 | B1 = tf.Variable(tf.zeros([hidden_layer_neurons])) 33 | layer1 = tf.matmul(X, W1) + B1 34 | layer1_act = tf.nn.tanh(layer1) 35 | tf.summary.histogram("X", X) 36 | tf.summary.histogram("weights", W1) 37 | tf.summary.histogram("bias", B1) 38 | tf.summary.histogram("layer", layer1) 39 | tf.summary.histogram("activations", layer1_act) 40 | 41 | 42 | # Second layer of weights 43 | with tf.name_scope("layer2"): 44 | W2 = tf.get_variable("W2", shape=[hidden_layer_neurons, output_size], 45 | initializer=tf.contrib.layers.xavier_initializer()) 46 | B2 = tf.Variable(tf.zeros([output_size])) 47 | layer2 = tf.matmul(layer1_act, W2) + B2 48 | action_pred = tf.nn.softmax(layer2) 49 | tf.summary.histogram("weights", W2) 50 | tf.summary.histogram("bias", B2) 51 | tf.summary.histogram("layer", layer2) 52 | tf.summary.histogram("action_pred", action_pred) 53 | 54 | # We need to define the parts of the network needed for learning a policy 55 | Y = tf.placeholder(tf.float32, [None, output_size], name="input_y") 56 | advantages = tf.placeholder(tf.float32, name="reward_signal") 57 | 58 | # Loss function 59 | # Sum (Ai*logp(yi|xi)) 60 | log_lik = -Y * tf.log(action_pred) 61 | log_like_adv = log_lik * advantages 62 | loss = tf.reduce_mean(tf.reduce_sum(log_like_adv)) 63 | tf.summary.scalar("Q", tf.reduce_mean(action_pred)) 64 | tf.summary.scalar("Y", tf.reduce_mean(Y)) 65 | tf.summary.scalar("log_likelihood", tf.reduce_mean(log_lik)) 66 | tf.summary.scalar("loss", loss) 67 | 68 | # Learning 69 | train = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss) 70 | 71 | # Some place holders for summary 72 | summary_reward = tf.placeholder(tf.float32, shape=(), name="reward") 73 | tf.summary.scalar("reward", summary_reward) 74 | 75 | # Summary 76 | summary = tf.summary.merge_all() 77 | 78 | 79 | def discount_rewards(r, gamma=0.99): 80 | """ take 1D float array of rewards and compute discounted reward """ 81 | discounted_r = np.zeros_like(r) 82 | running_add = 0 83 | for t in reversed(range(0, r.size)): 84 | if r[t] != 0: 85 | # reset the sum, since this was a game boundary (pong specific!) 
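                # (as written, any nonzero reward resets the running discounted sum before it is carried backwards)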
86 | running_add = 0 87 | running_add = running_add * gamma + r[t] 88 | discounted_r[t] = running_add 89 | 90 | # compute the discounted reward backwards through time 91 | # standardize the rewards to be unit normal (helps control the gradient 92 | # estimator variance) 93 | discounted_r -= np.mean(discounted_r) 94 | discounted_r /= np.std(discounted_r) 95 | 96 | return discounted_r 97 | 98 | # Setting up our environment 99 | sess = tf.Session() 100 | rendering = False 101 | sess.run(tf.global_variables_initializer()) 102 | 103 | # TensorBoard 104 | writer = tf.summary.FileWriter(LOG_DIR) 105 | writer.add_graph(sess.graph) 106 | 107 | # Savor and Restore 108 | saver = tf.train.Saver() 109 | checkpoint = tf.train.get_checkpoint_state(LOG_DIR) 110 | if checkpoint and checkpoint.model_checkpoint_path: 111 | try: 112 | saver.restore(sess, checkpoint.model_checkpoint_path) 113 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 114 | except: 115 | print("Error on loading old network weights") 116 | else: 117 | print("Could not find old network weights") 118 | 119 | global_step = 0 120 | while True: 121 | global_step += 1 122 | 123 | # Clear out game variables 124 | xs = np.empty(0).reshape(0, input_size) 125 | ys = np.empty(0).reshape(0, output_size) 126 | rewards = np.empty(0).reshape(0, 1) 127 | 128 | reward_sum = 0 129 | state = env.reset() 130 | 131 | # Initial 4 frame data 132 | s_t = np.array([state, state, state, state]) 133 | 134 | while True: 135 | # Append the observations to our batch 136 | x = np.reshape(s_t, [1, input_size]) 137 | 138 | # Run the neural net to determine output 139 | action_prob = sess.run(action_pred, feed_dict={X: x}) 140 | action_prob = np.squeeze(action_prob) # shape (?, 2) -> 2 141 | random_noise = np.random.uniform(0, 1, output_size) 142 | action = np.argmax(action_prob + random_noise) 143 | print("Action prediction: ", np.argmax(action_prob), " action taken:", action, 144 | np.argmax(action_prob) == action) 145 | 146 | # Append the observations and outputs for learning 147 | xs = np.vstack([xs, x]) 148 | y = np.eye(output_size)[action:action + 1] # One hot encoding 149 | ys = np.vstack([ys, y]) 150 | 151 | # Determine the outcome of our action 152 | state, reward, done, _ = env.step(action) 153 | 154 | s_t = np.array([state, s_t[0], s_t[1], s_t[2]]) 155 | reward_sum += reward 156 | rewards = np.vstack([rewards, reward]) 157 | 158 | if done: 159 | # Determine standardized rewards 160 | rewards = discount_rewards(rewards) 161 | reward_mean = np.mean(rewards) 162 | ll, la, l, s, _ = sess.run([log_lik, log_like_adv, loss, summary, train], 163 | feed_dict={X: xs, 164 | Y: ys, 165 | advantages: rewards, 166 | summary_reward: reward_mean}) 167 | #print(ll, la) 168 | writer.add_summary(s, global_step) 169 | break 170 | 171 | # Print status 172 | print("Average reward for episode {}: {}. 
Loss: {}".format( 173 | global_step, reward_sum, l)) 174 | 175 | if global_step % 100 == 0: 176 | print("Saving network...") 177 | if not os.path.exists(LOG_DIR): 178 | os.makedirs(LOG_DIR) 179 | saver.save(sess, os.path.join(LOG_DIR, "model.ckpt"), 180 | global_step=global_step) 181 | -------------------------------------------------------------------------------- /08_4_softmax_pg_pong.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is based on: 3 | https://github.com/hunkim/DeepRL-Agents 4 | http://karpathy.github.io/2016/05/31/rl/ 5 | ''' 6 | import numpy as np 7 | import tensorflow as tf 8 | import gym 9 | import os 10 | 11 | env = gym.make("Pong-v0") 12 | 13 | gamma = .99 14 | 15 | SUMMARY_DIR = './tensorboard/pong' 16 | CHECK_POINT_DIR = SUMMARY_DIR 17 | 18 | # Constants defining our neural network 19 | input_size = 80 * 80 * 4 20 | action_space = env.action_space.n 21 | print("Pong Action space", action_space) 22 | 23 | with tf.name_scope("cnn"): 24 | X = tf.placeholder(tf.float32, [None, input_size], name="input_x") 25 | x_image = tf.reshape(X, [-1, 80, 80, 4]) 26 | tf.summary.image('input', x_image, 8) 27 | 28 | # Build a convolutional layer random initialization 29 | W_conv1 = tf.get_variable("W_conv1", shape = [5, 5, 4, 32], initializer=tf.contrib.layers.xavier_initializer()) 30 | # W is [row, col, channel, feature] 31 | b_conv1 = tf.Variable(tf.zeros([32]), name="b_conv1") 32 | h_conv1 = tf.nn.relu(tf.nn.conv2d(x_image, W_conv1, strides=[1, 2, 2, 1], padding='VALID') + b_conv1, name="h_conv1") 33 | 34 | W_conv2 = tf.get_variable("W_conv2", shape = [5, 5, 32, 64], initializer=tf.contrib.layers.xavier_initializer()) 35 | b_conv2 = tf.Variable(tf.zeros([64]), name="b_conv2") 36 | h_conv2 = tf.nn.relu(tf.nn.conv2d(h_conv1, W_conv2, strides=[1, 2, 2, 1], padding='VALID') + b_conv2, name="h_conv2") 37 | 38 | W_conv3 = tf.get_variable("W_conv3", shape = [5, 5, 64, 64], initializer=tf.contrib.layers.xavier_initializer()) 39 | b_conv3 = tf.Variable(tf.zeros([64]), name="b_conv3") 40 | h_conv3 = tf.nn.relu(tf.nn.conv2d(h_conv2, W_conv3, strides=[1, 2, 2, 1], padding='VALID') + b_conv3, name="h_conv3") 41 | 42 | # Build a fully connected layer with softmax 43 | h_conv3_flat = tf.reshape(h_conv3, [-1, 7*7*64], name="h_pool2_flat") 44 | W_fc1 = tf.get_variable("W_fc1", shape = [7*7*64, action_space], initializer=tf.contrib.layers.xavier_initializer()) 45 | b_fc1 = tf.Variable(tf.zeros([action_space]), name = 'b_fc1') 46 | action_pred = tf.nn.softmax(tf.matmul(h_conv3_flat, W_fc1) + b_fc1, name="h_fc1") 47 | 48 | tf.summary.histogram("action_pred", action_pred) 49 | 50 | # We need to define the parts of the network needed for learning a policy 51 | Y = tf.placeholder(tf.float32, [None, action_space], name="input_y") 52 | advantages = tf.placeholder(tf.float32, [None, 1], name="reward_signal") 53 | 54 | # Loss function 55 | # Sum (Ai*logp(yi|xi)) 56 | log_lik = -Y * (tf.log(tf.clip_by_value(action_pred, 1e-10 , 1.0))) 57 | loss = tf.reduce_mean(tf.reduce_sum(log_lik * advantages, axis=1)) 58 | tf.summary.scalar("A_pred", tf.reduce_mean(action_pred)) 59 | tf.summary.scalar("Y", tf.reduce_mean(Y)) 60 | tf.summary.scalar("log_likelihood", tf.reduce_mean(log_lik)) 61 | tf.summary.scalar("loss", loss) 62 | 63 | # Learning 64 | train = tf.train.AdamOptimizer().minimize(loss) 65 | 66 | # Some place holders for summary 67 | summary_reward = tf.placeholder(tf.float32, shape=(), name="reward") 68 | tf.summary.scalar("reward", summary_reward) 69 
| 70 | # Summary 71 | summary = tf.summary.merge_all() 72 | 73 | 74 | def discount_rewards(r, gamma=0.99): 75 | """ take 1D float array of rewards and compute discounted reward 76 | http://karpathy.github.io/2016/05/31/rl/ """ 77 | discounted_r = np.zeros_like(r) 78 | running_add = 0 79 | for t in reversed(range(0, r.size)): 80 | if r[t] != 0: 81 | # reset the sum, since this was a game boundary (pong specific!) 82 | running_add = 0 83 | running_add = running_add * gamma + r[t] 84 | discounted_r[t] = running_add 85 | 86 | # compute the discounted reward backwards through time 87 | # standardize the rewards to be unit normal (helps control the gradient 88 | # estimator variance) 89 | discounted_r -= np.mean(discounted_r) 90 | discounted_r /= np.std(discounted_r) 91 | return discounted_r 92 | 93 | 94 | def prepro(I): 95 | """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector 96 | http://karpathy.github.io/2016/05/31/rl/ """ 97 | I = I[35:195] # crop 98 | I = I[::2, ::2, 0] # downsample by factor of 2 99 | I[I == 144] = 0 # erase background (background type 1) 100 | I[I == 109] = 0 # erase background (background type 2) 101 | I[I != 0] = 1 # everything else (paddles, ball) just set to 1 102 | return I.astype(np.float).ravel() 103 | 104 | 105 | # Setting up our environment 106 | sess = tf.Session() 107 | sess.run(tf.global_variables_initializer()) 108 | writer = tf.summary.FileWriter(SUMMARY_DIR) 109 | writer.add_graph(sess.graph) 110 | 111 | # Savor and Restore 112 | saver = tf.train.Saver() 113 | checkpoint = tf.train.get_checkpoint_state(CHECK_POINT_DIR) 114 | if checkpoint and checkpoint.model_checkpoint_path: 115 | try: 116 | saver.restore(sess, checkpoint.model_checkpoint_path) 117 | print("Successfully loaded:", checkpoint.model_checkpoint_path) 118 | except: 119 | print("Error on loading old network weights") 120 | else: 121 | print("Could not find old network weights") 122 | 123 | global_step = 0 124 | while True: 125 | global_step += 1 126 | 127 | xs_list = [] 128 | ys_list = [] 129 | rewards = np.empty(0).reshape(0, 1) 130 | ep_rewards_list = [] 131 | 132 | reward_sum = 0 133 | state = env.reset() 134 | state = prepro(state) 135 | 136 | # Initial 4 frame data 137 | s_t = np.array([state, state, state, state]) 138 | 139 | while True: 140 | # Append the observations to our batch 141 | x = np.reshape(s_t, [1, input_size]) 142 | 143 | # Run the neural net to determine output 144 | action_prob = sess.run(action_pred, feed_dict={X: x}) 145 | action_prob = np.squeeze(action_prob) # shape (?, n) -> n 146 | action = np.random.choice(action_space, size=1, p=action_prob)[0] 147 | 148 | #random_noise = np.random.uniform(0, 1, output_size) 149 | #action = np.argmax(action_prob + random_noise) 150 | # print("Action prediction: ", np.argmax(action_prob), " action taken:", action, 151 | # np.argmax(action_prob) == action) 152 | 153 | # Append the observations and outputs for learning 154 | xs_list.append(x) 155 | y = np.eye(action_space)[action:action + 1] # One hot encoding 156 | ys_list.append(y) 157 | 158 | state, reward, done, _ = env.step(action) 159 | # env.render() 160 | state = prepro(state) 161 | s_t = np.array([state, s_t[0], s_t[1], s_t[2]]) # s_t[4] out! 
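        # s_t stacks the 4 most recent preprocessed frames, newest first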
162 | reward_sum += reward 163 | 164 | ep_rewards_list.append(reward) 165 | 166 | # Discount rewards on every single game 167 | if reward == 1 or reward == -1: 168 | ep_rewards = np.vstack(ep_rewards_list) 169 | discounted_rewards = discount_rewards(ep_rewards, gamma) 170 | rewards = np.vstack([rewards, discounted_rewards]) 171 | ep_rewards_list = [] 172 | # print(ep_rewards, discounted_rewards) 173 | print("Ep reward {}".format(reward)) 174 | if done: 175 | xs = np.vstack(xs_list) 176 | ys = np.vstack(ys_list) 177 | 178 | l, s, _ = sess.run([loss, summary, train], 179 | feed_dict={X: xs, 180 | Y: ys, 181 | advantages: rewards, 182 | summary_reward: reward_sum}) 183 | writer.add_summary(s, global_step) 184 | break 185 | 186 | # Print status 187 | print("Average reward for episode {}: {}. Loss: {}".format( 188 | global_step, reward_sum, l)) 189 | 190 | if global_step % 100 == 0: 191 | print("Saving network...") 192 | if not os.path.exists(CHECK_POINT_DIR): 193 | os.makedirs(CHECK_POINT_DIR) 194 | saver.save(sess, CHECK_POINT_DIR + "/pong", global_step=global_step) 195 | -------------------------------------------------------------------------------- /08_4_softmax_pg_pong_y.py: -------------------------------------------------------------------------------- 1 | """ 2 | Yet Another Pong 3 | 4 | Vanilla Policy Gradient implementation 5 | 6 | (1) Pong's state is (210, 160, 3) 7 | (2) After `pipeline(image)`, it becomes (80, 80, 1) 8 | (3) The model uses an input of `state_diff` = `new_state` - `old_state` 9 | (4) It assumes there exists 2 actions. 10 | 11 | Pong's original action space is the following: 12 | 0, 1 : do nothing 13 | 2, 4 : move up 14 | 3, 5 : move down 15 | 16 | In this file, it uses {2: move up, 3: move down} only 17 | 18 | It gets rid of unnecessary complexity. 19 | """ 20 | import gym 21 | import matplotlib.pyplot as plt 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from functools import partial 26 | from scipy.misc import imresize 27 | 28 | import os 29 | 30 | 31 | def plot_image(image): 32 | """Plot an image 33 | 34 | If an image is a grayscale image, 35 | plot in `gray` cmap. 36 | Otherwise, regular RGB plot. 
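    For example, an (80, 80) input is drawn with the gray colormap, while an (H, W, 3) input is drawn as a regular RGB image.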
37 | 38 | Args: 39 | image (2-D or 3-D array): (H, W) or (H, W, C) 40 | """ 41 | image = np.squeeze(image) 42 | shape = image.shape 43 | 44 | if len(shape) == 2: 45 | plt.imshow(image, cmap="gray") 46 | 47 | else: 48 | plt.imshow(image) 49 | 50 | plt.show() 51 | 52 | 53 | def pipeline(image, new_HW, height_range=(35, 193), bg=(144, 72, 17)): 54 | """Returns a preprocessed image 55 | 56 | (1) Crop image (top and bottom) 57 | (2) Remove background & grayscale 58 | (3) Reszie to smaller image 59 | 60 | Args: 61 | image (3-D array): (H, W, C) 62 | new_HW (tuple): New image size (height, width) 63 | height_range (tuple): Height range (H_begin, H_end) else cropped 64 | bg (tuple): Background RGB Color (R, G, B) 65 | 66 | Returns: 67 | image (3-D array): (H, W, 1) 68 | """ 69 | image = crop_image(image, height_range) 70 | image = resize_image(image, new_HW) 71 | image = kill_background_grayscale(image, bg) 72 | 73 | image = np.expand_dims(image, axis=2) 74 | return image 75 | 76 | 77 | def resize_image(image, new_HW): 78 | """Returns a resized image 79 | 80 | Args: 81 | image (3-D array): Numpy array (H, W, C) 82 | new_HW (tuple): Target size (height, width) 83 | 84 | Returns: 85 | image (3-D array): Resized image (height, width, C) 86 | """ 87 | return imresize(image, new_HW, interp="nearest") 88 | 89 | 90 | def crop_image(image, height_range=(35, 195)): 91 | """Crops top and bottom 92 | 93 | Args: 94 | image (3-D array): Numpy image (H, W, C) 95 | height_range (tuple): Height range between (min_height, max_height) 96 | will be kept 97 | 98 | Returns: 99 | image (3-D array): Numpy image (max_H - min_H, W, C) 100 | """ 101 | h_beg, h_end = height_range 102 | return image[h_beg:h_end, ...] 103 | 104 | 105 | def kill_background_grayscale(image, bg): 106 | """Make the background 0 107 | 108 | Args: 109 | image (3-D array): Numpy array (H, W, C) 110 | bg (tuple): RGB code of background (R, G, B) 111 | 112 | Returns: 113 | image (2-D array): Binarized image of shape (H, W) 114 | The background is 0 and everything else is 1 115 | """ 116 | H, W, _ = image.shape 117 | 118 | R = image[..., 0] 119 | G = image[..., 1] 120 | B = image[..., 2] 121 | 122 | cond = (R == bg[0]) & (G == bg[1]) & (B == bg[2]) 123 | 124 | image = np.zeros((H, W)) 125 | image[~cond] = 1 126 | 127 | return image 128 | 129 | 130 | class Agent(object): 131 | 132 | def __init__(self, input_dim, output_dim, logdir="logdir", checkpoint_dir="checkpoints"): 133 | """Agent class 134 | 135 | Args: 136 | input_dim (tuple): The input shape (H, W, C) 137 | output_dim (int): Number of actions 138 | logdir (str): Directory to save `summary` 139 | checkpoint_dir (str): Directory to save `model.ckpt` 140 | 141 | Notes: 142 | 143 | It has two methods. 
144 | 145 | `choose_action(state)` 146 | Will return an action given the state 147 | 148 | `train(state, action, reward)` 149 | Will train on given `states`, `actions`, `rewards` 150 | 151 | Private methods has two underscore prefixes 152 | """ 153 | self.input_dim = list(input_dim) 154 | self.output_dim = output_dim 155 | self.gamma = 0.99 156 | self.entropy_coefficient = 0.01 157 | self.RMSPropdecay = 0.99 158 | self.learning_rate = 0.001 159 | 160 | self.checkpoint_dir = checkpoint_dir 161 | self.__build_network(self.input_dim, self.output_dim) 162 | 163 | if logdir is not None: 164 | self.__build_summary_op(logdir) 165 | else: 166 | self.summary_op = None 167 | 168 | if checkpoint_dir is not None: 169 | self.saver = tf.train.Saver() 170 | 171 | maybe_path = os.path.join(self.checkpoint_dir, "model.ckpt") 172 | if os.path.exists(self.checkpoint_dir) and tf.train.checkpoint_exists(maybe_path): 173 | print("Restored {}".format(maybe_path)) 174 | sess = tf.get_default_session() 175 | self.saver.restore(sess, maybe_path) 176 | 177 | else: 178 | print("No model is found") 179 | os.makedirs(checkpoint_dir, exist_ok=True) 180 | 181 | def __build_network(self, input_dim, output_dim): 182 | 183 | self.global_step = tf.train.get_or_create_global_step() 184 | 185 | self.X = tf.placeholder(tf.float32, shape=[None, *input_dim], name='state') 186 | self.action = tf.placeholder(tf.uint8, shape=[None], name="action") 187 | action_onehot = tf.one_hot(self.action, output_dim, name="action_onehot") 188 | self.reward = tf.placeholder(tf.float32, shape=[None], name="reward") 189 | 190 | net = self.X 191 | 192 | with tf.variable_scope("layer1"): 193 | net = tf.layers.conv2d(net, 194 | filters=16, 195 | kernel_size=(8, 8), 196 | strides=(4, 4), 197 | name="conv") 198 | net = tf.nn.relu(net, name="relu") 199 | 200 | with tf.variable_scope("layer2"): 201 | net = tf.layers.conv2d(net, 202 | filters=32, 203 | kernel_size=(4, 4), 204 | strides=(2, 2), 205 | name="conv") 206 | net = tf.nn.relu(net, name="relu") 207 | 208 | with tf.variable_scope("fc1"): 209 | net = tf.contrib.layers.flatten(net) 210 | net = tf.layers.dense(net, 256, name='dense') 211 | net = tf.nn.relu(net, name='relu') 212 | 213 | with tf.variable_scope("fc2"): 214 | net = tf.layers.dense(net, output_dim, name='dense') 215 | 216 | self.action_prob = tf.nn.softmax(net, name="action_prob") 217 | 218 | log_action_prob = tf.reduce_sum(self.action_prob * action_onehot, axis=1) 219 | log_action_prob = tf.log(log_action_prob + 1e-7) 220 | 221 | entropy = - self.action_prob * tf.log(self.action_prob + 1e-7) 222 | self.entropy = tf.reduce_sum(entropy, axis=1) 223 | 224 | loss = -log_action_prob * self.reward - self.entropy * self.entropy_coefficient 225 | self.loss = tf.reduce_mean(loss) 226 | 227 | optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate, 228 | decay=self.RMSPropdecay) 229 | 230 | self.train_op = optimizer.minimize(loss, 231 | global_step=self.global_step) 232 | 233 | def __build_summary_op(self, logdir): 234 | tf.summary.histogram("Action Probability Histogram", self.action_prob) 235 | tf.summary.histogram("Entropy", self.entropy) 236 | tf.summary.scalar("Loss", self.loss) 237 | tf.summary.scalar("Mean Reward", tf.reduce_mean(self.reward)) 238 | 239 | self.summary_op = tf.summary.merge_all() 240 | self.summary_writer = tf.summary.FileWriter(logdir, tf.get_default_graph()) 241 | 242 | def choose_action(self, S): 243 | shape = S.shape 244 | 245 | if len(shape) == 3: 246 | S = np.expand_dims(S, axis=0) 247 | 248 | 
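        # Added note on shapes: a single state (H, W, C) is given a batch axis
        # above, so self.X is always fed a 4-D tensor (1, H, W, C).  The
        # softmax output below has shape (1, output_dim); after np.squeeze it
        # is a 1-D probability vector, and the sampled index is shifted by +2
        # so the returned value is a real Pong action in {2: move up,
        # 3: move down}.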
np.testing.assert_equal(S.shape[1:], self.input_dim) 249 | 250 | sess = tf.get_default_session() 251 | action_prob = sess.run(self.action_prob, 252 | feed_dict={self.X: S}) 253 | action_prob = np.squeeze(action_prob) 254 | return np.random.choice(np.arange(self.output_dim) + 2, p=action_prob) 255 | 256 | def train(self, S, A, R): 257 | S = np.array(S) 258 | A = np.array(A) 259 | R = np.array(R) 260 | np.testing.assert_equal(S.shape[1:], self.input_dim) 261 | assert len(A.shape) == 1, "A.shape = {}".format(A.shape) 262 | assert len(R.shape) == 1, "R.shape = {}".format(R.shape) 263 | 264 | R = discount_reward(R, gamma=self.gamma) 265 | R -= np.mean(R) 266 | R /= np.std(R) + 1e-7 267 | 268 | A = A - 2 269 | 270 | sess = tf.get_default_session() 271 | 272 | _, summary_op, global_step_value = sess.run([self.train_op, 273 | self.summary_op, 274 | self.global_step], 275 | feed_dict={self.X: S, 276 | self.action: A, 277 | self.reward: R}) 278 | 279 | if self.summary_op is not None: 280 | self.summary_writer.add_summary(summary_op, global_step_value) 281 | 282 | def save(self): 283 | sess = tf.get_default_session() 284 | path = os.path.join(self.checkpoint_dir, "model.ckpt") 285 | self.saver.save(sess, path) 286 | 287 | 288 | def discount_reward(rewards, gamma=0.99): 289 | """Returns discounted rewards 290 | 291 | Args: 292 | rewards (1-D array): Reward array 293 | gamma (float): Discounted rate 294 | 295 | Returns: 296 | discounted_rewards: same shape as `rewards` 297 | 298 | Notes: 299 | In Pong, when the reward can be {-1, 0, 1}. 300 | 301 | However, when the reward is either -1 or 1, 302 | it means the game has been reset. 303 | 304 | Therefore, it's necessaray to reset `running_add` to 0 305 | whenever the reward is nonzero 306 | """ 307 | discounted_r = np.zeros_like(rewards, dtype=np.float32) 308 | running_add = 0 309 | for t in reversed(range(len(rewards))): 310 | if rewards[t] != 0: 311 | running_add = 0 312 | running_add = running_add * gamma + rewards[t] 313 | discounted_r[t] = running_add 314 | 315 | return discounted_r 316 | 317 | 318 | def run_episode(env, agent, pipeline): 319 | """Runs one episode and returns a total reward 320 | 321 | Args: 322 | env (gym.env): Gym Environment 323 | agent (Agent): Agent Player 324 | pipeline (function): Preprocessing function. 325 | processed_image = pipeline(image) 326 | 327 | Returns: 328 | total_reward (int): Total reward earned in an episode. 
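    Notes:
        The network input is the difference between consecutive preprocessed
        frames (`state_diff`), not a raw frame.  A parameter update is run
        after roughly every 10 scored points (|reward| == 1) and again when
        the episode ends.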
329 | """ 330 | states = [] 331 | actions = [] 332 | rewards = [] 333 | 334 | old_s = env.reset() 335 | old_s = pipeline(old_s) 336 | 337 | done = False 338 | total_reward = 0 339 | step_counter = 0 340 | 341 | state_diff = old_s 342 | 343 | while not done: 344 | 345 | action = agent.choose_action(state_diff) 346 | new_s, r, done, info = env.step(action) 347 | total_reward += r 348 | 349 | states.append(state_diff) 350 | actions.append(action) 351 | rewards.append(r) 352 | 353 | new_s = pipeline(new_s) 354 | state_diff = new_s - old_s 355 | old_s = new_s 356 | 357 | if r == -1 or r == 1 or done: 358 | step_counter += 1 359 | 360 | if step_counter > 10 or done: 361 | step_counter = 0 362 | # Agent expects numpy array 363 | agent.train(states, actions, rewards) 364 | 365 | states, actions, rewards = [], [], [] 366 | 367 | return total_reward 368 | 369 | 370 | def main(): 371 | try: 372 | env = gym.make("Pong-v0") 373 | env = gym.wrappers.Monitor(env, "monitor", force=True) 374 | action_dim = 2 375 | 376 | tf.reset_default_graph() 377 | sess = tf.InteractiveSession() 378 | 379 | new_HW = [80, 80] 380 | repeat = 1 381 | pipeline_fn = partial(pipeline, new_HW=new_HW, height_range=(35, 195), bg=(144, 72, 17)) 382 | 383 | agent = Agent(new_HW + [repeat], 384 | output_dim=action_dim, 385 | logdir='logdir/train', 386 | checkpoint_dir="checkpoints") 387 | 388 | init = tf.global_variables_initializer() 389 | sess.run(init) 390 | 391 | episode = 1 392 | 393 | while True: 394 | episode_reward = run_episode(env, agent, pipeline_fn) 395 | print(episode, episode_reward) 396 | 397 | episode += 1 398 | 399 | finally: 400 | agent.save() 401 | 402 | env.close() 403 | sess.close() 404 | 405 | 406 | def debug_mode(): 407 | pipeline_fn = partial(pipeline, new_HW=(50, 50), height_range=(35, 195), bg=(144, 72, 17)) 408 | try: 409 | 410 | env = gym.make("Pong-v0") 411 | env.reset() 412 | 413 | for _ in range(50): 414 | 415 | s = env.step(env.action_space.sample())[0] 416 | 417 | plot_image(np.squeeze(pipeline_fn(s))) 418 | 419 | finally: 420 | 421 | env.close() 422 | 423 | 424 | if __name__ == '__main__': 425 | main() 426 | # debug_mode() 427 | -------------------------------------------------------------------------------- /09_2_cross_entropy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cross Entropy Method 3 | 4 | Cross Entropy Method is a simple and efficient method 5 | for solving a variety of estimation and optimization problems. 
6 | 7 | Psuedocode 8 | 9 | initialize mu, sd 10 | while not done: 11 | collect N samples of theta ~ N(mu, diag(sd)) 12 | perform one episode with each theta 13 | select top performing samples, called elite set 14 | obtain a new mu and sd 15 | end 16 | 17 | """ 18 | import numpy as np 19 | import gym 20 | 21 | 22 | env = gym.make("CartPole-v0") 23 | 24 | INPUT_SIZE = env.observation_space.shape[0] 25 | OUTPUT_SIZE = env.action_space.n 26 | 27 | 28 | def get_W_b(theta): 29 | """Get W and b 30 | 31 | Parameters 32 | ---------- 33 | theta : 1-d array 34 | Flatten theta 35 | 36 | Returns 37 | ---------- 38 | W : 2-d array 39 | b : 1-d array 40 | 41 | Examples 42 | ---------- 43 | >>> theta = np.random.randn(5) 44 | >>> W, b = get_W_b(theta) 45 | """ 46 | idx = INPUT_SIZE * OUTPUT_SIZE 47 | W = theta[:idx].reshape(INPUT_SIZE, OUTPUT_SIZE) 48 | b = theta[idx:].reshape(OUTPUT_SIZE) 49 | 50 | return W, b 51 | 52 | 53 | def choose_action(s, W, b): 54 | """Return an action (argmax) 55 | 56 | Parameters 57 | ---------- 58 | s : ndarray 59 | Observation (input_dim, ) 60 | 61 | W : ndarray, (input_dim, number_of_actions) 62 | b : ndarray, (number_of_actions) 63 | 64 | Returns 65 | ---------- 66 | action: int 67 | action index 68 | 69 | Examples 70 | ---------- 71 | >>> s = env.reset() 72 | >>> W, b = get_W_b(theta) 73 | >>> action = choose_action(s, W, b) 74 | """ 75 | 76 | action = np.dot(s, W) + b 77 | return np.argmax(action) 78 | 79 | 80 | def run_episode(env, theta, render=False): 81 | """ Run a single episode with theta 82 | 83 | Parameters 84 | ---------- 85 | env : gym environment 86 | theta : 1-d array 87 | render : bool, optional 88 | 89 | Returns 90 | ---------- 91 | reward : float 92 | Episode reward 93 | 94 | Examples 95 | ---------- 96 | >>> env = gym.make('CartPole-v0') 97 | >>> reward = run_episode(env, theta) 98 | """ 99 | W, b = get_W_b(theta) 100 | s = env.reset() 101 | done = False 102 | 103 | reward = 0 104 | 105 | while not done: 106 | if render: 107 | env.render() 108 | 109 | a = choose_action(s, W, b) 110 | s2, r, done, info = env.step(a) 111 | reward += r 112 | s = s2 113 | 114 | return reward 115 | 116 | 117 | def make_theta(theta_mean, theta_sd): 118 | """ Make a theta parameters with mean and sd 119 | 120 | Parameters 121 | ---------- 122 | theta_mean : ndarray 123 | A n-d array of means 124 | 125 | theta_sd : nd array 126 | A n-d array of standard deviations 127 | 128 | Returns 129 | ---------- 130 | theta : n-d array 131 | Shape (n, ) 132 | 133 | Examples 134 | ---------- 135 | >>> DIM = INPUT_SIZE * OUTPUT_SIZE + OUTPUT_SIZE 136 | >>> mu = np.zeros(DIM) 137 | >>> sd = np.ones(SD) 138 | >>> theta = make_theta(mu, sd) 139 | 140 | """ 141 | return np.random.multivariate_normal(mean=theta_mean, cov=np.diag(theta_sd),) 142 | 143 | 144 | def main(): 145 | """ Every magic happens here """ 146 | global env, INPUT_SIZE, OUTPUT_SIZE 147 | 148 | # Number of samples 149 | N = 32 150 | # Size of theta 151 | DIM = INPUT_SIZE * OUTPUT_SIZE + OUTPUT_SIZE 152 | 153 | # Initialize parameters 154 | theta_mean = np.zeros(DIM) 155 | theta_sd = np.ones(DIM) 156 | 157 | # Loop until clear the game 158 | # make population with mean & sd 159 | # choose elite groups 160 | # obtain new mean & sd 161 | for _ in range(100): 162 | population = [make_theta(theta_mean, theta_sd) for _ in range(N)] 163 | reward = [run_episode(env, p) for p in population] 164 | 165 | sorted_idx = np.argsort(reward)[-int(N * 0.20):] 166 | 167 | elite_population = [population[idx] for idx in sorted_idx] 168 | elite_reward = 
[reward[idx] for idx in sorted_idx] 169 | 170 | theta_mean = np.mean(elite_population, axis=0) 171 | theta_sd = np.std(elite_population, axis=0) 172 | 173 | avg_reward = np.mean(elite_reward) 174 | print("Reward: {}".format(avg_reward)) 175 | 176 | if avg_reward == 200: 177 | print("Game Cleared") 178 | break 179 | 180 | env = gym.wrappers.Monitor(env, "gym-results/", force=True) 181 | best_parm = elite_population[-1] 182 | 183 | for i in range(100): 184 | reward = run_episode(env, best_parm) 185 | print(reward) 186 | 187 | 188 | if __name__ == '__main__': 189 | main() 190 | -------------------------------------------------------------------------------- /10_1_Actor_Critic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# What is an actor critic network?\n", 11 | "\n", 12 | "- So far we have learned a single output network that produces Q-values(Value Iteration) or an action policy (Policy Iteration)\n", 13 | "- What if we can use both value functions and policy functions? That's how actor-critic methods were developed. It turns out if we use both, we can learn more complex systems. In this notebook, we will a simple policy gradient actor-critic methods\n", 14 | "\n", 15 | "# Structure of Actor Critic Networks\n", 16 | "- There are two networks: an actor network and a critic network\n", 17 | "![actor-critic](assets/actor_critic.png)\n", 18 | "- Actor network:\n", 19 | " * This network chooses an action!\n", 20 | " * It takes an input of game state and produce outputs an action policy (as in policy-gradient)\n", 21 | "\n", 22 | "- Critic network:\n", 23 | " * This network is simply a value network\n", 24 | " * It takes the same input as the actor network and produces a current state value" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "deletable": true, 31 | "editable": true 32 | }, 33 | "source": [ 34 | "# From pervious lectures\n", 35 | "\n", 36 | "* We used policy gradient methods that is to find a policy $\\pi$ that\n", 37 | "$$\n", 38 | "\\text{maximize } E\\left[\\ R\\ \\mid\\ \\pi\\ \\right]\n", 39 | "$$\n", 40 | "\n", 41 | "$$\\text{where }R = r_0 + r_1 + \\dots + r_{\\tau - 1}$$\n", 42 | "\n", 43 | "* We use an gradient estimator that is\n", 44 | "$$\n", 45 | "\\hat{g} = \\nabla_\\theta \\log \\pi \\left(a_t \\mid s_t; \\theta \\right) \\cdot R_t \n", 46 | "$$\n", 47 | "\n", 48 | "$$\\text{where }R_t = \\sum_{t'=t}^{T-1} \\left(discount\\ rate\\right)^{t'-t} \\cdot (reward)_t$$" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "deletable": true, 55 | "editable": true 56 | }, 57 | "source": [ 58 | "* The above gradient estimator simply means we boost the probability of an action that returns high rewards\n", 59 | "\n", 60 | "# Problems\n", 61 | "\n", 62 | "* The above method is however not stable because a step size of the gradients can be very large and once we overshoot, our agent will collect trajectories based on a bad policy" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": { 68 | "deletable": true, 69 | "editable": true 70 | }, 71 | "source": [ 72 | "# Solution\n", 73 | "* In order to solve high variance problems, we will use $A_t$ instead of $R_t$ and this is called an advantage function\n", 74 | "* What is an advantage function? We know a Q function and a Value function. 
\n", 75 | " * The $Q$ maps a state $s$ to an action $a$ value which is how good action $a$ is\n", 76 | " * The $V$ maps a state $s$ to a value that shows how good an input state $s$ is\n", 77 | " \n", 78 | "* Therefore, we can write two functions as following:\n", 79 | "$$ Q(s, a) = V(s) + A(a) $$\n", 80 | "\n", 81 | "* Therefore, \n", 82 | "\n", 83 | "$$ A(a) = Q(s, a) - V(s) $$\n", 84 | "\n", 85 | "* That's the definition of an advatage function. We are trying to find how good action $a$ is by subtracting a value function\n", 86 | "\n", 87 | "* Hence, we need to change the gradient estimator $\\hat{g}$ to the following\n", 88 | "$$\\hat{g} = \\nabla_\\theta \\log \\pi(a_t | s_t; \\theta) \\cdot A_t $$\n", 89 | "\n", 90 | "where\n", 91 | "\n", 92 | "\\begin{align*}\n", 93 | "A_t &= Q(s_t, a') - V(s_t) \\\\\n", 94 | " &= R_{t} - V(s_t)\n", 95 | "\\end{align*}\n", 96 | "\n", 97 | "# Notes\n", 98 | "- Its performance is still not great because it has a few flaws\n", 99 | " - We have to learn $V(s)$ first and learning $V(s)$ can be very difficult (requires careful reward enginneering)\n", 100 | " - Every trajectories is highly correlated\n", 101 | "- In order to deal with these problems, we will later discuss various methods such as TRPO(Trust Region Policy Optimization) or A3C(Asynchronous Actor Critic Networks)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 1, 107 | "metadata": { 108 | "collapsed": true, 109 | "deletable": true, 110 | "editable": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "import numpy as np\n", 115 | "import gym\n", 116 | "import tensorflow as tf" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 2, 122 | "metadata": { 123 | "collapsed": true, 124 | "deletable": true, 125 | "editable": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "slim = tf.contrib.slim" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 3, 135 | "metadata": { 136 | "collapsed": false, 137 | "deletable": true, 138 | "editable": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "class ActorCriticNetwork:\n", 143 | " \"\"\" Actor Critic Network\n", 144 | " \n", 145 | " - 3 placeholders for policy\n", 146 | " - S : state (shared)\n", 147 | " - A : action one hot\n", 148 | " - ADV : advantage value\n", 149 | " \n", 150 | " - 2 placeholders for value\n", 151 | " - S : state (shared)\n", 152 | " - R : reward\n", 153 | " \n", 154 | " - 2 outputs\n", 155 | " - P : action policy, p(a | s)\n", 156 | " - V : V(s)\n", 157 | " \n", 158 | " Examples\n", 159 | " ----------\n", 160 | " >>> input_shape = [None, 4]\n", 161 | " >>> action_n = 2\n", 162 | " >>> hidden_dims = [32, 32]\n", 163 | " >>> ac_network = ActorCriticNetwork(input_shape, action_n, hidden_dims)\n", 164 | " \"\"\"\n", 165 | " def __init__(self, input_shape, action_n, hidden_dims):\n", 166 | " # Policy Input\n", 167 | " self.S = tf.placeholder(tf.float32, shape=input_shape, name=\"state_input\")\n", 168 | " self.A = tf.placeholder(tf.float32, shape=[None, action_n], name=\"action_one_hot_input\")\n", 169 | " self.ADV = tf.placeholder(tf.float32, shape=[None], name=\"advantage_input\")\n", 170 | " \n", 171 | " # Value Input\n", 172 | " self.R = tf.placeholder(tf.float32, shape=[None], name=\"reward_input\")\n", 173 | " \n", 174 | " self._create_network(hidden_dims, action_n)\n", 175 | " \n", 176 | " def _create_network(self, hidden_dims, action_n):\n", 177 | " net = self.S\n", 178 | " \n", 179 | " for i, h_dim in 
enumerate(hidden_dims):\n", 180 | " net = slim.fully_connected(net, h_dim, activation_fn=None, scope=f\"fc-{i}\")\n", 181 | " net = tf.nn.relu(net)\n", 182 | " \n", 183 | " # Policy shape: [None, action_n]\n", 184 | " self.P = slim.fully_connected(net, action_n, activation_fn=tf.nn.softmax, scope=\"policy_output\")\n", 185 | "\n", 186 | " # Value shape: [None, 1] -> [None]\n", 187 | " _V = slim.fully_connected(net, 1, activation_fn=None, scope=\"value_output\")\n", 188 | " self.V = tf.squeeze(_V)\n", 189 | " \n", 190 | " self._create_op()\n", 191 | " \n", 192 | " def _create_op(self):\n", 193 | " # output shape: [None]\n", 194 | " policy_gain = tf.reduce_sum(self.P * self.A, 1)\n", 195 | "\n", 196 | " # output shape: [None]\n", 197 | " policy_gain = tf.log(policy_gain) * self.ADV\n", 198 | " policy_gain = tf.reduce_sum(policy_gain, name=\"policy_gain\")\n", 199 | "\n", 200 | " entropy = - tf.reduce_sum(self.P * tf.log(self.P), 1)\n", 201 | " entropy = tf.reduce_mean(entropy)\n", 202 | " \n", 203 | " value_loss = tf.losses.mean_squared_error(self.V, self.R, scope=\"value_loss\")\n", 204 | " \n", 205 | " # Becareful negative sign because we only can minimize\n", 206 | " # we want to maximize policy gain and entropy (for exploration)\n", 207 | " self.loss = - policy_gain + value_loss - entropy * 0.01\n", 208 | " self.optimizer = tf.train.AdamOptimizer()\n", 209 | " self.train_op = self.optimizer.minimize(self.loss)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 4, 215 | "metadata": { 216 | "collapsed": true, 217 | "deletable": true, 218 | "editable": true 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "class Agent:\n", 223 | " \"\"\" Agent class \"\"\"\n", 224 | " \n", 225 | " def __init__(self, env, network):\n", 226 | " \"\"\" Constructor\n", 227 | " \n", 228 | " Parameters\n", 229 | " ----------\n", 230 | " env\n", 231 | " Open ai gym environment \n", 232 | " network\n", 233 | " Actor Critic Network \n", 234 | " \"\"\"\n", 235 | " self.env = env\n", 236 | " self.model = network\n", 237 | " self.sess = tf.get_default_session()\n", 238 | " self.action_n = env.action_space.n\n", 239 | " \n", 240 | " \n", 241 | " def choose_an_action(self, state):\n", 242 | " \"\"\" Returns an action (int) \"\"\"\n", 243 | " \n", 244 | " feed = {\n", 245 | " self.model.S: state\n", 246 | " }\n", 247 | " \n", 248 | " action_prob = self.sess.run(self.model.P, feed_dict=feed)[0]\n", 249 | " \n", 250 | " return np.random.choice(np.arange(self.action_n), p=action_prob)\n", 251 | " \n", 252 | " def train(self, S, A, R):\n", 253 | " \"\"\" Train the actor critic networks\n", 254 | " \n", 255 | " (1) Compute discounted rewards R\n", 256 | " (2) Compute advantage values A = R - V\n", 257 | " (3) Perform gradients updates\n", 258 | " \n", 259 | " \"\"\"\n", 260 | " \n", 261 | " def discount_rewards(r, gamma=0.99):\n", 262 | " \"\"\" take 1D float array of rewards and compute discounted reward \"\"\"\n", 263 | " discounted_r = np.zeros_like(r, dtype=np.float32)\n", 264 | " running_add = 0\n", 265 | " \n", 266 | " for t in reversed(range(len(r))):\n", 267 | " running_add = running_add * gamma + r[t]\n", 268 | " discounted_r[t] = running_add\n", 269 | "\n", 270 | " return discounted_r\n", 271 | "\n", 272 | " # 1. Get discounted `R`s\n", 273 | " R = discount_rewards(R)\n", 274 | " \n", 275 | " # 2. Get `V`s\n", 276 | " feed = {\n", 277 | " self.model.S: S\n", 278 | " }\n", 279 | " V = self.sess.run(self.model.V, feed_dict=feed)\n", 280 | " \n", 281 | " # 3. 
Get Advantage values, A = R - V\n", 282 | " ADV = R - V \n", 283 | " ADV = (ADV - np.mean(ADV)) / (np.std(ADV) + 1e-8)\n", 284 | " \n", 285 | " # 4. Perform gradient descents\n", 286 | " feed = {\n", 287 | " self.model.S: S,\n", 288 | " self.model.A: A,\n", 289 | " self.model.ADV: ADV,\n", 290 | " self.model.R: R\n", 291 | " }\n", 292 | "\n", 293 | " self.sess.run(self.model.train_op, feed_dict=feed) " 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 5, 299 | "metadata": { 300 | "collapsed": false, 301 | "deletable": true, 302 | "editable": true 303 | }, 304 | "outputs": [ 305 | { 306 | "name": "stderr", 307 | "output_type": "stream", 308 | "text": [ 309 | "[2017-04-08 21:10:41,639] Making new env: CartPole-v0\n", 310 | "[2017-04-08 21:10:41,643] Clearing 26 monitor files from previous run (because force=True was provided)\n" 311 | ] 312 | }, 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "input_shape: [None, 4], action_n: 2\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "# Tensorflow Reset\n", 323 | "tf.reset_default_graph()\n", 324 | "sess = tf.InteractiveSession()\n", 325 | "\n", 326 | "# Gym Environment Setup\n", 327 | "env_name = \"CartPole-v0\"\n", 328 | "env = gym.make(env_name)\n", 329 | "env = gym.wrappers.Monitor(env, \"./gym-results/\", force=True)\n", 330 | "\n", 331 | "# Global parameters\n", 332 | "input_shape = [None, env.observation_space.shape[0]]\n", 333 | "action_n = env.action_space.n\n", 334 | "\n", 335 | "print(f\"input_shape: {input_shape}, action_n: {action_n}\")\n", 336 | "\n", 337 | "# Define A2C(Actor-Critic) and Agent\n", 338 | "ac_network = ActorCriticNetwork(input_shape, action_n, [32, 32])\n", 339 | "agent = Agent(env, ac_network)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 6, 345 | "metadata": { 346 | "collapsed": false, 347 | "deletable": true, 348 | "editable": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "def preprocess_state(state_list):\n", 353 | " \"\"\" Preprocess a state list\n", 354 | " \n", 355 | " Currently it's only used to reshape the value\n", 356 | " When a single state is given, its shape is 1-d array,\n", 357 | " which needs to be reshaped in 2-d array\n", 358 | " \"\"\"\n", 359 | " return np.reshape(state_list, [-1, *input_shape[1:]])\n", 360 | "\n", 361 | "def preprocess_action(action_list, n_actions):\n", 362 | " \"\"\"Action -> 1-hot \"\"\"\n", 363 | " N = len(action_list)\n", 364 | " one_hot = np.zeros(shape=(N, n_actions))\n", 365 | " one_hot[np.arange(N), action_list] = 1\n", 366 | " \n", 367 | " return one_hot\n", 368 | "\n", 369 | "# Test codes\n", 370 | "tmp = np.zeros((32, *input_shape[1:]))\n", 371 | "np.testing.assert_almost_equal(preprocess_state(tmp), np.zeros([32, *input_shape[1:]]))\n", 372 | "tmp = np.zeros(*input_shape[1:])\n", 373 | "np.testing.assert_almost_equal(preprocess_state(tmp), np.zeros([1, *input_shape[1:]]))\n", 374 | "tmp = [0, 1]\n", 375 | "np.testing.assert_almost_equal(preprocess_action(tmp, 2), np.eye(2))" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 7, 381 | "metadata": { 382 | "collapsed": false, 383 | "scrolled": false 384 | }, 385 | "outputs": [ 386 | { 387 | "name": "stderr", 388 | "output_type": "stream", 389 | "text": [ 390 | "[2017-04-08 21:10:42,119] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000000.mp4\n", 391 | "[2017-04-08 21:10:43,292] Starting new video recorder 
writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000001.mp4\n" 392 | ] 393 | }, 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "[Episode- 0] 16\r\n" 399 | ] 400 | }, 401 | { 402 | "name": "stderr", 403 | "output_type": "stream", 404 | "text": [ 405 | "[2017-04-08 21:10:43,998] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000008.mp4\n" 406 | ] 407 | }, 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "[Episode- 1] 16\r", 413 | "[Episode- 2] 22\r", 414 | "[Episode- 3] 10\r", 415 | "[Episode- 4] 45\r", 416 | "[Episode- 5] 17\r", 417 | "[Episode- 6] 16\r", 418 | "[Episode- 7] 13\r" 419 | ] 420 | }, 421 | { 422 | "name": "stderr", 423 | "output_type": "stream", 424 | "text": [ 425 | "[2017-04-08 21:10:45,072] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000027.mp4\n" 426 | ] 427 | }, 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "[Episode- 46] 22\r" 433 | ] 434 | }, 435 | { 436 | "name": "stderr", 437 | "output_type": "stream", 438 | "text": [ 439 | "[2017-04-08 21:10:46,212] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000064.mp4\n" 440 | ] 441 | }, 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "[Episode- 107] 10\r" 447 | ] 448 | }, 449 | { 450 | "name": "stderr", 451 | "output_type": "stream", 452 | "text": [ 453 | "[2017-04-08 21:10:47,241] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000125.mp4\n" 454 | ] 455 | }, 456 | { 457 | "name": "stdout", 458 | "output_type": "stream", 459 | "text": [ 460 | "[Episode- 209] 19\r" 461 | ] 462 | }, 463 | { 464 | "name": "stderr", 465 | "output_type": "stream", 466 | "text": [ 467 | "[2017-04-08 21:10:48,925] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000216.mp4\n" 468 | ] 469 | }, 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "[Episode- 337] 60\r" 475 | ] 476 | }, 477 | { 478 | "name": "stderr", 479 | "output_type": "stream", 480 | "text": [ 481 | "[2017-04-08 21:10:51,951] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000343.mp4\n" 482 | ] 483 | }, 484 | { 485 | "name": "stdout", 486 | "output_type": "stream", 487 | "text": [ 488 | "[Episode- 507] 31\r" 489 | ] 490 | }, 491 | { 492 | "name": "stderr", 493 | "output_type": "stream", 494 | "text": [ 495 | "[2017-04-08 21:11:00,967] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000512.mp4\n" 496 | ] 497 | }, 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "[Episode- 722] 104\r" 503 | ] 504 | }, 505 | { 506 | "name": "stderr", 507 | "output_type": "stream", 508 | "text": [ 509 | "[2017-04-08 21:11:09,900] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video000729.mp4\n" 510 | ] 511 | }, 512 | { 513 | "name": "stdout", 514 | "output_type": "stream", 515 | "text": [ 516 | "[Episode- 993] 130\r" 517 | ] 518 | }, 519 | { 520 | "name": 
"stderr", 521 | "output_type": "stream", 522 | "text": [ 523 | "[2017-04-08 21:11:25,444] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video001000.mp4\n" 524 | ] 525 | }, 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "[Episode- 1000] 200\n", 531 | "[Episode- 1996] 26\r" 532 | ] 533 | }, 534 | { 535 | "name": "stderr", 536 | "output_type": "stream", 537 | "text": [ 538 | "[2017-04-08 21:12:26,066] Starting new video recorder writing to /home/kkweon/github/ReinforcementZeroToAll/gym-results/openaigym.video.0.12761.video002000.mp4\n" 539 | ] 540 | }, 541 | { 542 | "name": "stdout", 543 | "output_type": "stream", 544 | "text": [ 545 | "[Episode- 2000] 146\n", 546 | "[Episode- 2363] 200\n", 547 | "Game cleared in 2363, average rewards: 195.15\n" 548 | ] 549 | } 550 | ], 551 | "source": [ 552 | "init = tf.global_variables_initializer()\n", 553 | "sess.run(init)\n", 554 | "\n", 555 | "MAX_EPISODES = 5000\n", 556 | "\n", 557 | "# For checking if the game is cleared\n", 558 | "EPISODE_100_REWARDS = []\n", 559 | "CLEAR_REWARD = env.spec.reward_threshold\n", 560 | "CLEAR_REWARD = CLEAR_REWARD if CLEAR_REWARD else 9999\n", 561 | "\n", 562 | "for episode in range(MAX_EPISODES):\n", 563 | " s = env.reset() \n", 564 | " done = False\n", 565 | " \n", 566 | " s_list = []\n", 567 | " a_list = []\n", 568 | " r_list = []\n", 569 | " \n", 570 | " episode_r = 0\n", 571 | " \n", 572 | " while not done:\n", 573 | " \n", 574 | " s = preprocess_state(s)\n", 575 | " a = agent.choose_an_action(s)\n", 576 | "\n", 577 | " s2, r, done, info = env.step(a)\n", 578 | " \n", 579 | " s_list.append(s)\n", 580 | " a_list.append(a)\n", 581 | " r_list.append(r)\n", 582 | " \n", 583 | " s = s2\n", 584 | " \n", 585 | " episode_r += r\n", 586 | " \n", 587 | " a_list = preprocess_action(a_list, action_n)\n", 588 | " \n", 589 | " agent.train(np.vstack(s_list), a_list, r_list)\n", 590 | " \n", 591 | " print(f\"[Episode-{episode:>6}] {int(episode_r):>4}\", end=\"\\r\")\n", 592 | " \n", 593 | " # For line breaks\n", 594 | " if episode % (MAX_EPISODES // 5) == 0:\n", 595 | " print()\n", 596 | " \n", 597 | " EPISODE_100_REWARDS.append(episode_r)\n", 598 | " \n", 599 | " # Check if the game is cleared\n", 600 | " if len(EPISODE_100_REWARDS) > 100:\n", 601 | " EPISODE_100_REWARDS = EPISODE_100_REWARDS[1:]\n", 602 | " \n", 603 | " avg_rewards = np.mean(EPISODE_100_REWARDS)\n", 604 | " \n", 605 | " if avg_rewards > CLEAR_REWARD:\n", 606 | " print()\n", 607 | " print(f\"Game cleared in {episode}, average rewards: {avg_rewards}\")\n", 608 | " break" 609 | ] 610 | }, 611 | { 612 | "cell_type": "markdown", 613 | "metadata": { 614 | "deletable": true, 615 | "editable": true 616 | }, 617 | "source": [ 618 | "# Test run\n" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": 8, 624 | "metadata": { 625 | "collapsed": false, 626 | "deletable": true, 627 | "editable": true 628 | }, 629 | "outputs": [ 630 | { 631 | "name": "stdout", 632 | "output_type": "stream", 633 | "text": [ 634 | "[Episode-0] 198\n", 635 | "[Episode-20] 200\n", 636 | "[Episode-40] 200\n", 637 | "[Episode-60] 200\n", 638 | "[Episode-80] 200\n", 639 | "[Episode-98] 200\r" 640 | ] 641 | }, 642 | { 643 | "name": "stderr", 644 | "output_type": "stream", 645 | "text": [ 646 | "[2017-04-08 21:13:16,119] Finished writing results. 
You can upload them to the scoreboard via gym.upload('/home/kkweon/github/ReinforcementZeroToAll/gym-results')\n" 647 | ] 648 | }, 649 | { 650 | "name": "stdout", 651 | "output_type": "stream", 652 | "text": [ 653 | "[Episode-99] 200\r" 654 | ] 655 | } 656 | ], 657 | "source": [ 658 | "for episode in range(100):\n", 659 | " s = env.reset() \n", 660 | " done = False\n", 661 | " \n", 662 | " episode_r = 0\n", 663 | " while not done:\n", 664 | " if episode % 20 == 0:\n", 665 | " env.render()\n", 666 | " s = preprocess_state(s)\n", 667 | " a = agent.choose_an_action(s)\n", 668 | " s2, r, done, info = env.step(a)\n", 669 | " \n", 670 | " s = s2\n", 671 | " episode_r += r \n", 672 | " \n", 673 | " print(f\"[Episode-{episode}] {int(episode_r)}\", end=\"\\r\")\n", 674 | " \n", 675 | " if episode % 20 == 0:\n", 676 | " print()\n", 677 | " \n", 678 | "env.close()" 679 | ] 680 | } 681 | ], 682 | "metadata": { 683 | "kernelspec": { 684 | "display_name": "Python 3", 685 | "language": "python", 686 | "name": "python3" 687 | }, 688 | "language_info": { 689 | "codemirror_mode": { 690 | "name": "ipython", 691 | "version": 3 692 | }, 693 | "file_extension": ".py", 694 | "mimetype": "text/x-python", 695 | "name": "python", 696 | "nbconvert_exporter": "python", 697 | "pygments_lexer": "ipython3", 698 | "version": "3.6.1" 699 | } 700 | }, 701 | "nbformat": 4, 702 | "nbformat_minor": 2 703 | } 704 | -------------------------------------------------------------------------------- /10_2_A3C_threads.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple Asynchronous Methods for Deep Reinforcement Learning (A3C) 3 | 4 | - It mimics A3C by using multi threads 5 | - Distributed Tensorflow is preferred because of Python's GIL 6 | 7 | """ 8 | import tensorflow as tf 9 | import numpy as np 10 | import threading 11 | import gym 12 | import os 13 | from scipy.misc import imresize 14 | 15 | 16 | def copy_src_to_dst(from_scope, to_scope): 17 | """Creates a copy variable weights operation 18 | 19 | Args: 20 | from_scope (str): The name of scope to copy from 21 | It should be "global" 22 | to_scope (str): The name of scope to copy to 23 | It should be "thread-{}" 24 | 25 | Returns: 26 | list: Each element is a copy operation 27 | """ 28 | from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) 29 | to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) 30 | 31 | op_holder = [] 32 | for from_var, to_var in zip(from_vars, to_vars): 33 | op_holder.append(to_var.assign(from_var)) 34 | return op_holder 35 | 36 | 37 | def pipeline(image, new_HW=(80, 80), height_range=(35, 193), bg=(144, 72, 17)): 38 | """Returns a preprocessed image 39 | 40 | (1) Crop image (top and bottom) 41 | (2) Remove background & grayscale 42 | (3) Reszie to smaller image 43 | 44 | Args: 45 | image (3-D array): (H, W, C) 46 | new_HW (tuple): New image size (height, width) 47 | height_range (tuple): Height range (H_begin, H_end) else cropped 48 | bg (tuple): Background RGB Color (R, G, B) 49 | 50 | Returns: 51 | image (3-D array): (H, W, 1) 52 | """ 53 | image = crop_image(image, height_range) 54 | image = resize_image(image, new_HW) 55 | image = kill_background_grayscale(image, bg) 56 | image = np.expand_dims(image, axis=2) 57 | 58 | return image 59 | 60 | 61 | def resize_image(image, new_HW): 62 | """Returns a resized image 63 | 64 | Args: 65 | image (3-D array): Numpy array (H, W, C) 66 | new_HW (tuple): Target size (height, width) 67 | 68 | Returns: 69 | image (3-D 
array): Resized image (height, width, C) 70 | """ 71 | return imresize(image, new_HW, interp="nearest") 72 | 73 | 74 | def crop_image(image, height_range=(35, 195)): 75 | """Crops top and bottom 76 | 77 | Args: 78 | image (3-D array): Numpy image (H, W, C) 79 | height_range (tuple): Image will be cropped out 80 | except the height range between (min_height, max_height) 81 | 82 | Returns: 83 | image (3-D array): Numpy image (max_H - min_H, W, C) 84 | """ 85 | h_beg, h_end = height_range 86 | return image[h_beg:h_end, ...] 87 | 88 | 89 | def kill_background_grayscale(image, bg): 90 | """Make the background 0 91 | 92 | Args: 93 | image (3-D array): Numpy array (H, W, C) 94 | bg (tuple): RGB code of background (R, G, B) 95 | 96 | Returns: 97 | image (2-D array): Binarized image of shape (H, W) 98 | The background is 0 and everything else is 1 99 | """ 100 | H, W, _ = image.shape 101 | 102 | R = image[..., 0] 103 | G = image[..., 1] 104 | B = image[..., 2] 105 | 106 | cond = (R == bg[0]) & (G == bg[1]) & (B == bg[2]) 107 | 108 | image = np.zeros((H, W)) 109 | image[~cond] = 1 110 | 111 | return image 112 | 113 | 114 | def discount_reward(rewards, gamma=0.99): 115 | """Returns discounted rewards 116 | 117 | Args: 118 | rewards (1-D array): Reward array 119 | gamma (float): Discounted rate 120 | 121 | Returns: 122 | discounted_rewards: same shape as `rewards` 123 | 124 | Notes: 125 | In Pong, when the reward can be {-1, 0, 1}. 126 | However, when the reward is either -1 or 1, 127 | it means the game has been reset. 128 | Therefore, it's necessaray to reset `running_add` to 0 129 | whenever the reward is nonzero 130 | """ 131 | discounted_r = np.zeros_like(rewards, dtype=np.float32) 132 | running_add = 0 133 | for t in reversed(range(len(rewards))): 134 | if rewards[t] != 0: 135 | running_add = 0 136 | running_add = running_add * gamma + rewards[t] 137 | discounted_r[t] = running_add 138 | 139 | return discounted_r 140 | 141 | 142 | class A3CNetwork(object): 143 | 144 | def __init__(self, name, input_shape, output_dim, logdir=None): 145 | """A3C Network tensors and operations are defined here 146 | 147 | Args: 148 | name (str): The name of scope 149 | input_shape (list): The shape of input image [H, W, C] 150 | output_dim (int): Number of actions 151 | logdir (str, optional): directory to save summaries 152 | 153 | Notes: 154 | You should be familiar with Policy Gradients. 
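            Each worker thread builds its own copy of this network under its
            own variable scope and periodically syncs it from the shared
            "global" copy; gradients computed on the local copy are applied
            to the global copy through `apply_gradients`.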
155 | The only difference between vanilla PG and A3C is that there is 156 | an operation to apply gradients manually 157 | """ 158 | with tf.variable_scope(name): 159 | self.states = tf.placeholder(tf.float32, shape=[None, *input_shape], name="states") 160 | self.actions = tf.placeholder(tf.uint8, shape=[None], name="actions") 161 | self.rewards = tf.placeholder(tf.float32, shape=[None], name="rewards") 162 | self.advantage = tf.placeholder(tf.float32, shape=[None], name="advantage") 163 | 164 | action_onehot = tf.one_hot(self.actions, output_dim, name="action_onehot") 165 | net = self.states 166 | 167 | with tf.variable_scope("layer1"): 168 | net = tf.layers.conv2d(net, 169 | filters=16, 170 | kernel_size=(8, 8), 171 | strides=(4, 4), 172 | name="conv") 173 | net = tf.nn.relu(net, name="relu") 174 | 175 | with tf.variable_scope("layer2"): 176 | net = tf.layers.conv2d(net, 177 | filters=32, 178 | kernel_size=(4, 4), 179 | strides=(2, 2), 180 | name="conv") 181 | net = tf.nn.relu(net, name="relu") 182 | 183 | with tf.variable_scope("fc1"): 184 | net = tf.contrib.layers.flatten(net) 185 | net = tf.layers.dense(net, 256, name='dense') 186 | net = tf.nn.relu(net, name='relu') 187 | 188 | # actor network 189 | actions = tf.layers.dense(net, output_dim, name="final_fc") 190 | self.action_prob = tf.nn.softmax(actions, name="action_prob") 191 | single_action_prob = tf.reduce_sum(self.action_prob * action_onehot, axis=1) 192 | 193 | entropy = - self.action_prob * tf.log(self.action_prob + 1e-7) 194 | entropy = tf.reduce_sum(entropy, axis=1) 195 | 196 | log_action_prob = tf.log(single_action_prob + 1e-7) 197 | maximize_objective = log_action_prob * self.advantage + entropy * 0.005 198 | self.actor_loss = - tf.reduce_mean(maximize_objective) 199 | 200 | # value network 201 | self.values = tf.squeeze(tf.layers.dense(net, 1, name="values")) 202 | self.value_loss = tf.losses.mean_squared_error(labels=self.rewards, 203 | predictions=self.values) 204 | 205 | self.total_loss = self.actor_loss + self.value_loss * .5 206 | self.optimizer = tf.train.AdamOptimizer() 207 | 208 | var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=name) 209 | self.gradients = self.optimizer.compute_gradients(self.total_loss, var_list) 210 | self.gradients_placeholders = [] 211 | 212 | for grad, var in self.gradients: 213 | placeholder = tf.placeholder(var.dtype, shape=var.get_shape()) 214 | placeholder = tf.clip_by_norm(placeholder, 40) 215 | self.gradients_placeholders.append((placeholder, var)) 216 | self.apply_gradients = self.optimizer.apply_gradients(self.gradients_placeholders) 217 | 218 | if logdir: 219 | loss_summary = tf.summary.scalar("total_loss", self.total_loss) 220 | value_summary = tf.summary.histogram("values", self.values) 221 | 222 | self.summary_op = tf.summary.merge([loss_summary, value_summary]) 223 | self.summary_writer = tf.summary.FileWriter(logdir) 224 | 225 | 226 | class Agent(threading.Thread): 227 | 228 | def __init__(self, session, env, coord, name, global_network, input_shape, output_dim, logdir=None): 229 | """Agent worker thread 230 | 231 | Args: 232 | session (tf.Session): Tensorflow session needs to be shared 233 | env (gym.Env): Gym environment (Pong-v0) 234 | coord (tf.train.Coordinator): Tensorflow Queue Coordinator 235 | name (str): Name of this worker 236 | global_network (A3CNetwork): Global network that needs to be updated 237 | input_shape (list): Required for local A3CNetwork, [H, W, C] 238 | output_dim (int): Number of actions 239 | logdir (str, optional): If logdir is 
given, will write summary 240 | 241 | Methods: 242 | print(reward): prints episode rewards 243 | play_episode(): a single episode logic is stored in here 244 | run(): override threading.Thread.run 245 | choose_action(state) 246 | train(states, actions, rewards) 247 | """ 248 | super(Agent, self).__init__() 249 | self.local = A3CNetwork(name, input_shape, output_dim, logdir) 250 | self.global_to_local = copy_src_to_dst("global", name) 251 | self.global_network = global_network 252 | 253 | self.input_shape = input_shape 254 | self.output_dim = output_dim 255 | self.env = env 256 | self.sess = session 257 | self.coord = coord 258 | self.name = name 259 | self.logdir = logdir 260 | 261 | def print(self, reward): 262 | message = "Agent(name={}, reward={})".format(self.name, reward) 263 | print(message) 264 | 265 | def play_episode(self): 266 | self.sess.run(self.global_to_local) 267 | 268 | states = [] 269 | actions = [] 270 | rewards = [] 271 | 272 | s = self.env.reset() 273 | s = pipeline(s) 274 | state_diff = s 275 | 276 | done = False 277 | total_reward = 0 278 | time_step = 0 279 | while not done: 280 | 281 | a = self.choose_action(state_diff) 282 | s2, r, done, _ = self.env.step(a) 283 | 284 | s2 = pipeline(s2) 285 | total_reward += r 286 | 287 | states.append(state_diff) 288 | actions.append(a) 289 | rewards.append(r) 290 | 291 | state_diff = s2 - s 292 | s = s2 293 | 294 | if r == -1 or r == 1 or done: 295 | time_step += 1 296 | 297 | if time_step >= 5 or done: 298 | self.train(states, actions, rewards) 299 | self.sess.run(self.global_to_local) 300 | states, actions, rewards = [], [], [] 301 | time_step = 0 302 | 303 | self.print(total_reward) 304 | 305 | def run(self): 306 | while not self.coord.should_stop(): 307 | self.play_episode() 308 | 309 | def choose_action(self, state): 310 | """ 311 | Args: 312 | state (2-D array): (N, H, W, 1) 313 | """ 314 | state = np.reshape(state, [-1, *self.input_shape]) 315 | feed = { 316 | self.local.states: state 317 | } 318 | 319 | action = self.sess.run(self.local.action_prob, feed) 320 | action = np.squeeze(action) 321 | 322 | return np.random.choice(np.arange(self.output_dim) + 1, p=action) 323 | 324 | def train(self, states, actions, rewards): 325 | states = np.array(states) 326 | actions = np.array(actions) - 1 327 | rewards = np.array(rewards) 328 | 329 | feed = { 330 | self.local.states: states 331 | } 332 | 333 | values = self.sess.run(self.local.values, feed) 334 | 335 | rewards = discount_reward(rewards, gamma=0.99) 336 | rewards -= np.mean(rewards) 337 | rewards /= np.std(rewards) 338 | 339 | advantage = rewards - values 340 | advantage -= np.mean(advantage) 341 | advantage /= np.std(advantage) + 1e-8 342 | 343 | feed = { 344 | self.local.states: states, 345 | self.local.actions: actions, 346 | self.local.rewards: rewards, 347 | self.local.advantage: advantage 348 | } 349 | 350 | gradients = self.sess.run(self.local.gradients, feed) 351 | 352 | feed = [] 353 | for (grad, _), (placeholder, _) in zip(gradients, self.global_network.gradients_placeholders): 354 | feed.append((placeholder, grad)) 355 | 356 | feed = dict(feed) 357 | self.sess.run(self.global_network.apply_gradients, feed) 358 | 359 | 360 | def main(): 361 | try: 362 | tf.reset_default_graph() 363 | sess = tf.InteractiveSession() 364 | coord = tf.train.Coordinator() 365 | 366 | checkpoint_dir = "checkpoint" 367 | monitor_dir = "monitors" 368 | save_path = os.path.join(checkpoint_dir, "model.ckpt") 369 | 370 | if not os.path.exists(checkpoint_dir): 371 | 
os.makedirs(checkpoint_dir) 372 | print("Directory {} was created".format(checkpoint_dir)) 373 | 374 | n_threads = 16 375 | input_shape = [80, 80, 1] 376 | output_dim = 3 # {1, 2, 3} 377 | global_network = A3CNetwork(name="global", 378 | input_shape=input_shape, 379 | output_dim=output_dim) 380 | 381 | thread_list = [] 382 | env_list = [] 383 | 384 | for id in range(n_threads): 385 | env = gym.make("PongDeterministic-v4") 386 | 387 | if id == 0: 388 | env = gym.wrappers.Monitor(env, monitor_dir, force=True) 389 | 390 | single_agent = Agent(env=env, 391 | session=sess, 392 | coord=coord, 393 | name="thread_{}".format(id), 394 | global_network=global_network, 395 | input_shape=input_shape, 396 | output_dim=output_dim) 397 | thread_list.append(single_agent) 398 | env_list.append(env) 399 | 400 | if tf.train.get_checkpoint_state(os.path.dirname(save_path)): 401 | var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "global") 402 | saver = tf.train.Saver(var_list=var_list) 403 | saver.restore(sess, save_path) 404 | print("Model restored to global") 405 | 406 | else: 407 | init = tf.global_variables_initializer() 408 | sess.run(init) 409 | print("No model is found") 410 | 411 | for t in thread_list: 412 | t.start() 413 | 414 | print("Ctrl + C to close") 415 | coord.wait_for_stop() 416 | 417 | except KeyboardInterrupt: 418 | var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "global") 419 | saver = tf.train.Saver(var_list=var_list) 420 | saver.save(sess, save_path) 421 | print('Checkpoint Saved to {}'.format(save_path)) 422 | 423 | print("Closing threads") 424 | coord.request_stop() 425 | coord.join(thread_list) 426 | 427 | print("Closing environments") 428 | for env in env_list: 429 | env.close() 430 | 431 | sess.close() 432 | 433 | 434 | if __name__ == '__main__': 435 | main() 436 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Zero to All 2 | 3 | This is work in progress and it may have bugs. 4 | However, we call for your comments and pull requests. 5 | 6 | We emphasize on the following: 7 | 8 | * **Readiability** over anything else 9 | - That's why we choose Python 10 | * **Pythonic code** 11 | - PEP8 12 | - Docstring 13 | * **Use High Level Tensorflow API** 14 | - Cleaner and easier to understand 15 | * **KISS** 16 | - [Keep It Simple Stupid](https://www.techopedia.com/definition/20262/keep-it-simple-stupid-principle-kiss-principle) 17 | 18 | ## Lecture videos 19 | - [Youtube](https://www.youtube.com/playlist?list=PLlMkM4tgfjnKsCWav-Z2F-MMFRx-2gMGG) 20 | 21 | ## File naming rule 22 | 23 | ``` 24 | 99_9_description.py 25 | ``` 26 | - First two digits indicates a category of algorithms 27 | - 07: DQN 28 | - 08: Policy Gradient 29 | - 09: Random Search Methods 30 | - 10: Actor Critic 31 | - A second digit indicates an id 32 | - Description shows what the file is about 33 | 34 | 35 | ## How to use uploader 36 | It makes the uploading process a little bit simpler 37 | 38 | 1. Go to https://gym.openai.com/ 39 | 2. Login with your github account 40 | * https://gym.openai.com/users/YOUR_GITHUB_ACCOUNT 41 | 3. Copy your OpenAI api key from the upper right corner of your profile page 42 | ![user](assets/openai_user.jpg) 43 | 4. Modify `gym.ini` 44 | 5. 
In console 45 | ```bash 46 | #python gym_uploader.py /path/to/gym_results 47 | python gym_uploader.py gym-results/ 48 | ``` 49 | 50 | ## Install requirements 51 | ```bash 52 | pip install -r requirements.txt 53 | ``` 54 | 55 | ## Run test and autopep8 56 | TODO: Need to add more test cases 57 | 58 | ```bash 59 | pytest 60 | ``` 61 | 62 | ```bash 63 | # pip install autopep8 # if you haven't install 64 | autopep8 . --recursive --in-place --pep8-passes 2000 --verbose --ignore E501 65 | ``` 66 | 67 | ## Contributions/Comments 68 | We always welcome your comments and pull requests. 69 | -------------------------------------------------------------------------------- /assets/actor_critic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hunkim/ReinforcementZeroToAll/276e950a95c006666f1a34362dfd40ef4264ffbb/assets/actor_critic.png -------------------------------------------------------------------------------- /assets/openai_user.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hunkim/ReinforcementZeroToAll/276e950a95c006666f1a34362dfd40ef4264ffbb/assets/openai_user.jpg -------------------------------------------------------------------------------- /dqn.py: -------------------------------------------------------------------------------- 1 | """DQN Class 2 | 3 | DQN(NIPS-2013) 4 | "Playing Atari with Deep Reinforcement Learning" 5 | https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf 6 | 7 | DQN(Nature-2015) 8 | "Human-level control through deep reinforcement learning" 9 | http://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf 10 | """ 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | 15 | class DQN: 16 | 17 | def __init__(self, session: tf.Session, input_size: int, output_size: int, name: str="main") -> None: 18 | """DQN Agent can 19 | 20 | 1) Build network 21 | 2) Predict Q_value given state 22 | 3) Train parameters 23 | 24 | Args: 25 | session (tf.Session): Tensorflow session 26 | input_size (int): Input dimension 27 | output_size (int): Number of discrete actions 28 | name (str, optional): TF Graph will be built under this name scope 29 | """ 30 | self.session = session 31 | self.input_size = input_size 32 | self.output_size = output_size 33 | self.net_name = name 34 | 35 | self._build_network() 36 | 37 | def _build_network(self, h_size=16, l_rate=0.001) -> None: 38 | """DQN Network architecture (simple MLP) 39 | 40 | Args: 41 | h_size (int, optional): Hidden layer dimension 42 | l_rate (float, optional): Learning rate 43 | """ 44 | with tf.variable_scope(self.net_name): 45 | self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x") 46 | net = self._X 47 | 48 | net = tf.layers.dense(net, h_size, activation=tf.nn.relu) 49 | net = tf.layers.dense(net, self.output_size) 50 | self._Qpred = net 51 | 52 | self._Y = tf.placeholder(tf.float32, shape=[None, self.output_size]) 53 | self._loss = tf.losses.mean_squared_error(self._Y, self._Qpred) 54 | 55 | optimizer = tf.train.AdamOptimizer(learning_rate=l_rate) 56 | self._train = optimizer.minimize(self._loss) 57 | 58 | def predict(self, state: np.ndarray) -> np.ndarray: 59 | """Returns Q(s, a) 60 | 61 | Args: 62 | state (np.ndarray): State array, shape (n, input_dim) 63 | 64 | Returns: 65 | np.ndarray: Q value array, shape (n, output_dim) 66 | """ 67 | x = np.reshape(state, [-1, self.input_size]) 68 | return self.session.run(self._Qpred, 
feed_dict={self._X: x}) 69 | 70 | def update(self, x_stack: np.ndarray, y_stack: np.ndarray) -> list: 71 | """Performs updates on given X and y and returns a result 72 | 73 | Args: 74 | x_stack (np.ndarray): State array, shape (n, input_dim) 75 | y_stack (np.ndarray): Target Q array, shape (n, output_dim) 76 | 77 | Returns: 78 | list: First element is loss, second element is a result from train step 79 | """ 80 | feed = { 81 | self._X: x_stack, 82 | self._Y: y_stack 83 | } 84 | return self.session.run([self._loss, self._train], feed) 85 | -------------------------------------------------------------------------------- /gym.ini: -------------------------------------------------------------------------------- 1 | [default] 2 | # key is obtained from 3 | # https://gym.openai.com 4 | 5 | GYM_API_KEY=YOUR_API_KEY -------------------------------------------------------------------------------- /gym_uploader.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import configparser 3 | import gym 4 | 5 | 6 | def read_config(file='gym.ini'): 7 | parser = configparser.ConfigParser() 8 | parser.read(file) 9 | 10 | return parser 11 | 12 | 13 | def read_argparse(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('path', help='path to gym results') 16 | 17 | return parser.parse_args() 18 | 19 | 20 | def upload(gym_path, key): 21 | return gym.upload(gym_path, api_key=key) 22 | 23 | 24 | if __name__ == '__main__': 25 | config = read_config() 26 | args = read_argparse() 27 | key = config['default']['GYM_API_KEY'] 28 | 29 | if len(key) == 0 or key is None or key == "YOUR_API_KEY": 30 | print("Please enter the API key in gym.ini ") 31 | 32 | else: 33 | upload(args.path, key) 34 | -------------------------------------------------------------------------------- /mini_pacman.py: -------------------------------------------------------------------------------- 1 | # Simple pacman to avoid falling bombs 2 | # Original author: Jin Kim (golbin) https://github.com/golbin/TensorFlow-Tutorials 3 | import numpy as np 4 | import random 5 | 6 | import matplotlib.pyplot as plt 7 | import matplotlib.patches as patches 8 | 9 | 10 | class Gym: 11 | 12 | def __init__(self, screen_width=6, screen_height=10, show_game=True): 13 | self.screen_width = screen_width 14 | self.screen_height = screen_height 15 | self.road_width = (screen_width // 2) 16 | self.road_left = self.road_width // 2 + 1 17 | self.road_right = self.road_left + self.road_width - 1 18 | 19 | self.car = {"col": 0, "row": 2} 20 | self.block = [ 21 | {"col": 0, "row": 0, "speed": 1}, 22 | {"col": 0, "row": 0, "speed": 2}, 23 | ] 24 | 25 | self.total_reward = 0. 26 | self.current_reward = 0. 
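        # Added note: with the default 6x10 screen the playable road spans
        # columns road_left=2 .. road_right=4 (road_width=3), and
        # total_reward / total_game is shown as the running average reward
        # in draw_screen().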
27 | self.total_game = 0 28 | self.show_game = show_game 29 | 30 | if show_game: 31 | self.fig, self.axis = self.prepare_display() 32 | 33 | def prepare_display(self): 34 | fig, axis = plt.subplots(figsize=(4, 6)) 35 | fig.set_size_inches(4, 6) 36 | fig.canvas.mpl_connect('close_event', exit) 37 | plt.axis((0, self.screen_width, 0, self.screen_height)) 38 | plt.tick_params(top='off', right='off', 39 | left='off', labelleft='off', 40 | bottom='off', labelbottom='off') 41 | 42 | plt.draw() 43 | plt.ion() 44 | plt.show() 45 | 46 | return fig, axis 47 | 48 | def action_space_sample(self): 49 | return random.randint(0, 2) 50 | 51 | def get_state(self): 52 | state = np.zeros((self.screen_width, self.screen_height)) 53 | 54 | state[self.car["col"], self.car["row"]] = 1 55 | 56 | if self.block[0]["row"] < self.screen_height: 57 | state[self.block[0]["col"], self.block[0]["row"]] = 1 58 | 59 | if self.block[1]["row"] < self.screen_height: 60 | state[self.block[1]["col"], self.block[1]["row"]] = 1 61 | 62 | return state.reshape((-1, self.screen_width * self.screen_height)) 63 | 64 | def draw_screen(self): 65 | title = " Avg. Reward: %d Reward: %d Total Game: %d" % ( 66 | self.total_reward / self.total_game, 67 | self.current_reward, 68 | self.total_game) 69 | 70 | self.axis.clear() 71 | self.axis.set_title(title, fontsize=12) 72 | 73 | road = patches.Rectangle((self.road_left - 1, 0), self.road_width + 1, 74 | self.screen_height, linewidth=0, facecolor="#333333") 75 | 76 | if self._is_gameover(): 77 | car_color = "#FF0000" 78 | else: 79 | car_color = "#00FF00" 80 | car = patches.Wedge((self.car["col"] - 0.5, self.car["row"] - 0.5), 81 | 0.5, 20, 340, linewidth=0, facecolor=car_color) 82 | block1 = patches.Circle((self.block[0][ 83 | "col"] - 0.5, self.block[0]["row"]), 0.5, linewidth=0, facecolor="#0099FF") 84 | block2 = patches.Circle((self.block[1][ 85 | "col"] - 0.5, self.block[1]["row"]), 0.5, linewidth=0, facecolor="#EB70AA") 86 | 87 | self.axis.add_patch(road) 88 | self.axis.add_patch(car) 89 | self.axis.add_patch(block1) 90 | self.axis.add_patch(block2) 91 | 92 | self.fig.canvas.draw() 93 | plt.pause(0.0001) 94 | 95 | def reset(self): 96 | self.current_reward = 0 97 | self.total_game += 1 98 | 99 | self.car["col"] = int(self.screen_width / 2) 100 | 101 | self.block[0]["col"] = random.randrange( 102 | self.road_left, self.road_right + 1) 103 | self.block[0]["row"] = 0 104 | self.block[1]["col"] = random.randrange( 105 | self.road_left, self.road_right + 1) 106 | self.block[1]["row"] = 0 107 | 108 | self.update_block() 109 | return self.get_state() 110 | 111 | def update_car(self, move): 112 | self.car["col"] = max(self.road_left, self.car["col"] + move) 113 | self.car["col"] = min(self.car["col"], self.road_right) 114 | 115 | def update_block(self): 116 | reward = 0 117 | 118 | if self.block[0]["row"] > 0: 119 | self.block[0]["row"] -= self.block[0]["speed"] 120 | else: 121 | self.block[0]["col"] = random.randrange( 122 | self.road_left, self.road_right + 1) 123 | self.block[0]["row"] = self.screen_height 124 | reward += 1 125 | 126 | if self.block[1]["row"] > 0: 127 | self.block[1]["row"] -= self.block[1]["speed"] 128 | else: 129 | self.block[1]["col"] = random.randrange( 130 | self.road_left, self.road_right + 1) 131 | self.block[1]["row"] = self.screen_height 132 | reward += 1 133 | 134 | return reward 135 | 136 | def _is_gameover(self): 137 | if ((self.car["col"] == self.block[0]["col"] and 138 | self.car["row"] == self.block[0]["row"]) or 139 | (self.car["col"] == self.block[1]["col"] and 
140 | self.car["row"] == self.block[1]["row"])): 141 | 142 | self.total_reward += self.current_reward 143 | 144 | return True 145 | else: 146 | return False 147 | 148 | def step(self, action): 149 | # action: 0: left, 1: stay, 2: right 150 | self.update_car(action - 1) 151 | escape_reward = self.update_block() 152 | stable_reward = 1. / self.screen_height if action == 1 else 0 153 | gameover = self._is_gameover() 154 | stat = self.get_state() 155 | 156 | if gameover: 157 | reward = -2 158 | else: 159 | reward = escape_reward + stable_reward 160 | self.current_reward += reward 161 | 162 | if self.show_game: 163 | self.draw_screen() 164 | 165 | return stat, reward, gameover, None 166 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.7.3 2 | matplotlib==2.0.0 3 | numpy==1.12.0 4 | readchar==0.7 5 | tensorflow==1.0.0 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hunkim/ReinforcementZeroToAll/276e950a95c006666f1a34362dfd40ef4264ffbb/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_DQN.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from dqn import DQN 4 | 5 | 6 | class TestDQN: 7 | def setup_method(self, method): 8 | print("Session open") 9 | tf.reset_default_graph() 10 | self.sess = tf.Session() 11 | 12 | def teardown_method(self, method): 13 | print("Session close") 14 | self.sess.close() 15 | 16 | def test_one_agent(self): 17 | agent = DQN(self.sess, 4, 2) 18 | assert isinstance(agent, DQN) is True 19 | 20 | assert hasattr(agent, "_X") 21 | assert hasattr(agent, "_Y") 22 | assert hasattr(agent, "_loss") 23 | assert hasattr(agent, "_train") 24 | 25 | def run_init(self): 26 | init = tf.global_variables_initializer() 27 | self.sess.run(init) 28 | 29 | def test_agent_can_take_observation(self): 30 | obs = np.zeros([1, 4]) 31 | agent = DQN(self.sess, 4, 2) 32 | self.run_init() 33 | output = agent.predict(obs) 34 | np.testing.assert_almost_equal(output, [[0, 0]]) 35 | 36 | obs = np.zeros([4, ]) 37 | output = agent.predict(obs) 38 | np.testing.assert_almost_equal(output, [[0, 0]]) 39 | 40 | obs = np.zeros([32, 4]) 41 | output = agent.predict(obs) 42 | np.testing.assert_almost_equal(output, [[0, 0] for _ in range(32)]) 43 | 44 | def test_agent_can_run_update(self): 45 | x_stack = np.zeros([32, 4]) 46 | y_stack = np.zeros([32, 2]) 47 | 48 | agent = DQN(self.sess, 4, 2) 49 | self.run_init() 50 | 51 | output = agent.update(x_stack, y_stack) 52 | assert output[0] == 0 53 | 54 | x_stack = np.zeros([1, 4]) 55 | y_stack = np.zeros([1, 2]) 56 | 57 | output = agent.update(x_stack, y_stack) 58 | assert output[0] == 0 59 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hunkim/ReinforcementZeroToAll/276e950a95c006666f1a34362dfd40ef4264ffbb/utils/__init__.py -------------------------------------------------------------------------------- /utils/prints.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for console printing
3 | """ 4 | import os 5 | import time 6 | 7 | 8 | def clear_screen() -> None: 9 | """Clear terminal console""" 10 | os.system("cls" if os.name == "nt" else "clear") 11 | 12 | 13 | def print_result(score: float) -> None: 14 | """Prints GOAL if score is positive else DEAD""" 15 | message = "GOAL" if score > 0 else "DEAD" 16 | print("=" * 50) 17 | print("{:^50}".format(message)) 18 | print("=" * 50) 19 | time.sleep(3) 20 | --------------------------------------------------------------------------------