├── DDQN ├── ddqn_agent.py ├── deep_q_network.py ├── main_ddqn.py ├── models │ ├── PongNoFrameskip-v4_DDQNAgent_q_eval │ └── PongNoFrameskip-v4_DDQNAgent_q_next ├── plots │ └── DDQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png ├── replay_memory.py ├── tf2 │ ├── agent.py │ ├── main.py │ ├── network.py │ ├── replay_memory.py │ └── utils.py └── utils.py ├── DQN ├── deep_q_network.py ├── dqn_agent.py ├── main_dqn.py ├── models │ ├── PongNoFrameskip-v4_DQNAgent_q_eval │ └── PongNoFrameskip-v4_DQNAgent_q_next ├── plots │ └── DQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png ├── preprocess_pseudocode ├── replay_memory.py ├── tf2 │ ├── agent.py │ ├── main.py │ ├── network.py │ ├── replay_memory.py │ └── utils.py └── utils.py ├── DuelingDDQN ├── deep_q_network.py ├── dueling_ddqn_agent.py ├── main_dueling_ddqn.py ├── models │ ├── PongNoFrameskip-v4_DuelingDDQNAgent_q_eval │ └── PongNoFrameskip-v4_DuelingDDQNAgent_q_next ├── plots │ └── DuelingDDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png ├── replay_memory.py ├── tf2 │ ├── agent.py │ ├── main.py │ ├── network.py │ ├── replay_memory.py │ └── utils.py └── utils.py ├── DuelingDQN ├── deep_q_network.py ├── dueling_dqn_agent.py ├── main_dueling_dqn.py ├── models │ ├── PongNoFrameskip-v4_DuelingDQNAgent_q_eval │ └── PongNoFrameskip-v4_DuelingDQNAgent_q_next ├── plots │ └── DuelingDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png ├── replay_memory.py ├── tf2 │ ├── agent.py │ ├── main.py │ ├── network.py │ ├── replay_memory.py │ └── utils.py └── utils.py ├── LICENSE ├── README.md ├── agents.py ├── argparse_example.py ├── deep_q_network.py ├── main.py ├── naive_deep_q_learning ├── cartpole_naive_dqn.png ├── cartpole_naive_dqn.py ├── pytorch_example.py └── util.py ├── q_learning ├── frozen_lake_deterministic_policy.py ├── frozen_lake_env_test.py ├── frozen_lake_q_learning.py ├── frozen_lake_random_agent.py ├── plots │ ├── frozen_lake_deterministic_policy.png │ ├── frozen_lake_q_learning_agent.png │ └── frozen_lake_random_policy.png ├── q_learning_agent.py └── q_network.py ├── replay_memory.py └── utils.py /DDQN/ddqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from deep_q_network import DeepQNetwork 4 | from replay_memory import ReplayBuffer 5 | 6 | class DDQNAgent(object): 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 10 | self.gamma = gamma 11 | self.epsilon = epsilon 12 | self.lr = lr 13 | self.n_actions = n_actions 14 | self.input_dims = input_dims 15 | self.batch_size = batch_size 16 | self.eps_min = eps_min 17 | self.eps_dec = eps_dec 18 | self.replace_target_cnt = replace 19 | self.algo = algo 20 | self.env_name = env_name 21 | self.chkpt_dir = chkpt_dir 22 | self.action_space = [i for i in range(n_actions)] 23 | self.learn_step_counter = 0 24 | 25 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 26 | 27 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 28 | input_dims=self.input_dims, 29 | name=self.env_name+'_'+self.algo+'_q_eval', 30 | chkpt_dir=self.chkpt_dir) 31 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 32 | input_dims=self.input_dims, 33 | name=self.env_name+'_'+self.algo+'_q_next', 34 | chkpt_dir=self.chkpt_dir) 35 | 36 | def store_transition(self, state, action, reward, state_, done): 37 | self.memory.store_transition(state, action, reward, state_, done) 38 | 39 | def 
sample_memory(self): 40 | state, action, reward, new_state, done = \ 41 | self.memory.sample_buffer(self.batch_size) 42 | 43 | states = T.tensor(state).to(self.q_eval.device) 44 | rewards = T.tensor(reward).to(self.q_eval.device) 45 | dones = T.tensor(done).to(self.q_eval.device) 46 | actions = T.tensor(action).to(self.q_eval.device) 47 | states_ = T.tensor(new_state).to(self.q_eval.device) 48 | 49 | return states, actions, rewards, states_, dones 50 | 51 | def choose_action(self, observation): 52 | if np.random.random() > self.epsilon: 53 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 54 | actions = self.q_eval.forward(state) 55 | action = T.argmax(actions).item() 56 | else: 57 | action = np.random.choice(self.action_space) 58 | 59 | return action 60 | 61 | def replace_target_network(self): 62 | if self.replace_target_cnt is not None and \ 63 | self.learn_step_counter % self.replace_target_cnt == 0: 64 | self.q_next.load_state_dict(self.q_eval.state_dict()) 65 | 66 | def decrement_epsilon(self): 67 | self.epsilon = self.epsilon - self.eps_dec \ 68 | if self.epsilon > self.eps_min else self.eps_min 69 | 70 | def learn(self): 71 | if self.memory.mem_cntr < self.batch_size: 72 | return 73 | 74 | self.q_eval.optimizer.zero_grad() 75 | 76 | self.replace_target_network() 77 | 78 | states, actions, rewards, states_, dones = self.sample_memory() 79 | 80 | indices = np.arange(self.batch_size) 81 | 82 | q_pred = self.q_eval.forward(states)[indices, actions] 83 | q_next = self.q_next.forward(states_) 84 | q_eval = self.q_eval.forward(states_) 85 | 86 | max_actions = T.argmax(q_eval, dim=1) 87 | q_next[dones] = 0.0 88 | 89 | q_target = rewards + self.gamma*q_next[indices, max_actions] 90 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 91 | loss.backward() 92 | 93 | self.q_eval.optimizer.step() 94 | self.learn_step_counter += 1 95 | 96 | self.decrement_epsilon() 97 | 98 | def save_models(self): 99 | self.q_eval.save_checkpoint() 100 | self.q_next.save_checkpoint() 101 | 102 | def load_models(self): 103 | self.q_eval.load_checkpoint() 104 | self.q_next.load_checkpoint() 105 | -------------------------------------------------------------------------------- /DDQN/deep_q_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class DeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 15 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 16 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 17 | 18 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 19 | 20 | self.fc1 = nn.Linear(fc_input_dims, 512) 21 | self.fc2 = nn.Linear(512, n_actions) 22 | 23 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 24 | 25 | self.loss = nn.MSELoss() 26 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 27 | self.to(self.device) 28 | 29 | def calculate_conv_output_dims(self, input_dims): 30 | state = T.zeros(1, *input_dims) 31 | dims = self.conv1(state) 32 | dims = self.conv2(dims) 33 | dims = self.conv3(dims) 34 | return int(np.prod(dims.size())) 35 | 36 | def forward(self, state): 37 | conv1 = 
F.relu(self.conv1(state)) 38 | conv2 = F.relu(self.conv2(conv1)) 39 | conv3 = F.relu(self.conv3(conv2)) 40 | conv_state = conv3.view(conv3.size()[0], -1) 41 | 42 | flat1 = F.relu(self.fc1(conv_state)) 43 | actions = self.fc2(flat1) 44 | 45 | return actions 46 | 47 | def save_checkpoint(self): 48 | print('... saving checkpoint ...') 49 | T.save(self.state_dict(), self.checkpoint_file) 50 | 51 | def load_checkpoint(self): 52 | print('... loading checkpoint ...') 53 | self.load_state_dict(T.load(self.checkpoint_file)) 54 | -------------------------------------------------------------------------------- /DDQN/main_ddqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from ddqn_agent import DDQNAgent 4 | from utils import plot_learning_curve, make_env 5 | 6 | if __name__ == '__main__': 7 | env = make_env('PongNoFrameskip-v4') 8 | best_score = -np.inf 9 | load_checkpoint = False 10 | n_games = 100 11 | agent = DDQNAgent(gamma=0.99, epsilon=1.0, lr=0.0001, 12 | input_dims=(env.observation_space.shape), 13 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 14 | batch_size=32, replace=10000, eps_dec=1e-5, 15 | chkpt_dir='models/', algo='DDQNAgent', 16 | env_name='PongNoFrameskip-v4') 17 | 18 | if load_checkpoint: 19 | agent.load_models() 20 | 21 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) +'_' \ 22 | + str(n_games) + 'games' 23 | figure_file = 'plots/' + fname + '.png' 24 | 25 | n_steps = 0 26 | scores, eps_history, steps_array = [], [], [] 27 | 28 | for i in range(n_games): 29 | done = False 30 | observation = env.reset() 31 | 32 | score = 0 33 | while not done: 34 | action = agent.choose_action(observation) 35 | observation_, reward, done, info = env.step(action) 36 | score += reward 37 | 38 | if not load_checkpoint: 39 | agent.store_transition(observation, action, 40 | reward, observation_, int(done)) 41 | agent.learn() 42 | observation = observation_ 43 | n_steps += 1 44 | scores.append(score) 45 | steps_array.append(n_steps) 46 | 47 | avg_score = np.mean(scores[-100:]) 48 | print('episode: ', i,'score: ', score, 49 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 50 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 51 | 52 | if avg_score > best_score: 53 | #if not load_checkpoint: 54 | # agent.save_models() 55 | best_score = avg_score 56 | 57 | eps_history.append(agent.epsilon) 58 | if load_checkpoint and n_steps >= 18000: 59 | break 60 | 61 | x = [i+1 for i in range(len(scores))] 62 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 63 | -------------------------------------------------------------------------------- /DDQN/models/PongNoFrameskip-v4_DDQNAgent_q_eval: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DDQN/models/PongNoFrameskip-v4_DDQNAgent_q_eval -------------------------------------------------------------------------------- /DDQN/models/PongNoFrameskip-v4_DDQNAgent_q_next: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DDQN/models/PongNoFrameskip-v4_DDQNAgent_q_next -------------------------------------------------------------------------------- /DDQN/plots/DDQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DDQN/plots/DDQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png -------------------------------------------------------------------------------- /DDQN/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DDQN/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow.keras as keras 4 | from tensorflow.keras.optimizers import Adam 5 | from network import DeepQNetwork 6 | from replay_memory import ReplayBuffer 7 | 8 | 9 | class Agent: 10 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 11 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 12 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/ddqn'): 13 | self.gamma = gamma 14 | self.epsilon = epsilon 15 | self.lr = lr 16 | self.n_actions = n_actions 17 | self.input_dims = input_dims 18 | self.batch_size = batch_size 19 | self.eps_min = eps_min 20 | self.eps_dec = eps_dec 21 | self.replace_target_cnt = replace 22 | self.algo = algo 23 | self.env_name = env_name 24 | self.chkpt_dir = chkpt_dir 25 | self.action_space = [i for i in range(n_actions)] 26 | self.learn_step_counter = 0 27 | self.fname = self.chkpt_dir + self.env_name + '_' + self.algo + '_' 28 | 29 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 30 | 31 | self.q_eval = DeepQNetwork(input_dims, n_actions) 32 | self.q_eval.compile(optimizer=Adam(learning_rate=lr)) 33 | self.q_next = DeepQNetwork(input_dims, n_actions) 34 | self.q_next.compile(optimizer=Adam(learning_rate=lr)) 35 | 36 | def save_models(self): 37 | self.q_eval.save(self.fname+'q_eval') 38 | self.q_next.save(self.fname+'q_next') 39 | print('... models saved successfully ...') 40 | 41 | def load_models(self): 42 | self.q_eval = keras.models.load_model(self.fname+'q_eval') 43 | self.q_next = keras.models.load_model(self.fname+'q_next') 44 | print('... 
models loaded successfully ...') 45 | 46 | def store_transition(self, state, action, reward, state_, done): 47 | self.memory.store_transition(state, action, reward, state_, done) 48 | 49 | def sample_memory(self): 50 | state, action, reward, new_state, done = \ 51 | self.memory.sample_buffer(self.batch_size) 52 | states = tf.convert_to_tensor(state) 53 | rewards = tf.convert_to_tensor(reward) 54 | dones = tf.convert_to_tensor(done) 55 | actions = tf.convert_to_tensor(action, dtype=tf.int32) 56 | states_ = tf.convert_to_tensor(new_state) 57 | return states, actions, rewards, states_, dones 58 | 59 | def choose_action(self, observation): 60 | if np.random.random() > self.epsilon: 61 | state = tf.convert_to_tensor([observation]) 62 | actions = self.q_eval(state) 63 | action = tf.math.argmax(actions, axis=1).numpy()[0] 64 | else: 65 | action = np.random.choice(self.action_space) 66 | return action 67 | 68 | def replace_target_network(self): 69 | if self.learn_step_counter % self.replace_target_cnt == 0: 70 | self.q_next.set_weights(self.q_eval.get_weights()) 71 | 72 | def decrement_epsilon(self): 73 | self.epsilon = self.epsilon - self.eps_dec \ 74 | if self.epsilon > self.eps_min else self.eps_min 75 | 76 | def learn(self): 77 | if self.memory.mem_cntr < self.batch_size: 78 | return 79 | 80 | self.replace_target_network() 81 | 82 | states, actions, rewards, states_, dones = self.sample_memory() 83 | 84 | indices = tf.range(self.batch_size, dtype=tf.int32) 85 | action_indices = tf.stack([indices, actions], axis=1) 86 | 87 | with tf.GradientTape() as tape: 88 | q_pred = tf.gather_nd(self.q_eval(states), indices=action_indices) 89 | q_next = self.q_next(states_) 90 | q_eval = self.q_eval(states_) 91 | 92 | max_actions = tf.math.argmax(q_eval, axis=1, output_type=tf.int32) 93 | max_action_idx = tf.stack([indices, max_actions], axis=1) 94 | 95 | q_target = rewards + \ 96 | self.gamma*tf.gather_nd(q_next, indices=max_action_idx) *\ 97 | (1 - dones.numpy()) 98 | 99 | loss = keras.losses.MSE(q_pred, q_target) 100 | 101 | params = self.q_eval.trainable_variables 102 | grads = tape.gradient(loss, params) 103 | 104 | self.q_eval.optimizer.apply_gradients(zip(grads, params)) 105 | 106 | self.learn_step_counter += 1 107 | 108 | self.decrement_epsilon() 109 | -------------------------------------------------------------------------------- /DDQN/tf2/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 | from utils import plot_learning_curve, make_env, manage_memory 4 | from gym import wrappers 5 | 6 | if __name__ == '__main__': 7 | manage_memory() 8 | env = make_env('PongNoFrameskip-v4') 9 | best_score = -np.inf 10 | load_checkpoint = False 11 | record_agent = False 12 | n_games = 250 13 | agent = Agent(gamma=0.99, epsilon=1, lr=0.0001, 14 | input_dims=(env.observation_space.shape), 15 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 16 | batch_size=32, replace=1000, eps_dec=1e-5, 17 | chkpt_dir='models/', algo='DQNAgent', 18 | env_name='PongNoFrameskip-v4') 19 | if load_checkpoint: 20 | agent.load_models() 21 | agent.epsilon = agent.eps_min 22 | 23 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 24 | + str(n_games) + 'games' 25 | figure_file = 'plots/' + fname + '.png' 26 | # if you want to record video of your agent playing, do a 27 | # mkdir video 28 | if record_agent: 29 | env = wrappers.Monitor(env, "video", 30 | video_callable=lambda episode_id: True, 31 | force=True) 32 | 
n_steps = 0 33 | scores, eps_history, steps_array = [], [], [] 34 | 35 | for i in range(n_games): 36 | done = False 37 | observation = env.reset() 38 | 39 | score = 0 40 | while not done: 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | score += reward 44 | 45 | if not load_checkpoint: 46 | agent.store_transition(observation, action, 47 | reward, observation_, done) 48 | agent.learn() 49 | observation = observation_ 50 | n_steps += 1 51 | scores.append(score) 52 | steps_array.append(n_steps) 53 | 54 | avg_score = np.mean(scores[-100:]) 55 | print('episode {} score {:.1f} avg score {:.1f} ' 56 | 'best score {:.1f} epsilon {:.2f} steps {}'. 57 | format(i, score, avg_score, best_score, agent.epsilon, 58 | n_steps)) 59 | 60 | if score > best_score: 61 | if not load_checkpoint: 62 | agent.save_models() 63 | best_score = score 64 | 65 | eps_history.append(agent.epsilon) 66 | 67 | x = [i+1 for i in range(len(scores))] 68 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 69 | -------------------------------------------------------------------------------- /DDQN/tf2/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Conv2D, Dense, Flatten 3 | 4 | 5 | class DeepQNetwork(keras.Model): 6 | def __init__(self, input_dims, n_actions): 7 | super(DeepQNetwork, self).__init__() 8 | self.conv1 = Conv2D(32, 8, strides=(4, 4), activation='relu', 9 | data_format='channels_first', 10 | input_shape=input_dims) 11 | self.conv2 = Conv2D(64, 4, strides=(2, 2), activation='relu', 12 | data_format='channels_first') 13 | self.conv3 = Conv2D(64, 3, strides=(1, 1), activation='relu', 14 | data_format='channels_first') 15 | self.flat = Flatten() 16 | self.fc1 = Dense(512, activation='relu') 17 | self.fc2 = Dense(n_actions, activation=None) 18 | 19 | def call(self, state): 20 | x = self.conv1(state) 21 | x = self.conv2(x) 22 | x = self.conv3(x) 23 | x = self.flat(x) 24 | x = self.fc1(x) 25 | x = self.fc2(x) 26 | 27 | return x 28 | -------------------------------------------------------------------------------- /DDQN/tf2/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | 
terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DDQN/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | import tensorflow as tf 7 | 8 | 9 | def manage_memory(): 10 | gpus = tf.config.list_physical_devices('GPU') 11 | if gpus: 12 | try: 13 | for gpu in gpus: 14 | tf.config.experimental.set_memory_growth(gpu, True) 15 | except RuntimeError as e: 16 | print(e) 17 | 18 | 19 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 20 | fig=plt.figure() 21 | ax=fig.add_subplot(111, label="1") 22 | ax2=fig.add_subplot(111, label="2", frame_on=False) 23 | 24 | ax.plot(x, epsilons, color="C0") 25 | ax.set_xlabel("Training Steps", color="C0") 26 | ax.set_ylabel("Epsilon", color="C0") 27 | ax.tick_params(axis='x', colors="C0") 28 | ax.tick_params(axis='y', colors="C0") 29 | 30 | N = len(scores) 31 | running_avg = np.empty(N) 32 | for t in range(N): 33 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 34 | 35 | ax2.scatter(x, running_avg, color="C1") 36 | ax2.axes.get_xaxis().set_visible(False) 37 | ax2.yaxis.tick_right() 38 | ax2.set_ylabel('Score', color="C1") 39 | ax2.yaxis.set_label_position('right') 40 | ax2.tick_params(axis='y', colors="C1") 41 | 42 | if lines is not None: 43 | for line in lines: 44 | plt.axvline(x=line) 45 | 46 | plt.savefig(filename) 47 | 48 | class RepeatActionAndMaxFrame(gym.Wrapper): 49 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 50 | fire_first=False): 51 | super(RepeatActionAndMaxFrame, self).__init__(env) 52 | self.repeat = repeat 53 | self.shape = env.observation_space.low.shape 54 | self.frame_buffer = np.zeros_like((2, self.shape)) 55 | self.clip_reward = clip_reward 56 | self.no_ops = no_ops 57 | self.fire_first = fire_first 58 | 59 | def step(self, action): 60 | t_reward = 0.0 61 | done = False 62 | for i in range(self.repeat): 63 | obs, reward, done, info = self.env.step(action) 64 | if self.clip_reward: 65 | reward = np.clip(np.array([reward]), -1, 1)[0] 66 | t_reward += reward 67 | idx = i % 2 68 | self.frame_buffer[idx] = obs 69 | if done: 70 | break 71 | 72 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 73 | return max_frame, t_reward, done, info 74 | 75 | def reset(self): 76 | obs = self.env.reset() 77 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 78 | for _ in range(no_ops): 79 | _, _, done, _ = self.env.step(0) 80 | if done: 81 | self.env.reset() 82 | if self.fire_first: 83 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 84 | obs, _, _, _ = self.env.step(1) 85 | 86 | self.frame_buffer = np.zeros_like((2,self.shape)) 87 | self.frame_buffer[0] = obs 88 | 89 | return obs 90 | 91 | class PreprocessFrame(gym.ObservationWrapper): 92 | def __init__(self, shape, env=None): 93 | super(PreprocessFrame, self).__init__(env) 94 | self.shape = (shape[2], shape[0], shape[1]) 95 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 96 | shape=self.shape, dtype=np.float32) 97 | 98 | def observation(self, obs): 99 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 100 | resized_screen = cv2.resize(new_frame, self.shape[1:], 101 | interpolation=cv2.INTER_AREA) 102 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 103 | new_obs = new_obs / 255.0 104 | 
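# Shape flow in observation() above, with the default shape=(84, 84, 1): the raw
# RGB Atari frame is converted to grayscale, resized to 84x84 with cv2.INTER_AREA,
# reshaped to the channels-first (1, 84, 84) layout declared in
# self.observation_space, and scaled from [0, 255] down to [0, 1] before being
# returned below.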
105 | return new_obs 106 | 107 | class StackFrames(gym.ObservationWrapper): 108 | def __init__(self, env, repeat): 109 | super(StackFrames, self).__init__(env) 110 | self.observation_space = gym.spaces.Box( 111 | env.observation_space.low.repeat(repeat, axis=0), 112 | env.observation_space.high.repeat(repeat, axis=0), 113 | dtype=np.float32) 114 | self.stack = collections.deque(maxlen=repeat) 115 | 116 | def reset(self): 117 | self.stack.clear() 118 | observation = self.env.reset() 119 | for _ in range(self.stack.maxlen): 120 | self.stack.append(observation) 121 | 122 | return np.array(self.stack).reshape(self.observation_space.low.shape) 123 | 124 | def observation(self, observation): 125 | self.stack.append(observation) 126 | 127 | return np.array(self.stack).reshape(self.observation_space.low.shape) 128 | 129 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 130 | no_ops=0, fire_first=False): 131 | env = gym.make(env_name) 132 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 133 | env = PreprocessFrame(shape, env) 134 | env = StackFrames(env, repeat) 135 | 136 | return env 137 | -------------------------------------------------------------------------------- /DDQN/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | """ modified from: 38 | https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py 39 | """ 40 | def __init__(self, env=None, repeat=4): 41 | super(RepeatActionAndMaxFrame, self).__init__(env) 42 | self.repeat = repeat 43 | self.shape = env.observation_space.low.shape 44 | self.frame_buffer = np.zeros_like((2,self.shape)) 45 | 46 | def step(self, action): 47 | t_reward = 0.0 48 | done = False 49 | for i in range(self.repeat): 50 | obs, reward, done, info = self.env.step(action) 51 | t_reward += reward 52 | idx = i % 2 53 | self.frame_buffer[idx] = obs 54 | if done: 55 | break 56 | 57 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 58 | return max_frame, t_reward, done, info 59 | 60 | def reset(self): 61 | obs = self.env.reset() 62 | self.frame_buffer = np.zeros_like((2,self.shape)) 63 | self.frame_buffer[0] = obs 64 | return obs 65 | 66 | class PreprocessFrame(gym.ObservationWrapper): 67 | def __init__(self, shape, env=None): 68 | super(PreprocessFrame, self).__init__(env) 69 | self.shape=(shape[2], shape[0], shape[1]) 70 | self.observation_space = 
gym.spaces.Box(low=0, high=1.0, 71 | shape=self.shape,dtype=np.float32) 72 | def observation(self, obs): 73 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 74 | resized_screen = cv2.resize(new_frame, self.shape[1:], 75 | interpolation=cv2.INTER_AREA) 76 | 77 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 78 | new_obs = np.swapaxes(new_obs, 2,0) 79 | new_obs = new_obs / 255.0 80 | return new_obs 81 | 82 | class StackFrames(gym.ObservationWrapper): 83 | def __init__(self, env, n_steps): 84 | super(StackFrames, self).__init__(env) 85 | self.observation_space = gym.spaces.Box( 86 | env.observation_space.low.repeat(n_steps, axis=0), 87 | env.observation_space.high.repeat(n_steps, axis=0), 88 | dtype=np.float32) 89 | self.stack = collections.deque(maxlen=n_steps) 90 | 91 | def reset(self): 92 | self.stack.clear() 93 | observation = self.env.reset() 94 | for _ in range(self.stack.maxlen): 95 | self.stack.append(observation) 96 | 97 | return np.array(self.stack).reshape(self.observation_space.low.shape) 98 | 99 | def observation(self, observation): 100 | self.stack.append(observation) 101 | obs = np.array(self.stack).reshape(self.observation_space.low.shape) 102 | 103 | return obs 104 | 105 | def make_env(env_name, shape=(84,84,1), skip=4): 106 | env = gym.make(env_name) 107 | env = RepeatActionAndMaxFrame(env, skip) 108 | env = PreprocessFrame(shape, env) 109 | env = StackFrames(env, skip) 110 | 111 | return env 112 | -------------------------------------------------------------------------------- /DQN/deep_q_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class DeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 15 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 16 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 17 | 18 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 19 | 20 | self.fc1 = nn.Linear(fc_input_dims, 512) 21 | self.fc2 = nn.Linear(512, n_actions) 22 | 23 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 24 | 25 | self.loss = nn.MSELoss() 26 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 27 | self.to(self.device) 28 | 29 | def calculate_conv_output_dims(self, input_dims): 30 | state = T.zeros(1, *input_dims) 31 | dims = self.conv1(state) 32 | dims = self.conv2(dims) 33 | dims = self.conv3(dims) 34 | return int(np.prod(dims.size())) 35 | 36 | def forward(self, state): 37 | conv1 = F.relu(self.conv1(state)) 38 | conv2 = F.relu(self.conv2(conv1)) 39 | conv3 = F.relu(self.conv3(conv2)) 40 | # conv3 shape is BS x n_filters x H x W 41 | conv_state = conv3.view(conv3.size()[0], -1) 42 | # conv_state shape is BS x (n_filters * H * W) 43 | flat1 = F.relu(self.fc1(conv_state)) 44 | actions = self.fc2(flat1) 45 | 46 | return actions 47 | 48 | def save_checkpoint(self): 49 | print('... saving checkpoint ...') 50 | T.save(self.state_dict(), self.checkpoint_file) 51 | 52 | def load_checkpoint(self): 53 | print('... 
loading checkpoint ...') 54 | self.load_state_dict(T.load(self.checkpoint_file)) 55 | -------------------------------------------------------------------------------- /DQN/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from deep_q_network import DeepQNetwork 4 | from replay_memory import ReplayBuffer 5 | 6 | class DQNAgent(object): 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 10 | self.gamma = gamma 11 | self.epsilon = epsilon 12 | self.lr = lr 13 | self.n_actions = n_actions 14 | self.input_dims = input_dims 15 | self.batch_size = batch_size 16 | self.eps_min = eps_min 17 | self.eps_dec = eps_dec 18 | self.replace_target_cnt = replace 19 | self.algo = algo 20 | self.env_name = env_name 21 | self.chkpt_dir = chkpt_dir 22 | self.action_space = [i for i in range(n_actions)] 23 | self.learn_step_counter = 0 24 | 25 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 26 | 27 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 28 | input_dims=self.input_dims, 29 | name=self.env_name+'_'+self.algo+'_q_eval', 30 | chkpt_dir=self.chkpt_dir) 31 | 32 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 33 | input_dims=self.input_dims, 34 | name=self.env_name+'_'+self.algo+'_q_next', 35 | chkpt_dir=self.chkpt_dir) 36 | 37 | def choose_action(self, observation): 38 | if np.random.random() > self.epsilon: 39 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 40 | actions = self.q_eval.forward(state) 41 | action = T.argmax(actions).item() 42 | else: 43 | action = np.random.choice(self.action_space) 44 | 45 | return action 46 | 47 | def store_transition(self, state, action, reward, state_, done): 48 | self.memory.store_transition(state, action, reward, state_, done) 49 | 50 | def sample_memory(self): 51 | state, action, reward, new_state, done = \ 52 | self.memory.sample_buffer(self.batch_size) 53 | 54 | states = T.tensor(state).to(self.q_eval.device) 55 | rewards = T.tensor(reward).to(self.q_eval.device) 56 | dones = T.tensor(done).to(self.q_eval.device) 57 | actions = T.tensor(action).to(self.q_eval.device) 58 | states_ = T.tensor(new_state).to(self.q_eval.device) 59 | 60 | return states, actions, rewards, states_, dones 61 | 62 | def replace_target_network(self): 63 | if self.learn_step_counter % self.replace_target_cnt == 0: 64 | self.q_next.load_state_dict(self.q_eval.state_dict()) 65 | 66 | def decrement_epsilon(self): 67 | self.epsilon = self.epsilon - self.eps_dec \ 68 | if self.epsilon > self.eps_min else self.eps_min 69 | 70 | def save_models(self): 71 | self.q_eval.save_checkpoint() 72 | self.q_next.save_checkpoint() 73 | 74 | def load_models(self): 75 | self.q_eval.load_checkpoint() 76 | self.q_next.load_checkpoint() 77 | 78 | def learn(self): 79 | if self.memory.mem_cntr < self.batch_size: 80 | return 81 | 82 | self.q_eval.optimizer.zero_grad() 83 | 84 | self.replace_target_network() 85 | 86 | states, actions, rewards, states_, dones = self.sample_memory() 87 | indices = np.arange(self.batch_size) 88 | 89 | q_pred = self.q_eval.forward(states)[indices, actions] 90 | q_next = self.q_next.forward(states_).max(dim=1)[0] 91 | 92 | q_next[dones] = 0.0 93 | q_target = rewards + self.gamma*q_next 94 | 95 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 96 | loss.backward() 97 | self.q_eval.optimizer.step() 98 | 
self.learn_step_counter += 1 99 | 100 | self.decrement_epsilon() 101 | -------------------------------------------------------------------------------- /DQN/main_dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from dqn_agent import DQNAgent 4 | from utils import plot_learning_curve, make_env 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | env = make_env('PongNoFrameskip-v4') 9 | #env = gym.make('CartPole-v1') 10 | best_score = -np.inf 11 | load_checkpoint = False 12 | n_games = 250 13 | 14 | agent = DQNAgent(gamma=0.99, epsilon=1, lr=0.0001, 15 | input_dims=(env.observation_space.shape), 16 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 17 | batch_size=32, replace=1000, eps_dec=1e-5, 18 | chkpt_dir='models/', algo='DQNAgent', 19 | env_name='PongNoFrameskip-v4') 20 | 21 | if load_checkpoint: 22 | agent.load_models() 23 | 24 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) +'_' \ 25 | + str(n_games) + 'games' 26 | figure_file = 'plots/' + fname + '.png' 27 | # if you want to record video of your agent playing, do a mkdir tmp && mkdir tmp/dqn-video 28 | # and uncomment the following 2 lines. 29 | #env = wrappers.Monitor(env, "tmp/dqn-video", 30 | # video_callable=lambda episode_id: True, force=True) 31 | n_steps = 0 32 | scores, eps_history, steps_array = [], [], [] 33 | 34 | for i in range(n_games): 35 | done = False 36 | observation = env.reset() 37 | 38 | score = 0 39 | while not done: 40 | action = agent.choose_action(observation) 41 | observation_, reward, done, info = env.step(action) 42 | score += reward 43 | 44 | if not load_checkpoint: 45 | agent.store_transition(observation, action, 46 | reward, observation_, done) 47 | agent.learn() 48 | observation = observation_ 49 | n_steps += 1 50 | scores.append(score) 51 | steps_array.append(n_steps) 52 | 53 | avg_score = np.mean(scores[-100:]) 54 | print('episode: ', i,'score: ', score, 55 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 56 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 57 | 58 | if avg_score > best_score: 59 | if not load_checkpoint: 60 | agent.save_models() 61 | best_score = avg_score 62 | 63 | eps_history.append(agent.epsilon) 64 | 65 | x = [i+1 for i in range(len(scores))] 66 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 67 | -------------------------------------------------------------------------------- /DQN/models/PongNoFrameskip-v4_DQNAgent_q_eval: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DQN/models/PongNoFrameskip-v4_DQNAgent_q_eval -------------------------------------------------------------------------------- /DQN/models/PongNoFrameskip-v4_DQNAgent_q_next: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DQN/models/PongNoFrameskip-v4_DQNAgent_q_next -------------------------------------------------------------------------------- /DQN/plots/DQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DQN/plots/DQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png -------------------------------------------------------------------------------- /DQN/preprocess_pseudocode: -------------------------------------------------------------------------------- 1 | Class RepeatActionAndMaxFrame 2 | derives from: gym.Wrapper 3 | input: environment, repeat 4 | init frame buffer as an array of zeros in shape 2 x the obs space 5 | 6 | function step: 7 | input: action 8 | set total reward to 0 9 | set done to false 10 | for i in range repeat 11 | call the env.step function 12 | receive obs, reward, done, info 13 | increment total reward 14 | insert obs in frame buffer 15 | if done 16 | break 17 | end for 18 | find the max frame 19 | return: max frame, total reward, done, info 20 | 21 | function reset: 22 | input: none 23 | 24 | call env.reset 25 | reset the frame buffer 26 | store initial observation in buffer 27 | 28 | return: initial observation 29 | 30 | Class PreprocessFrame 31 | derives from: gym.ObservationWrapper 32 | input: environment, new shape 33 | set shape by swapping channels axis 34 | set observation space to new shape using gym.spaces.Box (0 to 1.0) 35 | 36 | function observation 37 | input: raw observation 38 | covert the observation to gray scale 39 | resize observation to new shape 40 | convert observation to numpy array 41 | move observation's channel axis from position 2 to position 0 42 | observation /= 255 43 | return observation 44 | 45 | 46 | Class StackFrames 47 | derives from: gym.ObservationWrapper 48 | input: environment, stack size 49 | init the new obs space (gym.spaces.Box) low & high bounds as repeat of n_steps 50 | initialize empty frame stack 51 | 52 | reset function 53 | clear the stack 54 | reset the environment 55 | for i in range(stack size) 56 | append initial observation to stack 57 | convert stack to numpy array 58 | reshape stack array to observation space low shape 59 | return stack 60 | 61 | observation function 62 | input: observation 63 | append the observation to the end of the stack 64 | convert the stack to a numpy array 65 | reshape stack to observation space low shape 66 | return the stack of frames 67 | 68 | function make_env: 69 | input: environment name, new shape, stack size 70 | init env with the base gym.make function 71 | env := RepeatActionAndMaxFrame 72 | env := PreprocessFrame 73 | env := StackFrames 74 | 75 | return: env 76 | -------------------------------------------------------------------------------- /DQN/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | 
self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DQN/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow.keras as keras 4 | from tensorflow.keras.optimizers import Adam 5 | from network import DeepQNetwork 6 | from replay_memory import ReplayBuffer 7 | 8 | 9 | class Agent: 10 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 11 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 12 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 13 | self.gamma = gamma 14 | self.epsilon = epsilon 15 | self.lr = lr 16 | self.n_actions = n_actions 17 | self.input_dims = input_dims 18 | self.batch_size = batch_size 19 | self.eps_min = eps_min 20 | self.eps_dec = eps_dec 21 | self.replace_target_cnt = replace 22 | self.algo = algo 23 | self.env_name = env_name 24 | self.chkpt_dir = chkpt_dir 25 | self.action_space = [i for i in range(n_actions)] 26 | self.learn_step_counter = 0 27 | self.fname = self.chkpt_dir + self.env_name + '_' + self.algo + '_' 28 | 29 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 30 | 31 | self.q_eval = DeepQNetwork(input_dims, n_actions) 32 | self.q_eval.compile(optimizer=Adam(learning_rate=lr)) 33 | self.q_next = DeepQNetwork(input_dims, n_actions) 34 | self.q_next.compile(optimizer=Adam(learning_rate=lr)) 35 | 36 | def save_models(self): 37 | self.q_eval.save(self.fname+'q_eval') 38 | self.q_next.save(self.fname+'q_next') 39 | print('... models saved successfully ...') 40 | 41 | def load_models(self): 42 | self.q_eval = keras.models.load_model(self.fname+'q_eval') 43 | self.q_next = keras.models.load_model(self.fname+'q_next') 44 | print('... 
models loaded successfully ...') 45 | 46 | def store_transition(self, state, action, reward, state_, done): 47 | self.memory.store_transition(state, action, reward, state_, done) 48 | 49 | def sample_memory(self): 50 | state, action, reward, new_state, done = \ 51 | self.memory.sample_buffer(self.batch_size) 52 | states = tf.convert_to_tensor(state) 53 | rewards = tf.convert_to_tensor(reward) 54 | dones = tf.convert_to_tensor(done) 55 | actions = tf.convert_to_tensor(action, dtype=tf.int32) 56 | states_ = tf.convert_to_tensor(new_state) 57 | return states, actions, rewards, states_, dones 58 | 59 | def choose_action(self, observation): 60 | if np.random.random() > self.epsilon: 61 | state = tf.convert_to_tensor([observation]) 62 | actions = self.q_eval(state) 63 | action = tf.math.argmax(actions, axis=1).numpy()[0] 64 | else: 65 | action = np.random.choice(self.action_space) 66 | return action 67 | 68 | def replace_target_network(self): 69 | if self.learn_step_counter % self.replace_target_cnt == 0: 70 | self.q_next.set_weights(self.q_eval.get_weights()) 71 | 72 | def decrement_epsilon(self): 73 | self.epsilon = self.epsilon - self.eps_dec \ 74 | if self.epsilon > self.eps_min else self.eps_min 75 | 76 | def learn(self): 77 | if self.memory.mem_cntr < self.batch_size: 78 | return 79 | 80 | self.replace_target_network() 81 | 82 | states, actions, rewards, states_, dones = self.sample_memory() 83 | 84 | indices = tf.range(self.batch_size, dtype=tf.int32) 85 | # pair each batch row with its stored action for the tf.gather_nd lookup 86 | 87 | action_indices = tf.stack([indices, actions], axis=1) 88 | 89 | with tf.GradientTape() as tape: 90 | q_pred = tf.gather_nd(self.q_eval(states), indices=action_indices) 91 | q_next = self.q_next(states_) 92 | 93 | max_actions = tf.math.argmax(q_next, axis=1, output_type=tf.int32) 94 | max_action_idx = tf.stack([indices, max_actions], axis=1) 95 | 96 | q_target = rewards + \ 97 | self.gamma*tf.gather_nd(q_next, indices=max_action_idx) *\ 98 | (1 - dones.numpy()) 99 | 100 | loss = keras.losses.MSE(q_pred, q_target) 101 | 102 | params = self.q_eval.trainable_variables 103 | grads = tape.gradient(loss, params) 104 | 105 | self.q_eval.optimizer.apply_gradients(zip(grads, params)) 106 | 107 | self.learn_step_counter += 1 108 | 109 | self.decrement_epsilon() 110 | -------------------------------------------------------------------------------- /DQN/tf2/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 | from utils import plot_learning_curve, make_env, manage_memory 4 | from gym import wrappers 5 | 6 | if __name__ == '__main__': 7 | manage_memory() 8 | env = make_env('PongNoFrameskip-v4') 9 | best_score = -np.inf 10 | load_checkpoint = False 11 | record_agent = False 12 | n_games = 250 13 | agent = Agent(gamma=0.99, epsilon=1, lr=0.0001, 14 | input_dims=(env.observation_space.shape), 15 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 16 | batch_size=32, replace=1000, eps_dec=1e-5, 17 | chkpt_dir='models/', algo='DQNAgent', 18 | env_name='PongNoFrameskip-v4') 19 | if load_checkpoint: 20 | agent.load_models() 21 | agent.epsilon = agent.eps_min 22 | 23 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 24 | + str(n_games) + 'games' 25 | figure_file = 'plots/' + fname + '.png' 26 | # if you want to record video of your agent playing, do a 27 | # mkdir video 28 | if record_agent: 29 | env = wrappers.Monitor(env, "video", 30 | video_callable=lambda episode_id: True, 31 | force=True) 32 
| n_steps = 0 33 | scores, eps_history, steps_array = [], [], [] 34 | 35 | for i in range(n_games): 36 | done = False 37 | observation = env.reset() 38 | 39 | score = 0 40 | while not done: 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | score += reward 44 | 45 | if not load_checkpoint: 46 | agent.store_transition(observation, action, 47 | reward, observation_, done) 48 | agent.learn() 49 | observation = observation_ 50 | n_steps += 1 51 | scores.append(score) 52 | steps_array.append(n_steps) 53 | 54 | avg_score = np.mean(scores[-100:]) 55 | print('episode {} score {:.1f} avg score {:.1f} ' 56 | 'best score {:.1f} epsilon {:.2f} steps {}'. 57 | format(i, score, avg_score, best_score, agent.epsilon, 58 | n_steps)) 59 | 60 | if score > best_score: 61 | if not load_checkpoint: 62 | agent.save_models() 63 | best_score = score 64 | 65 | eps_history.append(agent.epsilon) 66 | 67 | x = [i+1 for i in range(len(scores))] 68 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 69 | -------------------------------------------------------------------------------- /DQN/tf2/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Conv2D, Dense, Flatten 3 | 4 | 5 | class DeepQNetwork(keras.Model): 6 | def __init__(self, input_dims, n_actions): 7 | super(DeepQNetwork, self).__init__() 8 | self.conv1 = Conv2D(32, 8, strides=(4, 4), activation='relu', 9 | data_format='channels_first', 10 | input_shape=input_dims) 11 | self.conv2 = Conv2D(64, 4, strides=(2, 2), activation='relu', 12 | data_format='channels_first') 13 | self.conv3 = Conv2D(64, 3, strides=(1, 1), activation='relu', 14 | data_format='channels_first') 15 | self.flat = Flatten() 16 | self.fc1 = Dense(512, activation='relu') 17 | self.fc2 = Dense(n_actions, activation=None) 18 | 19 | def call(self, state): 20 | x = self.conv1(state) 21 | x = self.conv2(x) 22 | x = self.conv3(x) 23 | x = self.flat(x) 24 | x = self.fc1(x) 25 | x = self.fc2(x) 26 | 27 | return x 28 | -------------------------------------------------------------------------------- /DQN/tf2/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | 
terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DQN/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | import tensorflow as tf 7 | 8 | 9 | def manage_memory(): 10 | gpus = tf.config.list_physical_devices('GPU') 11 | if gpus: 12 | try: 13 | for gpu in gpus: 14 | tf.config.experimental.set_memory_growth(gpu, True) 15 | except RuntimeError as e: 16 | print(e) 17 | 18 | 19 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 20 | fig=plt.figure() 21 | ax=fig.add_subplot(111, label="1") 22 | ax2=fig.add_subplot(111, label="2", frame_on=False) 23 | 24 | ax.plot(x, epsilons, color="C0") 25 | ax.set_xlabel("Training Steps", color="C0") 26 | ax.set_ylabel("Epsilon", color="C0") 27 | ax.tick_params(axis='x', colors="C0") 28 | ax.tick_params(axis='y', colors="C0") 29 | 30 | N = len(scores) 31 | running_avg = np.empty(N) 32 | for t in range(N): 33 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 34 | 35 | ax2.scatter(x, running_avg, color="C1") 36 | ax2.axes.get_xaxis().set_visible(False) 37 | ax2.yaxis.tick_right() 38 | ax2.set_ylabel('Score', color="C1") 39 | ax2.yaxis.set_label_position('right') 40 | ax2.tick_params(axis='y', colors="C1") 41 | 42 | if lines is not None: 43 | for line in lines: 44 | plt.axvline(x=line) 45 | 46 | plt.savefig(filename) 47 | 48 | class RepeatActionAndMaxFrame(gym.Wrapper): 49 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 50 | fire_first=False): 51 | super(RepeatActionAndMaxFrame, self).__init__(env) 52 | self.repeat = repeat 53 | self.shape = env.observation_space.low.shape 54 | self.frame_buffer = np.zeros_like((2, self.shape)) 55 | self.clip_reward = clip_reward 56 | self.no_ops = no_ops 57 | self.fire_first = fire_first 58 | 59 | def step(self, action): 60 | t_reward = 0.0 61 | done = False 62 | for i in range(self.repeat): 63 | obs, reward, done, info = self.env.step(action) 64 | if self.clip_reward: 65 | reward = np.clip(np.array([reward]), -1, 1)[0] 66 | t_reward += reward 67 | idx = i % 2 68 | self.frame_buffer[idx] = obs 69 | if done: 70 | break 71 | 72 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 73 | return max_frame, t_reward, done, info 74 | 75 | def reset(self): 76 | obs = self.env.reset() 77 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 78 | for _ in range(no_ops): 79 | _, _, done, _ = self.env.step(0) 80 | if done: 81 | self.env.reset() 82 | if self.fire_first: 83 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 84 | obs, _, _, _ = self.env.step(1) 85 | 86 | self.frame_buffer = np.zeros_like((2,self.shape)) 87 | self.frame_buffer[0] = obs 88 | 89 | return obs 90 | 91 | class PreprocessFrame(gym.ObservationWrapper): 92 | def __init__(self, shape, env=None): 93 | super(PreprocessFrame, self).__init__(env) 94 | self.shape = (shape[2], shape[0], shape[1]) 95 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 96 | shape=self.shape, dtype=np.float32) 97 | 98 | def observation(self, obs): 99 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 100 | resized_screen = cv2.resize(new_frame, self.shape[1:], 101 | interpolation=cv2.INTER_AREA) 102 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 103 | new_obs = new_obs / 255.0 104 | 
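# Note: the division by 255.0 above is what keeps the returned observation inside
# the Box(low=0.0, high=1.0) observation_space declared in __init__, so the frames
# stacked by StackFrames and fed to the network are already normalized.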
105 | return new_obs 106 | 107 | class StackFrames(gym.ObservationWrapper): 108 | def __init__(self, env, repeat): 109 | super(StackFrames, self).__init__(env) 110 | self.observation_space = gym.spaces.Box( 111 | env.observation_space.low.repeat(repeat, axis=0), 112 | env.observation_space.high.repeat(repeat, axis=0), 113 | dtype=np.float32) 114 | self.stack = collections.deque(maxlen=repeat) 115 | 116 | def reset(self): 117 | self.stack.clear() 118 | observation = self.env.reset() 119 | for _ in range(self.stack.maxlen): 120 | self.stack.append(observation) 121 | 122 | return np.array(self.stack).reshape(self.observation_space.low.shape) 123 | 124 | def observation(self, observation): 125 | self.stack.append(observation) 126 | 127 | return np.array(self.stack).reshape(self.observation_space.low.shape) 128 | 129 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 130 | no_ops=0, fire_first=False): 131 | env = gym.make(env_name) 132 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 133 | env = PreprocessFrame(shape, env) 134 | env = StackFrames(env, repeat) 135 | 136 | return env 137 | -------------------------------------------------------------------------------- /DQN/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 38 | fire_first=False): 39 | super(RepeatActionAndMaxFrame, self).__init__(env) 40 | self.repeat = repeat 41 | self.shape = env.observation_space.low.shape 42 | self.frame_buffer = np.zeros_like((2, self.shape)) 43 | self.clip_reward = clip_reward 44 | self.no_ops = no_ops 45 | self.fire_first = fire_first 46 | 47 | def step(self, action): 48 | t_reward = 0.0 49 | done = False 50 | for i in range(self.repeat): 51 | obs, reward, done, info = self.env.step(action) 52 | if self.clip_reward: 53 | reward = np.clip(np.array([reward]), -1, 1)[0] 54 | t_reward += reward 55 | idx = i % 2 56 | self.frame_buffer[idx] = obs 57 | if done: 58 | break 59 | 60 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 61 | return max_frame, t_reward, done, info 62 | 63 | def reset(self): 64 | obs = self.env.reset() 65 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 66 | for _ in range(no_ops): 67 | _, _, done, _ = self.env.step(0) 68 | if done: 69 | self.env.reset() 70 | if self.fire_first: 71 | assert 
self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 72 | obs, _, _, _ = self.env.step(1) 73 | 74 | self.frame_buffer = np.zeros_like((2,self.shape)) 75 | self.frame_buffer[0] = obs 76 | 77 | return obs 78 | 79 | class PreprocessFrame(gym.ObservationWrapper): 80 | def __init__(self, shape, env=None): 81 | super(PreprocessFrame, self).__init__(env) 82 | self.shape = (shape[2], shape[0], shape[1]) 83 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 84 | shape=self.shape, dtype=np.float32) 85 | 86 | def observation(self, obs): 87 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 88 | resized_screen = cv2.resize(new_frame, self.shape[1:], 89 | interpolation=cv2.INTER_AREA) 90 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 91 | new_obs = new_obs / 255.0 92 | 93 | return new_obs 94 | 95 | class StackFrames(gym.ObservationWrapper): 96 | def __init__(self, env, repeat): 97 | super(StackFrames, self).__init__(env) 98 | self.observation_space = gym.spaces.Box( 99 | env.observation_space.low.repeat(repeat, axis=0), 100 | env.observation_space.high.repeat(repeat, axis=0), 101 | dtype=np.float32) 102 | self.stack = collections.deque(maxlen=repeat) 103 | 104 | def reset(self): 105 | self.stack.clear() 106 | observation = self.env.reset() 107 | for _ in range(self.stack.maxlen): 108 | self.stack.append(observation) 109 | 110 | return np.array(self.stack).reshape(self.observation_space.low.shape) 111 | 112 | def observation(self, observation): 113 | self.stack.append(observation) 114 | 115 | return np.array(self.stack).reshape(self.observation_space.low.shape) 116 | 117 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 118 | no_ops=0, fire_first=False): 119 | env = gym.make(env_name) 120 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 121 | env = PreprocessFrame(shape, env) 122 | env = StackFrames(env, repeat) 123 | 124 | return env 125 | -------------------------------------------------------------------------------- /DuelingDDQN/deep_q_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class DuelingDeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DuelingDeepQNetwork, self).__init__() 11 | 12 | self.checkpoint_dir = chkpt_dir 13 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 14 | 15 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 16 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 17 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 18 | 19 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 20 | 21 | self.fc1 = nn.Linear(fc_input_dims, 1024) 22 | self.fc2 = nn.Linear(1024, 512) 23 | self.V = nn.Linear(512, 1) 24 | self.A = nn.Linear(512, n_actions) 25 | 26 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 27 | self.loss = nn.MSELoss() 28 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 29 | self.to(self.device) 30 | 31 | 32 | def calculate_conv_output_dims(self, input_dims): 33 | state = T.zeros(1, *input_dims) 34 | dims = self.conv1(state) 35 | dims = self.conv2(dims) 36 | dims = self.conv3(dims) 37 | return int(np.prod(dims.size())) 38 | 39 | def forward(self, state): 40 | conv1 = F.relu(self.conv1(state)) 41 | conv2 = F.relu(self.conv2(conv1)) 42 | conv3 = F.relu(self.conv3(conv2)) 43 | 
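        # Flatten the (batch, channels, height, width) output of the last
        # conv layer into (batch, features) before the fully connected
        # layers; the feature count was computed by
        # calculate_conv_output_dims at construction time.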
conv_state = conv3.view(conv3.size()[0], -1) 44 | flat1 = F.relu(self.fc1(conv_state)) 45 | flat2 = F.relu(self.fc2(flat1)) 46 | 47 | V = self.V(flat2) 48 | A = self.A(flat2) 49 | 50 | return V, A 51 | 52 | def save_checkpoint(self): 53 | print('... saving checkpoint ...') 54 | T.save(self.state_dict(), self.checkpoint_file) 55 | 56 | def load_checkpoint(self): 57 | print('... loading checkpoint ...') 58 | self.load_state_dict(T.load(self.checkpoint_file)) 59 | -------------------------------------------------------------------------------- /DuelingDDQN/dueling_ddqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from deep_q_network import DuelingDeepQNetwork 4 | from replay_memory import ReplayBuffer 5 | 6 | class DuelingDDQNAgent(object): 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 10 | self.gamma = gamma 11 | self.epsilon = epsilon 12 | self.lr = lr 13 | self.n_actions = n_actions 14 | self.input_dims = input_dims 15 | self.batch_size = batch_size 16 | self.eps_min = eps_min 17 | self.eps_dec = eps_dec 18 | self.replace_target_cnt = replace 19 | self.algo = algo 20 | self.env_name = env_name 21 | self.chkpt_dir = chkpt_dir 22 | self.action_space = [i for i in range(n_actions)] 23 | self.learn_step_counter = 0 24 | 25 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 26 | 27 | self.q_eval = DuelingDeepQNetwork(self.lr, self.n_actions, 28 | input_dims=self.input_dims, 29 | name=self.env_name+'_'+self.algo+'_q_eval', 30 | chkpt_dir=self.chkpt_dir) 31 | self.q_next = DuelingDeepQNetwork(self.lr, self.n_actions, 32 | input_dims=self.input_dims, 33 | name=self.env_name+'_'+self.algo+'_q_next', 34 | chkpt_dir=self.chkpt_dir) 35 | 36 | def store_transition(self, state, action, reward, state_, done): 37 | self.memory.store_transition(state, action, reward, state_, done) 38 | 39 | def sample_memory(self): 40 | state, action, reward, new_state, done = \ 41 | self.memory.sample_buffer(self.batch_size) 42 | 43 | states = T.tensor(state).to(self.q_eval.device) 44 | rewards = T.tensor(reward).to(self.q_eval.device) 45 | dones = T.tensor(done).to(self.q_eval.device) 46 | actions = T.tensor(action).to(self.q_eval.device) 47 | states_ = T.tensor(new_state).to(self.q_eval.device) 48 | 49 | return states, actions, rewards, states_, dones 50 | 51 | def choose_action(self, observation): 52 | if np.random.random() > self.epsilon: 53 | state = np.array([observation], copy=False, dtype=np.float32) 54 | state_tensor = T.tensor(state).to(self.q_eval.device) 55 | _, advantages = self.q_eval.forward(state_tensor) 56 | 57 | action = T.argmax(advantages).item() 58 | else: 59 | action = np.random.choice(self.action_space) 60 | 61 | return action 62 | 63 | def replace_target_network(self): 64 | if self.replace_target_cnt is not None and \ 65 | self.learn_step_counter % self.replace_target_cnt == 0: 66 | self.q_next.load_state_dict(self.q_eval.state_dict()) 67 | 68 | def decrement_epsilon(self): 69 | self.epsilon = self.epsilon - self.eps_dec \ 70 | if self.epsilon > self.eps_min else self.eps_min 71 | 72 | def learn(self): 73 | if self.memory.mem_cntr < self.batch_size: 74 | return 75 | 76 | self.q_eval.optimizer.zero_grad() 77 | 78 | self.replace_target_network() 79 | 80 | states, actions, rewards, states_, dones = self.sample_memory() 81 | indices = np.arange(self.batch_size) 
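        # Dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
        # Double Q-learning: the online network (q_eval) selects the greedy
        # action in the next state, while the target network (q_next)
        # evaluates it, giving the bootstrap target
        # r + gamma * Q_next(s', argmax_a Q_eval(s', a)).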
82 | 83 | V_s, A_s = self.q_eval.forward(states) 84 | V_s_, A_s_ = self.q_next.forward(states_) 85 | 86 | V_s_eval, A_s_eval = self.q_eval.forward(states_) 87 | 88 | q_pred = T.add(V_s, 89 | (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions] 90 | 91 | q_next = T.add(V_s_, (A_s_ - A_s_.mean(dim=1, keepdim=True))) 92 | 93 | q_eval = T.add(V_s_eval, (A_s_eval - A_s_eval.mean(dim=1,keepdim=True))) 94 | 95 | max_actions = T.argmax(q_eval, dim=1) 96 | q_next[dones] = 0.0 97 | 98 | q_target = rewards + self.gamma*q_next[indices, max_actions] 99 | 100 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 101 | loss.backward() 102 | self.q_eval.optimizer.step() 103 | self.learn_step_counter += 1 104 | 105 | self.decrement_epsilon() 106 | 107 | def save_models(self): 108 | self.q_eval.save_checkpoint() 109 | self.q_next.save_checkpoint() 110 | 111 | def load_models(self): 112 | self.q_eval.load_checkpoint() 113 | self.q_next.load_checkpoint() 114 | -------------------------------------------------------------------------------- /DuelingDDQN/main_dueling_ddqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import numpy as np 4 | from dueling_ddqn_agent import DuelingDDQNAgent 5 | from utils import plot_learning_curve, make_env 6 | 7 | if __name__ == '__main__': 8 | env = make_env('PongNoFrameskip-v4') 9 | best_score = -np.inf 10 | load_checkpoint = False 11 | n_games = 20 12 | agent = DuelingDDQNAgent(gamma=0.99, epsilon=1.0, lr=0.0001, 13 | input_dims=(env.observation_space.shape), 14 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 15 | batch_size=32, replace=10000, eps_dec=1e-5, 16 | chkpt_dir='models/', algo='DuelingDDQNAgent', 17 | env_name='PongNoFrameskip-v4') 18 | 19 | if load_checkpoint: 20 | agent.load_models() 21 | 22 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) +'_' \ 23 | + str(n_games) + 'games' 24 | figure_file = 'plots/' + fname + '.png' 25 | 26 | n_steps = 0 27 | scores, eps_history, steps_array = [], [], [] 28 | 29 | for i in range(n_games): 30 | done = False 31 | observation = env.reset() 32 | 33 | score = 0 34 | while not done: 35 | action = agent.choose_action(observation) 36 | observation_, reward, done, info = env.step(action) 37 | score += reward 38 | 39 | if not load_checkpoint: 40 | agent.store_transition(observation, action, 41 | reward, observation_, int(done)) 42 | agent.learn() 43 | observation = observation_ 44 | n_steps += 1 45 | scores.append(score) 46 | steps_array.append(n_steps) 47 | 48 | avg_score = np.mean(scores[-100:]) 49 | print('episode: ', i,'score: ', score, 50 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 51 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 52 | 53 | if avg_score > best_score: 54 | if not load_checkpoint: 55 | agent.save_models() 56 | best_score = avg_score 57 | 58 | eps_history.append(agent.epsilon) 59 | if load_checkpoint and n_steps >= 18000: 60 | break 61 | 62 | x = [i+1 for i in range(len(scores))] 63 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 64 | -------------------------------------------------------------------------------- /DuelingDDQN/models/PongNoFrameskip-v4_DuelingDDQNAgent_q_eval: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDDQN/models/PongNoFrameskip-v4_DuelingDDQNAgent_q_eval 
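The models/ entries here (q_eval above, q_next below) are saved PyTorch state dicts for the evaluation and target networks. A minimal sketch of how they might be reloaded for a mostly greedy evaluation run, assuming it is executed from the DuelingDDQN directory so the imports and the models/ checkpoint path resolve exactly as in main_dueling_ddqn.py; the script name, epsilon value, episode count, and small mem_size are arbitrary choices, not part of the original repo:

# evaluate_dueling_ddqn.py -- hypothetical helper, not in the original repo
import numpy as np
from dueling_ddqn_agent import DuelingDDQNAgent
from utils import make_env

if __name__ == '__main__':
    env = make_env('PongNoFrameskip-v4')
    agent = DuelingDDQNAgent(gamma=0.99, epsilon=0.02, lr=0.0001,
                             input_dims=env.observation_space.shape,
                             n_actions=env.action_space.n, mem_size=1000,
                             batch_size=32, replace=10000, eps_dec=0,
                             chkpt_dir='models/', algo='DuelingDDQNAgent',
                             env_name='PongNoFrameskip-v4')
    agent.load_models()  # loads the q_eval / q_next checkpoints listed here

    scores = []
    for _ in range(5):
        done, score = False, 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation, reward, done, info = env.step(action)
            score += reward
        scores.append(score)
    print('mean evaluation score over 5 games:', np.mean(scores))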
-------------------------------------------------------------------------------- /DuelingDDQN/models/PongNoFrameskip-v4_DuelingDDQNAgent_q_next: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDDQN/models/PongNoFrameskip-v4_DuelingDDQNAgent_q_next -------------------------------------------------------------------------------- /DuelingDDQN/plots/DuelingDDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDDQN/plots/DuelingDDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png -------------------------------------------------------------------------------- /DuelingDDQN/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DuelingDDQN/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras.optimizers import Adam 4 | import tensorflow.keras as keras 5 | from network import DuelingDeepQNetwork 6 | from replay_memory import ReplayBuffer 7 | 8 | 9 | class Agent: 10 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 11 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 12 | replace=1000, algo=None, env_name=None, 13 | chkpt_dir='tmp/duelingddqn'): 14 | self.gamma = gamma 15 | self.epsilon = epsilon 16 | self.lr = lr 17 | self.n_actions = n_actions 18 | self.input_dims = input_dims 19 | self.batch_size = batch_size 20 | self.eps_min = eps_min 21 | self.eps_dec = eps_dec 22 | self.replace_target_cnt = replace 23 | self.algo = algo 24 | self.env_name = env_name 25 | self.chkpt_dir = chkpt_dir 26 | self.action_space = [i for i in range(n_actions)] 27 | self.learn_step_counter = 0 28 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 29 | self.fname = self.chkpt_dir + 
self.env_name + '_' + self.algo + '_' 30 | 31 | self.q_eval = DuelingDeepQNetwork(input_dims, n_actions) 32 | self.q_eval.compile(optimizer=Adam(learning_rate=lr)) 33 | self.q_next = DuelingDeepQNetwork(input_dims, n_actions) 34 | self.q_next.compile(optimizer=Adam(learning_rate=lr)) 35 | 36 | def save_models(self): 37 | self.q_eval.save(self.fname+'q_eval') 38 | self.q_next.save(self.fname+'q_next') 39 | print('... models saved successfully ...') 40 | 41 | def load_models(self): 42 | self.q_eval = keras.models.load_model(self.fname+'q_eval') 43 | self.q_next = keras.models.load_model(self.fname+'q_next') 44 | print('... models loaded successfully ...') 45 | 46 | def choose_action(self, observation): 47 | if np.random.random() > self.epsilon: 48 | state = tf.convert_to_tensor([observation]) 49 | _, advantage = self.q_eval(state) 50 | action = tf.math.argmax(advantage, axis=1).numpy()[0] 51 | else: 52 | action = np.random.choice(self.action_space) 53 | return action 54 | 55 | def store_transition(self, state, action, reward, state_, done): 56 | self.memory.store_transition(state, action, reward, state_, done) 57 | 58 | def sample_memory(self): 59 | state, action, reward, new_state, done = \ 60 | self.memory.sample_buffer(self.batch_size) 61 | states = tf.convert_to_tensor(state) 62 | rewards = tf.convert_to_tensor(reward) 63 | dones = tf.convert_to_tensor(done) 64 | actions = tf.convert_to_tensor(action, dtype=tf.int32) 65 | states_ = tf.convert_to_tensor(new_state) 66 | return states, actions, rewards, states_, dones 67 | 68 | def replace_target_network(self): 69 | if self.learn_step_counter % self.replace_target_cnt == 0: 70 | self.q_next.set_weights(self.q_eval.get_weights()) 71 | 72 | def decrement_epsilon(self): 73 | self.epsilon = self.epsilon - self.eps_dec \ 74 | if self.epsilon > self.eps_min else self.eps_min 75 | 76 | def learn(self): 77 | if self.memory.mem_cntr < self.batch_size: 78 | return 79 | 80 | self.replace_target_network() 81 | 82 | states, actions, rewards, states_, dones = self.sample_memory() 83 | 84 | indices = tf.range(self.batch_size, dtype=tf.int32) 85 | action_indices = tf.stack([indices, actions], axis=1) 86 | 87 | with tf.GradientTape() as tape: 88 | V_s, A_s = self.q_eval(states) 89 | V_s_, A_s_ = self.q_next(states_) 90 | V_s_eval, A_s_eval = self.q_eval(states_) 91 | 92 | advantage = V_s+A_s-tf.reduce_mean(A_s, axis=1, 93 | keepdims=True) 94 | advantage_ = V_s_+A_s_-tf.reduce_mean(A_s_, axis=1, 95 | keepdims=True) 96 | advantage_eval = V_s_eval+A_s_eval-tf.reduce_mean(A_s_eval, 97 | axis=1, 98 | keepdims=True) 99 | max_actions = tf.argmax(advantage_eval, axis=1, 100 | output_type=tf.int32) 101 | max_action_idx = tf.stack([indices, max_actions], axis=1) 102 | q_next = tf.gather_nd(advantage_, indices=max_action_idx) 103 | q_pred = tf.gather_nd(advantage, indices=action_indices) 104 | 105 | q_target = rewards + self.gamma*q_next * (1 - dones.numpy()) 106 | loss = keras.losses.MSE(q_pred, q_target) 107 | params = self.q_eval.trainable_variables 108 | grads = tape.gradient(loss, params) 109 | self.q_eval.optimizer.apply_gradients(zip(grads, params)) 110 | self.learn_step_counter += 1 111 | 112 | self.decrement_epsilon() 113 | -------------------------------------------------------------------------------- /DuelingDDQN/tf2/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 | from utils import plot_learning_curve, make_env, manage_memory 4 | from gym import wrappers 5 | 6 
| if __name__ == '__main__': 7 | manage_memory() 8 | env = make_env('PongNoFrameskip-v4') 9 | best_score = -np.inf 10 | load_checkpoint = False 11 | record_agent = False 12 | n_games = 250 13 | agent = Agent(gamma=0.99, epsilon=1, lr=0.0001, 14 | input_dims=(env.observation_space.shape), 15 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 16 | batch_size=32, replace=1000, eps_dec=1e-5, 17 | chkpt_dir='models/', algo='DQNAgent', 18 | env_name='PongNoFrameskip-v4') 19 | if load_checkpoint: 20 | agent.load_models() 21 | agent.epsilon = agent.eps_min 22 | 23 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 24 | + str(n_games) + 'games' 25 | figure_file = 'plots/' + fname + '.png' 26 | # if you want to record video of your agent playing, do a 27 | # mkdir video 28 | if record_agent: 29 | env = wrappers.Monitor(env, "video", 30 | video_callable=lambda episode_id: True, 31 | force=True) 32 | n_steps = 0 33 | scores, eps_history, steps_array = [], [], [] 34 | 35 | for i in range(n_games): 36 | done = False 37 | observation = env.reset() 38 | 39 | score = 0 40 | while not done: 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | score += reward 44 | 45 | if not load_checkpoint: 46 | agent.store_transition(observation, action, 47 | reward, observation_, done) 48 | agent.learn() 49 | observation = observation_ 50 | n_steps += 1 51 | scores.append(score) 52 | steps_array.append(n_steps) 53 | 54 | avg_score = np.mean(scores[-100:]) 55 | print('episode {} score {:.1f} avg score {:.1f} ' 56 | 'best score {:.1f} epsilon {:.2f} steps {}'. 57 | format(i, score, avg_score, best_score, agent.epsilon, 58 | n_steps)) 59 | 60 | if score > best_score: 61 | if not load_checkpoint: 62 | agent.save_models() 63 | best_score = score 64 | 65 | eps_history.append(agent.epsilon) 66 | 67 | x = [i+1 for i in range(len(scores))] 68 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 69 | -------------------------------------------------------------------------------- /DuelingDDQN/tf2/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Conv2D, Dense, Flatten 3 | 4 | 5 | class DuelingDeepQNetwork(keras.Model): 6 | def __init__(self, input_dims, n_actions): 7 | super(DuelingDeepQNetwork, self).__init__() 8 | self.conv1 = Conv2D(32, 8, strides=(4, 4), activation='relu', 9 | data_format='channels_first', 10 | input_shape=input_dims) 11 | self.conv2 = Conv2D(64, 4, strides=(2, 2), activation='relu', 12 | data_format='channels_first') 13 | self.conv3 = Conv2D(64, 3, strides=(1, 1), activation='relu', 14 | data_format='channels_first') 15 | self.flat = Flatten() 16 | self.fc1 = Dense(512, activation='relu') 17 | self.A = Dense(n_actions, activation=None) 18 | self.V = Dense(1, activation=None) 19 | 20 | def call(self, state): 21 | x = self.conv1(state) 22 | x = self.conv2(x) 23 | x = self.conv3(x) 24 | x = self.flat(x) 25 | x = self.fc1(x) 26 | V = self.V(x) 27 | A = self.A(x) 28 | 29 | return V, A 30 | -------------------------------------------------------------------------------- /DuelingDDQN/tf2/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = 
np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DuelingDDQN/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | import tensorflow as tf 7 | 8 | 9 | def manage_memory(): 10 | gpus = tf.config.list_physical_devices('GPU') 11 | if gpus: 12 | try: 13 | for gpu in gpus: 14 | tf.config.experimental.set_memory_growth(gpu, True) 15 | except RuntimeError as e: 16 | print(e) 17 | 18 | 19 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 20 | fig = plt.figure() 21 | ax = fig.add_subplot(111, label="1") 22 | ax2 = fig.add_subplot(111, label="2", frame_on=False) 23 | 24 | ax.plot(x, epsilons, color="C0") 25 | ax.set_xlabel("Training Steps", color="C0") 26 | ax.set_ylabel("Epsilon", color="C0") 27 | ax.tick_params(axis='x', colors="C0") 28 | ax.tick_params(axis='y', colors="C0") 29 | 30 | N = len(scores) 31 | running_avg = np.empty(N) 32 | for t in range(N): 33 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 34 | 35 | ax2.scatter(x, running_avg, color="C1") 36 | ax2.axes.get_xaxis().set_visible(False) 37 | ax2.yaxis.tick_right() 38 | ax2.set_ylabel('Score', color="C1") 39 | ax2.yaxis.set_label_position('right') 40 | ax2.tick_params(axis='y', colors="C1") 41 | 42 | if lines is not None: 43 | for line in lines: 44 | plt.axvline(x=line) 45 | 46 | plt.savefig(filename) 47 | 48 | 49 | class RepeatActionAndMaxFrame(gym.Wrapper): 50 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 51 | fire_first=False): 52 | super(RepeatActionAndMaxFrame, self).__init__(env) 53 | self.repeat = repeat 54 | self.shape = env.observation_space.low.shape 55 | 56 | # self.frame_buffer = np.zeros(shape=(2, *self.shape)) 57 | self.frame_buffer = np.zeros_like((2, self.shape)) 58 | self.clip_reward = clip_reward 59 | self.no_ops = no_ops 60 | self.fire_first = fire_first 61 | 62 | def step(self, action): 63 | t_reward = 0.0 64 | done = False 65 | for i in range(self.repeat): 66 | obs, reward, done, info = self.env.step(action) 67 | if self.clip_reward: 68 | reward = np.clip(np.array([reward]), -1, 1)[0] 69 | t_reward += reward 70 | idx = i % 2 71 | self.frame_buffer[idx] = obs 72 | if done: 73 | break 74 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 75 | 
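        # The element-wise max over the two most recent raw frames removes
        # the sprite flicker Atari games produce by drawing some objects
        # only on alternating frames.  The commented-out
        # np.zeros(shape=(2, *self.shape)) in __init__ is the explicit
        # (2, *frame_shape) allocation; np.zeros_like((2, self.shape))
        # builds the placeholder from the tuple itself and behaves as
        # little more than a two-slot container for the frames assigned
        # above.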
return max_frame, t_reward, done, info 76 | 77 | def reset(self): 78 | obs = self.env.reset() 79 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 80 | for _ in range(no_ops): 81 | _, _, done, _ = self.env.step(0) 82 | if done: 83 | self.env.reset() 84 | if self.fire_first: 85 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 86 | obs, _, _, _ = self.env.step(1) 87 | 88 | # self.frame_buffer = np.zeros(shape=(2, *self.shape)) 89 | self.frame_buffer = np.zeros_like((2, self.shape)) 90 | self.frame_buffer[0] = obs 91 | 92 | return obs 93 | 94 | 95 | class PreprocessFrame(gym.ObservationWrapper): 96 | def __init__(self, shape, env=None): 97 | super(PreprocessFrame, self).__init__(env) 98 | self.shape = (shape[2], shape[0], shape[1]) 99 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 100 | shape=self.shape, 101 | dtype=np.float32) 102 | 103 | def observation(self, obs): 104 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 105 | resized_screen = cv2.resize(new_frame, self.shape[1:], 106 | interpolation=cv2.INTER_AREA) 107 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 108 | new_obs = new_obs / 255.0 109 | 110 | return new_obs 111 | 112 | 113 | class StackFrames(gym.ObservationWrapper): 114 | def __init__(self, env, repeat): 115 | super(StackFrames, self).__init__(env) 116 | self.observation_space = gym.spaces.Box( 117 | env.observation_space.low.repeat(repeat, axis=0), 118 | env.observation_space.high.repeat(repeat, axis=0), 119 | dtype=np.float32) 120 | self.stack = collections.deque(maxlen=repeat) 121 | 122 | def reset(self): 123 | self.stack.clear() 124 | observation = self.env.reset() 125 | for _ in range(self.stack.maxlen): 126 | self.stack.append(observation) 127 | 128 | return np.array(self.stack).reshape(self.observation_space.low.shape) 129 | 130 | def observation(self, observation): 131 | self.stack.append(observation) 132 | 133 | return np.array(self.stack).reshape(self.observation_space.low.shape) 134 | 135 | 136 | def make_env(env_name, shape=(84, 84, 1), repeat=4, clip_rewards=False, 137 | no_ops=0, fire_first=False): 138 | env = gym.make(env_name) 139 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, 140 | no_ops, fire_first) 141 | env = PreprocessFrame(shape, env) 142 | env = StackFrames(env, repeat) 143 | 144 | return env 145 | -------------------------------------------------------------------------------- /DuelingDDQN/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | 
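    # Left axis: epsilon against training steps; right axis: a trailing
    # ~20-episode average of the scores, scattered against the same steps.
    # Optional vertical lines mark user-supplied step counts.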
plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | """ modified from: 38 | https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py 39 | """ 40 | def __init__(self, env=None, repeat=4): 41 | super(RepeatActionAndMaxFrame, self).__init__(env) 42 | self.repeat = repeat 43 | self.shape = env.observation_space.low.shape 44 | self.frame_buffer = np.zeros_like((2,self.shape)) 45 | 46 | def step(self, action): 47 | t_reward = 0.0 48 | done = False 49 | for i in range(self.repeat): 50 | obs, reward, done, info = self.env.step(action) 51 | t_reward += reward 52 | idx = i % 2 53 | self.frame_buffer[idx] = obs 54 | if done: 55 | break 56 | 57 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 58 | return max_frame, t_reward, done, info 59 | 60 | def reset(self): 61 | obs = self.env.reset() 62 | self.frame_buffer = np.zeros_like((2,self.shape)) 63 | self.frame_buffer[0] = obs 64 | return obs 65 | 66 | class PreprocessFrame(gym.ObservationWrapper): 67 | def __init__(self, shape, env=None): 68 | super(PreprocessFrame, self).__init__(env) 69 | self.shape=(shape[2], shape[0], shape[1]) 70 | self.observation_space = gym.spaces.Box(low=0, high=1.0, 71 | shape=self.shape,dtype=np.float32) 72 | def observation(self, obs): 73 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 74 | resized_screen = cv2.resize(new_frame, self.shape[1:], 75 | interpolation=cv2.INTER_AREA) 76 | 77 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 78 | new_obs = np.swapaxes(new_obs, 2,0) 79 | new_obs = new_obs / 255.0 80 | return new_obs 81 | 82 | class StackFrames(gym.ObservationWrapper): 83 | def __init__(self, env, n_steps): 84 | super(StackFrames, self).__init__(env) 85 | self.observation_space = gym.spaces.Box( 86 | env.observation_space.low.repeat(n_steps, axis=0), 87 | env.observation_space.high.repeat(n_steps, axis=0), 88 | dtype=np.float32) 89 | self.stack = collections.deque(maxlen=n_steps) 90 | 91 | def reset(self): 92 | self.stack.clear() 93 | observation = self.env.reset() 94 | for _ in range(self.stack.maxlen): 95 | self.stack.append(observation) 96 | 97 | return np.array(self.stack).reshape(self.observation_space.low.shape) 98 | 99 | def observation(self, observation): 100 | self.stack.append(observation) 101 | obs = np.array(self.stack).reshape(self.observation_space.low.shape) 102 | 103 | return obs 104 | 105 | def make_env(env_name, shape=(84,84,1), skip=4): 106 | env = gym.make(env_name) 107 | env = RepeatActionAndMaxFrame(env, skip) 108 | env = PreprocessFrame(shape, env) 109 | env = StackFrames(env, skip) 110 | 111 | return env 112 | -------------------------------------------------------------------------------- /DuelingDQN/deep_q_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class DuelingDeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DuelingDeepQNetwork, self).__init__() 11 | 12 | self.checkpoint_dir = chkpt_dir 13 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 14 | 15 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 16 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 17 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 18 | 19 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 20 | 21 | self.fc1 
= nn.Linear(fc_input_dims, 1024) 22 | self.fc2 = nn.Linear(1024, 512) 23 | self.V = nn.Linear(512, 1) 24 | self.A = nn.Linear(512, n_actions) 25 | 26 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 27 | self.loss = nn.MSELoss() 28 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 29 | self.to(self.device) 30 | 31 | 32 | def calculate_conv_output_dims(self, input_dims): 33 | state = T.zeros(1, *input_dims) 34 | dims = self.conv1(state) 35 | dims = self.conv2(dims) 36 | dims = self.conv3(dims) 37 | return int(np.prod(dims.size())) 38 | 39 | def forward(self, state): 40 | conv1 = F.relu(self.conv1(state)) 41 | conv2 = F.relu(self.conv2(conv1)) 42 | conv3 = F.relu(self.conv3(conv2)) 43 | conv_state = conv3.view(conv3.size()[0], -1) 44 | flat1 = F.relu(self.fc1(conv_state)) 45 | flat2 = F.relu(self.fc2(flat1)) 46 | 47 | V = self.V(flat2) 48 | A = self.A(flat2) 49 | 50 | return V, A 51 | 52 | def save_checkpoint(self): 53 | print('... saving checkpoint ...') 54 | T.save(self.state_dict(), self.checkpoint_file) 55 | 56 | def load_checkpoint(self): 57 | print('... loading checkpoint ...') 58 | self.load_state_dict(T.load(self.checkpoint_file)) 59 | -------------------------------------------------------------------------------- /DuelingDQN/dueling_dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from deep_q_network import DuelingDeepQNetwork 4 | from replay_memory import ReplayBuffer 5 | 6 | class DuelingDQNAgent(object): 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 10 | self.gamma = gamma 11 | self.epsilon = epsilon 12 | self.lr = lr 13 | self.n_actions = n_actions 14 | self.input_dims = input_dims 15 | self.batch_size = batch_size 16 | self.eps_min = eps_min 17 | self.eps_dec = eps_dec 18 | self.replace_target_cnt = replace 19 | self.algo = algo 20 | self.env_name = env_name 21 | self.chkpt_dir = chkpt_dir 22 | self.action_space = [i for i in range(n_actions)] 23 | self.learn_step_counter = 0 24 | 25 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 26 | 27 | self.q_eval = DuelingDeepQNetwork(self.lr, self.n_actions, 28 | input_dims=self.input_dims, 29 | name=self.env_name+'_'+self.algo+'_q_eval', 30 | chkpt_dir=self.chkpt_dir) 31 | self.q_next = DuelingDeepQNetwork(self.lr, self.n_actions, 32 | input_dims=self.input_dims, 33 | name=self.env_name+'_'+self.algo+'_q_next', 34 | chkpt_dir=self.chkpt_dir) 35 | 36 | def store_transition(self, state, action, reward, state_, done): 37 | self.memory.store_transition(state, action, reward, state_, done) 38 | 39 | def sample_memory(self): 40 | state, action, reward, new_state, done = \ 41 | self.memory.sample_buffer(self.batch_size) 42 | 43 | states = T.tensor(state).to(self.q_eval.device) 44 | rewards = T.tensor(reward).to(self.q_eval.device) 45 | dones = T.tensor(done).to(self.q_eval.device) 46 | actions = T.tensor(action).to(self.q_eval.device) 47 | states_ = T.tensor(new_state).to(self.q_eval.device) 48 | 49 | return states, actions, rewards, states_, dones 50 | 51 | def choose_action(self, observation): 52 | if np.random.random() > self.epsilon: 53 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 54 | _, advantage = self.q_eval.forward(state) 55 | action = T.argmax(advantage).item() 56 | else: 57 | action = np.random.choice(self.action_space) 58 | 59 | 
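        # Acting greedily on the advantage stream alone gives the same
        # action as acting greedily on Q = V + (A - mean(A)): V(s) and
        # mean(A) shift every action's value equally, so the argmax is
        # unchanged.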
return action 60 | 61 | def replace_target_network(self): 62 | if self.replace_target_cnt is not None and \ 63 | self.learn_step_counter % self.replace_target_cnt == 0: 64 | self.q_next.load_state_dict(self.q_eval.state_dict()) 65 | 66 | def decrement_epsilon(self): 67 | self.epsilon = self.epsilon - self.eps_dec \ 68 | if self.epsilon > self.eps_min else self.eps_min 69 | 70 | def learn(self): 71 | if self.memory.mem_cntr < self.batch_size: 72 | return 73 | 74 | self.q_eval.optimizer.zero_grad() 75 | 76 | self.replace_target_network() 77 | 78 | states, actions, rewards, states_, dones = self.sample_memory() 79 | 80 | V_s, A_s = self.q_eval.forward(states) 81 | V_s_, A_s_ = self.q_next.forward(states_) 82 | 83 | indices = np.arange(self.batch_size) 84 | 85 | q_pred = T.add(V_s, 86 | (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions] 87 | q_next = T.add(V_s_, 88 | (A_s_ - A_s_.mean(dim=1, keepdim=True))).max(dim=1)[0] 89 | 90 | q_next[dones] = 0.0 91 | q_target = rewards + self.gamma*q_next 92 | 93 | 94 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 95 | loss.backward() 96 | self.q_eval.optimizer.step() 97 | self.learn_step_counter += 1 98 | 99 | self.decrement_epsilon() 100 | 101 | def save_models(self): 102 | self.q_eval.save_checkpoint() 103 | self.q_next.save_checkpoint() 104 | 105 | def load_models(self): 106 | self.q_eval.load_checkpoint() 107 | self.q_next.load_checkpoint() 108 | -------------------------------------------------------------------------------- /DuelingDQN/main_dueling_dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from dueling_dqn_agent import DuelingDQNAgent 4 | from utils import plot_learning_curve, make_env 5 | 6 | if __name__ == '__main__': 7 | env = make_env('PongNoFrameskip-v4') 8 | best_score = -np.inf 9 | load_checkpoint = False 10 | n_games = 20 11 | agent = DuelingDQNAgent(gamma=0.99, epsilon=1.0, lr=0.0001, 12 | input_dims=(env.observation_space.shape), 13 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 14 | batch_size=32, replace=10000, eps_dec=1e-5, 15 | chkpt_dir='models/', algo='DuelingDQNAgent', 16 | env_name='PongNoFrameskip-v4') 17 | 18 | if load_checkpoint: 19 | agent.load_models() 20 | 21 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) +'_' \ 22 | + str(n_games) + 'games' 23 | figure_file = 'plots/' + fname + '.png' 24 | 25 | n_steps = 0 26 | scores, eps_history, steps_array = [], [], [] 27 | 28 | for i in range(n_games): 29 | done = False 30 | observation = env.reset() 31 | 32 | score = 0 33 | while not done: 34 | action = agent.choose_action(observation) 35 | observation_, reward, done, info = env.step(action) 36 | score += reward 37 | 38 | if not load_checkpoint: 39 | agent.store_transition(observation, action, 40 | reward, observation_, int(done)) 41 | agent.learn() 42 | observation = observation_ 43 | n_steps += 1 44 | scores.append(score) 45 | steps_array.append(n_steps) 46 | 47 | avg_score = np.mean(scores[-100:]) 48 | print('episode: ', i,'score: ', score, 49 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 50 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 51 | 52 | if avg_score > best_score: 53 | if not load_checkpoint: 54 | agent.save_models() 55 | best_score = avg_score 56 | 57 | eps_history.append(agent.epsilon) 58 | if load_checkpoint and n_steps >= 18000: 59 | break 60 | 61 | x = [i+1 for i in range(len(scores))] 62 | plot_learning_curve(steps_array, scores, 
eps_history, figure_file) 63 | -------------------------------------------------------------------------------- /DuelingDQN/models/PongNoFrameskip-v4_DuelingDQNAgent_q_eval: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDQN/models/PongNoFrameskip-v4_DuelingDQNAgent_q_eval -------------------------------------------------------------------------------- /DuelingDQN/models/PongNoFrameskip-v4_DuelingDQNAgent_q_next: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDQN/models/PongNoFrameskip-v4_DuelingDQNAgent_q_next -------------------------------------------------------------------------------- /DuelingDQN/plots/DuelingDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDQN/plots/DuelingDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png -------------------------------------------------------------------------------- /DuelingDQN/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DuelingDQN/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras.optimizers import Adam 4 | import tensorflow.keras as keras 5 | from network import DuelingDeepQNetwork 6 | from replay_memory import ReplayBuffer 7 | 8 | 9 | class Agent: 10 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 11 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 12 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 13 | self.gamma = gamma 14 | self.epsilon = epsilon 15 | self.lr = lr 16 | self.n_actions = n_actions 17 | self.input_dims = input_dims 18 
| self.batch_size = batch_size 19 | self.eps_min = eps_min 20 | self.eps_dec = eps_dec 21 | self.replace_target_cnt = replace 22 | self.algo = algo 23 | self.env_name = env_name 24 | self.chkpt_dir = chkpt_dir 25 | self.action_space = [i for i in range(n_actions)] 26 | self.learn_step_counter = 0 27 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 28 | self.fname = self.chkpt_dir + self.env_name + '_' + self.algo + '_' 29 | 30 | self.q_eval = DuelingDeepQNetwork(input_dims, n_actions) 31 | self.q_eval.compile(optimizer=Adam(learning_rate=lr)) 32 | self.q_next = DuelingDeepQNetwork(input_dims, n_actions) 33 | self.q_next.compile(optimizer=Adam(learning_rate=lr)) 34 | 35 | def save_models(self): 36 | self.q_eval.save(self.fname+'q_eval') 37 | self.q_next.save(self.fname+'q_next') 38 | print('... models saved successfully ...') 39 | 40 | def load_models(self): 41 | self.q_eval = keras.models.load_model(self.fname+'q_eval') 42 | self.q_next = keras.models.load_model(self.fname+'q_next') 43 | print('... models loaded successfully ...') 44 | 45 | def choose_action(self, observation): 46 | if np.random.random() > self.epsilon: 47 | state = tf.convert_to_tensor([observation]) 48 | _, advantage = self.q_eval(state) 49 | action = tf.math.argmax(advantage, axis=1).numpy()[0] 50 | else: 51 | action = np.random.choice(self.action_space) 52 | return action 53 | 54 | def store_transition(self, state, action, reward, state_, done): 55 | self.memory.store_transition(state, action, reward, state_, done) 56 | 57 | def sample_memory(self): 58 | state, action, reward, new_state, done = \ 59 | self.memory.sample_buffer(self.batch_size) 60 | states = tf.convert_to_tensor(state) 61 | rewards = tf.convert_to_tensor(reward) 62 | dones = tf.convert_to_tensor(done) 63 | actions = tf.convert_to_tensor(action, dtype=tf.int32) 64 | states_ = tf.convert_to_tensor(new_state) 65 | return states, actions, rewards, states_, dones 66 | 67 | def replace_target_network(self): 68 | if self.learn_step_counter % self.replace_target_cnt == 0: 69 | self.q_next.set_weights(self.q_eval.get_weights()) 70 | 71 | def decrement_epsilon(self): 72 | self.epsilon = self.epsilon - self.eps_dec \ 73 | if self.epsilon > self.eps_min else self.eps_min 74 | 75 | def learn(self): 76 | if self.memory.mem_cntr < self.batch_size: 77 | return 78 | 79 | self.replace_target_network() 80 | 81 | states, actions, rewards, states_, dones = self.sample_memory() 82 | 83 | indices = tf.range(self.batch_size, dtype=tf.int32) 84 | action_indices = tf.stack([indices, actions], axis=1) 85 | 86 | with tf.GradientTape() as tape: 87 | V_s, A_s = self.q_eval(states) 88 | V_s_, A_s_ = self.q_next(states_) 89 | 90 | advantage = V_s + A_s - tf.reduce_mean(A_s, axis=1, 91 | keepdims=True) 92 | advantage_ = V_s_ + A_s_ - tf.reduce_mean(A_s_, axis=1, 93 | keepdims=True) 94 | q_pred = tf.gather_nd(advantage, indices=action_indices) 95 | 96 | q_next = tf.reduce_max(advantage_, axis=1) 97 | 98 | q_target = rewards + self.gamma*q_next * (1 - dones.numpy()) 99 | loss = keras.losses.MSE(q_pred, q_target) 100 | params = self.q_eval.trainable_variables 101 | grads = tape.gradient(loss, params) 102 | self.q_eval.optimizer.apply_gradients(zip(grads, params)) 103 | self.learn_step_counter += 1 104 | 105 | self.decrement_epsilon() 106 | -------------------------------------------------------------------------------- /DuelingDQN/tf2/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 
| from utils import plot_learning_curve, make_env, manage_memory 4 | from gym import wrappers 5 | 6 | if __name__ == '__main__': 7 | manage_memory() 8 | env = make_env('PongNoFrameskip-v4') 9 | best_score = -np.inf 10 | load_checkpoint = False 11 | record_agent = False 12 | n_games = 250 13 | agent = Agent(gamma=0.99, epsilon=1, lr=0.0001, 14 | input_dims=(env.observation_space.shape), 15 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 16 | batch_size=32, replace=1000, eps_dec=1e-5, 17 | chkpt_dir='models/', algo='DQNAgent', 18 | env_name='PongNoFrameskip-v4') 19 | if load_checkpoint: 20 | agent.load_models() 21 | agent.epsilon = agent.eps_min 22 | 23 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 24 | + str(n_games) + 'games' 25 | figure_file = 'plots/' + fname + '.png' 26 | # if you want to record video of your agent playing, do a 27 | # mkdir video 28 | if record_agent: 29 | env = wrappers.Monitor(env, "video", 30 | video_callable=lambda episode_id: True, 31 | force=True) 32 | n_steps = 0 33 | scores, eps_history, steps_array = [], [], [] 34 | 35 | for i in range(n_games): 36 | done = False 37 | observation = env.reset() 38 | 39 | score = 0 40 | while not done: 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | score += reward 44 | 45 | if not load_checkpoint: 46 | agent.store_transition(observation, action, 47 | reward, observation_, done) 48 | agent.learn() 49 | observation = observation_ 50 | n_steps += 1 51 | scores.append(score) 52 | steps_array.append(n_steps) 53 | 54 | avg_score = np.mean(scores[-100:]) 55 | print('episode {} score {:.1f} avg score {:.1f} ' 56 | 'best score {:.1f} epsilon {:.2f} steps {}'. 57 | format(i, score, avg_score, best_score, agent.epsilon, 58 | n_steps)) 59 | 60 | if score > best_score: 61 | if not load_checkpoint: 62 | agent.save_models() 63 | best_score = score 64 | 65 | eps_history.append(agent.epsilon) 66 | 67 | x = [i+1 for i in range(len(scores))] 68 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 69 | -------------------------------------------------------------------------------- /DuelingDQN/tf2/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Conv2D, Dense, Flatten 3 | 4 | 5 | class DuelingDeepQNetwork(keras.Model): 6 | def __init__(self, input_dims, n_actions): 7 | super(DuelingDeepQNetwork, self).__init__() 8 | self.conv1 = Conv2D(32, 8, strides=(4, 4), activation='relu', 9 | data_format='channels_first', 10 | input_shape=input_dims) 11 | self.conv2 = Conv2D(64, 4, strides=(2, 2), activation='relu', 12 | data_format='channels_first') 13 | self.conv3 = Conv2D(64, 3, strides=(1, 1), activation='relu', 14 | data_format='channels_first') 15 | self.flat = Flatten() 16 | self.fc1 = Dense(512, activation='relu') 17 | self.A = Dense(n_actions, activation=None) 18 | self.V = Dense(1, activation=None) 19 | 20 | def call(self, state): 21 | x = self.conv1(state) 22 | x = self.conv2(x) 23 | x = self.conv3(x) 24 | x = self.flat(x) 25 | x = self.fc1(x) 26 | V = self.V(x) 27 | A = self.A(x) 28 | 29 | return V, A 30 | -------------------------------------------------------------------------------- /DuelingDQN/tf2/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, 
n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DuelingDQN/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | import tensorflow as tf 7 | 8 | 9 | def manage_memory(): 10 | gpus = tf.config.list_physical_devices('GPU') 11 | if gpus: 12 | try: 13 | for gpu in gpus: 14 | tf.config.experimental.set_memory_growth(gpu, True) 15 | except RuntimeError as e: 16 | print(e) 17 | 18 | 19 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 20 | fig=plt.figure() 21 | ax=fig.add_subplot(111, label="1") 22 | ax2=fig.add_subplot(111, label="2", frame_on=False) 23 | 24 | ax.plot(x, epsilons, color="C0") 25 | ax.set_xlabel("Training Steps", color="C0") 26 | ax.set_ylabel("Epsilon", color="C0") 27 | ax.tick_params(axis='x', colors="C0") 28 | ax.tick_params(axis='y', colors="C0") 29 | 30 | N = len(scores) 31 | running_avg = np.empty(N) 32 | for t in range(N): 33 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 34 | 35 | ax2.scatter(x, running_avg, color="C1") 36 | ax2.axes.get_xaxis().set_visible(False) 37 | ax2.yaxis.tick_right() 38 | ax2.set_ylabel('Score', color="C1") 39 | ax2.yaxis.set_label_position('right') 40 | ax2.tick_params(axis='y', colors="C1") 41 | 42 | if lines is not None: 43 | for line in lines: 44 | plt.axvline(x=line) 45 | 46 | plt.savefig(filename) 47 | 48 | class RepeatActionAndMaxFrame(gym.Wrapper): 49 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 50 | fire_first=False): 51 | super(RepeatActionAndMaxFrame, self).__init__(env) 52 | self.repeat = repeat 53 | self.shape = env.observation_space.low.shape 54 | self.frame_buffer = np.zeros_like((2, self.shape)) 55 | self.clip_reward = clip_reward 56 | self.no_ops = no_ops 57 | self.fire_first = fire_first 58 | 59 | def step(self, action): 60 | t_reward = 0.0 61 | done = False 62 | for i in range(self.repeat): 63 | obs, reward, done, info = self.env.step(action) 64 | if self.clip_reward: 65 | reward = np.clip(np.array([reward]), -1, 1)[0] 66 | t_reward += reward 67 | idx = i % 2 68 | self.frame_buffer[idx] = obs 69 | if done: 70 | break 71 | 72 | max_frame = np.maximum(self.frame_buffer[0], 
self.frame_buffer[1]) 73 | return max_frame, t_reward, done, info 74 | 75 | def reset(self): 76 | obs = self.env.reset() 77 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 78 | for _ in range(no_ops): 79 | _, _, done, _ = self.env.step(0) 80 | if done: 81 | self.env.reset() 82 | if self.fire_first: 83 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 84 | obs, _, _, _ = self.env.step(1) 85 | 86 | self.frame_buffer = np.zeros_like((2,self.shape)) 87 | self.frame_buffer[0] = obs 88 | 89 | return obs 90 | 91 | class PreprocessFrame(gym.ObservationWrapper): 92 | def __init__(self, shape, env=None): 93 | super(PreprocessFrame, self).__init__(env) 94 | self.shape = (shape[2], shape[0], shape[1]) 95 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 96 | shape=self.shape, dtype=np.float32) 97 | 98 | def observation(self, obs): 99 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 100 | resized_screen = cv2.resize(new_frame, self.shape[1:], 101 | interpolation=cv2.INTER_AREA) 102 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 103 | new_obs = new_obs / 255.0 104 | 105 | return new_obs 106 | 107 | class StackFrames(gym.ObservationWrapper): 108 | def __init__(self, env, repeat): 109 | super(StackFrames, self).__init__(env) 110 | self.observation_space = gym.spaces.Box( 111 | env.observation_space.low.repeat(repeat, axis=0), 112 | env.observation_space.high.repeat(repeat, axis=0), 113 | dtype=np.float32) 114 | self.stack = collections.deque(maxlen=repeat) 115 | 116 | def reset(self): 117 | self.stack.clear() 118 | observation = self.env.reset() 119 | for _ in range(self.stack.maxlen): 120 | self.stack.append(observation) 121 | 122 | return np.array(self.stack).reshape(self.observation_space.low.shape) 123 | 124 | def observation(self, observation): 125 | self.stack.append(observation) 126 | 127 | return np.array(self.stack).reshape(self.observation_space.low.shape) 128 | 129 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 130 | no_ops=0, fire_first=False): 131 | env = gym.make(env_name) 132 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 133 | env = PreprocessFrame(shape, env) 134 | env = StackFrames(env, repeat) 135 | 136 | return env 137 | -------------------------------------------------------------------------------- /DuelingDQN/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class 
RepeatActionAndMaxFrame(gym.Wrapper): 37 | """ modified from: 38 | https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py 39 | """ 40 | def __init__(self, env=None, repeat=4): 41 | super(RepeatActionAndMaxFrame, self).__init__(env) 42 | self.repeat = repeat 43 | self.shape = env.observation_space.low.shape 44 | self.frame_buffer = np.zeros_like((2,self.shape)) 45 | 46 | def step(self, action): 47 | t_reward = 0.0 48 | done = False 49 | for i in range(self.repeat): 50 | obs, reward, done, info = self.env.step(action) 51 | t_reward += reward 52 | idx = i % 2 53 | self.frame_buffer[idx] = obs 54 | if done: 55 | break 56 | 57 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 58 | return max_frame, t_reward, done, info 59 | 60 | def reset(self): 61 | obs = self.env.reset() 62 | self.frame_buffer = np.zeros_like((2,self.shape)) 63 | self.frame_buffer[0] = obs 64 | return obs 65 | 66 | class PreprocessFrame(gym.ObservationWrapper): 67 | def __init__(self, shape, env=None): 68 | super(PreprocessFrame, self).__init__(env) 69 | self.shape=(shape[2], shape[0], shape[1]) 70 | self.observation_space = gym.spaces.Box(low=0, high=1.0, 71 | shape=self.shape,dtype=np.float32) 72 | def observation(self, obs): 73 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 74 | resized_screen = cv2.resize(new_frame, self.shape[1:], 75 | interpolation=cv2.INTER_AREA) 76 | 77 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 78 | new_obs = np.swapaxes(new_obs, 2,0) 79 | new_obs = new_obs / 255.0 80 | return new_obs 81 | 82 | class StackFrames(gym.ObservationWrapper): 83 | def __init__(self, env, n_steps): 84 | super(StackFrames, self).__init__(env) 85 | self.observation_space = gym.spaces.Box( 86 | env.observation_space.low.repeat(n_steps, axis=0), 87 | env.observation_space.high.repeat(n_steps, axis=0), 88 | dtype=np.float32) 89 | self.stack = collections.deque(maxlen=n_steps) 90 | 91 | def reset(self): 92 | self.stack.clear() 93 | observation = self.env.reset() 94 | for _ in range(self.stack.maxlen): 95 | self.stack.append(observation) 96 | 97 | return np.array(self.stack).reshape(self.observation_space.low.shape) 98 | 99 | def observation(self, observation): 100 | self.stack.append(observation) 101 | obs = np.array(self.stack).reshape(self.observation_space.low.shape) 102 | 103 | return obs 104 | 105 | def make_env(env_name, shape=(84,84,1), skip=4): 106 | env = gym.make(env_name) 107 | env = RepeatActionAndMaxFrame(env, skip) 108 | env = PreprocessFrame(shape, env) 109 | env = StackFrames(env, skip) 110 | 111 | return env 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Phil Tabor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep-Q-Learning-Paper-To-Code 2 | 3 | Code for my course at Udemy: 4 | 5 | https://www.udemy.com/course/deep-q-learning-from-paper-to-code/?referralCode=CBA45A3B737237E7BFD2 6 | 7 | We analyze and implement the following papers: 8 | 9 | Human Level Control Through Deep Reinforcement Learning 10 | 11 | https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf 12 | 13 | Deep Reinforcement Learning with Double Q Learning: 14 | 15 | https://arxiv.org/abs/1509.06461 16 | 17 | Dueling Network Architectures for Deep Reinforcement Learning: 18 | 19 | https://arxiv.org/abs/1511.06581 20 | 21 | The course is still in review, and this readme is a work in progress. 22 | 23 | Better docs to come! 24 | -------------------------------------------------------------------------------- /agents.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from deep_q_network import DeepQNetwork, DuelingDeepQNetwork 4 | from replay_memory import ReplayBuffer 5 | 6 | class Agent(): 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 10 | self.gamma = gamma 11 | self.epsilon = epsilon 12 | self.lr = lr 13 | self.n_actions = n_actions 14 | self.input_dims = input_dims 15 | self.eps_min = eps_min 16 | self.eps_dec = eps_dec 17 | self.action_space = [i for i in range(n_actions)] 18 | self.learn_step_counter = 0 19 | self.batch_size = batch_size 20 | self.replace_target_cnt = replace 21 | self.algo = algo 22 | self.env_name = env_name 23 | self.chkpt_dir = chkpt_dir 24 | 25 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 26 | 27 | def store_transition(self, state, action, reward, state_, done): 28 | self.memory.store_transition(state, action, reward, state_, done) 29 | 30 | def choose_action(self, observation): 31 | raise NotImplementedError 32 | 33 | def replace_target_network(self): 34 | if self.learn_step_counter % self.replace_target_cnt == 0: 35 | self.q_next.load_state_dict(self.q_eval.state_dict()) 36 | 37 | def decrement_epsilon(self): 38 | self.epsilon = self.epsilon - self.eps_dec \ 39 | if self.epsilon > self.eps_min else self.eps_min 40 | def sample_memory(self): 41 | state, action, reward, new_state, done = \ 42 | self.memory.sample_buffer(self.batch_size) 43 | 44 | states = T.tensor(state).to(self.q_eval.device) 45 | rewards = T.tensor(reward).to(self.q_eval.device) 46 | dones = T.tensor(done).to(self.q_eval.device) 47 | actions = T.tensor(action).to(self.q_eval.device) 48 | states_ = T.tensor(new_state).to(self.q_eval.device) 49 | 50 | return states, actions, rewards, states_, dones 51 | 52 | def learn(self): 53 | raise NotImplementedError 54 | 55 | def save_models(self): 56 | 
self.q_eval.save_checkpoint() 57 | self.q_next.save_checkpoint() 58 | 59 | def load_models(self): 60 | self.q_eval.load_checkpoint() 61 | self.q_next.load_checkpoint() 62 | 63 | class DQNAgent(Agent): 64 | def __init__(self, *args, **kwargs): 65 | super(DQNAgent, self).__init__(*args, **kwargs) 66 | 67 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 68 | input_dims=self.input_dims, 69 | name=self.env_name+'_'+self.algo+'_q_eval', 70 | chkpt_dir=self.chkpt_dir) 71 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 72 | input_dims=self.input_dims, 73 | name=self.env_name+'_'+self.algo+'_q_next', 74 | chkpt_dir=self.chkpt_dir) 75 | 76 | def choose_action(self, observation): 77 | if np.random.random() > self.epsilon: 78 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 79 | actions = self.q_eval.forward(state) 80 | action = T.argmax(actions).item() 81 | else: 82 | action = np.random.choice(self.action_space) 83 | 84 | return action 85 | 86 | def learn(self): 87 | if self.memory.mem_cntr < self.batch_size: 88 | return 89 | 90 | self.q_eval.optimizer.zero_grad() 91 | 92 | self.replace_target_network() 93 | 94 | states, actions, rewards, states_, dones = self.sample_memory() 95 | indices = np.arange(self.batch_size) 96 | 97 | q_pred = self.q_eval.forward(states)[indices, actions] 98 | 99 | q_next = self.q_next.forward(states_).max(dim=1)[0] 100 | q_next[dones] = 0.0 101 | 102 | q_target = rewards + self.gamma*q_next 103 | 104 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 105 | loss.backward() 106 | self.q_eval.optimizer.step() 107 | self.learn_step_counter += 1 108 | 109 | self.decrement_epsilon() 110 | 111 | class DDQNAgent(Agent): 112 | def __init__(self, *args, **kwargs): 113 | super(DDQNAgent, self).__init__(*args, **kwargs) 114 | 115 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 116 | input_dims=self.input_dims, 117 | name=self.env_name+'_'+self.algo+'_q_eval', 118 | chkpt_dir=self.chkpt_dir) 119 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 120 | input_dims=self.input_dims, 121 | name=self.env_name+'_'+self.algo+'_q_next', 122 | chkpt_dir=self.chkpt_dir) 123 | 124 | def choose_action(self, observation): 125 | if np.random.random() > self.epsilon: 126 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 127 | actions = self.q_eval.forward(state) 128 | action = T.argmax(actions).item() 129 | else: 130 | action = np.random.choice(self.action_space) 131 | 132 | return action 133 | 134 | def learn(self): 135 | if self.memory.mem_cntr < self.batch_size: 136 | return 137 | 138 | self.q_eval.optimizer.zero_grad() 139 | 140 | self.replace_target_network() 141 | 142 | states, actions, rewards, states_, dones = self.sample_memory() 143 | indices = np.arange(self.batch_size) 144 | 145 | q_pred = self.q_eval.forward(states)[indices, actions] 146 | q_next = self.q_next.forward(states_) 147 | q_eval = self.q_eval.forward(states_) 148 | 149 | max_actions = T.argmax(q_eval, dim=1) 150 | q_next[dones] = 0.0 151 | 152 | q_target = rewards + self.gamma*q_next[indices, max_actions] 153 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 154 | loss.backward() 155 | 156 | self.q_eval.optimizer.step() 157 | self.learn_step_counter += 1 158 | 159 | self.decrement_epsilon() 160 | 161 | class DuelingDQNAgent(Agent): 162 | def __init__(self, *args, **kwargs): 163 | super(DuelingDQNAgent, self).__init__(*args, **kwargs) 164 | 165 | self.q_eval = DuelingDeepQNetwork(self.lr, self.n_actions, 166 | 
input_dims=self.input_dims, 167 | name=self.env_name+'_'+self.algo+'_q_eval', 168 | chkpt_dir=self.chkpt_dir) 169 | self.q_next = DuelingDeepQNetwork(self.lr, self.n_actions, 170 | input_dims=self.input_dims, 171 | name=self.env_name+'_'+self.algo+'_q_next', 172 | chkpt_dir=self.chkpt_dir) 173 | 174 | def choose_action(self, observation): 175 | if np.random.random() > self.epsilon: 176 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 177 | _, advantage = self.q_eval.forward(state) 178 | action = T.argmax(advantage).item() 179 | else: 180 | action = np.random.choice(self.action_space) 181 | 182 | return action 183 | 184 | def learn(self): 185 | if self.memory.mem_cntr < self.batch_size: 186 | return 187 | 188 | self.q_eval.optimizer.zero_grad() 189 | 190 | self.replace_target_network() 191 | 192 | states, actions, rewards, states_, dones = self.sample_memory() 193 | indices = np.arange(self.batch_size) 194 | 195 | V_s, A_s = self.q_eval.forward(states) 196 | V_s_, A_s_ = self.q_next.forward(states_) 197 | 198 | q_pred = T.add(V_s, 199 | (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions] 200 | q_next = T.add(V_s_, 201 | (A_s_ - A_s_.mean(dim=1, keepdim=True))).max(dim=1)[0] 202 | 203 | q_next[dones] = 0.0 204 | q_target = rewards + self.gamma*q_next 205 | 206 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 207 | loss.backward() 208 | self.q_eval.optimizer.step() 209 | 210 | self.learn_step_counter += 1 211 | self.decrement_epsilon() 212 | 213 | class DuelingDDQNAgent(Agent): 214 | def __init__(self, *args, **kwargs): 215 | super(DuelingDDQNAgent, self).__init__(*args, **kwargs) 216 | 217 | self.q_eval = DuelingDeepQNetwork(self.lr, self.n_actions, 218 | input_dims=self.input_dims, 219 | name=self.env_name+'_'+self.algo+'_q_eval', 220 | chkpt_dir=self.chkpt_dir) 221 | self.q_next = DuelingDeepQNetwork(self.lr, self.n_actions, 222 | input_dims=self.input_dims, 223 | name=self.env_name+'_'+self.algo+'_q_next', 224 | chkpt_dir=self.chkpt_dir) 225 | 226 | def choose_action(self, observation): 227 | if np.random.random() > self.epsilon: 228 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 229 | _, advantage = self.q_eval.forward(state) 230 | action = T.argmax(advantage).item() 231 | else: 232 | action = np.random.choice(self.action_space) 233 | 234 | return action 235 | 236 | def learn(self): 237 | if self.memory.mem_cntr < self.batch_size: 238 | return 239 | 240 | self.q_eval.optimizer.zero_grad() 241 | 242 | self.replace_target_network() 243 | 244 | states, actions, rewards, states_, dones = self.sample_memory() 245 | 246 | indices = np.arange(self.batch_size) 247 | 248 | V_s, A_s = self.q_eval.forward(states) 249 | V_s_, A_s_ = self.q_next.forward(states_) 250 | 251 | V_s_eval, A_s_eval = self.q_eval.forward(states_) 252 | 253 | q_pred = T.add(V_s, 254 | (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions] 255 | q_next = T.add(V_s_, (A_s_ - A_s_.mean(dim=1, keepdim=True))) 256 | 257 | q_eval = T.add(V_s_eval, 258 | (A_s_eval - A_s_eval.mean(dim=1, keepdim=True))) 259 | 260 | max_actions = T.argmax(q_eval, dim=1) 261 | q_next[dones] = 0.0 262 | 263 | q_target = rewards + self.gamma*q_next[indices, max_actions] 264 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 265 | loss.backward() 266 | self.q_eval.optimizer.step() 267 | self.learn_step_counter += 1 268 | 269 | self.decrement_epsilon() 270 | -------------------------------------------------------------------------------- /argparse_example.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | parser = argparse.ArgumentParser(description='') 4 | 5 | # type can be int, str, float, bool, etc. 6 | # this argument is optional 7 | parser.add_argument('-argument', type=dtype, default=x, help='help string') 8 | 9 | # this argument is not optional 10 | parser.add_argument('argument', type=dtype, default=x, help='help string') 11 | 12 | # parse the args. 13 | args = parser.parse_args() 14 | 15 | # access parameters like this 16 | variable = args.argument 17 | -------------------------------------------------------------------------------- /deep_q_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class DeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 15 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 16 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 17 | 18 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 19 | 20 | self.fc1 = nn.Linear(fc_input_dims, 512) 21 | self.fc2 = nn.Linear(512, n_actions) 22 | 23 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 24 | 25 | self.loss = nn.MSELoss() 26 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 27 | self.to(self.device) 28 | 29 | def calculate_conv_output_dims(self, input_dims): 30 | state = T.zeros(1, *input_dims) 31 | dims = self.conv1(state) 32 | dims = self.conv2(dims) 33 | dims = self.conv3(dims) 34 | return int(np.prod(dims.size())) 35 | 36 | def forward(self, state): 37 | conv1 = F.relu(self.conv1(state)) 38 | conv2 = F.relu(self.conv2(conv1)) 39 | conv3 = F.relu(self.conv3(conv2)) 40 | conv_state = conv3.view(conv3.size()[0], -1) 41 | 42 | flat1 = F.relu(self.fc1(conv_state)) 43 | actions = self.fc2(flat1) 44 | 45 | return actions 46 | 47 | def save_checkpoint(self): 48 | print('... saving checkpoint ...') 49 | T.save(self.state_dict(), self.checkpoint_file) 50 | 51 | def load_checkpoint(self): 52 | print('... 
loading checkpoint ...') 53 | self.load_state_dict(T.load(self.checkpoint_file)) 54 | 55 | class DuelingDeepQNetwork(nn.Module): 56 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 57 | super(DuelingDeepQNetwork, self).__init__() 58 | 59 | self.checkpoint_dir = chkpt_dir 60 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 61 | 62 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 63 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 64 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 65 | 66 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 67 | 68 | self.fc1 = nn.Linear(fc_input_dims, 512) 69 | 70 | self.V = nn.Linear(512, 1) 71 | self.A = nn.Linear(512, n_actions) 72 | 73 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 74 | self.loss = nn.MSELoss() 75 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 76 | self.to(self.device) 77 | 78 | def calculate_conv_output_dims(self, input_dims): 79 | state = T.zeros(1, *input_dims) 80 | dims = self.conv1(state) 81 | dims = self.conv2(dims) 82 | dims = self.conv3(dims) 83 | return int(np.prod(dims.size())) 84 | 85 | def forward(self, state): 86 | conv1 = F.relu(self.conv1(state)) 87 | conv2 = F.relu(self.conv2(conv1)) 88 | conv3 = F.relu(self.conv3(conv2)) 89 | conv_state = conv3.view(conv3.size()[0], -1) 90 | flat1 = F.relu(self.fc1(conv_state)) 91 | 92 | V = self.V(flat1) 93 | A = self.A(flat1) 94 | 95 | return V, A 96 | 97 | def save_checkpoint(self): 98 | print('... saving checkpoint ...') 99 | T.save(self.state_dict(), self.checkpoint_file) 100 | 101 | def load_checkpoint(self): 102 | print('... loading checkpoint ...') 103 | self.load_state_dict(T.load(self.checkpoint_file)) 104 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse, os 2 | import gym 3 | import numpy as np 4 | import agents as Agents 5 | from utils import plot_learning_curve, make_env 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser( 9 | description='Deep Q Learning: From Paper to Code') 10 | # the hyphen makes the argument optional 11 | parser.add_argument('-n_games', type=int, default=1, 12 | help='Number of games to play') 13 | parser.add_argument('-lr', type=float, default=0.0001, 14 | help='Learning rate for optimizer') 15 | parser.add_argument('-eps_min', type=float, default=0.1, 16 | help='Minimum value for epsilon in epsilon-greedy action selection') 17 | parser.add_argument('-gamma', type=float, default=0.99, 18 | help='Discount factor for update equation.') 19 | parser.add_argument('-eps_dec', type=float, default=1e-5, 20 | help='Linear factor for decreasing epsilon') 21 | parser.add_argument('-eps', type=float, default=1.0, 22 | help='Starting value for epsilon in epsilon-greedy action selection') 23 | parser.add_argument('-max_mem', type=int, default=50000, #~13Gb 24 | help='Maximum size for memory replay buffer') 25 | parser.add_argument('-repeat', type=int, default=4, 26 | help='Number of frames to repeat & stack') 27 | parser.add_argument('-bs', type=int, default=32, 28 | help='Batch size for replay memory sampling') 29 | parser.add_argument('-replace', type=int, default=1000, 30 | help='interval for replacing target network') 31 | parser.add_argument('-env', type=str, default='PongNoFrameskip-v4', 32 | help='Atari environment.\nPongNoFrameskip-v4\n \ 33 | BreakoutNoFrameskip-v4\n \ 34 | SpaceInvadersNoFrameskip-v4\n \ 35 | 
EnduroNoFrameskip-v4\n \ 36 | AtlantisNoFrameskip-v4') 37 | parser.add_argument('-gpu', type=str, default='0', help='GPU: 0 or 1') 38 | parser.add_argument('-load_checkpoint', type=bool, default=False, 39 | help='load model checkpoint') 40 | parser.add_argument('-path', type=str, default='models/', 41 | help='path for model saving/loading') 42 | parser.add_argument('-algo', type=str, default='DQNAgent', 43 | help='DQNAgent/DDQNAgent/DuelingDQNAgent/DuelingDDQNAgent') 44 | parser.add_argument('-clip_rewards', type=bool, default=False, 45 | help='Clip rewards to range -1 to 1') 46 | parser.add_argument('-no_ops', type=int, default=0, 47 | help='Max number of no ops for testing') 48 | parser.add_argument('-fire_first', type=bool, default=False, 49 | help='Set first action of episode to fire') 50 | args = parser.parse_args() 51 | 52 | os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 53 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 54 | 55 | env = make_env(env_name=args.env, repeat=args.repeat, 56 | clip_rewards=args.clip_rewards, no_ops=args.no_ops, 57 | fire_first=args.fire_first) 58 | 59 | best_score = -np.inf 60 | agent_ = getattr(Agents, args.algo) 61 | agent = agent_(gamma=args.gamma, 62 | epsilon=args.eps, 63 | lr=args.lr, 64 | input_dims=env.observation_space.shape, 65 | n_actions=env.action_space.n, 66 | mem_size=args.max_mem, 67 | eps_min=args.eps_min, 68 | batch_size=args.bs, 69 | replace=args.replace, 70 | eps_dec=args.eps_dec, 71 | chkpt_dir=args.path, 72 | algo=args.algo, 73 | env_name=args.env) 74 | 75 | if args.load_checkpoint: 76 | agent.load_models() 77 | 78 | fname = args.algo + '_' + args.env + '_alpha' + str(args.lr) +'_' \ 79 | + str(args.n_games) + 'games' 80 | figure_file = 'plots/' + fname + '.png' 81 | scores_file = fname + '_scores.npy' 82 | 83 | scores, eps_history = [], [] 84 | n_steps = 0 85 | steps_array = [] 86 | for i in range(args.n_games): 87 | done = False 88 | observation = env.reset() 89 | score = 0 90 | while not done: 91 | action = agent.choose_action(observation) 92 | observation_, reward, done, info = env.step(action) 93 | score += reward 94 | 95 | if not args.load_checkpoint: 96 | agent.store_transition(observation, action, 97 | reward, observation_, int(done)) 98 | agent.learn() 99 | observation = observation_ 100 | n_steps += 1 101 | scores.append(score) 102 | steps_array.append(n_steps) 103 | 104 | avg_score = np.mean(scores[-100:]) 105 | print('episode: ', i,'score: ', score, 106 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 107 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 108 | 109 | if avg_score > best_score: 110 | if not args.load_checkpoint: 111 | agent.save_models() 112 | best_score = avg_score 113 | 114 | eps_history.append(agent.epsilon) 115 | if args.load_checkpoint and n_steps >= 18000: 116 | break 117 | 118 | x = [i+1 for i in range(len(scores))] 119 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 120 | #np.save(scores_file, np.array(scores)) 121 | -------------------------------------------------------------------------------- /naive_deep_q_learning/cartpole_naive_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/naive_deep_q_learning/cartpole_naive_dqn.png -------------------------------------------------------------------------------- /naive_deep_q_learning/cartpole_naive_dqn.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import torch as T 7 | from util import plot_learning_curve 8 | 9 | class LinearDeepQNetwork(nn.Module): 10 | def __init__(self, lr, n_actions, input_dims): 11 | super(LinearDeepQNetwork, self).__init__() 12 | 13 | self.fc1 = nn.Linear(*input_dims, 128) 14 | self.fc2 = nn.Linear(128, n_actions) 15 | 16 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 17 | self.loss = nn.MSELoss() 18 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 19 | self.to(self.device) 20 | 21 | def forward(self, state): 22 | layer1 = F.relu(self.fc1(state)) 23 | actions = self.fc2(layer1) 24 | 25 | return actions 26 | 27 | 28 | class Agent(): 29 | def __init__(self, input_dims, n_actions, lr, gamma=0.99, 30 | epsilon=1.0, eps_dec=1e-5, eps_min=0.01): 31 | self.lr = lr 32 | self.input_dims = input_dims 33 | self.n_actions = n_actions 34 | self.gamma = gamma 35 | self.epsilon = epsilon 36 | self.eps_dec = eps_dec 37 | self.eps_min = eps_min 38 | self.action_space = [i for i in range(self.n_actions)] 39 | 40 | self.Q = LinearDeepQNetwork(self.lr, self.n_actions, self.input_dims) 41 | 42 | def choose_action(self, observation): 43 | if np.random.random() > self.epsilon: 44 | state = T.tensor(observation, dtype=T.float).to(self.Q.device) 45 | actions = self.Q.forward(state) 46 | action = T.argmax(actions).item() 47 | else: 48 | action = np.random.choice(self.action_space) 49 | 50 | return action 51 | 52 | def decrement_epsilon(self): 53 | self.epsilon = self.epsilon - self.eps_dec \ 54 | if self.epsilon > self.eps_min else self.eps_min 55 | 56 | def learn(self, state, action, reward, state_): 57 | self.Q.optimizer.zero_grad() 58 | states = T.tensor(state, dtype=T.float).to(self.Q.device) 59 | actions = T.tensor(action).to(self.Q.device) 60 | rewards = T.tensor(reward).to(self.Q.device) 61 | states_ = T.tensor(state_, dtype=T.float).to(self.Q.device) 62 | 63 | q_pred = self.Q.forward(states)[actions] 64 | 65 | q_next = self.Q.forward(states_).max() 66 | 67 | q_target = rewards + self.gamma*q_next 68 | 69 | loss = self.Q.loss(q_target, q_pred).to(self.Q.device) 70 | loss.backward() 71 | self.Q.optimizer.step() 72 | self.decrement_epsilon() 73 | 74 | if __name__ == '__main__': 75 | env = gym.make('CartPole-v1') 76 | n_games = 10000 77 | scores = [] 78 | eps_history = [] 79 | 80 | agent = Agent(lr=0.0001, input_dims=env.observation_space.shape, 81 | n_actions=env.action_space.n) 82 | 83 | for i in range(n_games): 84 | score = 0 85 | done = False 86 | obs = env.reset() 87 | 88 | while not done: 89 | action = agent.choose_action(obs) 90 | obs_, reward, done, info = env.step(action) 91 | score += reward 92 | agent.learn(obs, action, reward, obs_) 93 | obs = obs_ 94 | scores.append(score) 95 | eps_history.append(agent.epsilon) 96 | 97 | if i % 100 == 0: 98 | avg_score = np.mean(scores[-100:]) 99 | print('episode ', i, 'score %.1f avg score %.1f epsilon %.2f' % 100 | (score, avg_score, agent.epsilon)) 101 | filename = 'cartpole_naive_dqn.png' 102 | x = [i+1 for i in range(n_games)] 103 | plot_learning_curve(x, scores, eps_history, filename) 104 | -------------------------------------------------------------------------------- /naive_deep_q_learning/pytorch_example.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import 
torch.nn.functional as F 3 | import torch.optim as optim 4 | import torch as T 5 | 6 | class LinearClassifier(nn.Module): 7 | def __init__(self, lr, n_classes, input_dims): 8 | super(LinearClassifier, self).__init__() 9 | 10 | self.fc1 = nn.Linear(*input_dims, 128) 11 | self.fc2 = nn.Linear(128, 256) 12 | self.fc3 = nn.Linear(256, n_classes) 13 | 14 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 15 | self.loss = nn.CrossEntropyLoss() 16 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 17 | self.to(self.device) 18 | 19 | def forward(self, data): 20 | layer1 = F.sigmoid(self.fc1(data)) 21 | layer2 = F.sigmoid(self.fc2(layer1)) 22 | layer3 = self.fc3(layer2) 23 | 24 | return layer3 25 | 26 | def learn(self, data, labels): 27 | self.optimizer.zero_grad() 28 | data = T.tensor(data).to(self.device) 29 | labels = T.tensor(labels).to(self.device) 30 | 31 | predictions = self.forward(data) 32 | 33 | cost = self.loss(predictions, labels) 34 | 35 | cost.backward() 36 | self.optimizer.step() 37 | -------------------------------------------------------------------------------- /naive_deep_q_learning/util.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def plot_learning_curve(x, scores, epsilons, filename): 5 | fig = plt.figure() 6 | ax = fig.add_subplot(111, label="1") 7 | ax2 = fig.add_subplot(111, label="2", frame_on=False) 8 | 9 | ax.plot(x, epsilons, color="C0") 10 | ax.set_xlabel("Training Steps", color="C0") 11 | ax.set_ylabel("Epsilon", color="C0") 12 | ax.tick_params(axis='x', colors="C0") 13 | ax.tick_params(axis='y', colors="C0") 14 | 15 | N = len(scores) 16 | running_avg = np.empty(N) 17 | for t in range(N): 18 | running_avg[t] = np.mean(scores[max(0, t-100):(t+1)]) 19 | 20 | ax2.scatter(x, running_avg, color="C1") 21 | ax2.axes.get_xaxis().set_visible(False) 22 | ax2.yaxis.tick_right() 23 | ax2.set_ylabel('Score', color="C1") 24 | ax2.yaxis.set_label_position('right') 25 | ax2.tick_params(axis='y', colors="C1") 26 | 27 | plt.savefig(filename) 28 | -------------------------------------------------------------------------------- /q_learning/frozen_lake_deterministic_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # LEFT = 0 DOWN = 1 RIGHT = 2 UP = 3 6 | # SFFF 7 | # FHFH 8 | # FFFH 9 | # HFFG 10 | 11 | policy = {0: 1, 1: 2, 2: 1, 3: 0, 4: 1, 6: 1, 8: 2, 9: 1, 10: 1, 13: 2, 14: 2} 12 | 13 | env = gym.make('FrozenLake-v0') 14 | n_games = 1000 15 | win_pct = [] 16 | scores = [] 17 | 18 | for i in range(n_games): 19 | done = False 20 | obs = env.reset() 21 | score = 0 22 | while not done: 23 | action = policy[obs] 24 | obs, reward, done, info = env.step(action) 25 | score += reward 26 | scores.append(score) 27 | if i % 10 == 0: 28 | average = np.mean(scores[-10:]) 29 | win_pct.append(average) 30 | plt.plot(win_pct) 31 | plt.show() 32 | -------------------------------------------------------------------------------- /q_learning/frozen_lake_env_test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('FrozenLake-v0') 6 | # LEFT = 0 DOWN = 1 RIGHT = 2 UP = 3 7 | # SFFF 8 | # FHFH 9 | # FFFH 10 | # HFFG 11 | policy = {0: 1, 1: 2, 2: 1, 3: 0, 4:1, 6: 1, 8:2, 9:1, 10:1, 13: 2, 14:2} 12 | 13 | n_games = 1000 14 | 
win_pct = [] 15 | scores = [] 16 | for i in range(n_games): 17 | done = False 18 | obs = env.reset() 19 | score = 0 20 | while not done: 21 | action = env.action_space.sample() 22 | #action = policy[obs] 23 | obs, reward, done, info = env.step(action) 24 | score += reward 25 | scores.append(score) 26 | if i % 10 == 0: 27 | average = np.mean(scores[-10:]) 28 | win_pct.append(average) 29 | plt.plot(win_pct) 30 | plt.show() 31 | -------------------------------------------------------------------------------- /q_learning/frozen_lake_q_learning.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from q_learning_agent import Agent 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('FrozenLake-v0') 8 | agent = Agent(lr=0.001, gamma=0.9, eps_start=1.0, eps_end=0.01, 9 | eps_dec=0.9999995, n_actions=4, n_states=16) 10 | 11 | scores = [] 12 | win_pct_list = [] 13 | n_games = 500000 14 | 15 | for i in range(n_games): 16 | done = False 17 | observation = env.reset() 18 | score = 0 19 | while not done: 20 | action = agent.choose_action(observation) 21 | observation_, reward, done, info = env.step(action) 22 | agent.learn(observation, action, reward, observation_) 23 | score += reward 24 | observation = observation_ 25 | scores.append(score) 26 | if i % 100 == 0: 27 | win_pct = np.mean(scores[-100:]) 28 | win_pct_list.append(win_pct) 29 | if i % 1000 == 0: 30 | print('episode ', i, 'win pct %.2f' % win_pct, 31 | 'epsilon %.2f' % agent.epsilon) 32 | plt.plot(win_pct_list) 33 | plt.show() 34 | -------------------------------------------------------------------------------- /q_learning/frozen_lake_random_agent.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('FrozenLake-v0') 6 | 7 | n_games = 1000 8 | win_pct = [] 9 | scores = [] 10 | for i in range(n_games): 11 | done = False 12 | obs = env.reset() 13 | score = 0 14 | while not done: 15 | action = env.action_space.sample() 16 | obs, reward, done, info = env.step(action) 17 | score += reward 18 | scores.append(score) 19 | 20 | if i % 10 == 0: 21 | average = np.mean(scores[-10:]) 22 | win_pct.append(average) 23 | plt.plot(win_pct) 24 | plt.show() 25 | -------------------------------------------------------------------------------- /q_learning/plots/frozen_lake_deterministic_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/q_learning/plots/frozen_lake_deterministic_policy.png -------------------------------------------------------------------------------- /q_learning/plots/frozen_lake_q_learning_agent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/q_learning/plots/frozen_lake_q_learning_agent.png -------------------------------------------------------------------------------- /q_learning/plots/frozen_lake_random_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/q_learning/plots/frozen_lake_random_policy.png 
-------------------------------------------------------------------------------- /q_learning/q_learning_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(): 4 | def __init__(self, lr, gamma, n_actions, n_states, eps_start, eps_end, 5 | eps_dec): 6 | self.lr = lr 7 | self.gamma = gamma 8 | self.n_actions = n_actions 9 | self.n_states = n_states 10 | self.epsilon = eps_start 11 | self.eps_min = eps_end 12 | self.eps_dec = eps_dec 13 | 14 | self.Q = {} 15 | 16 | self.init_Q() 17 | 18 | def init_Q(self): 19 | for state in range(self.n_states): 20 | for action in range(self.n_actions): 21 | self.Q[(state, action)] = 0.0 22 | 23 | def choose_action(self, state): 24 | if np.random.random() < self.epsilon: 25 | action = np.random.choice([i for i in range(self.n_actions)]) 26 | else: 27 | actions = np.array([self.Q[(state, a)] \ 28 | for a in range(self.n_actions)]) 29 | action = np.argmax(actions) 30 | return action 31 | 32 | def decrement_epsilon(self): 33 | self.epsilon = self.epsilon*self.eps_dec if self.epsilon>self.eps_min\ 34 | else self.eps_min 35 | 36 | def learn(self, state, action, reward, state_): 37 | actions = np.array([self.Q[(state_, a)] for a in range(self.n_actions)]) 38 | a_max = np.argmax(actions) 39 | 40 | self.Q[(state, action)] += self.lr*(reward + 41 | self.gamma*self.Q[(state_, a_max)] - 42 | self.Q[(state, action)]) 43 | self.decrement_epsilon() 44 | -------------------------------------------------------------------------------- /q_learning/q_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(object): 4 | def __init__(self, lr, gamma, n_actions, n_states, epsilon_start, 5 | epsilon_end, epsilon_dec): 6 | self.lr = lr 7 | self.gamma = gamma 8 | self.n_actions = n_actions 9 | self.n_states = n_states 10 | self.epsilon = epsilon_start 11 | self.eps_min = epsilon_end 12 | self.eps_dec = epsilon_dec 13 | self.Q = {} 14 | 15 | self.init_Q() 16 | 17 | def init_Q(self): 18 | for state in range(self.n_states): 19 | for action in range(self.n_actions): 20 | self.Q[(state, action)] = 0.0 21 | 22 | def choose_action(self, state): 23 | if np.random.random() < self.epsilon: 24 | action = np.random.choice([i for i in range(self.n_actions)]) 25 | else: 26 | actions = np.array([self.Q[(state, a)] \ 27 | for a in range(self.n_actions)]) 28 | action = np.argmax(actions) 29 | return action 30 | 31 | def decrement_epsilon(self): 32 | self.epsilon = self.epsilon*self.eps_dec if self.epsilon>self.eps_min \ 33 | else self.eps_min 34 | 35 | def learn(self, state, action, reward, state_): 36 | actions = np.array([self.Q[(state_, a)] for a in range(self.n_actions)]) 37 | a_max = np.argmax(actions) 38 | self.Q[(state,action)] += self.lr*(reward + 39 | self.gamma*self.Q[(state_,a_max)]-\ 40 | self.Q[(state, action)]) 41 | 42 | self.decrement_epsilon() 43 | -------------------------------------------------------------------------------- /replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, 
dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | """ modified from: 38 | https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py 39 | """ 40 | def __init__(self, env=None, repeat=4, clip_reward=False, 41 | no_ops=0, fire_first=False): 42 | super(RepeatActionAndMaxFrame, self).__init__(env) 43 | self.repeat = repeat 44 | self.shape = env.observation_space.low.shape 45 | self.frame_buffer = np.zeros((2, *self.shape), dtype=np.uint8) 46 | self.clip_reward = clip_reward 47 | self.no_ops = no_ops 48 | self.fire_first = fire_first 49 | 50 | def step(self, action): 51 | t_reward = 0.0 52 | done = False 53 | for i in range(self.repeat): 54 | obs, reward, done, info = self.env.step(action) 55 | if self.clip_reward: 56 | reward = np.clip(np.array([reward]), -1, 1)[0] 57 | t_reward += reward 58 | idx = i % 2 59 | self.frame_buffer[idx] = obs 60 | if done: 61 | break 62 | 63 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 64 | return max_frame, t_reward, done, info 65 | 66 | def reset(self): 67 | obs = self.env.reset() 68 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 69 | for _ in range(no_ops): 70 | obs, _, done, _ = self.env.step(0) 71 | if done: 72 | obs = self.env.reset() 73 | 74 | if self.fire_first: 75 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 76 | obs, _, _, _ = self.env.step(1) 77 | 78 | 
self.frame_buffer = np.zeros((2, *self.shape), dtype=np.uint8) 79 | self.frame_buffer[0] = obs 80 | return obs 81 | 82 | class PreprocessFrame(gym.ObservationWrapper): 83 | def __init__(self, shape, env=None): 84 | super(PreprocessFrame, self).__init__(env) 85 | self.shape=(shape[2], shape[0], shape[1]) 86 | self.observation_space = gym.spaces.Box(low=0, high=1.0, 87 | shape=self.shape,dtype=np.float32) 88 | def observation(self, obs): 89 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 90 | resized_screen = cv2.resize(new_frame, self.shape[1:], 91 | interpolation=cv2.INTER_AREA) 92 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 93 | new_obs = new_obs / 255.0 94 | return new_obs 95 | 96 | class StackFrames(gym.ObservationWrapper): 97 | def __init__(self, env, repeat): 98 | super(StackFrames, self).__init__(env) 99 | self.observation_space = gym.spaces.Box( 100 | env.observation_space.low.repeat(repeat, axis=0), 101 | env.observation_space.high.repeat(repeat, axis=0), 102 | dtype=np.float32) 103 | self.stack = collections.deque(maxlen=repeat) 104 | 105 | def reset(self): 106 | self.stack.clear() 107 | observation = self.env.reset() 108 | for _ in range(self.stack.maxlen): 109 | self.stack.append(observation) 110 | 111 | return np.array(self.stack).reshape(self.observation_space.low.shape) 112 | 113 | def observation(self, observation): 114 | self.stack.append(observation) 115 | obs = np.array(self.stack).reshape(self.observation_space.low.shape) 116 | 117 | return obs 118 | 119 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 120 | no_ops=0, fire_first=False): 121 | env = gym.make(env_name) 122 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 123 | env = PreprocessFrame(shape, env) 124 | env = StackFrames(env, repeat) 125 | 126 | return env 127 | --------------------------------------------------------------------------------
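
For orientation, a minimal smoke-test sketch (not a file in the repository) of how the wrapper stack built by make_env in the root utils.py composes. It assumes the same gym API the rest of the code uses (reset returns an observation, step returns a 4-tuple), an install with the Atari extras and the Pong ROM available, and that it is run from the repository root so utils.py is importable.

import numpy as np

from utils import make_env

# Build the preprocessing pipeline: RepeatActionAndMaxFrame repeats each action
# four times and keeps the element-wise max of the last two raw frames,
# PreprocessFrame converts that to a grayscale (1, 84, 84) array scaled to [0, 1],
# and StackFrames stacks the four most recent processed frames along axis 0.
env = make_env('PongNoFrameskip-v4')

obs = env.reset()
print(obs.shape, env.observation_space.shape)    # both (4, 84, 84)

obs_, reward, done, info = env.step(env.action_space.sample())
print(obs_.shape, float(obs_.max()) <= 1.0)      # (4, 84, 84) True

The (4, 84, 84) shape is what main.py passes to the agents as input_dims, so a check like this is a quick way to confirm that the preprocessing matches the convolutional input the networks in deep_q_network.py expect.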