├── DDQN ├── ddqn_agent.py ├── deep_q_network.py ├── main_ddqn.py ├── models │ ├── PongNoFrameskip-v4_DDQNAgent_q_eval │ └── PongNoFrameskip-v4_DDQNAgent_q_next ├── plots │ └── DDQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png ├── replay_memory.py ├── tf2 │ ├── agent.py │ ├── main.py │ ├── network.py │ ├── replay_memory.py │ └── utils.py └── utils.py ├── DQN ├── deep_q_network.py ├── dqn_agent.py ├── main_dqn.py ├── models │ ├── PongNoFrameskip-v4_DQNAgent_q_eval │ └── PongNoFrameskip-v4_DQNAgent_q_next ├── plots │ └── DQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png ├── preprocess_pseudocode ├── replay_memory.py ├── tf2 │ ├── agent.py │ ├── main.py │ ├── network.py │ ├── replay_memory.py │ └── utils.py └── utils.py ├── DuelingDDQN ├── deep_q_network.py ├── dueling_ddqn_agent.py ├── main_dueling_ddqn.py ├── models │ ├── PongNoFrameskip-v4_DuelingDDQNAgent_q_eval │ └── PongNoFrameskip-v4_DuelingDDQNAgent_q_next ├── plots │ └── DuelingDDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png ├── replay_memory.py ├── tf2 │ ├── agent.py │ ├── main.py │ ├── network.py │ ├── replay_memory.py │ └── utils.py └── utils.py ├── DuelingDQN ├── deep_q_network.py ├── dueling_dqn_agent.py ├── main_dueling_dqn.py ├── models │ ├── PongNoFrameskip-v4_DuelingDQNAgent_q_eval │ └── PongNoFrameskip-v4_DuelingDQNAgent_q_next ├── plots │ └── DuelingDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png ├── replay_memory.py ├── tf2 │ ├── agent.py │ ├── main.py │ ├── network.py │ ├── replay_memory.py │ └── utils.py └── utils.py ├── LICENSE ├── README.md ├── agents.py ├── argparse_example.py ├── deep_q_network.py ├── main.py ├── naive_deep_q_learning ├── cartpole_naive_dqn.png ├── cartpole_naive_dqn.py ├── pytorch_example.py └── util.py ├── q_learning ├── frozen_lake_deterministic_policy.py ├── frozen_lake_env_test.py ├── frozen_lake_q_learning.py ├── frozen_lake_random_agent.py ├── plots │ ├── frozen_lake_deterministic_policy.png │ ├── frozen_lake_q_learning_agent.png │ └── frozen_lake_random_policy.png ├── q_learning_agent.py └── q_network.py ├── replay_memory.py └── utils.py /DDQN/ddqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from deep_q_network import DeepQNetwork 4 | from replay_memory import ReplayBuffer 5 | 6 | class DDQNAgent(object): 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 10 | self.gamma = gamma 11 | self.epsilon = epsilon 12 | self.lr = lr 13 | self.n_actions = n_actions 14 | self.input_dims = input_dims 15 | self.batch_size = batch_size 16 | self.eps_min = eps_min 17 | self.eps_dec = eps_dec 18 | self.replace_target_cnt = replace 19 | self.algo = algo 20 | self.env_name = env_name 21 | self.chkpt_dir = chkpt_dir 22 | self.action_space = [i for i in range(n_actions)] 23 | self.learn_step_counter = 0 24 | 25 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 26 | 27 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 28 | input_dims=self.input_dims, 29 | name=self.env_name+'_'+self.algo+'_q_eval', 30 | chkpt_dir=self.chkpt_dir) 31 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 32 | input_dims=self.input_dims, 33 | name=self.env_name+'_'+self.algo+'_q_next', 34 | chkpt_dir=self.chkpt_dir) 35 | 36 | def store_transition(self, state, action, reward, state_, done): 37 | self.memory.store_transition(state, action, reward, state_, done) 38 | 39 | def 
sample_memory(self): 40 | state, action, reward, new_state, done = \ 41 | self.memory.sample_buffer(self.batch_size) 42 | 43 | states = T.tensor(state).to(self.q_eval.device) 44 | rewards = T.tensor(reward).to(self.q_eval.device) 45 | dones = T.tensor(done).to(self.q_eval.device) 46 | actions = T.tensor(action).to(self.q_eval.device) 47 | states_ = T.tensor(new_state).to(self.q_eval.device) 48 | 49 | return states, actions, rewards, states_, dones 50 | 51 | def choose_action(self, observation): 52 | if np.random.random() > self.epsilon: 53 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 54 | actions = self.q_eval.forward(state) 55 | action = T.argmax(actions).item() 56 | else: 57 | action = np.random.choice(self.action_space) 58 | 59 | return action 60 | 61 | def replace_target_network(self): 62 | if self.replace_target_cnt is not None and \ 63 | self.learn_step_counter % self.replace_target_cnt == 0: 64 | self.q_next.load_state_dict(self.q_eval.state_dict()) 65 | 66 | def decrement_epsilon(self): 67 | self.epsilon = self.epsilon - self.eps_dec \ 68 | if self.epsilon > self.eps_min else self.eps_min 69 | 70 | def learn(self): 71 | if self.memory.mem_cntr < self.batch_size: 72 | return 73 | 74 | self.q_eval.optimizer.zero_grad() 75 | 76 | self.replace_target_network() 77 | 78 | states, actions, rewards, states_, dones = self.sample_memory() 79 | 80 | indices = np.arange(self.batch_size) 81 | 82 | q_pred = self.q_eval.forward(states)[indices, actions] 83 | q_next = self.q_next.forward(states_) 84 | q_eval = self.q_eval.forward(states_) 85 | 86 | max_actions = T.argmax(q_eval, dim=1) 87 | q_next[dones] = 0.0 88 | 89 | q_target = rewards + self.gamma*q_next[indices, max_actions] 90 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 91 | loss.backward() 92 | 93 | self.q_eval.optimizer.step() 94 | self.learn_step_counter += 1 95 | 96 | self.decrement_epsilon() 97 | 98 | def save_models(self): 99 | self.q_eval.save_checkpoint() 100 | self.q_next.save_checkpoint() 101 | 102 | def load_models(self): 103 | self.q_eval.load_checkpoint() 104 | self.q_next.load_checkpoint() 105 | -------------------------------------------------------------------------------- /DDQN/deep_q_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class DeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 15 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 16 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 17 | 18 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 19 | 20 | self.fc1 = nn.Linear(fc_input_dims, 512) 21 | self.fc2 = nn.Linear(512, n_actions) 22 | 23 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 24 | 25 | self.loss = nn.MSELoss() 26 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 27 | self.to(self.device) 28 | 29 | def calculate_conv_output_dims(self, input_dims): 30 | state = T.zeros(1, *input_dims) 31 | dims = self.conv1(state) 32 | dims = self.conv2(dims) 33 | dims = self.conv3(dims) 34 | return int(np.prod(dims.size())) 35 | 36 | def forward(self, state): 37 | conv1 = 
F.relu(self.conv1(state)) 38 | conv2 = F.relu(self.conv2(conv1)) 39 | conv3 = F.relu(self.conv3(conv2)) 40 | conv_state = conv3.view(conv3.size()[0], -1) 41 | 42 | flat1 = F.relu(self.fc1(conv_state)) 43 | actions = self.fc2(flat1) 44 | 45 | return actions 46 | 47 | def save_checkpoint(self): 48 | print('... saving checkpoint ...') 49 | T.save(self.state_dict(), self.checkpoint_file) 50 | 51 | def load_checkpoint(self): 52 | print('... loading checkpoint ...') 53 | self.load_state_dict(T.load(self.checkpoint_file)) 54 | -------------------------------------------------------------------------------- /DDQN/main_ddqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from ddqn_agent import DDQNAgent 4 | from utils import plot_learning_curve, make_env 5 | 6 | if __name__ == '__main__': 7 | env = make_env('PongNoFrameskip-v4') 8 | best_score = -np.inf 9 | load_checkpoint = False 10 | n_games = 100 11 | agent = DDQNAgent(gamma=0.99, epsilon=1.0, lr=0.0001, 12 | input_dims=(env.observation_space.shape), 13 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 14 | batch_size=32, replace=10000, eps_dec=1e-5, 15 | chkpt_dir='models/', algo='DDQNAgent', 16 | env_name='PongNoFrameskip-v4') 17 | 18 | if load_checkpoint: 19 | agent.load_models() 20 | 21 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) +'_' \ 22 | + str(n_games) + 'games' 23 | figure_file = 'plots/' + fname + '.png' 24 | 25 | n_steps = 0 26 | scores, eps_history, steps_array = [], [], [] 27 | 28 | for i in range(n_games): 29 | done = False 30 | observation = env.reset() 31 | 32 | score = 0 33 | while not done: 34 | action = agent.choose_action(observation) 35 | observation_, reward, done, info = env.step(action) 36 | score += reward 37 | 38 | if not load_checkpoint: 39 | agent.store_transition(observation, action, 40 | reward, observation_, int(done)) 41 | agent.learn() 42 | observation = observation_ 43 | n_steps += 1 44 | scores.append(score) 45 | steps_array.append(n_steps) 46 | 47 | avg_score = np.mean(scores[-100:]) 48 | print('episode: ', i,'score: ', score, 49 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 50 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 51 | 52 | if avg_score > best_score: 53 | #if not load_checkpoint: 54 | # agent.save_models() 55 | best_score = avg_score 56 | 57 | eps_history.append(agent.epsilon) 58 | if load_checkpoint and n_steps >= 18000: 59 | break 60 | 61 | x = [i+1 for i in range(len(scores))] 62 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 63 | -------------------------------------------------------------------------------- /DDQN/models/PongNoFrameskip-v4_DDQNAgent_q_eval: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DDQN/models/PongNoFrameskip-v4_DDQNAgent_q_eval -------------------------------------------------------------------------------- /DDQN/models/PongNoFrameskip-v4_DDQNAgent_q_next: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DDQN/models/PongNoFrameskip-v4_DDQNAgent_q_next -------------------------------------------------------------------------------- /DDQN/plots/DDQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DDQN/plots/DDQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png -------------------------------------------------------------------------------- /DDQN/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DDQN/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow.keras as keras 4 | from tensorflow.keras.optimizers import Adam 5 | from network import DeepQNetwork 6 | from replay_memory import ReplayBuffer 7 | 8 | 9 | class Agent: 10 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 11 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 12 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/ddqn'): 13 | self.gamma = gamma 14 | self.epsilon = epsilon 15 | self.lr = lr 16 | self.n_actions = n_actions 17 | self.input_dims = input_dims 18 | self.batch_size = batch_size 19 | self.eps_min = eps_min 20 | self.eps_dec = eps_dec 21 | self.replace_target_cnt = replace 22 | self.algo = algo 23 | self.env_name = env_name 24 | self.chkpt_dir = chkpt_dir 25 | self.action_space = [i for i in range(n_actions)] 26 | self.learn_step_counter = 0 27 | self.fname = self.chkpt_dir + self.env_name + '_' + self.algo + '_' 28 | 29 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 30 | 31 | self.q_eval = DeepQNetwork(input_dims, n_actions) 32 | self.q_eval.compile(optimizer=Adam(learning_rate=lr)) 33 | self.q_next = DeepQNetwork(input_dims, n_actions) 34 | self.q_next.compile(optimizer=Adam(learning_rate=lr)) 35 | 36 | def save_models(self): 37 | self.q_eval.save(self.fname+'q_eval') 38 | self.q_next.save(self.fname+'q_next') 39 | print('... models saved successfully ...') 40 | 41 | def load_models(self): 42 | self.q_eval = keras.models.load_model(self.fname+'q_eval') 43 | self.q_next = keras.models.load_model(self.fname+'q_next') 44 | print('... 
models loaded successfully ...') 45 | 46 | def store_transition(self, state, action, reward, state_, done): 47 | self.memory.store_transition(state, action, reward, state_, done) 48 | 49 | def sample_memory(self): 50 | state, action, reward, new_state, done = \ 51 | self.memory.sample_buffer(self.batch_size) 52 | states = tf.convert_to_tensor(state) 53 | rewards = tf.convert_to_tensor(reward) 54 | dones = tf.convert_to_tensor(done) 55 | actions = tf.convert_to_tensor(action, dtype=tf.int32) 56 | states_ = tf.convert_to_tensor(new_state) 57 | return states, actions, rewards, states_, dones 58 | 59 | def choose_action(self, observation): 60 | if np.random.random() > self.epsilon: 61 | state = tf.convert_to_tensor([observation]) 62 | actions = self.q_eval(state) 63 | action = tf.math.argmax(actions, axis=1).numpy()[0] 64 | else: 65 | action = np.random.choice(self.action_space) 66 | return action 67 | 68 | def replace_target_network(self): 69 | if self.learn_step_counter % self.replace_target_cnt == 0: 70 | self.q_next.set_weights(self.q_eval.get_weights()) 71 | 72 | def decrement_epsilon(self): 73 | self.epsilon = self.epsilon - self.eps_dec \ 74 | if self.epsilon > self.eps_min else self.eps_min 75 | 76 | def learn(self): 77 | if self.memory.mem_cntr < self.batch_size: 78 | return 79 | 80 | self.replace_target_network() 81 | 82 | states, actions, rewards, states_, dones = self.sample_memory() 83 | 84 | indices = tf.range(self.batch_size, dtype=tf.int32) 85 | action_indices = tf.stack([indices, actions], axis=1) 86 | 87 | with tf.GradientTape() as tape: 88 | q_pred = tf.gather_nd(self.q_eval(states), indices=action_indices) 89 | q_next = self.q_next(states_) 90 | q_eval = self.q_eval(states_) 91 | 92 | max_actions = tf.math.argmax(q_eval, axis=1, output_type=tf.int32) 93 | max_action_idx = tf.stack([indices, max_actions], axis=1) 94 | 95 | q_target = rewards + \ 96 | self.gamma*tf.gather_nd(q_next, indices=max_action_idx) *\ 97 | (1 - dones.numpy()) 98 | 99 | loss = keras.losses.MSE(q_pred, q_target) 100 | 101 | params = self.q_eval.trainable_variables 102 | grads = tape.gradient(loss, params) 103 | 104 | self.q_eval.optimizer.apply_gradients(zip(grads, params)) 105 | 106 | self.learn_step_counter += 1 107 | 108 | self.decrement_epsilon() 109 | -------------------------------------------------------------------------------- /DDQN/tf2/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 | from utils import plot_learning_curve, make_env, manage_memory 4 | from gym import wrappers 5 | 6 | if __name__ == '__main__': 7 | manage_memory() 8 | env = make_env('PongNoFrameskip-v4') 9 | best_score = -np.inf 10 | load_checkpoint = False 11 | record_agent = False 12 | n_games = 250 13 | agent = Agent(gamma=0.99, epsilon=1, lr=0.0001, 14 | input_dims=(env.observation_space.shape), 15 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 16 | batch_size=32, replace=1000, eps_dec=1e-5, 17 | chkpt_dir='models/', algo='DQNAgent', 18 | env_name='PongNoFrameskip-v4') 19 | if load_checkpoint: 20 | agent.load_models() 21 | agent.epsilon = agent.eps_min 22 | 23 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 24 | + str(n_games) + 'games' 25 | figure_file = 'plots/' + fname + '.png' 26 | # if you want to record video of your agent playing, do a 27 | # mkdir video 28 | if record_agent: 29 | env = wrappers.Monitor(env, "video", 30 | video_callable=lambda episode_id: True, 31 | force=True) 32 | 
n_steps = 0 33 | scores, eps_history, steps_array = [], [], [] 34 | 35 | for i in range(n_games): 36 | done = False 37 | observation = env.reset() 38 | 39 | score = 0 40 | while not done: 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | score += reward 44 | 45 | if not load_checkpoint: 46 | agent.store_transition(observation, action, 47 | reward, observation_, done) 48 | agent.learn() 49 | observation = observation_ 50 | n_steps += 1 51 | scores.append(score) 52 | steps_array.append(n_steps) 53 | 54 | avg_score = np.mean(scores[-100:]) 55 | print('episode {} score {:.1f} avg score {:.1f} ' 56 | 'best score {:.1f} epsilon {:.2f} steps {}'. 57 | format(i, score, avg_score, best_score, agent.epsilon, 58 | n_steps)) 59 | 60 | if score > best_score: 61 | if not load_checkpoint: 62 | agent.save_models() 63 | best_score = score 64 | 65 | eps_history.append(agent.epsilon) 66 | 67 | x = [i+1 for i in range(len(scores))] 68 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 69 | -------------------------------------------------------------------------------- /DDQN/tf2/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Conv2D, Dense, Flatten 3 | 4 | 5 | class DeepQNetwork(keras.Model): 6 | def __init__(self, input_dims, n_actions): 7 | super(DeepQNetwork, self).__init__() 8 | self.conv1 = Conv2D(32, 8, strides=(4, 4), activation='relu', 9 | data_format='channels_first', 10 | input_shape=input_dims) 11 | self.conv2 = Conv2D(64, 4, strides=(2, 2), activation='relu', 12 | data_format='channels_first') 13 | self.conv3 = Conv2D(64, 3, strides=(1, 1), activation='relu', 14 | data_format='channels_first') 15 | self.flat = Flatten() 16 | self.fc1 = Dense(512, activation='relu') 17 | self.fc2 = Dense(n_actions, activation=None) 18 | 19 | def call(self, state): 20 | x = self.conv1(state) 21 | x = self.conv2(x) 22 | x = self.conv3(x) 23 | x = self.flat(x) 24 | x = self.fc1(x) 25 | x = self.fc2(x) 26 | 27 | return x 28 | -------------------------------------------------------------------------------- /DDQN/tf2/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | 
terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DDQN/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | import tensorflow as tf 7 | 8 | 9 | def manage_memory(): 10 | gpus = tf.config.list_physical_devices('GPU') 11 | if gpus: 12 | try: 13 | for gpu in gpus: 14 | tf.config.experimental.set_memory_growth(gpu, True) 15 | except RuntimeError as e: 16 | print(e) 17 | 18 | 19 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 20 | fig=plt.figure() 21 | ax=fig.add_subplot(111, label="1") 22 | ax2=fig.add_subplot(111, label="2", frame_on=False) 23 | 24 | ax.plot(x, epsilons, color="C0") 25 | ax.set_xlabel("Training Steps", color="C0") 26 | ax.set_ylabel("Epsilon", color="C0") 27 | ax.tick_params(axis='x', colors="C0") 28 | ax.tick_params(axis='y', colors="C0") 29 | 30 | N = len(scores) 31 | running_avg = np.empty(N) 32 | for t in range(N): 33 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 34 | 35 | ax2.scatter(x, running_avg, color="C1") 36 | ax2.axes.get_xaxis().set_visible(False) 37 | ax2.yaxis.tick_right() 38 | ax2.set_ylabel('Score', color="C1") 39 | ax2.yaxis.set_label_position('right') 40 | ax2.tick_params(axis='y', colors="C1") 41 | 42 | if lines is not None: 43 | for line in lines: 44 | plt.axvline(x=line) 45 | 46 | plt.savefig(filename) 47 | 48 | class RepeatActionAndMaxFrame(gym.Wrapper): 49 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 50 | fire_first=False): 51 | super(RepeatActionAndMaxFrame, self).__init__(env) 52 | self.repeat = repeat 53 | self.shape = env.observation_space.low.shape 54 | self.frame_buffer = np.zeros_like((2, self.shape)) 55 | self.clip_reward = clip_reward 56 | self.no_ops = no_ops 57 | self.fire_first = fire_first 58 | 59 | def step(self, action): 60 | t_reward = 0.0 61 | done = False 62 | for i in range(self.repeat): 63 | obs, reward, done, info = self.env.step(action) 64 | if self.clip_reward: 65 | reward = np.clip(np.array([reward]), -1, 1)[0] 66 | t_reward += reward 67 | idx = i % 2 68 | self.frame_buffer[idx] = obs 69 | if done: 70 | break 71 | 72 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 73 | return max_frame, t_reward, done, info 74 | 75 | def reset(self): 76 | obs = self.env.reset() 77 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 78 | for _ in range(no_ops): 79 | _, _, done, _ = self.env.step(0) 80 | if done: 81 | self.env.reset() 82 | if self.fire_first: 83 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 84 | obs, _, _, _ = self.env.step(1) 85 | 86 | self.frame_buffer = np.zeros_like((2,self.shape)) 87 | self.frame_buffer[0] = obs 88 | 89 | return obs 90 | 91 | class PreprocessFrame(gym.ObservationWrapper): 92 | def __init__(self, shape, env=None): 93 | super(PreprocessFrame, self).__init__(env) 94 | self.shape = (shape[2], shape[0], shape[1]) 95 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 96 | shape=self.shape, dtype=np.float32) 97 | 98 | def observation(self, obs): 99 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 100 | resized_screen = cv2.resize(new_frame, self.shape[1:], 101 | interpolation=cv2.INTER_AREA) 102 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 103 | new_obs = new_obs / 255.0 104 | 
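# Shape flow in observation() above, with the default shape=(84, 84, 1): the raw
# RGB Atari frame is converted to grayscale, resized to 84x84 with cv2.INTER_AREA,
# reshaped to the channels-first (1, 84, 84) layout declared in
# self.observation_space, and scaled from [0, 255] down to [0, 1] before being
# returned below.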
105 | return new_obs 106 | 107 | class StackFrames(gym.ObservationWrapper): 108 | def __init__(self, env, repeat): 109 | super(StackFrames, self).__init__(env) 110 | self.observation_space = gym.spaces.Box( 111 | env.observation_space.low.repeat(repeat, axis=0), 112 | env.observation_space.high.repeat(repeat, axis=0), 113 | dtype=np.float32) 114 | self.stack = collections.deque(maxlen=repeat) 115 | 116 | def reset(self): 117 | self.stack.clear() 118 | observation = self.env.reset() 119 | for _ in range(self.stack.maxlen): 120 | self.stack.append(observation) 121 | 122 | return np.array(self.stack).reshape(self.observation_space.low.shape) 123 | 124 | def observation(self, observation): 125 | self.stack.append(observation) 126 | 127 | return np.array(self.stack).reshape(self.observation_space.low.shape) 128 | 129 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 130 | no_ops=0, fire_first=False): 131 | env = gym.make(env_name) 132 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 133 | env = PreprocessFrame(shape, env) 134 | env = StackFrames(env, repeat) 135 | 136 | return env 137 | -------------------------------------------------------------------------------- /DDQN/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | """ modified from: 38 | https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py 39 | """ 40 | def __init__(self, env=None, repeat=4): 41 | super(RepeatActionAndMaxFrame, self).__init__(env) 42 | self.repeat = repeat 43 | self.shape = env.observation_space.low.shape 44 | self.frame_buffer = np.zeros_like((2,self.shape)) 45 | 46 | def step(self, action): 47 | t_reward = 0.0 48 | done = False 49 | for i in range(self.repeat): 50 | obs, reward, done, info = self.env.step(action) 51 | t_reward += reward 52 | idx = i % 2 53 | self.frame_buffer[idx] = obs 54 | if done: 55 | break 56 | 57 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 58 | return max_frame, t_reward, done, info 59 | 60 | def reset(self): 61 | obs = self.env.reset() 62 | self.frame_buffer = np.zeros_like((2,self.shape)) 63 | self.frame_buffer[0] = obs 64 | return obs 65 | 66 | class PreprocessFrame(gym.ObservationWrapper): 67 | def __init__(self, shape, env=None): 68 | super(PreprocessFrame, self).__init__(env) 69 | self.shape=(shape[2], shape[0], shape[1]) 70 | self.observation_space = 
gym.spaces.Box(low=0, high=1.0, 71 | shape=self.shape,dtype=np.float32) 72 | def observation(self, obs): 73 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 74 | resized_screen = cv2.resize(new_frame, self.shape[1:], 75 | interpolation=cv2.INTER_AREA) 76 | 77 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 78 | new_obs = np.swapaxes(new_obs, 2,0) 79 | new_obs = new_obs / 255.0 80 | return new_obs 81 | 82 | class StackFrames(gym.ObservationWrapper): 83 | def __init__(self, env, n_steps): 84 | super(StackFrames, self).__init__(env) 85 | self.observation_space = gym.spaces.Box( 86 | env.observation_space.low.repeat(n_steps, axis=0), 87 | env.observation_space.high.repeat(n_steps, axis=0), 88 | dtype=np.float32) 89 | self.stack = collections.deque(maxlen=n_steps) 90 | 91 | def reset(self): 92 | self.stack.clear() 93 | observation = self.env.reset() 94 | for _ in range(self.stack.maxlen): 95 | self.stack.append(observation) 96 | 97 | return np.array(self.stack).reshape(self.observation_space.low.shape) 98 | 99 | def observation(self, observation): 100 | self.stack.append(observation) 101 | obs = np.array(self.stack).reshape(self.observation_space.low.shape) 102 | 103 | return obs 104 | 105 | def make_env(env_name, shape=(84,84,1), skip=4): 106 | env = gym.make(env_name) 107 | env = RepeatActionAndMaxFrame(env, skip) 108 | env = PreprocessFrame(shape, env) 109 | env = StackFrames(env, skip) 110 | 111 | return env 112 | -------------------------------------------------------------------------------- /DQN/deep_q_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class DeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 15 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 16 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 17 | 18 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 19 | 20 | self.fc1 = nn.Linear(fc_input_dims, 512) 21 | self.fc2 = nn.Linear(512, n_actions) 22 | 23 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 24 | 25 | self.loss = nn.MSELoss() 26 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 27 | self.to(self.device) 28 | 29 | def calculate_conv_output_dims(self, input_dims): 30 | state = T.zeros(1, *input_dims) 31 | dims = self.conv1(state) 32 | dims = self.conv2(dims) 33 | dims = self.conv3(dims) 34 | return int(np.prod(dims.size())) 35 | 36 | def forward(self, state): 37 | conv1 = F.relu(self.conv1(state)) 38 | conv2 = F.relu(self.conv2(conv1)) 39 | conv3 = F.relu(self.conv3(conv2)) 40 | # conv3 shape is BS x n_filters x H x W 41 | conv_state = conv3.view(conv3.size()[0], -1) 42 | # conv_state shape is BS x (n_filters * H * W) 43 | flat1 = F.relu(self.fc1(conv_state)) 44 | actions = self.fc2(flat1) 45 | 46 | return actions 47 | 48 | def save_checkpoint(self): 49 | print('... saving checkpoint ...') 50 | T.save(self.state_dict(), self.checkpoint_file) 51 | 52 | def load_checkpoint(self): 53 | print('... 
loading checkpoint ...') 54 | self.load_state_dict(T.load(self.checkpoint_file)) 55 | -------------------------------------------------------------------------------- /DQN/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from deep_q_network import DeepQNetwork 4 | from replay_memory import ReplayBuffer 5 | 6 | class DQNAgent(object): 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 10 | self.gamma = gamma 11 | self.epsilon = epsilon 12 | self.lr = lr 13 | self.n_actions = n_actions 14 | self.input_dims = input_dims 15 | self.batch_size = batch_size 16 | self.eps_min = eps_min 17 | self.eps_dec = eps_dec 18 | self.replace_target_cnt = replace 19 | self.algo = algo 20 | self.env_name = env_name 21 | self.chkpt_dir = chkpt_dir 22 | self.action_space = [i for i in range(n_actions)] 23 | self.learn_step_counter = 0 24 | 25 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 26 | 27 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 28 | input_dims=self.input_dims, 29 | name=self.env_name+'_'+self.algo+'_q_eval', 30 | chkpt_dir=self.chkpt_dir) 31 | 32 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 33 | input_dims=self.input_dims, 34 | name=self.env_name+'_'+self.algo+'_q_next', 35 | chkpt_dir=self.chkpt_dir) 36 | 37 | def choose_action(self, observation): 38 | if np.random.random() > self.epsilon: 39 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 40 | actions = self.q_eval.forward(state) 41 | action = T.argmax(actions).item() 42 | else: 43 | action = np.random.choice(self.action_space) 44 | 45 | return action 46 | 47 | def store_transition(self, state, action, reward, state_, done): 48 | self.memory.store_transition(state, action, reward, state_, done) 49 | 50 | def sample_memory(self): 51 | state, action, reward, new_state, done = \ 52 | self.memory.sample_buffer(self.batch_size) 53 | 54 | states = T.tensor(state).to(self.q_eval.device) 55 | rewards = T.tensor(reward).to(self.q_eval.device) 56 | dones = T.tensor(done).to(self.q_eval.device) 57 | actions = T.tensor(action).to(self.q_eval.device) 58 | states_ = T.tensor(new_state).to(self.q_eval.device) 59 | 60 | return states, actions, rewards, states_, dones 61 | 62 | def replace_target_network(self): 63 | if self.learn_step_counter % self.replace_target_cnt == 0: 64 | self.q_next.load_state_dict(self.q_eval.state_dict()) 65 | 66 | def decrement_epsilon(self): 67 | self.epsilon = self.epsilon - self.eps_dec \ 68 | if self.epsilon > self.eps_min else self.eps_min 69 | 70 | def save_models(self): 71 | self.q_eval.save_checkpoint() 72 | self.q_next.save_checkpoint() 73 | 74 | def load_models(self): 75 | self.q_eval.load_checkpoint() 76 | self.q_next.load_checkpoint() 77 | 78 | def learn(self): 79 | if self.memory.mem_cntr < self.batch_size: 80 | return 81 | 82 | self.q_eval.optimizer.zero_grad() 83 | 84 | self.replace_target_network() 85 | 86 | states, actions, rewards, states_, dones = self.sample_memory() 87 | indices = np.arange(self.batch_size) 88 | 89 | q_pred = self.q_eval.forward(states)[indices, actions] 90 | q_next = self.q_next.forward(states_).max(dim=1)[0] 91 | 92 | q_next[dones] = 0.0 93 | q_target = rewards + self.gamma*q_next 94 | 95 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 96 | loss.backward() 97 | self.q_eval.optimizer.step() 98 | 
self.learn_step_counter += 1 99 | 100 | self.decrement_epsilon() 101 | -------------------------------------------------------------------------------- /DQN/main_dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from dqn_agent import DQNAgent 4 | from utils import plot_learning_curve, make_env 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | env = make_env('PongNoFrameskip-v4') 9 | #env = gym.make('CartPole-v1') 10 | best_score = -np.inf 11 | load_checkpoint = False 12 | n_games = 250 13 | 14 | agent = DQNAgent(gamma=0.99, epsilon=1, lr=0.0001, 15 | input_dims=(env.observation_space.shape), 16 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 17 | batch_size=32, replace=1000, eps_dec=1e-5, 18 | chkpt_dir='models/', algo='DQNAgent', 19 | env_name='PongNoFrameskip-v4') 20 | 21 | if load_checkpoint: 22 | agent.load_models() 23 | 24 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) +'_' \ 25 | + str(n_games) + 'games' 26 | figure_file = 'plots/' + fname + '.png' 27 | # if you want to record video of your agent playing, do a mkdir tmp && mkdir tmp/dqn-video 28 | # and uncomment the following 2 lines. 29 | #env = wrappers.Monitor(env, "tmp/dqn-video", 30 | # video_callable=lambda episode_id: True, force=True) 31 | n_steps = 0 32 | scores, eps_history, steps_array = [], [], [] 33 | 34 | for i in range(n_games): 35 | done = False 36 | observation = env.reset() 37 | 38 | score = 0 39 | while not done: 40 | action = agent.choose_action(observation) 41 | observation_, reward, done, info = env.step(action) 42 | score += reward 43 | 44 | if not load_checkpoint: 45 | agent.store_transition(observation, action, 46 | reward, observation_, done) 47 | agent.learn() 48 | observation = observation_ 49 | n_steps += 1 50 | scores.append(score) 51 | steps_array.append(n_steps) 52 | 53 | avg_score = np.mean(scores[-100:]) 54 | print('episode: ', i,'score: ', score, 55 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 56 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 57 | 58 | if avg_score > best_score: 59 | if not load_checkpoint: 60 | agent.save_models() 61 | best_score = avg_score 62 | 63 | eps_history.append(agent.epsilon) 64 | 65 | x = [i+1 for i in range(len(scores))] 66 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 67 | -------------------------------------------------------------------------------- /DQN/models/PongNoFrameskip-v4_DQNAgent_q_eval: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DQN/models/PongNoFrameskip-v4_DQNAgent_q_eval -------------------------------------------------------------------------------- /DQN/models/PongNoFrameskip-v4_DQNAgent_q_next: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DQN/models/PongNoFrameskip-v4_DQNAgent_q_next -------------------------------------------------------------------------------- /DQN/plots/DQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DQN/plots/DQNAgent_PongNoFrameskip-v4_alpha0.0001_500games.png -------------------------------------------------------------------------------- /DQN/preprocess_pseudocode: -------------------------------------------------------------------------------- 1 | Class RepeatActionAndMaxFrame 2 | derives from: gym.Wrapper 3 | input: environment, repeat 4 | init frame buffer as an array of zeros in shape 2 x the obs space 5 | 6 | function step: 7 | input: action 8 | set total reward to 0 9 | set done to false 10 | for i in range repeat 11 | call the env.step function 12 | receive obs, reward, done, info 13 | increment total reward 14 | insert obs in frame buffer 15 | if done 16 | break 17 | end for 18 | find the max frame 19 | return: max frame, total reward, done, info 20 | 21 | function reset: 22 | input: none 23 | 24 | call env.reset 25 | reset the frame buffer 26 | store initial observation in buffer 27 | 28 | return: initial observation 29 | 30 | Class PreprocessFrame 31 | derives from: gym.ObservationWrapper 32 | input: environment, new shape 33 | set shape by swapping channels axis 34 | set observation space to new shape using gym.spaces.Box (0 to 1.0) 35 | 36 | function observation 37 | input: raw observation 38 | covert the observation to gray scale 39 | resize observation to new shape 40 | convert observation to numpy array 41 | move observation's channel axis from position 2 to position 0 42 | observation /= 255 43 | return observation 44 | 45 | 46 | Class StackFrames 47 | derives from: gym.ObservationWrapper 48 | input: environment, stack size 49 | init the new obs space (gym.spaces.Box) low & high bounds as repeat of n_steps 50 | initialize empty frame stack 51 | 52 | reset function 53 | clear the stack 54 | reset the environment 55 | for i in range(stack size) 56 | append initial observation to stack 57 | convert stack to numpy array 58 | reshape stack array to observation space low shape 59 | return stack 60 | 61 | observation function 62 | input: observation 63 | append the observation to the end of the stack 64 | convert the stack to a numpy array 65 | reshape stack to observation space low shape 66 | return the stack of frames 67 | 68 | function make_env: 69 | input: environment name, new shape, stack size 70 | init env with the base gym.make function 71 | env := RepeatActionAndMaxFrame 72 | env := PreprocessFrame 73 | env := StackFrames 74 | 75 | return: env 76 | -------------------------------------------------------------------------------- /DQN/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | 
self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DQN/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow.keras as keras 4 | from tensorflow.keras.optimizers import Adam 5 | from network import DeepQNetwork 6 | from replay_memory import ReplayBuffer 7 | 8 | 9 | class Agent: 10 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 11 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 12 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 13 | self.gamma = gamma 14 | self.epsilon = epsilon 15 | self.lr = lr 16 | self.n_actions = n_actions 17 | self.input_dims = input_dims 18 | self.batch_size = batch_size 19 | self.eps_min = eps_min 20 | self.eps_dec = eps_dec 21 | self.replace_target_cnt = replace 22 | self.algo = algo 23 | self.env_name = env_name 24 | self.chkpt_dir = chkpt_dir 25 | self.action_space = [i for i in range(n_actions)] 26 | self.learn_step_counter = 0 27 | self.fname = self.chkpt_dir + self.env_name + '_' + self.algo + '_' 28 | 29 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 30 | 31 | self.q_eval = DeepQNetwork(input_dims, n_actions) 32 | self.q_eval.compile(optimizer=Adam(learning_rate=lr)) 33 | self.q_next = DeepQNetwork(input_dims, n_actions) 34 | self.q_next.compile(optimizer=Adam(learning_rate=lr)) 35 | 36 | def save_models(self): 37 | self.q_eval.save(self.fname+'q_eval') 38 | self.q_next.save(self.fname+'q_next') 39 | print('... models saved successfully ...') 40 | 41 | def load_models(self): 42 | self.q_eval = keras.models.load_model(self.fname+'q_eval') 43 | self.q_next = keras.models.load_model(self.fname+'q_next') 44 | print('... 
models loaded successfully ...') 45 | 46 | def store_transition(self, state, action, reward, state_, done): 47 | self.memory.store_transition(state, action, reward, state_, done) 48 | 49 | def sample_memory(self): 50 | state, action, reward, new_state, done = \ 51 | self.memory.sample_buffer(self.batch_size) 52 | states = tf.convert_to_tensor(state) 53 | rewards = tf.convert_to_tensor(reward) 54 | dones = tf.convert_to_tensor(done) 55 | actions = tf.convert_to_tensor(action, dtype=tf.int32) 56 | states_ = tf.convert_to_tensor(new_state) 57 | return states, actions, rewards, states_, dones 58 | 59 | def choose_action(self, observation): 60 | if np.random.random() > self.epsilon: 61 | state = tf.convert_to_tensor([observation]) 62 | actions = self.q_eval(state) 63 | action = tf.math.argmax(actions, axis=1).numpy()[0] 64 | else: 65 | action = np.random.choice(self.action_space) 66 | return action 67 | 68 | def replace_target_network(self): 69 | if self.learn_step_counter % self.replace_target_cnt == 0: 70 | self.q_next.set_weights(self.q_eval.get_weights()) 71 | 72 | def decrement_epsilon(self): 73 | self.epsilon = self.epsilon - self.eps_dec \ 74 | if self.epsilon > self.eps_min else self.eps_min 75 | 76 | def learn(self): 77 | if self.memory.mem_cntr < self.batch_size: 78 | return 79 | 80 | self.replace_target_network() 81 | 82 | states, actions, rewards, states_, dones = self.sample_memory() 83 | 84 | indices = tf.range(self.batch_size, dtype=tf.int32) 85 | # pair each batch row with its stored action for the tf.gather_nd lookup 86 | 87 | action_indices = tf.stack([indices, actions], axis=1) 88 | 89 | with tf.GradientTape() as tape: 90 | q_pred = tf.gather_nd(self.q_eval(states), indices=action_indices) 91 | q_next = self.q_next(states_) 92 | 93 | max_actions = tf.math.argmax(q_next, axis=1, output_type=tf.int32) 94 | max_action_idx = tf.stack([indices, max_actions], axis=1) 95 | 96 | q_target = rewards + \ 97 | self.gamma*tf.gather_nd(q_next, indices=max_action_idx) *\ 98 | (1 - dones.numpy()) 99 | 100 | loss = keras.losses.MSE(q_pred, q_target) 101 | 102 | params = self.q_eval.trainable_variables 103 | grads = tape.gradient(loss, params) 104 | 105 | self.q_eval.optimizer.apply_gradients(zip(grads, params)) 106 | 107 | self.learn_step_counter += 1 108 | 109 | self.decrement_epsilon() 110 | -------------------------------------------------------------------------------- /DQN/tf2/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 | from utils import plot_learning_curve, make_env, manage_memory 4 | from gym import wrappers 5 | 6 | if __name__ == '__main__': 7 | manage_memory() 8 | env = make_env('PongNoFrameskip-v4') 9 | best_score = -np.inf 10 | load_checkpoint = False 11 | record_agent = False 12 | n_games = 250 13 | agent = Agent(gamma=0.99, epsilon=1, lr=0.0001, 14 | input_dims=(env.observation_space.shape), 15 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 16 | batch_size=32, replace=1000, eps_dec=1e-5, 17 | chkpt_dir='models/', algo='DQNAgent', 18 | env_name='PongNoFrameskip-v4') 19 | if load_checkpoint: 20 | agent.load_models() 21 | agent.epsilon = agent.eps_min 22 | 23 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 24 | + str(n_games) + 'games' 25 | figure_file = 'plots/' + fname + '.png' 26 | # if you want to record video of your agent playing, do a 27 | # mkdir video 28 | if record_agent: 29 | env = wrappers.Monitor(env, "video", 30 | video_callable=lambda episode_id: True, 31 | force=True) 32 
| n_steps = 0 33 | scores, eps_history, steps_array = [], [], [] 34 | 35 | for i in range(n_games): 36 | done = False 37 | observation = env.reset() 38 | 39 | score = 0 40 | while not done: 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | score += reward 44 | 45 | if not load_checkpoint: 46 | agent.store_transition(observation, action, 47 | reward, observation_, done) 48 | agent.learn() 49 | observation = observation_ 50 | n_steps += 1 51 | scores.append(score) 52 | steps_array.append(n_steps) 53 | 54 | avg_score = np.mean(scores[-100:]) 55 | print('episode {} score {:.1f} avg score {:.1f} ' 56 | 'best score {:.1f} epsilon {:.2f} steps {}'. 57 | format(i, score, avg_score, best_score, agent.epsilon, 58 | n_steps)) 59 | 60 | if score > best_score: 61 | if not load_checkpoint: 62 | agent.save_models() 63 | best_score = score 64 | 65 | eps_history.append(agent.epsilon) 66 | 67 | x = [i+1 for i in range(len(scores))] 68 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 69 | -------------------------------------------------------------------------------- /DQN/tf2/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Conv2D, Dense, Flatten 3 | 4 | 5 | class DeepQNetwork(keras.Model): 6 | def __init__(self, input_dims, n_actions): 7 | super(DeepQNetwork, self).__init__() 8 | self.conv1 = Conv2D(32, 8, strides=(4, 4), activation='relu', 9 | data_format='channels_first', 10 | input_shape=input_dims) 11 | self.conv2 = Conv2D(64, 4, strides=(2, 2), activation='relu', 12 | data_format='channels_first') 13 | self.conv3 = Conv2D(64, 3, strides=(1, 1), activation='relu', 14 | data_format='channels_first') 15 | self.flat = Flatten() 16 | self.fc1 = Dense(512, activation='relu') 17 | self.fc2 = Dense(n_actions, activation=None) 18 | 19 | def call(self, state): 20 | x = self.conv1(state) 21 | x = self.conv2(x) 22 | x = self.conv3(x) 23 | x = self.flat(x) 24 | x = self.fc1(x) 25 | x = self.fc2(x) 26 | 27 | return x 28 | -------------------------------------------------------------------------------- /DQN/tf2/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | 
terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DQN/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | import tensorflow as tf 7 | 8 | 9 | def manage_memory(): 10 | gpus = tf.config.list_physical_devices('GPU') 11 | if gpus: 12 | try: 13 | for gpu in gpus: 14 | tf.config.experimental.set_memory_growth(gpu, True) 15 | except RuntimeError as e: 16 | print(e) 17 | 18 | 19 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 20 | fig=plt.figure() 21 | ax=fig.add_subplot(111, label="1") 22 | ax2=fig.add_subplot(111, label="2", frame_on=False) 23 | 24 | ax.plot(x, epsilons, color="C0") 25 | ax.set_xlabel("Training Steps", color="C0") 26 | ax.set_ylabel("Epsilon", color="C0") 27 | ax.tick_params(axis='x', colors="C0") 28 | ax.tick_params(axis='y', colors="C0") 29 | 30 | N = len(scores) 31 | running_avg = np.empty(N) 32 | for t in range(N): 33 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 34 | 35 | ax2.scatter(x, running_avg, color="C1") 36 | ax2.axes.get_xaxis().set_visible(False) 37 | ax2.yaxis.tick_right() 38 | ax2.set_ylabel('Score', color="C1") 39 | ax2.yaxis.set_label_position('right') 40 | ax2.tick_params(axis='y', colors="C1") 41 | 42 | if lines is not None: 43 | for line in lines: 44 | plt.axvline(x=line) 45 | 46 | plt.savefig(filename) 47 | 48 | class RepeatActionAndMaxFrame(gym.Wrapper): 49 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 50 | fire_first=False): 51 | super(RepeatActionAndMaxFrame, self).__init__(env) 52 | self.repeat = repeat 53 | self.shape = env.observation_space.low.shape 54 | self.frame_buffer = np.zeros_like((2, self.shape)) 55 | self.clip_reward = clip_reward 56 | self.no_ops = no_ops 57 | self.fire_first = fire_first 58 | 59 | def step(self, action): 60 | t_reward = 0.0 61 | done = False 62 | for i in range(self.repeat): 63 | obs, reward, done, info = self.env.step(action) 64 | if self.clip_reward: 65 | reward = np.clip(np.array([reward]), -1, 1)[0] 66 | t_reward += reward 67 | idx = i % 2 68 | self.frame_buffer[idx] = obs 69 | if done: 70 | break 71 | 72 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 73 | return max_frame, t_reward, done, info 74 | 75 | def reset(self): 76 | obs = self.env.reset() 77 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 78 | for _ in range(no_ops): 79 | _, _, done, _ = self.env.step(0) 80 | if done: 81 | self.env.reset() 82 | if self.fire_first: 83 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 84 | obs, _, _, _ = self.env.step(1) 85 | 86 | self.frame_buffer = np.zeros_like((2,self.shape)) 87 | self.frame_buffer[0] = obs 88 | 89 | return obs 90 | 91 | class PreprocessFrame(gym.ObservationWrapper): 92 | def __init__(self, shape, env=None): 93 | super(PreprocessFrame, self).__init__(env) 94 | self.shape = (shape[2], shape[0], shape[1]) 95 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 96 | shape=self.shape, dtype=np.float32) 97 | 98 | def observation(self, obs): 99 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 100 | resized_screen = cv2.resize(new_frame, self.shape[1:], 101 | interpolation=cv2.INTER_AREA) 102 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 103 | new_obs = new_obs / 255.0 104 | 
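# Note: the division by 255.0 above is what keeps the returned observation inside
# the Box(low=0.0, high=1.0) observation_space declared in __init__, so the frames
# stacked by StackFrames and fed to the network are already normalized.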
105 | return new_obs 106 | 107 | class StackFrames(gym.ObservationWrapper): 108 | def __init__(self, env, repeat): 109 | super(StackFrames, self).__init__(env) 110 | self.observation_space = gym.spaces.Box( 111 | env.observation_space.low.repeat(repeat, axis=0), 112 | env.observation_space.high.repeat(repeat, axis=0), 113 | dtype=np.float32) 114 | self.stack = collections.deque(maxlen=repeat) 115 | 116 | def reset(self): 117 | self.stack.clear() 118 | observation = self.env.reset() 119 | for _ in range(self.stack.maxlen): 120 | self.stack.append(observation) 121 | 122 | return np.array(self.stack).reshape(self.observation_space.low.shape) 123 | 124 | def observation(self, observation): 125 | self.stack.append(observation) 126 | 127 | return np.array(self.stack).reshape(self.observation_space.low.shape) 128 | 129 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 130 | no_ops=0, fire_first=False): 131 | env = gym.make(env_name) 132 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 133 | env = PreprocessFrame(shape, env) 134 | env = StackFrames(env, repeat) 135 | 136 | return env 137 | -------------------------------------------------------------------------------- /DQN/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 38 | fire_first=False): 39 | super(RepeatActionAndMaxFrame, self).__init__(env) 40 | self.repeat = repeat 41 | self.shape = env.observation_space.low.shape 42 | self.frame_buffer = np.zeros_like((2, self.shape)) 43 | self.clip_reward = clip_reward 44 | self.no_ops = no_ops 45 | self.fire_first = fire_first 46 | 47 | def step(self, action): 48 | t_reward = 0.0 49 | done = False 50 | for i in range(self.repeat): 51 | obs, reward, done, info = self.env.step(action) 52 | if self.clip_reward: 53 | reward = np.clip(np.array([reward]), -1, 1)[0] 54 | t_reward += reward 55 | idx = i % 2 56 | self.frame_buffer[idx] = obs 57 | if done: 58 | break 59 | 60 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 61 | return max_frame, t_reward, done, info 62 | 63 | def reset(self): 64 | obs = self.env.reset() 65 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 66 | for _ in range(no_ops): 67 | _, _, done, _ = self.env.step(0) 68 | if done: 69 | self.env.reset() 70 | if self.fire_first: 71 | assert 
self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 72 | obs, _, _, _ = self.env.step(1) 73 | 74 | self.frame_buffer = np.zeros_like((2,self.shape)) 75 | self.frame_buffer[0] = obs 76 | 77 | return obs 78 | 79 | class PreprocessFrame(gym.ObservationWrapper): 80 | def __init__(self, shape, env=None): 81 | super(PreprocessFrame, self).__init__(env) 82 | self.shape = (shape[2], shape[0], shape[1]) 83 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 84 | shape=self.shape, dtype=np.float32) 85 | 86 | def observation(self, obs): 87 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 88 | resized_screen = cv2.resize(new_frame, self.shape[1:], 89 | interpolation=cv2.INTER_AREA) 90 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 91 | new_obs = new_obs / 255.0 92 | 93 | return new_obs 94 | 95 | class StackFrames(gym.ObservationWrapper): 96 | def __init__(self, env, repeat): 97 | super(StackFrames, self).__init__(env) 98 | self.observation_space = gym.spaces.Box( 99 | env.observation_space.low.repeat(repeat, axis=0), 100 | env.observation_space.high.repeat(repeat, axis=0), 101 | dtype=np.float32) 102 | self.stack = collections.deque(maxlen=repeat) 103 | 104 | def reset(self): 105 | self.stack.clear() 106 | observation = self.env.reset() 107 | for _ in range(self.stack.maxlen): 108 | self.stack.append(observation) 109 | 110 | return np.array(self.stack).reshape(self.observation_space.low.shape) 111 | 112 | def observation(self, observation): 113 | self.stack.append(observation) 114 | 115 | return np.array(self.stack).reshape(self.observation_space.low.shape) 116 | 117 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 118 | no_ops=0, fire_first=False): 119 | env = gym.make(env_name) 120 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 121 | env = PreprocessFrame(shape, env) 122 | env = StackFrames(env, repeat) 123 | 124 | return env 125 | -------------------------------------------------------------------------------- /DuelingDDQN/deep_q_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class DuelingDeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DuelingDeepQNetwork, self).__init__() 11 | 12 | self.checkpoint_dir = chkpt_dir 13 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 14 | 15 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 16 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 17 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 18 | 19 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 20 | 21 | self.fc1 = nn.Linear(fc_input_dims, 1024) 22 | self.fc2 = nn.Linear(1024, 512) 23 | self.V = nn.Linear(512, 1) 24 | self.A = nn.Linear(512, n_actions) 25 | 26 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 27 | self.loss = nn.MSELoss() 28 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 29 | self.to(self.device) 30 | 31 | 32 | def calculate_conv_output_dims(self, input_dims): 33 | state = T.zeros(1, *input_dims) 34 | dims = self.conv1(state) 35 | dims = self.conv2(dims) 36 | dims = self.conv3(dims) 37 | return int(np.prod(dims.size())) 38 | 39 | def forward(self, state): 40 | conv1 = F.relu(self.conv1(state)) 41 | conv2 = F.relu(self.conv2(conv1)) 42 | conv3 = F.relu(self.conv3(conv2)) 43 | 
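        # Flatten the (batch, channels, height, width) output of the last
        # conv layer into (batch, features) before the fully connected
        # layers; the feature count was computed by
        # calculate_conv_output_dims at construction time.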
conv_state = conv3.view(conv3.size()[0], -1) 44 | flat1 = F.relu(self.fc1(conv_state)) 45 | flat2 = F.relu(self.fc2(flat1)) 46 | 47 | V = self.V(flat2) 48 | A = self.A(flat2) 49 | 50 | return V, A 51 | 52 | def save_checkpoint(self): 53 | print('... saving checkpoint ...') 54 | T.save(self.state_dict(), self.checkpoint_file) 55 | 56 | def load_checkpoint(self): 57 | print('... loading checkpoint ...') 58 | self.load_state_dict(T.load(self.checkpoint_file)) 59 | -------------------------------------------------------------------------------- /DuelingDDQN/dueling_ddqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from deep_q_network import DuelingDeepQNetwork 4 | from replay_memory import ReplayBuffer 5 | 6 | class DuelingDDQNAgent(object): 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 10 | self.gamma = gamma 11 | self.epsilon = epsilon 12 | self.lr = lr 13 | self.n_actions = n_actions 14 | self.input_dims = input_dims 15 | self.batch_size = batch_size 16 | self.eps_min = eps_min 17 | self.eps_dec = eps_dec 18 | self.replace_target_cnt = replace 19 | self.algo = algo 20 | self.env_name = env_name 21 | self.chkpt_dir = chkpt_dir 22 | self.action_space = [i for i in range(n_actions)] 23 | self.learn_step_counter = 0 24 | 25 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 26 | 27 | self.q_eval = DuelingDeepQNetwork(self.lr, self.n_actions, 28 | input_dims=self.input_dims, 29 | name=self.env_name+'_'+self.algo+'_q_eval', 30 | chkpt_dir=self.chkpt_dir) 31 | self.q_next = DuelingDeepQNetwork(self.lr, self.n_actions, 32 | input_dims=self.input_dims, 33 | name=self.env_name+'_'+self.algo+'_q_next', 34 | chkpt_dir=self.chkpt_dir) 35 | 36 | def store_transition(self, state, action, reward, state_, done): 37 | self.memory.store_transition(state, action, reward, state_, done) 38 | 39 | def sample_memory(self): 40 | state, action, reward, new_state, done = \ 41 | self.memory.sample_buffer(self.batch_size) 42 | 43 | states = T.tensor(state).to(self.q_eval.device) 44 | rewards = T.tensor(reward).to(self.q_eval.device) 45 | dones = T.tensor(done).to(self.q_eval.device) 46 | actions = T.tensor(action).to(self.q_eval.device) 47 | states_ = T.tensor(new_state).to(self.q_eval.device) 48 | 49 | return states, actions, rewards, states_, dones 50 | 51 | def choose_action(self, observation): 52 | if np.random.random() > self.epsilon: 53 | state = np.array([observation], copy=False, dtype=np.float32) 54 | state_tensor = T.tensor(state).to(self.q_eval.device) 55 | _, advantages = self.q_eval.forward(state_tensor) 56 | 57 | action = T.argmax(advantages).item() 58 | else: 59 | action = np.random.choice(self.action_space) 60 | 61 | return action 62 | 63 | def replace_target_network(self): 64 | if self.replace_target_cnt is not None and \ 65 | self.learn_step_counter % self.replace_target_cnt == 0: 66 | self.q_next.load_state_dict(self.q_eval.state_dict()) 67 | 68 | def decrement_epsilon(self): 69 | self.epsilon = self.epsilon - self.eps_dec \ 70 | if self.epsilon > self.eps_min else self.eps_min 71 | 72 | def learn(self): 73 | if self.memory.mem_cntr < self.batch_size: 74 | return 75 | 76 | self.q_eval.optimizer.zero_grad() 77 | 78 | self.replace_target_network() 79 | 80 | states, actions, rewards, states_, dones = self.sample_memory() 81 | indices = np.arange(self.batch_size) 
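        # Dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
        # Double Q-learning: the online network (q_eval) selects the greedy
        # action in the next state, while the target network (q_next)
        # evaluates it, giving the bootstrap target
        # r + gamma * Q_next(s', argmax_a Q_eval(s', a)).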
82 | 83 | V_s, A_s = self.q_eval.forward(states) 84 | V_s_, A_s_ = self.q_next.forward(states_) 85 | 86 | V_s_eval, A_s_eval = self.q_eval.forward(states_) 87 | 88 | q_pred = T.add(V_s, 89 | (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions] 90 | 91 | q_next = T.add(V_s_, (A_s_ - A_s_.mean(dim=1, keepdim=True))) 92 | 93 | q_eval = T.add(V_s_eval, (A_s_eval - A_s_eval.mean(dim=1,keepdim=True))) 94 | 95 | max_actions = T.argmax(q_eval, dim=1) 96 | q_next[dones] = 0.0 97 | 98 | q_target = rewards + self.gamma*q_next[indices, max_actions] 99 | 100 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 101 | loss.backward() 102 | self.q_eval.optimizer.step() 103 | self.learn_step_counter += 1 104 | 105 | self.decrement_epsilon() 106 | 107 | def save_models(self): 108 | self.q_eval.save_checkpoint() 109 | self.q_next.save_checkpoint() 110 | 111 | def load_models(self): 112 | self.q_eval.load_checkpoint() 113 | self.q_next.load_checkpoint() 114 | -------------------------------------------------------------------------------- /DuelingDDQN/main_dueling_ddqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import wrappers 3 | import numpy as np 4 | from dueling_ddqn_agent import DuelingDDQNAgent 5 | from utils import plot_learning_curve, make_env 6 | 7 | if __name__ == '__main__': 8 | env = make_env('PongNoFrameskip-v4') 9 | best_score = -np.inf 10 | load_checkpoint = False 11 | n_games = 20 12 | agent = DuelingDDQNAgent(gamma=0.99, epsilon=1.0, lr=0.0001, 13 | input_dims=(env.observation_space.shape), 14 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 15 | batch_size=32, replace=10000, eps_dec=1e-5, 16 | chkpt_dir='models/', algo='DuelingDDQNAgent', 17 | env_name='PongNoFrameskip-v4') 18 | 19 | if load_checkpoint: 20 | agent.load_models() 21 | 22 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) +'_' \ 23 | + str(n_games) + 'games' 24 | figure_file = 'plots/' + fname + '.png' 25 | 26 | n_steps = 0 27 | scores, eps_history, steps_array = [], [], [] 28 | 29 | for i in range(n_games): 30 | done = False 31 | observation = env.reset() 32 | 33 | score = 0 34 | while not done: 35 | action = agent.choose_action(observation) 36 | observation_, reward, done, info = env.step(action) 37 | score += reward 38 | 39 | if not load_checkpoint: 40 | agent.store_transition(observation, action, 41 | reward, observation_, int(done)) 42 | agent.learn() 43 | observation = observation_ 44 | n_steps += 1 45 | scores.append(score) 46 | steps_array.append(n_steps) 47 | 48 | avg_score = np.mean(scores[-100:]) 49 | print('episode: ', i,'score: ', score, 50 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 51 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 52 | 53 | if avg_score > best_score: 54 | if not load_checkpoint: 55 | agent.save_models() 56 | best_score = avg_score 57 | 58 | eps_history.append(agent.epsilon) 59 | if load_checkpoint and n_steps >= 18000: 60 | break 61 | 62 | x = [i+1 for i in range(len(scores))] 63 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 64 | -------------------------------------------------------------------------------- /DuelingDDQN/models/PongNoFrameskip-v4_DuelingDDQNAgent_q_eval: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDDQN/models/PongNoFrameskip-v4_DuelingDDQNAgent_q_eval 
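The models/ entries here (q_eval above, q_next below) are saved PyTorch state dicts for the evaluation and target networks. A minimal sketch of how they might be reloaded for a mostly greedy evaluation run, assuming it is executed from the DuelingDDQN directory so the imports and the models/ checkpoint path resolve exactly as in main_dueling_ddqn.py; the script name, epsilon value, episode count, and small mem_size are arbitrary choices, not part of the original repo:

# evaluate_dueling_ddqn.py -- hypothetical helper, not in the original repo
import numpy as np
from dueling_ddqn_agent import DuelingDDQNAgent
from utils import make_env

if __name__ == '__main__':
    env = make_env('PongNoFrameskip-v4')
    agent = DuelingDDQNAgent(gamma=0.99, epsilon=0.02, lr=0.0001,
                             input_dims=env.observation_space.shape,
                             n_actions=env.action_space.n, mem_size=1000,
                             batch_size=32, replace=10000, eps_dec=0,
                             chkpt_dir='models/', algo='DuelingDDQNAgent',
                             env_name='PongNoFrameskip-v4')
    agent.load_models()  # loads the q_eval / q_next checkpoints listed here

    scores = []
    for _ in range(5):
        done, score = False, 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation, reward, done, info = env.step(action)
            score += reward
        scores.append(score)
    print('mean evaluation score over 5 games:', np.mean(scores))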
-------------------------------------------------------------------------------- /DuelingDDQN/models/PongNoFrameskip-v4_DuelingDDQNAgent_q_next: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDDQN/models/PongNoFrameskip-v4_DuelingDDQNAgent_q_next -------------------------------------------------------------------------------- /DuelingDDQN/plots/DuelingDDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDDQN/plots/DuelingDDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png -------------------------------------------------------------------------------- /DuelingDDQN/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DuelingDDQN/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras.optimizers import Adam 4 | import tensorflow.keras as keras 5 | from network import DuelingDeepQNetwork 6 | from replay_memory import ReplayBuffer 7 | 8 | 9 | class Agent: 10 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 11 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 12 | replace=1000, algo=None, env_name=None, 13 | chkpt_dir='tmp/duelingddqn'): 14 | self.gamma = gamma 15 | self.epsilon = epsilon 16 | self.lr = lr 17 | self.n_actions = n_actions 18 | self.input_dims = input_dims 19 | self.batch_size = batch_size 20 | self.eps_min = eps_min 21 | self.eps_dec = eps_dec 22 | self.replace_target_cnt = replace 23 | self.algo = algo 24 | self.env_name = env_name 25 | self.chkpt_dir = chkpt_dir 26 | self.action_space = [i for i in range(n_actions)] 27 | self.learn_step_counter = 0 28 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 29 | self.fname = self.chkpt_dir + 
self.env_name + '_' + self.algo + '_' 30 | 31 | self.q_eval = DuelingDeepQNetwork(input_dims, n_actions) 32 | self.q_eval.compile(optimizer=Adam(learning_rate=lr)) 33 | self.q_next = DuelingDeepQNetwork(input_dims, n_actions) 34 | self.q_next.compile(optimizer=Adam(learning_rate=lr)) 35 | 36 | def save_models(self): 37 | self.q_eval.save(self.fname+'q_eval') 38 | self.q_next.save(self.fname+'q_next') 39 | print('... models saved successfully ...') 40 | 41 | def load_models(self): 42 | self.q_eval = keras.models.load_model(self.fname+'q_eval') 43 | self.q_next = keras.models.load_model(self.fname+'q_next') 44 | print('... models loaded successfully ...') 45 | 46 | def choose_action(self, observation): 47 | if np.random.random() > self.epsilon: 48 | state = tf.convert_to_tensor([observation]) 49 | _, advantage = self.q_eval(state) 50 | action = tf.math.argmax(advantage, axis=1).numpy()[0] 51 | else: 52 | action = np.random.choice(self.action_space) 53 | return action 54 | 55 | def store_transition(self, state, action, reward, state_, done): 56 | self.memory.store_transition(state, action, reward, state_, done) 57 | 58 | def sample_memory(self): 59 | state, action, reward, new_state, done = \ 60 | self.memory.sample_buffer(self.batch_size) 61 | states = tf.convert_to_tensor(state) 62 | rewards = tf.convert_to_tensor(reward) 63 | dones = tf.convert_to_tensor(done) 64 | actions = tf.convert_to_tensor(action, dtype=tf.int32) 65 | states_ = tf.convert_to_tensor(new_state) 66 | return states, actions, rewards, states_, dones 67 | 68 | def replace_target_network(self): 69 | if self.learn_step_counter % self.replace_target_cnt == 0: 70 | self.q_next.set_weights(self.q_eval.get_weights()) 71 | 72 | def decrement_epsilon(self): 73 | self.epsilon = self.epsilon - self.eps_dec \ 74 | if self.epsilon > self.eps_min else self.eps_min 75 | 76 | def learn(self): 77 | if self.memory.mem_cntr < self.batch_size: 78 | return 79 | 80 | self.replace_target_network() 81 | 82 | states, actions, rewards, states_, dones = self.sample_memory() 83 | 84 | indices = tf.range(self.batch_size, dtype=tf.int32) 85 | action_indices = tf.stack([indices, actions], axis=1) 86 | 87 | with tf.GradientTape() as tape: 88 | V_s, A_s = self.q_eval(states) 89 | V_s_, A_s_ = self.q_next(states_) 90 | V_s_eval, A_s_eval = self.q_eval(states_) 91 | 92 | advantage = V_s+A_s-tf.reduce_mean(A_s, axis=1, 93 | keepdims=True) 94 | advantage_ = V_s_+A_s_-tf.reduce_mean(A_s_, axis=1, 95 | keepdims=True) 96 | advantage_eval = V_s_eval+A_s_eval-tf.reduce_mean(A_s_eval, 97 | axis=1, 98 | keepdims=True) 99 | max_actions = tf.argmax(advantage_eval, axis=1, 100 | output_type=tf.int32) 101 | max_action_idx = tf.stack([indices, max_actions], axis=1) 102 | q_next = tf.gather_nd(advantage_, indices=max_action_idx) 103 | q_pred = tf.gather_nd(advantage, indices=action_indices) 104 | 105 | q_target = rewards + self.gamma*q_next * (1 - dones.numpy()) 106 | loss = keras.losses.MSE(q_pred, q_target) 107 | params = self.q_eval.trainable_variables 108 | grads = tape.gradient(loss, params) 109 | self.q_eval.optimizer.apply_gradients(zip(grads, params)) 110 | self.learn_step_counter += 1 111 | 112 | self.decrement_epsilon() 113 | -------------------------------------------------------------------------------- /DuelingDDQN/tf2/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 | from utils import plot_learning_curve, make_env, manage_memory 4 | from gym import wrappers 5 | 6 
| if __name__ == '__main__': 7 | manage_memory() 8 | env = make_env('PongNoFrameskip-v4') 9 | best_score = -np.inf 10 | load_checkpoint = False 11 | record_agent = False 12 | n_games = 250 13 | agent = Agent(gamma=0.99, epsilon=1, lr=0.0001, 14 | input_dims=(env.observation_space.shape), 15 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 16 | batch_size=32, replace=1000, eps_dec=1e-5, 17 | chkpt_dir='models/', algo='DQNAgent', 18 | env_name='PongNoFrameskip-v4') 19 | if load_checkpoint: 20 | agent.load_models() 21 | agent.epsilon = agent.eps_min 22 | 23 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 24 | + str(n_games) + 'games' 25 | figure_file = 'plots/' + fname + '.png' 26 | # if you want to record video of your agent playing, do a 27 | # mkdir video 28 | if record_agent: 29 | env = wrappers.Monitor(env, "video", 30 | video_callable=lambda episode_id: True, 31 | force=True) 32 | n_steps = 0 33 | scores, eps_history, steps_array = [], [], [] 34 | 35 | for i in range(n_games): 36 | done = False 37 | observation = env.reset() 38 | 39 | score = 0 40 | while not done: 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | score += reward 44 | 45 | if not load_checkpoint: 46 | agent.store_transition(observation, action, 47 | reward, observation_, done) 48 | agent.learn() 49 | observation = observation_ 50 | n_steps += 1 51 | scores.append(score) 52 | steps_array.append(n_steps) 53 | 54 | avg_score = np.mean(scores[-100:]) 55 | print('episode {} score {:.1f} avg score {:.1f} ' 56 | 'best score {:.1f} epsilon {:.2f} steps {}'. 57 | format(i, score, avg_score, best_score, agent.epsilon, 58 | n_steps)) 59 | 60 | if score > best_score: 61 | if not load_checkpoint: 62 | agent.save_models() 63 | best_score = score 64 | 65 | eps_history.append(agent.epsilon) 66 | 67 | x = [i+1 for i in range(len(scores))] 68 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 69 | -------------------------------------------------------------------------------- /DuelingDDQN/tf2/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Conv2D, Dense, Flatten 3 | 4 | 5 | class DuelingDeepQNetwork(keras.Model): 6 | def __init__(self, input_dims, n_actions): 7 | super(DuelingDeepQNetwork, self).__init__() 8 | self.conv1 = Conv2D(32, 8, strides=(4, 4), activation='relu', 9 | data_format='channels_first', 10 | input_shape=input_dims) 11 | self.conv2 = Conv2D(64, 4, strides=(2, 2), activation='relu', 12 | data_format='channels_first') 13 | self.conv3 = Conv2D(64, 3, strides=(1, 1), activation='relu', 14 | data_format='channels_first') 15 | self.flat = Flatten() 16 | self.fc1 = Dense(512, activation='relu') 17 | self.A = Dense(n_actions, activation=None) 18 | self.V = Dense(1, activation=None) 19 | 20 | def call(self, state): 21 | x = self.conv1(state) 22 | x = self.conv2(x) 23 | x = self.conv3(x) 24 | x = self.flat(x) 25 | x = self.fc1(x) 26 | V = self.V(x) 27 | A = self.A(x) 28 | 29 | return V, A 30 | -------------------------------------------------------------------------------- /DuelingDDQN/tf2/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = 
np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DuelingDDQN/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | import tensorflow as tf 7 | 8 | 9 | def manage_memory(): 10 | gpus = tf.config.list_physical_devices('GPU') 11 | if gpus: 12 | try: 13 | for gpu in gpus: 14 | tf.config.experimental.set_memory_growth(gpu, True) 15 | except RuntimeError as e: 16 | print(e) 17 | 18 | 19 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 20 | fig = plt.figure() 21 | ax = fig.add_subplot(111, label="1") 22 | ax2 = fig.add_subplot(111, label="2", frame_on=False) 23 | 24 | ax.plot(x, epsilons, color="C0") 25 | ax.set_xlabel("Training Steps", color="C0") 26 | ax.set_ylabel("Epsilon", color="C0") 27 | ax.tick_params(axis='x', colors="C0") 28 | ax.tick_params(axis='y', colors="C0") 29 | 30 | N = len(scores) 31 | running_avg = np.empty(N) 32 | for t in range(N): 33 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 34 | 35 | ax2.scatter(x, running_avg, color="C1") 36 | ax2.axes.get_xaxis().set_visible(False) 37 | ax2.yaxis.tick_right() 38 | ax2.set_ylabel('Score', color="C1") 39 | ax2.yaxis.set_label_position('right') 40 | ax2.tick_params(axis='y', colors="C1") 41 | 42 | if lines is not None: 43 | for line in lines: 44 | plt.axvline(x=line) 45 | 46 | plt.savefig(filename) 47 | 48 | 49 | class RepeatActionAndMaxFrame(gym.Wrapper): 50 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 51 | fire_first=False): 52 | super(RepeatActionAndMaxFrame, self).__init__(env) 53 | self.repeat = repeat 54 | self.shape = env.observation_space.low.shape 55 | 56 | # self.frame_buffer = np.zeros(shape=(2, *self.shape)) 57 | self.frame_buffer = np.zeros_like((2, self.shape)) 58 | self.clip_reward = clip_reward 59 | self.no_ops = no_ops 60 | self.fire_first = fire_first 61 | 62 | def step(self, action): 63 | t_reward = 0.0 64 | done = False 65 | for i in range(self.repeat): 66 | obs, reward, done, info = self.env.step(action) 67 | if self.clip_reward: 68 | reward = np.clip(np.array([reward]), -1, 1)[0] 69 | t_reward += reward 70 | idx = i % 2 71 | self.frame_buffer[idx] = obs 72 | if done: 73 | break 74 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 75 | 
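        # The element-wise max over the two most recent raw frames removes
        # the sprite flicker Atari games produce by drawing some objects
        # only on alternating frames.  The commented-out
        # np.zeros(shape=(2, *self.shape)) in __init__ is the explicit
        # (2, *frame_shape) allocation; np.zeros_like((2, self.shape))
        # builds the placeholder from the tuple itself and behaves as
        # little more than a two-slot container for the frames assigned
        # above.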
return max_frame, t_reward, done, info 76 | 77 | def reset(self): 78 | obs = self.env.reset() 79 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 80 | for _ in range(no_ops): 81 | _, _, done, _ = self.env.step(0) 82 | if done: 83 | self.env.reset() 84 | if self.fire_first: 85 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 86 | obs, _, _, _ = self.env.step(1) 87 | 88 | # self.frame_buffer = np.zeros(shape=(2, *self.shape)) 89 | self.frame_buffer = np.zeros_like((2, self.shape)) 90 | self.frame_buffer[0] = obs 91 | 92 | return obs 93 | 94 | 95 | class PreprocessFrame(gym.ObservationWrapper): 96 | def __init__(self, shape, env=None): 97 | super(PreprocessFrame, self).__init__(env) 98 | self.shape = (shape[2], shape[0], shape[1]) 99 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 100 | shape=self.shape, 101 | dtype=np.float32) 102 | 103 | def observation(self, obs): 104 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 105 | resized_screen = cv2.resize(new_frame, self.shape[1:], 106 | interpolation=cv2.INTER_AREA) 107 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 108 | new_obs = new_obs / 255.0 109 | 110 | return new_obs 111 | 112 | 113 | class StackFrames(gym.ObservationWrapper): 114 | def __init__(self, env, repeat): 115 | super(StackFrames, self).__init__(env) 116 | self.observation_space = gym.spaces.Box( 117 | env.observation_space.low.repeat(repeat, axis=0), 118 | env.observation_space.high.repeat(repeat, axis=0), 119 | dtype=np.float32) 120 | self.stack = collections.deque(maxlen=repeat) 121 | 122 | def reset(self): 123 | self.stack.clear() 124 | observation = self.env.reset() 125 | for _ in range(self.stack.maxlen): 126 | self.stack.append(observation) 127 | 128 | return np.array(self.stack).reshape(self.observation_space.low.shape) 129 | 130 | def observation(self, observation): 131 | self.stack.append(observation) 132 | 133 | return np.array(self.stack).reshape(self.observation_space.low.shape) 134 | 135 | 136 | def make_env(env_name, shape=(84, 84, 1), repeat=4, clip_rewards=False, 137 | no_ops=0, fire_first=False): 138 | env = gym.make(env_name) 139 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, 140 | no_ops, fire_first) 141 | env = PreprocessFrame(shape, env) 142 | env = StackFrames(env, repeat) 143 | 144 | return env 145 | -------------------------------------------------------------------------------- /DuelingDDQN/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | 
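    # Left axis: epsilon against training steps; right axis: a trailing
    # ~20-episode average of the scores, scattered against the same steps.
    # Optional vertical lines mark user-supplied step counts.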
plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | """ modified from: 38 | https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py 39 | """ 40 | def __init__(self, env=None, repeat=4): 41 | super(RepeatActionAndMaxFrame, self).__init__(env) 42 | self.repeat = repeat 43 | self.shape = env.observation_space.low.shape 44 | self.frame_buffer = np.zeros_like((2,self.shape)) 45 | 46 | def step(self, action): 47 | t_reward = 0.0 48 | done = False 49 | for i in range(self.repeat): 50 | obs, reward, done, info = self.env.step(action) 51 | t_reward += reward 52 | idx = i % 2 53 | self.frame_buffer[idx] = obs 54 | if done: 55 | break 56 | 57 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 58 | return max_frame, t_reward, done, info 59 | 60 | def reset(self): 61 | obs = self.env.reset() 62 | self.frame_buffer = np.zeros_like((2,self.shape)) 63 | self.frame_buffer[0] = obs 64 | return obs 65 | 66 | class PreprocessFrame(gym.ObservationWrapper): 67 | def __init__(self, shape, env=None): 68 | super(PreprocessFrame, self).__init__(env) 69 | self.shape=(shape[2], shape[0], shape[1]) 70 | self.observation_space = gym.spaces.Box(low=0, high=1.0, 71 | shape=self.shape,dtype=np.float32) 72 | def observation(self, obs): 73 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 74 | resized_screen = cv2.resize(new_frame, self.shape[1:], 75 | interpolation=cv2.INTER_AREA) 76 | 77 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 78 | new_obs = np.swapaxes(new_obs, 2,0) 79 | new_obs = new_obs / 255.0 80 | return new_obs 81 | 82 | class StackFrames(gym.ObservationWrapper): 83 | def __init__(self, env, n_steps): 84 | super(StackFrames, self).__init__(env) 85 | self.observation_space = gym.spaces.Box( 86 | env.observation_space.low.repeat(n_steps, axis=0), 87 | env.observation_space.high.repeat(n_steps, axis=0), 88 | dtype=np.float32) 89 | self.stack = collections.deque(maxlen=n_steps) 90 | 91 | def reset(self): 92 | self.stack.clear() 93 | observation = self.env.reset() 94 | for _ in range(self.stack.maxlen): 95 | self.stack.append(observation) 96 | 97 | return np.array(self.stack).reshape(self.observation_space.low.shape) 98 | 99 | def observation(self, observation): 100 | self.stack.append(observation) 101 | obs = np.array(self.stack).reshape(self.observation_space.low.shape) 102 | 103 | return obs 104 | 105 | def make_env(env_name, shape=(84,84,1), skip=4): 106 | env = gym.make(env_name) 107 | env = RepeatActionAndMaxFrame(env, skip) 108 | env = PreprocessFrame(shape, env) 109 | env = StackFrames(env, skip) 110 | 111 | return env 112 | -------------------------------------------------------------------------------- /DuelingDQN/deep_q_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class DuelingDeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DuelingDeepQNetwork, self).__init__() 11 | 12 | self.checkpoint_dir = chkpt_dir 13 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 14 | 15 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 16 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 17 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 18 | 19 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 20 | 21 | self.fc1 
= nn.Linear(fc_input_dims, 1024) 22 | self.fc2 = nn.Linear(1024, 512) 23 | self.V = nn.Linear(512, 1) 24 | self.A = nn.Linear(512, n_actions) 25 | 26 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 27 | self.loss = nn.MSELoss() 28 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 29 | self.to(self.device) 30 | 31 | 32 | def calculate_conv_output_dims(self, input_dims): 33 | state = T.zeros(1, *input_dims) 34 | dims = self.conv1(state) 35 | dims = self.conv2(dims) 36 | dims = self.conv3(dims) 37 | return int(np.prod(dims.size())) 38 | 39 | def forward(self, state): 40 | conv1 = F.relu(self.conv1(state)) 41 | conv2 = F.relu(self.conv2(conv1)) 42 | conv3 = F.relu(self.conv3(conv2)) 43 | conv_state = conv3.view(conv3.size()[0], -1) 44 | flat1 = F.relu(self.fc1(conv_state)) 45 | flat2 = F.relu(self.fc2(flat1)) 46 | 47 | V = self.V(flat2) 48 | A = self.A(flat2) 49 | 50 | return V, A 51 | 52 | def save_checkpoint(self): 53 | print('... saving checkpoint ...') 54 | T.save(self.state_dict(), self.checkpoint_file) 55 | 56 | def load_checkpoint(self): 57 | print('... loading checkpoint ...') 58 | self.load_state_dict(T.load(self.checkpoint_file)) 59 | -------------------------------------------------------------------------------- /DuelingDQN/dueling_dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from deep_q_network import DuelingDeepQNetwork 4 | from replay_memory import ReplayBuffer 5 | 6 | class DuelingDQNAgent(object): 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 10 | self.gamma = gamma 11 | self.epsilon = epsilon 12 | self.lr = lr 13 | self.n_actions = n_actions 14 | self.input_dims = input_dims 15 | self.batch_size = batch_size 16 | self.eps_min = eps_min 17 | self.eps_dec = eps_dec 18 | self.replace_target_cnt = replace 19 | self.algo = algo 20 | self.env_name = env_name 21 | self.chkpt_dir = chkpt_dir 22 | self.action_space = [i for i in range(n_actions)] 23 | self.learn_step_counter = 0 24 | 25 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 26 | 27 | self.q_eval = DuelingDeepQNetwork(self.lr, self.n_actions, 28 | input_dims=self.input_dims, 29 | name=self.env_name+'_'+self.algo+'_q_eval', 30 | chkpt_dir=self.chkpt_dir) 31 | self.q_next = DuelingDeepQNetwork(self.lr, self.n_actions, 32 | input_dims=self.input_dims, 33 | name=self.env_name+'_'+self.algo+'_q_next', 34 | chkpt_dir=self.chkpt_dir) 35 | 36 | def store_transition(self, state, action, reward, state_, done): 37 | self.memory.store_transition(state, action, reward, state_, done) 38 | 39 | def sample_memory(self): 40 | state, action, reward, new_state, done = \ 41 | self.memory.sample_buffer(self.batch_size) 42 | 43 | states = T.tensor(state).to(self.q_eval.device) 44 | rewards = T.tensor(reward).to(self.q_eval.device) 45 | dones = T.tensor(done).to(self.q_eval.device) 46 | actions = T.tensor(action).to(self.q_eval.device) 47 | states_ = T.tensor(new_state).to(self.q_eval.device) 48 | 49 | return states, actions, rewards, states_, dones 50 | 51 | def choose_action(self, observation): 52 | if np.random.random() > self.epsilon: 53 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 54 | _, advantage = self.q_eval.forward(state) 55 | action = T.argmax(advantage).item() 56 | else: 57 | action = np.random.choice(self.action_space) 58 | 59 | 
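        # Acting greedily on the advantage stream alone gives the same
        # action as acting greedily on Q = V + (A - mean(A)): V(s) and
        # mean(A) shift every action's value equally, so the argmax is
        # unchanged.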
return action 60 | 61 | def replace_target_network(self): 62 | if self.replace_target_cnt is not None and \ 63 | self.learn_step_counter % self.replace_target_cnt == 0: 64 | self.q_next.load_state_dict(self.q_eval.state_dict()) 65 | 66 | def decrement_epsilon(self): 67 | self.epsilon = self.epsilon - self.eps_dec \ 68 | if self.epsilon > self.eps_min else self.eps_min 69 | 70 | def learn(self): 71 | if self.memory.mem_cntr < self.batch_size: 72 | return 73 | 74 | self.q_eval.optimizer.zero_grad() 75 | 76 | self.replace_target_network() 77 | 78 | states, actions, rewards, states_, dones = self.sample_memory() 79 | 80 | V_s, A_s = self.q_eval.forward(states) 81 | V_s_, A_s_ = self.q_next.forward(states_) 82 | 83 | indices = np.arange(self.batch_size) 84 | 85 | q_pred = T.add(V_s, 86 | (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions] 87 | q_next = T.add(V_s_, 88 | (A_s_ - A_s_.mean(dim=1, keepdim=True))).max(dim=1)[0] 89 | 90 | q_next[dones] = 0.0 91 | q_target = rewards + self.gamma*q_next 92 | 93 | 94 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 95 | loss.backward() 96 | self.q_eval.optimizer.step() 97 | self.learn_step_counter += 1 98 | 99 | self.decrement_epsilon() 100 | 101 | def save_models(self): 102 | self.q_eval.save_checkpoint() 103 | self.q_next.save_checkpoint() 104 | 105 | def load_models(self): 106 | self.q_eval.load_checkpoint() 107 | self.q_next.load_checkpoint() 108 | -------------------------------------------------------------------------------- /DuelingDQN/main_dueling_dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from dueling_dqn_agent import DuelingDQNAgent 4 | from utils import plot_learning_curve, make_env 5 | 6 | if __name__ == '__main__': 7 | env = make_env('PongNoFrameskip-v4') 8 | best_score = -np.inf 9 | load_checkpoint = False 10 | n_games = 20 11 | agent = DuelingDQNAgent(gamma=0.99, epsilon=1.0, lr=0.0001, 12 | input_dims=(env.observation_space.shape), 13 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 14 | batch_size=32, replace=10000, eps_dec=1e-5, 15 | chkpt_dir='models/', algo='DuelingDQNAgent', 16 | env_name='PongNoFrameskip-v4') 17 | 18 | if load_checkpoint: 19 | agent.load_models() 20 | 21 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) +'_' \ 22 | + str(n_games) + 'games' 23 | figure_file = 'plots/' + fname + '.png' 24 | 25 | n_steps = 0 26 | scores, eps_history, steps_array = [], [], [] 27 | 28 | for i in range(n_games): 29 | done = False 30 | observation = env.reset() 31 | 32 | score = 0 33 | while not done: 34 | action = agent.choose_action(observation) 35 | observation_, reward, done, info = env.step(action) 36 | score += reward 37 | 38 | if not load_checkpoint: 39 | agent.store_transition(observation, action, 40 | reward, observation_, int(done)) 41 | agent.learn() 42 | observation = observation_ 43 | n_steps += 1 44 | scores.append(score) 45 | steps_array.append(n_steps) 46 | 47 | avg_score = np.mean(scores[-100:]) 48 | print('episode: ', i,'score: ', score, 49 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 50 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 51 | 52 | if avg_score > best_score: 53 | if not load_checkpoint: 54 | agent.save_models() 55 | best_score = avg_score 56 | 57 | eps_history.append(agent.epsilon) 58 | if load_checkpoint and n_steps >= 18000: 59 | break 60 | 61 | x = [i+1 for i in range(len(scores))] 62 | plot_learning_curve(steps_array, scores, 
eps_history, figure_file) 63 | -------------------------------------------------------------------------------- /DuelingDQN/models/PongNoFrameskip-v4_DuelingDQNAgent_q_eval: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDQN/models/PongNoFrameskip-v4_DuelingDQNAgent_q_eval -------------------------------------------------------------------------------- /DuelingDQN/models/PongNoFrameskip-v4_DuelingDQNAgent_q_next: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDQN/models/PongNoFrameskip-v4_DuelingDQNAgent_q_next -------------------------------------------------------------------------------- /DuelingDQN/plots/DuelingDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/DuelingDQN/plots/DuelingDQNAgent_PongNoFrameskip-v4_alpha0.0001_300games.png -------------------------------------------------------------------------------- /DuelingDQN/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DuelingDQN/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras.optimizers import Adam 4 | import tensorflow.keras as keras 5 | from network import DuelingDeepQNetwork 6 | from replay_memory import ReplayBuffer 7 | 8 | 9 | class Agent: 10 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 11 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 12 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 13 | self.gamma = gamma 14 | self.epsilon = epsilon 15 | self.lr = lr 16 | self.n_actions = n_actions 17 | self.input_dims = input_dims 18 
| self.batch_size = batch_size 19 | self.eps_min = eps_min 20 | self.eps_dec = eps_dec 21 | self.replace_target_cnt = replace 22 | self.algo = algo 23 | self.env_name = env_name 24 | self.chkpt_dir = chkpt_dir 25 | self.action_space = [i for i in range(n_actions)] 26 | self.learn_step_counter = 0 27 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 28 | self.fname = self.chkpt_dir + self.env_name + '_' + self.algo + '_' 29 | 30 | self.q_eval = DuelingDeepQNetwork(input_dims, n_actions) 31 | self.q_eval.compile(optimizer=Adam(learning_rate=lr)) 32 | self.q_next = DuelingDeepQNetwork(input_dims, n_actions) 33 | self.q_next.compile(optimizer=Adam(learning_rate=lr)) 34 | 35 | def save_models(self): 36 | self.q_eval.save(self.fname+'q_eval') 37 | self.q_next.save(self.fname+'q_next') 38 | print('... models saved successfully ...') 39 | 40 | def load_models(self): 41 | self.q_eval = keras.models.load_model(self.fname+'q_eval') 42 | self.q_next = keras.models.load_model(self.fname+'q_next') 43 | print('... models loaded successfully ...') 44 | 45 | def choose_action(self, observation): 46 | if np.random.random() > self.epsilon: 47 | state = tf.convert_to_tensor([observation]) 48 | _, advantage = self.q_eval(state) 49 | action = tf.math.argmax(advantage, axis=1).numpy()[0] 50 | else: 51 | action = np.random.choice(self.action_space) 52 | return action 53 | 54 | def store_transition(self, state, action, reward, state_, done): 55 | self.memory.store_transition(state, action, reward, state_, done) 56 | 57 | def sample_memory(self): 58 | state, action, reward, new_state, done = \ 59 | self.memory.sample_buffer(self.batch_size) 60 | states = tf.convert_to_tensor(state) 61 | rewards = tf.convert_to_tensor(reward) 62 | dones = tf.convert_to_tensor(done) 63 | actions = tf.convert_to_tensor(action, dtype=tf.int32) 64 | states_ = tf.convert_to_tensor(new_state) 65 | return states, actions, rewards, states_, dones 66 | 67 | def replace_target_network(self): 68 | if self.learn_step_counter % self.replace_target_cnt == 0: 69 | self.q_next.set_weights(self.q_eval.get_weights()) 70 | 71 | def decrement_epsilon(self): 72 | self.epsilon = self.epsilon - self.eps_dec \ 73 | if self.epsilon > self.eps_min else self.eps_min 74 | 75 | def learn(self): 76 | if self.memory.mem_cntr < self.batch_size: 77 | return 78 | 79 | self.replace_target_network() 80 | 81 | states, actions, rewards, states_, dones = self.sample_memory() 82 | 83 | indices = tf.range(self.batch_size, dtype=tf.int32) 84 | action_indices = tf.stack([indices, actions], axis=1) 85 | 86 | with tf.GradientTape() as tape: 87 | V_s, A_s = self.q_eval(states) 88 | V_s_, A_s_ = self.q_next(states_) 89 | 90 | advantage = V_s + A_s - tf.reduce_mean(A_s, axis=1, 91 | keepdims=True) 92 | advantage_ = V_s_ + A_s_ - tf.reduce_mean(A_s_, axis=1, 93 | keepdims=True) 94 | q_pred = tf.gather_nd(advantage, indices=action_indices) 95 | 96 | q_next = tf.reduce_max(advantage_, axis=1) 97 | 98 | q_target = rewards + self.gamma*q_next * (1 - dones.numpy()) 99 | loss = keras.losses.MSE(q_pred, q_target) 100 | params = self.q_eval.trainable_variables 101 | grads = tape.gradient(loss, params) 102 | self.q_eval.optimizer.apply_gradients(zip(grads, params)) 103 | self.learn_step_counter += 1 104 | 105 | self.decrement_epsilon() 106 | -------------------------------------------------------------------------------- /DuelingDQN/tf2/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agent import Agent 3 
| from utils import plot_learning_curve, make_env, manage_memory 4 | from gym import wrappers 5 | 6 | if __name__ == '__main__': 7 | manage_memory() 8 | env = make_env('PongNoFrameskip-v4') 9 | best_score = -np.inf 10 | load_checkpoint = False 11 | record_agent = False 12 | n_games = 250 13 | agent = Agent(gamma=0.99, epsilon=1, lr=0.0001, 14 | input_dims=(env.observation_space.shape), 15 | n_actions=env.action_space.n, mem_size=50000, eps_min=0.1, 16 | batch_size=32, replace=1000, eps_dec=1e-5, 17 | chkpt_dir='models/', algo='DQNAgent', 18 | env_name='PongNoFrameskip-v4') 19 | if load_checkpoint: 20 | agent.load_models() 21 | agent.epsilon = agent.eps_min 22 | 23 | fname = agent.algo + '_' + agent.env_name + '_lr' + str(agent.lr) + '_' \ 24 | + str(n_games) + 'games' 25 | figure_file = 'plots/' + fname + '.png' 26 | # if you want to record video of your agent playing, do a 27 | # mkdir video 28 | if record_agent: 29 | env = wrappers.Monitor(env, "video", 30 | video_callable=lambda episode_id: True, 31 | force=True) 32 | n_steps = 0 33 | scores, eps_history, steps_array = [], [], [] 34 | 35 | for i in range(n_games): 36 | done = False 37 | observation = env.reset() 38 | 39 | score = 0 40 | while not done: 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | score += reward 44 | 45 | if not load_checkpoint: 46 | agent.store_transition(observation, action, 47 | reward, observation_, done) 48 | agent.learn() 49 | observation = observation_ 50 | n_steps += 1 51 | scores.append(score) 52 | steps_array.append(n_steps) 53 | 54 | avg_score = np.mean(scores[-100:]) 55 | print('episode {} score {:.1f} avg score {:.1f} ' 56 | 'best score {:.1f} epsilon {:.2f} steps {}'. 57 | format(i, score, avg_score, best_score, agent.epsilon, 58 | n_steps)) 59 | 60 | if score > best_score: 61 | if not load_checkpoint: 62 | agent.save_models() 63 | best_score = score 64 | 65 | eps_history.append(agent.epsilon) 66 | 67 | x = [i+1 for i in range(len(scores))] 68 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 69 | -------------------------------------------------------------------------------- /DuelingDQN/tf2/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Conv2D, Dense, Flatten 3 | 4 | 5 | class DuelingDeepQNetwork(keras.Model): 6 | def __init__(self, input_dims, n_actions): 7 | super(DuelingDeepQNetwork, self).__init__() 8 | self.conv1 = Conv2D(32, 8, strides=(4, 4), activation='relu', 9 | data_format='channels_first', 10 | input_shape=input_dims) 11 | self.conv2 = Conv2D(64, 4, strides=(2, 2), activation='relu', 12 | data_format='channels_first') 13 | self.conv3 = Conv2D(64, 3, strides=(1, 1), activation='relu', 14 | data_format='channels_first') 15 | self.flat = Flatten() 16 | self.fc1 = Dense(512, activation='relu') 17 | self.A = Dense(n_actions, activation=None) 18 | self.V = Dense(1, activation=None) 19 | 20 | def call(self, state): 21 | x = self.conv1(state) 22 | x = self.conv2(x) 23 | x = self.conv3(x) 24 | x = self.flat(x) 25 | x = self.fc1(x) 26 | V = self.V(x) 27 | A = self.A(x) 28 | 29 | return V, A 30 | -------------------------------------------------------------------------------- /DuelingDQN/tf2/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size, input_shape, 
n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /DuelingDQN/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | import tensorflow as tf 7 | 8 | 9 | def manage_memory(): 10 | gpus = tf.config.list_physical_devices('GPU') 11 | if gpus: 12 | try: 13 | for gpu in gpus: 14 | tf.config.experimental.set_memory_growth(gpu, True) 15 | except RuntimeError as e: 16 | print(e) 17 | 18 | 19 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 20 | fig=plt.figure() 21 | ax=fig.add_subplot(111, label="1") 22 | ax2=fig.add_subplot(111, label="2", frame_on=False) 23 | 24 | ax.plot(x, epsilons, color="C0") 25 | ax.set_xlabel("Training Steps", color="C0") 26 | ax.set_ylabel("Epsilon", color="C0") 27 | ax.tick_params(axis='x', colors="C0") 28 | ax.tick_params(axis='y', colors="C0") 29 | 30 | N = len(scores) 31 | running_avg = np.empty(N) 32 | for t in range(N): 33 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 34 | 35 | ax2.scatter(x, running_avg, color="C1") 36 | ax2.axes.get_xaxis().set_visible(False) 37 | ax2.yaxis.tick_right() 38 | ax2.set_ylabel('Score', color="C1") 39 | ax2.yaxis.set_label_position('right') 40 | ax2.tick_params(axis='y', colors="C1") 41 | 42 | if lines is not None: 43 | for line in lines: 44 | plt.axvline(x=line) 45 | 46 | plt.savefig(filename) 47 | 48 | class RepeatActionAndMaxFrame(gym.Wrapper): 49 | def __init__(self, env=None, repeat=4, clip_reward=False, no_ops=0, 50 | fire_first=False): 51 | super(RepeatActionAndMaxFrame, self).__init__(env) 52 | self.repeat = repeat 53 | self.shape = env.observation_space.low.shape 54 | self.frame_buffer = np.zeros_like((2, self.shape)) 55 | self.clip_reward = clip_reward 56 | self.no_ops = no_ops 57 | self.fire_first = fire_first 58 | 59 | def step(self, action): 60 | t_reward = 0.0 61 | done = False 62 | for i in range(self.repeat): 63 | obs, reward, done, info = self.env.step(action) 64 | if self.clip_reward: 65 | reward = np.clip(np.array([reward]), -1, 1)[0] 66 | t_reward += reward 67 | idx = i % 2 68 | self.frame_buffer[idx] = obs 69 | if done: 70 | break 71 | 72 | max_frame = np.maximum(self.frame_buffer[0], 
self.frame_buffer[1]) 73 | return max_frame, t_reward, done, info 74 | 75 | def reset(self): 76 | obs = self.env.reset() 77 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 78 | for _ in range(no_ops): 79 | _, _, done, _ = self.env.step(0) 80 | if done: 81 | self.env.reset() 82 | if self.fire_first: 83 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 84 | obs, _, _, _ = self.env.step(1) 85 | 86 | self.frame_buffer = np.zeros_like((2,self.shape)) 87 | self.frame_buffer[0] = obs 88 | 89 | return obs 90 | 91 | class PreprocessFrame(gym.ObservationWrapper): 92 | def __init__(self, shape, env=None): 93 | super(PreprocessFrame, self).__init__(env) 94 | self.shape = (shape[2], shape[0], shape[1]) 95 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, 96 | shape=self.shape, dtype=np.float32) 97 | 98 | def observation(self, obs): 99 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 100 | resized_screen = cv2.resize(new_frame, self.shape[1:], 101 | interpolation=cv2.INTER_AREA) 102 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 103 | new_obs = new_obs / 255.0 104 | 105 | return new_obs 106 | 107 | class StackFrames(gym.ObservationWrapper): 108 | def __init__(self, env, repeat): 109 | super(StackFrames, self).__init__(env) 110 | self.observation_space = gym.spaces.Box( 111 | env.observation_space.low.repeat(repeat, axis=0), 112 | env.observation_space.high.repeat(repeat, axis=0), 113 | dtype=np.float32) 114 | self.stack = collections.deque(maxlen=repeat) 115 | 116 | def reset(self): 117 | self.stack.clear() 118 | observation = self.env.reset() 119 | for _ in range(self.stack.maxlen): 120 | self.stack.append(observation) 121 | 122 | return np.array(self.stack).reshape(self.observation_space.low.shape) 123 | 124 | def observation(self, observation): 125 | self.stack.append(observation) 126 | 127 | return np.array(self.stack).reshape(self.observation_space.low.shape) 128 | 129 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 130 | no_ops=0, fire_first=False): 131 | env = gym.make(env_name) 132 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 133 | env = PreprocessFrame(shape, env) 134 | env = StackFrames(env, repeat) 135 | 136 | return env 137 | -------------------------------------------------------------------------------- /DuelingDQN/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class 
RepeatActionAndMaxFrame(gym.Wrapper): 37 | """ modified from: 38 | https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py 39 | """ 40 | def __init__(self, env=None, repeat=4): 41 | super(RepeatActionAndMaxFrame, self).__init__(env) 42 | self.repeat = repeat 43 | self.shape = env.observation_space.low.shape 44 | self.frame_buffer = np.zeros_like((2,self.shape)) 45 | 46 | def step(self, action): 47 | t_reward = 0.0 48 | done = False 49 | for i in range(self.repeat): 50 | obs, reward, done, info = self.env.step(action) 51 | t_reward += reward 52 | idx = i % 2 53 | self.frame_buffer[idx] = obs 54 | if done: 55 | break 56 | 57 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 58 | return max_frame, t_reward, done, info 59 | 60 | def reset(self): 61 | obs = self.env.reset() 62 | self.frame_buffer = np.zeros_like((2,self.shape)) 63 | self.frame_buffer[0] = obs 64 | return obs 65 | 66 | class PreprocessFrame(gym.ObservationWrapper): 67 | def __init__(self, shape, env=None): 68 | super(PreprocessFrame, self).__init__(env) 69 | self.shape=(shape[2], shape[0], shape[1]) 70 | self.observation_space = gym.spaces.Box(low=0, high=1.0, 71 | shape=self.shape,dtype=np.float32) 72 | def observation(self, obs): 73 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 74 | resized_screen = cv2.resize(new_frame, self.shape[1:], 75 | interpolation=cv2.INTER_AREA) 76 | 77 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 78 | new_obs = np.swapaxes(new_obs, 2,0) 79 | new_obs = new_obs / 255.0 80 | return new_obs 81 | 82 | class StackFrames(gym.ObservationWrapper): 83 | def __init__(self, env, n_steps): 84 | super(StackFrames, self).__init__(env) 85 | self.observation_space = gym.spaces.Box( 86 | env.observation_space.low.repeat(n_steps, axis=0), 87 | env.observation_space.high.repeat(n_steps, axis=0), 88 | dtype=np.float32) 89 | self.stack = collections.deque(maxlen=n_steps) 90 | 91 | def reset(self): 92 | self.stack.clear() 93 | observation = self.env.reset() 94 | for _ in range(self.stack.maxlen): 95 | self.stack.append(observation) 96 | 97 | return np.array(self.stack).reshape(self.observation_space.low.shape) 98 | 99 | def observation(self, observation): 100 | self.stack.append(observation) 101 | obs = np.array(self.stack).reshape(self.observation_space.low.shape) 102 | 103 | return obs 104 | 105 | def make_env(env_name, shape=(84,84,1), skip=4): 106 | env = gym.make(env_name) 107 | env = RepeatActionAndMaxFrame(env, skip) 108 | env = PreprocessFrame(shape, env) 109 | env = StackFrames(env, skip) 110 | 111 | return env 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Phil Tabor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep-Q-Learning-Paper-To-Code 2 | 3 | Code for my course at Udemy: 4 | 5 | https://www.udemy.com/course/deep-q-learning-from-paper-to-code/?referralCode=CBA45A3B737237E7BFD2 6 | 7 | We analyze and implement the following papers: 8 | 9 | Human Level Control Through Deep Reinforcement Learning 10 | 11 | https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf 12 | 13 | Deep Reinforcement Learning with Double Q Learning: 14 | 15 | https://arxiv.org/abs/1509.06461 16 | 17 | Dueling Network Architectures for Deep Reinforcement Learning: 18 | 19 | https://arxiv.org/abs/1511.06581 20 | 21 | The course is still in review, and this readme is a work in progress. 22 | 23 | Better docs to come! 24 | -------------------------------------------------------------------------------- /agents.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | from deep_q_network import DeepQNetwork, DuelingDeepQNetwork 4 | from replay_memory import ReplayBuffer 5 | 6 | class Agent(): 7 | def __init__(self, gamma, epsilon, lr, n_actions, input_dims, 8 | mem_size, batch_size, eps_min=0.01, eps_dec=5e-7, 9 | replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'): 10 | self.gamma = gamma 11 | self.epsilon = epsilon 12 | self.lr = lr 13 | self.n_actions = n_actions 14 | self.input_dims = input_dims 15 | self.eps_min = eps_min 16 | self.eps_dec = eps_dec 17 | self.action_space = [i for i in range(n_actions)] 18 | self.learn_step_counter = 0 19 | self.batch_size = batch_size 20 | self.replace_target_cnt = replace 21 | self.algo = algo 22 | self.env_name = env_name 23 | self.chkpt_dir = chkpt_dir 24 | 25 | self.memory = ReplayBuffer(mem_size, input_dims, n_actions) 26 | 27 | def store_transition(self, state, action, reward, state_, done): 28 | self.memory.store_transition(state, action, reward, state_, done) 29 | 30 | def choose_action(self, observation): 31 | raise NotImplementedError 32 | 33 | def replace_target_network(self): 34 | if self.learn_step_counter % self.replace_target_cnt == 0: 35 | self.q_next.load_state_dict(self.q_eval.state_dict()) 36 | 37 | def decrement_epsilon(self): 38 | self.epsilon = self.epsilon - self.eps_dec \ 39 | if self.epsilon > self.eps_min else self.eps_min 40 | def sample_memory(self): 41 | state, action, reward, new_state, done = \ 42 | self.memory.sample_buffer(self.batch_size) 43 | 44 | states = T.tensor(state).to(self.q_eval.device) 45 | rewards = T.tensor(reward).to(self.q_eval.device) 46 | dones = T.tensor(done).to(self.q_eval.device) 47 | actions = T.tensor(action).to(self.q_eval.device) 48 | states_ = T.tensor(new_state).to(self.q_eval.device) 49 | 50 | return states, actions, rewards, states_, dones 51 | 52 | def learn(self): 53 | raise NotImplementedError 54 | 55 | def save_models(self): 56 | 
self.q_eval.save_checkpoint() 57 | self.q_next.save_checkpoint() 58 | 59 | def load_models(self): 60 | self.q_eval.load_checkpoint() 61 | self.q_next.load_checkpoint() 62 | 63 | class DQNAgent(Agent): 64 | def __init__(self, *args, **kwargs): 65 | super(DQNAgent, self).__init__(*args, **kwargs) 66 | 67 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 68 | input_dims=self.input_dims, 69 | name=self.env_name+'_'+self.algo+'_q_eval', 70 | chkpt_dir=self.chkpt_dir) 71 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 72 | input_dims=self.input_dims, 73 | name=self.env_name+'_'+self.algo+'_q_next', 74 | chkpt_dir=self.chkpt_dir) 75 | 76 | def choose_action(self, observation): 77 | if np.random.random() > self.epsilon: 78 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 79 | actions = self.q_eval.forward(state) 80 | action = T.argmax(actions).item() 81 | else: 82 | action = np.random.choice(self.action_space) 83 | 84 | return action 85 | 86 | def learn(self): 87 | if self.memory.mem_cntr < self.batch_size: 88 | return 89 | 90 | self.q_eval.optimizer.zero_grad() 91 | 92 | self.replace_target_network() 93 | 94 | states, actions, rewards, states_, dones = self.sample_memory() 95 | indices = np.arange(self.batch_size) 96 | 97 | q_pred = self.q_eval.forward(states)[indices, actions] 98 | 99 | q_next = self.q_next.forward(states_).max(dim=1)[0] 100 | q_next[dones] = 0.0 101 | 102 | q_target = rewards + self.gamma*q_next 103 | 104 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 105 | loss.backward() 106 | self.q_eval.optimizer.step() 107 | self.learn_step_counter += 1 108 | 109 | self.decrement_epsilon() 110 | 111 | class DDQNAgent(Agent): 112 | def __init__(self, *args, **kwargs): 113 | super(DDQNAgent, self).__init__(*args, **kwargs) 114 | 115 | self.q_eval = DeepQNetwork(self.lr, self.n_actions, 116 | input_dims=self.input_dims, 117 | name=self.env_name+'_'+self.algo+'_q_eval', 118 | chkpt_dir=self.chkpt_dir) 119 | self.q_next = DeepQNetwork(self.lr, self.n_actions, 120 | input_dims=self.input_dims, 121 | name=self.env_name+'_'+self.algo+'_q_next', 122 | chkpt_dir=self.chkpt_dir) 123 | 124 | def choose_action(self, observation): 125 | if np.random.random() > self.epsilon: 126 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 127 | actions = self.q_eval.forward(state) 128 | action = T.argmax(actions).item() 129 | else: 130 | action = np.random.choice(self.action_space) 131 | 132 | return action 133 | 134 | def learn(self): 135 | if self.memory.mem_cntr < self.batch_size: 136 | return 137 | 138 | self.q_eval.optimizer.zero_grad() 139 | 140 | self.replace_target_network() 141 | 142 | states, actions, rewards, states_, dones = self.sample_memory() 143 | indices = np.arange(self.batch_size) 144 | 145 | q_pred = self.q_eval.forward(states)[indices, actions] 146 | q_next = self.q_next.forward(states_) 147 | q_eval = self.q_eval.forward(states_) 148 | 149 | max_actions = T.argmax(q_eval, dim=1) 150 | q_next[dones] = 0.0 151 | 152 | q_target = rewards + self.gamma*q_next[indices, max_actions] 153 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 154 | loss.backward() 155 | 156 | self.q_eval.optimizer.step() 157 | self.learn_step_counter += 1 158 | 159 | self.decrement_epsilon() 160 | 161 | class DuelingDQNAgent(Agent): 162 | def __init__(self, *args, **kwargs): 163 | super(DuelingDQNAgent, self).__init__(*args, **kwargs) 164 | 165 | self.q_eval = DuelingDeepQNetwork(self.lr, self.n_actions, 166 | 
input_dims=self.input_dims, 167 | name=self.env_name+'_'+self.algo+'_q_eval', 168 | chkpt_dir=self.chkpt_dir) 169 | self.q_next = DuelingDeepQNetwork(self.lr, self.n_actions, 170 | input_dims=self.input_dims, 171 | name=self.env_name+'_'+self.algo+'_q_next', 172 | chkpt_dir=self.chkpt_dir) 173 | 174 | def choose_action(self, observation): 175 | if np.random.random() > self.epsilon: 176 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 177 | _, advantage = self.q_eval.forward(state) 178 | action = T.argmax(advantage).item() 179 | else: 180 | action = np.random.choice(self.action_space) 181 | 182 | return action 183 | 184 | def learn(self): 185 | if self.memory.mem_cntr < self.batch_size: 186 | return 187 | 188 | self.q_eval.optimizer.zero_grad() 189 | 190 | self.replace_target_network() 191 | 192 | states, actions, rewards, states_, dones = self.sample_memory() 193 | indices = np.arange(self.batch_size) 194 | 195 | V_s, A_s = self.q_eval.forward(states) 196 | V_s_, A_s_ = self.q_next.forward(states_) 197 | 198 | q_pred = T.add(V_s, 199 | (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions] 200 | q_next = T.add(V_s_, 201 | (A_s_ - A_s_.mean(dim=1, keepdim=True))).max(dim=1)[0] 202 | 203 | q_next[dones] = 0.0 204 | q_target = rewards + self.gamma*q_next 205 | 206 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 207 | loss.backward() 208 | self.q_eval.optimizer.step() 209 | 210 | self.learn_step_counter += 1 211 | self.decrement_epsilon() 212 | 213 | class DuelingDDQNAgent(Agent): 214 | def __init__(self, *args, **kwargs): 215 | super(DuelingDDQNAgent, self).__init__(*args, **kwargs) 216 | 217 | self.q_eval = DuelingDeepQNetwork(self.lr, self.n_actions, 218 | input_dims=self.input_dims, 219 | name=self.env_name+'_'+self.algo+'_q_eval', 220 | chkpt_dir=self.chkpt_dir) 221 | self.q_next = DuelingDeepQNetwork(self.lr, self.n_actions, 222 | input_dims=self.input_dims, 223 | name=self.env_name+'_'+self.algo+'_q_next', 224 | chkpt_dir=self.chkpt_dir) 225 | 226 | def choose_action(self, observation): 227 | if np.random.random() > self.epsilon: 228 | state = T.tensor([observation],dtype=T.float).to(self.q_eval.device) 229 | _, advantage = self.q_eval.forward(state) 230 | action = T.argmax(advantage).item() 231 | else: 232 | action = np.random.choice(self.action_space) 233 | 234 | return action 235 | 236 | def learn(self): 237 | if self.memory.mem_cntr < self.batch_size: 238 | return 239 | 240 | self.q_eval.optimizer.zero_grad() 241 | 242 | self.replace_target_network() 243 | 244 | states, actions, rewards, states_, dones = self.sample_memory() 245 | 246 | indices = np.arange(self.batch_size) 247 | 248 | V_s, A_s = self.q_eval.forward(states) 249 | V_s_, A_s_ = self.q_next.forward(states_) 250 | 251 | V_s_eval, A_s_eval = self.q_eval.forward(states_) 252 | 253 | q_pred = T.add(V_s, 254 | (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions] 255 | q_next = T.add(V_s_, (A_s_ - A_s_.mean(dim=1, keepdim=True))) 256 | 257 | q_eval = T.add(V_s_eval, 258 | (A_s_eval - A_s_eval.mean(dim=1, keepdim=True))) 259 | 260 | max_actions = T.argmax(q_eval, dim=1) 261 | q_next[dones] = 0.0 262 | 263 | q_target = rewards + self.gamma*q_next[indices, max_actions] 264 | loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device) 265 | loss.backward() 266 | self.q_eval.optimizer.step() 267 | self.learn_step_counter += 1 268 | 269 | self.decrement_epsilon() 270 | -------------------------------------------------------------------------------- /argparse_example.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | parser = argparse.ArgumentParser(description='') 4 | 5 | # type can be int, str, float, bool, etc. 6 | # this argument is optional 7 | parser.add_argument('-argument', type=dtype, default=x, help='help string') 8 | 9 | # this argument is not optional 10 | parser.add_argument('argument', type=dtype, default=x, help='help string') 11 | 12 | # parse the args. 13 | args = parser.parse_args() 14 | 15 | # access parameters like this 16 | variable = args.argument 17 | -------------------------------------------------------------------------------- /deep_q_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class DeepQNetwork(nn.Module): 9 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 10 | super(DeepQNetwork, self).__init__() 11 | self.checkpoint_dir = chkpt_dir 12 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 13 | 14 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 15 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 16 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 17 | 18 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 19 | 20 | self.fc1 = nn.Linear(fc_input_dims, 512) 21 | self.fc2 = nn.Linear(512, n_actions) 22 | 23 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 24 | 25 | self.loss = nn.MSELoss() 26 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 27 | self.to(self.device) 28 | 29 | def calculate_conv_output_dims(self, input_dims): 30 | state = T.zeros(1, *input_dims) 31 | dims = self.conv1(state) 32 | dims = self.conv2(dims) 33 | dims = self.conv3(dims) 34 | return int(np.prod(dims.size())) 35 | 36 | def forward(self, state): 37 | conv1 = F.relu(self.conv1(state)) 38 | conv2 = F.relu(self.conv2(conv1)) 39 | conv3 = F.relu(self.conv3(conv2)) 40 | conv_state = conv3.view(conv3.size()[0], -1) 41 | 42 | flat1 = F.relu(self.fc1(conv_state)) 43 | actions = self.fc2(flat1) 44 | 45 | return actions 46 | 47 | def save_checkpoint(self): 48 | print('... saving checkpoint ...') 49 | T.save(self.state_dict(), self.checkpoint_file) 50 | 51 | def load_checkpoint(self): 52 | print('... 
loading checkpoint ...') 53 | self.load_state_dict(T.load(self.checkpoint_file)) 54 | 55 | class DuelingDeepQNetwork(nn.Module): 56 | def __init__(self, lr, n_actions, name, input_dims, chkpt_dir): 57 | super(DuelingDeepQNetwork, self).__init__() 58 | 59 | self.checkpoint_dir = chkpt_dir 60 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name) 61 | 62 | self.conv1 = nn.Conv2d(input_dims[0], 32, 8, stride=4) 63 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 64 | self.conv3 = nn.Conv2d(64, 64, 3, stride=1) 65 | 66 | fc_input_dims = self.calculate_conv_output_dims(input_dims) 67 | 68 | self.fc1 = nn.Linear(fc_input_dims, 512) 69 | 70 | self.V = nn.Linear(512, 1) 71 | self.A = nn.Linear(512, n_actions) 72 | 73 | self.optimizer = optim.RMSprop(self.parameters(), lr=lr) 74 | self.loss = nn.MSELoss() 75 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 76 | self.to(self.device) 77 | 78 | def calculate_conv_output_dims(self, input_dims): 79 | state = T.zeros(1, *input_dims) 80 | dims = self.conv1(state) 81 | dims = self.conv2(dims) 82 | dims = self.conv3(dims) 83 | return int(np.prod(dims.size())) 84 | 85 | def forward(self, state): 86 | conv1 = F.relu(self.conv1(state)) 87 | conv2 = F.relu(self.conv2(conv1)) 88 | conv3 = F.relu(self.conv3(conv2)) 89 | conv_state = conv3.view(conv3.size()[0], -1) 90 | flat1 = F.relu(self.fc1(conv_state)) 91 | 92 | V = self.V(flat1) 93 | A = self.A(flat1) 94 | 95 | return V, A 96 | 97 | def save_checkpoint(self): 98 | print('... saving checkpoint ...') 99 | T.save(self.state_dict(), self.checkpoint_file) 100 | 101 | def load_checkpoint(self): 102 | print('... loading checkpoint ...') 103 | self.load_state_dict(T.load(self.checkpoint_file)) 104 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse, os 2 | import gym 3 | import numpy as np 4 | import agents as Agents 5 | from utils import plot_learning_curve, make_env 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser( 9 | description='Deep Q Learning: From Paper to Code') 10 | # the hyphen makes the argument optional 11 | parser.add_argument('-n_games', type=int, default=1, 12 | help='Number of games to play') 13 | parser.add_argument('-lr', type=float, default=0.0001, 14 | help='Learning rate for optimizer') 15 | parser.add_argument('-eps_min', type=float, default=0.1, 16 | help='Minimum value for epsilon in epsilon-greedy action selection') 17 | parser.add_argument('-gamma', type=float, default=0.99, 18 | help='Discount factor for update equation.') 19 | parser.add_argument('-eps_dec', type=float, default=1e-5, 20 | help='Linear factor for decreasing epsilon') 21 | parser.add_argument('-eps', type=float, default=1.0, 22 | help='Starting value for epsilon in epsilon-greedy action selection') 23 | parser.add_argument('-max_mem', type=int, default=50000, #~13Gb 24 | help='Maximum size for memory replay buffer') 25 | parser.add_argument('-repeat', type=int, default=4, 26 | help='Number of frames to repeat & stack') 27 | parser.add_argument('-bs', type=int, default=32, 28 | help='Batch size for replay memory sampling') 29 | parser.add_argument('-replace', type=int, default=1000, 30 | help='interval for replacing target network') 31 | parser.add_argument('-env', type=str, default='PongNoFrameskip-v4', 32 | help='Atari environment.\nPongNoFrameskip-v4\n \ 33 | BreakoutNoFrameskip-v4\n \ 34 | SpaceInvadersNoFrameskip-v4\n \ 35 | 
EnduroNoFrameskip-v4\n \ 36 | AtlantisNoFrameskip-v4') 37 | parser.add_argument('-gpu', type=str, default='0', help='GPU: 0 or 1') 38 | parser.add_argument('-load_checkpoint', type=bool, default=False, 39 | help='load model checkpoint') 40 | parser.add_argument('-path', type=str, default='models/', 41 | help='path for model saving/loading') 42 | parser.add_argument('-algo', type=str, default='DQNAgent', 43 | help='DQNAgent/DDQNAgent/DuelingDQNAgent/DuelingDDQNAgent') 44 | parser.add_argument('-clip_rewards', type=bool, default=False, 45 | help='Clip rewards to range -1 to 1') 46 | parser.add_argument('-no_ops', type=int, default=0, 47 | help='Max number of no ops for testing') 48 | parser.add_argument('-fire_first', type=bool, default=False, 49 | help='Set first action of episode to fire') 50 | args = parser.parse_args() 51 | 52 | os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 53 | os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu 54 | 55 | env = make_env(env_name=args.env, repeat=args.repeat, 56 | clip_rewards=args.clip_rewards, no_ops=args.no_ops, 57 | fire_first=args.fire_first) 58 | 59 | best_score = -np.inf 60 | agent_ = getattr(Agents, args.algo) 61 | agent = agent_(gamma=args.gamma, 62 | epsilon=args.eps, 63 | lr=args.lr, 64 | input_dims=env.observation_space.shape, 65 | n_actions=env.action_space.n, 66 | mem_size=args.max_mem, 67 | eps_min=args.eps_min, 68 | batch_size=args.bs, 69 | replace=args.replace, 70 | eps_dec=args.eps_dec, 71 | chkpt_dir=args.path, 72 | algo=args.algo, 73 | env_name=args.env) 74 | 75 | if args.load_checkpoint: 76 | agent.load_models() 77 | 78 | fname = args.algo + '_' + args.env + '_alpha' + str(args.lr) +'_' \ 79 | + str(args.n_games) + 'games' 80 | figure_file = 'plots/' + fname + '.png' 81 | scores_file = fname + '_scores.npy' 82 | 83 | scores, eps_history = [], [] 84 | n_steps = 0 85 | steps_array = [] 86 | for i in range(args.n_games): 87 | done = False 88 | observation = env.reset() 89 | score = 0 90 | while not done: 91 | action = agent.choose_action(observation) 92 | observation_, reward, done, info = env.step(action) 93 | score += reward 94 | 95 | if not args.load_checkpoint: 96 | agent.store_transition(observation, action, 97 | reward, observation_, int(done)) 98 | agent.learn() 99 | observation = observation_ 100 | n_steps += 1 101 | scores.append(score) 102 | steps_array.append(n_steps) 103 | 104 | avg_score = np.mean(scores[-100:]) 105 | print('episode: ', i,'score: ', score, 106 | ' average score %.1f' % avg_score, 'best score %.2f' % best_score, 107 | 'epsilon %.2f' % agent.epsilon, 'steps', n_steps) 108 | 109 | if avg_score > best_score: 110 | if not args.load_checkpoint: 111 | agent.save_models() 112 | best_score = avg_score 113 | 114 | eps_history.append(agent.epsilon) 115 | if args.load_checkpoint and n_steps >= 18000: 116 | break 117 | 118 | x = [i+1 for i in range(len(scores))] 119 | plot_learning_curve(steps_array, scores, eps_history, figure_file) 120 | #np.save(scores_file, np.array(scores)) 121 | -------------------------------------------------------------------------------- /naive_deep_q_learning/cartpole_naive_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/naive_deep_q_learning/cartpole_naive_dqn.png -------------------------------------------------------------------------------- /naive_deep_q_learning/cartpole_naive_dqn.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import torch as T 7 | from util import plot_learning_curve 8 | 9 | class LinearDeepQNetwork(nn.Module): 10 | def __init__(self, lr, n_actions, input_dims): 11 | super(LinearDeepQNetwork, self).__init__() 12 | 13 | self.fc1 = nn.Linear(*input_dims, 128) 14 | self.fc2 = nn.Linear(128, n_actions) 15 | 16 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 17 | self.loss = nn.MSELoss() 18 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 19 | self.to(self.device) 20 | 21 | def forward(self, state): 22 | layer1 = F.relu(self.fc1(state)) 23 | actions = self.fc2(layer1) 24 | 25 | return actions 26 | 27 | 28 | class Agent(): 29 | def __init__(self, input_dims, n_actions, lr, gamma=0.99, 30 | epsilon=1.0, eps_dec=1e-5, eps_min=0.01): 31 | self.lr = lr 32 | self.input_dims = input_dims 33 | self.n_actions = n_actions 34 | self.gamma = gamma 35 | self.epsilon = epsilon 36 | self.eps_dec = eps_dec 37 | self.eps_min = eps_min 38 | self.action_space = [i for i in range(self.n_actions)] 39 | 40 | self.Q = LinearDeepQNetwork(self.lr, self.n_actions, self.input_dims) 41 | 42 | def choose_action(self, observation): 43 | if np.random.random() > self.epsilon: 44 | state = T.tensor(observation, dtype=T.float).to(self.Q.device) 45 | actions = self.Q.forward(state) 46 | action = T.argmax(actions).item() 47 | else: 48 | action = np.random.choice(self.action_space) 49 | 50 | return action 51 | 52 | def decrement_epsilon(self): 53 | self.epsilon = self.epsilon - self.eps_dec \ 54 | if self.epsilon > self.eps_min else self.eps_min 55 | 56 | def learn(self, state, action, reward, state_): 57 | self.Q.optimizer.zero_grad() 58 | states = T.tensor(state, dtype=T.float).to(self.Q.device) 59 | actions = T.tensor(action).to(self.Q.device) 60 | rewards = T.tensor(reward).to(self.Q.device) 61 | states_ = T.tensor(state_, dtype=T.float).to(self.Q.device) 62 | 63 | q_pred = self.Q.forward(states)[actions] 64 | 65 | q_next = self.Q.forward(states_).max() 66 | 67 | q_target = rewards + self.gamma*q_next 68 | 69 | loss = self.Q.loss(q_target, q_pred).to(self.Q.device) 70 | loss.backward() 71 | self.Q.optimizer.step() 72 | self.decrement_epsilon() 73 | 74 | if __name__ == '__main__': 75 | env = gym.make('CartPole-v1') 76 | n_games = 10000 77 | scores = [] 78 | eps_history = [] 79 | 80 | agent = Agent(lr=0.0001, input_dims=env.observation_space.shape, 81 | n_actions=env.action_space.n) 82 | 83 | for i in range(n_games): 84 | score = 0 85 | done = False 86 | obs = env.reset() 87 | 88 | while not done: 89 | action = agent.choose_action(obs) 90 | obs_, reward, done, info = env.step(action) 91 | score += reward 92 | agent.learn(obs, action, reward, obs_) 93 | obs = obs_ 94 | scores.append(score) 95 | eps_history.append(agent.epsilon) 96 | 97 | if i % 100 == 0: 98 | avg_score = np.mean(scores[-100:]) 99 | print('episode ', i, 'score %.1f avg score %.1f epsilon %.2f' % 100 | (score, avg_score, agent.epsilon)) 101 | filename = 'cartpole_naive_dqn.png' 102 | x = [i+1 for i in range(n_games)] 103 | plot_learning_curve(x, scores, eps_history, filename) 104 | -------------------------------------------------------------------------------- /naive_deep_q_learning/pytorch_example.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import 
torch.nn.functional as F 3 | import torch.optim as optim 4 | import torch as T 5 | 6 | class LinearClassifier(nn.Module): 7 | def __init__(self, lr, n_classes, input_dims): 8 | super(LinearClassifier, self).__init__() 9 | 10 | self.fc1 = nn.Linear(*input_dims, 128) 11 | self.fc2 = nn.Linear(128, 256) 12 | self.fc3 = nn.Linear(256, n_classes) 13 | 14 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 15 | self.loss = nn.CrossEntropyLoss() 16 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 17 | self.to(self.device) 18 | 19 | def forward(self, data): 20 | layer1 = F.sigmoid(self.fc1(data)) 21 | layer2 = F.sigmoid(self.fc2(layer1)) 22 | layer3 = self.fc3(layer2) 23 | 24 | return layer3 25 | 26 | def learn(self, data, labels): 27 | self.optimizer.zero_grad() 28 | data = T.tensor(data).to(self.device) 29 | labels = T.tensor(labels).to(self.device) 30 | 31 | predictions = self.forward(data) 32 | 33 | cost = self.loss(predictions, labels) 34 | 35 | cost.backward() 36 | self.optimizer.step() 37 | -------------------------------------------------------------------------------- /naive_deep_q_learning/util.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def plot_learning_curve(x, scores, epsilons, filename): 5 | fig = plt.figure() 6 | ax = fig.add_subplot(111, label="1") 7 | ax2 = fig.add_subplot(111, label="2", frame_on=False) 8 | 9 | ax.plot(x, epsilons, color="C0") 10 | ax.set_xlabel("Training Steps", color="C0") 11 | ax.set_ylabel("Epsilon", color="C0") 12 | ax.tick_params(axis='x', colors="C0") 13 | ax.tick_params(axis='y', colors="C0") 14 | 15 | N = len(scores) 16 | running_avg = np.empty(N) 17 | for t in range(N): 18 | running_avg[t] = np.mean(scores[max(0, t-100):(t+1)]) 19 | 20 | ax2.scatter(x, running_avg, color="C1") 21 | ax2.axes.get_xaxis().set_visible(False) 22 | ax2.yaxis.tick_right() 23 | ax2.set_ylabel('Score', color="C1") 24 | ax2.yaxis.set_label_position('right') 25 | ax2.tick_params(axis='y', colors="C1") 26 | 27 | plt.savefig(filename) 28 | -------------------------------------------------------------------------------- /q_learning/frozen_lake_deterministic_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # LEFT = 0 DOWN = 1 RIGHT = 2 UP = 3 6 | # SFFF 7 | # FHFH 8 | # FFFH 9 | # HFFG 10 | 11 | policy = {0: 1, 1: 2, 2: 1, 3: 0, 4: 1, 6: 1, 8: 2, 9: 1, 10: 1, 13: 2, 14: 2} 12 | 13 | env = gym.make('FrozenLake-v0') 14 | n_games = 1000 15 | win_pct = [] 16 | scores = [] 17 | 18 | for i in range(n_games): 19 | done = False 20 | obs = env.reset() 21 | score = 0 22 | while not done: 23 | action = policy[obs] 24 | obs, reward, done, info = env.step(action) 25 | score += reward 26 | scores.append(score) 27 | if i % 10 == 0: 28 | average = np.mean(scores[-10:]) 29 | win_pct.append(average) 30 | plt.plot(win_pct) 31 | plt.show() 32 | -------------------------------------------------------------------------------- /q_learning/frozen_lake_env_test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('FrozenLake-v0') 6 | # LEFT = 0 DOWN = 1 RIGHT = 2 UP = 3 7 | # SFFF 8 | # FHFH 9 | # FFFH 10 | # HFFG 11 | policy = {0: 1, 1: 2, 2: 1, 3: 0, 4:1, 6: 1, 8:2, 9:1, 10:1, 13: 2, 14:2} 12 | 13 | n_games = 1000 14 | 
win_pct = [] 15 | scores = [] 16 | for i in range(n_games): 17 | done = False 18 | obs = env.reset() 19 | score = 0 20 | while not done: 21 | action = env.action_space.sample() 22 | #action = policy[obs] 23 | obs, reward, done, info = env.step(action) 24 | score += reward 25 | scores.append(score) 26 | if i % 10 == 0: 27 | average = np.mean(scores[-10:]) 28 | win_pct.append(average) 29 | plt.plot(win_pct) 30 | plt.show() 31 | -------------------------------------------------------------------------------- /q_learning/frozen_lake_q_learning.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from q_learning_agent import Agent 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('FrozenLake-v0') 8 | agent = Agent(lr=0.001, gamma=0.9, eps_start=1.0, eps_end=0.01, 9 | eps_dec=0.9999995, n_actions=4, n_states=16) 10 | 11 | scores = [] 12 | win_pct_list = [] 13 | n_games = 500000 14 | 15 | for i in range(n_games): 16 | done = False 17 | observation = env.reset() 18 | score = 0 19 | while not done: 20 | action = agent.choose_action(observation) 21 | observation_, reward, done, info = env.step(action) 22 | agent.learn(observation, action, reward, observation_) 23 | score += reward 24 | observation = observation_ 25 | scores.append(score) 26 | if i % 100 == 0: 27 | win_pct = np.mean(scores[-100:]) 28 | win_pct_list.append(win_pct) 29 | if i % 1000 == 0: 30 | print('episode ', i, 'win pct %.2f' % win_pct, 31 | 'epsilon %.2f' % agent.epsilon) 32 | plt.plot(win_pct_list) 33 | plt.show() 34 | -------------------------------------------------------------------------------- /q_learning/frozen_lake_random_agent.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | env = gym.make('FrozenLake-v0') 6 | 7 | n_games = 1000 8 | win_pct = [] 9 | scores = [] 10 | for i in range(n_games): 11 | done = False 12 | obs = env.reset() 13 | score = 0 14 | while not done: 15 | action = env.action_space.sample() 16 | obs, reward, done, info = env.step(action) 17 | score += reward 18 | scores.append(score) 19 | 20 | if i % 10 == 0: 21 | average = np.mean(scores[-10:]) 22 | win_pct.append(average) 23 | plt.plot(win_pct) 24 | plt.show() 25 | -------------------------------------------------------------------------------- /q_learning/plots/frozen_lake_deterministic_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/q_learning/plots/frozen_lake_deterministic_policy.png -------------------------------------------------------------------------------- /q_learning/plots/frozen_lake_q_learning_agent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/q_learning/plots/frozen_lake_q_learning_agent.png -------------------------------------------------------------------------------- /q_learning/plots/frozen_lake_random_policy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philtabor/Deep-Q-Learning-Paper-To-Code/64e0546d95ba313fb677f193abbdc38f463c4aa3/q_learning/plots/frozen_lake_random_policy.png 
-------------------------------------------------------------------------------- /q_learning/q_learning_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(): 4 | def __init__(self, lr, gamma, n_actions, n_states, eps_start, eps_end, 5 | eps_dec): 6 | self.lr = lr 7 | self.gamma = gamma 8 | self.n_actions = n_actions 9 | self.n_states = n_states 10 | self.epsilon = eps_start 11 | self.eps_min = eps_end 12 | self.eps_dec = eps_dec 13 | 14 | self.Q = {} 15 | 16 | self.init_Q() 17 | 18 | def init_Q(self): 19 | for state in range(self.n_states): 20 | for action in range(self.n_actions): 21 | self.Q[(state, action)] = 0.0 22 | 23 | def choose_action(self, state): 24 | if np.random.random() < self.epsilon: 25 | action = np.random.choice([i for i in range(self.n_actions)]) 26 | else: 27 | actions = np.array([self.Q[(state, a)] \ 28 | for a in range(self.n_actions)]) 29 | action = np.argmax(actions) 30 | return action 31 | 32 | def decrement_epsilon(self): 33 | self.epsilon = self.epsilon*self.eps_dec if self.epsilon>self.eps_min\ 34 | else self.eps_min 35 | 36 | def learn(self, state, action, reward, state_): 37 | actions = np.array([self.Q[(state_, a)] for a in range(self.n_actions)]) 38 | a_max = np.argmax(actions) 39 | 40 | self.Q[(state, action)] += self.lr*(reward + 41 | self.gamma*self.Q[(state_, a_max)] - 42 | self.Q[(state, action)]) 43 | self.decrement_epsilon() 44 | -------------------------------------------------------------------------------- /q_learning/q_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(object): 4 | def __init__(self, lr, gamma, n_actions, n_states, epsilon_start, 5 | epsilon_end, epsilon_dec): 6 | self.lr = lr 7 | self.gamma = gamma 8 | self.n_actions = n_actions 9 | self.n_states = n_states 10 | self.epsilon = epsilon_start 11 | self.eps_min = epsilon_end 12 | self.eps_dec = epsilon_dec 13 | self.Q = {} 14 | 15 | self.init_Q() 16 | 17 | def init_Q(self): 18 | for state in range(self.n_states): 19 | for action in range(self.n_actions): 20 | self.Q[(state, action)] = 0.0 21 | 22 | def choose_action(self, state): 23 | if np.random.random() < self.epsilon: 24 | action = np.random.choice([i for i in range(self.n_actions)]) 25 | else: 26 | actions = np.array([self.Q[(state, a)] \ 27 | for a in range(self.n_actions)]) 28 | action = np.argmax(actions) 29 | return action 30 | 31 | def decrement_epsilon(self): 32 | self.epsilon = self.epsilon*self.eps_dec if self.epsilon>self.eps_min \ 33 | else self.eps_min 34 | 35 | def learn(self, state, action, reward, state_): 36 | actions = np.array([self.Q[(state_, a)] for a in range(self.n_actions)]) 37 | a_max = np.argmax(actions) 38 | self.Q[(state,action)] += self.lr*(reward + 39 | self.gamma*self.Q[(state_,a_max)]-\ 40 | self.Q[(state, action)]) 41 | 42 | self.decrement_epsilon() 43 | -------------------------------------------------------------------------------- /replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape), 8 | dtype=np.float32) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape), 10 | dtype=np.float32) 11 | 12 | self.action_memory = np.zeros(self.mem_size, 
dtype=np.int64) 13 | self.reward_memory = np.zeros(self.mem_size, dtype=np.float32) 14 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_) 15 | 16 | def store_transition(self, state, action, reward, state_, done): 17 | index = self.mem_cntr % self.mem_size 18 | self.state_memory[index] = state 19 | self.new_state_memory[index] = state_ 20 | self.action_memory[index] = action 21 | self.reward_memory[index] = reward 22 | self.terminal_memory[index] = done 23 | self.mem_cntr += 1 24 | 25 | def sample_buffer(self, batch_size): 26 | max_mem = min(self.mem_cntr, self.mem_size) 27 | batch = np.random.choice(max_mem, batch_size, replace=False) 28 | 29 | states = self.state_memory[batch] 30 | actions = self.action_memory[batch] 31 | rewards = self.reward_memory[batch] 32 | states_ = self.new_state_memory[batch] 33 | terminal = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, terminal 36 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import cv2 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import gym 6 | 7 | def plot_learning_curve(x, scores, epsilons, filename, lines=None): 8 | fig=plt.figure() 9 | ax=fig.add_subplot(111, label="1") 10 | ax2=fig.add_subplot(111, label="2", frame_on=False) 11 | 12 | ax.plot(x, epsilons, color="C0") 13 | ax.set_xlabel("Training Steps", color="C0") 14 | ax.set_ylabel("Epsilon", color="C0") 15 | ax.tick_params(axis='x', colors="C0") 16 | ax.tick_params(axis='y', colors="C0") 17 | 18 | N = len(scores) 19 | running_avg = np.empty(N) 20 | for t in range(N): 21 | running_avg[t] = np.mean(scores[max(0, t-20):(t+1)]) 22 | 23 | ax2.scatter(x, running_avg, color="C1") 24 | ax2.axes.get_xaxis().set_visible(False) 25 | ax2.yaxis.tick_right() 26 | ax2.set_ylabel('Score', color="C1") 27 | ax2.yaxis.set_label_position('right') 28 | ax2.tick_params(axis='y', colors="C1") 29 | 30 | if lines is not None: 31 | for line in lines: 32 | plt.axvline(x=line) 33 | 34 | plt.savefig(filename) 35 | 36 | class RepeatActionAndMaxFrame(gym.Wrapper): 37 | """ modified from: 38 | https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On/blob/master/Chapter06/lib/wrappers.py 39 | """ 40 | def __init__(self, env=None, repeat=4, clip_reward=False, 41 | no_ops=0, fire_first=False): 42 | super(RepeatActionAndMaxFrame, self).__init__(env) 43 | self.repeat = repeat 44 | self.shape = env.observation_space.low.shape 45 | self.frame_buffer = np.zeros((2, *self.shape), dtype=np.uint8) 46 | self.clip_reward = clip_reward 47 | self.no_ops = no_ops 48 | self.fire_first = fire_first 49 | 50 | def step(self, action): 51 | t_reward = 0.0 52 | done = False 53 | for i in range(self.repeat): 54 | obs, reward, done, info = self.env.step(action) 55 | if self.clip_reward: 56 | reward = np.clip(np.array([reward]), -1, 1)[0] 57 | t_reward += reward 58 | idx = i % 2 59 | self.frame_buffer[idx] = obs 60 | if done: 61 | break 62 | 63 | max_frame = np.maximum(self.frame_buffer[0], self.frame_buffer[1]) 64 | return max_frame, t_reward, done, info 65 | 66 | def reset(self): 67 | obs = self.env.reset() 68 | no_ops = np.random.randint(self.no_ops)+1 if self.no_ops > 0 else 0 69 | for _ in range(no_ops): 70 | obs, _, done, _ = self.env.step(0) 71 | if done: 72 | obs = self.env.reset() 73 | 74 | if self.fire_first: 75 | assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE' 76 | obs, _, _, _ = self.env.step(1) 77 | 78 | 
self.frame_buffer = np.zeros((2, *self.shape), dtype=np.uint8) 79 | self.frame_buffer[0] = obs 80 | return obs 81 | 82 | class PreprocessFrame(gym.ObservationWrapper): 83 | def __init__(self, shape, env=None): 84 | super(PreprocessFrame, self).__init__(env) 85 | self.shape=(shape[2], shape[0], shape[1]) 86 | self.observation_space = gym.spaces.Box(low=0, high=1.0, 87 | shape=self.shape,dtype=np.float32) 88 | def observation(self, obs): 89 | new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY) 90 | resized_screen = cv2.resize(new_frame, self.shape[1:], 91 | interpolation=cv2.INTER_AREA) 92 | new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape) 93 | new_obs = new_obs / 255.0 94 | return new_obs 95 | 96 | class StackFrames(gym.ObservationWrapper): 97 | def __init__(self, env, repeat): 98 | super(StackFrames, self).__init__(env) 99 | self.observation_space = gym.spaces.Box( 100 | env.observation_space.low.repeat(repeat, axis=0), 101 | env.observation_space.high.repeat(repeat, axis=0), 102 | dtype=np.float32) 103 | self.stack = collections.deque(maxlen=repeat) 104 | 105 | def reset(self): 106 | self.stack.clear() 107 | observation = self.env.reset() 108 | for _ in range(self.stack.maxlen): 109 | self.stack.append(observation) 110 | 111 | return np.array(self.stack).reshape(self.observation_space.low.shape) 112 | 113 | def observation(self, observation): 114 | self.stack.append(observation) 115 | obs = np.array(self.stack).reshape(self.observation_space.low.shape) 116 | 117 | return obs 118 | 119 | def make_env(env_name, shape=(84,84,1), repeat=4, clip_rewards=False, 120 | no_ops=0, fire_first=False): 121 | env = gym.make(env_name) 122 | env = RepeatActionAndMaxFrame(env, repeat, clip_rewards, no_ops, fire_first) 123 | env = PreprocessFrame(shape, env) 124 | env = StackFrames(env, repeat) 125 | 126 | return env 127 | --------------------------------------------------------------------------------
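
For orientation, a minimal smoke-test sketch (not a file in the repository) of how the wrapper stack built by make_env in the root utils.py composes. It assumes the same gym API the rest of the code uses (reset returns an observation, step returns a 4-tuple), an install with the Atari extras and the Pong ROM available, and that it is run from the repository root so utils.py is importable.

import numpy as np

from utils import make_env

# Build the preprocessing pipeline: RepeatActionAndMaxFrame repeats each action
# four times and keeps the element-wise max of the last two raw frames,
# PreprocessFrame converts that to a grayscale (1, 84, 84) array scaled to [0, 1],
# and StackFrames stacks the four most recent processed frames along axis 0.
env = make_env('PongNoFrameskip-v4')

obs = env.reset()
print(obs.shape, env.observation_space.shape)    # both (4, 84, 84)

obs_, reward, done, info = env.step(env.action_space.sample())
print(obs_.shape, float(obs_.max()) <= 1.0)      # (4, 84, 84) True

The (4, 84, 84) shape is what main.py passes to the agents as input_dims, so a check like this is a quick way to confirm that the preprocessing matches the convolutional input the networks in deep_q_network.py expect.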