├── ActorCritic ├── actor_critic_torch.py ├── main_lunar_lander_actor_critic.py ├── tf2 │ ├── agent.py │ ├── main.py │ ├── networks.py │ └── utils.py └── utils.py ├── DDPG ├── buffer.py ├── ddpg_torch.py ├── main_ddpg.py ├── networks.py ├── noise.py ├── tf2 │ ├── agent.py │ ├── buffer.py │ ├── main.py │ ├── networks.py │ ├── noise.py │ └── utils.py └── utils.py ├── Fundamentals ├── control_blackJack_no_es.py ├── control_cartpole_q_learning.py ├── main_control_blackJack_no_es.py ├── main_control_cartpole_q_learning.py ├── main_prediction_blackJack.py ├── prediction_blackJack.py └── prediction_cartpole_td_zero.py ├── README.md ├── Reinforce ├── lunar_lander_random.py ├── main_lunar_lander_reinforce.py ├── reinforce_torch.py └── tf2 │ ├── agent.py │ ├── main.py │ ├── networks.py │ └── utils.py ├── SAC ├── buffer.py ├── main_sac.py ├── networks.py ├── sac_torch.py ├── tf2 │ ├── agent.py │ ├── buffer.py │ ├── main.py │ ├── networks.py │ └── utils.py └── utils.py └── TD3 ├── buffer.py ├── main_td3.py ├── networks.py ├── td3_torch.py ├── tf2 ├── agent.py ├── buffer.py ├── main.py ├── networks.py └── utils.py └── utils.py /ActorCritic/actor_critic_torch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | class ActorCriticNetwork(nn.Module): 8 | def __init__(self, lr, input_dims, n_actions, fc1_dims=256, fc2_dims=256): 9 | super(ActorCriticNetwork, self).__init__() 10 | self.fc1 = nn.Linear(*input_dims, fc1_dims) 11 | self.fc2 = nn.Linear(fc1_dims, fc2_dims) 12 | self.pi = nn.Linear(fc2_dims, n_actions) 13 | self.v = nn.Linear(fc2_dims, 1) 14 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 15 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 16 | self.to(self.device) 17 | 18 | def forward(self, state): 19 | x = F.relu(self.fc1(state)) 20 | x = F.relu(self.fc2(x)) 21 | pi = self.pi(x) 22 | v = self.v(x) 23 | 24 | return (pi, v) 25 | 26 | class Agent(): 27 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions, 28 | gamma=0.99): 29 | self.gamma = gamma 30 | self.lr = lr 31 | self.fc1_dims = fc1_dims 32 | self.fc2_dims = fc2_dims 33 | self.actor_critic = ActorCriticNetwork(lr, input_dims, n_actions, 34 | fc1_dims, fc2_dims) 35 | self.log_prob = None 36 | 37 | def choose_action(self, observation): 38 | state = T.tensor([observation], dtype=T.float).to(self.actor_critic.device) 39 | probabilities, _ = self.actor_critic.forward(state) 40 | probabilities = F.softmax(probabilities, dim=1) 41 | action_probs = T.distributions.Categorical(probabilities) 42 | action = action_probs.sample() 43 | log_prob = action_probs.log_prob(action) 44 | self.log_prob = log_prob 45 | 46 | return action.item() 47 | 48 | def learn(self, state, reward, state_, done): 49 | self.actor_critic.optimizer.zero_grad() 50 | 51 | state = T.tensor([state], dtype=T.float).to(self.actor_critic.device) 52 | state_ = T.tensor([state_], dtype=T.float).to(self.actor_critic.device) 53 | reward = T.tensor(reward, dtype=T.float).to(self.actor_critic.device) 54 | 55 | _, critic_value = self.actor_critic.forward(state) 56 | _, critic_value_ = self.actor_critic.forward(state_) 57 | 58 | delta = reward + self.gamma*critic_value_*(1-int(done)) - critic_value 59 | 60 | actor_loss = -self.log_prob*delta 61 | critic_loss = delta**2 62 | 63 | (actor_loss + critic_loss).backward() 64 | self.actor_critic.optimizer.step() 65 | 66 | 67 | 68 | 
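# A note on learn() above: the TD error `delta` feeds both loss terms, so the
# policy term -log_prob * delta also backpropagates through the value head.
# A common variant, not used in this file, treats the TD error as a constant
# in the policy loss by detaching it, e.g.:
#
#     delta = reward + self.gamma * critic_value_ * (1 - int(done)) - critic_value
#     actor_loss = -self.log_prob * delta.detach()
#     critic_loss = delta.pow(2)
#
# Either form trains; the detached form keeps the policy term from also pushing
# gradients into the value head.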
69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /ActorCritic/main_lunar_lander_actor_critic.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from actor_critic_torch import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | agent = Agent(gamma=0.99, lr=5e-6, input_dims=[8], n_actions=4, 9 | fc1_dims=2048, fc2_dims=1536) 10 | n_games = 3000 11 | 12 | fname = 'ACTOR_CRITIC_' + 'lunar_lander_' + str(agent.fc1_dims) + \ 13 | '_fc1_dims_' + str(agent.fc2_dims) + '_fc2_dims_lr' + str(agent.lr) +\ 14 | '_' + str(n_games) + 'games' 15 | figure_file = 'plots/' + fname + '.png' 16 | 17 | scores = [] 18 | for i in range(n_games): 19 | done = False 20 | observation = env.reset() 21 | score = 0 22 | while not done: 23 | action = agent.choose_action(observation) 24 | observation_, reward, done, info = env.step(action) 25 | score += reward 26 | agent.learn(observation, reward, observation_, done) 27 | observation = observation_ 28 | scores.append(score) 29 | 30 | avg_score = np.mean(scores[-100:]) 31 | print('episode ', i, 'score %.1f' % score, 32 | 'average score %.1f' % avg_score) 33 | 34 | x = [i+1 for i in range(n_games)] 35 | plot_learning_curve(x, scores, figure_file) 36 | 37 | -------------------------------------------------------------------------------- /ActorCritic/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | from tensorflow.keras.optimizers import Adam 4 | import tensorflow_probability as tfp 5 | import tensorflow.keras as keras 6 | from networks import ActorCriticNetwork 7 | 8 | 9 | class Agent: 10 | def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2, 11 | fc1_dims=256, fc2_dims=256, chkpt_dir='models/'): 12 | self.gamma = gamma 13 | self.n_actions = n_actions 14 | self.action = None 15 | self.action_space = [i for i in range(self.n_actions)] 16 | self.checkpoint_file = os.path.join(chkpt_dir, '_actor_critic') 17 | 18 | self.actor_critic = ActorCriticNetwork(n_actions=n_actions) 19 | 20 | self.actor_critic.compile(optimizer=Adam(learning_rate=alpha)) 21 | 22 | def choose_action(self, observation): 23 | state = tf.convert_to_tensor([observation]) 24 | _, probs = self.actor_critic(state) 25 | 26 | action_probabilities = tfp.distributions.Categorical(probs=probs) 27 | action = action_probabilities.sample() 28 | self.action = action 29 | 30 | return action.numpy()[0] 31 | 32 | def save_models(self): 33 | self.actor_critic.save(self.checkpoint_file) 34 | print('... saving models ...') 35 | 36 | def load_models(self): 37 | self.actor_critic = keras.models.load_model(self.checkpoint_file) 38 | print('... 
loading models ...') 39 | 40 | def learn(self, state, reward, state_, done): 41 | state = tf.convert_to_tensor([state], dtype=tf.float32) 42 | state_ = tf.convert_to_tensor([state_], dtype=tf.float32) 43 | reward = tf.convert_to_tensor(reward, dtype=tf.float32) 44 | 45 | with tf.GradientTape(persistent=True) as tape: 46 | state_value, probs = self.actor_critic(state) 47 | state_value_, _ = self.actor_critic(state_) 48 | state_value = tf.squeeze(state_value) 49 | state_value_ = tf.squeeze(state_value_) 50 | 51 | action_probs = tfp.distributions.Categorical(probs=probs) 52 | log_prob = action_probs.log_prob(self.action) 53 | 54 | delta = reward + \ 55 | self.gamma*state_value_*(1-int(done)) - state_value 56 | actor_loss = -log_prob*delta 57 | critic_loss = delta**2 58 | total_loss = actor_loss + critic_loss 59 | params = self.actor_critic.trainable_variables 60 | grads = tape.gradient(total_loss, params) 61 | self.actor_critic.optimizer.apply_gradients(zip(grads, params)) 62 | -------------------------------------------------------------------------------- /ActorCritic/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve, manage_memory 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | manage_memory() 9 | # env = gym.make('LunarLander-v2') 10 | env = gym.make('CartPole-v0') 11 | agent = Agent(alpha=1e-5, n_actions=env.action_space.n) 12 | n_games = 1800 13 | record_video = False 14 | load_checkpoint = False 15 | 16 | # do a mkdir video if you want to record video of the agent playing 17 | if record_video: 18 | env = wrappers.Monitor(env, 'video', 19 | video_callable=lambda episode_id: True, 20 | force=True) 21 | filename = 'cartpole_1e-5_1024x512_1800games.png' 22 | 23 | figure_file = 'plots/' + filename 24 | 25 | best_score = env.reward_range[0] 26 | score_history = [] 27 | 28 | if load_checkpoint: 29 | agent.load_models() 30 | 31 | for i in range(n_games): 32 | observation = env.reset() 33 | done = False 34 | score = 0 35 | while not done: 36 | action = agent.choose_action(observation) 37 | observation_, reward, done, info = env.step(action) 38 | score += reward 39 | if not load_checkpoint: 40 | agent.learn(observation, reward, observation_, done) 41 | observation = observation_ 42 | score_history.append(score) 43 | avg_score = np.mean(score_history[-100:]) 44 | 45 | if avg_score > best_score: 46 | best_score = avg_score 47 | if not load_checkpoint: 48 | agent.save_models() 49 | print('episode {} score {:.1f} avg score {:.1f}'.format( 50 | i, score, avg_score)) 51 | 52 | if not load_checkpoint: 53 | x = [i+1 for i in range(n_games)] 54 | plot_learning_curve(x, score_history, figure_file) 55 | -------------------------------------------------------------------------------- /ActorCritic/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Dense 3 | 4 | 5 | class ActorCriticNetwork(keras.Model): 6 | def __init__(self, n_actions, fc1_dims=1024, fc2_dims=512): 7 | super(ActorCriticNetwork, self).__init__() 8 | self.fc1_dims = fc1_dims 9 | self.fc2_dims = fc2_dims 10 | self.n_actions = n_actions 11 | 12 | self.fc1 = Dense(self.fc1_dims, activation='relu') 13 | self.fc2 = Dense(self.fc2_dims, activation='relu') 14 | self.v = Dense(1, activation=None) 15 | self.pi = Dense(n_actions, activation='softmax') 16 | 17 | 
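    # call() returns the state value first and the action probabilities second;
    # agent.py depends on this ordering (choose_action unpacks `_, probs` and
    # learn unpacks `state_value, probs`).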
    def call(self, state):
18 |         value = self.fc1(state)
19 |         value = self.fc2(value)
20 |
21 |         v = self.v(value)
22 |         pi = self.pi(value)
23 |
24 |         return v, pi
25 |
--------------------------------------------------------------------------------
/ActorCritic/tf2/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import tensorflow as tf
4 |
5 |
6 | def manage_memory():
7 |     gpus = tf.config.list_physical_devices('GPU')
8 |     if gpus:
9 |         try:
10 |             for gpu in gpus:
11 |                 tf.config.experimental.set_memory_growth(gpu, True)
12 |         except RuntimeError as e:
13 |             print(e)
14 |
15 |
16 | def plot_learning_curve(x, scores, figure_file):
17 |     running_avg = np.zeros(len(scores))
18 |     for i in range(len(running_avg)):
19 |         running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
20 |     plt.plot(x, running_avg)
21 |     plt.title('Running average of previous 100 scores')
22 |     plt.savefig(figure_file)
23 |
--------------------------------------------------------------------------------
/ActorCritic/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 |     running_avg = np.zeros(len(scores))
6 |     for i in range(len(running_avg)):
7 |         running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 |     plt.plot(x, running_avg)
9 |     plt.title('Running average of previous 100 scores')
10 |     plt.savefig(figure_file)
11 |
--------------------------------------------------------------------------------
/DDPG/buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class ReplayBuffer():
4 |     def __init__(self, max_size, input_shape, n_actions):
5 |         self.mem_size = max_size
6 |         self.mem_cntr = 0
7 |         self.state_memory = np.zeros((self.mem_size, *input_shape))
8 |         self.new_state_memory = np.zeros((self.mem_size, *input_shape))
9 |         self.action_memory = np.zeros((self.mem_size, n_actions))
10 |         self.reward_memory = np.zeros(self.mem_size)
11 |         self.terminal_memory = np.zeros(self.mem_size, dtype=bool)
12 |
13 |     def store_transition(self, state, action, reward, state_, done):
14 |         index = self.mem_cntr % self.mem_size
15 |         self.state_memory[index] = state
16 |         self.action_memory[index] = action
17 |         self.reward_memory[index] = reward
18 |         self.new_state_memory[index] = state_
19 |         self.terminal_memory[index] = done
20 |
21 |         self.mem_cntr += 1
22 |
23 |     def sample_buffer(self, batch_size):
24 |         max_mem = min(self.mem_cntr, self.mem_size)
25 |
26 |         batch = np.random.choice(max_mem, batch_size)
27 |
28 |         states = self.state_memory[batch]
29 |         actions = self.action_memory[batch]
30 |         rewards = self.reward_memory[batch]
31 |         states_ = self.new_state_memory[batch]
32 |         dones = self.terminal_memory[batch]
33 |
34 |         return states, actions, rewards, states_, dones
35 |
36 |
37 |
--------------------------------------------------------------------------------
/DDPG/ddpg_torch.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import torch as T
4 | import torch.nn.functional as F
5 | from networks import ActorNetwork, CriticNetwork
6 | from noise import OUActionNoise
7 | from buffer import ReplayBuffer
8 |
9 | class Agent():
10 |     def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
11 |                  max_size=1000000, fc1_dims=400, fc2_dims=300,
12 |
batch_size=64): 13 | self.gamma = gamma 14 | self.tau = tau 15 | self.batch_size = batch_size 16 | self.alpha = alpha 17 | self.beta = beta 18 | 19 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 20 | 21 | self.noise = OUActionNoise(mu=np.zeros(n_actions)) 22 | 23 | self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 24 | n_actions=n_actions, name='actor') 25 | self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, 26 | n_actions=n_actions, name='critic') 27 | 28 | self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 29 | n_actions=n_actions, name='target_actor') 30 | 31 | self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, 32 | n_actions=n_actions, name='target_critic') 33 | 34 | self.update_network_parameters(tau=1) 35 | 36 | def choose_action(self, observation): 37 | self.actor.eval() 38 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 39 | mu = self.actor.forward(state).to(self.actor.device) 40 | mu_prime = mu + T.tensor(self.noise(), 41 | dtype=T.float).to(self.actor.device) 42 | self.actor.train() 43 | 44 | return mu_prime.cpu().detach().numpy()[0] 45 | 46 | def remember(self, state, action, reward, state_, done): 47 | self.memory.store_transition(state, action, reward, state_, done) 48 | 49 | def save_models(self): 50 | self.actor.save_checkpoint() 51 | self.target_actor.save_checkpoint() 52 | self.critic.save_checkpoint() 53 | self.target_critic.save_checkpoint() 54 | 55 | def load_models(self): 56 | self.actor.load_checkpoint() 57 | self.target_actor.load_checkpoint() 58 | self.critic.load_checkpoint() 59 | self.target_critic.load_checkpoint() 60 | 61 | def learn(self): 62 | if self.memory.mem_cntr < self.batch_size: 63 | return 64 | 65 | states, actions, rewards, states_, done = \ 66 | self.memory.sample_buffer(self.batch_size) 67 | 68 | states = T.tensor(states, dtype=T.float).to(self.actor.device) 69 | states_ = T.tensor(states_, dtype=T.float).to(self.actor.device) 70 | actions = T.tensor(actions, dtype=T.float).to(self.actor.device) 71 | rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device) 72 | done = T.tensor(done).to(self.actor.device) 73 | 74 | target_actions = self.target_actor.forward(states_) 75 | critic_value_ = self.target_critic.forward(states_, target_actions) 76 | critic_value = self.critic.forward(states, actions) 77 | 78 | critic_value_[done] = 0.0 79 | critic_value_ = critic_value_.view(-1) 80 | 81 | target = rewards + self.gamma*critic_value_ 82 | target = target.view(self.batch_size, 1) 83 | 84 | self.critic.optimizer.zero_grad() 85 | critic_loss = F.mse_loss(target, critic_value) 86 | critic_loss.backward() 87 | self.critic.optimizer.step() 88 | 89 | self.actor.optimizer.zero_grad() 90 | actor_loss = -self.critic.forward(states, self.actor.forward(states)) 91 | actor_loss = T.mean(actor_loss) 92 | actor_loss.backward() 93 | self.actor.optimizer.step() 94 | 95 | self.update_network_parameters() 96 | 97 | def update_network_parameters(self, tau=None): 98 | if tau is None: 99 | tau = self.tau 100 | 101 | actor_params = self.actor.named_parameters() 102 | critic_params = self.critic.named_parameters() 103 | target_actor_params = self.target_actor.named_parameters() 104 | target_critic_params = self.target_critic.named_parameters() 105 | 106 | critic_state_dict = dict(critic_params) 107 | actor_state_dict = dict(actor_params) 108 | target_critic_state_dict = dict(target_critic_params) 109 | target_actor_state_dict = dict(target_actor_params) 110 | 111 | 
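        # Soft (Polyak) update of the target networks: for every parameter,
        # theta_target = tau * theta_online + (1 - tau) * theta_target.
        # __init__ calls update_network_parameters(tau=1) so the targets start
        # as exact copies of the online networks.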
for name in critic_state_dict: 112 | critic_state_dict[name] = tau*critic_state_dict[name].clone() + \ 113 | (1-tau)*target_critic_state_dict[name].clone() 114 | 115 | for name in actor_state_dict: 116 | actor_state_dict[name] = tau*actor_state_dict[name].clone() + \ 117 | (1-tau)*target_actor_state_dict[name].clone() 118 | 119 | self.target_critic.load_state_dict(critic_state_dict) 120 | self.target_actor.load_state_dict(actor_state_dict) 121 | #self.target_critic.load_state_dict(critic_state_dict, strict=False) 122 | #self.target_actor.load_state_dict(actor_state_dict, strict=False) 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /DDPG/main_ddpg.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from ddpg_torch import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLanderContinuous-v2') 8 | agent = Agent(alpha=0.0001, beta=0.001, 9 | input_dims=env.observation_space.shape, tau=0.001, 10 | batch_size=64, fc1_dims=400, fc2_dims=300, 11 | n_actions=env.action_space.shape[0]) 12 | n_games = 1000 13 | filename = 'LunarLander_alpha_' + str(agent.alpha) + '_beta_' + \ 14 | str(agent.beta) + '_' + str(n_games) + '_games' 15 | figure_file = 'plots/' + filename + '.png' 16 | 17 | best_score = env.reward_range[0] 18 | score_history = [] 19 | for i in range(n_games): 20 | observation = env.reset() 21 | done = False 22 | score = 0 23 | agent.noise.reset() 24 | while not done: 25 | action = agent.choose_action(observation) 26 | observation_, reward, done, info = env.step(action) 27 | agent.remember(observation, action, reward, observation_, done) 28 | agent.learn() 29 | score += reward 30 | observation = observation_ 31 | score_history.append(score) 32 | avg_score = np.mean(score_history[-100:]) 33 | 34 | if avg_score > best_score: 35 | best_score = avg_score 36 | agent.save_models() 37 | 38 | print('episode ', i, 'score %.1f' % score, 39 | 'average score %.1f' % avg_score) 40 | x = [i+1 for i in range(n_games)] 41 | plot_learning_curve(x, score_history, figure_file) 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /DDPG/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch as T 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | 8 | class CriticNetwork(nn.Module): 9 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, name, 10 | chkpt_dir='tmp/ddpg'): 11 | super(CriticNetwork, self).__init__() 12 | self.input_dims = input_dims 13 | self.fc1_dims = fc1_dims 14 | self.fc2_dims = fc2_dims 15 | self.n_actions = n_actions 16 | self.name = name 17 | self.checkpoint_dir = chkpt_dir 18 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ddpg') 19 | 20 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 21 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 22 | 23 | self.bn1 = nn.LayerNorm(self.fc1_dims) 24 | self.bn2 = nn.LayerNorm(self.fc2_dims) 25 | #self.bn1 = nn.BatchNorm1d(self.fc1_dims) 26 | #self.bn2 = nn.BatchNorm1d(self.fc2_dims) 27 | 28 | self.action_value = nn.Linear(self.n_actions, self.fc2_dims) 29 | 30 | self.q = nn.Linear(self.fc2_dims, 1) 31 | 32 | f1 = 1./np.sqrt(self.fc1.weight.data.size()[0]) 33 | self.fc1.weight.data.uniform_(-f1, f1) 34 | 
        self.fc1.bias.data.uniform_(-f1, f1)
35 |
36 |         f2 = 1./np.sqrt(self.fc2.weight.data.size()[0])
37 |         self.fc2.weight.data.uniform_(-f2, f2)
38 |         self.fc2.bias.data.uniform_(-f2, f2)
39 |
40 |         f3 = 0.003
41 |         self.q.weight.data.uniform_(-f3, f3)
42 |         self.q.bias.data.uniform_(-f3, f3)
43 |
44 |         f4 = 1./np.sqrt(self.action_value.weight.data.size()[0])
45 |         self.action_value.weight.data.uniform_(-f4, f4)
46 |         self.action_value.bias.data.uniform_(-f4, f4)
47 |
48 |         self.optimizer = optim.Adam(self.parameters(), lr=beta,
49 |                                     weight_decay=0.01)
50 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
51 |
52 |         self.to(self.device)
53 |
54 |     def forward(self, state, action):
55 |         state_value = self.fc1(state)
56 |         state_value = self.bn1(state_value)
57 |         state_value = F.relu(state_value)
58 |         state_value = self.fc2(state_value)
59 |         state_value = self.bn2(state_value)
60 |         #state_value = F.relu(state_value)
61 |         #action_value = F.relu(self.action_value(action))
62 |         action_value = self.action_value(action)
63 |         state_action_value = F.relu(T.add(state_value, action_value))
64 |         #state_action_value = T.add(state_value, action_value)
65 |         state_action_value = self.q(state_action_value)
66 |
67 |         return state_action_value
68 |
69 |     def save_checkpoint(self):
70 |         print('... saving checkpoint ...')
71 |         T.save(self.state_dict(), self.checkpoint_file)
72 |
73 |     def load_checkpoint(self):
74 |         print('... loading checkpoint ...')
75 |         self.load_state_dict(T.load(self.checkpoint_file))
76 |
77 |     def save_best(self):
78 |         print('... saving best checkpoint ...')
79 |         checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best')
80 |         T.save(self.state_dict(), checkpoint_file)
81 |
82 | class ActorNetwork(nn.Module):
83 |     def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, n_actions, name,
84 |                  chkpt_dir='tmp/ddpg'):
85 |         super(ActorNetwork, self).__init__()
86 |         self.input_dims = input_dims
87 |         self.fc1_dims = fc1_dims
88 |         self.fc2_dims = fc2_dims
89 |         self.n_actions = n_actions
90 |         self.name = name
91 |         self.checkpoint_dir = chkpt_dir
92 |         self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ddpg')
93 |
94 |         self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
95 |         self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
96 |
97 |         self.bn1 = nn.LayerNorm(self.fc1_dims)
98 |         self.bn2 = nn.LayerNorm(self.fc2_dims)
99 |
100 |         #self.bn1 = nn.BatchNorm1d(self.fc1_dims)
101 |         #self.bn2 = nn.BatchNorm1d(self.fc2_dims)
102 |
103 |         self.mu = nn.Linear(self.fc2_dims, self.n_actions)
104 |
105 |         f2 = 1./np.sqrt(self.fc2.weight.data.size()[0])
106 |         self.fc2.weight.data.uniform_(-f2, f2)
107 |         self.fc2.bias.data.uniform_(-f2, f2)
108 |
109 |         f1 = 1./np.sqrt(self.fc1.weight.data.size()[0])
110 |         self.fc1.weight.data.uniform_(-f1, f1)
111 |         self.fc1.bias.data.uniform_(-f1, f1)
112 |
113 |         f3 = 0.003
114 |         self.mu.weight.data.uniform_(-f3, f3)
115 |         self.mu.bias.data.uniform_(-f3, f3)
116 |
117 |         self.optimizer = optim.Adam(self.parameters(), lr=alpha)
118 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
119 |
120 |         self.to(self.device)
121 |
122 |     def forward(self, state):
123 |         x = self.fc1(state)
124 |         x = self.bn1(x)
125 |         x = F.relu(x)
126 |         x = self.fc2(x)
127 |         x = self.bn2(x)
128 |         x = F.relu(x)
129 |         x = T.tanh(self.mu(x))
130 |
131 |         return x
132 |
133 |     def save_checkpoint(self):
134 |         print('... saving checkpoint ...')
135 |         T.save(self.state_dict(), self.checkpoint_file)
136 |
137 |     def load_checkpoint(self):
138 |         print('... 
loading checkpoint ...') 139 | self.load_state_dict(T.load(self.checkpoint_file)) 140 | 141 | def save_best(self): 142 | print('... saving best checkpoint ...') 143 | checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best') 144 | T.save(self.state_dict(), checkpoint_file) 145 | -------------------------------------------------------------------------------- /DDPG/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class OUActionNoise(): 4 | def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None): 5 | self.theta = theta 6 | self.mu = mu 7 | self.sigma = sigma 8 | self.dt = dt 9 | self.x0 = x0 10 | self.reset() 11 | 12 | def __call__(self): 13 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \ 14 | self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) 15 | self.x_prev = x 16 | 17 | return x 18 | 19 | def reset(self): 20 | self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) 21 | 22 | 23 | -------------------------------------------------------------------------------- /DDPG/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from tensorflow.keras.optimizers import Adam 4 | from buffer import ReplayBuffer 5 | from networks import ActorNetwork, CriticNetwork 6 | 7 | 8 | class Agent: 9 | def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None, 10 | gamma=0.99, n_actions=2, max_size=1000000, tau=0.005, 11 | fc1=400, fc2=300, batch_size=64, noise=0.1, 12 | chkpt_dir='models/ddpg/'): 13 | self.gamma = gamma 14 | self.tau = tau 15 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 16 | self.batch_size = batch_size 17 | self.n_actions = n_actions 18 | self.noise = noise 19 | self.max_action = env.action_space.high[0] 20 | self.min_action = env.action_space.low[0] 21 | self.chkpt_dir = chkpt_dir 22 | 23 | self.actor = ActorNetwork(n_actions=n_actions, 24 | fc1_dims=fc1, fc2_dims=fc2) 25 | self.critic = CriticNetwork(n_actions=n_actions, 26 | fc1_dims=fc1, fc2_dims=fc2) 27 | self.target_actor = ActorNetwork(n_actions=n_actions, 28 | fc1_dims=fc1, fc2_dims=fc2) 29 | self.target_critic = CriticNetwork(n_actions=n_actions, 30 | fc1_dims=fc1, fc2_dims=fc2) 31 | 32 | self.actor.compile(optimizer=Adam(learning_rate=alpha)) 33 | self.critic.compile(optimizer=Adam(learning_rate=beta)) 34 | self.target_actor.compile(optimizer=Adam(learning_rate=alpha)) 35 | self.target_critic.compile(optimizer=Adam(learning_rate=beta)) 36 | 37 | self.update_network_parameters(tau=1) 38 | 39 | def update_network_parameters(self, tau=None): 40 | if tau is None: 41 | tau = self.tau 42 | 43 | weights = [] 44 | targets = self.target_actor.weights 45 | for i, weight in enumerate(self.actor.weights): 46 | weights.append(weight * tau + targets[i]*(1-tau)) 47 | self.target_actor.set_weights(weights) 48 | 49 | weights = [] 50 | targets = self.target_critic.weights 51 | for i, weight in enumerate(self.critic.weights): 52 | weights.append(weight * tau + targets[i]*(1-tau)) 53 | self.target_critic.set_weights(weights) 54 | 55 | def store_transition(self, state, action, reward, new_state, done): 56 | self.memory.store_transition(state, action, reward, new_state, done) 57 | 58 | def save_models(self): 59 | print('... 
saving models ...') 60 | self.actor.save(self.chkpt_dir+'actor') 61 | self.target_actor.save(self.chkpt_dir+'target_actor') 62 | self.critic.save(self.chkpt_dir+'critic') 63 | self.target_critic.save(self.chkpt_dir+'target_critic') 64 | 65 | def load_models(self): 66 | print('... loading models ...') 67 | self.actor = keras.models.load_model(self.chkpt_dir+'actor') 68 | self.target_actor = \ 69 | keras.models.load_model(self.chkpt_dir+'target_actor') 70 | self.critic = keras.models.load_model(self.chkpt_dir+'critic') 71 | self.target_critic = \ 72 | keras.models.load_model(self.chkpt_dir+'target_critic') 73 | 74 | def choose_action(self, observation, evaluate=False): 75 | state = tf.convert_to_tensor([observation], dtype=tf.float32) 76 | actions = self.actor(state) 77 | if not evaluate: 78 | actions += tf.random.normal(shape=[self.n_actions], 79 | mean=0.0, stddev=self.noise) 80 | # note that if the env has an action > 1, we have to multiply by 81 | # max action at some point 82 | actions = tf.clip_by_value(actions, self.min_action, self.max_action) 83 | 84 | return actions[0] 85 | 86 | def learn(self): 87 | if self.memory.mem_cntr < self.batch_size: 88 | return 89 | 90 | state, action, reward, new_state, done = \ 91 | self.memory.sample_buffer(self.batch_size) 92 | 93 | states = tf.convert_to_tensor(state, dtype=tf.float32) 94 | states_ = tf.convert_to_tensor(new_state, dtype=tf.float32) 95 | rewards = tf.convert_to_tensor(reward, dtype=tf.float32) 96 | actions = tf.convert_to_tensor(action, dtype=tf.float32) 97 | 98 | with tf.GradientTape() as tape: 99 | target_actions = self.target_actor(states_) 100 | critic_value_ = tf.squeeze(self.target_critic( 101 | (states_, target_actions)), 1) 102 | critic_value = tf.squeeze(self.critic((states, actions)), 1) 103 | target = rewards + self.gamma*critic_value_*(1-done) 104 | critic_loss = keras.losses.MSE(target, critic_value) 105 | params = self.critic.trainable_variables 106 | grads = tape.gradient(critic_loss, params) 107 | self.critic.optimizer.apply_gradients(zip(grads, params)) 108 | 109 | with tf.GradientTape() as tape: 110 | new_policy_actions = self.actor(states) 111 | actor_loss = -self.critic((states, new_policy_actions)) 112 | actor_loss = tf.math.reduce_mean(actor_loss) 113 | params = self.actor.trainable_variables 114 | grads = tape.gradient(actor_loss, params) 115 | self.actor.optimizer.apply_gradients(zip(grads, params)) 116 | 117 | self.update_network_parameters() 118 | -------------------------------------------------------------------------------- /DDPG/tf2/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | self.state_memory[index] = state 16 | self.action_memory[index] = action 17 | self.reward_memory[index] = reward 18 | self.new_state_memory[index] = state_ 19 | self.terminal_memory[index] = done 20 | 21 | self.mem_cntr += 1 22 | 23 | def sample_buffer(self, batch_size): 24 | max_mem = 
min(self.mem_cntr, self.mem_size) 25 | 26 | batch = np.random.choice(max_mem, batch_size) 27 | 28 | states = self.state_memory[batch] 29 | actions = self.action_memory[batch] 30 | rewards = self.reward_memory[batch] 31 | states_ = self.new_state_memory[batch] 32 | dones = self.terminal_memory[batch] 33 | 34 | return states, actions, rewards, states_, dones 35 | 36 | 37 | -------------------------------------------------------------------------------- /DDPG/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve, manage_memory 5 | 6 | if __name__ == '__main__': 7 | manage_memory() 8 | env = gym.make('LunarLanderContinuous-v2') 9 | agent = Agent(input_dims=env.observation_space.shape, env=env, 10 | n_actions=env.action_space.shape[0], 11 | alpha=0.0001, beta=0.001) 12 | n_games = 1000 13 | 14 | figure_file = 'plots/lunar_lander.png' 15 | 16 | best_score = env.reward_range[0] 17 | score_history = [] 18 | load_checkpoint = False 19 | 20 | if load_checkpoint: 21 | agent.load_models() 22 | evaluate = True 23 | else: 24 | evaluate = False 25 | 26 | for i in range(n_games): 27 | observation = env.reset() 28 | done = False 29 | score = 0 30 | while not done: 31 | action = agent.choose_action(observation, evaluate) 32 | observation_, reward, done, info = env.step(action) 33 | score += reward 34 | agent.store_transition(observation, action, reward, 35 | observation_, done) 36 | if not load_checkpoint: 37 | agent.learn() 38 | observation = observation_ 39 | 40 | score_history.append(score) 41 | avg_score = np.mean(score_history[-100:]) 42 | 43 | if avg_score > best_score: 44 | best_score = avg_score 45 | if not load_checkpoint: 46 | agent.save_models() 47 | 48 | print('episode {} score {:.1f} avg score {:.1f}'. 49 | format(i, score, avg_score)) 50 | 51 | if not load_checkpoint: 52 | x = [i+1 for i in range(n_games)] 53 | plot_learning_curve(x, score_history, figure_file) 54 | -------------------------------------------------------------------------------- /DDPG/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from tensorflow.keras.layers import Dense 4 | 5 | 6 | class CriticNetwork(keras.Model): 7 | def __init__(self, n_actions, fc1_dims=512, fc2_dims=512): 8 | super(CriticNetwork, self).__init__() 9 | self.fc1_dims = fc1_dims 10 | self.fc2_dims = fc2_dims 11 | self.n_actions = n_actions 12 | 13 | self.fc1 = Dense(self.fc1_dims, activation='relu') 14 | self.fc2 = Dense(self.fc2_dims, activation='relu') 15 | self.q = Dense(1, activation=None) 16 | 17 | # have to define inputs as a tuple because the model.save() function 18 | # trips an error when trying to save a call function with two inputs. 
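    # Consequently the critic is called with a single tuple argument, e.g.
    # q = critic((states, actions)), which is how agent.py invokes it when
    # computing the TD targets and the actor loss.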
19 | def call(self, inputs): 20 | state, action = inputs 21 | action_value = self.fc1(tf.concat([state, action], axis=1)) 22 | action_value = self.fc2(action_value) 23 | 24 | q = self.q(action_value) 25 | 26 | return q 27 | 28 | 29 | class ActorNetwork(keras.Model): 30 | def __init__(self, fc1_dims=512, fc2_dims=512, n_actions=2): 31 | super(ActorNetwork, self).__init__() 32 | self.fc1_dims = fc1_dims 33 | self.fc2_dims = fc2_dims 34 | self.n_actions = n_actions 35 | 36 | self.fc1 = Dense(self.fc1_dims, activation='relu') 37 | self.fc2 = Dense(self.fc2_dims, activation='relu') 38 | self.mu = Dense(self.n_actions, activation='tanh') 39 | 40 | def call(self, state): 41 | prob = self.fc1(state) 42 | prob = self.fc2(prob) 43 | 44 | mu = self.mu(prob) 45 | 46 | return mu 47 | -------------------------------------------------------------------------------- /DDPG/tf2/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class OUActionNoise(): 4 | def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None): 5 | self.theta = theta 6 | self.mu = mu 7 | self.sigma = sigma 8 | self.dt = dt 9 | self.x0 = x0 10 | self.reset() 11 | 12 | def __call__(self): 13 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \ 14 | self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) 15 | self.x_prev = x 16 | 17 | return x 18 | 19 | def reset(self): 20 | self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) 21 | 22 | 23 | -------------------------------------------------------------------------------- /DDPG/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import tensorflow as tf 4 | 5 | 6 | def manage_memory(): 7 | gpus = tf.config.list_physical_devices('GPU') 8 | if gpus: 9 | try: 10 | for gpu in gpus: 11 | tf.config.experimental.set_memory_growth(gpu, True) 12 | except RuntimeError as e: 13 | print(e) 14 | 15 | 16 | def plot_learning_curve(x, scores, figure_file): 17 | running_avg = np.zeros(len(scores)) 18 | for i in range(len(running_avg)): 19 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 20 | plt.plot(x, running_avg) 21 | plt.title('Running average of previous 100 scores') 22 | plt.savefig(figure_file) 23 | -------------------------------------------------------------------------------- /DDPG/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /Fundamentals/control_blackJack_no_es.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(): 4 | def __init__(self, eps=0.1, gamma=0.99): 5 | self.Q = {} 6 | self.sum_space = [i for i in range(4, 22)] 7 | self.dealer_show_card_space = [i+1 for i in range(10)] 8 | self.ace_space = [False, True] 9 | self.action_space = [0, 1] #stick or hit 10 | 11 | self.state_space = [] 12 | self.memory = [] 13 | self.pairs_visited = {} 14 | self.returns = {} 15 | 16 | self.gamma = gamma 17 | 
self.eps = eps 18 | 19 | self.init_vals() 20 | self.init_policy() 21 | 22 | def init_vals(self): 23 | for total in self.sum_space: 24 | for card in self.dealer_show_card_space: 25 | for ace in self.ace_space: 26 | state = (total, card, ace) 27 | self.state_space.append(state) 28 | for action in self.action_space: 29 | self.Q[(state, action)] = 0 30 | self.returns[(state, action)] = [] 31 | self.pairs_visited[(state, action)] = 0 32 | 33 | def init_policy(self): 34 | policy = {} 35 | n = len(self.action_space) 36 | for state in self.state_space: 37 | policy[state] = [1/n for _ in range(n)] 38 | self.policy = policy 39 | 40 | def choose_action(self, state): 41 | action = np.random.choice(self.action_space, p=self.policy[state]) 42 | return action 43 | 44 | def update_Q(self): 45 | for idt, (state, action, _) in enumerate(self.memory): 46 | G = 0 47 | discount = 1 48 | if self.pairs_visited[(state, action)] == 0: 49 | self.pairs_visited[(state, action)] += 1 50 | for t, (_, _, reward) in enumerate(self.memory[idt:]): 51 | G += reward * discount 52 | discount *= self.gamma 53 | self.returns[(state, action)].append(G) 54 | 55 | for state, action, _ in self.memory: 56 | self.Q[(state, action)] = np.mean(self.returns[(state, action)]) 57 | self.update_policy(state) 58 | 59 | for state_action in self.pairs_visited.keys(): 60 | self.pairs_visited[state_action] = 0 61 | 62 | self.memory = [] 63 | 64 | def update_policy(self, state): 65 | actions = [self.Q[(state, a)] for a in self.action_space] 66 | a_max = np.argmax(actions) 67 | n_actions = len(self.action_space) 68 | probs = [] 69 | for action in self.action_space: 70 | prob = 1 - self.eps + self.eps / n_actions if action == a_max else \ 71 | self.eps / n_actions 72 | probs.append(prob) 73 | self.policy[state] = probs 74 | -------------------------------------------------------------------------------- /Fundamentals/control_cartpole_q_learning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(): 4 | def __init__(self, lr, gamma, n_actions, state_space, eps_start, eps_end, 5 | eps_dec): 6 | self.lr = lr 7 | self.gamma = gamma 8 | self.actions = [i for i in range(n_actions)] 9 | self.states = state_space 10 | self.epsilon = eps_start 11 | self.eps_min = eps_end 12 | self.eps_dec = eps_dec 13 | 14 | self.Q = {} 15 | 16 | self.init_Q() 17 | 18 | def init_Q(self): 19 | for state in self.states: 20 | for action in self.actions: 21 | self.Q[(state, action)] = 0.0 22 | 23 | def max_action(self, state): 24 | actions = np.array([self.Q[(state, a)] for a in self.actions]) 25 | action = np.argmax(actions) 26 | 27 | return action 28 | 29 | def choose_action(self, state): 30 | if np.random.random() < self.epsilon: 31 | action = np.random.choice(self.actions) 32 | else: 33 | action = self.max_action(state) 34 | 35 | return action 36 | 37 | def decrement_epsilon(self): 38 | self.epsilon = self.epsilon - self.eps_dec \ 39 | if self.epsilon>self.eps_min else self.eps_min 40 | 41 | def learn(self, state, action, reward, state_): 42 | a_max = self.max_action(state_) 43 | 44 | self.Q[(state, action)] = self.Q[(state, action)] + self.lr*(reward + 45 | self.gamma*self.Q[(state_, a_max)] - 46 | self.Q[(state, action)]) 47 | 48 | -------------------------------------------------------------------------------- /Fundamentals/main_control_blackJack_no_es.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | from 
control_blackJack_no_es import Agent 4 | 5 | if __name__ == '__main__': 6 | env = gym.make('Blackjack-v0') 7 | agent = Agent(eps=0.001) 8 | n_episodes = 200000 9 | win_lose_draw = {-1:0, 0:0, 1:0} 10 | win_rates = [] 11 | for i in range(n_episodes): 12 | if i > 0 and i % 1000 == 0: 13 | pct = win_lose_draw[1] / i 14 | win_rates.append(pct) 15 | if i % 50000 == 0: 16 | rates = win_rates[-1] if win_rates else 0.0 17 | print('starting episode', i, 'win rate %.3f' % rates) 18 | observation = env.reset() 19 | done = False 20 | while not done: 21 | action = agent.choose_action(observation) 22 | observation_, reward, done, info = env.step(action) 23 | agent.memory.append((observation, action, reward)) 24 | observation = observation_ 25 | agent.update_Q() 26 | win_lose_draw[reward] += 1 27 | plt.plot(win_rates) 28 | plt.show() 29 | -------------------------------------------------------------------------------- /Fundamentals/main_control_cartpole_q_learning.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from control_cartpole_q_learning import Agent 5 | 6 | class CartPoleStateDigitizer(): 7 | def __init__(self, bounds=(2.4, 4, 0.209, 4), n_bins=10): 8 | """ 9 | bounds - bounds for linear space. Single floating point number for 10 | each observation element. Space is from -bound to +bound 11 | observation -> x, dx/dt, theta, dtheta/dt 12 | """ 13 | self.position_space = np.linspace(-1*bounds[0], bounds[0], n_bins) 14 | self.velocity_space = np.linspace(-1*bounds[1], bounds[1], n_bins) 15 | self.pole_angle_space = np.linspace(-1*bounds[2], bounds[2], n_bins) 16 | self.pole_velocity_space = np.linspace(-1*bounds[3], bounds[3], n_bins) 17 | self.states = self.get_state_space() 18 | 19 | def get_state_space(self): 20 | states = [] 21 | for i in range(len(self.position_space)+1): 22 | for j in range(len(self.velocity_space)+1): 23 | for k in range(len(self.pole_angle_space)+1): 24 | for l in range(len(self.pole_velocity_space)+1): 25 | states.append((i,j,k,l)) 26 | return states 27 | 28 | def digitize(self, observation): 29 | x, dx_dt, theta, dtheta_dt = observation 30 | cart_x = int(np.digitize(x, self.position_space)) 31 | cart_dx_dt = int(np.digitize(dx_dt, self.velocity_space)) 32 | pole_theta = int(np.digitize(theta, self.pole_angle_space)) 33 | pole_dtheta_dt = int(np.digitize(dtheta_dt, self.pole_velocity_space)) 34 | 35 | return (cart_x, cart_dx_dt, pole_theta, pole_dtheta_dt) 36 | 37 | def plot_learning_curve(scores, x): 38 | running_avg = np.zeros(len(scores)) 39 | for i in range(len(running_avg)): 40 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 41 | plt.plot(x, running_avg) 42 | plt.title('Running average of scores') 43 | plt.show() 44 | 45 | if __name__ == '__main__': 46 | env = gym.make('CartPole-v0') 47 | n_games = 50000 48 | eps_dec = 2 / n_games 49 | digitizer = CartPoleStateDigitizer() 50 | agent = Agent(lr=0.01, gamma=0.99, n_actions=2, eps_start=1.0, 51 | eps_end=0.01, eps_dec=eps_dec, state_space=digitizer.states) 52 | 53 | scores = [] 54 | 55 | for i in range(n_games): 56 | observation = env.reset() 57 | done = False 58 | score = 0 59 | state = digitizer.digitize(observation) 60 | while not done: 61 | action = agent.choose_action(state) 62 | observation_, reward, done, info = env.step(action) 63 | state_ = digitizer.digitize(observation_) 64 | agent.learn(state, action, reward, state_) 65 | state = state_ 66 | score += reward 67 | if i % 5000 == 0: 68 | 
print('episode ', i, 'score %.1f' % score, 69 | 'epsilon %.2f' % agent.epsilon) 70 | 71 | agent.decrement_epsilon() 72 | scores.append(score) 73 | 74 | x = [i + 1 for i in range(n_games)] 75 | plot_learning_curve(scores, x) 76 | -------------------------------------------------------------------------------- /Fundamentals/main_prediction_blackJack.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from prediction_blackJack import Agent 4 | import matplotlib.pyplot as plt 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('Blackjack-v0') 8 | agent = Agent() 9 | n_episodes = 500000 10 | for i in range(n_episodes): 11 | if i % 50000 == 0: 12 | print('starting episode', i) 13 | observation = env.reset() 14 | done = False 15 | while not done: 16 | action = agent.policy(observation) 17 | observation_, reward, done, info = env.step(action) 18 | agent.memory.append((observation, reward)) 19 | observation = observation_ 20 | agent.update_V() 21 | print(agent.V[(21, 3, True)]) 22 | print(agent.V[(4, 1, False)]) 23 | -------------------------------------------------------------------------------- /Fundamentals/prediction_blackJack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(): 4 | def __init__(self, gamma=0.99): 5 | self.V = {} 6 | self.sum_space = [i for i in range(4, 22)] 7 | self.dealer_show_card_space = [i+1 for i in range(10)] 8 | self.ace_space = [False, True] 9 | self.action_space = [0, 1] # stick or hit 10 | 11 | self.state_space = [] 12 | self.returns = {} 13 | self.states_visited = {} # first visit or not 14 | self.memory = [] 15 | self.gamma = gamma 16 | 17 | self.init_vals() 18 | 19 | def init_vals(self): 20 | for total in self.sum_space: 21 | for card in self.dealer_show_card_space: 22 | for ace in self.ace_space: 23 | self.V[(total, card, ace)] = 0 24 | self.returns[(total, card, ace)] = [] 25 | self.states_visited[(total, card, ace)] = 0 26 | self.state_space.append((total, card, ace)) 27 | 28 | def policy(self, state): 29 | total, _, _ = state 30 | action = 0 if total >= 20 else 1 31 | return action 32 | 33 | 34 | def update_V(self): 35 | for idt, (state, _) in enumerate(self.memory): 36 | G = 0 37 | if self.states_visited[state] == 0: 38 | self.states_visited[state] += 1 39 | discount = 1 40 | for t, (_, reward) in enumerate(self.memory[idt:]): 41 | G += reward * discount 42 | discount *= self.gamma 43 | self.returns[state].append(G) 44 | 45 | for state, _ in self.memory: 46 | self.V[state] = np.mean(self.returns[state]) 47 | 48 | for state in self.state_space: 49 | self.states_visited[state] = 0 50 | 51 | self.memory = [] 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /Fundamentals/prediction_cartpole_td_zero.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | def simple_policy(state): 5 | action = 0 if state < 5 else 1 6 | return action 7 | 8 | if __name__ == '__main__': 9 | env = gym.make('CartPole-v0') 10 | alpha = 0.1 11 | gamma = 0.99 12 | 13 | states = np.linspace(-0.2094, 0.2094, 10) 14 | V = {} 15 | for state in range(len(states)+1): 16 | V[state] = 0 17 | 18 | for i in range(5000): 19 | observation = env.reset() 20 | done = False 21 | while not done: 22 | state = int(np.digitize(observation[2], states)) 23 | action = simple_policy(state) 
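            # Step the environment, then apply the tabular TD(0) update below:
            #   V(s) <- V(s) + alpha * (reward + gamma * V(s') - V(s))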
24 | observation_, reward, done, info = env.step(action) 25 | state_ = int(np.digitize(observation_[2], states)) 26 | V[state] = V[state] + alpha*(reward + gamma*V[state_] - V[state]) 27 | observation = observation_ 28 | 29 | for state in V: 30 | print(state, '%.3f' % V[state]) 31 | 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Actor-Critic-Methods-Paper-To-Code -------------------------------------------------------------------------------- /Reinforce/lunar_lander_random.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | if __name__ == '__main__': 4 | env = gym.make('LunarLander-v2') 5 | 6 | n_games = 100 7 | 8 | for i in range(n_games): 9 | obs = env.reset() 10 | score = 0 11 | done = False 12 | while not done: 13 | action = env.action_space.sample() 14 | obs_, reward, done, info = env.step(action) 15 | score += reward 16 | #env.render() 17 | print('episode ', i, 'score %.1f' % score) 18 | 19 | -------------------------------------------------------------------------------- /Reinforce/main_lunar_lander_reinforce.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from reinforce_torch import PolicyGradientAgent 5 | 6 | def plot_learning_curve(scores, x, figure_file): 7 | running_avg = np.zeros(len(scores)) 8 | for i in range(len(running_avg)): 9 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 10 | plt.plot(x, running_avg) 11 | plt.title('Running average of previous 100 scores') 12 | plt.savefig(figure_file) 13 | 14 | if __name__ == '__main__': 15 | env = gym.make('LunarLander-v2') 16 | n_games = 3000 17 | agent = PolicyGradientAgent(gamma=0.99, lr=0.0005, input_dims=[8], 18 | n_actions=4) 19 | 20 | fname = 'REINFORCE_' + 'lunar_lunar_lr' + str(agent.lr) + '_' \ 21 | + str(n_games) + 'games' 22 | figure_file = 'plots/' + fname + '.png' 23 | 24 | scores = [] 25 | for i in range(n_games): 26 | done = False 27 | observation = env.reset() 28 | score = 0 29 | while not done: 30 | action = agent.choose_action(observation) 31 | observation_, reward, done, info = env.step(action) 32 | score += reward 33 | agent.store_rewards(reward) 34 | observation = observation_ 35 | agent.learn() 36 | scores.append(score) 37 | 38 | avg_score = np.mean(scores[-100:]) 39 | print('episode ', i, 'score %.2f' % score, 40 | 'average score %.2f' % avg_score) 41 | 42 | x = [i+1 for i in range(len(scores))] 43 | plot_learning_curve(scores, x, figure_file) 44 | -------------------------------------------------------------------------------- /Reinforce/reinforce_torch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | class PolicyNetwork(nn.Module): 8 | def __init__(self, lr, input_dims, n_actions): 9 | super(PolicyNetwork, self).__init__() 10 | self.fc1 = nn.Linear(*input_dims, 128) 11 | self.fc2 = nn.Linear(128, 128) 12 | self.fc3 = nn.Linear(128, n_actions) 13 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 14 | 15 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 16 | self.to(self.device) 17 | 18 | def forward(self, state): 19 | x = F.relu(self.fc1(state)) 20 | x = F.relu(self.fc2(x)) 21 | x = 
self.fc3(x) 22 | 23 | return x 24 | 25 | class PolicyGradientAgent(): 26 | def __init__(self, lr, input_dims, gamma=0.99, n_actions=4): 27 | self.gamma = gamma 28 | self.lr = lr 29 | self.reward_memory = [] 30 | self.action_memory = [] 31 | 32 | self.policy = PolicyNetwork(self.lr, input_dims, n_actions) 33 | 34 | def choose_action(self, observation): 35 | state = T.Tensor([observation]).to(self.policy.device) 36 | probabilities = F.softmax(self.policy.forward(state)) 37 | action_probs = T.distributions.Categorical(probabilities) 38 | action = action_probs.sample() 39 | log_probs = action_probs.log_prob(action) 40 | self.action_memory.append(log_probs) 41 | 42 | return action.item() 43 | 44 | def store_rewards(self, reward): 45 | self.reward_memory.append(reward) 46 | 47 | def learn(self): 48 | self.policy.optimizer.zero_grad() 49 | 50 | # G_t = R_t+1 + gamma * R_t+2 + gamma**2 * R_t+3 51 | # G_t = sum from k=0 to k=T {gamma**k * R_t+k+1} 52 | G = np.zeros_like(self.reward_memory, dtype=np.float64) 53 | for t in range(len(self.reward_memory)): 54 | G_sum = 0 55 | discount = 1 56 | for k in range(t, len(self.reward_memory)): 57 | G_sum += self.reward_memory[k] * discount 58 | discount *= self.gamma 59 | G[t] = G_sum 60 | G = T.tensor(G, dtype=T.float).to(self.policy.device) 61 | 62 | loss = 0 63 | for g, logprob in zip(G, self.action_memory): 64 | loss += -g * logprob 65 | loss.backward() 66 | self.policy.optimizer.step() 67 | 68 | self.action_memory = [] 69 | self.reward_memory = [] 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /Reinforce/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from networks import PolicyNetwork 4 | import tensorflow_probability as tfp 5 | from tensorflow.keras.optimizers import Adam 6 | import numpy as np 7 | 8 | 9 | class Agent: 10 | def __init__(self, alpha=0.003, gamma=0.99, n_actions=4, 11 | fc1_dims=256, fc2_dims=256, chkpt_dir='models/'): 12 | 13 | self.gamma = gamma 14 | self.lr = alpha 15 | self.n_actions = n_actions 16 | self.chkpt_dir = chkpt_dir 17 | self.state_memory = [] 18 | self.action_memory = [] 19 | self.reward_memory = [] 20 | self.policy = PolicyNetwork(n_actions=n_actions, fc1_dims=fc1_dims, 21 | fc2_dims=fc2_dims) 22 | self.policy.compile(optimizer=Adam(learning_rate=self.lr)) 23 | 24 | def save_models(self): 25 | print('... saving models ...') 26 | self.policy.save(self.chkpt_dir+'reinforce') 27 | 28 | def load_models(self): 29 | print('... 
loading models ...') 30 | self.policy = keras.models.load_model(self.chkpt_dir+'reinforce') 31 | 32 | def choose_action(self, observation): 33 | state = tf.convert_to_tensor([observation], dtype=tf.float32) 34 | probs = self.policy(state) 35 | action_probs = tfp.distributions.Categorical(probs=probs) 36 | action = action_probs.sample() 37 | 38 | return action.numpy()[0] 39 | 40 | def store_transition(self, observation, action, reward): 41 | self.state_memory.append(observation) 42 | self.action_memory.append(action) 43 | self.reward_memory.append(reward) 44 | 45 | def learn(self): 46 | actions = tf.convert_to_tensor(self.action_memory, dtype=tf.float32) 47 | rewards = np.array(self.reward_memory) 48 | 49 | G = np.zeros_like(rewards) 50 | for t in range(len(rewards)): 51 | G_sum = 0 52 | discount = 1 53 | for k in range(t, len(rewards)): 54 | G_sum += rewards[k] * discount 55 | discount *= self.gamma 56 | G[t] = G_sum 57 | 58 | with tf.GradientTape() as tape: 59 | loss = 0 60 | for idx, (g, state) in enumerate(zip(G, self.state_memory)): 61 | state = tf.convert_to_tensor([state], dtype=tf.float32) 62 | probs = self.policy(state) 63 | action_probs = tfp.distributions.Categorical(probs=probs) 64 | log_prob = action_probs.log_prob(actions[idx]) 65 | loss += -g * tf.squeeze(log_prob) 66 | params = self.policy.trainable_variables 67 | grads = tape.gradient(loss, params) 68 | self.policy.optimizer.apply_gradients(zip(grads, params)) 69 | 70 | self.state_memory = [] 71 | self.action_memory = [] 72 | self.reward_memory = [] 73 | -------------------------------------------------------------------------------- /Reinforce/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve, manage_memory 5 | # if you have more than 1 gpu, use device '0' or '1' to assign to a gpu 6 | # import os 7 | # os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 8 | # os.environ['CUDA_VISIBLE_DEVICES'] = '0' 9 | 10 | 11 | if __name__ == '__main__': 12 | manage_memory() 13 | best_score = -np.inf 14 | env = gym.make('LunarLander-v2') 15 | agent = Agent(alpha=0.0005, gamma=0.99, n_actions=env.action_space.n) 16 | load_checkpoint = False 17 | if load_checkpoint: 18 | agent.load_models() 19 | 20 | num_episodes = 1000 21 | score_history = [] 22 | 23 | for i in range(num_episodes): 24 | done = False 25 | score = 0 26 | observation = env.reset() 27 | while not done: 28 | action = agent.choose_action(observation) 29 | observation_, reward, done, info = env.step(action) 30 | agent.store_transition(observation, action, reward) 31 | observation = observation_ 32 | score += reward 33 | score_history.append(score) 34 | 35 | if not load_checkpoint: 36 | agent.learn() 37 | avg_score = np.mean(score_history[-100:]) 38 | if avg_score > best_score: 39 | if not load_checkpoint: 40 | agent.save_models() 41 | best_score = score 42 | 43 | print('episode {} score {:.1f} avg score {:.1f}'. 
44 | format(i, score, avg_score)) 45 | 46 | filename = 'plots/lunar-lander.png' 47 | x = [i for i in range(num_episodes)] 48 | plot_learning_curve(x, score_history, filename) 49 | -------------------------------------------------------------------------------- /Reinforce/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Dense 3 | 4 | 5 | class PolicyNetwork(keras.Model): 6 | def __init__(self, n_actions, fc1_dims=256, fc2_dims=256): 7 | super(PolicyNetwork, self).__init__() 8 | self.fc1_dims = fc1_dims 9 | self.fc2_dims = fc2_dims 10 | 11 | self.fc1 = Dense(self.fc1_dims, activation='relu') 12 | self.fc2 = Dense(self.fc2_dims, activation='relu') 13 | self.pi = Dense(n_actions, activation='softmax') 14 | 15 | def call(self, state): 16 | value = self.fc1(state) 17 | value = self.fc2(value) 18 | 19 | pi = self.pi(value) 20 | 21 | return pi 22 | -------------------------------------------------------------------------------- /Reinforce/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import tensorflow as tf 4 | 5 | 6 | def manage_memory(): 7 | gpus = tf.config.list_physical_devices('GPU') 8 | if gpus: 9 | try: 10 | for gpu in gpus: 11 | tf.config.experimental.set_memory_growth(gpu, True) 12 | except RuntimeError as e: 13 | print(e) 14 | 15 | 16 | def plot_learning_curve(x, scores, figure_file): 17 | running_avg = np.zeros(len(scores)) 18 | for i in range(len(running_avg)): 19 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 20 | plt.plot(x, running_avg) 21 | plt.title('Running average of previous 100 scores') 22 | plt.savefig(figure_file) 23 | -------------------------------------------------------------------------------- /SAC/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | self.state_memory[index] = state 16 | self.action_memory[index] = action 17 | self.reward_memory[index] = reward 18 | self.new_state_memory[index] = state_ 19 | self.terminal_memory[index] = done 20 | 21 | self.mem_cntr += 1 22 | 23 | def sample_buffer(self, batch_size): 24 | max_mem = min(self.mem_cntr, self.mem_size) 25 | 26 | batch = np.random.choice(max_mem, batch_size) 27 | 28 | states = self.state_memory[batch] 29 | actions = self.action_memory[batch] 30 | rewards = self.reward_memory[batch] 31 | states_ = self.new_state_memory[batch] 32 | dones = self.terminal_memory[batch] 33 | 34 | return states, actions, rewards, states_, dones 35 | 36 | 37 | -------------------------------------------------------------------------------- /SAC/main_sac.py: -------------------------------------------------------------------------------- 1 | # the following 3 lines are helpful if you have multiple GPUs and want to train 2 | # agents on multiple GPUs. 
I do this frequently when testing. 3 | #import os 4 | #os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 5 | #os.environ['CUDA_VISIBLE_DEVICES'] = '1' 6 | import pybullet_envs 7 | import gym 8 | import numpy as np 9 | from sac_torch import Agent 10 | from utils import plot_learning_curve 11 | import numpy as np 12 | 13 | if __name__ == '__main__': 14 | #env_id = 'LunarLanderContinuous-v2' 15 | #env_id = 'BipedalWalker-v2' 16 | #env_id = 'AntBulletEnv-v0' 17 | env_id = 'InvertedPendulumBulletEnv-v0' 18 | #env_id = 'CartPoleContinuousBulletEnv-v0' 19 | env = gym.make(env_id) 20 | agent = Agent(alpha=0.0003, beta=0.0003, reward_scale=2, env_id=env_id, 21 | input_dims=env.observation_space.shape, tau=0.005, 22 | env=env, batch_size=256, layer1_size=256, layer2_size=256, 23 | n_actions=env.action_space.shape[0]) 24 | n_games = 250 25 | filename = env_id + '_'+ str(n_games) + 'games_scale' + str(agent.scale) + \ 26 | '_clamp_on_sigma.png' 27 | figure_file = 'plots/' + filename 28 | 29 | best_score = env.reward_range[0] 30 | score_history = [] 31 | load_checkpoint = True 32 | if load_checkpoint: 33 | agent.load_models() 34 | env.render(mode='human') 35 | steps = 0 36 | for i in range(n_games): 37 | observation = env.reset() 38 | done = False 39 | score = 0 40 | while not done: 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | steps += 1 44 | agent.remember(observation, action, reward, observation_, done) 45 | if not load_checkpoint: 46 | agent.learn() 47 | score += reward 48 | observation = observation_ 49 | score_history.append(score) 50 | avg_score = np.mean(score_history[-100:]) 51 | 52 | if avg_score > best_score: 53 | best_score = avg_score 54 | if not load_checkpoint: 55 | agent.save_models() 56 | 57 | print('episode ', i, 'score %.1f' % score, 58 | 'trailing 100 games avg %.1f' % avg_score, 59 | 'steps %d' % steps, env_id, 60 | ' scale ', agent.scale) 61 | if not load_checkpoint: 62 | x = [i+1 for i in range(n_games)] 63 | plot_learning_curve(x, score_history, figure_file) 64 | -------------------------------------------------------------------------------- /SAC/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions.normal import Normal 7 | import numpy as np 8 | 9 | class CriticNetwork(nn.Module): 10 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, 11 | name, chkpt_dir='tmp/sac'): 12 | super(CriticNetwork, self).__init__() 13 | self.input_dims = input_dims 14 | self.fc1_dims = fc1_dims 15 | self.fc2_dims = fc2_dims 16 | self.n_actions = n_actions 17 | self.name = name 18 | self.checkpoint_dir = chkpt_dir 19 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 20 | 21 | # I think this breaks if the env has a 2D state representation 22 | self.fc1 = nn.Linear(self.input_dims[0] + n_actions, self.fc1_dims) 23 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 24 | self.q1 = nn.Linear(self.fc2_dims, 1) 25 | 26 | self.optimizer = optim.Adam(self.parameters(), lr=beta) 27 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 28 | 29 | self.to(self.device) 30 | 31 | def forward(self, state, action): 32 | q1_action_value = self.fc1(T.cat([state, action], dim=1)) 33 | q1_action_value = F.relu(q1_action_value) 34 | q1_action_value = self.fc2(q1_action_value) 35 | q1_action_value = 
F.relu(q1_action_value) 36 | 37 | q1 = self.q1(q1_action_value) 38 | 39 | return q1 40 | 41 | def save_checkpoint(self): 42 | T.save(self.state_dict(), self.checkpoint_file) 43 | 44 | def load_checkpoint(self): 45 | self.load_state_dict(T.load(self.checkpoint_file)) 46 | 47 | class ActorNetwork(nn.Module): 48 | def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, max_action, 49 | n_actions, name, chkpt_dir='tmp/sac'): 50 | super(ActorNetwork, self).__init__() 51 | self.input_dims = input_dims 52 | self.fc1_dims = fc1_dims 53 | self.fc2_dims = fc2_dims 54 | self.n_actions = n_actions 55 | self.name = name 56 | self.max_action = max_action 57 | self.checkpoint_dir = chkpt_dir 58 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 59 | self.reparam_noise = 1e-6 60 | 61 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 62 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 63 | self.mu = nn.Linear(self.fc2_dims, self.n_actions) 64 | self.sigma = nn.Linear(self.fc2_dims, self.n_actions) 65 | 66 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 67 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 68 | 69 | self.to(self.device) 70 | 71 | def forward(self, state): 72 | prob = self.fc1(state) 73 | prob = F.relu(prob) 74 | prob = self.fc2(prob) 75 | prob = F.relu(prob) 76 | 77 | mu = self.mu(prob) 78 | #sigma = T.sigmoid(self.sigma(prob)) 79 | sigma = self.sigma(prob) 80 | sigma = T.clamp(sigma, min=self.reparam_noise, max=1) 81 | # authors use -20, 2 -> doesn't seem to work for my implementation 82 | 83 | return mu, sigma 84 | 85 | def sample_normal(self, state, reparameterize=True): 86 | mu, sigma = self.forward(state) 87 | probabilities = T.distributions.Normal(mu, sigma) 88 | 89 | if reparameterize: 90 | actions = probabilities.rsample() # reparameterizes the policy 91 | else: 92 | actions = probabilities.sample() 93 | 94 | action = T.tanh(actions)*T.tensor(self.max_action).to(self.device) 95 | log_probs = probabilities.log_prob(actions) 96 | log_probs -= T.log(1-action.pow(2) + self.reparam_noise) 97 | log_probs = log_probs.sum(1, keepdim=True) 98 | 99 | return action, log_probs 100 | 101 | def sample_mvnormal(self, state, reparameterize=True): 102 | """ 103 | Doesn't quite seem to work. The agent never learns. 
104 | """ 105 | mu, sigma = self.forward(state) 106 | n_batches = sigma.size()[0] 107 | 108 | cov = [sigma[i] * T.eye(self.n_actions).to(self.device) for i in range(n_batches)] 109 | cov = T.stack(cov) 110 | probabilities = T.distributions.MultivariateNormal(mu, cov) 111 | 112 | if reparameterize: 113 | actions = probabilities.rsample() # reparameterizes the policy 114 | else: 115 | actions = probabilities.sample() 116 | 117 | action = T.tanh(actions) # enforce the action bound for (-1, 1) 118 | log_probs = probabilities.log_prob(actions) 119 | log_probs -= T.sum(T.log(1-action.pow(2) + self.reparam_noise)) 120 | log_probs = log_probs.sum(-1, keepdim=True) 121 | 122 | return action, log_probs 123 | 124 | def save_checkpoint(self): 125 | T.save(self.state_dict(), self.checkpoint_file) 126 | 127 | def load_checkpoint(self): 128 | self.load_state_dict(T.load(self.checkpoint_file)) 129 | 130 | class ValueNetwork(nn.Module): 131 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, 132 | name, chkpt_dir='tmp/sac'): 133 | super(ValueNetwork, self).__init__() 134 | self.input_dims = input_dims 135 | self.fc1_dims = fc1_dims 136 | self.fc2_dims = fc2_dims 137 | self.name = name 138 | self.checkpoint_dir = chkpt_dir 139 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 140 | 141 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 142 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 143 | self.v = nn.Linear(self.fc2_dims, 1) 144 | 145 | self.optimizer = optim.Adam(self.parameters(), lr=beta) 146 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 147 | 148 | self.to(self.device) 149 | 150 | def forward(self, state): 151 | state_value = self.fc1(state) 152 | state_value = F.relu(state_value) 153 | state_value = self.fc2(state_value) 154 | state_value = F.relu(state_value) 155 | 156 | v = self.v(state_value) 157 | 158 | return v 159 | 160 | def save_checkpoint(self): 161 | T.save(self.state_dict(), self.checkpoint_file) 162 | 163 | def load_checkpoint(self): 164 | self.load_state_dict(T.load(self.checkpoint_file)) 165 | 166 | -------------------------------------------------------------------------------- /SAC/sac_torch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from buffer import ReplayBuffer 6 | from networks import ActorNetwork, CriticNetwork, ValueNetwork 7 | 8 | class Agent(): 9 | def __init__(self, alpha, beta, input_dims, tau, env, 10 | env_id, gamma=0.99, 11 | n_actions=2, max_size=1000000, layer1_size=256, 12 | layer2_size=256, batch_size=100, reward_scale=2): 13 | self.gamma = gamma 14 | self.tau = tau 15 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 16 | self.batch_size = batch_size 17 | self.n_actions = n_actions 18 | 19 | self.actor = ActorNetwork(alpha, input_dims, layer1_size, 20 | layer2_size, n_actions=n_actions, 21 | name=env_id+'_actor', 22 | max_action=env.action_space.high) 23 | self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, 24 | layer2_size, n_actions=n_actions, 25 | name=env_id+'_critic_1') 26 | self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, 27 | layer2_size, n_actions=n_actions, 28 | name=env_id+'_critic_2') 29 | 30 | self.value = ValueNetwork(beta, input_dims, layer1_size, 31 | layer2_size, name=env_id+'_value') 32 | self.target_value = ValueNetwork(beta, input_dims, layer1_size, 33 | layer2_size, name=env_id+'_target_value') 34 | 35 | 
self.scale = reward_scale 36 | self.update_network_parameters(tau=1) 37 | 38 | def choose_action(self, observation): 39 | state = T.Tensor([observation]).to(self.actor.device) 40 | actions, _ = self.actor.sample_normal(state, reparameterize=False) 41 | #actions, _ = self.actor.sample_mvnormal(state) 42 | # actions is an array of arrays due to the added dimension in state 43 | return actions.cpu().detach().numpy()[0] 44 | 45 | def remember(self, state, action, reward, new_state, done): 46 | self.memory.store_transition(state, action, reward, new_state, done) 47 | 48 | def learn(self): 49 | if self.memory.mem_cntr < self.batch_size: 50 | return 51 | 52 | state, action, reward, new_state, done = \ 53 | self.memory.sample_buffer(self.batch_size) 54 | 55 | reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device) 56 | done = T.tensor(done).to(self.critic_1.device) 57 | state_ = T.tensor(new_state, dtype=T.float).to(self.critic_1.device) 58 | state = T.tensor(state, dtype=T.float).to(self.critic_1.device) 59 | action = T.tensor(action, dtype=T.float).to(self.critic_1.device) 60 | 61 | value = self.value(state).view(-1) 62 | value_ = self.target_value(state_).view(-1) 63 | value_[done] = 0.0 64 | 65 | actions, log_probs = self.actor.sample_normal(state, reparameterize=False) 66 | #actions, log_probs = self.actor.sample_mvnormal(state, reparameterize=False) 67 | log_probs = log_probs.view(-1) 68 | q1_new_policy = self.critic_1.forward(state, actions) 69 | q2_new_policy = self.critic_2.forward(state, actions) 70 | critic_value = T.min(q1_new_policy, q2_new_policy) 71 | critic_value = critic_value.view(-1) 72 | 73 | self.value.optimizer.zero_grad() 74 | value_target = critic_value - log_probs 75 | value_loss = 0.5 * (F.mse_loss(value, value_target)) 76 | value_loss.backward(retain_graph=True) 77 | self.value.optimizer.step() 78 | 79 | actions, log_probs = self.actor.sample_normal(state, reparameterize=True) 80 | #actions, log_probs = self.actor.sample_mvnormal(state, reparameterize=False) 81 | log_probs = log_probs.view(-1) 82 | q1_new_policy = self.critic_1.forward(state, actions) 83 | q2_new_policy = self.critic_2.forward(state, actions) 84 | critic_value = T.min(q1_new_policy, q2_new_policy) 85 | critic_value = critic_value.view(-1) 86 | 87 | actor_loss = log_probs - critic_value 88 | actor_loss = T.mean(actor_loss) 89 | self.actor.optimizer.zero_grad() 90 | actor_loss.backward(retain_graph=True) 91 | self.actor.optimizer.step() 92 | 93 | self.critic_1.optimizer.zero_grad() 94 | self.critic_2.optimizer.zero_grad() 95 | q_hat = self.scale*reward + self.gamma*value_ 96 | q1_old_policy = self.critic_1.forward(state, action).view(-1) 97 | q2_old_policy = self.critic_2.forward(state, action).view(-1) 98 | critic_1_loss = 0.5*F.mse_loss(q1_old_policy, q_hat) 99 | critic_2_loss = 0.5*F.mse_loss(q2_old_policy, q_hat) 100 | 101 | critic_loss = critic_1_loss + critic_2_loss 102 | critic_loss.backward() 103 | self.critic_1.optimizer.step() 104 | self.critic_2.optimizer.step() 105 | self.update_network_parameters() 106 | 107 | def update_network_parameters(self, tau=None): 108 | if tau is None: 109 | tau = self.tau 110 | 111 | target_value_params = self.target_value.named_parameters() 112 | value_params = self.value.named_parameters() 113 | 114 | target_value_state_dict = dict(target_value_params) 115 | value_state_dict = dict(value_params) 116 | 117 | for name in value_state_dict: 118 | value_state_dict[name] = tau*value_state_dict[name].clone() + \ 119 | 
(1-tau)*target_value_state_dict[name].clone() 120 | 121 | self.target_value.load_state_dict(value_state_dict) 122 | 123 | def save_models(self): 124 | print('.... saving models ....') 125 | self.actor.save_checkpoint() 126 | self.value.save_checkpoint() 127 | self.target_value.save_checkpoint() 128 | self.critic_1.save_checkpoint() 129 | self.critic_2.save_checkpoint() 130 | 131 | def load_models(self): 132 | print('.... loading models ....') 133 | self.actor.load_checkpoint() 134 | self.value.load_checkpoint() 135 | self.target_value.load_checkpoint() 136 | self.critic_1.load_checkpoint() 137 | self.critic_2.load_checkpoint() 138 | 139 | 140 | -------------------------------------------------------------------------------- /SAC/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from tensorflow.keras.optimizers import Adam 4 | import tensorflow_probability as tfp 5 | from buffer import ReplayBuffer 6 | from networks import ActorNetwork, CriticNetwork, ValueNetwork 7 | 8 | 9 | class Agent: 10 | def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], 11 | env=None, gamma=0.99, n_actions=2, max_size=1000000, 12 | tau=0.005, layer1_size=256, layer2_size=256, 13 | batch_size=256, reward_scale=2, chkpt_dir='models/'): 14 | self.gamma = gamma 15 | self.tau = tau 16 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 17 | self.batch_size = batch_size 18 | self.n_actions = n_actions 19 | self.fname = chkpt_dir + 'SAC/' 20 | self.actor = ActorNetwork(n_actions=n_actions, 21 | max_action=env.action_space.high) 22 | self.critic_1 = CriticNetwork() 23 | self.critic_2 = CriticNetwork() 24 | self.value = ValueNetwork() 25 | self.target_value = ValueNetwork() 26 | 27 | self.actor.compile(optimizer=Adam(learning_rate=alpha)) 28 | self.critic_1.compile(optimizer=Adam(learning_rate=beta)) 29 | self.critic_2.compile(optimizer=Adam(learning_rate=beta)) 30 | self.value.compile(optimizer=Adam(learning_rate=beta)) 31 | self.target_value.compile(optimizer=Adam(learning_rate=beta)) 32 | 33 | self.scale = reward_scale 34 | self.update_network_parameters(tau=1) 35 | 36 | def save_models(self): 37 | # for some environments we can try to save the model before 38 | # actually calling the learn function. This means we have an empty 39 | # graph, and TF2 will throw an error 40 | if self.memory.mem_cntr > self.batch_size: 41 | print('... saving models ...') 42 | self.actor.save(self.fname+'actor') 43 | self.critic_1.save(self.fname+'critic_1') 44 | self.critic_2.save(self.fname+'critic_2') 45 | self.value.save(self.fname+'value') 46 | self.target_value.save(self.fname+'target_value') 47 | 48 | def load_models(self): 49 | print('... 
loading models ...') 50 | self.actor = keras.models.load_model(self.fname+'actor') 51 | self.critic_1 = keras.models.load_model(self.fname+'critic_1') 52 | self.critic_2 = keras.models.load_model(self.fname+'critic_2') 53 | self.value = keras.models.load_model(self.fname+'value') 54 | self.target_value = keras.models.load_model(self.fname+'target_value') 55 | 56 | def sample_normal(self, state): 57 | mu, sigma = self.actor(state) 58 | probabilities = tfp.distributions.Normal(mu, sigma) 59 | actions = probabilities.sample() # + something else 60 | action = tf.math.tanh(actions)*self.actor.max_action 61 | log_probs = probabilities.log_prob(actions) 62 | log_probs -= tf.math.log(1-tf.math.pow(action, 2)+self.actor.noise) 63 | log_probs = tf.math.reduce_sum(log_probs, axis=1, keepdims=True) 64 | 65 | return action, log_probs 66 | 67 | def choose_action(self, observation): 68 | state = tf.convert_to_tensor([observation]) 69 | # actions, _ = self.actor.sample_normal(state) # reparameterize=False) 70 | actions, _ = self.sample_normal(state) 71 | 72 | return actions[0] 73 | 74 | def store_transition(self, state, action, reward, new_state, done): 75 | self.memory.store_transition(state, action, reward, new_state, done) 76 | 77 | def update_network_parameters(self, tau=None): 78 | if tau is None: 79 | tau = self.tau 80 | 81 | weights = [] 82 | targets = self.target_value.weights 83 | for i, weight in enumerate(self.value.weights): 84 | weights.append(weight * tau + targets[i]*(1-tau)) 85 | 86 | self.target_value.set_weights(weights) 87 | 88 | def learn(self): 89 | if self.memory.mem_cntr < self.batch_size: 90 | return 91 | 92 | state, action, reward, new_state, done = \ 93 | self.memory.sample_buffer(self.batch_size) 94 | 95 | states = tf.convert_to_tensor(state, dtype=tf.float32) 96 | states_ = tf.convert_to_tensor(new_state, dtype=tf.float32) 97 | rewards = tf.convert_to_tensor(reward, dtype=tf.float32) 98 | actions = tf.convert_to_tensor(action, dtype=tf.float32) 99 | 100 | with tf.GradientTape() as tape: 101 | value = tf.squeeze(self.value(states), 1) 102 | 103 | current_policy_actions, log_probs = self.sample_normal(states) 104 | # self.actor.sample_normal(states) # reparameterize=False) 105 | log_probs = tf.squeeze(log_probs, 1) 106 | q1_new_pi = self.critic_1((states, current_policy_actions)) 107 | q2_new_pi = self.critic_2((states, current_policy_actions)) 108 | critic_value = tf.squeeze( 109 | tf.math.minimum(q1_new_pi, q2_new_pi), 1) 110 | 111 | value_target = critic_value - log_probs 112 | value_loss = 0.5 * keras.losses.MSE(value, value_target) 113 | params = self.value.trainable_variables 114 | grads = tape.gradient(value_loss, params) 115 | self.value.optimizer.apply_gradients(zip(grads, params)) 116 | 117 | with tf.GradientTape() as tape: 118 | # in the original paper, they reparameterize here. We don't 119 | # so it's just the usual action. 
120 | new_policy_actions, log_probs = self.sample_normal(states) 121 | # self.actor.sample_normal(states) # reparameterize=True) 122 | log_probs = tf.squeeze(log_probs, 1) 123 | q1_new_policy = self.critic_1((states, new_policy_actions)) 124 | q2_new_policy = self.critic_2((states, new_policy_actions)) 125 | critic_value = tf.squeeze(tf.math.minimum( 126 | q1_new_policy, q2_new_policy), 1) 127 | actor_loss = log_probs - critic_value 128 | actor_loss = tf.math.reduce_mean(actor_loss) 129 | params = self.actor.trainable_variables 130 | grads = tape.gradient(actor_loss, params) 131 | self.actor.optimizer.apply_gradients(zip(grads, params)) 132 | 133 | with tf.GradientTape(persistent=True) as tape: 134 | # I didn't know that these context managers shared values? 135 | value_ = tf.squeeze(self.target_value(states_), 1) 136 | q_hat = self.scale*rewards + self.gamma*value_*(1-done) 137 | q1_old_policy = tf.squeeze(self.critic_1((states, actions)), 1) 138 | q2_old_policy = tf.squeeze(self.critic_2((states, actions)), 1) 139 | critic_1_loss = 0.5 * keras.losses.MSE(q1_old_policy, q_hat) 140 | critic_2_loss = 0.5 * keras.losses.MSE(q2_old_policy, q_hat) 141 | params_1 = self.critic_1.trainable_variables 142 | params_2 = self.critic_2.trainable_variables 143 | grads_1 = tape.gradient(critic_1_loss, params_1) 144 | grads_2 = tape.gradient(critic_2_loss, params_2) 145 | 146 | self.critic_1.optimizer.apply_gradients(zip(grads_1, params_1)) 147 | self.critic_2.optimizer.apply_gradients(zip(grads_2, params_2)) 148 | 149 | self.update_network_parameters() 150 | -------------------------------------------------------------------------------- /SAC/tf2/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer: 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | 16 | self.state_memory[index] = state 17 | self.new_state_memory[index] = state_ 18 | self.action_memory[index] = action 19 | self.reward_memory[index] = reward 20 | self.terminal_memory[index] = done 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size) 28 | 29 | states = self.state_memory[batch] 30 | states_ = self.new_state_memory[batch] 31 | actions = self.action_memory[batch] 32 | rewards = self.reward_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | -------------------------------------------------------------------------------- /SAC/tf2/main.py: -------------------------------------------------------------------------------- 1 | import pybullet_envs 2 | import gym 3 | import numpy as np 4 | from agent import Agent 5 | from utils import plot_learning_curve, manage_memory 6 | from gym import wrappers 7 | 8 | if __name__ == '__main__': 9 | manage_memory() 10 | env = gym.make('InvertedPendulumBulletEnv-v0') 11 | agent = Agent(input_dims=env.observation_space.shape, env=env, 12 | 
n_actions=env.action_space.shape[0]) 13 | n_games = 250 14 | render_video = False 15 | 16 | # do a mkdir video if you want to record video of the agent playing. 17 | if render_video: 18 | env = wrappers.Monitor(env, 'video', 19 | video_callable=lambda episode_id: True, 20 | force=True) 21 | filename = 'inverted_pendulum.png' 22 | 23 | figure_file = 'plots/' + filename 24 | 25 | best_score = env.reward_range[0] 26 | score_history = [] 27 | load_checkpoint = False 28 | 29 | if load_checkpoint: 30 | agent.load_models() 31 | env.render(mode='human') 32 | 33 | for i in range(n_games): 34 | observation = env.reset() 35 | done = False 36 | score = 0 37 | while not done: 38 | action = agent.choose_action(observation) 39 | observation_, reward, done, info = env.step(action) 40 | score += reward 41 | agent.store_transition(observation, action, reward, 42 | observation_, done) 43 | if not load_checkpoint: 44 | agent.learn() 45 | observation = observation_ 46 | score_history.append(score) 47 | avg_score = np.mean(score_history[-100:]) 48 | 49 | if avg_score > best_score: 50 | best_score = avg_score 51 | if not load_checkpoint: 52 | agent.save_models() 53 | print('episode {} score {:.1f} avg_score {:.1f}'. 54 | format(i, score, avg_score)) 55 | 56 | if not load_checkpoint: 57 | x = [i+1 for i in range(n_games)] 58 | plot_learning_curve(x, score_history, figure_file) 59 | -------------------------------------------------------------------------------- /SAC/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | # import tensorflow_probability as tfp 4 | from tensorflow.keras.layers import Dense 5 | 6 | 7 | class CriticNetwork(keras.Model): 8 | def __init__(self, fc1_dims=256, fc2_dims=256): 9 | super(CriticNetwork, self).__init__() 10 | self.fc1_dims = fc1_dims 11 | self.fc2_dims = fc2_dims 12 | 13 | self.fc1 = Dense(self.fc1_dims, activation='relu') 14 | self.fc2 = Dense(self.fc2_dims, activation='relu') 15 | self.q = Dense(1, activation=None) 16 | 17 | def call(self, inputs): 18 | state, action = inputs 19 | action_value = self.fc1(tf.concat([state, action], axis=1)) 20 | action_value = self.fc2(action_value) 21 | 22 | q = self.q(action_value) 23 | 24 | return q 25 | 26 | 27 | class ValueNetwork(keras.Model): 28 | def __init__(self, fc1_dims=256, fc2_dims=256): 29 | super(ValueNetwork, self).__init__() 30 | self.fc1_dims = fc1_dims 31 | self.fc2_dims = fc2_dims 32 | 33 | self.fc1 = Dense(self.fc1_dims, activation='relu') 34 | self.fc2 = Dense(fc2_dims, activation='relu') 35 | self.v = Dense(1, activation=None) 36 | 37 | def call(self, state): 38 | state_value = self.fc1(state) 39 | state_value = self.fc2(state_value) 40 | 41 | v = self.v(state_value) 42 | 43 | return v 44 | 45 | 46 | class ActorNetwork(keras.Model): 47 | def __init__(self, max_action, fc1_dims=256, fc2_dims=256, n_actions=2): 48 | super(ActorNetwork, self).__init__() 49 | self.fc1_dims = fc1_dims 50 | self.fc2_dims = fc2_dims 51 | self.n_actions = n_actions 52 | self.max_action = max_action 53 | self.noise = 1e-6 54 | 55 | self.fc1 = Dense(self.fc1_dims, activation='relu') 56 | self.fc2 = Dense(self.fc2_dims, activation='relu') 57 | self.mu = Dense(self.n_actions, activation=None) 58 | self.sigma = Dense(self.n_actions, activation=None) 59 | 60 | def call(self, state): 61 | prob = self.fc1(state) 62 | prob = self.fc2(prob) 63 | 64 | mu = self.mu(prob) 65 | sigma = self.sigma(prob) 66 | # might want to come back and change 
this, 67 | # perhaps tf plays more nicely with a sigma of ~0 68 | sigma = tf.clip_by_value(sigma, self.noise, 1) 69 | 70 | return mu, sigma 71 | """ 72 | def sample_normal(self, state, reparameterize=True): 73 | mu, sigma = self.call(state) 74 | probabilities = tfp.distributions.Normal(mu, sigma) 75 | 76 | if reparameterize: 77 | actions = probabilities.sample() # + something else 78 | else: 79 | actions = probabilities.sample() 80 | 81 | action = tf.math.tanh(actions)*self.max_action 82 | log_probs = probabilities.log_prob(actions) 83 | log_probs -= tf.math.log(1-tf.math.pow(action, 2)+self.noise) 84 | log_probs = tf.math.reduce_sum(log_probs, axis=1, keepdims=True) 85 | 86 | return action, log_probs 87 | 88 | def sample_normal(self, state): 89 | mu, sigma = self.call(state) 90 | probabilities = tfp.distributions.Normal(mu, sigma) 91 | 92 | actions = probabilities.sample() # + something else 93 | 94 | action = tf.math.tanh(actions)*self.max_action 95 | log_probs = probabilities.log_prob(actions) 96 | log_probs -= tf.math.log(1-tf.math.pow(action, 2)+self.noise) 97 | log_probs = tf.math.reduce_sum(log_probs, axis=1, keepdims=True) 98 | 99 | return action, log_probs 100 | """ 101 | -------------------------------------------------------------------------------- /SAC/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import tensorflow as tf 4 | 5 | 6 | def manage_memory(): 7 | gpus = tf.config.list_physical_devices('GPU') 8 | if gpus: 9 | try: 10 | for gpu in gpus: 11 | tf.config.experimental.set_memory_growth(gpu, True) 12 | except RuntimeError as e: 13 | print(e) 14 | 15 | 16 | def plot_learning_curve(x, scores, figure_file): 17 | running_avg = np.zeros(len(scores)) 18 | for i in range(len(running_avg)): 19 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 20 | plt.plot(x, running_avg) 21 | plt.title('Running average of previous 100 scores') 22 | plt.savefig(figure_file) 23 | -------------------------------------------------------------------------------- /SAC/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /TD3/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | self.state_memory[index] = state 16 | self.action_memory[index] = action 17 | self.reward_memory[index] = reward 18 | self.new_state_memory[index] = state_ 19 | self.terminal_memory[index] = done 20 | 21 | 
self.mem_cntr += 1 22 | 23 | def sample_buffer(self, batch_size): 24 | max_mem = min(self.mem_cntr, self.mem_size) 25 | 26 | batch = np.random.choice(max_mem, batch_size) 27 | 28 | states = self.state_memory[batch] 29 | actions = self.action_memory[batch] 30 | rewards = self.reward_memory[batch] 31 | states_ = self.new_state_memory[batch] 32 | dones = self.terminal_memory[batch] 33 | 34 | return states, actions, rewards, states_, dones 35 | 36 | 37 | -------------------------------------------------------------------------------- /TD3/main_td3.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from td3_torch import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('BipedalWalker-v2') 8 | #env = gym.make('LunarLanderContinuous-v2') 9 | agent = Agent(alpha=0.001, beta=0.001, 10 | input_dims=env.observation_space.shape, tau=0.005, 11 | env=env, batch_size=100, layer1_size=400, layer2_size=300, 12 | n_actions=env.action_space.shape[0]) 13 | n_games = 1500 14 | filename = 'Walker2d_' + str(n_games) + '_2.png' 15 | figure_file = 'plots/' + filename 16 | 17 | best_score = env.reward_range[0] 18 | score_history = [] 19 | 20 | #agent.load_models() 21 | 22 | for i in range(n_games): 23 | observation = env.reset() 24 | done = False 25 | score = 0 26 | while not done: 27 | action = agent.choose_action(observation) 28 | observation_, reward, done, info = env.step(action) 29 | agent.remember(observation, action, reward, observation_, done) 30 | agent.learn() 31 | score += reward 32 | observation = observation_ 33 | score_history.append(score) 34 | avg_score = np.mean(score_history[-100:]) 35 | 36 | if avg_score > best_score: 37 | best_score = avg_score 38 | agent.save_models() 39 | 40 | print('episode ', i, 'score %.2f' % score, 41 | 'trailing 100 games avg %.3f' % avg_score) 42 | 43 | x = [i+1 for i in range(n_games)] 44 | plot_learning_curve(x, score_history, figure_file) 45 | -------------------------------------------------------------------------------- /TD3/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class CriticNetwork(nn.Module): 9 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, 10 | name, chkpt_dir='tmp/td3'): 11 | super(CriticNetwork, self).__init__() 12 | self.input_dims = input_dims 13 | self.fc1_dims = fc1_dims 14 | self.fc2_dims = fc2_dims 15 | self.n_actions = n_actions 16 | self.name = name 17 | self.checkpoint_dir = chkpt_dir 18 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_td3') 19 | 20 | # I think this breaks if the env has a 2D state representation 21 | self.fc1 = nn.Linear(self.input_dims[0] + n_actions, self.fc1_dims) 22 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 23 | self.q1 = nn.Linear(self.fc2_dims, 1) 24 | 25 | self.optimizer = optim.Adam(self.parameters(), lr=beta) 26 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 27 | 28 | self.to(self.device) 29 | 30 | def forward(self, state, action): 31 | q1_action_value = self.fc1(T.cat([state, action], dim=1)) 32 | q1_action_value = F.relu(q1_action_value) 33 | q1_action_value = self.fc2(q1_action_value) 34 | q1_action_value = F.relu(q1_action_value) 35 | 36 | q1 = self.q1(q1_action_value) 37 | 38 | return q1 39 | 40 | def 
save_checkpoint(self): 41 | print('... saving checkpoint ...') 42 | T.save(self.state_dict(), self.checkpoint_file) 43 | 44 | def load_checkpoint(self): 45 | print('... loading checkpoint ...') 46 | self.load_state_dict(T.load(self.checkpoint_file)) 47 | 48 | class ActorNetwork(nn.Module): 49 | def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, 50 | n_actions, name, chkpt_dir='tmp/td3'): 51 | super(ActorNetwork, self).__init__() 52 | self.input_dims = input_dims 53 | self.fc1_dims = fc1_dims 54 | self.fc2_dims = fc2_dims 55 | self.n_actions = n_actions 56 | self.name = name 57 | self.checkpoint_dir = chkpt_dir 58 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_td3') 59 | 60 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 61 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 62 | self.mu = nn.Linear(self.fc2_dims, self.n_actions) 63 | 64 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 65 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 66 | 67 | self.to(self.device) 68 | 69 | def forward(self, state): 70 | prob = self.fc1(state) 71 | prob = F.relu(prob) 72 | prob = self.fc2(prob) 73 | prob = F.relu(prob) 74 | 75 | prob = T.tanh(self.mu(prob)) # if action is > +/- 1 then multiply by max action 76 | 77 | return prob 78 | 79 | def save_checkpoint(self): 80 | print('... saving checkpoint ...') 81 | T.save(self.state_dict(), self.checkpoint_file) 82 | 83 | def load_checkpoint(self): 84 | print('... loading checkpoint ...') 85 | self.load_state_dict(T.load(self.checkpoint_file)) 86 | 87 | 88 | -------------------------------------------------------------------------------- /TD3/td3_torch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from buffer import ReplayBuffer 6 | from networks import ActorNetwork, CriticNetwork 7 | 8 | class Agent(): 9 | def __init__(self, alpha, beta, input_dims, tau, env, 10 | gamma=0.99, update_actor_interval=2, warmup=1000, 11 | n_actions=2, max_size=1000000, layer1_size=400, 12 | layer2_size=300, batch_size=100, noise=0.1): 13 | self.gamma = gamma 14 | self.tau = tau 15 | self.max_action = env.action_space.high 16 | self.min_action = env.action_space.low 17 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 18 | self.batch_size = batch_size 19 | self.learn_step_cntr = 0 20 | self.time_step = 0 21 | self.warmup = warmup 22 | self.n_actions = n_actions 23 | self.update_actor_iter = update_actor_interval 24 | 25 | self.actor = ActorNetwork(alpha, input_dims, layer1_size, 26 | layer2_size, n_actions=n_actions, 27 | name='actor') 28 | self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, 29 | layer2_size, n_actions=n_actions, 30 | name='critic_1') 31 | self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, 32 | layer2_size, n_actions=n_actions, 33 | name='critic_2') 34 | 35 | self.target_actor = ActorNetwork(alpha, input_dims, layer1_size, 36 | layer2_size, n_actions=n_actions, 37 | name='target_actor') 38 | self.target_critic_1 = CriticNetwork(beta, input_dims, layer1_size, 39 | layer2_size, n_actions=n_actions, 40 | name='target_critic_1') 41 | self.target_critic_2 = CriticNetwork(beta, input_dims, layer1_size, 42 | layer2_size, n_actions=n_actions, 43 | name='target_critic_2') 44 | 45 | self.noise = noise 46 | self.update_network_parameters(tau=1) 47 | 48 | def choose_action(self, observation): 49 | if self.time_step < self.warmup: 50 | mu = 
T.tensor(np.random.normal(scale=self.noise, size=(self.n_actions,))) 51 | else: 52 | state = T.tensor(observation, dtype=T.float).to(self.actor.device) 53 | mu = self.actor.forward(state).to(self.actor.device) 54 | mu_prime = mu + T.tensor(np.random.normal(scale=self.noise), 55 | dtype=T.float).to(self.actor.device) 56 | mu_prime = T.clamp(mu_prime, self.min_action[0], self.max_action[0]) 57 | self.time_step += 1 58 | return mu_prime.cpu().detach().numpy() 59 | 60 | def remember(self, state, action, reward, new_state, done): 61 | self.memory.store_transition(state, action, reward, new_state, done) 62 | 63 | def learn(self): 64 | if self.memory.mem_cntr < self.batch_size: 65 | return 66 | 67 | state, action, reward, new_state, done = \ 68 | self.memory.sample_buffer(self.batch_size) 69 | 70 | reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device) 71 | done = T.tensor(done).to(self.critic_1.device) 72 | state_ = T.tensor(new_state, dtype=T.float).to(self.critic_1.device) 73 | state = T.tensor(state, dtype=T.float).to(self.critic_1.device) 74 | action = T.tensor(action, dtype=T.float).to(self.critic_1.device) 75 | 76 | target_actions = self.target_actor.forward(state_) 77 | target_actions = target_actions + \ 78 | T.clamp(T.tensor(np.random.normal(scale=0.2)), -0.5, 0.5) 79 | # might break if elements of min and max are not all equal 80 | target_actions = T.clamp(target_actions, self.min_action[0], self.max_action[0]) 81 | 82 | q1_ = self.target_critic_1.forward(state_, target_actions) 83 | q2_ = self.target_critic_2.forward(state_, target_actions) 84 | 85 | q1 = self.critic_1.forward(state, action) 86 | q2 = self.critic_2.forward(state, action) 87 | 88 | q1_[done] = 0.0 89 | q2_[done] = 0.0 90 | 91 | q1_ = q1_.view(-1) 92 | q2_ = q2_.view(-1) 93 | 94 | critic_value_ = T.min(q1_, q2_) 95 | 96 | target = reward + self.gamma*critic_value_ 97 | target = target.view(self.batch_size, 1) 98 | 99 | self.critic_1.optimizer.zero_grad() 100 | self.critic_2.optimizer.zero_grad() 101 | 102 | q1_loss = F.mse_loss(target, q1) 103 | q2_loss = F.mse_loss(target, q2) 104 | critic_loss = q1_loss + q2_loss 105 | critic_loss.backward() 106 | 107 | self.critic_1.optimizer.step() 108 | self.critic_2.optimizer.step() 109 | 110 | self.learn_step_cntr += 1 111 | 112 | if self.learn_step_cntr % self.update_actor_iter != 0: 113 | return 114 | 115 | self.actor.optimizer.zero_grad() 116 | actor_q1_loss = self.critic_1.forward(state, self.actor.forward(state)) 117 | actor_loss = -T.mean(actor_q1_loss) 118 | actor_loss.backward() 119 | self.actor.optimizer.step() 120 | 121 | self.update_network_parameters() 122 | 123 | def update_network_parameters(self, tau=None): 124 | if tau is None: 125 | tau = self.tau 126 | 127 | actor_params = self.actor.named_parameters() 128 | critic_1_params = self.critic_1.named_parameters() 129 | critic_2_params = self.critic_2.named_parameters() 130 | target_actor_params = self.target_actor.named_parameters() 131 | target_critic_1_params = self.target_critic_1.named_parameters() 132 | target_critic_2_params = self.target_critic_2.named_parameters() 133 | 134 | critic_1_state_dict = dict(critic_1_params) 135 | critic_2_state_dict = dict(critic_2_params) 136 | actor_state_dict = dict(actor_params) 137 | target_actor_state_dict = dict(target_actor_params) 138 | target_critic_1_state_dict = dict(target_critic_1_params) 139 | target_critic_2_state_dict = dict(target_critic_2_params) 140 | 141 | for name in critic_1_state_dict: 142 | critic_1_state_dict[name] = 
tau*critic_1_state_dict[name].clone() + \ 143 | (1-tau)*target_critic_1_state_dict[name].clone() 144 | 145 | for name in critic_2_state_dict: 146 | critic_2_state_dict[name] = tau*critic_2_state_dict[name].clone() + \ 147 | (1-tau)*target_critic_2_state_dict[name].clone() 148 | 149 | for name in actor_state_dict: 150 | actor_state_dict[name] = tau*actor_state_dict[name].clone() + \ 151 | (1-tau)*target_actor_state_dict[name].clone() 152 | 153 | self.target_critic_1.load_state_dict(critic_1_state_dict) 154 | self.target_critic_2.load_state_dict(critic_2_state_dict) 155 | self.target_actor.load_state_dict(actor_state_dict) 156 | 157 | def save_models(self): 158 | self.actor.save_checkpoint() 159 | self.target_actor.save_checkpoint() 160 | self.critic_1.save_checkpoint() 161 | self.critic_2.save_checkpoint() 162 | self.target_critic_1.save_checkpoint() 163 | self.target_critic_2.save_checkpoint() 164 | 165 | def load_models(self): 166 | self.actor.load_checkpoint() 167 | self.target_actor.load_checkpoint() 168 | self.critic_1.load_checkpoint() 169 | self.critic_2.load_checkpoint() 170 | self.target_critic_1.load_checkpoint() 171 | self.target_critic_2.load_checkpoint() 172 | 173 | 174 | -------------------------------------------------------------------------------- /TD3/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow.keras as keras 4 | from tensorflow.keras.optimizers import Adam 5 | from buffer import ReplayBuffer 6 | from networks import ActorNetwork, CriticNetwork 7 | 8 | 9 | class Agent: 10 | def __init__(self, alpha, beta, input_dims, tau, env, 11 | gamma=0.99, update_actor_interval=2, warmup=1000, 12 | n_actions=2, max_size=1000000, layer1_size=400, 13 | layer2_size=300, batch_size=100, noise=0.1, 14 | chkpt_dir='models/'): 15 | self.gamma = gamma 16 | self.tau = tau 17 | self.max_action = env.action_space.high[0] 18 | self.min_action = env.action_space.low[0] 19 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 20 | self.batch_size = batch_size 21 | self.learn_step_cntr = 0 22 | self.time_step = 0 23 | self.warmup = warmup 24 | self.n_actions = n_actions 25 | self.fname = chkpt_dir 26 | self.update_actor_iter = update_actor_interval 27 | 28 | self.actor = ActorNetwork(layer1_size, layer2_size, 29 | n_actions=n_actions) 30 | 31 | self.critic_1 = CriticNetwork(layer1_size, layer2_size) 32 | self.critic_2 = CriticNetwork(layer1_size, layer2_size) 33 | 34 | self.target_actor = ActorNetwork(layer1_size, layer2_size, 35 | n_actions=n_actions) 36 | self.target_critic_1 = CriticNetwork(layer1_size, layer2_size) 37 | self.target_critic_2 = CriticNetwork(layer1_size, layer2_size) 38 | 39 | self.actor.compile(optimizer=Adam(learning_rate=alpha)) 40 | self.critic_1.compile(optimizer=Adam(learning_rate=beta)) 41 | self.critic_2.compile(optimizer=Adam(learning_rate=beta)) 42 | 43 | self.target_actor.compile(optimizer=Adam(learning_rate=alpha)) 44 | self.target_critic_1.compile(optimizer=Adam(learning_rate=beta)) 45 | self.target_critic_2.compile(optimizer=Adam(learning_rate=beta)) 46 | 47 | self.noise = noise 48 | self.update_network_parameters(tau=1) 49 | 50 | def save_models(self): 51 | if self.memory.mem_cntr > self.batch_size: 52 | print('... 
saving models ...') 53 | self.actor.save(self.fname+'actor') 54 | self.critic_1.save(self.fname+'critic_1') 55 | self.critic_2.save(self.fname+'critic_2') 56 | self.target_actor.save(self.fname+'target_actor') 57 | self.target_critic_1.save(self.fname+'target_critic_1') 58 | self.target_critic_2.save(self.fname+'target_critic_2') 59 | 60 | def load_models(self): 61 | print('... loading models ...') 62 | self.actor = keras.models.load_model(self.fname+'actor') 63 | self.critic_1 = keras.models.load_model(self.fname+'critic_1') 64 | self.critic_2 = keras.models.load_model(self.fname+'critic_2') 65 | self.target_actor = keras.models.load_model(self.fname+'target_actor') 66 | self.target_critic_1 = \ 67 | keras.models.load_model(self.fname+'target_critic_1') 68 | self.target_critic_2 = \ 69 | keras.models.load_model(self.fname+'target_critic_2') 70 | 71 | def choose_action(self, observation): 72 | if self.time_step < self.warmup: 73 | mu = np.random.normal(scale=self.noise, size=(self.n_actions,)) 74 | else: 75 | state = tf.convert_to_tensor([observation], dtype=tf.float32) 76 | # returns a batch size of 1, want a scalar array 77 | mu = self.actor(state)[0] 78 | mu_prime = mu + np.random.normal(scale=self.noise) 79 | mu_prime = tf.clip_by_value(mu_prime, self.min_action, self.max_action) 80 | self.time_step += 1 81 | 82 | return mu_prime 83 | 84 | def remember(self, state, action, reward, new_state, done): 85 | self.memory.store_transition(state, action, reward, new_state, done) 86 | 87 | def learn(self): 88 | if self.memory.mem_cntr < self.batch_size: 89 | return 90 | 91 | states, actions, rewards, new_states, dones = \ 92 | self.memory.sample_buffer(self.batch_size) 93 | 94 | states = tf.convert_to_tensor(states, dtype=tf.float32) 95 | actions = tf.convert_to_tensor(actions, dtype=tf.float32) 96 | rewards = tf.convert_to_tensor(rewards, dtype=tf.float32) 97 | states_ = tf.convert_to_tensor(new_states, dtype=tf.float32) 98 | 99 | with tf.GradientTape(persistent=True) as tape: 100 | target_actions = self.target_actor(states_) 101 | target_actions = target_actions + \ 102 | tf.clip_by_value(np.random.normal(scale=0.2), -0.5, 0.5) 103 | 104 | target_actions = tf.clip_by_value(target_actions, self.min_action, 105 | self.max_action) 106 | 107 | q1_ = self.target_critic_1((states_, target_actions)) 108 | q2_ = self.target_critic_2((states_, target_actions)) 109 | 110 | q1 = tf.squeeze(self.critic_1((states, actions)), 1) 111 | q2 = tf.squeeze(self.critic_2((states, actions)), 1) 112 | 113 | # shape is [batch_size, 1], want to collapse to [batch_size] 114 | q1_ = tf.squeeze(q1_, 1) 115 | q2_ = tf.squeeze(q2_, 1) 116 | 117 | critic_value_ = tf.math.minimum(q1_, q2_) 118 | # in tf2 only integer scalar arrays can be used as indices 119 | # and eager exection doesn't support assignment, so we can't do 120 | # q1_[dones] = 0.0 121 | target = rewards + self.gamma*critic_value_*(1-dones) 122 | critic_1_loss = keras.losses.MSE(target, q1) 123 | critic_2_loss = keras.losses.MSE(target, q2) 124 | params_1 = self.critic_1.trainable_variables 125 | params_2 = self.critic_2.trainable_variables 126 | grads_1 = tape.gradient(critic_1_loss, params_1) 127 | grads_2 = tape.gradient(critic_2_loss, params_2) 128 | 129 | self.critic_1.optimizer.apply_gradients(zip(grads_1, params_1)) 130 | self.critic_2.optimizer.apply_gradients(zip(grads_2, params_2)) 131 | 132 | self.learn_step_cntr += 1 133 | 134 | if self.learn_step_cntr % self.update_actor_iter != 0: 135 | return 136 | 137 | with tf.GradientTape() as tape: 138 | 
new_actions = self.actor(states) 139 | critic_1_value = self.critic_1((states, new_actions)) 140 | actor_loss = -tf.math.reduce_mean(critic_1_value) 141 | params = self.actor.trainable_variables 142 | grads = tape.gradient(actor_loss, params) 143 | self.actor.optimizer.apply_gradients(zip(grads, params)) 144 | 145 | self.update_network_parameters() 146 | 147 | def update_network_parameters(self, tau=None): 148 | if tau is None: 149 | tau = self.tau 150 | 151 | weights = [] 152 | targets = self.target_actor.weights 153 | for i, weight in enumerate(self.actor.weights): 154 | weights.append(weight * tau + targets[i]*(1-tau)) 155 | 156 | self.target_actor.set_weights(weights) 157 | 158 | weights = [] 159 | targets = self.target_critic_1.weights 160 | for i, weight in enumerate(self.critic_1.weights): 161 | weights.append(weight * tau + targets[i]*(1-tau)) 162 | 163 | self.target_critic_1.set_weights(weights) 164 | 165 | weights = [] 166 | targets = self.target_critic_2.weights 167 | for i, weight in enumerate(self.critic_2.weights): 168 | weights.append(weight * tau + targets[i]*(1-tau)) 169 | 170 | self.target_critic_2.set_weights(weights) 171 | -------------------------------------------------------------------------------- /TD3/tf2/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ReplayBuffer: 5 | def __init__(self, max_size, input_shape, n_actions): 6 | self.mem_size = max_size 7 | self.mem_cntr = 0 8 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 10 | self.action_memory = np.zeros((self.mem_size, n_actions)) 11 | self.reward_memory = np.zeros(self.mem_size) 12 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 13 | 14 | def store_transition(self, state, action, reward, state_, done): 15 | index = self.mem_cntr % self.mem_size 16 | self.state_memory[index] = state 17 | self.new_state_memory[index] = state_ 18 | self.terminal_memory[index] = done 19 | self.reward_memory[index] = reward 20 | self.action_memory[index] = action 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size) 28 | 29 | states = self.state_memory[batch] 30 | states_ = self.new_state_memory[batch] 31 | actions = self.action_memory[batch] 32 | rewards = self.reward_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | -------------------------------------------------------------------------------- /TD3/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve, manage_memory 5 | 6 | if __name__ == '__main__': 7 | # env = gym.make('BipedalWalker-v3') 8 | manage_memory() 9 | env = gym.make('LunarLanderContinuous-v2') 10 | agent = Agent(alpha=0.0001, beta=0.001, 11 | input_dims=env.observation_space.shape, tau=0.005, 12 | env=env, batch_size=100, layer1_size=400, layer2_size=300, 13 | n_actions=env.action_space.shape[0]) 14 | n_games = 1000 15 | filename = 'plots/' + 'lunar_lander_' + str(n_games) + '_games.png' 16 | 17 | best_score = env.reward_range[0] 18 | score_history = [] 19 | load_checkpoint = False 20 | if load_checkpoint: 21 | agent.load_models() 22 | 23 | for i in range(n_games): 24 | observation = env.reset() 25 
| done = False 26 | score = 0 27 | while not done: 28 | action = agent.choose_action(observation) 29 | observation_, reward, done, info = env.step(action) 30 | agent.remember(observation, action, reward, observation_, done) 31 | if not load_checkpoint: 32 | agent.learn() 33 | score += reward 34 | observation = observation_ 35 | score_history.append(score) 36 | avg_score = np.mean(score_history[-100:]) 37 | 38 | if avg_score > best_score: 39 | best_score = avg_score 40 | if not load_checkpoint: 41 | agent.save_models() 42 | print('episode {} score {:.1f} avg score {:.1f}'. 43 | format(i, score, avg_score)) 44 | x = [i+1 for i in range(n_games)] 45 | plot_learning_curve(x, score_history, filename) 46 | -------------------------------------------------------------------------------- /TD3/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from tensorflow.keras.layers import Dense 4 | 5 | 6 | class CriticNetwork(keras.Model): 7 | def __init__(self, fc1_dims, fc2_dims): 8 | super(CriticNetwork, self).__init__() 9 | self.fc1_dims = fc1_dims 10 | self.fc2_dims = fc2_dims 11 | 12 | self.fc1 = Dense(self.fc1_dims, activation='relu') 13 | self.fc2 = Dense(self.fc2_dims, activation='relu') 14 | self.q = Dense(1, activation=None) 15 | 16 | def call(self, inputs): 17 | state, action = inputs 18 | q1_action_value = self.fc1(tf.concat([state, action], axis=1)) 19 | q1_action_value = self.fc2(q1_action_value) 20 | 21 | q = self.q(q1_action_value) 22 | 23 | return q 24 | 25 | 26 | class ActorNetwork(keras.Model): 27 | def __init__(self, fc1_dims, fc2_dims, n_actions): 28 | super(ActorNetwork, self).__init__() 29 | self.fc1_dims = fc1_dims 30 | self.fc2_dims = fc2_dims 31 | 32 | self.fc1 = Dense(self.fc1_dims, activation='relu') 33 | self.fc2 = Dense(self.fc2_dims, activation='relu') 34 | self.mu = Dense(n_actions, activation='tanh') 35 | 36 | def call(self, state): 37 | prob = self.fc1(state) 38 | prob = self.fc2(prob) 39 | 40 | mu = self.mu(prob) 41 | 42 | return mu 43 | -------------------------------------------------------------------------------- /TD3/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import tensorflow as tf 4 | 5 | 6 | def manage_memory(): 7 | gpus = tf.config.list_physical_devices('GPU') 8 | if gpus: 9 | try: 10 | for gpu in gpus: 11 | tf.config.experimental.set_memory_growth(gpu, True) 12 | except RuntimeError as e: 13 | print(e) 14 | 15 | 16 | def plot_learning_curve(x, scores, figure_file): 17 | running_avg = np.zeros(len(scores)) 18 | for i in range(len(running_avg)): 19 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 20 | plt.plot(x, running_avg) 21 | plt.title('Running average of previous 100 scores') 22 | plt.savefig(figure_file) 23 | -------------------------------------------------------------------------------- /TD3/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | 
--------------------------------------------------------------------------------