├── ActorCritic ├── actor_critic_torch.py ├── main_lunar_lander_actor_critic.py ├── tf2 │ ├── agent.py │ ├── main.py │ ├── networks.py │ └── utils.py └── utils.py ├── DDPG ├── buffer.py ├── ddpg_torch.py ├── main_ddpg.py ├── networks.py ├── noise.py ├── tf2 │ ├── agent.py │ ├── buffer.py │ ├── main.py │ ├── networks.py │ ├── noise.py │ └── utils.py └── utils.py ├── Fundamentals ├── control_blackJack_no_es.py ├── control_cartpole_q_learning.py ├── main_control_blackJack_no_es.py ├── main_control_cartpole_q_learning.py ├── main_prediction_blackJack.py ├── prediction_blackJack.py └── prediction_cartpole_td_zero.py ├── README.md ├── Reinforce ├── lunar_lander_random.py ├── main_lunar_lander_reinforce.py ├── reinforce_torch.py └── tf2 │ ├── agent.py │ ├── main.py │ ├── networks.py │ └── utils.py ├── SAC ├── buffer.py ├── main_sac.py ├── networks.py ├── sac_torch.py ├── tf2 │ ├── agent.py │ ├── buffer.py │ ├── main.py │ ├── networks.py │ └── utils.py └── utils.py └── TD3 ├── buffer.py ├── main_td3.py ├── networks.py ├── td3_torch.py ├── tf2 ├── agent.py ├── buffer.py ├── main.py ├── networks.py └── utils.py └── utils.py /ActorCritic/actor_critic_torch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | class ActorCriticNetwork(nn.Module): 8 | def __init__(self, lr, input_dims, n_actions, fc1_dims=256, fc2_dims=256): 9 | super(ActorCriticNetwork, self).__init__() 10 | self.fc1 = nn.Linear(*input_dims, fc1_dims) 11 | self.fc2 = nn.Linear(fc1_dims, fc2_dims) 12 | self.pi = nn.Linear(fc2_dims, n_actions) 13 | self.v = nn.Linear(fc2_dims, 1) 14 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 15 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 16 | self.to(self.device) 17 | 18 | def forward(self, state): 19 | x = F.relu(self.fc1(state)) 20 | x = F.relu(self.fc2(x)) 21 | pi = self.pi(x) 22 | v = self.v(x) 23 | 24 | return (pi, v) 25 | 26 | class Agent(): 27 | def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions, 28 | gamma=0.99): 29 | self.gamma = gamma 30 | self.lr = lr 31 | self.fc1_dims = fc1_dims 32 | self.fc2_dims = fc2_dims 33 | self.actor_critic = ActorCriticNetwork(lr, input_dims, n_actions, 34 | fc1_dims, fc2_dims) 35 | self.log_prob = None 36 | 37 | def choose_action(self, observation): 38 | state = T.tensor([observation], dtype=T.float).to(self.actor_critic.device) 39 | probabilities, _ = self.actor_critic.forward(state) 40 | probabilities = F.softmax(probabilities, dim=1) 41 | action_probs = T.distributions.Categorical(probabilities) 42 | action = action_probs.sample() 43 | log_prob = action_probs.log_prob(action) 44 | self.log_prob = log_prob 45 | 46 | return action.item() 47 | 48 | def learn(self, state, reward, state_, done): 49 | self.actor_critic.optimizer.zero_grad() 50 | 51 | state = T.tensor([state], dtype=T.float).to(self.actor_critic.device) 52 | state_ = T.tensor([state_], dtype=T.float).to(self.actor_critic.device) 53 | reward = T.tensor(reward, dtype=T.float).to(self.actor_critic.device) 54 | 55 | _, critic_value = self.actor_critic.forward(state) 56 | _, critic_value_ = self.actor_critic.forward(state_) 57 | 58 | delta = reward + self.gamma*critic_value_*(1-int(done)) - critic_value 59 | 60 | actor_loss = -self.log_prob*delta 61 | critic_loss = delta**2 62 | 63 | (actor_loss + critic_loss).backward() 64 | self.actor_critic.optimizer.step() 65 | 66 | 67 | 68 | 
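# A note on learn() above: the TD error `delta` feeds both loss terms, so the
# policy term -log_prob * delta also backpropagates through the value head.
# A common variant, not used in this file, treats the TD error as a constant
# in the policy loss by detaching it, e.g.:
#
#     delta = reward + self.gamma * critic_value_ * (1 - int(done)) - critic_value
#     actor_loss = -self.log_prob * delta.detach()
#     critic_loss = delta.pow(2)
#
# Either form trains; the detached form keeps the policy term from also pushing
# gradients into the value head.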
69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /ActorCritic/main_lunar_lander_actor_critic.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from actor_critic_torch import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLander-v2') 8 | agent = Agent(gamma=0.99, lr=5e-6, input_dims=[8], n_actions=4, 9 | fc1_dims=2048, fc2_dims=1536) 10 | n_games = 3000 11 | 12 | fname = 'ACTOR_CRITIC_' + 'lunar_lander_' + str(agent.fc1_dims) + \ 13 | '_fc1_dims_' + str(agent.fc2_dims) + '_fc2_dims_lr' + str(agent.lr) +\ 14 | '_' + str(n_games) + 'games' 15 | figure_file = 'plots/' + fname + '.png' 16 | 17 | scores = [] 18 | for i in range(n_games): 19 | done = False 20 | observation = env.reset() 21 | score = 0 22 | while not done: 23 | action = agent.choose_action(observation) 24 | observation_, reward, done, info = env.step(action) 25 | score += reward 26 | agent.learn(observation, reward, observation_, done) 27 | observation = observation_ 28 | scores.append(score) 29 | 30 | avg_score = np.mean(scores[-100:]) 31 | print('episode ', i, 'score %.1f' % score, 32 | 'average score %.1f' % avg_score) 33 | 34 | x = [i+1 for i in range(n_games)] 35 | plot_learning_curve(x, scores, figure_file) 36 | 37 | -------------------------------------------------------------------------------- /ActorCritic/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | from tensorflow.keras.optimizers import Adam 4 | import tensorflow_probability as tfp 5 | import tensorflow.keras as keras 6 | from networks import ActorCriticNetwork 7 | 8 | 9 | class Agent: 10 | def __init__(self, alpha=0.0003, gamma=0.99, n_actions=2, 11 | fc1_dims=256, fc2_dims=256, chkpt_dir='models/'): 12 | self.gamma = gamma 13 | self.n_actions = n_actions 14 | self.action = None 15 | self.action_space = [i for i in range(self.n_actions)] 16 | self.checkpoint_file = os.path.join(chkpt_dir, '_actor_critic') 17 | 18 | self.actor_critic = ActorCriticNetwork(n_actions=n_actions) 19 | 20 | self.actor_critic.compile(optimizer=Adam(learning_rate=alpha)) 21 | 22 | def choose_action(self, observation): 23 | state = tf.convert_to_tensor([observation]) 24 | _, probs = self.actor_critic(state) 25 | 26 | action_probabilities = tfp.distributions.Categorical(probs=probs) 27 | action = action_probabilities.sample() 28 | self.action = action 29 | 30 | return action.numpy()[0] 31 | 32 | def save_models(self): 33 | self.actor_critic.save(self.checkpoint_file) 34 | print('... saving models ...') 35 | 36 | def load_models(self): 37 | self.actor_critic = keras.models.load_model(self.checkpoint_file) 38 | print('... 
loading models ...') 39 | 40 | def learn(self, state, reward, state_, done): 41 | state = tf.convert_to_tensor([state], dtype=tf.float32) 42 | state_ = tf.convert_to_tensor([state_], dtype=tf.float32) 43 | reward = tf.convert_to_tensor(reward, dtype=tf.float32) 44 | 45 | with tf.GradientTape(persistent=True) as tape: 46 | state_value, probs = self.actor_critic(state) 47 | state_value_, _ = self.actor_critic(state_) 48 | state_value = tf.squeeze(state_value) 49 | state_value_ = tf.squeeze(state_value_) 50 | 51 | action_probs = tfp.distributions.Categorical(probs=probs) 52 | log_prob = action_probs.log_prob(self.action) 53 | 54 | delta = reward + \ 55 | self.gamma*state_value_*(1-int(done)) - state_value 56 | actor_loss = -log_prob*delta 57 | critic_loss = delta**2 58 | total_loss = actor_loss + critic_loss 59 | params = self.actor_critic.trainable_variables 60 | grads = tape.gradient(total_loss, params) 61 | self.actor_critic.optimizer.apply_gradients(zip(grads, params)) 62 | -------------------------------------------------------------------------------- /ActorCritic/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve, manage_memory 5 | from gym import wrappers 6 | 7 | if __name__ == '__main__': 8 | manage_memory() 9 | # env = gym.make('LunarLander-v2') 10 | env = gym.make('CartPole-v0') 11 | agent = Agent(alpha=1e-5, n_actions=env.action_space.n) 12 | n_games = 1800 13 | record_video = False 14 | load_checkpoint = False 15 | 16 | # do a mkdir video if you want to record video of the agent playing 17 | if record_video: 18 | env = wrappers.Monitor(env, 'video', 19 | video_callable=lambda episode_id: True, 20 | force=True) 21 | filename = 'cartpole_1e-5_1024x512_1800games.png' 22 | 23 | figure_file = 'plots/' + filename 24 | 25 | best_score = env.reward_range[0] 26 | score_history = [] 27 | 28 | if load_checkpoint: 29 | agent.load_models() 30 | 31 | for i in range(n_games): 32 | observation = env.reset() 33 | done = False 34 | score = 0 35 | while not done: 36 | action = agent.choose_action(observation) 37 | observation_, reward, done, info = env.step(action) 38 | score += reward 39 | if not load_checkpoint: 40 | agent.learn(observation, reward, observation_, done) 41 | observation = observation_ 42 | score_history.append(score) 43 | avg_score = np.mean(score_history[-100:]) 44 | 45 | if avg_score > best_score: 46 | best_score = avg_score 47 | if not load_checkpoint: 48 | agent.save_models() 49 | print('episode {} score {:.1f} avg score {:.1f}'.format( 50 | i, score, avg_score)) 51 | 52 | if not load_checkpoint: 53 | x = [i+1 for i in range(n_games)] 54 | plot_learning_curve(x, score_history, figure_file) 55 | -------------------------------------------------------------------------------- /ActorCritic/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Dense 3 | 4 | 5 | class ActorCriticNetwork(keras.Model): 6 | def __init__(self, n_actions, fc1_dims=1024, fc2_dims=512): 7 | super(ActorCriticNetwork, self).__init__() 8 | self.fc1_dims = fc1_dims 9 | self.fc2_dims = fc2_dims 10 | self.n_actions = n_actions 11 | 12 | self.fc1 = Dense(self.fc1_dims, activation='relu') 13 | self.fc2 = Dense(self.fc2_dims, activation='relu') 14 | self.v = Dense(1, activation=None) 15 | self.pi = Dense(n_actions, activation='softmax') 16 | 17 | 
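    # call() returns the state value first and the action probabilities second;
    # agent.py depends on this ordering (choose_action unpacks `_, probs` and
    # learn unpacks `state_value, probs`).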
    def call(self, state):
18 |         value = self.fc1(state)
19 |         value = self.fc2(value)
20 |
21 |         v = self.v(value)
22 |         pi = self.pi(value)
23 |
24 |         return v, pi
25 |
--------------------------------------------------------------------------------
/ActorCritic/tf2/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import tensorflow as tf
4 |
5 |
6 | def manage_memory():
7 |     gpus = tf.config.list_physical_devices('GPU')
8 |     if gpus:
9 |         try:
10 |             for gpu in gpus:
11 |                 tf.config.experimental.set_memory_growth(gpu, True)
12 |         except RuntimeError as e:
13 |             print(e)
14 |
15 |
16 | def plot_learning_curve(x, scores, figure_file):
17 |     running_avg = np.zeros(len(scores))
18 |     for i in range(len(running_avg)):
19 |         running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
20 |     plt.plot(x, running_avg)
21 |     plt.title('Running average of previous 100 scores')
22 |     plt.savefig(figure_file)
23 |
--------------------------------------------------------------------------------
/ActorCritic/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 | def plot_learning_curve(x, scores, figure_file):
5 |     running_avg = np.zeros(len(scores))
6 |     for i in range(len(running_avg)):
7 |         running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
8 |     plt.plot(x, running_avg)
9 |     plt.title('Running average of previous 100 scores')
10 |     plt.savefig(figure_file)
11 |
--------------------------------------------------------------------------------
/DDPG/buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class ReplayBuffer():
4 |     def __init__(self, max_size, input_shape, n_actions):
5 |         self.mem_size = max_size
6 |         self.mem_cntr = 0
7 |         self.state_memory = np.zeros((self.mem_size, *input_shape))
8 |         self.new_state_memory = np.zeros((self.mem_size, *input_shape))
9 |         self.action_memory = np.zeros((self.mem_size, n_actions))
10 |         self.reward_memory = np.zeros(self.mem_size)
11 |         self.terminal_memory = np.zeros(self.mem_size, dtype=bool)
12 |
13 |     def store_transition(self, state, action, reward, state_, done):
14 |         index = self.mem_cntr % self.mem_size
15 |         self.state_memory[index] = state
16 |         self.action_memory[index] = action
17 |         self.reward_memory[index] = reward
18 |         self.new_state_memory[index] = state_
19 |         self.terminal_memory[index] = done
20 |
21 |         self.mem_cntr += 1
22 |
23 |     def sample_buffer(self, batch_size):
24 |         max_mem = min(self.mem_cntr, self.mem_size)
25 |
26 |         batch = np.random.choice(max_mem, batch_size)
27 |
28 |         states = self.state_memory[batch]
29 |         actions = self.action_memory[batch]
30 |         rewards = self.reward_memory[batch]
31 |         states_ = self.new_state_memory[batch]
32 |         dones = self.terminal_memory[batch]
33 |
34 |         return states, actions, rewards, states_, dones
35 |
36 |
37 |
--------------------------------------------------------------------------------
/DDPG/ddpg_torch.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import torch as T
4 | import torch.nn.functional as F
5 | from networks import ActorNetwork, CriticNetwork
6 | from noise import OUActionNoise
7 | from buffer import ReplayBuffer
8 |
9 | class Agent():
10 |     def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
11 |                  max_size=1000000, fc1_dims=400, fc2_dims=300,
12 |
batch_size=64): 13 | self.gamma = gamma 14 | self.tau = tau 15 | self.batch_size = batch_size 16 | self.alpha = alpha 17 | self.beta = beta 18 | 19 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 20 | 21 | self.noise = OUActionNoise(mu=np.zeros(n_actions)) 22 | 23 | self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 24 | n_actions=n_actions, name='actor') 25 | self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, 26 | n_actions=n_actions, name='critic') 27 | 28 | self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims, 29 | n_actions=n_actions, name='target_actor') 30 | 31 | self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims, 32 | n_actions=n_actions, name='target_critic') 33 | 34 | self.update_network_parameters(tau=1) 35 | 36 | def choose_action(self, observation): 37 | self.actor.eval() 38 | state = T.tensor([observation], dtype=T.float).to(self.actor.device) 39 | mu = self.actor.forward(state).to(self.actor.device) 40 | mu_prime = mu + T.tensor(self.noise(), 41 | dtype=T.float).to(self.actor.device) 42 | self.actor.train() 43 | 44 | return mu_prime.cpu().detach().numpy()[0] 45 | 46 | def remember(self, state, action, reward, state_, done): 47 | self.memory.store_transition(state, action, reward, state_, done) 48 | 49 | def save_models(self): 50 | self.actor.save_checkpoint() 51 | self.target_actor.save_checkpoint() 52 | self.critic.save_checkpoint() 53 | self.target_critic.save_checkpoint() 54 | 55 | def load_models(self): 56 | self.actor.load_checkpoint() 57 | self.target_actor.load_checkpoint() 58 | self.critic.load_checkpoint() 59 | self.target_critic.load_checkpoint() 60 | 61 | def learn(self): 62 | if self.memory.mem_cntr < self.batch_size: 63 | return 64 | 65 | states, actions, rewards, states_, done = \ 66 | self.memory.sample_buffer(self.batch_size) 67 | 68 | states = T.tensor(states, dtype=T.float).to(self.actor.device) 69 | states_ = T.tensor(states_, dtype=T.float).to(self.actor.device) 70 | actions = T.tensor(actions, dtype=T.float).to(self.actor.device) 71 | rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device) 72 | done = T.tensor(done).to(self.actor.device) 73 | 74 | target_actions = self.target_actor.forward(states_) 75 | critic_value_ = self.target_critic.forward(states_, target_actions) 76 | critic_value = self.critic.forward(states, actions) 77 | 78 | critic_value_[done] = 0.0 79 | critic_value_ = critic_value_.view(-1) 80 | 81 | target = rewards + self.gamma*critic_value_ 82 | target = target.view(self.batch_size, 1) 83 | 84 | self.critic.optimizer.zero_grad() 85 | critic_loss = F.mse_loss(target, critic_value) 86 | critic_loss.backward() 87 | self.critic.optimizer.step() 88 | 89 | self.actor.optimizer.zero_grad() 90 | actor_loss = -self.critic.forward(states, self.actor.forward(states)) 91 | actor_loss = T.mean(actor_loss) 92 | actor_loss.backward() 93 | self.actor.optimizer.step() 94 | 95 | self.update_network_parameters() 96 | 97 | def update_network_parameters(self, tau=None): 98 | if tau is None: 99 | tau = self.tau 100 | 101 | actor_params = self.actor.named_parameters() 102 | critic_params = self.critic.named_parameters() 103 | target_actor_params = self.target_actor.named_parameters() 104 | target_critic_params = self.target_critic.named_parameters() 105 | 106 | critic_state_dict = dict(critic_params) 107 | actor_state_dict = dict(actor_params) 108 | target_critic_state_dict = dict(target_critic_params) 109 | target_actor_state_dict = dict(target_actor_params) 110 | 111 | 
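        # Soft (Polyak) update of the target networks: for every parameter,
        # theta_target = tau * theta_online + (1 - tau) * theta_target.
        # __init__ calls update_network_parameters(tau=1) so the targets start
        # as exact copies of the online networks.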
for name in critic_state_dict: 112 | critic_state_dict[name] = tau*critic_state_dict[name].clone() + \ 113 | (1-tau)*target_critic_state_dict[name].clone() 114 | 115 | for name in actor_state_dict: 116 | actor_state_dict[name] = tau*actor_state_dict[name].clone() + \ 117 | (1-tau)*target_actor_state_dict[name].clone() 118 | 119 | self.target_critic.load_state_dict(critic_state_dict) 120 | self.target_actor.load_state_dict(actor_state_dict) 121 | #self.target_critic.load_state_dict(critic_state_dict, strict=False) 122 | #self.target_actor.load_state_dict(actor_state_dict, strict=False) 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /DDPG/main_ddpg.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from ddpg_torch import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('LunarLanderContinuous-v2') 8 | agent = Agent(alpha=0.0001, beta=0.001, 9 | input_dims=env.observation_space.shape, tau=0.001, 10 | batch_size=64, fc1_dims=400, fc2_dims=300, 11 | n_actions=env.action_space.shape[0]) 12 | n_games = 1000 13 | filename = 'LunarLander_alpha_' + str(agent.alpha) + '_beta_' + \ 14 | str(agent.beta) + '_' + str(n_games) + '_games' 15 | figure_file = 'plots/' + filename + '.png' 16 | 17 | best_score = env.reward_range[0] 18 | score_history = [] 19 | for i in range(n_games): 20 | observation = env.reset() 21 | done = False 22 | score = 0 23 | agent.noise.reset() 24 | while not done: 25 | action = agent.choose_action(observation) 26 | observation_, reward, done, info = env.step(action) 27 | agent.remember(observation, action, reward, observation_, done) 28 | agent.learn() 29 | score += reward 30 | observation = observation_ 31 | score_history.append(score) 32 | avg_score = np.mean(score_history[-100:]) 33 | 34 | if avg_score > best_score: 35 | best_score = avg_score 36 | agent.save_models() 37 | 38 | print('episode ', i, 'score %.1f' % score, 39 | 'average score %.1f' % avg_score) 40 | x = [i+1 for i in range(n_games)] 41 | plot_learning_curve(x, score_history, figure_file) 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /DDPG/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch as T 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | 8 | class CriticNetwork(nn.Module): 9 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, name, 10 | chkpt_dir='tmp/ddpg'): 11 | super(CriticNetwork, self).__init__() 12 | self.input_dims = input_dims 13 | self.fc1_dims = fc1_dims 14 | self.fc2_dims = fc2_dims 15 | self.n_actions = n_actions 16 | self.name = name 17 | self.checkpoint_dir = chkpt_dir 18 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ddpg') 19 | 20 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 21 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 22 | 23 | self.bn1 = nn.LayerNorm(self.fc1_dims) 24 | self.bn2 = nn.LayerNorm(self.fc2_dims) 25 | #self.bn1 = nn.BatchNorm1d(self.fc1_dims) 26 | #self.bn2 = nn.BatchNorm1d(self.fc2_dims) 27 | 28 | self.action_value = nn.Linear(self.n_actions, self.fc2_dims) 29 | 30 | self.q = nn.Linear(self.fc2_dims, 1) 31 | 32 | f1 = 1./np.sqrt(self.fc1.weight.data.size()[0]) 33 | self.fc1.weight.data.uniform_(-f1, f1) 34 | 
        self.fc1.bias.data.uniform_(-f1, f1)
35 |
36 |         f2 = 1./np.sqrt(self.fc2.weight.data.size()[0])
37 |         self.fc2.weight.data.uniform_(-f2, f2)
38 |         self.fc2.bias.data.uniform_(-f2, f2)
39 |
40 |         f3 = 0.003
41 |         self.q.weight.data.uniform_(-f3, f3)
42 |         self.q.bias.data.uniform_(-f3, f3)
43 |
44 |         f4 = 1./np.sqrt(self.action_value.weight.data.size()[0])
45 |         self.action_value.weight.data.uniform_(-f4, f4)
46 |         self.action_value.bias.data.uniform_(-f4, f4)
47 |
48 |         self.optimizer = optim.Adam(self.parameters(), lr=beta,
49 |                                     weight_decay=0.01)
50 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
51 |
52 |         self.to(self.device)
53 |
54 |     def forward(self, state, action):
55 |         state_value = self.fc1(state)
56 |         state_value = self.bn1(state_value)
57 |         state_value = F.relu(state_value)
58 |         state_value = self.fc2(state_value)
59 |         state_value = self.bn2(state_value)
60 |         #state_value = F.relu(state_value)
61 |         #action_value = F.relu(self.action_value(action))
62 |         action_value = self.action_value(action)
63 |         state_action_value = F.relu(T.add(state_value, action_value))
64 |         #state_action_value = T.add(state_value, action_value)
65 |         state_action_value = self.q(state_action_value)
66 |
67 |         return state_action_value
68 |
69 |     def save_checkpoint(self):
70 |         print('... saving checkpoint ...')
71 |         T.save(self.state_dict(), self.checkpoint_file)
72 |
73 |     def load_checkpoint(self):
74 |         print('... loading checkpoint ...')
75 |         self.load_state_dict(T.load(self.checkpoint_file))
76 |
77 |     def save_best(self):
78 |         print('... saving best checkpoint ...')
79 |         checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best')
80 |         T.save(self.state_dict(), checkpoint_file)
81 |
82 | class ActorNetwork(nn.Module):
83 |     def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, n_actions, name,
84 |                  chkpt_dir='tmp/ddpg'):
85 |         super(ActorNetwork, self).__init__()
86 |         self.input_dims = input_dims
87 |         self.fc1_dims = fc1_dims
88 |         self.fc2_dims = fc2_dims
89 |         self.n_actions = n_actions
90 |         self.name = name
91 |         self.checkpoint_dir = chkpt_dir
92 |         self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_ddpg')
93 |
94 |         self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
95 |         self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
96 |
97 |         self.bn1 = nn.LayerNorm(self.fc1_dims)
98 |         self.bn2 = nn.LayerNorm(self.fc2_dims)
99 |
100 |         #self.bn1 = nn.BatchNorm1d(self.fc1_dims)
101 |         #self.bn2 = nn.BatchNorm1d(self.fc2_dims)
102 |
103 |         self.mu = nn.Linear(self.fc2_dims, self.n_actions)
104 |
105 |         f2 = 1./np.sqrt(self.fc2.weight.data.size()[0])
106 |         self.fc2.weight.data.uniform_(-f2, f2)
107 |         self.fc2.bias.data.uniform_(-f2, f2)
108 |
109 |         f1 = 1./np.sqrt(self.fc1.weight.data.size()[0])
110 |         self.fc1.weight.data.uniform_(-f1, f1)
111 |         self.fc1.bias.data.uniform_(-f1, f1)
112 |
113 |         f3 = 0.003
114 |         self.mu.weight.data.uniform_(-f3, f3)
115 |         self.mu.bias.data.uniform_(-f3, f3)
116 |
117 |         self.optimizer = optim.Adam(self.parameters(), lr=alpha)
118 |         self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
119 |
120 |         self.to(self.device)
121 |
122 |     def forward(self, state):
123 |         x = self.fc1(state)
124 |         x = self.bn1(x)
125 |         x = F.relu(x)
126 |         x = self.fc2(x)
127 |         x = self.bn2(x)
128 |         x = F.relu(x)
129 |         x = T.tanh(self.mu(x))
130 |
131 |         return x
132 |
133 |     def save_checkpoint(self):
134 |         print('... saving checkpoint ...')
135 |         T.save(self.state_dict(), self.checkpoint_file)
136 |
137 |     def load_checkpoint(self):
138 |         print('... 
loading checkpoint ...') 139 | self.load_state_dict(T.load(self.checkpoint_file)) 140 | 141 | def save_best(self): 142 | print('... saving best checkpoint ...') 143 | checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best') 144 | T.save(self.state_dict(), checkpoint_file) 145 | -------------------------------------------------------------------------------- /DDPG/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class OUActionNoise(): 4 | def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None): 5 | self.theta = theta 6 | self.mu = mu 7 | self.sigma = sigma 8 | self.dt = dt 9 | self.x0 = x0 10 | self.reset() 11 | 12 | def __call__(self): 13 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \ 14 | self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) 15 | self.x_prev = x 16 | 17 | return x 18 | 19 | def reset(self): 20 | self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) 21 | 22 | 23 | -------------------------------------------------------------------------------- /DDPG/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from tensorflow.keras.optimizers import Adam 4 | from buffer import ReplayBuffer 5 | from networks import ActorNetwork, CriticNetwork 6 | 7 | 8 | class Agent: 9 | def __init__(self, input_dims, alpha=0.001, beta=0.002, env=None, 10 | gamma=0.99, n_actions=2, max_size=1000000, tau=0.005, 11 | fc1=400, fc2=300, batch_size=64, noise=0.1, 12 | chkpt_dir='models/ddpg/'): 13 | self.gamma = gamma 14 | self.tau = tau 15 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 16 | self.batch_size = batch_size 17 | self.n_actions = n_actions 18 | self.noise = noise 19 | self.max_action = env.action_space.high[0] 20 | self.min_action = env.action_space.low[0] 21 | self.chkpt_dir = chkpt_dir 22 | 23 | self.actor = ActorNetwork(n_actions=n_actions, 24 | fc1_dims=fc1, fc2_dims=fc2) 25 | self.critic = CriticNetwork(n_actions=n_actions, 26 | fc1_dims=fc1, fc2_dims=fc2) 27 | self.target_actor = ActorNetwork(n_actions=n_actions, 28 | fc1_dims=fc1, fc2_dims=fc2) 29 | self.target_critic = CriticNetwork(n_actions=n_actions, 30 | fc1_dims=fc1, fc2_dims=fc2) 31 | 32 | self.actor.compile(optimizer=Adam(learning_rate=alpha)) 33 | self.critic.compile(optimizer=Adam(learning_rate=beta)) 34 | self.target_actor.compile(optimizer=Adam(learning_rate=alpha)) 35 | self.target_critic.compile(optimizer=Adam(learning_rate=beta)) 36 | 37 | self.update_network_parameters(tau=1) 38 | 39 | def update_network_parameters(self, tau=None): 40 | if tau is None: 41 | tau = self.tau 42 | 43 | weights = [] 44 | targets = self.target_actor.weights 45 | for i, weight in enumerate(self.actor.weights): 46 | weights.append(weight * tau + targets[i]*(1-tau)) 47 | self.target_actor.set_weights(weights) 48 | 49 | weights = [] 50 | targets = self.target_critic.weights 51 | for i, weight in enumerate(self.critic.weights): 52 | weights.append(weight * tau + targets[i]*(1-tau)) 53 | self.target_critic.set_weights(weights) 54 | 55 | def store_transition(self, state, action, reward, new_state, done): 56 | self.memory.store_transition(state, action, reward, new_state, done) 57 | 58 | def save_models(self): 59 | print('... 
saving models ...') 60 | self.actor.save(self.chkpt_dir+'actor') 61 | self.target_actor.save(self.chkpt_dir+'target_actor') 62 | self.critic.save(self.chkpt_dir+'critic') 63 | self.target_critic.save(self.chkpt_dir+'target_critic') 64 | 65 | def load_models(self): 66 | print('... loading models ...') 67 | self.actor = keras.models.load_model(self.chkpt_dir+'actor') 68 | self.target_actor = \ 69 | keras.models.load_model(self.chkpt_dir+'target_actor') 70 | self.critic = keras.models.load_model(self.chkpt_dir+'critic') 71 | self.target_critic = \ 72 | keras.models.load_model(self.chkpt_dir+'target_critic') 73 | 74 | def choose_action(self, observation, evaluate=False): 75 | state = tf.convert_to_tensor([observation], dtype=tf.float32) 76 | actions = self.actor(state) 77 | if not evaluate: 78 | actions += tf.random.normal(shape=[self.n_actions], 79 | mean=0.0, stddev=self.noise) 80 | # note that if the env has an action > 1, we have to multiply by 81 | # max action at some point 82 | actions = tf.clip_by_value(actions, self.min_action, self.max_action) 83 | 84 | return actions[0] 85 | 86 | def learn(self): 87 | if self.memory.mem_cntr < self.batch_size: 88 | return 89 | 90 | state, action, reward, new_state, done = \ 91 | self.memory.sample_buffer(self.batch_size) 92 | 93 | states = tf.convert_to_tensor(state, dtype=tf.float32) 94 | states_ = tf.convert_to_tensor(new_state, dtype=tf.float32) 95 | rewards = tf.convert_to_tensor(reward, dtype=tf.float32) 96 | actions = tf.convert_to_tensor(action, dtype=tf.float32) 97 | 98 | with tf.GradientTape() as tape: 99 | target_actions = self.target_actor(states_) 100 | critic_value_ = tf.squeeze(self.target_critic( 101 | (states_, target_actions)), 1) 102 | critic_value = tf.squeeze(self.critic((states, actions)), 1) 103 | target = rewards + self.gamma*critic_value_*(1-done) 104 | critic_loss = keras.losses.MSE(target, critic_value) 105 | params = self.critic.trainable_variables 106 | grads = tape.gradient(critic_loss, params) 107 | self.critic.optimizer.apply_gradients(zip(grads, params)) 108 | 109 | with tf.GradientTape() as tape: 110 | new_policy_actions = self.actor(states) 111 | actor_loss = -self.critic((states, new_policy_actions)) 112 | actor_loss = tf.math.reduce_mean(actor_loss) 113 | params = self.actor.trainable_variables 114 | grads = tape.gradient(actor_loss, params) 115 | self.actor.optimizer.apply_gradients(zip(grads, params)) 116 | 117 | self.update_network_parameters() 118 | -------------------------------------------------------------------------------- /DDPG/tf2/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | self.state_memory[index] = state 16 | self.action_memory[index] = action 17 | self.reward_memory[index] = reward 18 | self.new_state_memory[index] = state_ 19 | self.terminal_memory[index] = done 20 | 21 | self.mem_cntr += 1 22 | 23 | def sample_buffer(self, batch_size): 24 | max_mem = 
min(self.mem_cntr, self.mem_size) 25 | 26 | batch = np.random.choice(max_mem, batch_size) 27 | 28 | states = self.state_memory[batch] 29 | actions = self.action_memory[batch] 30 | rewards = self.reward_memory[batch] 31 | states_ = self.new_state_memory[batch] 32 | dones = self.terminal_memory[batch] 33 | 34 | return states, actions, rewards, states_, dones 35 | 36 | 37 | -------------------------------------------------------------------------------- /DDPG/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve, manage_memory 5 | 6 | if __name__ == '__main__': 7 | manage_memory() 8 | env = gym.make('LunarLanderContinuous-v2') 9 | agent = Agent(input_dims=env.observation_space.shape, env=env, 10 | n_actions=env.action_space.shape[0], 11 | alpha=0.0001, beta=0.001) 12 | n_games = 1000 13 | 14 | figure_file = 'plots/lunar_lander.png' 15 | 16 | best_score = env.reward_range[0] 17 | score_history = [] 18 | load_checkpoint = False 19 | 20 | if load_checkpoint: 21 | agent.load_models() 22 | evaluate = True 23 | else: 24 | evaluate = False 25 | 26 | for i in range(n_games): 27 | observation = env.reset() 28 | done = False 29 | score = 0 30 | while not done: 31 | action = agent.choose_action(observation, evaluate) 32 | observation_, reward, done, info = env.step(action) 33 | score += reward 34 | agent.store_transition(observation, action, reward, 35 | observation_, done) 36 | if not load_checkpoint: 37 | agent.learn() 38 | observation = observation_ 39 | 40 | score_history.append(score) 41 | avg_score = np.mean(score_history[-100:]) 42 | 43 | if avg_score > best_score: 44 | best_score = avg_score 45 | if not load_checkpoint: 46 | agent.save_models() 47 | 48 | print('episode {} score {:.1f} avg score {:.1f}'. 49 | format(i, score, avg_score)) 50 | 51 | if not load_checkpoint: 52 | x = [i+1 for i in range(n_games)] 53 | plot_learning_curve(x, score_history, figure_file) 54 | -------------------------------------------------------------------------------- /DDPG/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from tensorflow.keras.layers import Dense 4 | 5 | 6 | class CriticNetwork(keras.Model): 7 | def __init__(self, n_actions, fc1_dims=512, fc2_dims=512): 8 | super(CriticNetwork, self).__init__() 9 | self.fc1_dims = fc1_dims 10 | self.fc2_dims = fc2_dims 11 | self.n_actions = n_actions 12 | 13 | self.fc1 = Dense(self.fc1_dims, activation='relu') 14 | self.fc2 = Dense(self.fc2_dims, activation='relu') 15 | self.q = Dense(1, activation=None) 16 | 17 | # have to define inputs as a tuple because the model.save() function 18 | # trips an error when trying to save a call function with two inputs. 
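    # Consequently the critic is called with a single tuple argument, e.g.
    # q = critic((states, actions)), which is how agent.py invokes it when
    # computing the TD targets and the actor loss.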
19 | def call(self, inputs): 20 | state, action = inputs 21 | action_value = self.fc1(tf.concat([state, action], axis=1)) 22 | action_value = self.fc2(action_value) 23 | 24 | q = self.q(action_value) 25 | 26 | return q 27 | 28 | 29 | class ActorNetwork(keras.Model): 30 | def __init__(self, fc1_dims=512, fc2_dims=512, n_actions=2): 31 | super(ActorNetwork, self).__init__() 32 | self.fc1_dims = fc1_dims 33 | self.fc2_dims = fc2_dims 34 | self.n_actions = n_actions 35 | 36 | self.fc1 = Dense(self.fc1_dims, activation='relu') 37 | self.fc2 = Dense(self.fc2_dims, activation='relu') 38 | self.mu = Dense(self.n_actions, activation='tanh') 39 | 40 | def call(self, state): 41 | prob = self.fc1(state) 42 | prob = self.fc2(prob) 43 | 44 | mu = self.mu(prob) 45 | 46 | return mu 47 | -------------------------------------------------------------------------------- /DDPG/tf2/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class OUActionNoise(): 4 | def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None): 5 | self.theta = theta 6 | self.mu = mu 7 | self.sigma = sigma 8 | self.dt = dt 9 | self.x0 = x0 10 | self.reset() 11 | 12 | def __call__(self): 13 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + \ 14 | self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) 15 | self.x_prev = x 16 | 17 | return x 18 | 19 | def reset(self): 20 | self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) 21 | 22 | 23 | -------------------------------------------------------------------------------- /DDPG/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import tensorflow as tf 4 | 5 | 6 | def manage_memory(): 7 | gpus = tf.config.list_physical_devices('GPU') 8 | if gpus: 9 | try: 10 | for gpu in gpus: 11 | tf.config.experimental.set_memory_growth(gpu, True) 12 | except RuntimeError as e: 13 | print(e) 14 | 15 | 16 | def plot_learning_curve(x, scores, figure_file): 17 | running_avg = np.zeros(len(scores)) 18 | for i in range(len(running_avg)): 19 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 20 | plt.plot(x, running_avg) 21 | plt.title('Running average of previous 100 scores') 22 | plt.savefig(figure_file) 23 | -------------------------------------------------------------------------------- /DDPG/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /Fundamentals/control_blackJack_no_es.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(): 4 | def __init__(self, eps=0.1, gamma=0.99): 5 | self.Q = {} 6 | self.sum_space = [i for i in range(4, 22)] 7 | self.dealer_show_card_space = [i+1 for i in range(10)] 8 | self.ace_space = [False, True] 9 | self.action_space = [0, 1] #stick or hit 10 | 11 | self.state_space = [] 12 | self.memory = [] 13 | self.pairs_visited = {} 14 | self.returns = {} 15 | 16 | self.gamma = gamma 17 | 
self.eps = eps 18 | 19 | self.init_vals() 20 | self.init_policy() 21 | 22 | def init_vals(self): 23 | for total in self.sum_space: 24 | for card in self.dealer_show_card_space: 25 | for ace in self.ace_space: 26 | state = (total, card, ace) 27 | self.state_space.append(state) 28 | for action in self.action_space: 29 | self.Q[(state, action)] = 0 30 | self.returns[(state, action)] = [] 31 | self.pairs_visited[(state, action)] = 0 32 | 33 | def init_policy(self): 34 | policy = {} 35 | n = len(self.action_space) 36 | for state in self.state_space: 37 | policy[state] = [1/n for _ in range(n)] 38 | self.policy = policy 39 | 40 | def choose_action(self, state): 41 | action = np.random.choice(self.action_space, p=self.policy[state]) 42 | return action 43 | 44 | def update_Q(self): 45 | for idt, (state, action, _) in enumerate(self.memory): 46 | G = 0 47 | discount = 1 48 | if self.pairs_visited[(state, action)] == 0: 49 | self.pairs_visited[(state, action)] += 1 50 | for t, (_, _, reward) in enumerate(self.memory[idt:]): 51 | G += reward * discount 52 | discount *= self.gamma 53 | self.returns[(state, action)].append(G) 54 | 55 | for state, action, _ in self.memory: 56 | self.Q[(state, action)] = np.mean(self.returns[(state, action)]) 57 | self.update_policy(state) 58 | 59 | for state_action in self.pairs_visited.keys(): 60 | self.pairs_visited[state_action] = 0 61 | 62 | self.memory = [] 63 | 64 | def update_policy(self, state): 65 | actions = [self.Q[(state, a)] for a in self.action_space] 66 | a_max = np.argmax(actions) 67 | n_actions = len(self.action_space) 68 | probs = [] 69 | for action in self.action_space: 70 | prob = 1 - self.eps + self.eps / n_actions if action == a_max else \ 71 | self.eps / n_actions 72 | probs.append(prob) 73 | self.policy[state] = probs 74 | -------------------------------------------------------------------------------- /Fundamentals/control_cartpole_q_learning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(): 4 | def __init__(self, lr, gamma, n_actions, state_space, eps_start, eps_end, 5 | eps_dec): 6 | self.lr = lr 7 | self.gamma = gamma 8 | self.actions = [i for i in range(n_actions)] 9 | self.states = state_space 10 | self.epsilon = eps_start 11 | self.eps_min = eps_end 12 | self.eps_dec = eps_dec 13 | 14 | self.Q = {} 15 | 16 | self.init_Q() 17 | 18 | def init_Q(self): 19 | for state in self.states: 20 | for action in self.actions: 21 | self.Q[(state, action)] = 0.0 22 | 23 | def max_action(self, state): 24 | actions = np.array([self.Q[(state, a)] for a in self.actions]) 25 | action = np.argmax(actions) 26 | 27 | return action 28 | 29 | def choose_action(self, state): 30 | if np.random.random() < self.epsilon: 31 | action = np.random.choice(self.actions) 32 | else: 33 | action = self.max_action(state) 34 | 35 | return action 36 | 37 | def decrement_epsilon(self): 38 | self.epsilon = self.epsilon - self.eps_dec \ 39 | if self.epsilon>self.eps_min else self.eps_min 40 | 41 | def learn(self, state, action, reward, state_): 42 | a_max = self.max_action(state_) 43 | 44 | self.Q[(state, action)] = self.Q[(state, action)] + self.lr*(reward + 45 | self.gamma*self.Q[(state_, a_max)] - 46 | self.Q[(state, action)]) 47 | 48 | -------------------------------------------------------------------------------- /Fundamentals/main_control_blackJack_no_es.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | from 
control_blackJack_no_es import Agent 4 | 5 | if __name__ == '__main__': 6 | env = gym.make('Blackjack-v0') 7 | agent = Agent(eps=0.001) 8 | n_episodes = 200000 9 | win_lose_draw = {-1:0, 0:0, 1:0} 10 | win_rates = [] 11 | for i in range(n_episodes): 12 | if i > 0 and i % 1000 == 0: 13 | pct = win_lose_draw[1] / i 14 | win_rates.append(pct) 15 | if i % 50000 == 0: 16 | rates = win_rates[-1] if win_rates else 0.0 17 | print('starting episode', i, 'win rate %.3f' % rates) 18 | observation = env.reset() 19 | done = False 20 | while not done: 21 | action = agent.choose_action(observation) 22 | observation_, reward, done, info = env.step(action) 23 | agent.memory.append((observation, action, reward)) 24 | observation = observation_ 25 | agent.update_Q() 26 | win_lose_draw[reward] += 1 27 | plt.plot(win_rates) 28 | plt.show() 29 | -------------------------------------------------------------------------------- /Fundamentals/main_control_cartpole_q_learning.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from control_cartpole_q_learning import Agent 5 | 6 | class CartPoleStateDigitizer(): 7 | def __init__(self, bounds=(2.4, 4, 0.209, 4), n_bins=10): 8 | """ 9 | bounds - bounds for linear space. Single floating point number for 10 | each observation element. Space is from -bound to +bound 11 | observation -> x, dx/dt, theta, dtheta/dt 12 | """ 13 | self.position_space = np.linspace(-1*bounds[0], bounds[0], n_bins) 14 | self.velocity_space = np.linspace(-1*bounds[1], bounds[1], n_bins) 15 | self.pole_angle_space = np.linspace(-1*bounds[2], bounds[2], n_bins) 16 | self.pole_velocity_space = np.linspace(-1*bounds[3], bounds[3], n_bins) 17 | self.states = self.get_state_space() 18 | 19 | def get_state_space(self): 20 | states = [] 21 | for i in range(len(self.position_space)+1): 22 | for j in range(len(self.velocity_space)+1): 23 | for k in range(len(self.pole_angle_space)+1): 24 | for l in range(len(self.pole_velocity_space)+1): 25 | states.append((i,j,k,l)) 26 | return states 27 | 28 | def digitize(self, observation): 29 | x, dx_dt, theta, dtheta_dt = observation 30 | cart_x = int(np.digitize(x, self.position_space)) 31 | cart_dx_dt = int(np.digitize(dx_dt, self.velocity_space)) 32 | pole_theta = int(np.digitize(theta, self.pole_angle_space)) 33 | pole_dtheta_dt = int(np.digitize(dtheta_dt, self.pole_velocity_space)) 34 | 35 | return (cart_x, cart_dx_dt, pole_theta, pole_dtheta_dt) 36 | 37 | def plot_learning_curve(scores, x): 38 | running_avg = np.zeros(len(scores)) 39 | for i in range(len(running_avg)): 40 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 41 | plt.plot(x, running_avg) 42 | plt.title('Running average of scores') 43 | plt.show() 44 | 45 | if __name__ == '__main__': 46 | env = gym.make('CartPole-v0') 47 | n_games = 50000 48 | eps_dec = 2 / n_games 49 | digitizer = CartPoleStateDigitizer() 50 | agent = Agent(lr=0.01, gamma=0.99, n_actions=2, eps_start=1.0, 51 | eps_end=0.01, eps_dec=eps_dec, state_space=digitizer.states) 52 | 53 | scores = [] 54 | 55 | for i in range(n_games): 56 | observation = env.reset() 57 | done = False 58 | score = 0 59 | state = digitizer.digitize(observation) 60 | while not done: 61 | action = agent.choose_action(state) 62 | observation_, reward, done, info = env.step(action) 63 | state_ = digitizer.digitize(observation_) 64 | agent.learn(state, action, reward, state_) 65 | state = state_ 66 | score += reward 67 | if i % 5000 == 0: 68 | 
print('episode ', i, 'score %.1f' % score, 69 | 'epsilon %.2f' % agent.epsilon) 70 | 71 | agent.decrement_epsilon() 72 | scores.append(score) 73 | 74 | x = [i + 1 for i in range(n_games)] 75 | plot_learning_curve(scores, x) 76 | -------------------------------------------------------------------------------- /Fundamentals/main_prediction_blackJack.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from prediction_blackJack import Agent 4 | import matplotlib.pyplot as plt 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('Blackjack-v0') 8 | agent = Agent() 9 | n_episodes = 500000 10 | for i in range(n_episodes): 11 | if i % 50000 == 0: 12 | print('starting episode', i) 13 | observation = env.reset() 14 | done = False 15 | while not done: 16 | action = agent.policy(observation) 17 | observation_, reward, done, info = env.step(action) 18 | agent.memory.append((observation, reward)) 19 | observation = observation_ 20 | agent.update_V() 21 | print(agent.V[(21, 3, True)]) 22 | print(agent.V[(4, 1, False)]) 23 | -------------------------------------------------------------------------------- /Fundamentals/prediction_blackJack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Agent(): 4 | def __init__(self, gamma=0.99): 5 | self.V = {} 6 | self.sum_space = [i for i in range(4, 22)] 7 | self.dealer_show_card_space = [i+1 for i in range(10)] 8 | self.ace_space = [False, True] 9 | self.action_space = [0, 1] # stick or hit 10 | 11 | self.state_space = [] 12 | self.returns = {} 13 | self.states_visited = {} # first visit or not 14 | self.memory = [] 15 | self.gamma = gamma 16 | 17 | self.init_vals() 18 | 19 | def init_vals(self): 20 | for total in self.sum_space: 21 | for card in self.dealer_show_card_space: 22 | for ace in self.ace_space: 23 | self.V[(total, card, ace)] = 0 24 | self.returns[(total, card, ace)] = [] 25 | self.states_visited[(total, card, ace)] = 0 26 | self.state_space.append((total, card, ace)) 27 | 28 | def policy(self, state): 29 | total, _, _ = state 30 | action = 0 if total >= 20 else 1 31 | return action 32 | 33 | 34 | def update_V(self): 35 | for idt, (state, _) in enumerate(self.memory): 36 | G = 0 37 | if self.states_visited[state] == 0: 38 | self.states_visited[state] += 1 39 | discount = 1 40 | for t, (_, reward) in enumerate(self.memory[idt:]): 41 | G += reward * discount 42 | discount *= self.gamma 43 | self.returns[state].append(G) 44 | 45 | for state, _ in self.memory: 46 | self.V[state] = np.mean(self.returns[state]) 47 | 48 | for state in self.state_space: 49 | self.states_visited[state] = 0 50 | 51 | self.memory = [] 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /Fundamentals/prediction_cartpole_td_zero.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | 4 | def simple_policy(state): 5 | action = 0 if state < 5 else 1 6 | return action 7 | 8 | if __name__ == '__main__': 9 | env = gym.make('CartPole-v0') 10 | alpha = 0.1 11 | gamma = 0.99 12 | 13 | states = np.linspace(-0.2094, 0.2094, 10) 14 | V = {} 15 | for state in range(len(states)+1): 16 | V[state] = 0 17 | 18 | for i in range(5000): 19 | observation = env.reset() 20 | done = False 21 | while not done: 22 | state = int(np.digitize(observation[2], states)) 23 | action = simple_policy(state) 
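            # Step the environment, then apply the tabular TD(0) update below:
            #   V(s) <- V(s) + alpha * (reward + gamma * V(s') - V(s))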
24 | observation_, reward, done, info = env.step(action) 25 | state_ = int(np.digitize(observation_[2], states)) 26 | V[state] = V[state] + alpha*(reward + gamma*V[state_] - V[state]) 27 | observation = observation_ 28 | 29 | for state in V: 30 | print(state, '%.3f' % V[state]) 31 | 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Actor-Critic-Methods-Paper-To-Code -------------------------------------------------------------------------------- /Reinforce/lunar_lander_random.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | if __name__ == '__main__': 4 | env = gym.make('LunarLander-v2') 5 | 6 | n_games = 100 7 | 8 | for i in range(n_games): 9 | obs = env.reset() 10 | score = 0 11 | done = False 12 | while not done: 13 | action = env.action_space.sample() 14 | obs_, reward, done, info = env.step(action) 15 | score += reward 16 | #env.render() 17 | print('episode ', i, 'score %.1f' % score) 18 | 19 | -------------------------------------------------------------------------------- /Reinforce/main_lunar_lander_reinforce.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from reinforce_torch import PolicyGradientAgent 5 | 6 | def plot_learning_curve(scores, x, figure_file): 7 | running_avg = np.zeros(len(scores)) 8 | for i in range(len(running_avg)): 9 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 10 | plt.plot(x, running_avg) 11 | plt.title('Running average of previous 100 scores') 12 | plt.savefig(figure_file) 13 | 14 | if __name__ == '__main__': 15 | env = gym.make('LunarLander-v2') 16 | n_games = 3000 17 | agent = PolicyGradientAgent(gamma=0.99, lr=0.0005, input_dims=[8], 18 | n_actions=4) 19 | 20 | fname = 'REINFORCE_' + 'lunar_lunar_lr' + str(agent.lr) + '_' \ 21 | + str(n_games) + 'games' 22 | figure_file = 'plots/' + fname + '.png' 23 | 24 | scores = [] 25 | for i in range(n_games): 26 | done = False 27 | observation = env.reset() 28 | score = 0 29 | while not done: 30 | action = agent.choose_action(observation) 31 | observation_, reward, done, info = env.step(action) 32 | score += reward 33 | agent.store_rewards(reward) 34 | observation = observation_ 35 | agent.learn() 36 | scores.append(score) 37 | 38 | avg_score = np.mean(scores[-100:]) 39 | print('episode ', i, 'score %.2f' % score, 40 | 'average score %.2f' % avg_score) 41 | 42 | x = [i+1 for i in range(len(scores))] 43 | plot_learning_curve(scores, x, figure_file) 44 | -------------------------------------------------------------------------------- /Reinforce/reinforce_torch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | 7 | class PolicyNetwork(nn.Module): 8 | def __init__(self, lr, input_dims, n_actions): 9 | super(PolicyNetwork, self).__init__() 10 | self.fc1 = nn.Linear(*input_dims, 128) 11 | self.fc2 = nn.Linear(128, 128) 12 | self.fc3 = nn.Linear(128, n_actions) 13 | self.optimizer = optim.Adam(self.parameters(), lr=lr) 14 | 15 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 16 | self.to(self.device) 17 | 18 | def forward(self, state): 19 | x = F.relu(self.fc1(state)) 20 | x = F.relu(self.fc2(x)) 21 | x = 
self.fc3(x) 22 | 23 | return x 24 | 25 | class PolicyGradientAgent(): 26 | def __init__(self, lr, input_dims, gamma=0.99, n_actions=4): 27 | self.gamma = gamma 28 | self.lr = lr 29 | self.reward_memory = [] 30 | self.action_memory = [] 31 | 32 | self.policy = PolicyNetwork(self.lr, input_dims, n_actions) 33 | 34 | def choose_action(self, observation): 35 | state = T.Tensor([observation]).to(self.policy.device) 36 | probabilities = F.softmax(self.policy.forward(state)) 37 | action_probs = T.distributions.Categorical(probabilities) 38 | action = action_probs.sample() 39 | log_probs = action_probs.log_prob(action) 40 | self.action_memory.append(log_probs) 41 | 42 | return action.item() 43 | 44 | def store_rewards(self, reward): 45 | self.reward_memory.append(reward) 46 | 47 | def learn(self): 48 | self.policy.optimizer.zero_grad() 49 | 50 | # G_t = R_t+1 + gamma * R_t+2 + gamma**2 * R_t+3 51 | # G_t = sum from k=0 to k=T {gamma**k * R_t+k+1} 52 | G = np.zeros_like(self.reward_memory, dtype=np.float64) 53 | for t in range(len(self.reward_memory)): 54 | G_sum = 0 55 | discount = 1 56 | for k in range(t, len(self.reward_memory)): 57 | G_sum += self.reward_memory[k] * discount 58 | discount *= self.gamma 59 | G[t] = G_sum 60 | G = T.tensor(G, dtype=T.float).to(self.policy.device) 61 | 62 | loss = 0 63 | for g, logprob in zip(G, self.action_memory): 64 | loss += -g * logprob 65 | loss.backward() 66 | self.policy.optimizer.step() 67 | 68 | self.action_memory = [] 69 | self.reward_memory = [] 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /Reinforce/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from networks import PolicyNetwork 4 | import tensorflow_probability as tfp 5 | from tensorflow.keras.optimizers import Adam 6 | import numpy as np 7 | 8 | 9 | class Agent: 10 | def __init__(self, alpha=0.003, gamma=0.99, n_actions=4, 11 | fc1_dims=256, fc2_dims=256, chkpt_dir='models/'): 12 | 13 | self.gamma = gamma 14 | self.lr = alpha 15 | self.n_actions = n_actions 16 | self.chkpt_dir = chkpt_dir 17 | self.state_memory = [] 18 | self.action_memory = [] 19 | self.reward_memory = [] 20 | self.policy = PolicyNetwork(n_actions=n_actions, fc1_dims=fc1_dims, 21 | fc2_dims=fc2_dims) 22 | self.policy.compile(optimizer=Adam(learning_rate=self.lr)) 23 | 24 | def save_models(self): 25 | print('... saving models ...') 26 | self.policy.save(self.chkpt_dir+'reinforce') 27 | 28 | def load_models(self): 29 | print('... 
loading models ...') 30 | self.policy = keras.models.load_model(self.chkpt_dir+'reinforce') 31 | 32 | def choose_action(self, observation): 33 | state = tf.convert_to_tensor([observation], dtype=tf.float32) 34 | probs = self.policy(state) 35 | action_probs = tfp.distributions.Categorical(probs=probs) 36 | action = action_probs.sample() 37 | 38 | return action.numpy()[0] 39 | 40 | def store_transition(self, observation, action, reward): 41 | self.state_memory.append(observation) 42 | self.action_memory.append(action) 43 | self.reward_memory.append(reward) 44 | 45 | def learn(self): 46 | actions = tf.convert_to_tensor(self.action_memory, dtype=tf.float32) 47 | rewards = np.array(self.reward_memory) 48 | 49 | G = np.zeros_like(rewards) 50 | for t in range(len(rewards)): 51 | G_sum = 0 52 | discount = 1 53 | for k in range(t, len(rewards)): 54 | G_sum += rewards[k] * discount 55 | discount *= self.gamma 56 | G[t] = G_sum 57 | 58 | with tf.GradientTape() as tape: 59 | loss = 0 60 | for idx, (g, state) in enumerate(zip(G, self.state_memory)): 61 | state = tf.convert_to_tensor([state], dtype=tf.float32) 62 | probs = self.policy(state) 63 | action_probs = tfp.distributions.Categorical(probs=probs) 64 | log_prob = action_probs.log_prob(actions[idx]) 65 | loss += -g * tf.squeeze(log_prob) 66 | params = self.policy.trainable_variables 67 | grads = tape.gradient(loss, params) 68 | self.policy.optimizer.apply_gradients(zip(grads, params)) 69 | 70 | self.state_memory = [] 71 | self.action_memory = [] 72 | self.reward_memory = [] 73 | -------------------------------------------------------------------------------- /Reinforce/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve, manage_memory 5 | # if you have more than 1 gpu, use device '0' or '1' to assign to a gpu 6 | # import os 7 | # os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 8 | # os.environ['CUDA_VISIBLE_DEVICES'] = '0' 9 | 10 | 11 | if __name__ == '__main__': 12 | manage_memory() 13 | best_score = -np.inf 14 | env = gym.make('LunarLander-v2') 15 | agent = Agent(alpha=0.0005, gamma=0.99, n_actions=env.action_space.n) 16 | load_checkpoint = False 17 | if load_checkpoint: 18 | agent.load_models() 19 | 20 | num_episodes = 1000 21 | score_history = [] 22 | 23 | for i in range(num_episodes): 24 | done = False 25 | score = 0 26 | observation = env.reset() 27 | while not done: 28 | action = agent.choose_action(observation) 29 | observation_, reward, done, info = env.step(action) 30 | agent.store_transition(observation, action, reward) 31 | observation = observation_ 32 | score += reward 33 | score_history.append(score) 34 | 35 | if not load_checkpoint: 36 | agent.learn() 37 | avg_score = np.mean(score_history[-100:]) 38 | if avg_score > best_score: 39 | if not load_checkpoint: 40 | agent.save_models() 41 | best_score = score 42 | 43 | print('episode {} score {:.1f} avg score {:.1f}'. 
44 | format(i, score, avg_score)) 45 | 46 | filename = 'plots/lunar-lander.png' 47 | x = [i for i in range(num_episodes)] 48 | plot_learning_curve(x, score_history, filename) 49 | -------------------------------------------------------------------------------- /Reinforce/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow.keras as keras 2 | from tensorflow.keras.layers import Dense 3 | 4 | 5 | class PolicyNetwork(keras.Model): 6 | def __init__(self, n_actions, fc1_dims=256, fc2_dims=256): 7 | super(PolicyNetwork, self).__init__() 8 | self.fc1_dims = fc1_dims 9 | self.fc2_dims = fc2_dims 10 | 11 | self.fc1 = Dense(self.fc1_dims, activation='relu') 12 | self.fc2 = Dense(self.fc2_dims, activation='relu') 13 | self.pi = Dense(n_actions, activation='softmax') 14 | 15 | def call(self, state): 16 | value = self.fc1(state) 17 | value = self.fc2(value) 18 | 19 | pi = self.pi(value) 20 | 21 | return pi 22 | -------------------------------------------------------------------------------- /Reinforce/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import tensorflow as tf 4 | 5 | 6 | def manage_memory(): 7 | gpus = tf.config.list_physical_devices('GPU') 8 | if gpus: 9 | try: 10 | for gpu in gpus: 11 | tf.config.experimental.set_memory_growth(gpu, True) 12 | except RuntimeError as e: 13 | print(e) 14 | 15 | 16 | def plot_learning_curve(x, scores, figure_file): 17 | running_avg = np.zeros(len(scores)) 18 | for i in range(len(running_avg)): 19 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 20 | plt.plot(x, running_avg) 21 | plt.title('Running average of previous 100 scores') 22 | plt.savefig(figure_file) 23 | -------------------------------------------------------------------------------- /SAC/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | self.state_memory[index] = state 16 | self.action_memory[index] = action 17 | self.reward_memory[index] = reward 18 | self.new_state_memory[index] = state_ 19 | self.terminal_memory[index] = done 20 | 21 | self.mem_cntr += 1 22 | 23 | def sample_buffer(self, batch_size): 24 | max_mem = min(self.mem_cntr, self.mem_size) 25 | 26 | batch = np.random.choice(max_mem, batch_size) 27 | 28 | states = self.state_memory[batch] 29 | actions = self.action_memory[batch] 30 | rewards = self.reward_memory[batch] 31 | states_ = self.new_state_memory[batch] 32 | dones = self.terminal_memory[batch] 33 | 34 | return states, actions, rewards, states_, dones 35 | 36 | 37 | -------------------------------------------------------------------------------- /SAC/main_sac.py: -------------------------------------------------------------------------------- 1 | # the following 3 lines are helpful if you have multiple GPUs and want to train 2 | # agents on multiple GPUs. 
I do this frequently when testing. 3 | #import os 4 | #os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' 5 | #os.environ['CUDA_VISIBLE_DEVICES'] = '1' 6 | import pybullet_envs 7 | import gym 8 | import numpy as np 9 | from sac_torch import Agent 10 | from utils import plot_learning_curve 11 | import numpy as np 12 | 13 | if __name__ == '__main__': 14 | #env_id = 'LunarLanderContinuous-v2' 15 | #env_id = 'BipedalWalker-v2' 16 | #env_id = 'AntBulletEnv-v0' 17 | env_id = 'InvertedPendulumBulletEnv-v0' 18 | #env_id = 'CartPoleContinuousBulletEnv-v0' 19 | env = gym.make(env_id) 20 | agent = Agent(alpha=0.0003, beta=0.0003, reward_scale=2, env_id=env_id, 21 | input_dims=env.observation_space.shape, tau=0.005, 22 | env=env, batch_size=256, layer1_size=256, layer2_size=256, 23 | n_actions=env.action_space.shape[0]) 24 | n_games = 250 25 | filename = env_id + '_'+ str(n_games) + 'games_scale' + str(agent.scale) + \ 26 | '_clamp_on_sigma.png' 27 | figure_file = 'plots/' + filename 28 | 29 | best_score = env.reward_range[0] 30 | score_history = [] 31 | load_checkpoint = True 32 | if load_checkpoint: 33 | agent.load_models() 34 | env.render(mode='human') 35 | steps = 0 36 | for i in range(n_games): 37 | observation = env.reset() 38 | done = False 39 | score = 0 40 | while not done: 41 | action = agent.choose_action(observation) 42 | observation_, reward, done, info = env.step(action) 43 | steps += 1 44 | agent.remember(observation, action, reward, observation_, done) 45 | if not load_checkpoint: 46 | agent.learn() 47 | score += reward 48 | observation = observation_ 49 | score_history.append(score) 50 | avg_score = np.mean(score_history[-100:]) 51 | 52 | if avg_score > best_score: 53 | best_score = avg_score 54 | if not load_checkpoint: 55 | agent.save_models() 56 | 57 | print('episode ', i, 'score %.1f' % score, 58 | 'trailing 100 games avg %.1f' % avg_score, 59 | 'steps %d' % steps, env_id, 60 | ' scale ', agent.scale) 61 | if not load_checkpoint: 62 | x = [i+1 for i in range(n_games)] 63 | plot_learning_curve(x, score_history, figure_file) 64 | -------------------------------------------------------------------------------- /SAC/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions.normal import Normal 7 | import numpy as np 8 | 9 | class CriticNetwork(nn.Module): 10 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, 11 | name, chkpt_dir='tmp/sac'): 12 | super(CriticNetwork, self).__init__() 13 | self.input_dims = input_dims 14 | self.fc1_dims = fc1_dims 15 | self.fc2_dims = fc2_dims 16 | self.n_actions = n_actions 17 | self.name = name 18 | self.checkpoint_dir = chkpt_dir 19 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 20 | 21 | # I think this breaks if the env has a 2D state representation 22 | self.fc1 = nn.Linear(self.input_dims[0] + n_actions, self.fc1_dims) 23 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 24 | self.q1 = nn.Linear(self.fc2_dims, 1) 25 | 26 | self.optimizer = optim.Adam(self.parameters(), lr=beta) 27 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 28 | 29 | self.to(self.device) 30 | 31 | def forward(self, state, action): 32 | q1_action_value = self.fc1(T.cat([state, action], dim=1)) 33 | q1_action_value = F.relu(q1_action_value) 34 | q1_action_value = self.fc2(q1_action_value) 35 | q1_action_value = 
F.relu(q1_action_value) 36 | 37 | q1 = self.q1(q1_action_value) 38 | 39 | return q1 40 | 41 | def save_checkpoint(self): 42 | T.save(self.state_dict(), self.checkpoint_file) 43 | 44 | def load_checkpoint(self): 45 | self.load_state_dict(T.load(self.checkpoint_file)) 46 | 47 | class ActorNetwork(nn.Module): 48 | def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, max_action, 49 | n_actions, name, chkpt_dir='tmp/sac'): 50 | super(ActorNetwork, self).__init__() 51 | self.input_dims = input_dims 52 | self.fc1_dims = fc1_dims 53 | self.fc2_dims = fc2_dims 54 | self.n_actions = n_actions 55 | self.name = name 56 | self.max_action = max_action 57 | self.checkpoint_dir = chkpt_dir 58 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 59 | self.reparam_noise = 1e-6 60 | 61 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 62 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 63 | self.mu = nn.Linear(self.fc2_dims, self.n_actions) 64 | self.sigma = nn.Linear(self.fc2_dims, self.n_actions) 65 | 66 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 67 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 68 | 69 | self.to(self.device) 70 | 71 | def forward(self, state): 72 | prob = self.fc1(state) 73 | prob = F.relu(prob) 74 | prob = self.fc2(prob) 75 | prob = F.relu(prob) 76 | 77 | mu = self.mu(prob) 78 | #sigma = T.sigmoid(self.sigma(prob)) 79 | sigma = self.sigma(prob) 80 | sigma = T.clamp(sigma, min=self.reparam_noise, max=1) 81 | # authors use -20, 2 -> doesn't seem to work for my implementation 82 | 83 | return mu, sigma 84 | 85 | def sample_normal(self, state, reparameterize=True): 86 | mu, sigma = self.forward(state) 87 | probabilities = T.distributions.Normal(mu, sigma) 88 | 89 | if reparameterize: 90 | actions = probabilities.rsample() # reparameterizes the policy 91 | else: 92 | actions = probabilities.sample() 93 | 94 | action = T.tanh(actions)*T.tensor(self.max_action).to(self.device) 95 | log_probs = probabilities.log_prob(actions) 96 | log_probs -= T.log(1-action.pow(2) + self.reparam_noise) 97 | log_probs = log_probs.sum(1, keepdim=True) 98 | 99 | return action, log_probs 100 | 101 | def sample_mvnormal(self, state, reparameterize=True): 102 | """ 103 | Doesn't quite seem to work. The agent never learns. 
104 | """ 105 | mu, sigma = self.forward(state) 106 | n_batches = sigma.size()[0] 107 | 108 | cov = [sigma[i] * T.eye(self.n_actions).to(self.device) for i in range(n_batches)] 109 | cov = T.stack(cov) 110 | probabilities = T.distributions.MultivariateNormal(mu, cov) 111 | 112 | if reparameterize: 113 | actions = probabilities.rsample() # reparameterizes the policy 114 | else: 115 | actions = probabilities.sample() 116 | 117 | action = T.tanh(actions) # enforce the action bound for (-1, 1) 118 | log_probs = probabilities.log_prob(actions) 119 | log_probs -= T.sum(T.log(1-action.pow(2) + self.reparam_noise)) 120 | log_probs = log_probs.sum(-1, keepdim=True) 121 | 122 | return action, log_probs 123 | 124 | def save_checkpoint(self): 125 | T.save(self.state_dict(), self.checkpoint_file) 126 | 127 | def load_checkpoint(self): 128 | self.load_state_dict(T.load(self.checkpoint_file)) 129 | 130 | class ValueNetwork(nn.Module): 131 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, 132 | name, chkpt_dir='tmp/sac'): 133 | super(ValueNetwork, self).__init__() 134 | self.input_dims = input_dims 135 | self.fc1_dims = fc1_dims 136 | self.fc2_dims = fc2_dims 137 | self.name = name 138 | self.checkpoint_dir = chkpt_dir 139 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_sac') 140 | 141 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 142 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 143 | self.v = nn.Linear(self.fc2_dims, 1) 144 | 145 | self.optimizer = optim.Adam(self.parameters(), lr=beta) 146 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 147 | 148 | self.to(self.device) 149 | 150 | def forward(self, state): 151 | state_value = self.fc1(state) 152 | state_value = F.relu(state_value) 153 | state_value = self.fc2(state_value) 154 | state_value = F.relu(state_value) 155 | 156 | v = self.v(state_value) 157 | 158 | return v 159 | 160 | def save_checkpoint(self): 161 | T.save(self.state_dict(), self.checkpoint_file) 162 | 163 | def load_checkpoint(self): 164 | self.load_state_dict(T.load(self.checkpoint_file)) 165 | 166 | -------------------------------------------------------------------------------- /SAC/sac_torch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from buffer import ReplayBuffer 6 | from networks import ActorNetwork, CriticNetwork, ValueNetwork 7 | 8 | class Agent(): 9 | def __init__(self, alpha, beta, input_dims, tau, env, 10 | env_id, gamma=0.99, 11 | n_actions=2, max_size=1000000, layer1_size=256, 12 | layer2_size=256, batch_size=100, reward_scale=2): 13 | self.gamma = gamma 14 | self.tau = tau 15 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 16 | self.batch_size = batch_size 17 | self.n_actions = n_actions 18 | 19 | self.actor = ActorNetwork(alpha, input_dims, layer1_size, 20 | layer2_size, n_actions=n_actions, 21 | name=env_id+'_actor', 22 | max_action=env.action_space.high) 23 | self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, 24 | layer2_size, n_actions=n_actions, 25 | name=env_id+'_critic_1') 26 | self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, 27 | layer2_size, n_actions=n_actions, 28 | name=env_id+'_critic_2') 29 | 30 | self.value = ValueNetwork(beta, input_dims, layer1_size, 31 | layer2_size, name=env_id+'_value') 32 | self.target_value = ValueNetwork(beta, input_dims, layer1_size, 33 | layer2_size, name=env_id+'_target_value') 34 | 35 | 
self.scale = reward_scale 36 | self.update_network_parameters(tau=1) 37 | 38 | def choose_action(self, observation): 39 | state = T.Tensor([observation]).to(self.actor.device) 40 | actions, _ = self.actor.sample_normal(state, reparameterize=False) 41 | #actions, _ = self.actor.sample_mvnormal(state) 42 | # actions is an array of arrays due to the added dimension in state 43 | return actions.cpu().detach().numpy()[0] 44 | 45 | def remember(self, state, action, reward, new_state, done): 46 | self.memory.store_transition(state, action, reward, new_state, done) 47 | 48 | def learn(self): 49 | if self.memory.mem_cntr < self.batch_size: 50 | return 51 | 52 | state, action, reward, new_state, done = \ 53 | self.memory.sample_buffer(self.batch_size) 54 | 55 | reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device) 56 | done = T.tensor(done).to(self.critic_1.device) 57 | state_ = T.tensor(new_state, dtype=T.float).to(self.critic_1.device) 58 | state = T.tensor(state, dtype=T.float).to(self.critic_1.device) 59 | action = T.tensor(action, dtype=T.float).to(self.critic_1.device) 60 | 61 | value = self.value(state).view(-1) 62 | value_ = self.target_value(state_).view(-1) 63 | value_[done] = 0.0 64 | 65 | actions, log_probs = self.actor.sample_normal(state, reparameterize=False) 66 | #actions, log_probs = self.actor.sample_mvnormal(state, reparameterize=False) 67 | log_probs = log_probs.view(-1) 68 | q1_new_policy = self.critic_1.forward(state, actions) 69 | q2_new_policy = self.critic_2.forward(state, actions) 70 | critic_value = T.min(q1_new_policy, q2_new_policy) 71 | critic_value = critic_value.view(-1) 72 | 73 | self.value.optimizer.zero_grad() 74 | value_target = critic_value - log_probs 75 | value_loss = 0.5 * (F.mse_loss(value, value_target)) 76 | value_loss.backward(retain_graph=True) 77 | self.value.optimizer.step() 78 | 79 | actions, log_probs = self.actor.sample_normal(state, reparameterize=True) 80 | #actions, log_probs = self.actor.sample_mvnormal(state, reparameterize=False) 81 | log_probs = log_probs.view(-1) 82 | q1_new_policy = self.critic_1.forward(state, actions) 83 | q2_new_policy = self.critic_2.forward(state, actions) 84 | critic_value = T.min(q1_new_policy, q2_new_policy) 85 | critic_value = critic_value.view(-1) 86 | 87 | actor_loss = log_probs - critic_value 88 | actor_loss = T.mean(actor_loss) 89 | self.actor.optimizer.zero_grad() 90 | actor_loss.backward(retain_graph=True) 91 | self.actor.optimizer.step() 92 | 93 | self.critic_1.optimizer.zero_grad() 94 | self.critic_2.optimizer.zero_grad() 95 | q_hat = self.scale*reward + self.gamma*value_ 96 | q1_old_policy = self.critic_1.forward(state, action).view(-1) 97 | q2_old_policy = self.critic_2.forward(state, action).view(-1) 98 | critic_1_loss = 0.5*F.mse_loss(q1_old_policy, q_hat) 99 | critic_2_loss = 0.5*F.mse_loss(q2_old_policy, q_hat) 100 | 101 | critic_loss = critic_1_loss + critic_2_loss 102 | critic_loss.backward() 103 | self.critic_1.optimizer.step() 104 | self.critic_2.optimizer.step() 105 | self.update_network_parameters() 106 | 107 | def update_network_parameters(self, tau=None): 108 | if tau is None: 109 | tau = self.tau 110 | 111 | target_value_params = self.target_value.named_parameters() 112 | value_params = self.value.named_parameters() 113 | 114 | target_value_state_dict = dict(target_value_params) 115 | value_state_dict = dict(value_params) 116 | 117 | for name in value_state_dict: 118 | value_state_dict[name] = tau*value_state_dict[name].clone() + \ 119 | 
(1-tau)*target_value_state_dict[name].clone() 120 | 121 | self.target_value.load_state_dict(value_state_dict) 122 | 123 | def save_models(self): 124 | print('.... saving models ....') 125 | self.actor.save_checkpoint() 126 | self.value.save_checkpoint() 127 | self.target_value.save_checkpoint() 128 | self.critic_1.save_checkpoint() 129 | self.critic_2.save_checkpoint() 130 | 131 | def load_models(self): 132 | print('.... loading models ....') 133 | self.actor.load_checkpoint() 134 | self.value.load_checkpoint() 135 | self.target_value.load_checkpoint() 136 | self.critic_1.load_checkpoint() 137 | self.critic_2.load_checkpoint() 138 | 139 | 140 | -------------------------------------------------------------------------------- /SAC/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from tensorflow.keras.optimizers import Adam 4 | import tensorflow_probability as tfp 5 | from buffer import ReplayBuffer 6 | from networks import ActorNetwork, CriticNetwork, ValueNetwork 7 | 8 | 9 | class Agent: 10 | def __init__(self, alpha=0.0003, beta=0.0003, input_dims=[8], 11 | env=None, gamma=0.99, n_actions=2, max_size=1000000, 12 | tau=0.005, layer1_size=256, layer2_size=256, 13 | batch_size=256, reward_scale=2, chkpt_dir='models/'): 14 | self.gamma = gamma 15 | self.tau = tau 16 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 17 | self.batch_size = batch_size 18 | self.n_actions = n_actions 19 | self.fname = chkpt_dir + 'SAC/' 20 | self.actor = ActorNetwork(n_actions=n_actions, 21 | max_action=env.action_space.high) 22 | self.critic_1 = CriticNetwork() 23 | self.critic_2 = CriticNetwork() 24 | self.value = ValueNetwork() 25 | self.target_value = ValueNetwork() 26 | 27 | self.actor.compile(optimizer=Adam(learning_rate=alpha)) 28 | self.critic_1.compile(optimizer=Adam(learning_rate=beta)) 29 | self.critic_2.compile(optimizer=Adam(learning_rate=beta)) 30 | self.value.compile(optimizer=Adam(learning_rate=beta)) 31 | self.target_value.compile(optimizer=Adam(learning_rate=beta)) 32 | 33 | self.scale = reward_scale 34 | self.update_network_parameters(tau=1) 35 | 36 | def save_models(self): 37 | # for some environments we can try to save the model before 38 | # actually calling the learn function. This means we have an empty 39 | # graph, and TF2 will throw an error 40 | if self.memory.mem_cntr > self.batch_size: 41 | print('... saving models ...') 42 | self.actor.save(self.fname+'actor') 43 | self.critic_1.save(self.fname+'critic_1') 44 | self.critic_2.save(self.fname+'critic_2') 45 | self.value.save(self.fname+'value') 46 | self.target_value.save(self.fname+'target_value') 47 | 48 | def load_models(self): 49 | print('... 
loading models ...') 50 | self.actor = keras.models.load_model(self.fname+'actor') 51 | self.critic_1 = keras.models.load_model(self.fname+'critic_1') 52 | self.critic_2 = keras.models.load_model(self.fname+'critic_2') 53 | self.value = keras.models.load_model(self.fname+'value') 54 | self.target_value = keras.models.load_model(self.fname+'target_value') 55 | 56 | def sample_normal(self, state): 57 | mu, sigma = self.actor(state) 58 | probabilities = tfp.distributions.Normal(mu, sigma) 59 | actions = probabilities.sample() # + something else 60 | action = tf.math.tanh(actions)*self.actor.max_action 61 | log_probs = probabilities.log_prob(actions) 62 | log_probs -= tf.math.log(1-tf.math.pow(action, 2)+self.actor.noise) 63 | log_probs = tf.math.reduce_sum(log_probs, axis=1, keepdims=True) 64 | 65 | return action, log_probs 66 | 67 | def choose_action(self, observation): 68 | state = tf.convert_to_tensor([observation]) 69 | # actions, _ = self.actor.sample_normal(state) # reparameterize=False) 70 | actions, _ = self.sample_normal(state) 71 | 72 | return actions[0] 73 | 74 | def store_transition(self, state, action, reward, new_state, done): 75 | self.memory.store_transition(state, action, reward, new_state, done) 76 | 77 | def update_network_parameters(self, tau=None): 78 | if tau is None: 79 | tau = self.tau 80 | 81 | weights = [] 82 | targets = self.target_value.weights 83 | for i, weight in enumerate(self.value.weights): 84 | weights.append(weight * tau + targets[i]*(1-tau)) 85 | 86 | self.target_value.set_weights(weights) 87 | 88 | def learn(self): 89 | if self.memory.mem_cntr < self.batch_size: 90 | return 91 | 92 | state, action, reward, new_state, done = \ 93 | self.memory.sample_buffer(self.batch_size) 94 | 95 | states = tf.convert_to_tensor(state, dtype=tf.float32) 96 | states_ = tf.convert_to_tensor(new_state, dtype=tf.float32) 97 | rewards = tf.convert_to_tensor(reward, dtype=tf.float32) 98 | actions = tf.convert_to_tensor(action, dtype=tf.float32) 99 | 100 | with tf.GradientTape() as tape: 101 | value = tf.squeeze(self.value(states), 1) 102 | 103 | current_policy_actions, log_probs = self.sample_normal(states) 104 | # self.actor.sample_normal(states) # reparameterize=False) 105 | log_probs = tf.squeeze(log_probs, 1) 106 | q1_new_pi = self.critic_1((states, current_policy_actions)) 107 | q2_new_pi = self.critic_2((states, current_policy_actions)) 108 | critic_value = tf.squeeze( 109 | tf.math.minimum(q1_new_pi, q2_new_pi), 1) 110 | 111 | value_target = critic_value - log_probs 112 | value_loss = 0.5 * keras.losses.MSE(value, value_target) 113 | params = self.value.trainable_variables 114 | grads = tape.gradient(value_loss, params) 115 | self.value.optimizer.apply_gradients(zip(grads, params)) 116 | 117 | with tf.GradientTape() as tape: 118 | # in the original paper, they reparameterize here. We don't 119 | # so it's just the usual action. 
120 | new_policy_actions, log_probs = self.sample_normal(states) 121 | # self.actor.sample_normal(states) # reparameterize=True) 122 | log_probs = tf.squeeze(log_probs, 1) 123 | q1_new_policy = self.critic_1((states, new_policy_actions)) 124 | q2_new_policy = self.critic_2((states, new_policy_actions)) 125 | critic_value = tf.squeeze(tf.math.minimum( 126 | q1_new_policy, q2_new_policy), 1) 127 | actor_loss = log_probs - critic_value 128 | actor_loss = tf.math.reduce_mean(actor_loss) 129 | params = self.actor.trainable_variables 130 | grads = tape.gradient(actor_loss, params) 131 | self.actor.optimizer.apply_gradients(zip(grads, params)) 132 | 133 | with tf.GradientTape(persistent=True) as tape: 134 | # I didn't know that these context managers shared values? 135 | value_ = tf.squeeze(self.target_value(states_), 1) 136 | q_hat = self.scale*rewards + self.gamma*value_*(1-done) 137 | q1_old_policy = tf.squeeze(self.critic_1((states, actions)), 1) 138 | q2_old_policy = tf.squeeze(self.critic_2((states, actions)), 1) 139 | critic_1_loss = 0.5 * keras.losses.MSE(q1_old_policy, q_hat) 140 | critic_2_loss = 0.5 * keras.losses.MSE(q2_old_policy, q_hat) 141 | params_1 = self.critic_1.trainable_variables 142 | params_2 = self.critic_2.trainable_variables 143 | grads_1 = tape.gradient(critic_1_loss, params_1) 144 | grads_2 = tape.gradient(critic_2_loss, params_2) 145 | 146 | self.critic_1.optimizer.apply_gradients(zip(grads_1, params_1)) 147 | self.critic_2.optimizer.apply_gradients(zip(grads_2, params_2)) 148 | 149 | self.update_network_parameters() 150 | -------------------------------------------------------------------------------- /SAC/tf2/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer: 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | 16 | self.state_memory[index] = state 17 | self.new_state_memory[index] = state_ 18 | self.action_memory[index] = action 19 | self.reward_memory[index] = reward 20 | self.terminal_memory[index] = done 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size) 28 | 29 | states = self.state_memory[batch] 30 | states_ = self.new_state_memory[batch] 31 | actions = self.action_memory[batch] 32 | rewards = self.reward_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | -------------------------------------------------------------------------------- /SAC/tf2/main.py: -------------------------------------------------------------------------------- 1 | import pybullet_envs 2 | import gym 3 | import numpy as np 4 | from agent import Agent 5 | from utils import plot_learning_curve, manage_memory 6 | from gym import wrappers 7 | 8 | if __name__ == '__main__': 9 | manage_memory() 10 | env = gym.make('InvertedPendulumBulletEnv-v0') 11 | agent = Agent(input_dims=env.observation_space.shape, env=env, 12 | 
n_actions=env.action_space.shape[0]) 13 | n_games = 250 14 | render_video = False 15 | 16 | # do a mkdir video if you want to record video of the agent playing. 17 | if render_video: 18 | env = wrappers.Monitor(env, 'video', 19 | video_callable=lambda episode_id: True, 20 | force=True) 21 | filename = 'inverted_pendulum.png' 22 | 23 | figure_file = 'plots/' + filename 24 | 25 | best_score = env.reward_range[0] 26 | score_history = [] 27 | load_checkpoint = False 28 | 29 | if load_checkpoint: 30 | agent.load_models() 31 | env.render(mode='human') 32 | 33 | for i in range(n_games): 34 | observation = env.reset() 35 | done = False 36 | score = 0 37 | while not done: 38 | action = agent.choose_action(observation) 39 | observation_, reward, done, info = env.step(action) 40 | score += reward 41 | agent.store_transition(observation, action, reward, 42 | observation_, done) 43 | if not load_checkpoint: 44 | agent.learn() 45 | observation = observation_ 46 | score_history.append(score) 47 | avg_score = np.mean(score_history[-100:]) 48 | 49 | if avg_score > best_score: 50 | best_score = avg_score 51 | if not load_checkpoint: 52 | agent.save_models() 53 | print('episode {} score {:.1f} avg_score {:.1f}'. 54 | format(i, score, avg_score)) 55 | 56 | if not load_checkpoint: 57 | x = [i+1 for i in range(n_games)] 58 | plot_learning_curve(x, score_history, figure_file) 59 | -------------------------------------------------------------------------------- /SAC/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | # import tensorflow_probability as tfp 4 | from tensorflow.keras.layers import Dense 5 | 6 | 7 | class CriticNetwork(keras.Model): 8 | def __init__(self, fc1_dims=256, fc2_dims=256): 9 | super(CriticNetwork, self).__init__() 10 | self.fc1_dims = fc1_dims 11 | self.fc2_dims = fc2_dims 12 | 13 | self.fc1 = Dense(self.fc1_dims, activation='relu') 14 | self.fc2 = Dense(self.fc2_dims, activation='relu') 15 | self.q = Dense(1, activation=None) 16 | 17 | def call(self, inputs): 18 | state, action = inputs 19 | action_value = self.fc1(tf.concat([state, action], axis=1)) 20 | action_value = self.fc2(action_value) 21 | 22 | q = self.q(action_value) 23 | 24 | return q 25 | 26 | 27 | class ValueNetwork(keras.Model): 28 | def __init__(self, fc1_dims=256, fc2_dims=256): 29 | super(ValueNetwork, self).__init__() 30 | self.fc1_dims = fc1_dims 31 | self.fc2_dims = fc2_dims 32 | 33 | self.fc1 = Dense(self.fc1_dims, activation='relu') 34 | self.fc2 = Dense(fc2_dims, activation='relu') 35 | self.v = Dense(1, activation=None) 36 | 37 | def call(self, state): 38 | state_value = self.fc1(state) 39 | state_value = self.fc2(state_value) 40 | 41 | v = self.v(state_value) 42 | 43 | return v 44 | 45 | 46 | class ActorNetwork(keras.Model): 47 | def __init__(self, max_action, fc1_dims=256, fc2_dims=256, n_actions=2): 48 | super(ActorNetwork, self).__init__() 49 | self.fc1_dims = fc1_dims 50 | self.fc2_dims = fc2_dims 51 | self.n_actions = n_actions 52 | self.max_action = max_action 53 | self.noise = 1e-6 54 | 55 | self.fc1 = Dense(self.fc1_dims, activation='relu') 56 | self.fc2 = Dense(self.fc2_dims, activation='relu') 57 | self.mu = Dense(self.n_actions, activation=None) 58 | self.sigma = Dense(self.n_actions, activation=None) 59 | 60 | def call(self, state): 61 | prob = self.fc1(state) 62 | prob = self.fc2(prob) 63 | 64 | mu = self.mu(prob) 65 | sigma = self.sigma(prob) 66 | # might want to come back and change 
this, 67 | # perhaps tf plays more nicely with a sigma of ~0 68 | sigma = tf.clip_by_value(sigma, self.noise, 1) 69 | 70 | return mu, sigma 71 | """ 72 | def sample_normal(self, state, reparameterize=True): 73 | mu, sigma = self.call(state) 74 | probabilities = tfp.distributions.Normal(mu, sigma) 75 | 76 | if reparameterize: 77 | actions = probabilities.sample() # + something else 78 | else: 79 | actions = probabilities.sample() 80 | 81 | action = tf.math.tanh(actions)*self.max_action 82 | log_probs = probabilities.log_prob(actions) 83 | log_probs -= tf.math.log(1-tf.math.pow(action, 2)+self.noise) 84 | log_probs = tf.math.reduce_sum(log_probs, axis=1, keepdims=True) 85 | 86 | return action, log_probs 87 | 88 | def sample_normal(self, state): 89 | mu, sigma = self.call(state) 90 | probabilities = tfp.distributions.Normal(mu, sigma) 91 | 92 | actions = probabilities.sample() # + something else 93 | 94 | action = tf.math.tanh(actions)*self.max_action 95 | log_probs = probabilities.log_prob(actions) 96 | log_probs -= tf.math.log(1-tf.math.pow(action, 2)+self.noise) 97 | log_probs = tf.math.reduce_sum(log_probs, axis=1, keepdims=True) 98 | 99 | return action, log_probs 100 | """ 101 | -------------------------------------------------------------------------------- /SAC/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import tensorflow as tf 4 | 5 | 6 | def manage_memory(): 7 | gpus = tf.config.list_physical_devices('GPU') 8 | if gpus: 9 | try: 10 | for gpu in gpus: 11 | tf.config.experimental.set_memory_growth(gpu, True) 12 | except RuntimeError as e: 13 | print(e) 14 | 15 | 16 | def plot_learning_curve(x, scores, figure_file): 17 | running_avg = np.zeros(len(scores)) 18 | for i in range(len(running_avg)): 19 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 20 | plt.plot(x, running_avg) 21 | plt.title('Running average of previous 100 scores') 22 | plt.savefig(figure_file) 23 | -------------------------------------------------------------------------------- /SAC/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | -------------------------------------------------------------------------------- /TD3/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(): 4 | def __init__(self, max_size, input_shape, n_actions): 5 | self.mem_size = max_size 6 | self.mem_cntr = 0 7 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 8 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.action_memory = np.zeros((self.mem_size, n_actions)) 10 | self.reward_memory = np.zeros(self.mem_size) 11 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 12 | 13 | def store_transition(self, state, action, reward, state_, done): 14 | index = self.mem_cntr % self.mem_size 15 | self.state_memory[index] = state 16 | self.action_memory[index] = action 17 | self.reward_memory[index] = reward 18 | self.new_state_memory[index] = state_ 19 | self.terminal_memory[index] = done 20 | 21 | 
self.mem_cntr += 1 22 | 23 | def sample_buffer(self, batch_size): 24 | max_mem = min(self.mem_cntr, self.mem_size) 25 | 26 | batch = np.random.choice(max_mem, batch_size) 27 | 28 | states = self.state_memory[batch] 29 | actions = self.action_memory[batch] 30 | rewards = self.reward_memory[batch] 31 | states_ = self.new_state_memory[batch] 32 | dones = self.terminal_memory[batch] 33 | 34 | return states, actions, rewards, states_, dones 35 | 36 | 37 | -------------------------------------------------------------------------------- /TD3/main_td3.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from td3_torch import Agent 4 | from utils import plot_learning_curve 5 | 6 | if __name__ == '__main__': 7 | env = gym.make('BipedalWalker-v2') 8 | #env = gym.make('LunarLanderContinuous-v2') 9 | agent = Agent(alpha=0.001, beta=0.001, 10 | input_dims=env.observation_space.shape, tau=0.005, 11 | env=env, batch_size=100, layer1_size=400, layer2_size=300, 12 | n_actions=env.action_space.shape[0]) 13 | n_games = 1500 14 | filename = 'Walker2d_' + str(n_games) + '_2.png' 15 | figure_file = 'plots/' + filename 16 | 17 | best_score = env.reward_range[0] 18 | score_history = [] 19 | 20 | #agent.load_models() 21 | 22 | for i in range(n_games): 23 | observation = env.reset() 24 | done = False 25 | score = 0 26 | while not done: 27 | action = agent.choose_action(observation) 28 | observation_, reward, done, info = env.step(action) 29 | agent.remember(observation, action, reward, observation_, done) 30 | agent.learn() 31 | score += reward 32 | observation = observation_ 33 | score_history.append(score) 34 | avg_score = np.mean(score_history[-100:]) 35 | 36 | if avg_score > best_score: 37 | best_score = avg_score 38 | agent.save_models() 39 | 40 | print('episode ', i, 'score %.2f' % score, 41 | 'trailing 100 games avg %.3f' % avg_score) 42 | 43 | x = [i+1 for i in range(n_games)] 44 | plot_learning_curve(x, score_history, figure_file) 45 | -------------------------------------------------------------------------------- /TD3/networks.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | import numpy as np 7 | 8 | class CriticNetwork(nn.Module): 9 | def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, 10 | name, chkpt_dir='tmp/td3'): 11 | super(CriticNetwork, self).__init__() 12 | self.input_dims = input_dims 13 | self.fc1_dims = fc1_dims 14 | self.fc2_dims = fc2_dims 15 | self.n_actions = n_actions 16 | self.name = name 17 | self.checkpoint_dir = chkpt_dir 18 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_td3') 19 | 20 | # I think this breaks if the env has a 2D state representation 21 | self.fc1 = nn.Linear(self.input_dims[0] + n_actions, self.fc1_dims) 22 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 23 | self.q1 = nn.Linear(self.fc2_dims, 1) 24 | 25 | self.optimizer = optim.Adam(self.parameters(), lr=beta) 26 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 27 | 28 | self.to(self.device) 29 | 30 | def forward(self, state, action): 31 | q1_action_value = self.fc1(T.cat([state, action], dim=1)) 32 | q1_action_value = F.relu(q1_action_value) 33 | q1_action_value = self.fc2(q1_action_value) 34 | q1_action_value = F.relu(q1_action_value) 35 | 36 | q1 = self.q1(q1_action_value) 37 | 38 | return q1 39 | 40 | def 
save_checkpoint(self): 41 | print('... saving checkpoint ...') 42 | T.save(self.state_dict(), self.checkpoint_file) 43 | 44 | def load_checkpoint(self): 45 | print('... loading checkpoint ...') 46 | self.load_state_dict(T.load(self.checkpoint_file)) 47 | 48 | class ActorNetwork(nn.Module): 49 | def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, 50 | n_actions, name, chkpt_dir='tmp/td3'): 51 | super(ActorNetwork, self).__init__() 52 | self.input_dims = input_dims 53 | self.fc1_dims = fc1_dims 54 | self.fc2_dims = fc2_dims 55 | self.n_actions = n_actions 56 | self.name = name 57 | self.checkpoint_dir = chkpt_dir 58 | self.checkpoint_file = os.path.join(self.checkpoint_dir, name+'_td3') 59 | 60 | self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) 61 | self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims) 62 | self.mu = nn.Linear(self.fc2_dims, self.n_actions) 63 | 64 | self.optimizer = optim.Adam(self.parameters(), lr=alpha) 65 | self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 66 | 67 | self.to(self.device) 68 | 69 | def forward(self, state): 70 | prob = self.fc1(state) 71 | prob = F.relu(prob) 72 | prob = self.fc2(prob) 73 | prob = F.relu(prob) 74 | 75 | prob = T.tanh(self.mu(prob)) # if action is > +/- 1 then multiply by max action 76 | 77 | return prob 78 | 79 | def save_checkpoint(self): 80 | print('... saving checkpoint ...') 81 | T.save(self.state_dict(), self.checkpoint_file) 82 | 83 | def load_checkpoint(self): 84 | print('... loading checkpoint ...') 85 | self.load_state_dict(T.load(self.checkpoint_file)) 86 | 87 | 88 | -------------------------------------------------------------------------------- /TD3/td3_torch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch as T 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from buffer import ReplayBuffer 6 | from networks import ActorNetwork, CriticNetwork 7 | 8 | class Agent(): 9 | def __init__(self, alpha, beta, input_dims, tau, env, 10 | gamma=0.99, update_actor_interval=2, warmup=1000, 11 | n_actions=2, max_size=1000000, layer1_size=400, 12 | layer2_size=300, batch_size=100, noise=0.1): 13 | self.gamma = gamma 14 | self.tau = tau 15 | self.max_action = env.action_space.high 16 | self.min_action = env.action_space.low 17 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 18 | self.batch_size = batch_size 19 | self.learn_step_cntr = 0 20 | self.time_step = 0 21 | self.warmup = warmup 22 | self.n_actions = n_actions 23 | self.update_actor_iter = update_actor_interval 24 | 25 | self.actor = ActorNetwork(alpha, input_dims, layer1_size, 26 | layer2_size, n_actions=n_actions, 27 | name='actor') 28 | self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, 29 | layer2_size, n_actions=n_actions, 30 | name='critic_1') 31 | self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, 32 | layer2_size, n_actions=n_actions, 33 | name='critic_2') 34 | 35 | self.target_actor = ActorNetwork(alpha, input_dims, layer1_size, 36 | layer2_size, n_actions=n_actions, 37 | name='target_actor') 38 | self.target_critic_1 = CriticNetwork(beta, input_dims, layer1_size, 39 | layer2_size, n_actions=n_actions, 40 | name='target_critic_1') 41 | self.target_critic_2 = CriticNetwork(beta, input_dims, layer1_size, 42 | layer2_size, n_actions=n_actions, 43 | name='target_critic_2') 44 | 45 | self.noise = noise 46 | self.update_network_parameters(tau=1) 47 | 48 | def choose_action(self, observation): 49 | if self.time_step < self.warmup: 50 | mu = 
T.tensor(np.random.normal(scale=self.noise, size=(self.n_actions,))) 51 | else: 52 | state = T.tensor(observation, dtype=T.float).to(self.actor.device) 53 | mu = self.actor.forward(state).to(self.actor.device) 54 | mu_prime = mu + T.tensor(np.random.normal(scale=self.noise), 55 | dtype=T.float).to(self.actor.device) 56 | mu_prime = T.clamp(mu_prime, self.min_action[0], self.max_action[0]) 57 | self.time_step += 1 58 | return mu_prime.cpu().detach().numpy() 59 | 60 | def remember(self, state, action, reward, new_state, done): 61 | self.memory.store_transition(state, action, reward, new_state, done) 62 | 63 | def learn(self): 64 | if self.memory.mem_cntr < self.batch_size: 65 | return 66 | 67 | state, action, reward, new_state, done = \ 68 | self.memory.sample_buffer(self.batch_size) 69 | 70 | reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device) 71 | done = T.tensor(done).to(self.critic_1.device) 72 | state_ = T.tensor(new_state, dtype=T.float).to(self.critic_1.device) 73 | state = T.tensor(state, dtype=T.float).to(self.critic_1.device) 74 | action = T.tensor(action, dtype=T.float).to(self.critic_1.device) 75 | 76 | target_actions = self.target_actor.forward(state_) 77 | target_actions = target_actions + \ 78 | T.clamp(T.tensor(np.random.normal(scale=0.2)), -0.5, 0.5) 79 | # might break if elements of min and max are not all equal 80 | target_actions = T.clamp(target_actions, self.min_action[0], self.max_action[0]) 81 | 82 | q1_ = self.target_critic_1.forward(state_, target_actions) 83 | q2_ = self.target_critic_2.forward(state_, target_actions) 84 | 85 | q1 = self.critic_1.forward(state, action) 86 | q2 = self.critic_2.forward(state, action) 87 | 88 | q1_[done] = 0.0 89 | q2_[done] = 0.0 90 | 91 | q1_ = q1_.view(-1) 92 | q2_ = q2_.view(-1) 93 | 94 | critic_value_ = T.min(q1_, q2_) 95 | 96 | target = reward + self.gamma*critic_value_ 97 | target = target.view(self.batch_size, 1) 98 | 99 | self.critic_1.optimizer.zero_grad() 100 | self.critic_2.optimizer.zero_grad() 101 | 102 | q1_loss = F.mse_loss(target, q1) 103 | q2_loss = F.mse_loss(target, q2) 104 | critic_loss = q1_loss + q2_loss 105 | critic_loss.backward() 106 | 107 | self.critic_1.optimizer.step() 108 | self.critic_2.optimizer.step() 109 | 110 | self.learn_step_cntr += 1 111 | 112 | if self.learn_step_cntr % self.update_actor_iter != 0: 113 | return 114 | 115 | self.actor.optimizer.zero_grad() 116 | actor_q1_loss = self.critic_1.forward(state, self.actor.forward(state)) 117 | actor_loss = -T.mean(actor_q1_loss) 118 | actor_loss.backward() 119 | self.actor.optimizer.step() 120 | 121 | self.update_network_parameters() 122 | 123 | def update_network_parameters(self, tau=None): 124 | if tau is None: 125 | tau = self.tau 126 | 127 | actor_params = self.actor.named_parameters() 128 | critic_1_params = self.critic_1.named_parameters() 129 | critic_2_params = self.critic_2.named_parameters() 130 | target_actor_params = self.target_actor.named_parameters() 131 | target_critic_1_params = self.target_critic_1.named_parameters() 132 | target_critic_2_params = self.target_critic_2.named_parameters() 133 | 134 | critic_1_state_dict = dict(critic_1_params) 135 | critic_2_state_dict = dict(critic_2_params) 136 | actor_state_dict = dict(actor_params) 137 | target_actor_state_dict = dict(target_actor_params) 138 | target_critic_1_state_dict = dict(target_critic_1_params) 139 | target_critic_2_state_dict = dict(target_critic_2_params) 140 | 141 | for name in critic_1_state_dict: 142 | critic_1_state_dict[name] = 
tau*critic_1_state_dict[name].clone() + \ 143 | (1-tau)*target_critic_1_state_dict[name].clone() 144 | 145 | for name in critic_2_state_dict: 146 | critic_2_state_dict[name] = tau*critic_2_state_dict[name].clone() + \ 147 | (1-tau)*target_critic_2_state_dict[name].clone() 148 | 149 | for name in actor_state_dict: 150 | actor_state_dict[name] = tau*actor_state_dict[name].clone() + \ 151 | (1-tau)*target_actor_state_dict[name].clone() 152 | 153 | self.target_critic_1.load_state_dict(critic_1_state_dict) 154 | self.target_critic_2.load_state_dict(critic_2_state_dict) 155 | self.target_actor.load_state_dict(actor_state_dict) 156 | 157 | def save_models(self): 158 | self.actor.save_checkpoint() 159 | self.target_actor.save_checkpoint() 160 | self.critic_1.save_checkpoint() 161 | self.critic_2.save_checkpoint() 162 | self.target_critic_1.save_checkpoint() 163 | self.target_critic_2.save_checkpoint() 164 | 165 | def load_models(self): 166 | self.actor.load_checkpoint() 167 | self.target_actor.load_checkpoint() 168 | self.critic_1.load_checkpoint() 169 | self.critic_2.load_checkpoint() 170 | self.target_critic_1.load_checkpoint() 171 | self.target_critic_2.load_checkpoint() 172 | 173 | 174 | -------------------------------------------------------------------------------- /TD3/tf2/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow.keras as keras 4 | from tensorflow.keras.optimizers import Adam 5 | from buffer import ReplayBuffer 6 | from networks import ActorNetwork, CriticNetwork 7 | 8 | 9 | class Agent: 10 | def __init__(self, alpha, beta, input_dims, tau, env, 11 | gamma=0.99, update_actor_interval=2, warmup=1000, 12 | n_actions=2, max_size=1000000, layer1_size=400, 13 | layer2_size=300, batch_size=100, noise=0.1, 14 | chkpt_dir='models/'): 15 | self.gamma = gamma 16 | self.tau = tau 17 | self.max_action = env.action_space.high[0] 18 | self.min_action = env.action_space.low[0] 19 | self.memory = ReplayBuffer(max_size, input_dims, n_actions) 20 | self.batch_size = batch_size 21 | self.learn_step_cntr = 0 22 | self.time_step = 0 23 | self.warmup = warmup 24 | self.n_actions = n_actions 25 | self.fname = chkpt_dir 26 | self.update_actor_iter = update_actor_interval 27 | 28 | self.actor = ActorNetwork(layer1_size, layer2_size, 29 | n_actions=n_actions) 30 | 31 | self.critic_1 = CriticNetwork(layer1_size, layer2_size) 32 | self.critic_2 = CriticNetwork(layer1_size, layer2_size) 33 | 34 | self.target_actor = ActorNetwork(layer1_size, layer2_size, 35 | n_actions=n_actions) 36 | self.target_critic_1 = CriticNetwork(layer1_size, layer2_size) 37 | self.target_critic_2 = CriticNetwork(layer1_size, layer2_size) 38 | 39 | self.actor.compile(optimizer=Adam(learning_rate=alpha)) 40 | self.critic_1.compile(optimizer=Adam(learning_rate=beta)) 41 | self.critic_2.compile(optimizer=Adam(learning_rate=beta)) 42 | 43 | self.target_actor.compile(optimizer=Adam(learning_rate=alpha)) 44 | self.target_critic_1.compile(optimizer=Adam(learning_rate=beta)) 45 | self.target_critic_2.compile(optimizer=Adam(learning_rate=beta)) 46 | 47 | self.noise = noise 48 | self.update_network_parameters(tau=1) 49 | 50 | def save_models(self): 51 | if self.memory.mem_cntr > self.batch_size: 52 | print('... 
saving models ...') 53 | self.actor.save(self.fname+'actor') 54 | self.critic_1.save(self.fname+'critic_1') 55 | self.critic_2.save(self.fname+'critic_2') 56 | self.target_actor.save(self.fname+'target_actor') 57 | self.target_critic_1.save(self.fname+'target_critic_1') 58 | self.target_critic_2.save(self.fname+'target_critic_2') 59 | 60 | def load_models(self): 61 | print('... loading models ...') 62 | self.actor = keras.models.load_model(self.fname+'actor') 63 | self.critic_1 = keras.models.load_model(self.fname+'critic_1') 64 | self.critic_2 = keras.models.load_model(self.fname+'critic_2') 65 | self.target_actor = keras.models.load_model(self.fname+'target_actor') 66 | self.target_critic_1 = \ 67 | keras.models.load_model(self.fname+'target_critic_1') 68 | self.target_critic_2 = \ 69 | keras.models.load_model(self.fname+'target_critic_2') 70 | 71 | def choose_action(self, observation): 72 | if self.time_step < self.warmup: 73 | mu = np.random.normal(scale=self.noise, size=(self.n_actions,)) 74 | else: 75 | state = tf.convert_to_tensor([observation], dtype=tf.float32) 76 | # returns a batch size of 1, want a scalar array 77 | mu = self.actor(state)[0] 78 | mu_prime = mu + np.random.normal(scale=self.noise) 79 | mu_prime = tf.clip_by_value(mu_prime, self.min_action, self.max_action) 80 | self.time_step += 1 81 | 82 | return mu_prime 83 | 84 | def remember(self, state, action, reward, new_state, done): 85 | self.memory.store_transition(state, action, reward, new_state, done) 86 | 87 | def learn(self): 88 | if self.memory.mem_cntr < self.batch_size: 89 | return 90 | 91 | states, actions, rewards, new_states, dones = \ 92 | self.memory.sample_buffer(self.batch_size) 93 | 94 | states = tf.convert_to_tensor(states, dtype=tf.float32) 95 | actions = tf.convert_to_tensor(actions, dtype=tf.float32) 96 | rewards = tf.convert_to_tensor(rewards, dtype=tf.float32) 97 | states_ = tf.convert_to_tensor(new_states, dtype=tf.float32) 98 | 99 | with tf.GradientTape(persistent=True) as tape: 100 | target_actions = self.target_actor(states_) 101 | target_actions = target_actions + \ 102 | tf.clip_by_value(np.random.normal(scale=0.2), -0.5, 0.5) 103 | 104 | target_actions = tf.clip_by_value(target_actions, self.min_action, 105 | self.max_action) 106 | 107 | q1_ = self.target_critic_1((states_, target_actions)) 108 | q2_ = self.target_critic_2((states_, target_actions)) 109 | 110 | q1 = tf.squeeze(self.critic_1((states, actions)), 1) 111 | q2 = tf.squeeze(self.critic_2((states, actions)), 1) 112 | 113 | # shape is [batch_size, 1], want to collapse to [batch_size] 114 | q1_ = tf.squeeze(q1_, 1) 115 | q2_ = tf.squeeze(q2_, 1) 116 | 117 | critic_value_ = tf.math.minimum(q1_, q2_) 118 | # in tf2 only integer scalar arrays can be used as indices 119 | # and eager exection doesn't support assignment, so we can't do 120 | # q1_[dones] = 0.0 121 | target = rewards + self.gamma*critic_value_*(1-dones) 122 | critic_1_loss = keras.losses.MSE(target, q1) 123 | critic_2_loss = keras.losses.MSE(target, q2) 124 | params_1 = self.critic_1.trainable_variables 125 | params_2 = self.critic_2.trainable_variables 126 | grads_1 = tape.gradient(critic_1_loss, params_1) 127 | grads_2 = tape.gradient(critic_2_loss, params_2) 128 | 129 | self.critic_1.optimizer.apply_gradients(zip(grads_1, params_1)) 130 | self.critic_2.optimizer.apply_gradients(zip(grads_2, params_2)) 131 | 132 | self.learn_step_cntr += 1 133 | 134 | if self.learn_step_cntr % self.update_actor_iter != 0: 135 | return 136 | 137 | with tf.GradientTape() as tape: 138 | 
new_actions = self.actor(states) 139 | critic_1_value = self.critic_1((states, new_actions)) 140 | actor_loss = -tf.math.reduce_mean(critic_1_value) 141 | params = self.actor.trainable_variables 142 | grads = tape.gradient(actor_loss, params) 143 | self.actor.optimizer.apply_gradients(zip(grads, params)) 144 | 145 | self.update_network_parameters() 146 | 147 | def update_network_parameters(self, tau=None): 148 | if tau is None: 149 | tau = self.tau 150 | 151 | weights = [] 152 | targets = self.target_actor.weights 153 | for i, weight in enumerate(self.actor.weights): 154 | weights.append(weight * tau + targets[i]*(1-tau)) 155 | 156 | self.target_actor.set_weights(weights) 157 | 158 | weights = [] 159 | targets = self.target_critic_1.weights 160 | for i, weight in enumerate(self.critic_1.weights): 161 | weights.append(weight * tau + targets[i]*(1-tau)) 162 | 163 | self.target_critic_1.set_weights(weights) 164 | 165 | weights = [] 166 | targets = self.target_critic_2.weights 167 | for i, weight in enumerate(self.critic_2.weights): 168 | weights.append(weight * tau + targets[i]*(1-tau)) 169 | 170 | self.target_critic_2.set_weights(weights) 171 | -------------------------------------------------------------------------------- /TD3/tf2/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ReplayBuffer: 5 | def __init__(self, max_size, input_shape, n_actions): 6 | self.mem_size = max_size 7 | self.mem_cntr = 0 8 | self.state_memory = np.zeros((self.mem_size, *input_shape)) 9 | self.new_state_memory = np.zeros((self.mem_size, *input_shape)) 10 | self.action_memory = np.zeros((self.mem_size, n_actions)) 11 | self.reward_memory = np.zeros(self.mem_size) 12 | self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool) 13 | 14 | def store_transition(self, state, action, reward, state_, done): 15 | index = self.mem_cntr % self.mem_size 16 | self.state_memory[index] = state 17 | self.new_state_memory[index] = state_ 18 | self.terminal_memory[index] = done 19 | self.reward_memory[index] = reward 20 | self.action_memory[index] = action 21 | 22 | self.mem_cntr += 1 23 | 24 | def sample_buffer(self, batch_size): 25 | max_mem = min(self.mem_cntr, self.mem_size) 26 | 27 | batch = np.random.choice(max_mem, batch_size) 28 | 29 | states = self.state_memory[batch] 30 | states_ = self.new_state_memory[batch] 31 | actions = self.action_memory[batch] 32 | rewards = self.reward_memory[batch] 33 | dones = self.terminal_memory[batch] 34 | 35 | return states, actions, rewards, states_, dones 36 | -------------------------------------------------------------------------------- /TD3/tf2/main.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from agent import Agent 4 | from utils import plot_learning_curve, manage_memory 5 | 6 | if __name__ == '__main__': 7 | # env = gym.make('BipedalWalker-v3') 8 | manage_memory() 9 | env = gym.make('LunarLanderContinuous-v2') 10 | agent = Agent(alpha=0.0001, beta=0.001, 11 | input_dims=env.observation_space.shape, tau=0.005, 12 | env=env, batch_size=100, layer1_size=400, layer2_size=300, 13 | n_actions=env.action_space.shape[0]) 14 | n_games = 1000 15 | filename = 'plots/' + 'lunar_lander_' + str(n_games) + '_games.png' 16 | 17 | best_score = env.reward_range[0] 18 | score_history = [] 19 | load_checkpoint = False 20 | if load_checkpoint: 21 | agent.load_models() 22 | 23 | for i in range(n_games): 24 | observation = env.reset() 25 
| done = False 26 | score = 0 27 | while not done: 28 | action = agent.choose_action(observation) 29 | observation_, reward, done, info = env.step(action) 30 | agent.remember(observation, action, reward, observation_, done) 31 | if not load_checkpoint: 32 | agent.learn() 33 | score += reward 34 | observation = observation_ 35 | score_history.append(score) 36 | avg_score = np.mean(score_history[-100:]) 37 | 38 | if avg_score > best_score: 39 | best_score = avg_score 40 | if not load_checkpoint: 41 | agent.save_models() 42 | print('episode {} score {:.1f} avg score {:.1f}'. 43 | format(i, score, avg_score)) 44 | x = [i+1 for i in range(n_games)] 45 | plot_learning_curve(x, score_history, filename) 46 | -------------------------------------------------------------------------------- /TD3/tf2/networks.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.keras as keras 3 | from tensorflow.keras.layers import Dense 4 | 5 | 6 | class CriticNetwork(keras.Model): 7 | def __init__(self, fc1_dims, fc2_dims): 8 | super(CriticNetwork, self).__init__() 9 | self.fc1_dims = fc1_dims 10 | self.fc2_dims = fc2_dims 11 | 12 | self.fc1 = Dense(self.fc1_dims, activation='relu') 13 | self.fc2 = Dense(self.fc2_dims, activation='relu') 14 | self.q = Dense(1, activation=None) 15 | 16 | def call(self, inputs): 17 | state, action = inputs 18 | q1_action_value = self.fc1(tf.concat([state, action], axis=1)) 19 | q1_action_value = self.fc2(q1_action_value) 20 | 21 | q = self.q(q1_action_value) 22 | 23 | return q 24 | 25 | 26 | class ActorNetwork(keras.Model): 27 | def __init__(self, fc1_dims, fc2_dims, n_actions): 28 | super(ActorNetwork, self).__init__() 29 | self.fc1_dims = fc1_dims 30 | self.fc2_dims = fc2_dims 31 | 32 | self.fc1 = Dense(self.fc1_dims, activation='relu') 33 | self.fc2 = Dense(self.fc2_dims, activation='relu') 34 | self.mu = Dense(n_actions, activation='tanh') 35 | 36 | def call(self, state): 37 | prob = self.fc1(state) 38 | prob = self.fc2(prob) 39 | 40 | mu = self.mu(prob) 41 | 42 | return mu 43 | -------------------------------------------------------------------------------- /TD3/tf2/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import tensorflow as tf 4 | 5 | 6 | def manage_memory(): 7 | gpus = tf.config.list_physical_devices('GPU') 8 | if gpus: 9 | try: 10 | for gpu in gpus: 11 | tf.config.experimental.set_memory_growth(gpu, True) 12 | except RuntimeError as e: 13 | print(e) 14 | 15 | 16 | def plot_learning_curve(x, scores, figure_file): 17 | running_avg = np.zeros(len(scores)) 18 | for i in range(len(running_avg)): 19 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 20 | plt.plot(x, running_avg) 21 | plt.title('Running average of previous 100 scores') 22 | plt.savefig(figure_file) 23 | -------------------------------------------------------------------------------- /TD3/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | def plot_learning_curve(x, scores, figure_file): 5 | running_avg = np.zeros(len(scores)) 6 | for i in range(len(running_avg)): 7 | running_avg[i] = np.mean(scores[max(0, i-100):(i+1)]) 8 | plt.plot(x, running_avg) 9 | plt.title('Running average of previous 100 scores') 10 | plt.savefig(figure_file) 11 | 
--------------------------------------------------------------------------------