├── .gitattributes ├── .gitignore ├── A2C ├── A2C.py ├── AC_CartPole.py └── AC_continue_Pendulum.py ├── A3C └── A3C.py ├── Actor_Critic └── Actor_Critic.py ├── D3QN └── D3QN.py ├── DDPG └── DDPG.py ├── DQN └── DQN.py ├── Double_DQN └── DDQN.py ├── Dueling_DQN └── Dueling_DQN.py ├── LICENSE ├── Noise_DQN ├── Noise_DQN.py └── replay_buffer.py ├── PPO └── PPO.py ├── Prioritized_Replay_DQN ├── Prioritized_Replay_DQN.py └── run_MountainCar.py ├── Q_Learning_maze ├── RL_brain.py ├── maze_env.py └── run_q_function.py ├── README.md ├── REINFORCE └── REINFORCE.py └── SAC └── SAC.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | ./__pycache__ -------------------------------------------------------------------------------- /A2C/A2C.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | import torch.multiprocessing as mp 8 | import numpy as np 9 | 10 | # Hyperparameters 11 | n_train_processes = 3 12 | learning_rate = 0.0002 13 | update_interval = 5 14 | gamma = 0.98 15 | max_train_steps = 60000 16 | PRINT_INTERVAL = update_interval * 100 17 | 18 | 19 | class ActorCritic(nn.Module): 20 | def __init__(self): 21 | super(ActorCritic, self).__init__() 22 | self.fc1 = nn.Linear(4, 256) 23 | self.fc_pi = nn.Linear(256, 2) 24 | self.fc_v = nn.Linear(256, 1) 25 | 26 | def pi(self, x, softmax_dim=1): 27 | x = F.relu(self.fc1(x)) 28 | x = self.fc_pi(x) 29 | prob = F.softmax(x, dim=softmax_dim) 30 | return prob 31 | 32 | def v(self, x): 33 | x = F.relu(self.fc1(x)) 34 | v = self.fc_v(x) 35 | return v 36 | 37 | 38 | def worker(worker_id, master_end, worker_end): 39 | master_end.close() # Forbid worker to use the master end for messaging 40 | env = gym.make('CartPole-v1') 41 | env.seed(worker_id) 42 | 43 | while True: 44 | cmd, data = worker_end.recv() 45 | if cmd == 'step': 46 | ob, reward, done, info = env.step(data) 47 | if done: 48 | ob = env.reset() 49 | worker_end.send((ob, reward, done, info)) 50 | elif cmd == 'reset': 51 | ob = env.reset() 52 | worker_end.send(ob) 53 | elif cmd == 'reset_task': 54 | ob = env.reset_task() 55 | worker_end.send(ob) 56 | elif cmd == 'close': 57 | worker_end.close() 58 | break 59 | elif cmd == 'get_spaces': 60 | worker_end.send((env.observation_space, env.action_space)) 61 | else: 62 | raise NotImplementedError 63 | 64 | 65 | class ParallelEnv: 66 | def __init__(self, n_train_processes): 67 | self.nenvs = n_train_processes 68 | self.waiting = False 69 | self.closed = False 70 | self.workers = list() 71 | 72 | master_ends, worker_ends = zip(*[mp.Pipe() for _ in range(self.nenvs)]) 73 | self.master_ends, self.worker_ends = master_ends, worker_ends 74 | 75 | for worker_id, (master_end, worker_end) in enumerate(zip(master_ends, worker_ends)): 76 | p = mp.Process(target=worker, 77 | args=(worker_id, master_end, worker_end)) 78 | p.daemon = True 79 | p.start() 80 | self.workers.append(p) 81 | 82 | # Forbid master to use the worker end for messaging 83 | for worker_end in worker_ends: 84 | worker_end.close() 85 | 86 | def step_async(self, actions): 87 | 
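        # step_async()/step_wait() form a minimal vectorized-env API over pipes:
        # the master end sends a ('step', action) message to every worker process,
        # and step_wait() then blocks on recv() to gather (obs, reward, done, info)
        # from each worker, stacking the results into batched numpy arrays.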
for master_end, action in zip(self.master_ends, actions): 88 | master_end.send(('step', action)) 89 | self.waiting = True 90 | 91 | def step_wait(self): 92 | results = [master_end.recv() for master_end in self.master_ends] 93 | self.waiting = False 94 | obs, rews, dones, infos = zip(*results) 95 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 96 | 97 | def reset(self): 98 | for master_end in self.master_ends: 99 | master_end.send(('reset', None)) 100 | return np.stack([master_end.recv() for master_end in self.master_ends]) 101 | 102 | def step(self, actions): 103 | self.step_async(actions) 104 | return self.step_wait() 105 | 106 | def close(self): # For clean up resources 107 | if self.closed: 108 | return 109 | if self.waiting: 110 | [master_end.recv() for master_end in self.master_ends] 111 | for master_end in self.master_ends: 112 | master_end.send(('close', None)) 113 | for worker in self.workers: 114 | worker.join() 115 | self.closed = True 116 | 117 | 118 | def test(step_idx, model): 119 | env = gym.make('CartPole-v1') 120 | score = 0.0 121 | done = False 122 | num_test = 10 123 | 124 | for _ in range(num_test): 125 | s = env.reset() 126 | while not done: 127 | prob = model.pi(torch.from_numpy(s).float(), softmax_dim=0) 128 | a = Categorical(prob).sample().numpy() 129 | s_prime, r, done, info = env.step(a) 130 | s = s_prime 131 | score += r 132 | done = False 133 | print(f"Step # :{step_idx}, avg score : {score / num_test:.1f}") 134 | 135 | env.close() 136 | 137 | 138 | def compute_target(v_final, r_lst, mask_lst): 139 | G = v_final.reshape(-1) 140 | td_target = list() 141 | 142 | for r, mask in zip(r_lst[::-1], mask_lst[::-1]): 143 | G = r + gamma * G * mask 144 | td_target.append(G) 145 | 146 | return torch.tensor(td_target[::-1]).float() 147 | 148 | 149 | if __name__ == '__main__': 150 | envs = ParallelEnv(n_train_processes) 151 | 152 | model = ActorCritic() 153 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 154 | 155 | step_idx = 0 156 | s = envs.reset() 157 | while step_idx < max_train_steps: 158 | s_lst, a_lst, r_lst, mask_lst = list(), list(), list(), list() 159 | for _ in range(update_interval): 160 | prob = model.pi(torch.from_numpy(s).float()) 161 | a = Categorical(prob).sample().numpy() 162 | s_prime, r, done, info = envs.step(a) 163 | 164 | s_lst.append(s) 165 | a_lst.append(a) 166 | r_lst.append(r / 100.0) 167 | mask_lst.append(1 - done) 168 | 169 | s = s_prime 170 | step_idx += 1 171 | 172 | s_final = torch.from_numpy(s_prime).float() 173 | v_final = model.v(s_final).detach().clone().numpy() 174 | td_target = compute_target(v_final, r_lst, mask_lst) 175 | 176 | td_target_vec = td_target.reshape(-1) 177 | s_vec = torch.tensor(s_lst).float().reshape(-1, 4) # 4 == Dimension of state 178 | a_vec = torch.tensor(a_lst).reshape(-1).unsqueeze(1) 179 | advantage = td_target_vec - model.v(s_vec).reshape(-1) 180 | 181 | pi = model.pi(s_vec, softmax_dim=1) 182 | pi_a = pi.gather(1, a_vec).reshape(-1) 183 | loss = -(torch.log(pi_a) * advantage.detach()).mean() + \ 184 | F.smooth_l1_loss(model.v(s_vec).reshape(-1), td_target_vec) 185 | 186 | optimizer.zero_grad() 187 | loss.backward() 188 | optimizer.step() 189 | 190 | if step_idx % PRINT_INTERVAL == 0: 191 | test(step_idx, model) 192 | 193 | envs.close() 194 | -------------------------------------------------------------------------------- /A2C/AC_CartPole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import 
nn 4 | import torch.nn.functional as F 5 | from gym import make 6 | 7 | np.random.seed(1) 8 | torch.manual_seed(1) 9 | 10 | # Superparameters 11 | MAX_EPISODE = 3000 12 | DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater then this threshold 13 | MAX_EP_STEPS = 1000 # maximum time step in one episode 14 | RENDER = True # rendering wastes time 15 | GAMMA = 0.9 # reward discount in TD error 16 | LR_A = 0.001 # learning rate for actor 17 | LR_C = 0.01 # learning rate for critic 18 | 19 | env = make('CartPole-v1') 20 | env.seed(1) 21 | env = env.unwrapped 22 | 23 | print("env.action_space :", env.action_space) 24 | print("env.observation_space :", env.observation_space) 25 | print("env.observation_space.high :", env.observation_space.high) 26 | print("env.observation_space.low :", env.observation_space.low) 27 | 28 | N_F = env.observation_space.shape[0] 29 | N_A = env.action_space.n 30 | 31 | 32 | class PolicyNet(nn.Module): 33 | def __init__(self, n_actions, n_features): 34 | super(PolicyNet, self).__init__() 35 | hidden_units = 20 36 | self.fc_layer = nn.Sequential(nn.Linear(n_features, hidden_units), 37 | nn.Sigmoid(), 38 | nn.Linear(hidden_units, n_actions), 39 | nn.Softmax(dim=-1)) 40 | 41 | def forward(self, x): 42 | output = self.fc_layer(x) 43 | return output 44 | 45 | 46 | class CriticNet(nn.Module): 47 | def __init__(self, n_features): 48 | super(CriticNet, self).__init__() 49 | hidden_units = 20 50 | self.fc_layer = nn.Sequential(nn.Linear(n_features, hidden_units), 51 | nn.ReLU(), 52 | nn.Linear(hidden_units, 1)) 53 | 54 | def forward(self, x): 55 | output = self.fc_layer(x) 56 | return output 57 | 58 | 59 | class Actor(object): 60 | def __init__(self, n_features, n_actions, lr=0.001): 61 | self.actor_net = PolicyNet(n_actions, n_features) 62 | self.n_features = n_features 63 | self.n_actions = n_actions 64 | self.lr = lr 65 | 66 | self.optimizer = torch.optim.Adam(self.actor_net.parameters(), 67 | self.lr) 68 | self.cost_his = [] 69 | 70 | def learn(self, s, a, td): 71 | state = torch.Tensor(s[np.newaxis, :]) 72 | torch_acts = torch.as_tensor(a) 73 | torch_acts_one_hot = F.one_hot(torch_acts, num_classes=self.n_actions) 74 | torch_td_error = torch.Tensor(td).reshape(-1, 1).detach() 75 | all_act_prob = self.actor_net(state) 76 | 77 | exp_v = torch.log(all_act_prob) * torch_acts_one_hot * torch_td_error 78 | loss = torch.mean(-exp_v) 79 | self.optimizer.zero_grad() 80 | loss.backward() 81 | self.optimizer.step() 82 | self.cost_his.append(loss.data.numpy()) 83 | return exp_v 84 | 85 | def choose_action(self, observation): 86 | state = torch.Tensor(observation[np.newaxis, :]) 87 | prob_weights = self.actor_net(state) 88 | action_idx = prob_weights.reshape(-1, ).multinomial(num_samples=1).numpy()[0] 89 | return action_idx 90 | 91 | 92 | class Critic(object): 93 | def __init__(self, n_features, lr=0.01): 94 | self.critic_net = CriticNet(n_features) 95 | self.n_features = n_features 96 | self.lr = lr 97 | self.optimizer = torch.optim.Adam(self.critic_net.parameters(), 98 | self.lr) 99 | self.cost_his = [] 100 | self.loss_function = torch.nn.MSELoss() 101 | 102 | def learn(self, s, r, s_): 103 | s, s_ = torch.Tensor(s[np.newaxis, :]), torch.Tensor(s_[np.newaxis, :]) 104 | 105 | v = self.critic_net(s) 106 | v_ = self.critic_net(s_).detach() 107 | td_error = r + GAMMA * v_ - v 108 | loss = self.loss_function(v, r + GAMMA * v_) 109 | # loss = torch.mean(torch.square(td_error)) 110 | 111 | self.optimizer.zero_grad() 112 | loss.backward() 113 | 
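        # The critic is regressed toward the one-step target r + GAMMA * V(s_);
        # the (detached) TD error td_error = r + GAMMA * V(s_) - V(s) returned
        # below is what the Actor reuses as its advantage signal in learn().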
self.optimizer.step() 114 | self.cost_his.append(loss.data.numpy()) 115 | 116 | return td_error 117 | 118 | 119 | actor = Actor(n_features=N_F, n_actions=N_A, lr=LR_A) 120 | critic = Critic(n_features=N_F, 121 | lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor 122 | 123 | for i_episode in range(MAX_EPISODE): 124 | s = env.reset() 125 | t = 0 126 | track_r = [] 127 | while True: 128 | if RENDER: 129 | env.render() 130 | 131 | a = actor.choose_action(s) 132 | 133 | s_, r, done, info = env.step(a) 134 | 135 | if done: 136 | r = -20 137 | 138 | track_r.append(r) 139 | 140 | td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] 141 | actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error] 142 | 143 | s = s_ 144 | t += 1 145 | 146 | if done or t >= MAX_EP_STEPS: 147 | ep_rs_sum = sum(track_r) 148 | 149 | if 'running_reward' not in globals(): 150 | running_reward = ep_rs_sum 151 | else: 152 | running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 153 | if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering 154 | print("episode:", i_episode, " reward:", int(running_reward)) 155 | break 156 | -------------------------------------------------------------------------------- /A2C/AC_continue_Pendulum.py: -------------------------------------------------------------------------------- 1 | # import numpy as np 2 | # import torch 3 | # from torch import nn 4 | # import torch.nn.functional as F 5 | # from gym import make 6 | # 7 | # np.random.seed(1) 8 | # torch.manual_seed(1) 9 | # 10 | # 11 | # class PolicyNet(nn.Module): 12 | # def __init__(self, n_features): 13 | # super(PolicyNet, self).__init__() 14 | # hidden_units = 30 15 | # self.feature_layer = nn.Sequential(nn.Linear(n_features, hidden_units), 16 | # nn.ReLU()) 17 | # self.mu_layer = nn.Linear(hidden_units, 1) 18 | # self.sigma_layer = nn.Sequential(nn.Linear(hidden_units, 1), 19 | # nn.ReLU()) 20 | # 21 | # def forward(self, x): 22 | # feature = self.feature_layer(x) 23 | # mu = self.mu_layer(feature) 24 | # sigma = self.sigma_layer(feature) 25 | # return mu, sigma 26 | # 27 | # 28 | # class CriticNet(nn.Module): 29 | # def __init__(self, n_features): 30 | # super(CriticNet, self).__init__() 31 | # hidden_units = 20 32 | # self.fc_layer = nn.Sequential(nn.Linear(n_features, hidden_units), 33 | # nn.ReLU(), 34 | # nn.Linear(hidden_units, 1)) 35 | # 36 | # def forward(self, x): 37 | # output = self.fc_layer(x) 38 | # return output 39 | # 40 | # 41 | # class Actor(object): 42 | # def __init__(self, n_features, action_bound, lr=0.0001): 43 | # self.actor_net = PolicyNet(n_features) 44 | # self.n_features = n_features 45 | # self.action_bound = action_bound 46 | # self.lr = lr 47 | # 48 | # self.optimizer = torch.optim.Adam(self.actor_net.parameters(), 49 | # self.lr) 50 | # 51 | # def learn(self, s, a, td): 52 | # state = torch.Tensor(s[np.newaxis, :]) 53 | # torch_acts = torch.as_tensor(a) 54 | # torch_td_error = torch.Tensor(td).reshape(-1, 1).detach() 55 | # 56 | # mu, sigma = self.actor_net(state) 57 | # mu, sigma = torch.squeeze(mu * 2), torch.squeeze(sigma + 0.001) 58 | # normal_dist = torch.distributions.Normal(mu, sigma) 59 | # 60 | # log_prob = normal_dist.log_prob(torch_acts) 61 | # exp_v = log_prob * torch_td_error 62 | # exp_v += 0.01 * normal_dist.entropy() 63 | # 64 | # loss = torch.mean(-exp_v) 65 | # self.optimizer.zero_grad() 66 | # 67 | # loss.backward() 68 | # self.optimizer.step() 69 | # return exp_v 70 | # 71 | # def 
choose_action(self, s): 72 | # state = torch.Tensor(s[np.newaxis, :]) 73 | # mu, sigma = self.actor_net(state) 74 | # mu, sigma = torch.squeeze(mu * 2), torch.squeeze(sigma + 0.001) 75 | # 76 | # normal_dist = torch.distributions.Normal(mu, sigma) 77 | # action = torch.clamp(normal_dist.sample(), torch.Tensor(self.action_bound[0]), 78 | # torch.Tensor(self.action_bound[1])) 79 | # 80 | # return action 81 | # 82 | # 83 | # class Critic(object): 84 | # def __init__(self, n_features, lr=0.01): 85 | # self.critic_net = CriticNet(n_features) 86 | # self.n_features = n_features 87 | # self.lr = lr 88 | # self.optimizer = torch.optim.Adam(self.critic_net.parameters(), 89 | # self.lr) 90 | # 91 | # self.loss_function = torch.nn.MSELoss() 92 | # 93 | # def learn(self, s, r, s_): 94 | # s, s_ = torch.Tensor(s[np.newaxis, :]), torch.Tensor(s_[np.newaxis, :]) 95 | # 96 | # v = self.critic_net(s) 97 | # v_ = self.critic_net(s_).detach() 98 | # td_error = r + GAMMA * v_ - v 99 | # # loss = self.loss_function(v, r + GAMMA * v_) 100 | # loss = torch.mean(torch.square(td_error)) 101 | # 102 | # self.optimizer.zero_grad() 103 | # loss.backward() 104 | # self.optimizer.step() 105 | # 106 | # return td_error 107 | # 108 | # 109 | # MAX_EPISODE = 1000 110 | # MAX_EP_STEPS = 200 111 | # DISPLAY_REWARD_THRESHOLD = -100 # renders environment if total episode reward is greater then this threshold 112 | # RENDER = False # rendering wastes time 113 | # GAMMA = 0.9 114 | # LR_A = 0.001 # learning rate for actor 115 | # LR_C = 0.01 # learning rate for critic 116 | # 117 | # env = make('Pendulum-v1') 118 | # env.seed(1) # reproducible 119 | # env = env.unwrapped 120 | # 121 | # N_S = env.observation_space.shape[0] 122 | # A_BOUND = env.action_space.high 123 | # 124 | # actor = Actor(n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND]) 125 | # critic = Critic(n_features=N_S, lr=LR_C) 126 | # 127 | # for i_episode in range(MAX_EPISODE): 128 | # s = env.reset() 129 | # t = 0 130 | # ep_rs = [] 131 | # while True: 132 | # # if RENDER: 133 | # env.render() 134 | # a = actor.choose_action(s) 135 | # 136 | # s_, r, done, info = env.step(a) 137 | # r /= 10 138 | # 139 | # td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] 140 | # actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error] 141 | # 142 | # s = s_ 143 | # t += 1 144 | # ep_rs.append(r) 145 | # if t > MAX_EP_STEPS: 146 | # ep_rs_sum = sum(ep_rs) 147 | # if 'running_reward' not in globals(): 148 | # running_reward = ep_rs_sum 149 | # else: 150 | # running_reward = running_reward * 0.9 + ep_rs_sum * 0.1 151 | # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering 152 | # print("episode:", i_episode, " reward:", int(running_reward)) 153 | # break 154 | -------------------------------------------------------------------------------- /A3C/A3C.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | import torch.multiprocessing as mp 8 | import time 9 | 10 | # Hyperparameters 11 | n_train_processes = 3 12 | learning_rate = 0.0002 13 | update_interval = 5 14 | gamma = 0.98 15 | max_train_ep = 300 16 | max_test_ep = 400 17 | 18 | 19 | # Asynchronous Advantage Actor-Critic(A3C) 20 | class ActorCritic(nn.Module): 21 | def __init__(self): 22 | super(ActorCritic, self).__init__() 23 | self.fc1 = nn.Linear(4, 
256) 24 | self.fc_pi = nn.Linear(256, 2) 25 | self.fc_v = nn.Linear(256, 1) 26 | 27 | def pi(self, x, softmax_dim=0): 28 | x = F.relu(self.fc1(x)) 29 | x = self.fc_pi(x) 30 | prob = F.softmax(x, dim=softmax_dim) 31 | return prob 32 | 33 | def v(self, x): 34 | x = F.relu(self.fc1(x)) 35 | v = self.fc_v(x) 36 | return v 37 | 38 | 39 | def train(global_model, rank): 40 | local_model = ActorCritic() 41 | local_model.load_state_dict(global_model.state_dict()) 42 | 43 | optimizer = optim.Adam(global_model.parameters(), lr=learning_rate) 44 | 45 | env = gym.make('CartPole-v1') 46 | 47 | for n_epi in range(max_train_ep): 48 | done = False 49 | s = env.reset() 50 | while not done: 51 | s_lst, a_lst, r_lst = [], [], [] 52 | for t in range(update_interval): 53 | prob = local_model.pi(torch.from_numpy(s).float()) 54 | m = Categorical(prob) 55 | a = m.sample().item() 56 | s_prime, r, done, info = env.step(a) 57 | 58 | s_lst.append(s) 59 | a_lst.append([a]) 60 | r_lst.append(r / 100.0) 61 | 62 | s = s_prime 63 | if done: 64 | break 65 | 66 | s_final = torch.tensor(s_prime, dtype=torch.float) 67 | R = 0.0 if done else local_model.v(s_final).item() 68 | td_target_lst = [] 69 | for reward in r_lst[::-1]: 70 | R = gamma * R + reward 71 | td_target_lst.append([R]) 72 | td_target_lst.reverse() 73 | 74 | s_batch, a_batch, td_target = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 75 | torch.tensor(td_target_lst) 76 | advantage = td_target - local_model.v(s_batch) 77 | 78 | pi = local_model.pi(s_batch, softmax_dim=1) 79 | pi_a = pi.gather(1, a_batch) 80 | loss = -torch.log(pi_a) * advantage.detach() + \ 81 | F.smooth_l1_loss(local_model.v(s_batch), td_target.detach()) 82 | 83 | optimizer.zero_grad() 84 | loss.mean().backward() 85 | for global_param, local_param in zip(global_model.parameters(), local_model.parameters()): 86 | global_param._grad = local_param.grad 87 | optimizer.step() 88 | local_model.load_state_dict(global_model.state_dict()) 89 | 90 | env.close() 91 | print("Training process {} reached maximum episode.".format(rank)) 92 | 93 | 94 | def test(global_model): 95 | env = gym.make('CartPole-v1') 96 | score = 0.0 97 | print_interval = 20 98 | 99 | for n_epi in range(max_test_ep): 100 | done = False 101 | s = env.reset() 102 | while not done: 103 | prob = global_model.pi(torch.from_numpy(s).float()) 104 | a = Categorical(prob).sample().item() 105 | s_prime, r, done, info = env.step(a) 106 | s = s_prime 107 | score += r 108 | 109 | if n_epi % print_interval == 0 and n_epi != 0: 110 | print("# of episode :{}, avg score : {:.1f}".format( 111 | n_epi, score / print_interval)) 112 | score = 0.0 113 | time.sleep(1) 114 | env.close() 115 | 116 | 117 | if __name__ == '__main__': 118 | global_model = ActorCritic() 119 | global_model.share_memory() 120 | 121 | processes = [] 122 | for rank in range(n_train_processes + 1): # + 1 for test process 123 | if rank == 0: 124 | p = mp.Process(target=test, args=(global_model,)) 125 | else: 126 | p = mp.Process(target=train, args=(global_model, rank,)) 127 | p.start() 128 | processes.append(p) 129 | for p in processes: 130 | p.join() 131 | -------------------------------------------------------------------------------- /Actor_Critic/Actor_Critic.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.distributions import Categorical 8 | 9 | # Hyperparameters 10 | 
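# learning_rate: Adam step size for the shared actor-critic network;
# gamma: discount factor in the TD target r + gamma * V(s');
# n_rollout: transitions collected per call to train_net().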
learning_rate = 0.0002 11 | gamma = 0.98 12 | n_rollout = 10 13 | MAX_EPISODE = 10000 14 | RENDER = True 15 | 16 | env = gym.make('CartPole-v1') 17 | env = env.unwrapped 18 | env.seed(1) 19 | torch.manual_seed(1) 20 | 21 | print("env.action_space :", env.action_space) 22 | print("env.observation_space :", env.observation_space) 23 | 24 | n_features = env.observation_space.shape[0] 25 | n_actions = env.action_space.n 26 | 27 | 28 | class ActorCritic(nn.Module): 29 | def __init__(self): 30 | super(ActorCritic, self).__init__() 31 | self.data = [] 32 | 33 | hidden_dims = 256 34 | self.feature_layer = nn.Sequential(nn.Linear(n_features, hidden_dims), 35 | nn.ReLU()) 36 | 37 | self.fc_pi = nn.Linear(hidden_dims, n_actions) 38 | self.fc_v = nn.Linear(hidden_dims, 1) 39 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 40 | 41 | def pi(self, x): 42 | x = self.feature_layer(x) 43 | x = self.fc_pi(x) 44 | prob = F.softmax(x, dim=-1) 45 | return prob 46 | 47 | def v(self, x): 48 | x = self.feature_layer(x) 49 | v = self.fc_v(x) 50 | return v 51 | 52 | def put_data(self, transition): 53 | self.data.append(transition) 54 | 55 | def make_batch(self): 56 | s_lst, a_lst, r_lst, s_next_lst, done_lst = [], [], [], [], [] 57 | for transition in self.data: 58 | s, a, r, s_, done = transition 59 | s_lst.append(s) 60 | a_lst.append([a]) 61 | r_lst.append([r / 100.0]) 62 | s_next_lst.append(s_) 63 | done_mask = 0.0 if done else 1.0 64 | done_lst.append([done_mask]) 65 | 66 | s_batch, a_batch, r_batch, s_next_batch, done_batch = torch.tensor(numpy.array(s_lst), 67 | dtype=torch.float), torch.tensor( 68 | a_lst), torch.tensor(numpy.array(r_lst), dtype=torch.float), torch.tensor( 69 | numpy.array(s_next_lst), dtype=torch.float), torch.tensor( 70 | numpy.array(done_lst), dtype=torch.float) 71 | self.data = [] 72 | return s_batch, a_batch, r_batch, s_next_batch, done_batch 73 | 74 | def train_net(self): 75 | s, a, r, s_, done = self.make_batch() 76 | td_target = r + gamma * self.v(s_) * done 77 | delta = td_target - self.v(s) 78 | 79 | pi = self.pi(s) 80 | pi_a = pi.gather(1, a) 81 | loss = -torch.log(pi_a) * delta.detach() + F.smooth_l1_loss(self.v(s), td_target.detach()) 82 | 83 | self.optimizer.zero_grad() 84 | loss.mean().backward() 85 | self.optimizer.step() 86 | 87 | 88 | def main(): 89 | model = ActorCritic() 90 | print_interval = 20 91 | score = 0.0 92 | 93 | for n_epi in range(MAX_EPISODE): 94 | done = False 95 | s = env.reset() 96 | while not done: 97 | for t in range(n_rollout): 98 | if RENDER: 99 | env.render() 100 | prob = model.pi(torch.from_numpy(s).float()) 101 | m = Categorical(prob) 102 | a = m.sample().item() 103 | s_next, r, done, info = env.step(a) 104 | model.put_data((s, a, r, s_next, done)) 105 | 106 | s = s_next 107 | score += r 108 | 109 | if done: 110 | break 111 | 112 | model.train_net() 113 | 114 | if n_epi % print_interval == 0 and n_epi != 0: 115 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score / print_interval)) 116 | score = 0.0 117 | env.close() 118 | 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /D3QN/D3QN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | import random 4 | 5 | import numpy 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | # Hyperparameters 12 | learning_rate = 0.0005 13 | gamma = 0.98 14 | 
buffer_limit = 50000 15 | batch_size = 32 16 | MAX_EPISODE = 10000 17 | RENDER = True 18 | 19 | env = gym.make('CartPole-v1') 20 | env = env.unwrapped 21 | env.seed(1) 22 | torch.manual_seed(1) 23 | 24 | print("env.action_space :", env.action_space) 25 | print("env.observation_space :", env.observation_space) 26 | 27 | n_features = env.observation_space.shape[0] 28 | n_actions = env.action_space.n 29 | 30 | 31 | class ReplayBuffer(): 32 | def __init__(self): 33 | self.buffer = collections.deque(maxlen=buffer_limit) 34 | 35 | def put(self, transition): 36 | self.buffer.append(transition) 37 | 38 | def sample(self, n): 39 | mini_batch = random.sample(self.buffer, n) 40 | s_lst, a_lst, r_lst, s_next_lst, done_mask_lst = [], [], [], [], [] 41 | 42 | for transition in mini_batch: 43 | s, a, r, s_, done_mask = transition 44 | s_lst.append(s) 45 | a_lst.append([a]) 46 | r_lst.append([r]) 47 | s_next_lst.append(s_) 48 | done_mask_lst.append([done_mask]) 49 | 50 | return torch.tensor(numpy.array(s_lst), dtype=torch.float), torch.tensor(numpy.array(a_lst)), \ 51 | torch.tensor(numpy.array(r_lst)), torch.tensor(numpy.array(s_next_lst), dtype=torch.float), \ 52 | torch.tensor(numpy.array(done_mask_lst)) 53 | 54 | def size(self): 55 | return len(self.buffer) 56 | 57 | 58 | class DQNDuelingNet(nn.Module): 59 | def __init__(self): 60 | super(DQNDuelingNet, self).__init__() 61 | hidden_dims = 128 62 | self.feature_layer = nn.Sequential(nn.Linear(n_features, hidden_dims), 63 | nn.ReLU()) 64 | self.value_layer = nn.Linear(hidden_dims, 1) 65 | self.advantage_layer = nn.Linear(hidden_dims, n_actions) 66 | 67 | def forward(self, x): 68 | feature = self.feature_layer(x) 69 | value = self.value_layer(feature) 70 | advantage = self.advantage_layer(feature) 71 | 72 | avg_advantage = torch.mean(input=advantage, dim=-1, keepdim=True) 73 | q_values = value + (advantage - avg_advantage) 74 | return q_values 75 | 76 | 77 | # Epsilon_Greedy_Exploration 78 | # MAX_Greedy_Update 79 | class Dueling_DQN: 80 | def __init__(self): 81 | # [target_net, evaluate_net] 82 | self.evaluate_net = DQNDuelingNet() 83 | self.target_net = type(self.evaluate_net)() 84 | self.target_net.load_state_dict(self.evaluate_net.state_dict()) # copy weights and stuff 85 | 86 | self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), 87 | learning_rate) 88 | self.memory = ReplayBuffer() 89 | 90 | def train(self): 91 | s, a, r, s_, done_mask = self.memory.sample(batch_size) 92 | 93 | q_out = self.evaluate_net(s) 94 | q_a = q_out.gather(1, a) 95 | 96 | # 与Dueling DQN的不同之处 97 | # max_q_prime = torch.max(self.target_net(s_), dim=1, keepdim=True).values 98 | # target = r + gamma * max_q_prime * done_mask 99 | q_target_next = self.target_net(s_).detach() 100 | q_eval_next = self.evaluate_net(s_).detach() 101 | q_next = q_target_next.gather(1, q_eval_next.argmax(axis=1).reshape(-1, 1)) 102 | target = r + gamma * q_next * done_mask 103 | 104 | loss = F.smooth_l1_loss(q_a, target) 105 | 106 | self.optimizer.zero_grad() 107 | loss.backward() 108 | self.optimizer.step() 109 | 110 | def sample_action(self, obs, epsilon): 111 | coin = random.random() 112 | if coin < epsilon: 113 | return random.randint(0, 1) 114 | else: 115 | out = self.evaluate_net(obs) 116 | return out.argmax().item() 117 | 118 | 119 | def main(): 120 | trainer = Dueling_DQN() 121 | 122 | print_interval = 20 123 | score = 0.0 124 | 125 | for n_epi in range(MAX_EPISODE): 126 | epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200)) # Linear annealing from 8% to 1% 127 | s = env.reset() 128 | 
done = False 129 | 130 | while not done: 131 | if RENDER: 132 | env.render() 133 | a = trainer.sample_action(torch.from_numpy(s).float(), epsilon) 134 | s_, r, done, info = env.step(a) 135 | done_mask = 0.0 if done else 1.0 136 | trainer.memory.put((s, a, r / 100.0, s_, done_mask)) 137 | s = s_ 138 | 139 | score += r 140 | if done: 141 | break 142 | 143 | if trainer.memory.size() > 2000: 144 | trainer.train() 145 | 146 | if n_epi % print_interval == 0 and n_epi != 0: 147 | trainer.target_net.load_state_dict(trainer.evaluate_net.state_dict()) 148 | print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format( 149 | n_epi, score / print_interval, trainer.memory.size(), epsilon * 100)) 150 | score = 0.0 151 | env.close() 152 | 153 | 154 | if __name__ == '__main__': 155 | main() 156 | -------------------------------------------------------------------------------- /DDPG/DDPG.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import collections 4 | 5 | import numpy 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torch.optim as optim 11 | 12 | # Hyperparameters 13 | lr_mu = 0.0005 14 | lr_q = 0.001 15 | gamma = 0.99 16 | batch_size = 32 17 | buffer_limit = 50000 18 | tau = 0.005 # for target network soft update 19 | 20 | MAX_EPISODE = 10000 21 | RENDER = True 22 | 23 | env = gym.make('Pendulum-v1') 24 | # env = env.unwrapped 25 | env.seed(1) 26 | torch.manual_seed(1) 27 | 28 | print("env.action_space :", env.action_space) 29 | print("env.observation_space :", env.observation_space) 30 | 31 | n_features = env.observation_space.shape[0] 32 | n_actions = env.action_space.shape[0] 33 | 34 | 35 | # class NormalizedActions(gym.ActionWrapper): 36 | # def action(self, action): 37 | # low_bound = self.action_space.low 38 | # upper_bound = self.action_space.high 39 | # 40 | # action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound) 41 | # # 将经过tanh输出的值重新映射回环境的真实值内 42 | # action = np.clip(action, low_bound, upper_bound) 43 | # 44 | # return action 45 | # 46 | # def reverse_action(self, action): 47 | # low_bound = self.action_space.low 48 | # upper_bound = self.action_space.high 49 | # 50 | # # 因为激活函数使用的是tanh,这里将环境输出的动作正则化到(-1,1) 51 | # 52 | # action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1 53 | # action = np.clip(action, low_bound, upper_bound) 54 | # 55 | # return action 56 | 57 | 58 | class ReplayBuffer(): 59 | def __init__(self): 60 | self.buffer = collections.deque(maxlen=buffer_limit) # 初始化buffer容量 61 | 62 | def put(self, transition): 63 | self.buffer.append(transition) # 存入一个transition 64 | 65 | def sample(self, n): # 取样 66 | mini_batch = random.sample(self.buffer, n) 67 | s_lst, a_lst, r_lst, s_next_lst, done_mask_lst = [], [], [], [], [] 68 | 69 | for transition in mini_batch: 70 | s, a, r, s_, done_mask = transition 71 | s_lst.append(s) 72 | a_lst.append([a]) 73 | r_lst.append([r]) 74 | s_next_lst.append(s_) 75 | done_mask_lst.append([done_mask]) 76 | 77 | return torch.tensor(numpy.array(s_lst), dtype=torch.float), torch.tensor(numpy.array(a_lst), dtype=torch.float), \ 78 | torch.tensor(numpy.array(r_lst)), torch.tensor(numpy.array(s_next_lst), dtype=torch.float), \ 79 | torch.tensor(numpy.array(done_mask_lst)) 80 | 81 | def size(self): 82 | return len(self.buffer) 83 | 84 | 85 | class MuNet(nn.Module): 86 | def __init__(self): 87 | super(MuNet, self).__init__() 88 | 89 | hidden_dims = 128 90 | hidden_dims_2 = 64 91 | 
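        # Deterministic actor mu(s): two ReLU hidden layers and a tanh output,
        # scaled in forward() to the Pendulum-v1 action range [-2, 2].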
self.fc1 = nn.Linear(n_features, hidden_dims) 92 | self.fc2 = nn.Linear(hidden_dims, hidden_dims_2) 93 | self.fc_mu = nn.Linear(hidden_dims_2, 1) 94 | 95 | def forward(self, x): 96 | x = F.relu(self.fc1(x)) 97 | x = F.relu(self.fc2(x)) 98 | mu = torch.tanh(self.fc_mu(x)) * 2 # Multipled by 2 because the action space of the Pendulum-v0 is [-2,2] 99 | return mu 100 | 101 | 102 | class QNet(nn.Module): 103 | def __init__(self): 104 | super(QNet, self).__init__() 105 | hidden_dims = 64 106 | hidden_dims_2 = 32 107 | self.fc_s = nn.Linear(n_features, hidden_dims) 108 | self.fc_a = nn.Linear(1, hidden_dims) 109 | self.fc_q = nn.Linear(hidden_dims * 2, hidden_dims_2) 110 | self.fc_out = nn.Linear(hidden_dims_2, n_actions) 111 | 112 | def forward(self, x, a): 113 | h1 = F.relu(self.fc_s(x)) 114 | h2 = F.relu(self.fc_a(a)) 115 | cat = torch.cat([h1, h2], dim=1) 116 | q = F.relu(self.fc_q(cat)) 117 | q = self.fc_out(q) 118 | return q 119 | 120 | 121 | class OrnsteinUhlenbeckNoise: 122 | def __init__(self, mu): 123 | self.theta, self.dt, self.sigma = 0.1, 0.01, 0.1 124 | self.mu = mu 125 | self.x_prev = np.zeros_like(self.mu) 126 | 127 | def __call__(self): 128 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt( 129 | self.dt) * np.random.normal(size=self.mu.shape) 130 | self.x_prev = x 131 | return x 132 | 133 | 134 | def train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer): 135 | s, a, r, s_next, done_mask = memory.sample(batch_size) 136 | 137 | target = r + gamma * q_target(s_next, mu_target(s_next)) * done_mask 138 | q_loss = F.smooth_l1_loss(q(s, a), target.detach()) 139 | q_optimizer.zero_grad() 140 | q_loss.backward() 141 | q_optimizer.step() 142 | 143 | mu_loss = -q(s, mu(s)).mean() # That's all for the policy loss. 
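    # Deterministic policy gradient: the actor is improved by ascending
    # Q(s, mu(s)), implemented here as minimizing -q(s, mu(s)).mean();
    # gradients flow through the critic into mu's parameters. After each round
    # of updates, soft_update() blends the target networks toward the online
    # networks with rate tau (Polyak averaging).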
144 | mu_optimizer.zero_grad() 145 | mu_loss.backward() 146 | mu_optimizer.step() 147 | 148 | 149 | def soft_update(net, net_target): 150 | for param_target, param in zip(net_target.parameters(), net.parameters()): 151 | param_target.data.copy_(param.data * tau + (1.0 - tau) * param_target.data) 152 | 153 | 154 | def main(): 155 | memory = ReplayBuffer() 156 | 157 | q, q_target = QNet(), QNet() 158 | q_target.load_state_dict(q.state_dict()) 159 | mu, mu_target = MuNet(), MuNet() 160 | mu_target.load_state_dict(mu.state_dict()) 161 | 162 | score = 0.0 163 | print_interval = 20 164 | 165 | mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu) 166 | q_optimizer = optim.Adam(q.parameters(), lr=lr_q) 167 | ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(1)) 168 | 169 | for n_epi in range(MAX_EPISODE): 170 | s = env.reset() 171 | done = False 172 | 173 | while not done: 174 | if RENDER: 175 | env.render() 176 | a = mu(torch.from_numpy(s).float()) 177 | a = a.item() + ou_noise()[0] 178 | s_next, r, done, info = env.step([a]) 179 | memory.put((s, a, r / 100.0, s_next, done)) 180 | score += r 181 | s = s_next 182 | 183 | if memory.size() > 2000: 184 | for i in range(10): 185 | train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer) 186 | soft_update(mu, mu_target) 187 | soft_update(q, q_target) 188 | 189 | if n_epi % print_interval == 0 and n_epi != 0: 190 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score / print_interval)) 191 | score = 0.0 192 | 193 | env.close() 194 | 195 | 196 | if __name__ == '__main__': 197 | main() 198 | -------------------------------------------------------------------------------- /DQN/DQN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | import random 4 | 5 | import numpy 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | # Hyperparameters 12 | learning_rate = 0.0005 13 | gamma = 0.98 14 | buffer_limit = 50000 15 | batch_size = 32 16 | MAX_EPISODE = 10000 17 | RENDER = True 18 | 19 | env = gym.make('CartPole-v1') 20 | env = env.unwrapped 21 | env.seed(1) 22 | torch.manual_seed(1) 23 | 24 | print("env.action_space :", env.action_space) 25 | print("env.observation_space :", env.observation_space) 26 | 27 | n_features = env.observation_space.shape[0] 28 | n_actions = env.action_space.n 29 | 30 | 31 | class ReplayBuffer(): 32 | def __init__(self): 33 | self.buffer = collections.deque(maxlen=buffer_limit) # 初始化buffer容量 34 | 35 | def put(self, transition): 36 | self.buffer.append(transition) # 存入一个transition 37 | 38 | def sample(self, n): # 取样 39 | mini_batch = random.sample(self.buffer, n) 40 | s_lst, a_lst, r_lst, s_next_lst, done_mask_lst = [], [], [], [], [] 41 | 42 | for transition in mini_batch: 43 | s, a, r, s_, done_mask = transition 44 | s_lst.append(s) 45 | a_lst.append([a]) 46 | r_lst.append([r]) 47 | s_next_lst.append(s_) 48 | done_mask_lst.append([done_mask]) 49 | 50 | return torch.tensor(numpy.array(s_lst), dtype=torch.float), torch.tensor(numpy.array(a_lst)), \ 51 | torch.tensor(numpy.array(r_lst)), torch.tensor(numpy.array(s_next_lst), dtype=torch.float), \ 52 | torch.tensor(numpy.array(done_mask_lst)) 53 | 54 | def size(self): 55 | return len(self.buffer) 56 | 57 | 58 | class QNetwork(nn.Module): 59 | def __init__(self): 60 | super(QNetwork, self).__init__() 61 | hidden_dims = 128 62 | self.out_layer = torch.nn.Sequential(nn.Linear(n_features, hidden_dims), 63 | nn.ReLU(), 64 | 
nn.Linear(hidden_dims, n_actions)) 65 | 66 | def forward(self, x): 67 | return self.out_layer(x) 68 | 69 | 70 | # Deep Q Network off-policy 71 | # Epsilon_Greedy_Exploration 72 | # MAX_Greedy_Update 73 | class DeepQNetwork: 74 | def __init__(self): 75 | # [target_net, evaluate_net] 76 | self.evaluate_net = QNetwork() 77 | self.target_net = type(self.evaluate_net)() # target network与evaluate_net结构相同 78 | self.target_net.load_state_dict(self.evaluate_net.state_dict()) # copy weights and stuff 79 | 80 | self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), 81 | learning_rate) 82 | self.memory = ReplayBuffer() 83 | 84 | def train(self): 85 | # sample a batch from the replay buffer 86 | s, a, r, s_, done_mask = self.memory.sample(batch_size) 87 | 88 | q_out = self.evaluate_net(s) 89 | q_a = q_out.gather(1, a) 90 | max_q_prime = torch.max(self.target_net(s_), dim=1, keepdim=True).values 91 | target = r + gamma * max_q_prime * done_mask 92 | loss = F.smooth_l1_loss(q_a, target) 93 | 94 | self.optimizer.zero_grad() 95 | loss.backward() 96 | self.optimizer.step() 97 | 98 | def sample_action(self, obs, epsilon): 99 | coin = random.random() 100 | if coin < epsilon: 101 | return env.action_space.sample() 102 | else: 103 | out = self.evaluate_net(obs) 104 | return out.argmax().item() 105 | 106 | 107 | def main(): 108 | trainer = DeepQNetwork() 109 | 110 | print_interval = 20 111 | score = 0.0 112 | 113 | for n_epi in range(MAX_EPISODE): 114 | epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200)) # Linear annealing from 8% to 1% 115 | s = env.reset() 116 | done = False 117 | 118 | while not done: 119 | if RENDER: 120 | env.render() 121 | a = trainer.sample_action(torch.from_numpy(s).float(), epsilon) 122 | s_, r, done, info = env.step(a) 123 | done_mask = 0.0 if done else 1.0 124 | trainer.memory.put((s, a, r / 100.0, s_, done_mask)) 125 | s = s_ 126 | 127 | score += r 128 | if done: 129 | break 130 | 131 | if trainer.memory.size() > 2000: 132 | trainer.train() # 训练数据存储到一定量后开始训练网络 133 | 134 | if n_epi % print_interval == 0 and n_epi != 0: 135 | # 更新target network 136 | trainer.target_net.load_state_dict(trainer.evaluate_net.state_dict()) 137 | print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format( 138 | n_epi, score / print_interval, trainer.memory.size(), epsilon * 100)) 139 | score = 0.0 140 | env.close() 141 | 142 | 143 | if __name__ == '__main__': 144 | main() 145 | -------------------------------------------------------------------------------- /Double_DQN/DDQN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | import random 4 | 5 | import numpy 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | # Hyperparameters 12 | learning_rate = 0.0005 13 | gamma = 0.98 14 | buffer_limit = 50000 15 | batch_size = 32 16 | MAX_EPISODE = 10000 17 | RENDER = True 18 | 19 | env = gym.make('CartPole-v1') 20 | env = env.unwrapped 21 | env.seed(1) 22 | torch.manual_seed(1) 23 | 24 | print("env.action_space :", env.action_space) 25 | print("env.observation_space :", env.observation_space) 26 | 27 | n_features = env.observation_space.shape[0] 28 | n_actions = env.action_space.n 29 | 30 | 31 | class ReplayBuffer(): 32 | def __init__(self): 33 | self.buffer = collections.deque(maxlen=buffer_limit) 34 | 35 | def put(self, transition): 36 | self.buffer.append(transition) 37 | 38 | def sample(self, n): 39 | mini_batch = random.sample(self.buffer, n) 40 | 
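        # Unpack the sampled transitions field-by-field and return them as
        # torch tensors of shape (batch, 1) (states: (batch, n_features));
        # done_mask is 0.0 for terminal transitions, so the bootstrap term
        # gamma * Q(s', a') is zeroed in the TD target built in train().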
s_lst, a_lst, r_lst, s_next_lst, done_mask_lst = [], [], [], [], [] 41 | 42 | for transition in mini_batch: 43 | s, a, r, s_, done_mask = transition 44 | s_lst.append(s) 45 | a_lst.append([a]) 46 | r_lst.append([r]) 47 | s_next_lst.append(s_) 48 | done_mask_lst.append([done_mask]) 49 | 50 | return torch.tensor(numpy.array(s_lst), dtype=torch.float), torch.tensor(numpy.array(a_lst)), \ 51 | torch.tensor(numpy.array(r_lst)), torch.tensor(numpy.array(s_next_lst), dtype=torch.float), \ 52 | torch.tensor(numpy.array(done_mask_lst)) 53 | 54 | def size(self): 55 | return len(self.buffer) 56 | 57 | 58 | class QNetwork(nn.Module): 59 | def __init__(self): 60 | super(QNetwork, self).__init__() 61 | hidden_dims = 128 62 | self.out_layer = torch.nn.Sequential(nn.Linear(n_features, hidden_dims), 63 | nn.ReLU(), 64 | nn.Linear(hidden_dims, n_actions)) 65 | 66 | def forward(self, x): 67 | return self.out_layer(x) 68 | 69 | 70 | # Deep Q Network off-policy 71 | # Epsilon_Greedy_Exploration 72 | # MAX_Greedy_Update 73 | class DeepQNetwork: 74 | def __init__(self): 75 | # [target_net, evaluate_net] 76 | self.evaluate_net = QNetwork() 77 | self.target_net = type(self.evaluate_net)() 78 | self.target_net.load_state_dict(self.evaluate_net.state_dict()) # copy weights and stuff 79 | 80 | self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), 81 | learning_rate) 82 | self.memory = ReplayBuffer() 83 | 84 | def train(self): 85 | s, a, r, s_, done_mask = self.memory.sample(batch_size) 86 | 87 | # 重点部分 88 | q_eval = self.evaluate_net(s).gather(1, a) 89 | q_target_next = self.target_net(s_).detach() 90 | q_eval_next = self.evaluate_net(s_).detach() 91 | q_next = q_target_next.gather(1, q_eval_next.argmax(axis=1).reshape(-1, 1)) 92 | target = r + gamma * q_next * done_mask 93 | 94 | loss = F.smooth_l1_loss(q_eval, target) 95 | 96 | self.optimizer.zero_grad() 97 | loss.backward() 98 | self.optimizer.step() 99 | 100 | def sample_action(self, obs, epsilon): 101 | coin = random.random() 102 | if coin < epsilon: 103 | return env.action_space.sample() 104 | else: 105 | out = self.evaluate_net(obs) 106 | return out.argmax().item() 107 | 108 | 109 | def main(): 110 | env = gym.make('CartPole-v1') 111 | trainer = DeepQNetwork() 112 | 113 | print_interval = 20 114 | score = 0.0 115 | 116 | for n_epi in range(MAX_EPISODE): 117 | epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200)) # Linear annealing from 8% to 1% 118 | s = env.reset() 119 | done = False 120 | 121 | while not done: 122 | if RENDER: 123 | env.render() 124 | a = trainer.sample_action(torch.from_numpy(s).float(), epsilon) 125 | s_, r, done, info = env.step(a) 126 | done_mask = 0.0 if done else 1.0 127 | trainer.memory.put((s, a, r / 100.0, s_, done_mask)) 128 | s = s_ 129 | 130 | score += r 131 | if done: 132 | break 133 | 134 | if trainer.memory.size() > 2000: 135 | trainer.train() 136 | 137 | if n_epi % print_interval == 0 and n_epi != 0: 138 | trainer.target_net.load_state_dict(trainer.evaluate_net.state_dict()) 139 | print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format( 140 | n_epi, score / print_interval, trainer.memory.size(), epsilon * 100)) 141 | score = 0.0 142 | env.close() 143 | 144 | 145 | if __name__ == '__main__': 146 | main() 147 | -------------------------------------------------------------------------------- /Dueling_DQN/Dueling_DQN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | import random 4 | 5 | import numpy 6 | import torch 7 | import 
torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | # Hyperparameters 12 | learning_rate = 0.0005 13 | gamma = 0.98 14 | buffer_limit = 50000 15 | batch_size = 32 16 | MAX_EPISODE = 10000 17 | RENDER = True 18 | 19 | env = gym.make('CartPole-v1') 20 | env = env.unwrapped 21 | env.seed(1) 22 | torch.manual_seed(1) 23 | 24 | print("env.action_space :", env.action_space) 25 | print("env.observation_space :", env.observation_space) 26 | 27 | n_features = env.observation_space.shape[0] 28 | n_actions = env.action_space.n 29 | 30 | 31 | class ReplayBuffer(): 32 | def __init__(self): 33 | self.buffer = collections.deque(maxlen=buffer_limit) 34 | 35 | def put(self, transition): 36 | self.buffer.append(transition) 37 | 38 | def sample(self, n): 39 | mini_batch = random.sample(self.buffer, n) 40 | s_lst, a_lst, r_lst, s_next_lst, done_mask_lst = [], [], [], [], [] 41 | 42 | for transition in mini_batch: 43 | s, a, r, s_, done_mask = transition 44 | s_lst.append(s) 45 | a_lst.append([a]) 46 | r_lst.append([r]) 47 | s_next_lst.append(s_) 48 | done_mask_lst.append([done_mask]) 49 | 50 | return torch.tensor(numpy.array(s_lst), dtype=torch.float), torch.tensor(numpy.array(a_lst)), \ 51 | torch.tensor(numpy.array(r_lst)), torch.tensor(numpy.array(s_next_lst), dtype=torch.float), \ 52 | torch.tensor(numpy.array(done_mask_lst)) 53 | 54 | def size(self): 55 | return len(self.buffer) 56 | 57 | 58 | class DQNDuelingNet(nn.Module): 59 | def __init__(self): 60 | super(DQNDuelingNet, self).__init__() 61 | hidden_dims = 128 62 | self.feature_layer = nn.Sequential(nn.Linear(n_features, hidden_dims), 63 | nn.ReLU()) 64 | self.value_layer = nn.Linear(hidden_dims, 1) 65 | self.advantage_layer = nn.Linear(hidden_dims, n_actions) 66 | 67 | def forward(self, x): 68 | feature = self.feature_layer(x) 69 | value = self.value_layer(feature) 70 | advantage = self.advantage_layer(feature) 71 | 72 | avg_advantage = torch.mean(input=advantage, dim=-1, keepdim=True) 73 | q_values = value + (advantage - avg_advantage) 74 | return q_values 75 | 76 | 77 | # Epsilon_Greedy_Exploration 78 | # MAX_Greedy_Update 79 | class Dueling_DQN: 80 | def __init__(self): 81 | # [target_net, evaluate_net] 82 | self.evaluate_net = DQNDuelingNet() 83 | self.target_net = type(self.evaluate_net)() 84 | self.target_net.load_state_dict(self.evaluate_net.state_dict()) # copy weights and stuff 85 | 86 | self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), 87 | learning_rate) 88 | self.memory = ReplayBuffer() 89 | 90 | def train(self): 91 | s, a, r, s_, done_mask = self.memory.sample(batch_size) 92 | 93 | q_out = self.evaluate_net(s) 94 | q_a = q_out.gather(1, a) 95 | max_q_prime = torch.max(self.target_net(s_), dim=1, keepdim=True).values 96 | target = r + gamma * max_q_prime * done_mask 97 | loss = F.smooth_l1_loss(q_a, target) 98 | 99 | self.optimizer.zero_grad() 100 | loss.backward() 101 | self.optimizer.step() 102 | 103 | def sample_action(self, obs, epsilon): 104 | coin = random.random() 105 | if coin < epsilon: 106 | return random.randint(0, 1) 107 | else: 108 | out = self.evaluate_net(obs) 109 | return out.argmax().item() 110 | 111 | 112 | def main(): 113 | trainer = Dueling_DQN() 114 | 115 | print_interval = 20 116 | score = 0.0 117 | 118 | for n_epi in range(MAX_EPISODE): 119 | epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200)) # Linear annealing from 8% to 1% 120 | s = env.reset() 121 | done = False 122 | 123 | while not done: 124 | if RENDER: 125 | env.render() 126 | a = 
trainer.sample_action(torch.from_numpy(s).float(), epsilon) 127 | s_, r, done, info = env.step(a) 128 | done_mask = 0.0 if done else 1.0 129 | trainer.memory.put((s, a, r / 100.0, s_, done_mask)) 130 | s = s_ 131 | 132 | score += r 133 | if done: 134 | break 135 | 136 | if trainer.memory.size() > 2000: 137 | trainer.train() 138 | 139 | if n_epi % print_interval == 0 and n_epi != 0: 140 | trainer.target_net.load_state_dict(trainer.evaluate_net.state_dict()) 141 | print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format( 142 | n_epi, score / print_interval, trainer.memory.size(), epsilon * 100)) 143 | score = 0.0 144 | env.close() 145 | 146 | 147 | if __name__ == '__main__': 148 | main() 149 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 老胡 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
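The next file, Noise_DQN/Noise_DQN.py, swaps epsilon-greedy exploration for NoisyNet layers whose weights carry learnable, factorised Gaussian noise. As a minimal, self-contained sketch of what NoisyLinear.reset_noise() constructs (the layer sizes below are illustrative, not taken from the file):

import torch

def scale_noise(size: int) -> torch.Tensor:
    # f(x) = sign(x) * sqrt(|x|): the scaling applied to factorised Gaussian noise
    x = torch.randn(size)
    return x.sign().mul(x.abs().sqrt())

in_features, out_features = 4, 128          # illustrative sizes
eps_in = scale_noise(in_features)           # one noise vector over inputs
eps_out = scale_noise(out_features)         # one noise vector over outputs
weight_epsilon = eps_out.ger(eps_in)        # outer product -> (out, in) noise matrix
bias_epsilon = scale_noise(out_features)    # independent noise for the bias
# During training the effective weight is weight_mu + weight_sigma * weight_epsilon,
# so only p + q noise samples are drawn instead of p * q independent ones.

Sampling p + q scalars and taking their outer product is what keeps the noise cheap enough to resample after every update.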
22 | -------------------------------------------------------------------------------- /Noise_DQN/Noise_DQN.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import deque 3 | import gym 4 | import matplotlib.pyplot as plt 5 | import torch 6 | import torch.autograd as autograd 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | from replay_buffer import * 12 | 13 | USE_CUDA = torch.cuda.is_available() 14 | device = torch.device("cuda" if USE_CUDA else "cpu") 15 | 16 | 17 | # 定义一个添加噪声的网络层 18 | class NoisyLinear(nn.Module): 19 | def __init__(self, in_features, out_features, std_init=0.4): 20 | super(NoisyLinear, self).__init__() 21 | 22 | self.in_features = in_features 23 | self.out_features = out_features 24 | self.std_init = std_init 25 | 26 | self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features)) 27 | self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features)) 28 | # 向模块添加持久缓冲区,这通常用于注册不应被视为模型参数的缓冲区。例如,BatchNorm的running_mean不是一个参数,而是持久状态的一部分。 29 | # 缓冲区可以使用给定的名称作为属性访问。 30 | self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features)) 31 | 32 | self.bias_mu = nn.Parameter(torch.FloatTensor(out_features)) 33 | self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features)) 34 | self.register_buffer('bias_epsilon', torch.FloatTensor(out_features)) 35 | 36 | self.reset_parameters() 37 | self.reset_noise() 38 | 39 | def forward(self, x): 40 | if self.training: 41 | weight = self.weight_mu + self.weight_sigma.mul(self.weight_epsilon.to(device)) 42 | bias = self.bias_mu + self.bias_sigma.mul(self.bias_epsilon.to(device)) 43 | else: 44 | weight = self.weight_mu 45 | bias = self.bias_mu 46 | return F.linear(x, weight, bias) 47 | 48 | def reset_parameters(self): 49 | mu_range = 1 / math.sqrt(self.weight_mu.size(1)) 50 | 51 | self.weight_mu.data.uniform_(-mu_range, mu_range) 52 | self.weight_sigma.data.uniform_(self.std_init / math.sqrt(self.bias_sigma.size(0))) 53 | 54 | self.bias_mu.data.uniform_(-mu_range, mu_range) 55 | self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0))) 56 | 57 | def reset_noise(self): 58 | epsilon_in = self._scale_noise(self.in_features) 59 | epsilon_out = self._scale_noise(self.out_features) 60 | 61 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 62 | self.bias_epsilon.copy_(self._scale_noise(self.out_features)) 63 | 64 | def _scale_noise(self, size): 65 | x = torch.randn(size) 66 | x = x.sign().mul(x.abs().sqrt()) 67 | return x 68 | 69 | 70 | class NoisyDQN(nn.Module): 71 | def __init__(self, observation_space, action_sapce): 72 | super(NoisyDQN, self).__init__() 73 | 74 | self.linear = nn.Linear(observation_space, 128) 75 | self.noisy1 = NoisyLinear(128, 128) 76 | self.noisy2 = NoisyLinear(128, action_sapce) 77 | 78 | def forward(self, x): 79 | x = F.relu(self.linear(x)) 80 | x = F.relu(self.noisy1(x)) 81 | x = self.noisy2(x) 82 | return x 83 | 84 | def act(self, state): 85 | state = torch.FloatTensor(state).unsqueeze(0) 86 | q_value = self.forward(state) 87 | action = q_value.max(1)[1].data[0] 88 | action = action.cpu().numpy() # 从网络中得到的tensor形式,因为之后要输入给gym环境中,这里把它放回cpu,转为数组形式 89 | action = int(action) 90 | return action 91 | 92 | def reset_noise(self): 93 | self.noisy1.reset_noise() 94 | self.noisy2.reset_noise() 95 | 96 | 97 | class ReplayBuffer(object): 98 | def __init__(self, capacity): 99 | # deque模块是python标准库collections中的一项,它提供了两端都可以操作的序列,其实就是双向队列, 
100 | # 可以从左右两端增加元素,或者是删除元素。如果设置了最大长度,非输入端的数据会逐步移出窗口。 101 | self.buffer = deque(maxlen=capacity) 102 | 103 | def push(self, state, aciton, reward, next_state, done): 104 | state = np.expand_dims(state, 0) 105 | # 这里增加维度的操作是为了便于之后使用concatenate进行拼接 106 | next_state = np.expand_dims(next_state, 0) 107 | self.buffer.append((state, aciton, reward, next_state, done)) 108 | 109 | def sample(self, batch_size): 110 | # 将可迭代的对象作为参数,将对象中对应的元素打包成一个个元组,然后返回由这些元组组成的列表 111 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 112 | # 最后使用concatenate对数组进行拼接,相当于少了一个维度 113 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 114 | 115 | 116 | def compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size, beta): 117 | state, action, reward, next_state, done, weights, indices = replay_buffer.sample(batch_size, beta) 118 | 119 | state = torch.FloatTensor(np.float32(state)).to(device) 120 | next_state = torch.FloatTensor(np.float32(next_state)).to(device) 121 | action = torch.LongTensor(action).to(device) 122 | reward = torch.FloatTensor(reward).to(device) 123 | done = torch.FloatTensor(np.float32(done)).to(device) 124 | weights = torch.FloatTensor(weights).to(device) 125 | 126 | q_values = current_model(state) 127 | next_q_values = target_model(next_state) 128 | 129 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) 130 | # gather可以看作是对q_values的查询,即元素都是q_values中的元素,查询索引都存在action中。输出大小与action.unsqueeze(1)一致。 131 | # dim=1,它存放的都是第1维度的索引;dim=0,它存放的都是第0维度的索引; 132 | # 这里增加维度主要是为了方便gather操作,之后再删除该维度 133 | next_q_value = next_q_values.max(1)[0] 134 | 135 | expected_q_value = reward + gamma * next_q_value * (1 - done) 136 | 137 | loss = (q_value - expected_q_value.detach()).pow(2) * weights 138 | prios = loss + 1e-5 139 | loss = loss.mean() 140 | 141 | optimizer.zero_grad() 142 | loss.backward() 143 | optimizer.step() 144 | 145 | replay_buffer.update_priorities(indices, prios.data.cpu().numpy()) 146 | current_model.reset_noise() 147 | target_model.reset_noise() 148 | 149 | return loss 150 | 151 | 152 | def update_target(current_model, target_model): 153 | target_model.load_state_dict(current_model.state_dict()) # 加载模型 154 | 155 | 156 | def plot(frame_idx, rewards, losses): 157 | plt.figure(figsize=(20, 5)) 158 | plt.subplot(131) 159 | plt.title('frame %s. 
reward: %s' % (frame_idx, np.mean(rewards[-10:]))) 160 | plt.plot(rewards) 161 | plt.subplot(132) 162 | plt.title('loss') 163 | plt.plot(losses) 164 | plt.show() 165 | 166 | 167 | def main(): 168 | env_id = "CartPole-v1" 169 | env = gym.make(env_id) 170 | 171 | observation_space = env.observation_space.shape[0] 172 | action_sapce = env.action_space.n 173 | 174 | current_model = NoisyDQN(observation_space, action_sapce) 175 | target_model = NoisyDQN(observation_space, action_sapce) 176 | 177 | if USE_CUDA: 178 | current_model = current_model.cuda() 179 | target_model = target_model.cuda() 180 | 181 | optimizer = optim.Adam(current_model.parameters()) 182 | 183 | beta_start = 0.4 184 | beta_frames = 1000 185 | beta_by_frame = lambda frame_idx: min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames) 186 | 187 | replay_buffer = PrioritizedReplayBuffer(10000, alpha=0.6) 188 | 189 | update_target(current_model, target_model) 190 | 191 | num_frames = 10000 192 | batch_size = 32 193 | gamma = 0.99 194 | 195 | losses = [] 196 | all_rewards = [] 197 | episode_reward = 0 198 | 199 | state = env.reset() 200 | for frame_idx in range(1, num_frames + 1): 201 | # 显示动画 202 | # env.render() 203 | action = current_model.act(state) 204 | 205 | next_state, reward, done, _ = env.step(action) 206 | replay_buffer.push(state, action, reward, next_state, done) 207 | 208 | state = next_state 209 | episode_reward += reward 210 | 211 | if done: 212 | state = env.reset() 213 | all_rewards.append(episode_reward) 214 | episode_reward = 0 215 | 216 | if len(replay_buffer) > batch_size: 217 | beta = beta_by_frame(frame_idx) 218 | loss = compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size, beta) 219 | losses.append(np.array(loss.data.cpu())) 220 | 221 | if frame_idx % 200 == 0: 222 | plot(frame_idx, all_rewards, losses) 223 | 224 | if frame_idx % 1000 == 0: 225 | update_target(current_model, target_model) 226 | 227 | 228 | if __name__ == '__main__': 229 | main() 230 | -------------------------------------------------------------------------------- /Noise_DQN/replay_buffer.py: -------------------------------------------------------------------------------- 1 | #code from openai 2 | #https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 3 | 4 | import numpy as np 5 | import random 6 | 7 | import operator 8 | 9 | 10 | class SegmentTree(object): 11 | def __init__(self, capacity, operation, neutral_element): 12 | """Build a Segment Tree data structure. 13 | https://en.wikipedia.org/wiki/Segment_tree 14 | Can be used as regular array, but with two 15 | important differences: 16 | a) setting item's value is slightly slower. 17 | It is O(lg capacity) instead of O(1). 18 | b) user has access to an efficient `reduce` 19 | operation which reduces `operation` over 20 | a contiguous subsequence of items in the 21 | array. 22 | Paramters 23 | --------- 24 | capacity: int 25 | Total size of the array - must be a power of two. 26 | operation: lambda obj, obj -> obj 27 | and operation for combining elements (eg. sum, max) 28 | must for a mathematical group together with the set of 29 | possible values for array elements. 30 | neutral_element: obj 31 | neutral element for the operation above. eg. float('-inf') 32 | for max and 0 for sum. 33 | """ 34 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 
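        # Layout: the tree is a flat array of length 2 * capacity. Index 1 is the
        # root, node i has children 2*i and 2*i + 1, and the leaves at indices
        # [capacity, 2 * capacity) hold the item values themselves, so reduce()
        # answers range sum/min queries in O(log capacity).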
35 | self._capacity = capacity 36 | self._value = [neutral_element for _ in range(2 * capacity)] 37 | self._operation = operation 38 | 39 | def _reduce_helper(self, start, end, node, node_start, node_end): 40 | if start == node_start and end == node_end: 41 | return self._value[node] 42 | mid = (node_start + node_end) // 2 43 | if end <= mid: 44 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 45 | else: 46 | if mid + 1 <= start: 47 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 48 | else: 49 | return self._operation( 50 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 51 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 52 | ) 53 | 54 | def reduce(self, start=0, end=None): 55 | """Returns result of applying `self.operation` 56 | to a contiguous subsequence of the array. 57 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 58 | Parameters 59 | ---------- 60 | start: int 61 | beginning of the subsequence 62 | end: int 63 | end of the subsequences 64 | Returns 65 | ------- 66 | reduced: obj 67 | result of reducing self.operation over the specified range of array elements. 68 | """ 69 | if end is None: 70 | end = self._capacity 71 | if end < 0: 72 | end += self._capacity 73 | end -= 1 74 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 75 | 76 | def __setitem__(self, idx, val): 77 | # index of the leaf 78 | idx += self._capacity 79 | self._value[idx] = val 80 | idx //= 2 81 | while idx >= 1: 82 | self._value[idx] = self._operation( 83 | self._value[2 * idx], 84 | self._value[2 * idx + 1] 85 | ) 86 | idx //= 2 87 | 88 | def __getitem__(self, idx): 89 | assert 0 <= idx < self._capacity 90 | return self._value[self._capacity + idx] 91 | 92 | 93 | class SumSegmentTree(SegmentTree): 94 | def __init__(self, capacity): 95 | super(SumSegmentTree, self).__init__( 96 | capacity=capacity, 97 | operation=operator.add, 98 | neutral_element=0.0 99 | ) 100 | 101 | def sum(self, start=0, end=None): 102 | """Returns arr[start] + ... + arr[end]""" 103 | return super(SumSegmentTree, self).reduce(start, end) 104 | 105 | def find_prefixsum_idx(self, prefixsum): 106 | """Find the highest index `i` in the array such that 107 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 108 | if array values are probabilities, this function 109 | allows to sample indexes according to the discrete 110 | probability efficiently. 111 | Parameters 112 | ---------- 113 | perfixsum: float 114 | upperbound on the sum of array prefix 115 | Returns 116 | ------- 117 | idx: int 118 | highest index satisfying the prefixsum constraint 119 | """ 120 | assert 0 <= prefixsum <= self.sum() + 1e-5 121 | idx = 1 122 | while idx < self._capacity: # while non-leaf 123 | if self._value[2 * idx] > prefixsum: 124 | idx = 2 * idx 125 | else: 126 | prefixsum -= self._value[2 * idx] 127 | idx = 2 * idx + 1 128 | return idx - self._capacity 129 | 130 | 131 | class MinSegmentTree(SegmentTree): 132 | def __init__(self, capacity): 133 | super(MinSegmentTree, self).__init__( 134 | capacity=capacity, 135 | operation=min, 136 | neutral_element=float('inf') 137 | ) 138 | 139 | def min(self, start=0, end=None): 140 | """Returns min(arr[start], ..., arr[end])""" 141 | 142 | return super(MinSegmentTree, self).reduce(start, end) 143 | 144 | 145 | class ReplayBuffer(object): 146 | def __init__(self, size): 147 | """Create Replay buffer. 
148 | Parameters 149 | ---------- 150 | size: int 151 | Max number of transitions to store in the buffer. When the buffer 152 | overflows the old memories are dropped. 153 | """ 154 | self._storage = [] 155 | self._maxsize = size 156 | self._next_idx = 0 157 | 158 | def __len__(self): 159 | return len(self._storage) 160 | 161 | def push(self, state, action, reward, next_state, done): 162 | data = (state, action, reward, next_state, done) 163 | 164 | if self._next_idx >= len(self._storage): 165 | self._storage.append(data) 166 | else: 167 | self._storage[self._next_idx] = data 168 | self._next_idx = (self._next_idx + 1) % self._maxsize 169 | 170 | def _encode_sample(self, idxes): 171 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 172 | for i in idxes: 173 | data = self._storage[i] 174 | obs_t, action, reward, obs_tp1, done = data 175 | obses_t.append(np.array(obs_t, copy=False)) 176 | actions.append(np.array(action, copy=False)) 177 | rewards.append(reward) 178 | obses_tp1.append(np.array(obs_tp1, copy=False)) 179 | dones.append(done) 180 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 181 | 182 | def sample(self, batch_size): 183 | """Sample a batch of experiences. 184 | Parameters 185 | ---------- 186 | batch_size: int 187 | How many transitions to sample. 188 | Returns 189 | ------- 190 | obs_batch: np.array 191 | batch of observations 192 | act_batch: np.array 193 | batch of actions executed given obs_batch 194 | rew_batch: np.array 195 | rewards received as results of executing act_batch 196 | next_obs_batch: np.array 197 | next set of observations seen after executing act_batch 198 | done_mask: np.array 199 | done_mask[i] = 1 if executing act_batch[i] resulted in 200 | the end of an episode and 0 otherwise. 201 | """ 202 | idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 203 | return self._encode_sample(idxes) 204 | 205 | 206 | class PrioritizedReplayBuffer(ReplayBuffer): 207 | def __init__(self, size, alpha): 208 | """Create Prioritized Replay buffer. 209 | Parameters 210 | ---------- 211 | size: int 212 | Max number of transitions to store in the buffer. When the buffer 213 | overflows the old memories are dropped. 214 | alpha: float 215 | how much prioritization is used 216 | (0 - no prioritization, 1 - full prioritization) 217 | See Also 218 | -------- 219 | ReplayBuffer.__init__ 220 | """ 221 | super(PrioritizedReplayBuffer, self).__init__(size) 222 | assert alpha > 0 223 | self._alpha = alpha 224 | 225 | it_capacity = 1 226 | while it_capacity < size: 227 | it_capacity *= 2 228 | 229 | self._it_sum = SumSegmentTree(it_capacity) 230 | self._it_min = MinSegmentTree(it_capacity) 231 | self._max_priority = 1.0 232 | 233 | def push(self, *args, **kwargs): 234 | """See ReplayBuffer.store_effect""" 235 | idx = self._next_idx 236 | super(PrioritizedReplayBuffer, self).push(*args, **kwargs) 237 | self._it_sum[idx] = self._max_priority ** self._alpha 238 | self._it_min[idx] = self._max_priority ** self._alpha 239 | 240 | def _sample_proportional(self, batch_size): 241 | res = [] 242 | for _ in range(batch_size): 243 | # TODO(szymon): should we ensure no repeats? 244 | mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1) 245 | idx = self._it_sum.find_prefixsum_idx(mass) 246 | res.append(idx) 247 | return res 248 | 249 | def sample(self, batch_size, beta): 250 | """Sample a batch of experiences. 
251 | compared to ReplayBuffer.sample 252 | it also returns importance weights and idxes 253 | of sampled experiences. 254 | Parameters 255 | ---------- 256 | batch_size: int 257 | How many transitions to sample. 258 | beta: float 259 | To what degree to use importance weights 260 | (0 - no corrections, 1 - full correction) 261 | Returns 262 | ------- 263 | obs_batch: np.array 264 | batch of observations 265 | act_batch: np.array 266 | batch of actions executed given obs_batch 267 | rew_batch: np.array 268 | rewards received as results of executing act_batch 269 | next_obs_batch: np.array 270 | next set of observations seen after executing act_batch 271 | done_mask: np.array 272 | done_mask[i] = 1 if executing act_batch[i] resulted in 273 | the end of an episode and 0 otherwise. 274 | weights: np.array 275 | Array of shape (batch_size,) and dtype np.float32 276 | denoting importance weight of each sampled transition 277 | idxes: np.array 278 | Array of shape (batch_size,) and dtype np.int32 279 | idexes in buffer of sampled experiences 280 | """ 281 | assert beta > 0 282 | 283 | idxes = self._sample_proportional(batch_size) 284 | 285 | weights = [] 286 | p_min = self._it_min.min() / self._it_sum.sum() 287 | max_weight = (p_min * len(self._storage)) ** (-beta) 288 | 289 | for idx in idxes: 290 | p_sample = self._it_sum[idx] / self._it_sum.sum() 291 | weight = (p_sample * len(self._storage)) ** (-beta) 292 | weights.append(weight / max_weight) 293 | weights = np.array(weights) 294 | encoded_sample = self._encode_sample(idxes) 295 | return tuple(list(encoded_sample) + [weights, idxes]) 296 | 297 | def update_priorities(self, idxes, priorities): 298 | """Update priorities of sampled transitions. 299 | sets priority of transition at index idxes[i] in buffer 300 | to priorities[i]. 301 | Parameters 302 | ---------- 303 | idxes: [int] 304 | List of idxes of sampled transitions 305 | priorities: [float] 306 | List of updated priorities corresponding to 307 | transitions at the sampled idxes denoted by 308 | variable `idxes`. 
309 | """ 310 | assert len(idxes) == len(priorities) 311 | for idx, priority in zip(idxes, priorities): 312 | assert priority > 0 313 | assert 0 <= idx < len(self._storage) 314 | self._it_sum[idx] = priority ** self._alpha 315 | self._it_min[idx] = priority ** self._alpha 316 | 317 | self._max_priority = max(self._max_priority, priority) -------------------------------------------------------------------------------- /PPO/PPO.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | 8 | # Hyperparameters 9 | learning_rate = 0.0005 10 | gamma = 0.98 11 | lmbda = 0.95 12 | eps_clip = 0.1 13 | K_epoch = 3 14 | T_horizon = 20 15 | 16 | MAX_EPISODE = 10000 17 | RENDER = False 18 | 19 | env = gym.make('CartPole-v1') 20 | env = env.unwrapped 21 | 22 | print("env.action_space :", env.action_space) 23 | print("env.observation_space :", env.observation_space) 24 | 25 | n_features = env.observation_space.shape[0] 26 | n_actions = env.action_space.n 27 | 28 | 29 | class PPO(nn.Module): 30 | def __init__(self): 31 | super(PPO, self).__init__() 32 | self.data = [] 33 | hidden_dims = 256 34 | self.fc1 = nn.Linear(n_features, hidden_dims) 35 | self.fc_pi = nn.Linear(hidden_dims, n_actions) 36 | self.fc_v = nn.Linear(hidden_dims, 1) 37 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 38 | 39 | def pi(self, x, softmax_dim=-1): 40 | x = F.relu(self.fc1(x)) 41 | x = self.fc_pi(x) 42 | prob = F.softmax(x, dim=softmax_dim) 43 | return prob 44 | 45 | def v(self, x): 46 | x = F.relu(self.fc1(x)) 47 | v = self.fc_v(x) 48 | return v 49 | 50 | def put_data(self, transition): 51 | self.data.append(transition) 52 | 53 | def make_batch(self): 54 | s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], [] 55 | for transition in self.data: 56 | s, a, r, s_prime, prob_a, done = transition 57 | 58 | s_lst.append(s) 59 | a_lst.append([a]) 60 | r_lst.append([r]) 61 | s_prime_lst.append(s_prime) 62 | prob_a_lst.append([prob_a]) 63 | done_mask = 0 if done else 1 64 | done_lst.append([done_mask]) 65 | 66 | s, a, r, s_prime, done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 67 | torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ 68 | torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst) 69 | self.data = [] 70 | return s, a, r, s_prime, done_mask, prob_a 71 | 72 | def train_net(self): 73 | s, a, r, s_prime, done_mask, prob_a = self.make_batch() 74 | 75 | for i in range(K_epoch): 76 | td_target = r + gamma * self.v(s_prime) * done_mask 77 | delta = td_target - self.v(s) 78 | delta = delta.detach().numpy() 79 | 80 | advantage_lst = [] 81 | advantage = 0.0 82 | for delta_t in delta[::-1]: 83 | advantage = gamma * lmbda * advantage + delta_t[0] 84 | advantage_lst.append([advantage]) 85 | advantage_lst.reverse() 86 | advantage = torch.tensor(advantage_lst, dtype=torch.float) 87 | 88 | pi = self.pi(s, softmax_dim=1) 89 | pi_a = pi.gather(1, a) 90 | ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a)) # a/b == exp(log(a)-log(b)) 91 | 92 | surr1 = ratio * advantage 93 | surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage 94 | loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s), td_target.detach()) 95 | 96 | self.optimizer.zero_grad() 97 | loss.mean().backward() 98 | self.optimizer.step() 99 | 100 | 101 | def 
main(): 102 | model = PPO() 103 | score = 0.0 104 | print_interval = 20 105 | 106 | for n_epi in range(MAX_EPISODE): 107 | s = env.reset() 108 | done = False 109 | while not done: 110 | for t in range(T_horizon): 111 | if RENDER: 112 | env.render() 113 | prob = model.pi(torch.from_numpy(s).float()) 114 | m = Categorical(prob) 115 | a = m.sample().item() 116 | s_prime, r, done, info = env.step(a) 117 | 118 | model.put_data((s, a, r / 100.0, s_prime, prob[a].item(), done)) 119 | s = s_prime 120 | 121 | score += r 122 | if done: 123 | break 124 | 125 | model.train_net() 126 | 127 | if n_epi % print_interval == 0 and n_epi != 0: 128 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score / print_interval)) 129 | score = 0.0 130 | 131 | env.close() 132 | 133 | 134 | if __name__ == '__main__': 135 | main() 136 | -------------------------------------------------------------------------------- /Prioritized_Replay_DQN/Prioritized_Replay_DQN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | 5 | np.random.seed(1) 6 | torch.manual_seed(1) 7 | 8 | 9 | class SumTree(object): 10 | """ 11 | This SumTree code is a modified version and the original code is from: 12 | https://github.com/jaara/AI-blog/blob/master/SumTree.py 13 | Story data with its priority in the tree. 14 | """ 15 | data_pointer = 0 16 | 17 | def __init__(self, capacity): 18 | self.capacity = capacity # for all priority values 19 | self.tree = np.zeros(2 * capacity - 1) 20 | # [--------------Parent nodes-------------][-------leaves to recode priority-------] 21 | # size: capacity - 1 size: capacity 22 | self.data = np.zeros(capacity, dtype=object) # for all transitions 23 | # [--------------data frame-------------] 24 | # size: capacity 25 | 26 | def add(self, p, data): 27 | tree_idx = self.data_pointer + self.capacity - 1 28 | self.data[self.data_pointer] = data # update data_frame 29 | self.update(tree_idx, p) # update tree_frame 30 | 31 | self.data_pointer += 1 32 | if self.data_pointer >= self.capacity: # replace when exceed the capacity 33 | self.data_pointer = 0 34 | 35 | def update(self, tree_idx, p): 36 | change = p - self.tree[tree_idx] 37 | self.tree[tree_idx] = p 38 | # then propagate the change through tree 39 | while tree_idx != 0: # this method is faster than the recursive loop in the reference code 40 | tree_idx = (tree_idx - 1) // 2 41 | self.tree[tree_idx] += change 42 | 43 | def get_leaf(self, v): 44 | """ 45 | Tree structure and array storage: 46 | Tree index: 47 | 0 -> storing priority sum 48 | / \ 49 | 1 2 50 | / \ / \ 51 | 3 4 5 6 -> storing priority for transitions 52 | Array type for storing: 53 | [0,1,2,3,4,5,6] 54 | """ 55 | parent_idx = 0 56 | while True: # the while loop is faster than the method in the reference code 57 | cl_idx = 2 * parent_idx + 1 # this leaf's left and right kids 58 | cr_idx = cl_idx + 1 59 | if cl_idx >= len(self.tree): # reach bottom, end search 60 | leaf_idx = parent_idx 61 | break 62 | else: # downward search, always search for a higher priority node 63 | if v <= self.tree[cl_idx]: 64 | parent_idx = cl_idx 65 | else: 66 | v -= self.tree[cl_idx] 67 | parent_idx = cr_idx 68 | 69 | data_idx = leaf_idx - self.capacity + 1 70 | return leaf_idx, self.tree[leaf_idx], self.data[data_idx] 71 | 72 | @property 73 | def total_p(self): 74 | return self.tree[0] # the root 75 | 76 | 77 | class Memory(object): # stored as ( s, a, r, s_ ) in SumTree 78 | """ 79 | This Memory class is modified 
based on the original code from: 80 | https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py 81 | """ 82 | epsilon = 0.01 # small amount to avoid zero priority 83 | alpha = 0.6 # [0~1] convert the importance of TD error to priority 84 | beta = 0.4 # importance-sampling, from initial value increasing to 1 85 | beta_increment_per_sampling = 0.001 86 | abs_err_upper = 1. # clipped abs error 87 | 88 | def __init__(self, capacity): 89 | self.tree = SumTree(capacity) 90 | 91 | def store(self, transition): 92 | max_p = np.max(self.tree.tree[-self.tree.capacity:]) 93 | if max_p == 0: 94 | max_p = self.abs_err_upper 95 | self.tree.add(max_p, transition) # set the max p for new p 96 | 97 | def sample(self, n): 98 | b_idx, b_memory, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, self.tree.data[0].size)), np.empty( 99 | (n, 1)) 100 | pri_seg = self.tree.total_p / n # priority segment 101 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) # max = 1 102 | 103 | min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p # for later calculate ISweight 104 | for i in range(n): 105 | a, b = pri_seg * i, pri_seg * (i + 1) 106 | v = np.random.uniform(a, b) 107 | idx, p, data = self.tree.get_leaf(v) 108 | prob = p / self.tree.total_p 109 | ISWeights[i, 0] = np.power(prob / min_prob, -self.beta) 110 | b_idx[i], b_memory[i, :] = idx, data 111 | return b_idx, b_memory, ISWeights 112 | 113 | def batch_update(self, tree_idx, abs_errors): 114 | abs_errors += self.epsilon # convert to abs and avoid 0 115 | clipped_errors = np.minimum(abs_errors, self.abs_err_upper) 116 | ps = np.power(clipped_errors, self.alpha) 117 | for ti, p in zip(tree_idx, ps): 118 | self.tree.update(ti, p) 119 | 120 | 121 | class DQNNet(nn.Module): 122 | def __init__(self, n_actions, n_features): 123 | super(DQNNet, self).__init__() 124 | self.out_layer = torch.nn.Sequential(nn.Linear(n_features, 10), 125 | nn.ReLU(), 126 | nn.Linear(10, n_actions)) 127 | 128 | def forward(self, x): 129 | return self.out_layer(x) 130 | 131 | 132 | class DQNPrioritizedReplay: 133 | def __init__( 134 | self, 135 | n_actions, 136 | n_features, 137 | learning_rate=0.005, 138 | reward_decay=0.9, 139 | e_greedy=0.9, 140 | replace_target_iter=500, 141 | memory_size=10000, 142 | batch_size=32, 143 | e_greedy_increment=None, 144 | ): 145 | self.memory_counter = 0 146 | self.n_actions = n_actions 147 | self.n_features = n_features 148 | self.lr = learning_rate 149 | self.gamma = reward_decay 150 | self.epsilon_max = e_greedy 151 | self.replace_target_iter = replace_target_iter 152 | self.memory_size = memory_size 153 | self.batch_size = batch_size 154 | self.epsilon_increment = e_greedy_increment 155 | self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max 156 | # total learning step 157 | self.learn_step_counter = 0 158 | 159 | # ---------------------------重要部分--------------------------- 160 | self.memory = Memory(capacity=memory_size) 161 | # ---------------------------重要部分--------------------------- 162 | 163 | self._build_net() 164 | self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), 165 | learning_rate) 166 | self.cost_his = [] 167 | 168 | def _build_net(self): 169 | self.evaluate_net = DQNNet(self.n_actions, self.n_features) 170 | self.target_net = type(self.evaluate_net)(self.n_actions, self.n_features) 171 | self.target_net.load_state_dict(self.evaluate_net.state_dict()) # copy weights and stuff 172 | 173 | def store_transition(self, s, a, r, s_): 174 | # 
--------------------------- key part: prioritized replay ---------------------------
175 |         # prioritized replay
176 |         transition = np.hstack((s, [a, r], s_))
177 |         self.memory.store(transition)  # newly arrived transitions get the current max priority
178 |         self.memory_counter += 1
179 |         # --------------------------- key part: prioritized replay ---------------------------
180 | 
181 |     def choose_action(self, observation):
182 |         state = torch.Tensor(observation[np.newaxis, :])
183 | 
184 |         if np.random.uniform() < self.epsilon:
185 |             actions_value = self.evaluate_net(state)
186 |             action = actions_value.argmax(axis=1).numpy()[0]
187 |         else:
188 |             action = np.random.randint(0, self.n_actions)
189 |         return action
190 | 
191 |     def learn(self):
192 |         # check to replace target parameters
193 |         if self.learn_step_counter % self.replace_target_iter == 0:
194 |             self.target_net.load_state_dict(self.evaluate_net.state_dict())  # copy weights and stuff
195 | 
196 |         # --------------------------- key part: prioritized replay ---------------------------
197 |         tree_idx, batch_memory, ISWeights = self.memory.sample(
198 |             self.batch_size)
199 |         # --------------------------- key part: prioritized replay ---------------------------
200 | 
201 |         s = torch.Tensor(batch_memory[:, :self.n_features])
202 |         u = torch.LongTensor(batch_memory[:, self.n_features, np.newaxis])
203 |         r = torch.Tensor(batch_memory[:, self.n_features + 1, np.newaxis])
204 |         s_ = torch.Tensor(batch_memory[:, -self.n_features:])
205 | 
206 |         q_eval = self.evaluate_net(s).gather(1, u)
207 | 
208 |         q_target_next = self.target_net(s_).detach()
209 |         q_eval_next = self.evaluate_net(s_).detach()
210 |         q_next = q_target_next.gather(1, q_eval_next.argmax(axis=1).reshape(-1, 1))
211 |         delta = r + self.gamma * q_next - q_eval
212 |         self.optimizer.zero_grad()
213 | 
214 |         # --------------------------- key part: prioritized replay ---------------------------
215 |         abs_errors = torch.sum(
216 |             torch.abs(
217 |                 self.evaluate_net(s).detach() -
218 |                 self.target_net(s).detach()), 1)  # for updating the SumTree priorities
219 |         loss = torch.mean(torch.Tensor(ISWeights) * delta ** 2)
220 |         self.memory.batch_update(tree_idx, abs_errors)  # update priority
221 |         # --------------------------- key part: prioritized replay ---------------------------
222 | 
223 |         # train eval network
224 |         loss.backward()
225 |         self.optimizer.step()
226 |         self.cost_his.append(loss.data.numpy())
227 | 
228 |         # increasing epsilon
229 |         self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
230 |         self.learn_step_counter += 1
231 | 
232 |     def plot_cost(self):
233 |         import matplotlib.pyplot as plt
234 |         plt.plot(np.arange(len(self.cost_his)), self.cost_his)
235 |         plt.ylabel('Cost')
236 |         plt.xlabel('training steps')
237 |         plt.show()
238 | 
--------------------------------------------------------------------------------
/Prioritized_Replay_DQN/run_MountainCar.py:
--------------------------------------------------------------------------------
1 | from Prioritized_Replay_DQN import DQNPrioritizedReplay
2 | from gym import make
3 | import numpy as np
4 | 
5 | MEMORY_SIZE = 10000
6 | ACTION_SPACE = 11
7 | 
8 | env = make('MountainCar-v0')
9 | env = env.unwrapped
10 | env.seed(21)
11 | 
12 | print("env.action_space :", env.action_space)
13 | print("env.observation_space :", env.observation_space)
14 | print("env.observation_space :", env.observation_space.high)
15 | print("env.observation_space :", env.observation_space.low)
16 | 
17 | RL = DQNPrioritizedReplay(env.action_space.n, env.observation_space.shape[0],
18 |                           learning_rate=0.01,
19 |                           reward_decay=0.9,
20 |                           e_greedy=0.9,
21 |                           replace_target_iter=200,
22 |
memory_size=MEMORY_SIZE, 23 | e_greedy_increment=0.001 24 | ) 25 | 26 | total_steps = 0 27 | for i in range(500): 28 | observation = env.reset() 29 | while True: 30 | env.render() 31 | 32 | action = RL.choose_action(observation) 33 | 34 | observation_, reward, done, info = env.step(action) 35 | 36 | if done: reward = 10 37 | 38 | RL.store_transition(observation, action, reward, observation_) 39 | 40 | if RL.memory_counter > RL.memory_size: 41 | RL.learn() 42 | 43 | if done: 44 | break 45 | observation = observation_ 46 | total_steps += 1 47 | 48 | env.close() 49 | RL.plot_cost() 50 | -------------------------------------------------------------------------------- /Q_Learning_maze/RL_brain.py: -------------------------------------------------------------------------------- 1 | """ 2 | This part of code is the Q learning brain, which is a brain of the agent. 3 | All decisions are made in here. 4 | """ 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | class QLearningTable: 11 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 12 | self.actions = actions # a list 13 | self.lr = learning_rate 14 | self.gamma = reward_decay 15 | self.epsilon = e_greedy 16 | self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64) 17 | 18 | def choose_action(self, observation): 19 | self.check_state_exist(observation) 20 | # action selection 21 | if np.random.uniform() < self.epsilon: 22 | # choose best action 23 | state_action = self.q_table.loc[observation, :] 24 | # some actions may have the same value, randomly choose on in these actions 25 | action = np.random.choice(state_action[state_action == np.max(state_action)].index) 26 | else: 27 | # choose random action 28 | action = np.random.choice(self.actions) 29 | return action 30 | 31 | def learn(self, s, a, r, s_): 32 | self.check_state_exist(s_) 33 | q_predict = self.q_table.loc[s, a] 34 | if s_ != 'terminal': 35 | q_target = r + self.gamma * self.q_table.loc[s_, :].max() # next state is not terminal 36 | else: 37 | q_target = r # next state is terminal 38 | self.q_table.loc[s, a] += self.lr * (q_target - q_predict) # update 39 | 40 | def check_state_exist(self, state): 41 | if state not in self.q_table.index: 42 | # append new state to q table 43 | self.q_table = self.q_table.append( 44 | pd.Series( 45 | [0] * len(self.actions), 46 | index=self.q_table.columns, 47 | name=state, 48 | ) 49 | ) 50 | -------------------------------------------------------------------------------- /Q_Learning_maze/maze_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reinforcement learning maze example. 3 | 4 | Red rectangle: explorer. 5 | Black rectangles: hells [reward = -1]. 6 | Yellow bin circle: paradise [reward = +1]. 7 | All other states: ground [reward = 0]. 8 | 9 | This script is the environment part of this example. The RL is in RL_brain.py. 
10 | """ 11 | 12 | import numpy as np 13 | import time 14 | import sys 15 | 16 | import tkinter as tk 17 | 18 | UNIT = 40 # pixels 19 | MAZE_H = 4 # grid height 20 | MAZE_W = 4 # grid width 21 | 22 | 23 | class Maze(tk.Tk, object): 24 | def __init__(self): 25 | super(Maze, self).__init__() 26 | self.action_space = ['u', 'd', 'l', 'r'] 27 | self.n_actions = len(self.action_space) 28 | self.title('maze') 29 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) 30 | self._build_maze() 31 | 32 | def _build_maze(self): 33 | self.canvas = tk.Canvas(self, bg='white', 34 | height=MAZE_H * UNIT, 35 | width=MAZE_W * UNIT) 36 | 37 | # create grids 38 | for c in range(0, MAZE_W * UNIT, UNIT): 39 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT 40 | self.canvas.create_line(x0, y0, x1, y1) 41 | for r in range(0, MAZE_H * UNIT, UNIT): 42 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r 43 | self.canvas.create_line(x0, y0, x1, y1) 44 | 45 | # create origin 46 | origin = np.array([20, 20]) 47 | 48 | # hell 49 | hell1_center = origin + np.array([UNIT * 2, UNIT]) 50 | self.hell1 = self.canvas.create_rectangle( 51 | hell1_center[0] - 15, hell1_center[1] - 15, 52 | hell1_center[0] + 15, hell1_center[1] + 15, 53 | fill='black') 54 | # hell 55 | hell2_center = origin + np.array([UNIT, UNIT * 2]) 56 | self.hell2 = self.canvas.create_rectangle( 57 | hell2_center[0] - 15, hell2_center[1] - 15, 58 | hell2_center[0] + 15, hell2_center[1] + 15, 59 | fill='black') 60 | 61 | # create oval 62 | oval_center = origin + UNIT * 2 63 | self.oval = self.canvas.create_oval( 64 | oval_center[0] - 15, oval_center[1] - 15, 65 | oval_center[0] + 15, oval_center[1] + 15, 66 | fill='yellow') 67 | 68 | # create red rect 69 | self.rect = self.canvas.create_rectangle( 70 | origin[0] - 15, origin[1] - 15, 71 | origin[0] + 15, origin[1] + 15, 72 | fill='red') 73 | 74 | # pack all 75 | self.canvas.pack() 76 | 77 | def reset(self): 78 | self.update() 79 | time.sleep(0.5) 80 | self.canvas.delete(self.rect) 81 | origin = np.array([20, 20]) 82 | self.rect = self.canvas.create_rectangle( 83 | origin[0] - 15, origin[1] - 15, 84 | origin[0] + 15, origin[1] + 15, 85 | fill='red') 86 | # return observation 87 | return self.canvas.coords(self.rect) 88 | 89 | def step(self, action): 90 | s = self.canvas.coords(self.rect) 91 | base_action = np.array([0, 0]) 92 | if action == 0: # up 93 | if s[1] > UNIT: 94 | base_action[1] -= UNIT 95 | elif action == 1: # down 96 | if s[1] < (MAZE_H - 1) * UNIT: 97 | base_action[1] += UNIT 98 | elif action == 2: # right 99 | if s[0] < (MAZE_W - 1) * UNIT: 100 | base_action[0] += UNIT 101 | elif action == 3: # left 102 | if s[0] > UNIT: 103 | base_action[0] -= UNIT 104 | 105 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent 106 | 107 | s_ = self.canvas.coords(self.rect) # next state 108 | 109 | # reward function 110 | if s_ == self.canvas.coords(self.oval): 111 | reward = 1 112 | done = True 113 | s_ = 'terminal' 114 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: 115 | reward = -1 116 | done = True 117 | s_ = 'terminal' 118 | else: 119 | reward = 0 120 | done = False 121 | 122 | return s_, reward, done 123 | 124 | def render(self): 125 | time.sleep(0.1) 126 | self.update() 127 | 128 | 129 | def update(): 130 | for t in range(10): 131 | s = env.reset() 132 | while True: 133 | env.render() 134 | a = 1 135 | s, r, done = env.step(a) 136 | if done: 137 | break 138 | 139 | 140 | if __name__ == '__main__': 141 | env = Maze() 142 | env.after(100, update) 143 | 
    env.mainloop()
144 | 
--------------------------------------------------------------------------------
/Q_Learning_maze/run_q_function.py:
--------------------------------------------------------------------------------
1 | """
2 | Reinforcement learning maze example.
3 | 
4 | Red rectangle: explorer.
5 | Black rectangles: hells [reward = -1].
6 | Yellow bin circle: paradise [reward = +1].
7 | All other states: ground [reward = 0].
8 | 
9 | This script is the main part which controls the update method of this example.
10 | The RL is in RL_brain.py.
11 | """
12 | 
13 | from maze_env import Maze
14 | from RL_brain import QLearningTable
15 | 
16 | 
17 | def update():
18 |     for episode in range(100):
19 |         # initial observation
20 |         observation = env.reset()
21 | 
22 |         while True:
23 |             # fresh env
24 |             env.render()
25 | 
26 |             # RL choose action based on observation
27 |             action = RL.choose_action(str(observation))
28 | 
29 |             # RL take action and get next observation and reward
30 |             observation_, reward, done = env.step(action)
31 | 
32 |             # RL learn from this transition
33 |             RL.learn(str(observation), action, reward, str(observation_))
34 | 
35 |             # swap observation
36 |             observation = observation_
37 | 
38 |             # break while loop when end of this episode
39 |             if done:
40 |                 break
41 | 
42 |     # end of game
43 |     print('game over')
44 |     env.destroy()
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     env = Maze()
49 |     actions = list(range(env.n_actions))
50 |     print(actions)
51 |     RL = QLearningTable(actions=list(range(env.n_actions)))
52 | 
53 |     env.after(100, update)
54 |     env.mainloop()
55 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning with PyTorch
2 | PyTorch implementations of classic reinforcement learning algorithms
3 | 
4 | ## Requirements
5 | 
6 | 1. PyTorch
7 | 2. OpenAI Gym
8 | 
9 | ## Implemented algorithms
10 | 
11 | 
12 | 
13 | ## References
14 | 1. https://github.com/seungeunrho/minimalRL
--------------------------------------------------------------------------------
/REINFORCE/REINFORCE.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import torch.optim as optim
6 | from torch.distributions import Categorical
7 | 
8 | # Hyperparameters
9 | gamma = 0.98
10 | learning_rate = 0.0002
11 | MAX_EPISODE = 10000
12 | RENDER = True
13 | 
14 | env = gym.make('CartPole-v1')
15 | env = env.unwrapped
16 | env.seed(1)
17 | torch.manual_seed(1)
18 | 
19 | print("env.action_space :", env.action_space)
20 | print("env.observation_space :", env.observation_space)
21 | 
22 | n_features = env.observation_space.shape[0]
23 | n_actions = env.action_space.n
24 | 
25 | 
26 | class Policy(nn.Module):
27 |     def __init__(self):
28 |         super(Policy, self).__init__()
29 |         self.episode = []
30 | 
31 |         hidden_units = 10
32 |         self.fc_layer = nn.Sequential(nn.Linear(n_features, hidden_units),
33 |                                       nn.Sigmoid(),
34 |                                       nn.Linear(hidden_units, n_actions),
35 |                                       nn.Softmax(dim=-1))
36 |         self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
37 | 
38 |     def forward(self, x):
39 |         x = self.fc_layer(x)
40 |         return x
41 | 
42 |     def put_data(self, item):
43 |         self.episode.append(item)
44 | 
45 |     def train_net(self):
46 |         reward = 0
47 |         self.optimizer.zero_grad()
48 |         for r, prob in self.episode[::-1]:
49 |             reward = r + gamma * reward
50 |             loss = -torch.log(prob) * reward
51 |             loss.backward()
52 | 
53 |         self.optimizer.step()
54 | 
55 |         self.episode = []
56 | 
57 |     def choose_action(self, observation):
58 |         prob_weights = self.forward(torch.from_numpy(observation).float())
59 |         m = Categorical(prob_weights)
60 |         action_idx = m.sample()
61 |         return action_idx, prob_weights
62 | 
63 | 
64 | def main():
65 |     policy = Policy()
66 |     score = 0.0
67 |     print_interval = 20
68 | 
69 |     for n_epi in range(MAX_EPISODE):
70 |         s = env.reset()
71 |         done = False
72 | 
73 |         while not done:  # CartPole-v1 normally caps episodes at 500 steps; env.unwrapped removes that limit
74 | if RENDER: 75 | env.render() 76 | action, prob_weights = policy.choose_action(s) 77 | s_, r, done, info = env.step(action.item()) 78 | policy.put_data((r, prob_weights[action])) 79 | s = s_ 80 | score += r 81 | 82 | policy.train_net() 83 | 84 | if n_epi % print_interval == 0 and n_epi != 0: 85 | print("# of episode :{}, avg score : {}".format(n_epi, score / print_interval)) 86 | score = 0.0 87 | env.close() 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /SAC/SAC.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Normal 7 | import numpy as np 8 | import collections, random 9 | 10 | # Hyperparameters 11 | lr_pi = 0.0005 12 | lr_q = 0.001 13 | init_alpha = 0.01 14 | gamma = 0.98 15 | batch_size = 32 16 | buffer_limit = 50000 17 | tau = 0.01 # for target network soft update 18 | target_entropy = -1.0 # for automated alpha update 19 | lr_alpha = 0.001 # for automated alpha update 20 | 21 | 22 | class ReplayBuffer(): 23 | def __init__(self): 24 | self.buffer = collections.deque(maxlen=buffer_limit) 25 | 26 | def put(self, transition): 27 | self.buffer.append(transition) 28 | 29 | def sample(self, n): 30 | mini_batch = random.sample(self.buffer, n) 31 | s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], [] 32 | 33 | for transition in mini_batch: 34 | s, a, r, s_prime, done = transition 35 | s_lst.append(s) 36 | a_lst.append([a]) 37 | r_lst.append([r]) 38 | s_prime_lst.append(s_prime) 39 | done_mask = 0.0 if done else 1.0 40 | done_mask_lst.append([done_mask]) 41 | 42 | return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst, dtype=torch.float), \ 43 | torch.tensor(r_lst, dtype=torch.float), torch.tensor(s_prime_lst, dtype=torch.float), \ 44 | torch.tensor(done_mask_lst, dtype=torch.float) 45 | 46 | def size(self): 47 | return len(self.buffer) 48 | 49 | 50 | class PolicyNet(nn.Module): 51 | def __init__(self, learning_rate): 52 | super(PolicyNet, self).__init__() 53 | self.fc1 = nn.Linear(3, 128) 54 | self.fc_mu = nn.Linear(128, 1) 55 | self.fc_std = nn.Linear(128, 1) 56 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 57 | 58 | self.log_alpha = torch.tensor(np.log(init_alpha)) 59 | self.log_alpha.requires_grad = True 60 | self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr=lr_alpha) 61 | 62 | def forward(self, x): 63 | x = F.relu(self.fc1(x)) 64 | mu = self.fc_mu(x) 65 | std = F.softplus(self.fc_std(x)) 66 | dist = Normal(mu, std) 67 | action = dist.rsample() 68 | log_prob = dist.log_prob(action) 69 | real_action = torch.tanh(action) 70 | real_log_prob = log_prob - torch.log(1 - torch.tanh(action).pow(2) + 1e-7) 71 | return real_action, real_log_prob 72 | 73 | def train_net(self, q1, q2, mini_batch): 74 | s, _, _, _, _ = mini_batch 75 | a, log_prob = self.forward(s) 76 | entropy = -self.log_alpha.exp() * log_prob 77 | 78 | q1_val, q2_val = q1(s, a), q2(s, a) 79 | q1_q2 = torch.cat([q1_val, q2_val], dim=1) 80 | min_q = torch.min(q1_q2, 1, keepdim=True)[0] 81 | 82 | loss = -min_q - entropy # for gradient ascent 83 | self.optimizer.zero_grad() 84 | loss.mean().backward() 85 | self.optimizer.step() 86 | 87 | self.log_alpha_optimizer.zero_grad() 88 | alpha_loss = -(self.log_alpha.exp() * (log_prob + target_entropy).detach()).mean() 89 | alpha_loss.backward() 90 | 
self.log_alpha_optimizer.step() 91 | 92 | 93 | class QNet(nn.Module): 94 | def __init__(self, learning_rate): 95 | super(QNet, self).__init__() 96 | self.fc_s = nn.Linear(3, 64) 97 | self.fc_a = nn.Linear(1, 64) 98 | self.fc_cat = nn.Linear(128, 32) 99 | self.fc_out = nn.Linear(32, 1) 100 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 101 | 102 | def forward(self, x, a): 103 | h1 = F.relu(self.fc_s(x)) 104 | h2 = F.relu(self.fc_a(a)) 105 | cat = torch.cat([h1, h2], dim=1) 106 | q = F.relu(self.fc_cat(cat)) 107 | q = self.fc_out(q) 108 | return q 109 | 110 | def train_net(self, target, mini_batch): 111 | s, a, r, s_prime, done = mini_batch 112 | loss = F.smooth_l1_loss(self.forward(s, a), target) 113 | self.optimizer.zero_grad() 114 | loss.mean().backward() 115 | self.optimizer.step() 116 | 117 | def soft_update(self, net_target): 118 | for param_target, param in zip(net_target.parameters(), self.parameters()): 119 | param_target.data.copy_(param_target.data * (1.0 - tau) + param.data * tau) 120 | 121 | 122 | def calc_target(pi, q1, q2, mini_batch): 123 | s, a, r, s_prime, done = mini_batch 124 | 125 | with torch.no_grad(): 126 | a_prime, log_prob = pi(s_prime) 127 | entropy = -pi.log_alpha.exp() * log_prob 128 | q1_val, q2_val = q1(s_prime, a_prime), q2(s_prime, a_prime) 129 | q1_q2 = torch.cat([q1_val, q2_val], dim=1) 130 | min_q = torch.min(q1_q2, 1, keepdim=True)[0] 131 | target = r + gamma * done * (min_q + entropy) 132 | 133 | return target 134 | 135 | 136 | def main(): 137 | env = gym.make('Pendulum-v1') 138 | memory = ReplayBuffer() 139 | q1, q2, q1_target, q2_target = QNet(lr_q), QNet(lr_q), QNet(lr_q), QNet(lr_q) 140 | pi = PolicyNet(lr_pi) 141 | 142 | q1_target.load_state_dict(q1.state_dict()) 143 | q2_target.load_state_dict(q2.state_dict()) 144 | 145 | score = 0.0 146 | print_interval = 20 147 | 148 | for n_epi in range(10000): 149 | s = env.reset() 150 | 151 | done = False 152 | 153 | while not done: 154 | env.render() 155 | a, log_prob = pi(torch.from_numpy(s).float()) 156 | s_prime, r, done, info = env.step([2.0 * a.item()]) 157 | memory.put((s, a.item(), r / 10.0, s_prime, done)) 158 | score += r 159 | s = s_prime 160 | 161 | if memory.size() > 1000: 162 | for i in range(20): 163 | mini_batch = memory.sample(batch_size) 164 | td_target = calc_target(pi, q1_target, q2_target, mini_batch) 165 | q1.train_net(td_target, mini_batch) 166 | q2.train_net(td_target, mini_batch) 167 | entropy = pi.train_net(q1, q2, mini_batch) 168 | q1.soft_update(q1_target) 169 | q2.soft_update(q2_target) 170 | 171 | if n_epi % print_interval == 0 and n_epi != 0: 172 | print("# of episode :{}, avg score : {:.1f} alpha:{:.4f}".format(n_epi, score / print_interval, 173 | pi.log_alpha.exp())) 174 | score = 0.0 175 | 176 | env.close() 177 | 178 | 179 | if __name__ == '__main__': 180 | main() 181 | --------------------------------------------------------------------------------
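For reference, here is a minimal standalone sketch of how the prioritized replay buffer in `Noise_DQN/replay_buffer.py` is driven, i.e. the same push / sample(beta) / update_priorities cycle that `compute_td_loss` in `Noise_DQN/Noise_DQN.py` relies on. It is not part of the repository: the import assumes the script sits next to `replay_buffer.py`, and the random CartPole-shaped transitions and fake TD errors exist purely for illustration.

```python
import numpy as np
from replay_buffer import PrioritizedReplayBuffer  # assumed to be Noise_DQN/replay_buffer.py on the path

# The segment trees inside round the capacity up to a power of two.
buffer = PrioritizedReplayBuffer(1024, alpha=0.6)

# Fill the buffer with random CartPole-like transitions (4-dim states, 2 actions).
for _ in range(200):
    s = np.random.randn(4).astype(np.float32)
    s_next = np.random.randn(4).astype(np.float32)
    a = np.random.randint(2)
    r = float(np.random.rand())
    done = bool(np.random.rand() < 0.05)
    buffer.push(s, a, r, s_next, done)

# Sample proportionally to priority; beta controls the importance-sampling correction.
obs, act, rew, next_obs, done_mask, weights, idxes = buffer.sample(32, beta=0.4)
print(obs.shape, weights.max())  # weights are normalised so they never exceed 1

# After a learning step, the sampled transitions receive new priorities,
# e.g. |TD error| + eps, mirroring the `prios` update in compute_td_loss.
fake_td_errors = np.abs(np.random.randn(32)) + 1e-5
buffer.update_priorities(idxes, fake_td_errors)
```

During actual training, beta is annealed toward 1 over time (as `beta_by_frame` does in `Noise_DQN.py`), so the importance-sampling correction reaches full strength late in training.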