├── .gitattributes ├── .gitignore ├── A2C ├── A2C.py ├── AC_CartPole.py └── AC_continue_Pendulum.py ├── A3C └── A3C.py ├── Actor_Critic └── Actor_Critic.py ├── D3QN └── D3QN.py ├── DDPG └── DDPG.py ├── DQN └── DQN.py ├── Double_DQN └── DDQN.py ├── Dueling_DQN └── Dueling_DQN.py ├── LICENSE ├── Noise_DQN ├── Noise_DQN.py └── replay_buffer.py ├── PPO └── PPO.py ├── Prioritized_Replay_DQN ├── Prioritized_Replay_DQN.py └── run_MountainCar.py ├── Q_Learning_maze ├── RL_brain.py ├── maze_env.py └── run_q_function.py ├── README.md ├── REINFORCE └── REINFORCE.py └── SAC └── SAC.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | ./__pycache__ -------------------------------------------------------------------------------- /A2C/A2C.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | import torch.multiprocessing as mp 8 | import numpy as np 9 | 10 | # Hyperparameters 11 | n_train_processes = 3 12 | learning_rate = 0.0002 13 | update_interval = 5 14 | gamma = 0.98 15 | max_train_steps = 60000 16 | PRINT_INTERVAL = update_interval * 100 17 | 18 | 19 | class ActorCritic(nn.Module): 20 | def __init__(self): 21 | super(ActorCritic, self).__init__() 22 | self.fc1 = nn.Linear(4, 256) 23 | self.fc_pi = nn.Linear(256, 2) 24 | self.fc_v = nn.Linear(256, 1) 25 | 26 | def pi(self, x, softmax_dim=1): 27 | x = F.relu(self.fc1(x)) 28 | x = self.fc_pi(x) 29 | prob = F.softmax(x, dim=softmax_dim) 30 | return prob 31 | 32 | def v(self, x): 33 | x = F.relu(self.fc1(x)) 34 | v = self.fc_v(x) 35 | return v 36 | 37 | 38 | def worker(worker_id, master_end, worker_end): 39 | master_end.close() # Forbid worker to use the master end for messaging 40 | env = gym.make('CartPole-v1') 41 | env.seed(worker_id) 42 | 43 | while True: 44 | cmd, data = worker_end.recv() 45 | if cmd == 'step': 46 | ob, reward, done, info = env.step(data) 47 | if done: 48 | ob = env.reset() 49 | worker_end.send((ob, reward, done, info)) 50 | elif cmd == 'reset': 51 | ob = env.reset() 52 | worker_end.send(ob) 53 | elif cmd == 'reset_task': 54 | ob = env.reset_task() 55 | worker_end.send(ob) 56 | elif cmd == 'close': 57 | worker_end.close() 58 | break 59 | elif cmd == 'get_spaces': 60 | worker_end.send((env.observation_space, env.action_space)) 61 | else: 62 | raise NotImplementedError 63 | 64 | 65 | class ParallelEnv: 66 | def __init__(self, n_train_processes): 67 | self.nenvs = n_train_processes 68 | self.waiting = False 69 | self.closed = False 70 | self.workers = list() 71 | 72 | master_ends, worker_ends = zip(*[mp.Pipe() for _ in range(self.nenvs)]) 73 | self.master_ends, self.worker_ends = master_ends, worker_ends 74 | 75 | for worker_id, (master_end, worker_end) in enumerate(zip(master_ends, worker_ends)): 76 | p = mp.Process(target=worker, 77 | args=(worker_id, master_end, worker_end)) 78 | p.daemon = True 79 | p.start() 80 | self.workers.append(p) 81 | 82 | # Forbid master to use the worker end for messaging 83 | for worker_end in worker_ends: 84 | worker_end.close() 85 | 86 | def step_async(self, actions): 87 | 
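        # step_async()/step_wait() form a minimal vectorized-env API over pipes:
        # the master end sends a ('step', action) message to every worker process,
        # and step_wait() then blocks on recv() to gather (obs, reward, done, info)
        # from each worker, stacking the results into batched numpy arrays.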
for master_end, action in zip(self.master_ends, actions): 88 | master_end.send(('step', action)) 89 | self.waiting = True 90 | 91 | def step_wait(self): 92 | results = [master_end.recv() for master_end in self.master_ends] 93 | self.waiting = False 94 | obs, rews, dones, infos = zip(*results) 95 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 96 | 97 | def reset(self): 98 | for master_end in self.master_ends: 99 | master_end.send(('reset', None)) 100 | return np.stack([master_end.recv() for master_end in self.master_ends]) 101 | 102 | def step(self, actions): 103 | self.step_async(actions) 104 | return self.step_wait() 105 | 106 | def close(self): # For clean up resources 107 | if self.closed: 108 | return 109 | if self.waiting: 110 | [master_end.recv() for master_end in self.master_ends] 111 | for master_end in self.master_ends: 112 | master_end.send(('close', None)) 113 | for worker in self.workers: 114 | worker.join() 115 | self.closed = True 116 | 117 | 118 | def test(step_idx, model): 119 | env = gym.make('CartPole-v1') 120 | score = 0.0 121 | done = False 122 | num_test = 10 123 | 124 | for _ in range(num_test): 125 | s = env.reset() 126 | while not done: 127 | prob = model.pi(torch.from_numpy(s).float(), softmax_dim=0) 128 | a = Categorical(prob).sample().numpy() 129 | s_prime, r, done, info = env.step(a) 130 | s = s_prime 131 | score += r 132 | done = False 133 | print(f"Step # :{step_idx}, avg score : {score / num_test:.1f}") 134 | 135 | env.close() 136 | 137 | 138 | def compute_target(v_final, r_lst, mask_lst): 139 | G = v_final.reshape(-1) 140 | td_target = list() 141 | 142 | for r, mask in zip(r_lst[::-1], mask_lst[::-1]): 143 | G = r + gamma * G * mask 144 | td_target.append(G) 145 | 146 | return torch.tensor(td_target[::-1]).float() 147 | 148 | 149 | if __name__ == '__main__': 150 | envs = ParallelEnv(n_train_processes) 151 | 152 | model = ActorCritic() 153 | optimizer = optim.Adam(model.parameters(), lr=learning_rate) 154 | 155 | step_idx = 0 156 | s = envs.reset() 157 | while step_idx < max_train_steps: 158 | s_lst, a_lst, r_lst, mask_lst = list(), list(), list(), list() 159 | for _ in range(update_interval): 160 | prob = model.pi(torch.from_numpy(s).float()) 161 | a = Categorical(prob).sample().numpy() 162 | s_prime, r, done, info = envs.step(a) 163 | 164 | s_lst.append(s) 165 | a_lst.append(a) 166 | r_lst.append(r / 100.0) 167 | mask_lst.append(1 - done) 168 | 169 | s = s_prime 170 | step_idx += 1 171 | 172 | s_final = torch.from_numpy(s_prime).float() 173 | v_final = model.v(s_final).detach().clone().numpy() 174 | td_target = compute_target(v_final, r_lst, mask_lst) 175 | 176 | td_target_vec = td_target.reshape(-1) 177 | s_vec = torch.tensor(s_lst).float().reshape(-1, 4) # 4 == Dimension of state 178 | a_vec = torch.tensor(a_lst).reshape(-1).unsqueeze(1) 179 | advantage = td_target_vec - model.v(s_vec).reshape(-1) 180 | 181 | pi = model.pi(s_vec, softmax_dim=1) 182 | pi_a = pi.gather(1, a_vec).reshape(-1) 183 | loss = -(torch.log(pi_a) * advantage.detach()).mean() + \ 184 | F.smooth_l1_loss(model.v(s_vec).reshape(-1), td_target_vec) 185 | 186 | optimizer.zero_grad() 187 | loss.backward() 188 | optimizer.step() 189 | 190 | if step_idx % PRINT_INTERVAL == 0: 191 | test(step_idx, model) 192 | 193 | envs.close() 194 | -------------------------------------------------------------------------------- /A2C/AC_CartPole.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import 
nn 4 | import torch.nn.functional as F 5 | from gym import make 6 | 7 | np.random.seed(1) 8 | torch.manual_seed(1) 9 | 10 | # Superparameters 11 | MAX_EPISODE = 3000 12 | DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater then this threshold 13 | MAX_EP_STEPS = 1000 # maximum time step in one episode 14 | RENDER = True # rendering wastes time 15 | GAMMA = 0.9 # reward discount in TD error 16 | LR_A = 0.001 # learning rate for actor 17 | LR_C = 0.01 # learning rate for critic 18 | 19 | env = make('CartPole-v1') 20 | env.seed(1) 21 | env = env.unwrapped 22 | 23 | print("env.action_space :", env.action_space) 24 | print("env.observation_space :", env.observation_space) 25 | print("env.observation_space.high :", env.observation_space.high) 26 | print("env.observation_space.low :", env.observation_space.low) 27 | 28 | N_F = env.observation_space.shape[0] 29 | N_A = env.action_space.n 30 | 31 | 32 | class PolicyNet(nn.Module): 33 | def __init__(self, n_actions, n_features): 34 | super(PolicyNet, self).__init__() 35 | hidden_units = 20 36 | self.fc_layer = nn.Sequential(nn.Linear(n_features, hidden_units), 37 | nn.Sigmoid(), 38 | nn.Linear(hidden_units, n_actions), 39 | nn.Softmax(dim=-1)) 40 | 41 | def forward(self, x): 42 | output = self.fc_layer(x) 43 | return output 44 | 45 | 46 | class CriticNet(nn.Module): 47 | def __init__(self, n_features): 48 | super(CriticNet, self).__init__() 49 | hidden_units = 20 50 | self.fc_layer = nn.Sequential(nn.Linear(n_features, hidden_units), 51 | nn.ReLU(), 52 | nn.Linear(hidden_units, 1)) 53 | 54 | def forward(self, x): 55 | output = self.fc_layer(x) 56 | return output 57 | 58 | 59 | class Actor(object): 60 | def __init__(self, n_features, n_actions, lr=0.001): 61 | self.actor_net = PolicyNet(n_actions, n_features) 62 | self.n_features = n_features 63 | self.n_actions = n_actions 64 | self.lr = lr 65 | 66 | self.optimizer = torch.optim.Adam(self.actor_net.parameters(), 67 | self.lr) 68 | self.cost_his = [] 69 | 70 | def learn(self, s, a, td): 71 | state = torch.Tensor(s[np.newaxis, :]) 72 | torch_acts = torch.as_tensor(a) 73 | torch_acts_one_hot = F.one_hot(torch_acts, num_classes=self.n_actions) 74 | torch_td_error = torch.Tensor(td).reshape(-1, 1).detach() 75 | all_act_prob = self.actor_net(state) 76 | 77 | exp_v = torch.log(all_act_prob) * torch_acts_one_hot * torch_td_error 78 | loss = torch.mean(-exp_v) 79 | self.optimizer.zero_grad() 80 | loss.backward() 81 | self.optimizer.step() 82 | self.cost_his.append(loss.data.numpy()) 83 | return exp_v 84 | 85 | def choose_action(self, observation): 86 | state = torch.Tensor(observation[np.newaxis, :]) 87 | prob_weights = self.actor_net(state) 88 | action_idx = prob_weights.reshape(-1, ).multinomial(num_samples=1).numpy()[0] 89 | return action_idx 90 | 91 | 92 | class Critic(object): 93 | def __init__(self, n_features, lr=0.01): 94 | self.critic_net = CriticNet(n_features) 95 | self.n_features = n_features 96 | self.lr = lr 97 | self.optimizer = torch.optim.Adam(self.critic_net.parameters(), 98 | self.lr) 99 | self.cost_his = [] 100 | self.loss_function = torch.nn.MSELoss() 101 | 102 | def learn(self, s, r, s_): 103 | s, s_ = torch.Tensor(s[np.newaxis, :]), torch.Tensor(s_[np.newaxis, :]) 104 | 105 | v = self.critic_net(s) 106 | v_ = self.critic_net(s_).detach() 107 | td_error = r + GAMMA * v_ - v 108 | loss = self.loss_function(v, r + GAMMA * v_) 109 | # loss = torch.mean(torch.square(td_error)) 110 | 111 | self.optimizer.zero_grad() 112 | loss.backward() 113 | 
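        # The critic is regressed toward the one-step target r + GAMMA * V(s_);
        # the (detached) TD error td_error = r + GAMMA * V(s_) - V(s) returned
        # below is what the Actor reuses as its advantage signal in learn().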
self.optimizer.step() 114 | self.cost_his.append(loss.data.numpy()) 115 | 116 | return td_error 117 | 118 | 119 | actor = Actor(n_features=N_F, n_actions=N_A, lr=LR_A) 120 | critic = Critic(n_features=N_F, 121 | lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor 122 | 123 | for i_episode in range(MAX_EPISODE): 124 | s = env.reset() 125 | t = 0 126 | track_r = [] 127 | while True: 128 | if RENDER: 129 | env.render() 130 | 131 | a = actor.choose_action(s) 132 | 133 | s_, r, done, info = env.step(a) 134 | 135 | if done: 136 | r = -20 137 | 138 | track_r.append(r) 139 | 140 | td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] 141 | actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error] 142 | 143 | s = s_ 144 | t += 1 145 | 146 | if done or t >= MAX_EP_STEPS: 147 | ep_rs_sum = sum(track_r) 148 | 149 | if 'running_reward' not in globals(): 150 | running_reward = ep_rs_sum 151 | else: 152 | running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 153 | if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering 154 | print("episode:", i_episode, " reward:", int(running_reward)) 155 | break 156 | -------------------------------------------------------------------------------- /A2C/AC_continue_Pendulum.py: -------------------------------------------------------------------------------- 1 | # import numpy as np 2 | # import torch 3 | # from torch import nn 4 | # import torch.nn.functional as F 5 | # from gym import make 6 | # 7 | # np.random.seed(1) 8 | # torch.manual_seed(1) 9 | # 10 | # 11 | # class PolicyNet(nn.Module): 12 | # def __init__(self, n_features): 13 | # super(PolicyNet, self).__init__() 14 | # hidden_units = 30 15 | # self.feature_layer = nn.Sequential(nn.Linear(n_features, hidden_units), 16 | # nn.ReLU()) 17 | # self.mu_layer = nn.Linear(hidden_units, 1) 18 | # self.sigma_layer = nn.Sequential(nn.Linear(hidden_units, 1), 19 | # nn.ReLU()) 20 | # 21 | # def forward(self, x): 22 | # feature = self.feature_layer(x) 23 | # mu = self.mu_layer(feature) 24 | # sigma = self.sigma_layer(feature) 25 | # return mu, sigma 26 | # 27 | # 28 | # class CriticNet(nn.Module): 29 | # def __init__(self, n_features): 30 | # super(CriticNet, self).__init__() 31 | # hidden_units = 20 32 | # self.fc_layer = nn.Sequential(nn.Linear(n_features, hidden_units), 33 | # nn.ReLU(), 34 | # nn.Linear(hidden_units, 1)) 35 | # 36 | # def forward(self, x): 37 | # output = self.fc_layer(x) 38 | # return output 39 | # 40 | # 41 | # class Actor(object): 42 | # def __init__(self, n_features, action_bound, lr=0.0001): 43 | # self.actor_net = PolicyNet(n_features) 44 | # self.n_features = n_features 45 | # self.action_bound = action_bound 46 | # self.lr = lr 47 | # 48 | # self.optimizer = torch.optim.Adam(self.actor_net.parameters(), 49 | # self.lr) 50 | # 51 | # def learn(self, s, a, td): 52 | # state = torch.Tensor(s[np.newaxis, :]) 53 | # torch_acts = torch.as_tensor(a) 54 | # torch_td_error = torch.Tensor(td).reshape(-1, 1).detach() 55 | # 56 | # mu, sigma = self.actor_net(state) 57 | # mu, sigma = torch.squeeze(mu * 2), torch.squeeze(sigma + 0.001) 58 | # normal_dist = torch.distributions.Normal(mu, sigma) 59 | # 60 | # log_prob = normal_dist.log_prob(torch_acts) 61 | # exp_v = log_prob * torch_td_error 62 | # exp_v += 0.01 * normal_dist.entropy() 63 | # 64 | # loss = torch.mean(-exp_v) 65 | # self.optimizer.zero_grad() 66 | # 67 | # loss.backward() 68 | # self.optimizer.step() 69 | # return exp_v 70 | # 71 | # def 
choose_action(self, s): 72 | # state = torch.Tensor(s[np.newaxis, :]) 73 | # mu, sigma = self.actor_net(state) 74 | # mu, sigma = torch.squeeze(mu * 2), torch.squeeze(sigma + 0.001) 75 | # 76 | # normal_dist = torch.distributions.Normal(mu, sigma) 77 | # action = torch.clamp(normal_dist.sample(), torch.Tensor(self.action_bound[0]), 78 | # torch.Tensor(self.action_bound[1])) 79 | # 80 | # return action 81 | # 82 | # 83 | # class Critic(object): 84 | # def __init__(self, n_features, lr=0.01): 85 | # self.critic_net = CriticNet(n_features) 86 | # self.n_features = n_features 87 | # self.lr = lr 88 | # self.optimizer = torch.optim.Adam(self.critic_net.parameters(), 89 | # self.lr) 90 | # 91 | # self.loss_function = torch.nn.MSELoss() 92 | # 93 | # def learn(self, s, r, s_): 94 | # s, s_ = torch.Tensor(s[np.newaxis, :]), torch.Tensor(s_[np.newaxis, :]) 95 | # 96 | # v = self.critic_net(s) 97 | # v_ = self.critic_net(s_).detach() 98 | # td_error = r + GAMMA * v_ - v 99 | # # loss = self.loss_function(v, r + GAMMA * v_) 100 | # loss = torch.mean(torch.square(td_error)) 101 | # 102 | # self.optimizer.zero_grad() 103 | # loss.backward() 104 | # self.optimizer.step() 105 | # 106 | # return td_error 107 | # 108 | # 109 | # MAX_EPISODE = 1000 110 | # MAX_EP_STEPS = 200 111 | # DISPLAY_REWARD_THRESHOLD = -100 # renders environment if total episode reward is greater then this threshold 112 | # RENDER = False # rendering wastes time 113 | # GAMMA = 0.9 114 | # LR_A = 0.001 # learning rate for actor 115 | # LR_C = 0.01 # learning rate for critic 116 | # 117 | # env = make('Pendulum-v1') 118 | # env.seed(1) # reproducible 119 | # env = env.unwrapped 120 | # 121 | # N_S = env.observation_space.shape[0] 122 | # A_BOUND = env.action_space.high 123 | # 124 | # actor = Actor(n_features=N_S, lr=LR_A, action_bound=[-A_BOUND, A_BOUND]) 125 | # critic = Critic(n_features=N_S, lr=LR_C) 126 | # 127 | # for i_episode in range(MAX_EPISODE): 128 | # s = env.reset() 129 | # t = 0 130 | # ep_rs = [] 131 | # while True: 132 | # # if RENDER: 133 | # env.render() 134 | # a = actor.choose_action(s) 135 | # 136 | # s_, r, done, info = env.step(a) 137 | # r /= 10 138 | # 139 | # td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] 140 | # actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error] 141 | # 142 | # s = s_ 143 | # t += 1 144 | # ep_rs.append(r) 145 | # if t > MAX_EP_STEPS: 146 | # ep_rs_sum = sum(ep_rs) 147 | # if 'running_reward' not in globals(): 148 | # running_reward = ep_rs_sum 149 | # else: 150 | # running_reward = running_reward * 0.9 + ep_rs_sum * 0.1 151 | # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering 152 | # print("episode:", i_episode, " reward:", int(running_reward)) 153 | # break 154 | -------------------------------------------------------------------------------- /A3C/A3C.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | import torch.multiprocessing as mp 8 | import time 9 | 10 | # Hyperparameters 11 | n_train_processes = 3 12 | learning_rate = 0.0002 13 | update_interval = 5 14 | gamma = 0.98 15 | max_train_ep = 300 16 | max_test_ep = 400 17 | 18 | 19 | # Asynchronous Advantage Actor-Critic(A3C) 20 | class ActorCritic(nn.Module): 21 | def __init__(self): 22 | super(ActorCritic, self).__init__() 23 | self.fc1 = nn.Linear(4, 
256) 24 | self.fc_pi = nn.Linear(256, 2) 25 | self.fc_v = nn.Linear(256, 1) 26 | 27 | def pi(self, x, softmax_dim=0): 28 | x = F.relu(self.fc1(x)) 29 | x = self.fc_pi(x) 30 | prob = F.softmax(x, dim=softmax_dim) 31 | return prob 32 | 33 | def v(self, x): 34 | x = F.relu(self.fc1(x)) 35 | v = self.fc_v(x) 36 | return v 37 | 38 | 39 | def train(global_model, rank): 40 | local_model = ActorCritic() 41 | local_model.load_state_dict(global_model.state_dict()) 42 | 43 | optimizer = optim.Adam(global_model.parameters(), lr=learning_rate) 44 | 45 | env = gym.make('CartPole-v1') 46 | 47 | for n_epi in range(max_train_ep): 48 | done = False 49 | s = env.reset() 50 | while not done: 51 | s_lst, a_lst, r_lst = [], [], [] 52 | for t in range(update_interval): 53 | prob = local_model.pi(torch.from_numpy(s).float()) 54 | m = Categorical(prob) 55 | a = m.sample().item() 56 | s_prime, r, done, info = env.step(a) 57 | 58 | s_lst.append(s) 59 | a_lst.append([a]) 60 | r_lst.append(r / 100.0) 61 | 62 | s = s_prime 63 | if done: 64 | break 65 | 66 | s_final = torch.tensor(s_prime, dtype=torch.float) 67 | R = 0.0 if done else local_model.v(s_final).item() 68 | td_target_lst = [] 69 | for reward in r_lst[::-1]: 70 | R = gamma * R + reward 71 | td_target_lst.append([R]) 72 | td_target_lst.reverse() 73 | 74 | s_batch, a_batch, td_target = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 75 | torch.tensor(td_target_lst) 76 | advantage = td_target - local_model.v(s_batch) 77 | 78 | pi = local_model.pi(s_batch, softmax_dim=1) 79 | pi_a = pi.gather(1, a_batch) 80 | loss = -torch.log(pi_a) * advantage.detach() + \ 81 | F.smooth_l1_loss(local_model.v(s_batch), td_target.detach()) 82 | 83 | optimizer.zero_grad() 84 | loss.mean().backward() 85 | for global_param, local_param in zip(global_model.parameters(), local_model.parameters()): 86 | global_param._grad = local_param.grad 87 | optimizer.step() 88 | local_model.load_state_dict(global_model.state_dict()) 89 | 90 | env.close() 91 | print("Training process {} reached maximum episode.".format(rank)) 92 | 93 | 94 | def test(global_model): 95 | env = gym.make('CartPole-v1') 96 | score = 0.0 97 | print_interval = 20 98 | 99 | for n_epi in range(max_test_ep): 100 | done = False 101 | s = env.reset() 102 | while not done: 103 | prob = global_model.pi(torch.from_numpy(s).float()) 104 | a = Categorical(prob).sample().item() 105 | s_prime, r, done, info = env.step(a) 106 | s = s_prime 107 | score += r 108 | 109 | if n_epi % print_interval == 0 and n_epi != 0: 110 | print("# of episode :{}, avg score : {:.1f}".format( 111 | n_epi, score / print_interval)) 112 | score = 0.0 113 | time.sleep(1) 114 | env.close() 115 | 116 | 117 | if __name__ == '__main__': 118 | global_model = ActorCritic() 119 | global_model.share_memory() 120 | 121 | processes = [] 122 | for rank in range(n_train_processes + 1): # + 1 for test process 123 | if rank == 0: 124 | p = mp.Process(target=test, args=(global_model,)) 125 | else: 126 | p = mp.Process(target=train, args=(global_model, rank,)) 127 | p.start() 128 | processes.append(p) 129 | for p in processes: 130 | p.join() 131 | -------------------------------------------------------------------------------- /Actor_Critic/Actor_Critic.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.distributions import Categorical 8 | 9 | # Hyperparameters 10 | 
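# learning_rate: Adam step size for the shared actor-critic network;
# gamma: discount factor in the TD target r + gamma * V(s');
# n_rollout: transitions collected per call to train_net().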
learning_rate = 0.0002 11 | gamma = 0.98 12 | n_rollout = 10 13 | MAX_EPISODE = 10000 14 | RENDER = True 15 | 16 | env = gym.make('CartPole-v1') 17 | env = env.unwrapped 18 | env.seed(1) 19 | torch.manual_seed(1) 20 | 21 | print("env.action_space :", env.action_space) 22 | print("env.observation_space :", env.observation_space) 23 | 24 | n_features = env.observation_space.shape[0] 25 | n_actions = env.action_space.n 26 | 27 | 28 | class ActorCritic(nn.Module): 29 | def __init__(self): 30 | super(ActorCritic, self).__init__() 31 | self.data = [] 32 | 33 | hidden_dims = 256 34 | self.feature_layer = nn.Sequential(nn.Linear(n_features, hidden_dims), 35 | nn.ReLU()) 36 | 37 | self.fc_pi = nn.Linear(hidden_dims, n_actions) 38 | self.fc_v = nn.Linear(hidden_dims, 1) 39 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 40 | 41 | def pi(self, x): 42 | x = self.feature_layer(x) 43 | x = self.fc_pi(x) 44 | prob = F.softmax(x, dim=-1) 45 | return prob 46 | 47 | def v(self, x): 48 | x = self.feature_layer(x) 49 | v = self.fc_v(x) 50 | return v 51 | 52 | def put_data(self, transition): 53 | self.data.append(transition) 54 | 55 | def make_batch(self): 56 | s_lst, a_lst, r_lst, s_next_lst, done_lst = [], [], [], [], [] 57 | for transition in self.data: 58 | s, a, r, s_, done = transition 59 | s_lst.append(s) 60 | a_lst.append([a]) 61 | r_lst.append([r / 100.0]) 62 | s_next_lst.append(s_) 63 | done_mask = 0.0 if done else 1.0 64 | done_lst.append([done_mask]) 65 | 66 | s_batch, a_batch, r_batch, s_next_batch, done_batch = torch.tensor(numpy.array(s_lst), 67 | dtype=torch.float), torch.tensor( 68 | a_lst), torch.tensor(numpy.array(r_lst), dtype=torch.float), torch.tensor( 69 | numpy.array(s_next_lst), dtype=torch.float), torch.tensor( 70 | numpy.array(done_lst), dtype=torch.float) 71 | self.data = [] 72 | return s_batch, a_batch, r_batch, s_next_batch, done_batch 73 | 74 | def train_net(self): 75 | s, a, r, s_, done = self.make_batch() 76 | td_target = r + gamma * self.v(s_) * done 77 | delta = td_target - self.v(s) 78 | 79 | pi = self.pi(s) 80 | pi_a = pi.gather(1, a) 81 | loss = -torch.log(pi_a) * delta.detach() + F.smooth_l1_loss(self.v(s), td_target.detach()) 82 | 83 | self.optimizer.zero_grad() 84 | loss.mean().backward() 85 | self.optimizer.step() 86 | 87 | 88 | def main(): 89 | model = ActorCritic() 90 | print_interval = 20 91 | score = 0.0 92 | 93 | for n_epi in range(MAX_EPISODE): 94 | done = False 95 | s = env.reset() 96 | while not done: 97 | for t in range(n_rollout): 98 | if RENDER: 99 | env.render() 100 | prob = model.pi(torch.from_numpy(s).float()) 101 | m = Categorical(prob) 102 | a = m.sample().item() 103 | s_next, r, done, info = env.step(a) 104 | model.put_data((s, a, r, s_next, done)) 105 | 106 | s = s_next 107 | score += r 108 | 109 | if done: 110 | break 111 | 112 | model.train_net() 113 | 114 | if n_epi % print_interval == 0 and n_epi != 0: 115 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score / print_interval)) 116 | score = 0.0 117 | env.close() 118 | 119 | 120 | if __name__ == '__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /D3QN/D3QN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | import random 4 | 5 | import numpy 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | # Hyperparameters 12 | learning_rate = 0.0005 13 | gamma = 0.98 14 | 
buffer_limit = 50000 15 | batch_size = 32 16 | MAX_EPISODE = 10000 17 | RENDER = True 18 | 19 | env = gym.make('CartPole-v1') 20 | env = env.unwrapped 21 | env.seed(1) 22 | torch.manual_seed(1) 23 | 24 | print("env.action_space :", env.action_space) 25 | print("env.observation_space :", env.observation_space) 26 | 27 | n_features = env.observation_space.shape[0] 28 | n_actions = env.action_space.n 29 | 30 | 31 | class ReplayBuffer(): 32 | def __init__(self): 33 | self.buffer = collections.deque(maxlen=buffer_limit) 34 | 35 | def put(self, transition): 36 | self.buffer.append(transition) 37 | 38 | def sample(self, n): 39 | mini_batch = random.sample(self.buffer, n) 40 | s_lst, a_lst, r_lst, s_next_lst, done_mask_lst = [], [], [], [], [] 41 | 42 | for transition in mini_batch: 43 | s, a, r, s_, done_mask = transition 44 | s_lst.append(s) 45 | a_lst.append([a]) 46 | r_lst.append([r]) 47 | s_next_lst.append(s_) 48 | done_mask_lst.append([done_mask]) 49 | 50 | return torch.tensor(numpy.array(s_lst), dtype=torch.float), torch.tensor(numpy.array(a_lst)), \ 51 | torch.tensor(numpy.array(r_lst)), torch.tensor(numpy.array(s_next_lst), dtype=torch.float), \ 52 | torch.tensor(numpy.array(done_mask_lst)) 53 | 54 | def size(self): 55 | return len(self.buffer) 56 | 57 | 58 | class DQNDuelingNet(nn.Module): 59 | def __init__(self): 60 | super(DQNDuelingNet, self).__init__() 61 | hidden_dims = 128 62 | self.feature_layer = nn.Sequential(nn.Linear(n_features, hidden_dims), 63 | nn.ReLU()) 64 | self.value_layer = nn.Linear(hidden_dims, 1) 65 | self.advantage_layer = nn.Linear(hidden_dims, n_actions) 66 | 67 | def forward(self, x): 68 | feature = self.feature_layer(x) 69 | value = self.value_layer(feature) 70 | advantage = self.advantage_layer(feature) 71 | 72 | avg_advantage = torch.mean(input=advantage, dim=-1, keepdim=True) 73 | q_values = value + (advantage - avg_advantage) 74 | return q_values 75 | 76 | 77 | # Epsilon_Greedy_Exploration 78 | # MAX_Greedy_Update 79 | class Dueling_DQN: 80 | def __init__(self): 81 | # [target_net, evaluate_net] 82 | self.evaluate_net = DQNDuelingNet() 83 | self.target_net = type(self.evaluate_net)() 84 | self.target_net.load_state_dict(self.evaluate_net.state_dict()) # copy weights and stuff 85 | 86 | self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), 87 | learning_rate) 88 | self.memory = ReplayBuffer() 89 | 90 | def train(self): 91 | s, a, r, s_, done_mask = self.memory.sample(batch_size) 92 | 93 | q_out = self.evaluate_net(s) 94 | q_a = q_out.gather(1, a) 95 | 96 | # 与Dueling DQN的不同之处 97 | # max_q_prime = torch.max(self.target_net(s_), dim=1, keepdim=True).values 98 | # target = r + gamma * max_q_prime * done_mask 99 | q_target_next = self.target_net(s_).detach() 100 | q_eval_next = self.evaluate_net(s_).detach() 101 | q_next = q_target_next.gather(1, q_eval_next.argmax(axis=1).reshape(-1, 1)) 102 | target = r + gamma * q_next * done_mask 103 | 104 | loss = F.smooth_l1_loss(q_a, target) 105 | 106 | self.optimizer.zero_grad() 107 | loss.backward() 108 | self.optimizer.step() 109 | 110 | def sample_action(self, obs, epsilon): 111 | coin = random.random() 112 | if coin < epsilon: 113 | return random.randint(0, 1) 114 | else: 115 | out = self.evaluate_net(obs) 116 | return out.argmax().item() 117 | 118 | 119 | def main(): 120 | trainer = Dueling_DQN() 121 | 122 | print_interval = 20 123 | score = 0.0 124 | 125 | for n_epi in range(MAX_EPISODE): 126 | epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200)) # Linear annealing from 8% to 1% 127 | s = env.reset() 128 | 
done = False 129 | 130 | while not done: 131 | if RENDER: 132 | env.render() 133 | a = trainer.sample_action(torch.from_numpy(s).float(), epsilon) 134 | s_, r, done, info = env.step(a) 135 | done_mask = 0.0 if done else 1.0 136 | trainer.memory.put((s, a, r / 100.0, s_, done_mask)) 137 | s = s_ 138 | 139 | score += r 140 | if done: 141 | break 142 | 143 | if trainer.memory.size() > 2000: 144 | trainer.train() 145 | 146 | if n_epi % print_interval == 0 and n_epi != 0: 147 | trainer.target_net.load_state_dict(trainer.evaluate_net.state_dict()) 148 | print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format( 149 | n_epi, score / print_interval, trainer.memory.size(), epsilon * 100)) 150 | score = 0.0 151 | env.close() 152 | 153 | 154 | if __name__ == '__main__': 155 | main() 156 | -------------------------------------------------------------------------------- /DDPG/DDPG.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import random 3 | import collections 4 | 5 | import numpy 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import torch.optim as optim 11 | 12 | # Hyperparameters 13 | lr_mu = 0.0005 14 | lr_q = 0.001 15 | gamma = 0.99 16 | batch_size = 32 17 | buffer_limit = 50000 18 | tau = 0.005 # for target network soft update 19 | 20 | MAX_EPISODE = 10000 21 | RENDER = True 22 | 23 | env = gym.make('Pendulum-v1') 24 | # env = env.unwrapped 25 | env.seed(1) 26 | torch.manual_seed(1) 27 | 28 | print("env.action_space :", env.action_space) 29 | print("env.observation_space :", env.observation_space) 30 | 31 | n_features = env.observation_space.shape[0] 32 | n_actions = env.action_space.shape[0] 33 | 34 | 35 | # class NormalizedActions(gym.ActionWrapper): 36 | # def action(self, action): 37 | # low_bound = self.action_space.low 38 | # upper_bound = self.action_space.high 39 | # 40 | # action = low_bound + (action + 1.0) * 0.5 * (upper_bound - low_bound) 41 | # # 将经过tanh输出的值重新映射回环境的真实值内 42 | # action = np.clip(action, low_bound, upper_bound) 43 | # 44 | # return action 45 | # 46 | # def reverse_action(self, action): 47 | # low_bound = self.action_space.low 48 | # upper_bound = self.action_space.high 49 | # 50 | # # 因为激活函数使用的是tanh,这里将环境输出的动作正则化到(-1,1) 51 | # 52 | # action = 2 * (action - low_bound) / (upper_bound - low_bound) - 1 53 | # action = np.clip(action, low_bound, upper_bound) 54 | # 55 | # return action 56 | 57 | 58 | class ReplayBuffer(): 59 | def __init__(self): 60 | self.buffer = collections.deque(maxlen=buffer_limit) # 初始化buffer容量 61 | 62 | def put(self, transition): 63 | self.buffer.append(transition) # 存入一个transition 64 | 65 | def sample(self, n): # 取样 66 | mini_batch = random.sample(self.buffer, n) 67 | s_lst, a_lst, r_lst, s_next_lst, done_mask_lst = [], [], [], [], [] 68 | 69 | for transition in mini_batch: 70 | s, a, r, s_, done_mask = transition 71 | s_lst.append(s) 72 | a_lst.append([a]) 73 | r_lst.append([r]) 74 | s_next_lst.append(s_) 75 | done_mask_lst.append([done_mask]) 76 | 77 | return torch.tensor(numpy.array(s_lst), dtype=torch.float), torch.tensor(numpy.array(a_lst), dtype=torch.float), \ 78 | torch.tensor(numpy.array(r_lst)), torch.tensor(numpy.array(s_next_lst), dtype=torch.float), \ 79 | torch.tensor(numpy.array(done_mask_lst)) 80 | 81 | def size(self): 82 | return len(self.buffer) 83 | 84 | 85 | class MuNet(nn.Module): 86 | def __init__(self): 87 | super(MuNet, self).__init__() 88 | 89 | hidden_dims = 128 90 | hidden_dims_2 = 64 91 | 
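        # Deterministic actor mu(s): two ReLU hidden layers and a tanh output,
        # scaled in forward() to the Pendulum-v1 action range [-2, 2].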
self.fc1 = nn.Linear(n_features, hidden_dims) 92 | self.fc2 = nn.Linear(hidden_dims, hidden_dims_2) 93 | self.fc_mu = nn.Linear(hidden_dims_2, 1) 94 | 95 | def forward(self, x): 96 | x = F.relu(self.fc1(x)) 97 | x = F.relu(self.fc2(x)) 98 | mu = torch.tanh(self.fc_mu(x)) * 2 # Multipled by 2 because the action space of the Pendulum-v0 is [-2,2] 99 | return mu 100 | 101 | 102 | class QNet(nn.Module): 103 | def __init__(self): 104 | super(QNet, self).__init__() 105 | hidden_dims = 64 106 | hidden_dims_2 = 32 107 | self.fc_s = nn.Linear(n_features, hidden_dims) 108 | self.fc_a = nn.Linear(1, hidden_dims) 109 | self.fc_q = nn.Linear(hidden_dims * 2, hidden_dims_2) 110 | self.fc_out = nn.Linear(hidden_dims_2, n_actions) 111 | 112 | def forward(self, x, a): 113 | h1 = F.relu(self.fc_s(x)) 114 | h2 = F.relu(self.fc_a(a)) 115 | cat = torch.cat([h1, h2], dim=1) 116 | q = F.relu(self.fc_q(cat)) 117 | q = self.fc_out(q) 118 | return q 119 | 120 | 121 | class OrnsteinUhlenbeckNoise: 122 | def __init__(self, mu): 123 | self.theta, self.dt, self.sigma = 0.1, 0.01, 0.1 124 | self.mu = mu 125 | self.x_prev = np.zeros_like(self.mu) 126 | 127 | def __call__(self): 128 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt( 129 | self.dt) * np.random.normal(size=self.mu.shape) 130 | self.x_prev = x 131 | return x 132 | 133 | 134 | def train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer): 135 | s, a, r, s_next, done_mask = memory.sample(batch_size) 136 | 137 | target = r + gamma * q_target(s_next, mu_target(s_next)) * done_mask 138 | q_loss = F.smooth_l1_loss(q(s, a), target.detach()) 139 | q_optimizer.zero_grad() 140 | q_loss.backward() 141 | q_optimizer.step() 142 | 143 | mu_loss = -q(s, mu(s)).mean() # That's all for the policy loss. 
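    # Deterministic policy gradient: the actor is improved by ascending
    # Q(s, mu(s)), implemented here as minimizing -q(s, mu(s)).mean();
    # gradients flow through the critic into mu's parameters. After each round
    # of updates, soft_update() blends the target networks toward the online
    # networks with rate tau (Polyak averaging).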
144 | mu_optimizer.zero_grad() 145 | mu_loss.backward() 146 | mu_optimizer.step() 147 | 148 | 149 | def soft_update(net, net_target): 150 | for param_target, param in zip(net_target.parameters(), net.parameters()): 151 | param_target.data.copy_(param.data * tau + (1.0 - tau) * param_target.data) 152 | 153 | 154 | def main(): 155 | memory = ReplayBuffer() 156 | 157 | q, q_target = QNet(), QNet() 158 | q_target.load_state_dict(q.state_dict()) 159 | mu, mu_target = MuNet(), MuNet() 160 | mu_target.load_state_dict(mu.state_dict()) 161 | 162 | score = 0.0 163 | print_interval = 20 164 | 165 | mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu) 166 | q_optimizer = optim.Adam(q.parameters(), lr=lr_q) 167 | ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(1)) 168 | 169 | for n_epi in range(MAX_EPISODE): 170 | s = env.reset() 171 | done = False 172 | 173 | while not done: 174 | if RENDER: 175 | env.render() 176 | a = mu(torch.from_numpy(s).float()) 177 | a = a.item() + ou_noise()[0] 178 | s_next, r, done, info = env.step([a]) 179 | memory.put((s, a, r / 100.0, s_next, done)) 180 | score += r 181 | s = s_next 182 | 183 | if memory.size() > 2000: 184 | for i in range(10): 185 | train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer) 186 | soft_update(mu, mu_target) 187 | soft_update(q, q_target) 188 | 189 | if n_epi % print_interval == 0 and n_epi != 0: 190 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score / print_interval)) 191 | score = 0.0 192 | 193 | env.close() 194 | 195 | 196 | if __name__ == '__main__': 197 | main() 198 | -------------------------------------------------------------------------------- /DQN/DQN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | import random 4 | 5 | import numpy 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | # Hyperparameters 12 | learning_rate = 0.0005 13 | gamma = 0.98 14 | buffer_limit = 50000 15 | batch_size = 32 16 | MAX_EPISODE = 10000 17 | RENDER = True 18 | 19 | env = gym.make('CartPole-v1') 20 | env = env.unwrapped 21 | env.seed(1) 22 | torch.manual_seed(1) 23 | 24 | print("env.action_space :", env.action_space) 25 | print("env.observation_space :", env.observation_space) 26 | 27 | n_features = env.observation_space.shape[0] 28 | n_actions = env.action_space.n 29 | 30 | 31 | class ReplayBuffer(): 32 | def __init__(self): 33 | self.buffer = collections.deque(maxlen=buffer_limit) # 初始化buffer容量 34 | 35 | def put(self, transition): 36 | self.buffer.append(transition) # 存入一个transition 37 | 38 | def sample(self, n): # 取样 39 | mini_batch = random.sample(self.buffer, n) 40 | s_lst, a_lst, r_lst, s_next_lst, done_mask_lst = [], [], [], [], [] 41 | 42 | for transition in mini_batch: 43 | s, a, r, s_, done_mask = transition 44 | s_lst.append(s) 45 | a_lst.append([a]) 46 | r_lst.append([r]) 47 | s_next_lst.append(s_) 48 | done_mask_lst.append([done_mask]) 49 | 50 | return torch.tensor(numpy.array(s_lst), dtype=torch.float), torch.tensor(numpy.array(a_lst)), \ 51 | torch.tensor(numpy.array(r_lst)), torch.tensor(numpy.array(s_next_lst), dtype=torch.float), \ 52 | torch.tensor(numpy.array(done_mask_lst)) 53 | 54 | def size(self): 55 | return len(self.buffer) 56 | 57 | 58 | class QNetwork(nn.Module): 59 | def __init__(self): 60 | super(QNetwork, self).__init__() 61 | hidden_dims = 128 62 | self.out_layer = torch.nn.Sequential(nn.Linear(n_features, hidden_dims), 63 | nn.ReLU(), 64 | 
nn.Linear(hidden_dims, n_actions)) 65 | 66 | def forward(self, x): 67 | return self.out_layer(x) 68 | 69 | 70 | # Deep Q Network off-policy 71 | # Epsilon_Greedy_Exploration 72 | # MAX_Greedy_Update 73 | class DeepQNetwork: 74 | def __init__(self): 75 | # [target_net, evaluate_net] 76 | self.evaluate_net = QNetwork() 77 | self.target_net = type(self.evaluate_net)() # target network与evaluate_net结构相同 78 | self.target_net.load_state_dict(self.evaluate_net.state_dict()) # copy weights and stuff 79 | 80 | self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), 81 | learning_rate) 82 | self.memory = ReplayBuffer() 83 | 84 | def train(self): 85 | # sample a batch from the replay buffer 86 | s, a, r, s_, done_mask = self.memory.sample(batch_size) 87 | 88 | q_out = self.evaluate_net(s) 89 | q_a = q_out.gather(1, a) 90 | max_q_prime = torch.max(self.target_net(s_), dim=1, keepdim=True).values 91 | target = r + gamma * max_q_prime * done_mask 92 | loss = F.smooth_l1_loss(q_a, target) 93 | 94 | self.optimizer.zero_grad() 95 | loss.backward() 96 | self.optimizer.step() 97 | 98 | def sample_action(self, obs, epsilon): 99 | coin = random.random() 100 | if coin < epsilon: 101 | return env.action_space.sample() 102 | else: 103 | out = self.evaluate_net(obs) 104 | return out.argmax().item() 105 | 106 | 107 | def main(): 108 | trainer = DeepQNetwork() 109 | 110 | print_interval = 20 111 | score = 0.0 112 | 113 | for n_epi in range(MAX_EPISODE): 114 | epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200)) # Linear annealing from 8% to 1% 115 | s = env.reset() 116 | done = False 117 | 118 | while not done: 119 | if RENDER: 120 | env.render() 121 | a = trainer.sample_action(torch.from_numpy(s).float(), epsilon) 122 | s_, r, done, info = env.step(a) 123 | done_mask = 0.0 if done else 1.0 124 | trainer.memory.put((s, a, r / 100.0, s_, done_mask)) 125 | s = s_ 126 | 127 | score += r 128 | if done: 129 | break 130 | 131 | if trainer.memory.size() > 2000: 132 | trainer.train() # 训练数据存储到一定量后开始训练网络 133 | 134 | if n_epi % print_interval == 0 and n_epi != 0: 135 | # 更新target network 136 | trainer.target_net.load_state_dict(trainer.evaluate_net.state_dict()) 137 | print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format( 138 | n_epi, score / print_interval, trainer.memory.size(), epsilon * 100)) 139 | score = 0.0 140 | env.close() 141 | 142 | 143 | if __name__ == '__main__': 144 | main() 145 | -------------------------------------------------------------------------------- /Double_DQN/DDQN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | import random 4 | 5 | import numpy 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | # Hyperparameters 12 | learning_rate = 0.0005 13 | gamma = 0.98 14 | buffer_limit = 50000 15 | batch_size = 32 16 | MAX_EPISODE = 10000 17 | RENDER = True 18 | 19 | env = gym.make('CartPole-v1') 20 | env = env.unwrapped 21 | env.seed(1) 22 | torch.manual_seed(1) 23 | 24 | print("env.action_space :", env.action_space) 25 | print("env.observation_space :", env.observation_space) 26 | 27 | n_features = env.observation_space.shape[0] 28 | n_actions = env.action_space.n 29 | 30 | 31 | class ReplayBuffer(): 32 | def __init__(self): 33 | self.buffer = collections.deque(maxlen=buffer_limit) 34 | 35 | def put(self, transition): 36 | self.buffer.append(transition) 37 | 38 | def sample(self, n): 39 | mini_batch = random.sample(self.buffer, n) 40 | 
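        # Unpack the sampled transitions field-by-field and return them as
        # torch tensors of shape (batch, 1) (states: (batch, n_features));
        # done_mask is 0.0 for terminal transitions, so the bootstrap term
        # gamma * Q(s', a') is zeroed in the TD target built in train().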
s_lst, a_lst, r_lst, s_next_lst, done_mask_lst = [], [], [], [], [] 41 | 42 | for transition in mini_batch: 43 | s, a, r, s_, done_mask = transition 44 | s_lst.append(s) 45 | a_lst.append([a]) 46 | r_lst.append([r]) 47 | s_next_lst.append(s_) 48 | done_mask_lst.append([done_mask]) 49 | 50 | return torch.tensor(numpy.array(s_lst), dtype=torch.float), torch.tensor(numpy.array(a_lst)), \ 51 | torch.tensor(numpy.array(r_lst)), torch.tensor(numpy.array(s_next_lst), dtype=torch.float), \ 52 | torch.tensor(numpy.array(done_mask_lst)) 53 | 54 | def size(self): 55 | return len(self.buffer) 56 | 57 | 58 | class QNetwork(nn.Module): 59 | def __init__(self): 60 | super(QNetwork, self).__init__() 61 | hidden_dims = 128 62 | self.out_layer = torch.nn.Sequential(nn.Linear(n_features, hidden_dims), 63 | nn.ReLU(), 64 | nn.Linear(hidden_dims, n_actions)) 65 | 66 | def forward(self, x): 67 | return self.out_layer(x) 68 | 69 | 70 | # Deep Q Network off-policy 71 | # Epsilon_Greedy_Exploration 72 | # MAX_Greedy_Update 73 | class DeepQNetwork: 74 | def __init__(self): 75 | # [target_net, evaluate_net] 76 | self.evaluate_net = QNetwork() 77 | self.target_net = type(self.evaluate_net)() 78 | self.target_net.load_state_dict(self.evaluate_net.state_dict()) # copy weights and stuff 79 | 80 | self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), 81 | learning_rate) 82 | self.memory = ReplayBuffer() 83 | 84 | def train(self): 85 | s, a, r, s_, done_mask = self.memory.sample(batch_size) 86 | 87 | # 重点部分 88 | q_eval = self.evaluate_net(s).gather(1, a) 89 | q_target_next = self.target_net(s_).detach() 90 | q_eval_next = self.evaluate_net(s_).detach() 91 | q_next = q_target_next.gather(1, q_eval_next.argmax(axis=1).reshape(-1, 1)) 92 | target = r + gamma * q_next * done_mask 93 | 94 | loss = F.smooth_l1_loss(q_eval, target) 95 | 96 | self.optimizer.zero_grad() 97 | loss.backward() 98 | self.optimizer.step() 99 | 100 | def sample_action(self, obs, epsilon): 101 | coin = random.random() 102 | if coin < epsilon: 103 | return env.action_space.sample() 104 | else: 105 | out = self.evaluate_net(obs) 106 | return out.argmax().item() 107 | 108 | 109 | def main(): 110 | env = gym.make('CartPole-v1') 111 | trainer = DeepQNetwork() 112 | 113 | print_interval = 20 114 | score = 0.0 115 | 116 | for n_epi in range(MAX_EPISODE): 117 | epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200)) # Linear annealing from 8% to 1% 118 | s = env.reset() 119 | done = False 120 | 121 | while not done: 122 | if RENDER: 123 | env.render() 124 | a = trainer.sample_action(torch.from_numpy(s).float(), epsilon) 125 | s_, r, done, info = env.step(a) 126 | done_mask = 0.0 if done else 1.0 127 | trainer.memory.put((s, a, r / 100.0, s_, done_mask)) 128 | s = s_ 129 | 130 | score += r 131 | if done: 132 | break 133 | 134 | if trainer.memory.size() > 2000: 135 | trainer.train() 136 | 137 | if n_epi % print_interval == 0 and n_epi != 0: 138 | trainer.target_net.load_state_dict(trainer.evaluate_net.state_dict()) 139 | print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format( 140 | n_epi, score / print_interval, trainer.memory.size(), epsilon * 100)) 141 | score = 0.0 142 | env.close() 143 | 144 | 145 | if __name__ == '__main__': 146 | main() 147 | -------------------------------------------------------------------------------- /Dueling_DQN/Dueling_DQN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import collections 3 | import random 4 | 5 | import numpy 6 | import torch 7 | import 
torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | # Hyperparameters 12 | learning_rate = 0.0005 13 | gamma = 0.98 14 | buffer_limit = 50000 15 | batch_size = 32 16 | MAX_EPISODE = 10000 17 | RENDER = True 18 | 19 | env = gym.make('CartPole-v1') 20 | env = env.unwrapped 21 | env.seed(1) 22 | torch.manual_seed(1) 23 | 24 | print("env.action_space :", env.action_space) 25 | print("env.observation_space :", env.observation_space) 26 | 27 | n_features = env.observation_space.shape[0] 28 | n_actions = env.action_space.n 29 | 30 | 31 | class ReplayBuffer(): 32 | def __init__(self): 33 | self.buffer = collections.deque(maxlen=buffer_limit) 34 | 35 | def put(self, transition): 36 | self.buffer.append(transition) 37 | 38 | def sample(self, n): 39 | mini_batch = random.sample(self.buffer, n) 40 | s_lst, a_lst, r_lst, s_next_lst, done_mask_lst = [], [], [], [], [] 41 | 42 | for transition in mini_batch: 43 | s, a, r, s_, done_mask = transition 44 | s_lst.append(s) 45 | a_lst.append([a]) 46 | r_lst.append([r]) 47 | s_next_lst.append(s_) 48 | done_mask_lst.append([done_mask]) 49 | 50 | return torch.tensor(numpy.array(s_lst), dtype=torch.float), torch.tensor(numpy.array(a_lst)), \ 51 | torch.tensor(numpy.array(r_lst)), torch.tensor(numpy.array(s_next_lst), dtype=torch.float), \ 52 | torch.tensor(numpy.array(done_mask_lst)) 53 | 54 | def size(self): 55 | return len(self.buffer) 56 | 57 | 58 | class DQNDuelingNet(nn.Module): 59 | def __init__(self): 60 | super(DQNDuelingNet, self).__init__() 61 | hidden_dims = 128 62 | self.feature_layer = nn.Sequential(nn.Linear(n_features, hidden_dims), 63 | nn.ReLU()) 64 | self.value_layer = nn.Linear(hidden_dims, 1) 65 | self.advantage_layer = nn.Linear(hidden_dims, n_actions) 66 | 67 | def forward(self, x): 68 | feature = self.feature_layer(x) 69 | value = self.value_layer(feature) 70 | advantage = self.advantage_layer(feature) 71 | 72 | avg_advantage = torch.mean(input=advantage, dim=-1, keepdim=True) 73 | q_values = value + (advantage - avg_advantage) 74 | return q_values 75 | 76 | 77 | # Epsilon_Greedy_Exploration 78 | # MAX_Greedy_Update 79 | class Dueling_DQN: 80 | def __init__(self): 81 | # [target_net, evaluate_net] 82 | self.evaluate_net = DQNDuelingNet() 83 | self.target_net = type(self.evaluate_net)() 84 | self.target_net.load_state_dict(self.evaluate_net.state_dict()) # copy weights and stuff 85 | 86 | self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), 87 | learning_rate) 88 | self.memory = ReplayBuffer() 89 | 90 | def train(self): 91 | s, a, r, s_, done_mask = self.memory.sample(batch_size) 92 | 93 | q_out = self.evaluate_net(s) 94 | q_a = q_out.gather(1, a) 95 | max_q_prime = torch.max(self.target_net(s_), dim=1, keepdim=True).values 96 | target = r + gamma * max_q_prime * done_mask 97 | loss = F.smooth_l1_loss(q_a, target) 98 | 99 | self.optimizer.zero_grad() 100 | loss.backward() 101 | self.optimizer.step() 102 | 103 | def sample_action(self, obs, epsilon): 104 | coin = random.random() 105 | if coin < epsilon: 106 | return random.randint(0, 1) 107 | else: 108 | out = self.evaluate_net(obs) 109 | return out.argmax().item() 110 | 111 | 112 | def main(): 113 | trainer = Dueling_DQN() 114 | 115 | print_interval = 20 116 | score = 0.0 117 | 118 | for n_epi in range(MAX_EPISODE): 119 | epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200)) # Linear annealing from 8% to 1% 120 | s = env.reset() 121 | done = False 122 | 123 | while not done: 124 | if RENDER: 125 | env.render() 126 | a = 
trainer.sample_action(torch.from_numpy(s).float(), epsilon) 127 | s_, r, done, info = env.step(a) 128 | done_mask = 0.0 if done else 1.0 129 | trainer.memory.put((s, a, r / 100.0, s_, done_mask)) 130 | s = s_ 131 | 132 | score += r 133 | if done: 134 | break 135 | 136 | if trainer.memory.size() > 2000: 137 | trainer.train() 138 | 139 | if n_epi % print_interval == 0 and n_epi != 0: 140 | trainer.target_net.load_state_dict(trainer.evaluate_net.state_dict()) 141 | print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format( 142 | n_epi, score / print_interval, trainer.memory.size(), epsilon * 100)) 143 | score = 0.0 144 | env.close() 145 | 146 | 147 | if __name__ == '__main__': 148 | main() 149 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 老胡 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
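The next file, Noise_DQN/Noise_DQN.py, swaps epsilon-greedy exploration for NoisyNet layers whose weights carry learnable, factorised Gaussian noise. As a minimal, self-contained sketch of what NoisyLinear.reset_noise() constructs (the layer sizes below are illustrative, not taken from the file):

import torch

def scale_noise(size: int) -> torch.Tensor:
    # f(x) = sign(x) * sqrt(|x|): the scaling applied to factorised Gaussian noise
    x = torch.randn(size)
    return x.sign().mul(x.abs().sqrt())

in_features, out_features = 4, 128          # illustrative sizes
eps_in = scale_noise(in_features)           # one noise vector over inputs
eps_out = scale_noise(out_features)         # one noise vector over outputs
weight_epsilon = eps_out.ger(eps_in)        # outer product -> (out, in) noise matrix
bias_epsilon = scale_noise(out_features)    # independent noise for the bias
# During training the effective weight is weight_mu + weight_sigma * weight_epsilon,
# so only p + q noise samples are drawn instead of p * q independent ones.

Sampling p + q scalars and taking their outer product is what keeps the noise cheap enough to resample after every update.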
22 | -------------------------------------------------------------------------------- /Noise_DQN/Noise_DQN.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import deque 3 | import gym 4 | import matplotlib.pyplot as plt 5 | import torch 6 | import torch.autograd as autograd 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import torch.optim as optim 10 | 11 | from replay_buffer import * 12 | 13 | USE_CUDA = torch.cuda.is_available() 14 | device = torch.device("cuda" if USE_CUDA else "cpu") 15 | 16 | 17 | # 定义一个添加噪声的网络层 18 | class NoisyLinear(nn.Module): 19 | def __init__(self, in_features, out_features, std_init=0.4): 20 | super(NoisyLinear, self).__init__() 21 | 22 | self.in_features = in_features 23 | self.out_features = out_features 24 | self.std_init = std_init 25 | 26 | self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features)) 27 | self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features)) 28 | # 向模块添加持久缓冲区,这通常用于注册不应被视为模型参数的缓冲区。例如,BatchNorm的running_mean不是一个参数,而是持久状态的一部分。 29 | # 缓冲区可以使用给定的名称作为属性访问。 30 | self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features)) 31 | 32 | self.bias_mu = nn.Parameter(torch.FloatTensor(out_features)) 33 | self.bias_sigma = nn.Parameter(torch.FloatTensor(out_features)) 34 | self.register_buffer('bias_epsilon', torch.FloatTensor(out_features)) 35 | 36 | self.reset_parameters() 37 | self.reset_noise() 38 | 39 | def forward(self, x): 40 | if self.training: 41 | weight = self.weight_mu + self.weight_sigma.mul(self.weight_epsilon.to(device)) 42 | bias = self.bias_mu + self.bias_sigma.mul(self.bias_epsilon.to(device)) 43 | else: 44 | weight = self.weight_mu 45 | bias = self.bias_mu 46 | return F.linear(x, weight, bias) 47 | 48 | def reset_parameters(self): 49 | mu_range = 1 / math.sqrt(self.weight_mu.size(1)) 50 | 51 | self.weight_mu.data.uniform_(-mu_range, mu_range) 52 | self.weight_sigma.data.uniform_(self.std_init / math.sqrt(self.bias_sigma.size(0))) 53 | 54 | self.bias_mu.data.uniform_(-mu_range, mu_range) 55 | self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0))) 56 | 57 | def reset_noise(self): 58 | epsilon_in = self._scale_noise(self.in_features) 59 | epsilon_out = self._scale_noise(self.out_features) 60 | 61 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 62 | self.bias_epsilon.copy_(self._scale_noise(self.out_features)) 63 | 64 | def _scale_noise(self, size): 65 | x = torch.randn(size) 66 | x = x.sign().mul(x.abs().sqrt()) 67 | return x 68 | 69 | 70 | class NoisyDQN(nn.Module): 71 | def __init__(self, observation_space, action_sapce): 72 | super(NoisyDQN, self).__init__() 73 | 74 | self.linear = nn.Linear(observation_space, 128) 75 | self.noisy1 = NoisyLinear(128, 128) 76 | self.noisy2 = NoisyLinear(128, action_sapce) 77 | 78 | def forward(self, x): 79 | x = F.relu(self.linear(x)) 80 | x = F.relu(self.noisy1(x)) 81 | x = self.noisy2(x) 82 | return x 83 | 84 | def act(self, state): 85 | state = torch.FloatTensor(state).unsqueeze(0) 86 | q_value = self.forward(state) 87 | action = q_value.max(1)[1].data[0] 88 | action = action.cpu().numpy() # 从网络中得到的tensor形式,因为之后要输入给gym环境中,这里把它放回cpu,转为数组形式 89 | action = int(action) 90 | return action 91 | 92 | def reset_noise(self): 93 | self.noisy1.reset_noise() 94 | self.noisy2.reset_noise() 95 | 96 | 97 | class ReplayBuffer(object): 98 | def __init__(self, capacity): 99 | # deque模块是python标准库collections中的一项,它提供了两端都可以操作的序列,其实就是双向队列, 
100 | # 可以从左右两端增加元素,或者是删除元素。如果设置了最大长度,非输入端的数据会逐步移出窗口。 101 | self.buffer = deque(maxlen=capacity) 102 | 103 | def push(self, state, aciton, reward, next_state, done): 104 | state = np.expand_dims(state, 0) 105 | # 这里增加维度的操作是为了便于之后使用concatenate进行拼接 106 | next_state = np.expand_dims(next_state, 0) 107 | self.buffer.append((state, aciton, reward, next_state, done)) 108 | 109 | def sample(self, batch_size): 110 | # 将可迭代的对象作为参数,将对象中对应的元素打包成一个个元组,然后返回由这些元组组成的列表 111 | state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size)) 112 | # 最后使用concatenate对数组进行拼接,相当于少了一个维度 113 | return np.concatenate(state), action, reward, np.concatenate(next_state), done 114 | 115 | 116 | def compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size, beta): 117 | state, action, reward, next_state, done, weights, indices = replay_buffer.sample(batch_size, beta) 118 | 119 | state = torch.FloatTensor(np.float32(state)).to(device) 120 | next_state = torch.FloatTensor(np.float32(next_state)).to(device) 121 | action = torch.LongTensor(action).to(device) 122 | reward = torch.FloatTensor(reward).to(device) 123 | done = torch.FloatTensor(np.float32(done)).to(device) 124 | weights = torch.FloatTensor(weights).to(device) 125 | 126 | q_values = current_model(state) 127 | next_q_values = target_model(next_state) 128 | 129 | q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1) 130 | # gather可以看作是对q_values的查询,即元素都是q_values中的元素,查询索引都存在action中。输出大小与action.unsqueeze(1)一致。 131 | # dim=1,它存放的都是第1维度的索引;dim=0,它存放的都是第0维度的索引; 132 | # 这里增加维度主要是为了方便gather操作,之后再删除该维度 133 | next_q_value = next_q_values.max(1)[0] 134 | 135 | expected_q_value = reward + gamma * next_q_value * (1 - done) 136 | 137 | loss = (q_value - expected_q_value.detach()).pow(2) * weights 138 | prios = loss + 1e-5 139 | loss = loss.mean() 140 | 141 | optimizer.zero_grad() 142 | loss.backward() 143 | optimizer.step() 144 | 145 | replay_buffer.update_priorities(indices, prios.data.cpu().numpy()) 146 | current_model.reset_noise() 147 | target_model.reset_noise() 148 | 149 | return loss 150 | 151 | 152 | def update_target(current_model, target_model): 153 | target_model.load_state_dict(current_model.state_dict()) # 加载模型 154 | 155 | 156 | def plot(frame_idx, rewards, losses): 157 | plt.figure(figsize=(20, 5)) 158 | plt.subplot(131) 159 | plt.title('frame %s. 
reward: %s' % (frame_idx, np.mean(rewards[-10:]))) 160 | plt.plot(rewards) 161 | plt.subplot(132) 162 | plt.title('loss') 163 | plt.plot(losses) 164 | plt.show() 165 | 166 | 167 | def main(): 168 | env_id = "CartPole-v1" 169 | env = gym.make(env_id) 170 | 171 | observation_space = env.observation_space.shape[0] 172 | action_sapce = env.action_space.n 173 | 174 | current_model = NoisyDQN(observation_space, action_sapce) 175 | target_model = NoisyDQN(observation_space, action_sapce) 176 | 177 | if USE_CUDA: 178 | current_model = current_model.cuda() 179 | target_model = target_model.cuda() 180 | 181 | optimizer = optim.Adam(current_model.parameters()) 182 | 183 | beta_start = 0.4 184 | beta_frames = 1000 185 | beta_by_frame = lambda frame_idx: min(1.0, beta_start + frame_idx * (1.0 - beta_start) / beta_frames) 186 | 187 | replay_buffer = PrioritizedReplayBuffer(10000, alpha=0.6) 188 | 189 | update_target(current_model, target_model) 190 | 191 | num_frames = 10000 192 | batch_size = 32 193 | gamma = 0.99 194 | 195 | losses = [] 196 | all_rewards = [] 197 | episode_reward = 0 198 | 199 | state = env.reset() 200 | for frame_idx in range(1, num_frames + 1): 201 | # 显示动画 202 | # env.render() 203 | action = current_model.act(state) 204 | 205 | next_state, reward, done, _ = env.step(action) 206 | replay_buffer.push(state, action, reward, next_state, done) 207 | 208 | state = next_state 209 | episode_reward += reward 210 | 211 | if done: 212 | state = env.reset() 213 | all_rewards.append(episode_reward) 214 | episode_reward = 0 215 | 216 | if len(replay_buffer) > batch_size: 217 | beta = beta_by_frame(frame_idx) 218 | loss = compute_td_loss(current_model, target_model, optimizer, replay_buffer, gamma, batch_size, beta) 219 | losses.append(np.array(loss.data.cpu())) 220 | 221 | if frame_idx % 200 == 0: 222 | plot(frame_idx, all_rewards, losses) 223 | 224 | if frame_idx % 1000 == 0: 225 | update_target(current_model, target_model) 226 | 227 | 228 | if __name__ == '__main__': 229 | main() 230 | -------------------------------------------------------------------------------- /Noise_DQN/replay_buffer.py: -------------------------------------------------------------------------------- 1 | #code from openai 2 | #https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 3 | 4 | import numpy as np 5 | import random 6 | 7 | import operator 8 | 9 | 10 | class SegmentTree(object): 11 | def __init__(self, capacity, operation, neutral_element): 12 | """Build a Segment Tree data structure. 13 | https://en.wikipedia.org/wiki/Segment_tree 14 | Can be used as regular array, but with two 15 | important differences: 16 | a) setting item's value is slightly slower. 17 | It is O(lg capacity) instead of O(1). 18 | b) user has access to an efficient `reduce` 19 | operation which reduces `operation` over 20 | a contiguous subsequence of items in the 21 | array. 22 | Paramters 23 | --------- 24 | capacity: int 25 | Total size of the array - must be a power of two. 26 | operation: lambda obj, obj -> obj 27 | and operation for combining elements (eg. sum, max) 28 | must for a mathematical group together with the set of 29 | possible values for array elements. 30 | neutral_element: obj 31 | neutral element for the operation above. eg. float('-inf') 32 | for max and 0 for sum. 33 | """ 34 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 
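        # Layout: the tree is a flat array of length 2 * capacity. Index 1 is the
        # root, node i has children 2*i and 2*i + 1, and the leaves at indices
        # [capacity, 2 * capacity) hold the item values themselves, so reduce()
        # answers range sum/min queries in O(log capacity).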
35 | self._capacity = capacity 36 | self._value = [neutral_element for _ in range(2 * capacity)] 37 | self._operation = operation 38 | 39 | def _reduce_helper(self, start, end, node, node_start, node_end): 40 | if start == node_start and end == node_end: 41 | return self._value[node] 42 | mid = (node_start + node_end) // 2 43 | if end <= mid: 44 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 45 | else: 46 | if mid + 1 <= start: 47 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 48 | else: 49 | return self._operation( 50 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 51 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 52 | ) 53 | 54 | def reduce(self, start=0, end=None): 55 | """Returns result of applying `self.operation` 56 | to a contiguous subsequence of the array. 57 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 58 | Parameters 59 | ---------- 60 | start: int 61 | beginning of the subsequence 62 | end: int 63 | end of the subsequences 64 | Returns 65 | ------- 66 | reduced: obj 67 | result of reducing self.operation over the specified range of array elements. 68 | """ 69 | if end is None: 70 | end = self._capacity 71 | if end < 0: 72 | end += self._capacity 73 | end -= 1 74 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 75 | 76 | def __setitem__(self, idx, val): 77 | # index of the leaf 78 | idx += self._capacity 79 | self._value[idx] = val 80 | idx //= 2 81 | while idx >= 1: 82 | self._value[idx] = self._operation( 83 | self._value[2 * idx], 84 | self._value[2 * idx + 1] 85 | ) 86 | idx //= 2 87 | 88 | def __getitem__(self, idx): 89 | assert 0 <= idx < self._capacity 90 | return self._value[self._capacity + idx] 91 | 92 | 93 | class SumSegmentTree(SegmentTree): 94 | def __init__(self, capacity): 95 | super(SumSegmentTree, self).__init__( 96 | capacity=capacity, 97 | operation=operator.add, 98 | neutral_element=0.0 99 | ) 100 | 101 | def sum(self, start=0, end=None): 102 | """Returns arr[start] + ... + arr[end]""" 103 | return super(SumSegmentTree, self).reduce(start, end) 104 | 105 | def find_prefixsum_idx(self, prefixsum): 106 | """Find the highest index `i` in the array such that 107 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 108 | if array values are probabilities, this function 109 | allows to sample indexes according to the discrete 110 | probability efficiently. 111 | Parameters 112 | ---------- 113 | perfixsum: float 114 | upperbound on the sum of array prefix 115 | Returns 116 | ------- 117 | idx: int 118 | highest index satisfying the prefixsum constraint 119 | """ 120 | assert 0 <= prefixsum <= self.sum() + 1e-5 121 | idx = 1 122 | while idx < self._capacity: # while non-leaf 123 | if self._value[2 * idx] > prefixsum: 124 | idx = 2 * idx 125 | else: 126 | prefixsum -= self._value[2 * idx] 127 | idx = 2 * idx + 1 128 | return idx - self._capacity 129 | 130 | 131 | class MinSegmentTree(SegmentTree): 132 | def __init__(self, capacity): 133 | super(MinSegmentTree, self).__init__( 134 | capacity=capacity, 135 | operation=min, 136 | neutral_element=float('inf') 137 | ) 138 | 139 | def min(self, start=0, end=None): 140 | """Returns min(arr[start], ..., arr[end])""" 141 | 142 | return super(MinSegmentTree, self).reduce(start, end) 143 | 144 | 145 | class ReplayBuffer(object): 146 | def __init__(self, size): 147 | """Create Replay buffer. 
148 | Parameters 149 | ---------- 150 | size: int 151 | Max number of transitions to store in the buffer. When the buffer 152 | overflows the old memories are dropped. 153 | """ 154 | self._storage = [] 155 | self._maxsize = size 156 | self._next_idx = 0 157 | 158 | def __len__(self): 159 | return len(self._storage) 160 | 161 | def push(self, state, action, reward, next_state, done): 162 | data = (state, action, reward, next_state, done) 163 | 164 | if self._next_idx >= len(self._storage): 165 | self._storage.append(data) 166 | else: 167 | self._storage[self._next_idx] = data 168 | self._next_idx = (self._next_idx + 1) % self._maxsize 169 | 170 | def _encode_sample(self, idxes): 171 | obses_t, actions, rewards, obses_tp1, dones = [], [], [], [], [] 172 | for i in idxes: 173 | data = self._storage[i] 174 | obs_t, action, reward, obs_tp1, done = data 175 | obses_t.append(np.array(obs_t, copy=False)) 176 | actions.append(np.array(action, copy=False)) 177 | rewards.append(reward) 178 | obses_tp1.append(np.array(obs_tp1, copy=False)) 179 | dones.append(done) 180 | return np.array(obses_t), np.array(actions), np.array(rewards), np.array(obses_tp1), np.array(dones) 181 | 182 | def sample(self, batch_size): 183 | """Sample a batch of experiences. 184 | Parameters 185 | ---------- 186 | batch_size: int 187 | How many transitions to sample. 188 | Returns 189 | ------- 190 | obs_batch: np.array 191 | batch of observations 192 | act_batch: np.array 193 | batch of actions executed given obs_batch 194 | rew_batch: np.array 195 | rewards received as results of executing act_batch 196 | next_obs_batch: np.array 197 | next set of observations seen after executing act_batch 198 | done_mask: np.array 199 | done_mask[i] = 1 if executing act_batch[i] resulted in 200 | the end of an episode and 0 otherwise. 201 | """ 202 | idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)] 203 | return self._encode_sample(idxes) 204 | 205 | 206 | class PrioritizedReplayBuffer(ReplayBuffer): 207 | def __init__(self, size, alpha): 208 | """Create Prioritized Replay buffer. 209 | Parameters 210 | ---------- 211 | size: int 212 | Max number of transitions to store in the buffer. When the buffer 213 | overflows the old memories are dropped. 214 | alpha: float 215 | how much prioritization is used 216 | (0 - no prioritization, 1 - full prioritization) 217 | See Also 218 | -------- 219 | ReplayBuffer.__init__ 220 | """ 221 | super(PrioritizedReplayBuffer, self).__init__(size) 222 | assert alpha > 0 223 | self._alpha = alpha 224 | 225 | it_capacity = 1 226 | while it_capacity < size: 227 | it_capacity *= 2 228 | 229 | self._it_sum = SumSegmentTree(it_capacity) 230 | self._it_min = MinSegmentTree(it_capacity) 231 | self._max_priority = 1.0 232 | 233 | def push(self, *args, **kwargs): 234 | """See ReplayBuffer.store_effect""" 235 | idx = self._next_idx 236 | super(PrioritizedReplayBuffer, self).push(*args, **kwargs) 237 | self._it_sum[idx] = self._max_priority ** self._alpha 238 | self._it_min[idx] = self._max_priority ** self._alpha 239 | 240 | def _sample_proportional(self, batch_size): 241 | res = [] 242 | for _ in range(batch_size): 243 | # TODO(szymon): should we ensure no repeats? 244 | mass = random.random() * self._it_sum.sum(0, len(self._storage) - 1) 245 | idx = self._it_sum.find_prefixsum_idx(mass) 246 | res.append(idx) 247 | return res 248 | 249 | def sample(self, batch_size, beta): 250 | """Sample a batch of experiences. 
251 | compared to ReplayBuffer.sample 252 | it also returns importance weights and idxes 253 | of sampled experiences. 254 | Parameters 255 | ---------- 256 | batch_size: int 257 | How many transitions to sample. 258 | beta: float 259 | To what degree to use importance weights 260 | (0 - no corrections, 1 - full correction) 261 | Returns 262 | ------- 263 | obs_batch: np.array 264 | batch of observations 265 | act_batch: np.array 266 | batch of actions executed given obs_batch 267 | rew_batch: np.array 268 | rewards received as results of executing act_batch 269 | next_obs_batch: np.array 270 | next set of observations seen after executing act_batch 271 | done_mask: np.array 272 | done_mask[i] = 1 if executing act_batch[i] resulted in 273 | the end of an episode and 0 otherwise. 274 | weights: np.array 275 | Array of shape (batch_size,) and dtype np.float32 276 | denoting importance weight of each sampled transition 277 | idxes: np.array 278 | Array of shape (batch_size,) and dtype np.int32 279 | idexes in buffer of sampled experiences 280 | """ 281 | assert beta > 0 282 | 283 | idxes = self._sample_proportional(batch_size) 284 | 285 | weights = [] 286 | p_min = self._it_min.min() / self._it_sum.sum() 287 | max_weight = (p_min * len(self._storage)) ** (-beta) 288 | 289 | for idx in idxes: 290 | p_sample = self._it_sum[idx] / self._it_sum.sum() 291 | weight = (p_sample * len(self._storage)) ** (-beta) 292 | weights.append(weight / max_weight) 293 | weights = np.array(weights) 294 | encoded_sample = self._encode_sample(idxes) 295 | return tuple(list(encoded_sample) + [weights, idxes]) 296 | 297 | def update_priorities(self, idxes, priorities): 298 | """Update priorities of sampled transitions. 299 | sets priority of transition at index idxes[i] in buffer 300 | to priorities[i]. 301 | Parameters 302 | ---------- 303 | idxes: [int] 304 | List of idxes of sampled transitions 305 | priorities: [float] 306 | List of updated priorities corresponding to 307 | transitions at the sampled idxes denoted by 308 | variable `idxes`. 
309 | """ 310 | assert len(idxes) == len(priorities) 311 | for idx, priority in zip(idxes, priorities): 312 | assert priority > 0 313 | assert 0 <= idx < len(self._storage) 314 | self._it_sum[idx] = priority ** self._alpha 315 | self._it_min[idx] = priority ** self._alpha 316 | 317 | self._max_priority = max(self._max_priority, priority) -------------------------------------------------------------------------------- /PPO/PPO.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Categorical 7 | 8 | # Hyperparameters 9 | learning_rate = 0.0005 10 | gamma = 0.98 11 | lmbda = 0.95 12 | eps_clip = 0.1 13 | K_epoch = 3 14 | T_horizon = 20 15 | 16 | MAX_EPISODE = 10000 17 | RENDER = False 18 | 19 | env = gym.make('CartPole-v1') 20 | env = env.unwrapped 21 | 22 | print("env.action_space :", env.action_space) 23 | print("env.observation_space :", env.observation_space) 24 | 25 | n_features = env.observation_space.shape[0] 26 | n_actions = env.action_space.n 27 | 28 | 29 | class PPO(nn.Module): 30 | def __init__(self): 31 | super(PPO, self).__init__() 32 | self.data = [] 33 | hidden_dims = 256 34 | self.fc1 = nn.Linear(n_features, hidden_dims) 35 | self.fc_pi = nn.Linear(hidden_dims, n_actions) 36 | self.fc_v = nn.Linear(hidden_dims, 1) 37 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 38 | 39 | def pi(self, x, softmax_dim=-1): 40 | x = F.relu(self.fc1(x)) 41 | x = self.fc_pi(x) 42 | prob = F.softmax(x, dim=softmax_dim) 43 | return prob 44 | 45 | def v(self, x): 46 | x = F.relu(self.fc1(x)) 47 | v = self.fc_v(x) 48 | return v 49 | 50 | def put_data(self, transition): 51 | self.data.append(transition) 52 | 53 | def make_batch(self): 54 | s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], [] 55 | for transition in self.data: 56 | s, a, r, s_prime, prob_a, done = transition 57 | 58 | s_lst.append(s) 59 | a_lst.append([a]) 60 | r_lst.append([r]) 61 | s_prime_lst.append(s_prime) 62 | prob_a_lst.append([prob_a]) 63 | done_mask = 0 if done else 1 64 | done_lst.append([done_mask]) 65 | 66 | s, a, r, s_prime, done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \ 67 | torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \ 68 | torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst) 69 | self.data = [] 70 | return s, a, r, s_prime, done_mask, prob_a 71 | 72 | def train_net(self): 73 | s, a, r, s_prime, done_mask, prob_a = self.make_batch() 74 | 75 | for i in range(K_epoch): 76 | td_target = r + gamma * self.v(s_prime) * done_mask 77 | delta = td_target - self.v(s) 78 | delta = delta.detach().numpy() 79 | 80 | advantage_lst = [] 81 | advantage = 0.0 82 | for delta_t in delta[::-1]: 83 | advantage = gamma * lmbda * advantage + delta_t[0] 84 | advantage_lst.append([advantage]) 85 | advantage_lst.reverse() 86 | advantage = torch.tensor(advantage_lst, dtype=torch.float) 87 | 88 | pi = self.pi(s, softmax_dim=1) 89 | pi_a = pi.gather(1, a) 90 | ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a)) # a/b == exp(log(a)-log(b)) 91 | 92 | surr1 = ratio * advantage 93 | surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage 94 | loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s), td_target.detach()) 95 | 96 | self.optimizer.zero_grad() 97 | loss.mean().backward() 98 | self.optimizer.step() 99 | 100 | 101 | def 
main(): 102 | model = PPO() 103 | score = 0.0 104 | print_interval = 20 105 | 106 | for n_epi in range(MAX_EPISODE): 107 | s = env.reset() 108 | done = False 109 | while not done: 110 | for t in range(T_horizon): 111 | if RENDER: 112 | env.render() 113 | prob = model.pi(torch.from_numpy(s).float()) 114 | m = Categorical(prob) 115 | a = m.sample().item() 116 | s_prime, r, done, info = env.step(a) 117 | 118 | model.put_data((s, a, r / 100.0, s_prime, prob[a].item(), done)) 119 | s = s_prime 120 | 121 | score += r 122 | if done: 123 | break 124 | 125 | model.train_net() 126 | 127 | if n_epi % print_interval == 0 and n_epi != 0: 128 | print("# of episode :{}, avg score : {:.1f}".format(n_epi, score / print_interval)) 129 | score = 0.0 130 | 131 | env.close() 132 | 133 | 134 | if __name__ == '__main__': 135 | main() 136 | -------------------------------------------------------------------------------- /Prioritized_Replay_DQN/Prioritized_Replay_DQN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | 5 | np.random.seed(1) 6 | torch.manual_seed(1) 7 | 8 | 9 | class SumTree(object): 10 | """ 11 | This SumTree code is a modified version and the original code is from: 12 | https://github.com/jaara/AI-blog/blob/master/SumTree.py 13 | Story data with its priority in the tree. 14 | """ 15 | data_pointer = 0 16 | 17 | def __init__(self, capacity): 18 | self.capacity = capacity # for all priority values 19 | self.tree = np.zeros(2 * capacity - 1) 20 | # [--------------Parent nodes-------------][-------leaves to recode priority-------] 21 | # size: capacity - 1 size: capacity 22 | self.data = np.zeros(capacity, dtype=object) # for all transitions 23 | # [--------------data frame-------------] 24 | # size: capacity 25 | 26 | def add(self, p, data): 27 | tree_idx = self.data_pointer + self.capacity - 1 28 | self.data[self.data_pointer] = data # update data_frame 29 | self.update(tree_idx, p) # update tree_frame 30 | 31 | self.data_pointer += 1 32 | if self.data_pointer >= self.capacity: # replace when exceed the capacity 33 | self.data_pointer = 0 34 | 35 | def update(self, tree_idx, p): 36 | change = p - self.tree[tree_idx] 37 | self.tree[tree_idx] = p 38 | # then propagate the change through tree 39 | while tree_idx != 0: # this method is faster than the recursive loop in the reference code 40 | tree_idx = (tree_idx - 1) // 2 41 | self.tree[tree_idx] += change 42 | 43 | def get_leaf(self, v): 44 | """ 45 | Tree structure and array storage: 46 | Tree index: 47 | 0 -> storing priority sum 48 | / \ 49 | 1 2 50 | / \ / \ 51 | 3 4 5 6 -> storing priority for transitions 52 | Array type for storing: 53 | [0,1,2,3,4,5,6] 54 | """ 55 | parent_idx = 0 56 | while True: # the while loop is faster than the method in the reference code 57 | cl_idx = 2 * parent_idx + 1 # this leaf's left and right kids 58 | cr_idx = cl_idx + 1 59 | if cl_idx >= len(self.tree): # reach bottom, end search 60 | leaf_idx = parent_idx 61 | break 62 | else: # downward search, always search for a higher priority node 63 | if v <= self.tree[cl_idx]: 64 | parent_idx = cl_idx 65 | else: 66 | v -= self.tree[cl_idx] 67 | parent_idx = cr_idx 68 | 69 | data_idx = leaf_idx - self.capacity + 1 70 | return leaf_idx, self.tree[leaf_idx], self.data[data_idx] 71 | 72 | @property 73 | def total_p(self): 74 | return self.tree[0] # the root 75 | 76 | 77 | class Memory(object): # stored as ( s, a, r, s_ ) in SumTree 78 | """ 79 | This Memory class is modified 
based on the original code from: 80 | https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py 81 | """ 82 | epsilon = 0.01 # small amount to avoid zero priority 83 | alpha = 0.6 # [0~1] convert the importance of TD error to priority 84 | beta = 0.4 # importance-sampling, from initial value increasing to 1 85 | beta_increment_per_sampling = 0.001 86 | abs_err_upper = 1. # clipped abs error 87 | 88 | def __init__(self, capacity): 89 | self.tree = SumTree(capacity) 90 | 91 | def store(self, transition): 92 | max_p = np.max(self.tree.tree[-self.tree.capacity:]) 93 | if max_p == 0: 94 | max_p = self.abs_err_upper 95 | self.tree.add(max_p, transition) # set the max p for new p 96 | 97 | def sample(self, n): 98 | b_idx, b_memory, ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, self.tree.data[0].size)), np.empty( 99 | (n, 1)) 100 | pri_seg = self.tree.total_p / n # priority segment 101 | self.beta = np.min([1., self.beta + self.beta_increment_per_sampling]) # max = 1 102 | 103 | min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_p # for later calculate ISweight 104 | for i in range(n): 105 | a, b = pri_seg * i, pri_seg * (i + 1) 106 | v = np.random.uniform(a, b) 107 | idx, p, data = self.tree.get_leaf(v) 108 | prob = p / self.tree.total_p 109 | ISWeights[i, 0] = np.power(prob / min_prob, -self.beta) 110 | b_idx[i], b_memory[i, :] = idx, data 111 | return b_idx, b_memory, ISWeights 112 | 113 | def batch_update(self, tree_idx, abs_errors): 114 | abs_errors += self.epsilon # convert to abs and avoid 0 115 | clipped_errors = np.minimum(abs_errors, self.abs_err_upper) 116 | ps = np.power(clipped_errors, self.alpha) 117 | for ti, p in zip(tree_idx, ps): 118 | self.tree.update(ti, p) 119 | 120 | 121 | class DQNNet(nn.Module): 122 | def __init__(self, n_actions, n_features): 123 | super(DQNNet, self).__init__() 124 | self.out_layer = torch.nn.Sequential(nn.Linear(n_features, 10), 125 | nn.ReLU(), 126 | nn.Linear(10, n_actions)) 127 | 128 | def forward(self, x): 129 | return self.out_layer(x) 130 | 131 | 132 | class DQNPrioritizedReplay: 133 | def __init__( 134 | self, 135 | n_actions, 136 | n_features, 137 | learning_rate=0.005, 138 | reward_decay=0.9, 139 | e_greedy=0.9, 140 | replace_target_iter=500, 141 | memory_size=10000, 142 | batch_size=32, 143 | e_greedy_increment=None, 144 | ): 145 | self.memory_counter = 0 146 | self.n_actions = n_actions 147 | self.n_features = n_features 148 | self.lr = learning_rate 149 | self.gamma = reward_decay 150 | self.epsilon_max = e_greedy 151 | self.replace_target_iter = replace_target_iter 152 | self.memory_size = memory_size 153 | self.batch_size = batch_size 154 | self.epsilon_increment = e_greedy_increment 155 | self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max 156 | # total learning step 157 | self.learn_step_counter = 0 158 | 159 | # ---------------------------重要部分--------------------------- 160 | self.memory = Memory(capacity=memory_size) 161 | # ---------------------------重要部分--------------------------- 162 | 163 | self._build_net() 164 | self.optimizer = torch.optim.Adam(self.evaluate_net.parameters(), 165 | learning_rate) 166 | self.cost_his = [] 167 | 168 | def _build_net(self): 169 | self.evaluate_net = DQNNet(self.n_actions, self.n_features) 170 | self.target_net = type(self.evaluate_net)(self.n_actions, self.n_features) 171 | self.target_net.load_state_dict(self.evaluate_net.state_dict()) # copy weights and stuff 172 | 173 | def store_transition(self, s, a, r, s_): 174 | # 
--------------------------- key part: prioritized replay ---------------------------
175 |         # prioritized replay
176 |         transition = np.hstack((s, [a, r], s_))
177 |         self.memory.store(transition)  # newly arrived transitions get the current max priority
178 |         self.memory_counter += 1
179 |         # --------------------------- key part: prioritized replay ---------------------------
180 | 
181 |     def choose_action(self, observation):
182 |         state = torch.Tensor(observation[np.newaxis, :])
183 | 
184 |         if np.random.uniform() < self.epsilon:
185 |             actions_value = self.evaluate_net(state)
186 |             action = actions_value.argmax(axis=1).numpy()[0]
187 |         else:
188 |             action = np.random.randint(0, self.n_actions)
189 |         return action
190 | 
191 |     def learn(self):
192 |         # check to replace target parameters
193 |         if self.learn_step_counter % self.replace_target_iter == 0:
194 |             self.target_net.load_state_dict(self.evaluate_net.state_dict())  # copy weights and stuff
195 | 
196 |         # --------------------------- key part: prioritized replay ---------------------------
197 |         tree_idx, batch_memory, ISWeights = self.memory.sample(
198 |             self.batch_size)
199 |         # --------------------------- key part: prioritized replay ---------------------------
200 | 
201 |         s = torch.Tensor(batch_memory[:, :self.n_features])
202 |         u = torch.LongTensor(batch_memory[:, self.n_features, np.newaxis])
203 |         r = torch.Tensor(batch_memory[:, self.n_features + 1, np.newaxis])
204 |         s_ = torch.Tensor(batch_memory[:, -self.n_features:])
205 | 
206 |         q_eval = self.evaluate_net(s).gather(1, u)
207 | 
208 |         q_target_next = self.target_net(s_).detach()
209 |         q_eval_next = self.evaluate_net(s_).detach()
210 |         q_next = q_target_next.gather(1, q_eval_next.argmax(axis=1).reshape(-1, 1))
211 |         delta = r + self.gamma * q_next - q_eval
212 |         self.optimizer.zero_grad()
213 | 
214 |         # --------------------------- key part: prioritized replay ---------------------------
215 |         abs_errors = torch.sum(
216 |             torch.abs(
217 |                 self.evaluate_net(s).detach() -
218 |                 self.target_net(s).detach()), 1)  # for updating the SumTree priorities
219 |         loss = torch.mean(torch.Tensor(ISWeights) * delta ** 2)
220 |         self.memory.batch_update(tree_idx, abs_errors)  # update priority
221 |         # --------------------------- key part: prioritized replay ---------------------------
222 | 
223 |         # train eval network
224 |         loss.backward()
225 |         self.optimizer.step()
226 |         self.cost_his.append(loss.data.numpy())
227 | 
228 |         # increasing epsilon
229 |         self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
230 |         self.learn_step_counter += 1
231 | 
232 |     def plot_cost(self):
233 |         import matplotlib.pyplot as plt
234 |         plt.plot(np.arange(len(self.cost_his)), self.cost_his)
235 |         plt.ylabel('Cost')
236 |         plt.xlabel('training steps')
237 |         plt.show()
238 | 
--------------------------------------------------------------------------------
/Prioritized_Replay_DQN/run_MountainCar.py:
--------------------------------------------------------------------------------
1 | from Prioritized_Replay_DQN import DQNPrioritizedReplay
2 | from gym import make
3 | import numpy as np
4 | 
5 | MEMORY_SIZE = 10000
6 | ACTION_SPACE = 11
7 | 
8 | env = make('MountainCar-v0')
9 | env = env.unwrapped
10 | env.seed(21)
11 | 
12 | print("env.action_space :", env.action_space)
13 | print("env.observation_space :", env.observation_space)
14 | print("env.observation_space :", env.observation_space.high)
15 | print("env.observation_space :", env.observation_space.low)
16 | 
17 | RL = DQNPrioritizedReplay(env.action_space.n, env.observation_space.shape[0],
18 |                           learning_rate=0.01,
19 |                           reward_decay=0.9,
20 |                           e_greedy=0.9,
21 |                           replace_target_iter=200,
22 |
memory_size=MEMORY_SIZE, 23 | e_greedy_increment=0.001 24 | ) 25 | 26 | total_steps = 0 27 | for i in range(500): 28 | observation = env.reset() 29 | while True: 30 | env.render() 31 | 32 | action = RL.choose_action(observation) 33 | 34 | observation_, reward, done, info = env.step(action) 35 | 36 | if done: reward = 10 37 | 38 | RL.store_transition(observation, action, reward, observation_) 39 | 40 | if RL.memory_counter > RL.memory_size: 41 | RL.learn() 42 | 43 | if done: 44 | break 45 | observation = observation_ 46 | total_steps += 1 47 | 48 | env.close() 49 | RL.plot_cost() 50 | -------------------------------------------------------------------------------- /Q_Learning_maze/RL_brain.py: -------------------------------------------------------------------------------- 1 | """ 2 | This part of code is the Q learning brain, which is a brain of the agent. 3 | All decisions are made in here. 4 | """ 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | class QLearningTable: 11 | def __init__(self, actions, learning_rate=0.01, reward_decay=0.9, e_greedy=0.9): 12 | self.actions = actions # a list 13 | self.lr = learning_rate 14 | self.gamma = reward_decay 15 | self.epsilon = e_greedy 16 | self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64) 17 | 18 | def choose_action(self, observation): 19 | self.check_state_exist(observation) 20 | # action selection 21 | if np.random.uniform() < self.epsilon: 22 | # choose best action 23 | state_action = self.q_table.loc[observation, :] 24 | # some actions may have the same value, randomly choose on in these actions 25 | action = np.random.choice(state_action[state_action == np.max(state_action)].index) 26 | else: 27 | # choose random action 28 | action = np.random.choice(self.actions) 29 | return action 30 | 31 | def learn(self, s, a, r, s_): 32 | self.check_state_exist(s_) 33 | q_predict = self.q_table.loc[s, a] 34 | if s_ != 'terminal': 35 | q_target = r + self.gamma * self.q_table.loc[s_, :].max() # next state is not terminal 36 | else: 37 | q_target = r # next state is terminal 38 | self.q_table.loc[s, a] += self.lr * (q_target - q_predict) # update 39 | 40 | def check_state_exist(self, state): 41 | if state not in self.q_table.index: 42 | # append new state to q table 43 | self.q_table = self.q_table.append( 44 | pd.Series( 45 | [0] * len(self.actions), 46 | index=self.q_table.columns, 47 | name=state, 48 | ) 49 | ) 50 | -------------------------------------------------------------------------------- /Q_Learning_maze/maze_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reinforcement learning maze example. 3 | 4 | Red rectangle: explorer. 5 | Black rectangles: hells [reward = -1]. 6 | Yellow bin circle: paradise [reward = +1]. 7 | All other states: ground [reward = 0]. 8 | 9 | This script is the environment part of this example. The RL is in RL_brain.py. 
10 | """ 11 | 12 | import numpy as np 13 | import time 14 | import sys 15 | 16 | import tkinter as tk 17 | 18 | UNIT = 40 # pixels 19 | MAZE_H = 4 # grid height 20 | MAZE_W = 4 # grid width 21 | 22 | 23 | class Maze(tk.Tk, object): 24 | def __init__(self): 25 | super(Maze, self).__init__() 26 | self.action_space = ['u', 'd', 'l', 'r'] 27 | self.n_actions = len(self.action_space) 28 | self.title('maze') 29 | self.geometry('{0}x{1}'.format(MAZE_H * UNIT, MAZE_H * UNIT)) 30 | self._build_maze() 31 | 32 | def _build_maze(self): 33 | self.canvas = tk.Canvas(self, bg='white', 34 | height=MAZE_H * UNIT, 35 | width=MAZE_W * UNIT) 36 | 37 | # create grids 38 | for c in range(0, MAZE_W * UNIT, UNIT): 39 | x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT 40 | self.canvas.create_line(x0, y0, x1, y1) 41 | for r in range(0, MAZE_H * UNIT, UNIT): 42 | x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r 43 | self.canvas.create_line(x0, y0, x1, y1) 44 | 45 | # create origin 46 | origin = np.array([20, 20]) 47 | 48 | # hell 49 | hell1_center = origin + np.array([UNIT * 2, UNIT]) 50 | self.hell1 = self.canvas.create_rectangle( 51 | hell1_center[0] - 15, hell1_center[1] - 15, 52 | hell1_center[0] + 15, hell1_center[1] + 15, 53 | fill='black') 54 | # hell 55 | hell2_center = origin + np.array([UNIT, UNIT * 2]) 56 | self.hell2 = self.canvas.create_rectangle( 57 | hell2_center[0] - 15, hell2_center[1] - 15, 58 | hell2_center[0] + 15, hell2_center[1] + 15, 59 | fill='black') 60 | 61 | # create oval 62 | oval_center = origin + UNIT * 2 63 | self.oval = self.canvas.create_oval( 64 | oval_center[0] - 15, oval_center[1] - 15, 65 | oval_center[0] + 15, oval_center[1] + 15, 66 | fill='yellow') 67 | 68 | # create red rect 69 | self.rect = self.canvas.create_rectangle( 70 | origin[0] - 15, origin[1] - 15, 71 | origin[0] + 15, origin[1] + 15, 72 | fill='red') 73 | 74 | # pack all 75 | self.canvas.pack() 76 | 77 | def reset(self): 78 | self.update() 79 | time.sleep(0.5) 80 | self.canvas.delete(self.rect) 81 | origin = np.array([20, 20]) 82 | self.rect = self.canvas.create_rectangle( 83 | origin[0] - 15, origin[1] - 15, 84 | origin[0] + 15, origin[1] + 15, 85 | fill='red') 86 | # return observation 87 | return self.canvas.coords(self.rect) 88 | 89 | def step(self, action): 90 | s = self.canvas.coords(self.rect) 91 | base_action = np.array([0, 0]) 92 | if action == 0: # up 93 | if s[1] > UNIT: 94 | base_action[1] -= UNIT 95 | elif action == 1: # down 96 | if s[1] < (MAZE_H - 1) * UNIT: 97 | base_action[1] += UNIT 98 | elif action == 2: # right 99 | if s[0] < (MAZE_W - 1) * UNIT: 100 | base_action[0] += UNIT 101 | elif action == 3: # left 102 | if s[0] > UNIT: 103 | base_action[0] -= UNIT 104 | 105 | self.canvas.move(self.rect, base_action[0], base_action[1]) # move agent 106 | 107 | s_ = self.canvas.coords(self.rect) # next state 108 | 109 | # reward function 110 | if s_ == self.canvas.coords(self.oval): 111 | reward = 1 112 | done = True 113 | s_ = 'terminal' 114 | elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]: 115 | reward = -1 116 | done = True 117 | s_ = 'terminal' 118 | else: 119 | reward = 0 120 | done = False 121 | 122 | return s_, reward, done 123 | 124 | def render(self): 125 | time.sleep(0.1) 126 | self.update() 127 | 128 | 129 | def update(): 130 | for t in range(10): 131 | s = env.reset() 132 | while True: 133 | env.render() 134 | a = 1 135 | s, r, done = env.step(a) 136 | if done: 137 | break 138 | 139 | 140 | if __name__ == '__main__': 141 | env = Maze() 142 | env.after(100, update) 143 | 
    env.mainloop()
144 | 
--------------------------------------------------------------------------------
/Q_Learning_maze/run_q_function.py:
--------------------------------------------------------------------------------
1 | """
2 | Reinforcement learning maze example.
3 | 
4 | Red rectangle: explorer.
5 | Black rectangles: hells [reward = -1].
6 | Yellow bin circle: paradise [reward = +1].
7 | All other states: ground [reward = 0].
8 | 
9 | This script is the main part which controls the update method of this example.
10 | The RL is in RL_brain.py.
11 | """
12 | 
13 | from maze_env import Maze
14 | from RL_brain import QLearningTable
15 | 
16 | 
17 | def update():
18 |     for episode in range(100):
19 |         # initial observation
20 |         observation = env.reset()
21 | 
22 |         while True:
23 |             # fresh env
24 |             env.render()
25 | 
26 |             # RL choose action based on observation
27 |             action = RL.choose_action(str(observation))
28 | 
29 |             # RL take action and get next observation and reward
30 |             observation_, reward, done = env.step(action)
31 | 
32 |             # RL learn from this transition
33 |             RL.learn(str(observation), action, reward, str(observation_))
34 | 
35 |             # swap observation
36 |             observation = observation_
37 | 
38 |             # break while loop when end of this episode
39 |             if done:
40 |                 break
41 | 
42 |     # end of game
43 |     print('game over')
44 |     env.destroy()
45 | 
46 | 
47 | if __name__ == "__main__":
48 |     env = Maze()
49 |     actions = list(range(env.n_actions))
50 |     print(actions)
51 |     RL = QLearningTable(actions=list(range(env.n_actions)))
52 | 
53 |     env.after(100, update)
54 |     env.mainloop()
55 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning with PyTorch
2 | PyTorch implementations of classic reinforcement learning algorithms
3 | 
4 | ## Requirements
5 | 
6 | 1. PyTorch
7 | 2. OpenAI Gym
8 | 
9 | ## Implemented algorithms
10 | 
11 | 
12 | 
13 | ## References
14 | 1. https://github.com/seungeunrho/minimalRL
--------------------------------------------------------------------------------
/REINFORCE/REINFORCE.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import torch.optim as optim
6 | from torch.distributions import Categorical
7 | 
8 | # Hyperparameters
9 | gamma = 0.98
10 | learning_rate = 0.0002
11 | MAX_EPISODE = 10000
12 | RENDER = True
13 | 
14 | env = gym.make('CartPole-v1')
15 | env = env.unwrapped
16 | env.seed(1)
17 | torch.manual_seed(1)
18 | 
19 | print("env.action_space :", env.action_space)
20 | print("env.observation_space :", env.observation_space)
21 | 
22 | n_features = env.observation_space.shape[0]
23 | n_actions = env.action_space.n
24 | 
25 | 
26 | class Policy(nn.Module):
27 |     def __init__(self):
28 |         super(Policy, self).__init__()
29 |         self.episode = []
30 | 
31 |         hidden_units = 10
32 |         self.fc_layer = nn.Sequential(nn.Linear(n_features, hidden_units),
33 |                                       nn.Sigmoid(),
34 |                                       nn.Linear(hidden_units, n_actions),
35 |                                       nn.Softmax(dim=-1))
36 |         self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
37 | 
38 |     def forward(self, x):
39 |         x = self.fc_layer(x)
40 |         return x
41 | 
42 |     def put_data(self, item):
43 |         self.episode.append(item)
44 | 
45 |     def train_net(self):
46 |         reward = 0
47 |         self.optimizer.zero_grad()
48 |         for r, prob in self.episode[::-1]:
49 |             reward = r + gamma * reward
50 |             loss = -torch.log(prob) * reward
51 |             loss.backward()
52 | 
53 |         self.optimizer.step()
54 | 
55 |         self.episode = []
56 | 
57 |     def choose_action(self, observation):
58 |         prob_weights = self.forward(torch.from_numpy(observation).float())
59 |         m = Categorical(prob_weights)
60 |         action_idx = m.sample()
61 |         return action_idx, prob_weights
62 | 
63 | 
64 | def main():
65 |     policy = Policy()
66 |     score = 0.0
67 |     print_interval = 20
68 | 
69 |     for n_epi in range(MAX_EPISODE):
70 |         s = env.reset()
71 |         done = False
72 | 
73 |         while not done:  # CartPole-v1 normally caps episodes at 500 steps; env.unwrapped removes that limit
74 | if RENDER: 75 | env.render() 76 | action, prob_weights = policy.choose_action(s) 77 | s_, r, done, info = env.step(action.item()) 78 | policy.put_data((r, prob_weights[action])) 79 | s = s_ 80 | score += r 81 | 82 | policy.train_net() 83 | 84 | if n_epi % print_interval == 0 and n_epi != 0: 85 | print("# of episode :{}, avg score : {}".format(n_epi, score / print_interval)) 86 | score = 0.0 87 | env.close() 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /SAC/SAC.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.optim as optim 6 | from torch.distributions import Normal 7 | import numpy as np 8 | import collections, random 9 | 10 | # Hyperparameters 11 | lr_pi = 0.0005 12 | lr_q = 0.001 13 | init_alpha = 0.01 14 | gamma = 0.98 15 | batch_size = 32 16 | buffer_limit = 50000 17 | tau = 0.01 # for target network soft update 18 | target_entropy = -1.0 # for automated alpha update 19 | lr_alpha = 0.001 # for automated alpha update 20 | 21 | 22 | class ReplayBuffer(): 23 | def __init__(self): 24 | self.buffer = collections.deque(maxlen=buffer_limit) 25 | 26 | def put(self, transition): 27 | self.buffer.append(transition) 28 | 29 | def sample(self, n): 30 | mini_batch = random.sample(self.buffer, n) 31 | s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], [] 32 | 33 | for transition in mini_batch: 34 | s, a, r, s_prime, done = transition 35 | s_lst.append(s) 36 | a_lst.append([a]) 37 | r_lst.append([r]) 38 | s_prime_lst.append(s_prime) 39 | done_mask = 0.0 if done else 1.0 40 | done_mask_lst.append([done_mask]) 41 | 42 | return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst, dtype=torch.float), \ 43 | torch.tensor(r_lst, dtype=torch.float), torch.tensor(s_prime_lst, dtype=torch.float), \ 44 | torch.tensor(done_mask_lst, dtype=torch.float) 45 | 46 | def size(self): 47 | return len(self.buffer) 48 | 49 | 50 | class PolicyNet(nn.Module): 51 | def __init__(self, learning_rate): 52 | super(PolicyNet, self).__init__() 53 | self.fc1 = nn.Linear(3, 128) 54 | self.fc_mu = nn.Linear(128, 1) 55 | self.fc_std = nn.Linear(128, 1) 56 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 57 | 58 | self.log_alpha = torch.tensor(np.log(init_alpha)) 59 | self.log_alpha.requires_grad = True 60 | self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr=lr_alpha) 61 | 62 | def forward(self, x): 63 | x = F.relu(self.fc1(x)) 64 | mu = self.fc_mu(x) 65 | std = F.softplus(self.fc_std(x)) 66 | dist = Normal(mu, std) 67 | action = dist.rsample() 68 | log_prob = dist.log_prob(action) 69 | real_action = torch.tanh(action) 70 | real_log_prob = log_prob - torch.log(1 - torch.tanh(action).pow(2) + 1e-7) 71 | return real_action, real_log_prob 72 | 73 | def train_net(self, q1, q2, mini_batch): 74 | s, _, _, _, _ = mini_batch 75 | a, log_prob = self.forward(s) 76 | entropy = -self.log_alpha.exp() * log_prob 77 | 78 | q1_val, q2_val = q1(s, a), q2(s, a) 79 | q1_q2 = torch.cat([q1_val, q2_val], dim=1) 80 | min_q = torch.min(q1_q2, 1, keepdim=True)[0] 81 | 82 | loss = -min_q - entropy # for gradient ascent 83 | self.optimizer.zero_grad() 84 | loss.mean().backward() 85 | self.optimizer.step() 86 | 87 | self.log_alpha_optimizer.zero_grad() 88 | alpha_loss = -(self.log_alpha.exp() * (log_prob + target_entropy).detach()).mean() 89 | alpha_loss.backward() 90 | 
self.log_alpha_optimizer.step() 91 | 92 | 93 | class QNet(nn.Module): 94 | def __init__(self, learning_rate): 95 | super(QNet, self).__init__() 96 | self.fc_s = nn.Linear(3, 64) 97 | self.fc_a = nn.Linear(1, 64) 98 | self.fc_cat = nn.Linear(128, 32) 99 | self.fc_out = nn.Linear(32, 1) 100 | self.optimizer = optim.Adam(self.parameters(), lr=learning_rate) 101 | 102 | def forward(self, x, a): 103 | h1 = F.relu(self.fc_s(x)) 104 | h2 = F.relu(self.fc_a(a)) 105 | cat = torch.cat([h1, h2], dim=1) 106 | q = F.relu(self.fc_cat(cat)) 107 | q = self.fc_out(q) 108 | return q 109 | 110 | def train_net(self, target, mini_batch): 111 | s, a, r, s_prime, done = mini_batch 112 | loss = F.smooth_l1_loss(self.forward(s, a), target) 113 | self.optimizer.zero_grad() 114 | loss.mean().backward() 115 | self.optimizer.step() 116 | 117 | def soft_update(self, net_target): 118 | for param_target, param in zip(net_target.parameters(), self.parameters()): 119 | param_target.data.copy_(param_target.data * (1.0 - tau) + param.data * tau) 120 | 121 | 122 | def calc_target(pi, q1, q2, mini_batch): 123 | s, a, r, s_prime, done = mini_batch 124 | 125 | with torch.no_grad(): 126 | a_prime, log_prob = pi(s_prime) 127 | entropy = -pi.log_alpha.exp() * log_prob 128 | q1_val, q2_val = q1(s_prime, a_prime), q2(s_prime, a_prime) 129 | q1_q2 = torch.cat([q1_val, q2_val], dim=1) 130 | min_q = torch.min(q1_q2, 1, keepdim=True)[0] 131 | target = r + gamma * done * (min_q + entropy) 132 | 133 | return target 134 | 135 | 136 | def main(): 137 | env = gym.make('Pendulum-v1') 138 | memory = ReplayBuffer() 139 | q1, q2, q1_target, q2_target = QNet(lr_q), QNet(lr_q), QNet(lr_q), QNet(lr_q) 140 | pi = PolicyNet(lr_pi) 141 | 142 | q1_target.load_state_dict(q1.state_dict()) 143 | q2_target.load_state_dict(q2.state_dict()) 144 | 145 | score = 0.0 146 | print_interval = 20 147 | 148 | for n_epi in range(10000): 149 | s = env.reset() 150 | 151 | done = False 152 | 153 | while not done: 154 | env.render() 155 | a, log_prob = pi(torch.from_numpy(s).float()) 156 | s_prime, r, done, info = env.step([2.0 * a.item()]) 157 | memory.put((s, a.item(), r / 10.0, s_prime, done)) 158 | score += r 159 | s = s_prime 160 | 161 | if memory.size() > 1000: 162 | for i in range(20): 163 | mini_batch = memory.sample(batch_size) 164 | td_target = calc_target(pi, q1_target, q2_target, mini_batch) 165 | q1.train_net(td_target, mini_batch) 166 | q2.train_net(td_target, mini_batch) 167 | entropy = pi.train_net(q1, q2, mini_batch) 168 | q1.soft_update(q1_target) 169 | q2.soft_update(q2_target) 170 | 171 | if n_epi % print_interval == 0 and n_epi != 0: 172 | print("# of episode :{}, avg score : {:.1f} alpha:{:.4f}".format(n_epi, score / print_interval, 173 | pi.log_alpha.exp())) 174 | score = 0.0 175 | 176 | env.close() 177 | 178 | 179 | if __name__ == '__main__': 180 | main() 181 | --------------------------------------------------------------------------------
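For reference, here is a minimal standalone sketch of how the prioritized replay buffer in `Noise_DQN/replay_buffer.py` is driven, i.e. the same push / sample(beta) / update_priorities cycle that `compute_td_loss` in `Noise_DQN/Noise_DQN.py` relies on. It is not part of the repository: the import assumes the script sits next to `replay_buffer.py`, and the random CartPole-shaped transitions and fake TD errors exist purely for illustration.

```python
import numpy as np
from replay_buffer import PrioritizedReplayBuffer  # assumed to be Noise_DQN/replay_buffer.py on the path

# The segment trees inside round the capacity up to a power of two.
buffer = PrioritizedReplayBuffer(1024, alpha=0.6)

# Fill the buffer with random CartPole-like transitions (4-dim states, 2 actions).
for _ in range(200):
    s = np.random.randn(4).astype(np.float32)
    s_next = np.random.randn(4).astype(np.float32)
    a = np.random.randint(2)
    r = float(np.random.rand())
    done = bool(np.random.rand() < 0.05)
    buffer.push(s, a, r, s_next, done)

# Sample proportionally to priority; beta controls the importance-sampling correction.
obs, act, rew, next_obs, done_mask, weights, idxes = buffer.sample(32, beta=0.4)
print(obs.shape, weights.max())  # weights are normalised so they never exceed 1

# After a learning step, the sampled transitions receive new priorities,
# e.g. |TD error| + eps, mirroring the `prios` update in compute_td_loss.
fake_td_errors = np.abs(np.random.randn(32)) + 1e-5
buffer.update_priorities(idxes, fake_td_errors)
```

During actual training, beta is annealed toward 1 over time (as `beta_by_frame` does in `Noise_DQN.py`), so the importance-sampling correction reaches full strength late in training.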