├── D3QN.py ├── DQN.py ├── MADDPG.py ├── README.md ├── Rl_net.py ├── base_env.py ├── env.py ├── replay_buffer.py └── run_this.py /D3QN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import copy 5 | 6 | from replay_buffer import ReplayBuffer 7 | from env import ENV 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | class Network(nn.Module): 11 | 12 | def __init__(self, n_features, n_actions): 13 | super().__init__() 14 | self.fc1 = nn.Linear(n_features, 16) 15 | self.fc1.weight.data.normal_(0, 0.3) 16 | self.fc1.bias.data.normal_(0.1) 17 | self.relu = nn.ReLU() 18 | self.fc2 = nn.Linear(16, 32) 19 | self.fc2.weight.data.normal_(0, 0.3) 20 | self.fc2.bias.data.normal_(0.1) 21 | self.fc3 = nn.Linear(32, 64) 22 | self.fc3.weight.data.normal_(0, 0.3) 23 | self.fc3.bias.data.normal_(0.1) 24 | self.out = nn.Linear(64, n_actions) 25 | self.out.weight.data.normal_(0, 0.3) 26 | self.out.bias.data.normal_(0.1) 27 | self.v = nn.Linear(64, 1) 28 | self.v.weight.data.normal_(0, 0.3) 29 | self.v.bias.data.normal_(0.1) 30 | 31 | 32 | def forward(self, x): 33 | x = self.fc1(x) 34 | x = self.relu(x) 35 | x = self.fc2(x) 36 | x = self.relu(x) 37 | x = self.fc3(x) 38 | x = self.relu(x) 39 | 40 | v = self.v(x) 41 | a = self.out(x) 42 | 43 | return v + a - torch.mean(a, dim=-1, keepdim=True) 44 | 45 | class D3QN: 46 | def __init__(self, 47 | env, 48 | learning_rate=0.01, 49 | reward_decay=0.9, 50 | e_greedy=0.9, 51 | replace_target_iter=300, 52 | memory_size=500, 53 | batch_size=5, 54 | e_greedy_increment=0.001, 55 | epoch=100 56 | ): 57 | # print("fea:", n_features) 58 | self.UEs = env.UEs 59 | self.n_actions = env.n_actions 60 | self.n_features = env.n_features 61 | self.actions = env.actions 62 | self.k = env.k 63 | self.learning_rate = learning_rate 64 | self.gama = reward_decay 65 | self.epsilon_max = e_greedy 66 | self.replace_target_iter = replace_target_iter 67 | self.memory_size = memory_size 68 | self.batch_size = batch_size 69 | self.epsilon_increment = e_greedy_increment 70 | self.epoch = epoch 71 | 72 | self.epsilon = 0 73 | self.learn_step_counter = 0 74 | 75 | # 初始化replay 76 | self.memory = ReplayBuffer(self.memory_size) 77 | 78 | self.cost_his = [] 79 | 80 | self.eval_net = [None for _ in range(self.UEs)] 81 | self.target_net = [None for _ in range(self.UEs)] 82 | self.optimizer = [None for _ in range(self.UEs)] 83 | 84 | for i in range(self.UEs): 85 | 86 | self.eval_net[i], self.target_net[i] = Network(self.n_features , self.n_actions), Network(self.n_features, 87 | self.n_actions) 88 | self.optimizer[i] = torch.optim.Adam(self.eval_net[i].parameters(), lr=learning_rate) 89 | 90 | self.loss_fun = nn.MSELoss() 91 | 92 | def store_memory(self, s, a, r, s_): 93 | self.memory.add(s, a, r, s_) 94 | 95 | def choose_action(self, observation): 96 | a = [] 97 | for i in range(self.UEs): 98 | # observation = np.array(observation[i]).reshape(1, self.n_features) 99 | obs = np.array(observation[i]).reshape(1, self.n_features) 100 | obs = torch.FloatTensor(obs[:]) # 增加一个维度 i.e[1,2,3,4,5]变成[[1,2,3,4,5]] 101 | if np.random.uniform() < self.epsilon: 102 | # 选择q值最大的动作 103 | actions_value = self.eval_net[i](obs) 104 | index = torch.max(actions_value, 1)[1].data.numpy() 105 | index = index[0] 106 | action = self.actions[index] 107 | else: 108 | index = np.random.randint(0, self.n_actions) 109 | action = self.actions[index] 110 | a.append(action) 111 | return a 112 | 113 | def learn(self, step, 
write): 114 | if self.learn_step_counter % self.replace_target_iter == 0: 115 | for i in range(self.UEs): 116 | self.target_net[i].load_state_dict(self.eval_net[i].state_dict()) # 直接赋值更新权重 117 | self.learn_step_counter += 1 118 | 119 | for agent_idx, (agent_eval, agent_target, opt) in \ 120 | enumerate(zip(self.eval_net, self.target_net, self.optimizer)): 121 | # 随机抽样 122 | obs, action, reward, obs_ = self.memory.sample(self.batch_size, agent_idx) 123 | actions_index = [] 124 | 125 | rew = torch.tensor(reward, dtype=torch.float) 126 | action_cur = torch.from_numpy(action).to(torch.float) 127 | for i in range(self.batch_size): 128 | for j in range(self.UEs): 129 | a = action_cur[i][j] 130 | action_index = a[0] * self.k + a[1] / (1 / (self.k - 1)) 131 | actions_index.append(int(action_index)) 132 | actions_index = torch.tensor(actions_index).reshape(self.batch_size, self.UEs, 1) 133 | obs_n = torch.from_numpy(obs).to(torch.float) 134 | obs_n_ = torch.from_numpy(obs_).to(torch.float) 135 | obs_n = obs_n.reshape(self.batch_size, self.UEs, self.n_features) 136 | obs_n_ = obs_n_.reshape(self.batch_size, self.UEs, self.n_features) 137 | 138 | q_target = torch.zeros((self.batch_size, self.UEs, 1)) 139 | q_eval = agent_eval(obs_n) 140 | q = q_eval 141 | 142 | q_eval = agent_eval(obs_n).gather(-1, actions_index) 143 | # q_eval = torch.gather(q_eval, dim=1, index=torch.unsqueeze(action_cur, 1)) 144 | q_next = agent_target(obs_n_).detach() 145 | 146 | for i in range(obs_n.shape[0]): 147 | for j in range(self.UEs): 148 | action = torch.argmax(q[i][j], 0).detach() 149 | q_target[i][j] = rew[i][j] + self.gama * q_next[i, j, action] 150 | 151 | loss = self.loss_fun(q_eval, q_target) 152 | write.add_scalar("Loss/DQN", loss, step) 153 | self.cost_his.append(loss) 154 | opt.zero_grad() 155 | loss.backward() 156 | opt.step() 157 | 158 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 159 | # 160 | # write = SummaryWriter(log_dir="logs") 161 | # # 162 | # env = ENV(3, 3, 11, 1) 163 | # # DQN = Double_DQN(env, env.n_actions, env.n_features*5) 164 | # DQN = Double_DQN(env) 165 | # epoch_reward = [0.0] 166 | # epoch_average_reward = [] 167 | # for epoch in range(1000): 168 | # observation = env.reset() 169 | # epoch_average_reward.append(epoch_reward[-1]/ (env.UEs * 100)) 170 | # epoch_reward.append(0) 171 | # print("epoch:{}, cost:{}".format(epoch, epoch_average_reward[epoch])) 172 | # # print("reset") 173 | # for step in range(100): 174 | # o1 = copy.deepcopy(observation) 175 | # o2 = copy.deepcopy(observation) 176 | # 177 | # action = DQN.choose_action(o1) 178 | # o_, reward = env.step(o2, action, is_prob=False, is_compared=False) 179 | # epoch_reward[-1] += np.sum(reward) 180 | # DQN.store_memory(o2, action, reward, o_) 181 | # DQN.learn(epoch, write) 182 | # observation = o_ 183 | # # print("action:", action) 184 | -------------------------------------------------------------------------------- /DQN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import copy 5 | 6 | from replay_buffer import ReplayBuffer 7 | from env import ENV 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | class Network(nn.Module): 11 | 12 | def __init__(self, n_features, n_actions): 13 | super().__init__() 14 | self.fc1 = nn.Linear(n_features, 16) 15 | self.fc1.weight.data.normal_(0, 0.3) 16 | self.fc1.bias.data.normal_(0.1) 17 | self.relu = nn.ReLU() 18 | 
self.fc2 = nn.Linear(16, 32) 19 | self.fc2.weight.data.normal_(0, 0.3) 20 | self.fc2.bias.data.normal_(0.1) 21 | self.fc3 = nn.Linear(32, 64) 22 | self.fc3.weight.data.normal_(0, 0.3) 23 | self.fc3.bias.data.normal_(0.1) 24 | self.out = nn.Linear(64, n_actions) 25 | self.out.weight.data.normal_(0, 0.3) 26 | self.out.bias.data.normal_(0.1) 27 | 28 | 29 | def forward(self, x): 30 | x = self.fc1(x) 31 | x = self.relu(x) 32 | x = self.fc2(x) 33 | x = self.relu(x) 34 | x = self.fc3(x) 35 | x = self.relu(x) 36 | return self.out(x) 37 | 38 | class Double_DQN: 39 | def __init__(self, 40 | env, 41 | learning_rate=0.01, 42 | reward_decay=0.9, 43 | e_greedy=0.9, 44 | replace_target_iter=300, 45 | memory_size=500, 46 | batch_size=5, 47 | e_greedy_increment=0.001, 48 | epoch=100 49 | ): 50 | # print("fea:", n_features) 51 | self.UEs = env.UEs 52 | self.n_actions = env.n_actions 53 | self.n_features = env.n_features 54 | self.actions = env.actions 55 | self.k = env.k 56 | self.learning_rate = learning_rate 57 | self.gama = reward_decay 58 | self.epsilon_max = e_greedy 59 | self.replace_target_iter = replace_target_iter 60 | self.memory_size = memory_size 61 | self.batch_size = batch_size 62 | self.epsilon_increment = e_greedy_increment 63 | self.epoch = epoch 64 | 65 | self.epsilon = 0 66 | self.learn_step_counter = 0 67 | 68 | # 初始化replay 69 | self.memory = ReplayBuffer(self.memory_size) 70 | 71 | self.cost_his = [] 72 | 73 | self.eval_net = [None for _ in range(self.UEs)] 74 | self.target_net = [None for _ in range(self.UEs)] 75 | self.optimizer = [None for _ in range(self.UEs)] 76 | 77 | for i in range(self.UEs): 78 | 79 | self.eval_net[i], self.target_net[i] = Network(self.n_features , self.n_actions), Network(self.n_features, 80 | self.n_actions) 81 | self.optimizer[i] = torch.optim.Adam(self.eval_net[i].parameters(), lr=learning_rate) 82 | 83 | self.loss_fun = nn.MSELoss() 84 | 85 | def store_memory(self, s, a, r, s_): 86 | self.memory.add(s, a, r, s_) 87 | 88 | def choose_action(self, observation): 89 | a = [] 90 | for i in range(self.UEs): 91 | # observation = np.array(observation[i]).reshape(1, self.n_features) 92 | obs = np.array(observation[i]).reshape(1, self.n_features) 93 | obs = torch.FloatTensor(obs[:]) # 增加一个维度 i.e[1,2,3,4,5]变成[[1,2,3,4,5]] 94 | if np.random.uniform() < self.epsilon: 95 | # 选择q值最大的动作 96 | actions_value = self.eval_net[i](obs) 97 | index = torch.max(actions_value, 1)[1].data.numpy() 98 | index = index[0] 99 | action = self.actions[index] 100 | else: 101 | index = np.random.randint(0, self.n_actions) 102 | action = self.actions[index] 103 | a.append(action) 104 | return a 105 | 106 | def learn(self, step, write): 107 | if self.learn_step_counter % self.replace_target_iter == 0: 108 | for i in range(self.UEs): 109 | self.target_net[i].load_state_dict(self.eval_net[i].state_dict()) # 直接赋值更新权重 110 | self.learn_step_counter += 1 111 | 112 | for agent_idx, (agent_eval, agent_target, opt) in \ 113 | enumerate(zip(self.eval_net, self.target_net, self.optimizer)): 114 | # 随机抽样 115 | obs, action, reward, obs_ = self.memory.sample(self.batch_size, agent_idx) 116 | actions_index = [] 117 | 118 | rew = torch.tensor(reward, dtype=torch.float) 119 | action_cur = torch.from_numpy(action).to(torch.float) 120 | for i in range(self.batch_size): 121 | for j in range(self.UEs): 122 | a = action_cur[i][j] 123 | action_index = a[0] * self.k + a[1] / (1 / (self.k - 1)) 124 | actions_index.append(int(action_index)) 125 | actions_index = torch.tensor(actions_index).reshape(self.batch_size, 
self.UEs, 1) 126 | obs_n = torch.from_numpy(obs).to(torch.float) 127 | obs_n_ = torch.from_numpy(obs_).to(torch.float) 128 | obs_n = obs_n.reshape(self.batch_size, self.UEs, self.n_features) 129 | obs_n_ = obs_n_.reshape(self.batch_size, self.UEs, self.n_features) 130 | 131 | q_target = torch.zeros((self.batch_size, self.UEs, 1)) 132 | q_eval = agent_eval(obs_n) 133 | q = q_eval 134 | 135 | q_eval = agent_eval(obs_n).gather(-1, actions_index) 136 | # q_eval = torch.gather(q_eval, dim=1, index=torch.unsqueeze(action_cur, 1)) 137 | q_next = agent_target(obs_n_).detach() 138 | 139 | for i in range(obs_n.shape[0]): 140 | for j in range(self.UEs): 141 | action = torch.argmax(q[i][j], 0).detach() 142 | q_target[i][j] = rew[i][j] + self.gama * q_next[i, j, action] 143 | 144 | loss = self.loss_fun(q_eval, q_target) 145 | write.add_scalar("Loss/DQN", loss, step) 146 | self.cost_his.append(loss) 147 | opt.zero_grad() 148 | loss.backward() 149 | opt.step() 150 | 151 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 152 | # 153 | # write = SummaryWriter(log_dir="logs") 154 | # # 155 | # env = ENV(3, 3, 11, 1) 156 | # # DQN = Double_DQN(env, env.n_actions, env.n_features*5) 157 | # DQN = Double_DQN(env) 158 | # epoch_reward = [0.0] 159 | # epoch_average_reward = [] 160 | # for epoch in range(1000): 161 | # observation = env.reset() 162 | # epoch_average_reward.append(epoch_reward[-1]/ (env.UEs * 100)) 163 | # epoch_reward.append(0) 164 | # print("epoch:{}, cost:{}".format(epoch, epoch_average_reward[epoch])) 165 | # # print("reset") 166 | # for step in range(100): 167 | # o1 = copy.deepcopy(observation) 168 | # o2 = copy.deepcopy(observation) 169 | # 170 | # action = DQN.choose_action(o1) 171 | # o_, reward = env.step(o2, action, is_prob=False, is_compared=False) 172 | # epoch_reward[-1] += np.sum(reward) 173 | # DQN.store_memory(o2, action, reward, o_) 174 | # DQN.learn(epoch, write) 175 | # observation = o_ 176 | # # print("action:", action) 177 | -------------------------------------------------------------------------------- /MADDPG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from Rl_net import actor, critic 5 | 6 | learning_start_step = 200 7 | learning_fre = 5 8 | batch_size = 64 9 | gamma = 0.9 10 | lr = 0.01 11 | max_grad_norm = 0.5 12 | save_model = 40 13 | save_dir = "models/simple_adversary" 14 | save_fer = 400 15 | tao = 0.01 16 | 17 | 18 | class Maddpg(object): 19 | 20 | def get_train(self, env, obs_shape_n, action_shape_n): 21 | actors_cur = [None for _ in range(env.UEs)] 22 | critics_cur = [None for _ in range(env.UEs)] 23 | actors_target = [None for _ in range(env.UEs)] 24 | critics_target = [None for _ in range(env.UEs)] 25 | optimizer_a = [None for _ in range(env.UEs)] 26 | optimizer_c = [None for _ in range(env.UEs)] 27 | 28 | 29 | for i in range(env.UEs): 30 | actors_cur[i] = actor(obs_shape_n[i], action_shape_n[i]) 31 | critics_cur[i] = critic(sum(obs_shape_n), sum(action_shape_n)) 32 | actors_target[i] = actor(obs_shape_n[i], action_shape_n[i]) 33 | critics_target[i] = critic(sum(obs_shape_n), sum(action_shape_n)) 34 | optimizer_a[i] = torch.optim.Adam(actors_cur[i].parameters(), lr=lr) 35 | optimizer_c[i] = torch.optim.Adam(critics_cur[i].parameters(), lr=lr) 36 | actors_tar = self.update_train(actors_cur, actors_target, 1.0) 37 | critics_tar = self.update_train(critics_cur, critics_target, 1.0) 38 | return actors_cur, 
critics_cur, actors_tar, critics_tar, optimizer_a, optimizer_c 39 | 40 | def update_train(self, agents_cur, agents_tar, tao): 41 | """ 42 | 用于更新target网络, 43 | 这个方法不同于直接复制,但结果一样 44 | out: 45 | |agents_tar: the agents with new par updated towards agents_current 46 | """ 47 | for agent_c, agent_t in zip(agents_cur, agents_tar): 48 | key_list = list(agent_c.state_dict().keys()) 49 | state_dict_t = agent_t.state_dict() 50 | state_dict_c = agent_c.state_dict() 51 | for key in key_list: 52 | state_dict_t[key] = state_dict_c[key] * tao + \ 53 | (1 - tao) * state_dict_t[key] 54 | agent_t.load_state_dict(state_dict_t) 55 | return agents_tar 56 | 57 | def agents_train(self, game_step, update_cnt, memory, obs_size, action_size, 58 | actors_cur, actors_tar, critics_cur, critics_tar, optimizers_a, optimizers_c, write): 59 | """ 60 | par: 61 | |input: the data for training 62 | |output: the data for next update 63 | """ 64 | 65 | # 训练 66 | if (game_step > learning_start_step) and (game_step % learning_fre == 0): 67 | if update_cnt == 0: print('\r=start training...' + ''*100) 68 | update_cnt += 1 69 | 70 | for agent_idx, (actor_c, actor_t, critic_c, critic_t, opt_a, opt_c) in \ 71 | enumerate(zip(actors_cur, actors_tar, critics_cur, critics_tar, optimizers_a, optimizers_c)): 72 | if opt_c == None: continue 73 | 74 | # 随机抽样 75 | rew = [] 76 | obs, action, reward, obs_ = memory.sample(batch_size, agent_idx) 77 | 78 | for i in range(batch_size): 79 | r = reward[i] 80 | ar = sum(r)/len(r) 81 | rew.append(ar) 82 | # update critic 83 | # rew = torch.tensor(reward, dtype=torch.float) 84 | rew = torch.tensor(rew, dtype=torch.float) 85 | action_cur = torch.from_numpy(action).to(torch.float) 86 | obs_n = torch.from_numpy(obs).to(torch.float) 87 | obs_n_ = torch.from_numpy(obs_).to(torch.float) 88 | action_tar = torch.cat([a_t(obs_n_[:, obs_size[idx][0]:obs_size[idx][1]]).detach() \ 89 | for idx, a_t in enumerate(actors_tar)], dim=1) 90 | q = critic_c(obs_n, action_cur).reshape(-1) # q 91 | q_ = critic_t(obs_n_, action_tar).reshape(-1) # q_ 92 | tar_value = q_ * gamma + rew 93 | loss_c = torch.nn.MSELoss()(q, tar_value) 94 | opt_c.zero_grad() 95 | loss_c.backward() 96 | nn.utils.clip_grad_norm_(critic_c.parameters(), max_grad_norm) 97 | opt_c.step() 98 | 99 | # update Actor 100 | # There is no need to cal other agent's action 101 | model_out, policy_c_new = actor_c( 102 | obs_n_[:, obs_size[agent_idx][0]:obs_size[agent_idx][1]], model_original_out=True) 103 | # update the action of this agent 104 | action_cur[:, action_size[agent_idx][0]:action_size[agent_idx][1]] = policy_c_new 105 | loss_pse = torch.mean(torch.pow(model_out, 2)) 106 | loss_a = torch.mul(-1, torch.mean(critic_c(obs_n, action_cur))) 107 | 108 | opt_a.zero_grad() 109 | loss_t = 1e-3 * loss_pse + loss_a 110 | loss_t.backward() 111 | nn.utils.clip_grad_norm_(actor_c.parameters(), max_grad_norm) 112 | opt_a.step() 113 | 114 | write.add_scalar("Loss/Actor", loss_t, game_step) 115 | write.add_scalar("Loss/Critic", loss_c, game_step) 116 | 117 | # # save model 118 | # if update_cnt > save_model and update_cnt % save_fer == 0: 119 | # time_now = time.strftime('%y%m_%d%H%M') 120 | # print('=time:{} step:{} save'.format(time_now, game_step)) 121 | # model_file_dir = os.path.join(save_dir, '{}_{}'.format(time_now, game_step)) 122 | # if not os.path.exists(model_file_dir): # make the path 123 | # os.makedirs(model_file_dir) 124 | # for agent_idx, (a_c, a_t, c_c, c_t) in \ 125 | # enumerate(zip(actors_cur, actors_tar, critics_cur, critics_tar)): 126 | # 
torch.save(a_c, os.path.join(model_file_dir, 'a_c_{}.pt'.format(agent_idx))) 127 | # torch.save(a_t, os.path.join(model_file_dir, 'a_t_{}.pt'.format(agent_idx))) 128 | # torch.save(c_c, os.path.join(model_file_dir, 'c_c_{}.pt'.format(agent_idx))) 129 | # torch.save(c_t, os.path.join(model_file_dir, 'c_t_{}.pt'.format(agent_idx))) 130 | 131 | # soft-update the target network parameters towards the current networks 132 | actors_tar = self.update_train(actors_cur, actors_tar, tao) 133 | critics_tar = self.update_train(critics_cur, critics_tar, tao) 134 | return update_cnt, actors_cur, actors_tar, critics_cur, critics_tar 135 | 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-Agent-Mec_Offloading-Use_DRL 2 | Uses DRL to solve the multi-agent computation-offloading problem. 3 | 4 | Compares the MADDPG and DQN algorithms. 5 | 6 | Environment reference paper: When Learning Joins Edge: Real-time Proportional Computation Offloading via Deep Reinforcement Learning (CCF-C). 7 | 8 | Current environment behaviour: every task is fully processed within its step; at the next step the uplink and downlink rates are randomly increased or decreased to change the state. The environment still has some issues and is being improved. 9 | 10 | Known issues: the task size and the computing capacities of the edge servers and the users are still being tuned. 11 | -------------------------------------------------------------------------------- /Rl_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class abstract_agent(nn.Module): 6 | 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def act(self, x): 11 | policy, value = self.forward(x) 12 | return policy, value 13 | 14 | class critic(abstract_agent): 15 | 16 | def __init__(self, obs_shape, act_shape): 17 | super().__init__() 18 | self.LRelu = nn.LeakyReLU(0.01) 19 | self.linear_c1 = nn.Linear(act_shape + obs_shape, 64) 20 | self.linear_c2 = nn.Linear(64, 64) 21 | self.linear_c = nn.Linear(64, 1) 22 | 23 | def reset_parameters(self): 24 | nn.init.xavier_uniform_(self.linear_c1.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 25 | nn.init.xavier_uniform_(self.linear_c2.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 26 | nn.init.xavier_uniform_(self.linear_c.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 27 | 28 | def forward(self, obs_input, act_input): 29 | x_cat = self.LRelu(self.linear_c1(torch.cat([obs_input, act_input], dim=1))) 30 | x = self.LRelu(self.linear_c2(x_cat)) 31 | x = self.linear_c(x) 32 | 33 | return x 34 | 35 | class actor(abstract_agent): 36 | 37 | def __init__(self, num_input, action_size): 38 | super().__init__() 39 | self.tanh = nn.Tanh() 40 | self.LRelu = nn.LeakyReLU(0.01) 41 | self.linear_a1 = nn.Linear(num_input, 64) 42 | self.linear_a2 = nn.Linear(64, 64) 43 | self.linear_a = nn.Linear(64, action_size) 44 | 45 | def reset_parameters(self): 46 | nn.init.xavier_uniform_(self.linear_a1.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 47 | nn.init.xavier_uniform_(self.linear_a2.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 48 | nn.init.xavier_uniform_(self.linear_a.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 49 | 50 | def forward(self, x, model_original_out=False): 51 | x = self.LRelu(self.linear_a1(x)) 52 | x = self.LRelu(self.linear_a2(x)) 53 | model_out = self.linear_a(x) 54 | u = torch.rand_like(model_out) 55 | policy = F.softmax(model_out - torch.log(-torch.log(u)), dim=-1) # Gumbel-softmax style sampling: perturb the logits with -log(-log(u)) noise 56 | if model_original_out: 57 | return model_out, policy 58 | return policy -------------------------------------------------------------------------------- /base_env.py:
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | 4 | 5 | 6 | class base(object): 7 | def __init__(self): 8 | # 频率 9 | self.Hz = 1 10 | self.kHz = 1000 * self.Hz 11 | self.mHz = 1000 * self.kHz 12 | self.GHz = 1000 * self.mHz 13 | 14 | # 数据大小 15 | self.bit = 1 16 | self.B = 8 * self.bit 17 | self.KB = 1024 * self.B 18 | self.MB = 1024 * self.KB 19 | 20 | self.task_cpu_cycle = np.random.randint(500, 1000) # 处理一bit任务所需要的CPU频率 21 | self.task_size = np.random.randint(2 * 10**9, 3 * 10**9) # 任务大小 22 | self.task_require_cycle = self.task_size * self.task_cpu_cycle # 处理一个任务所需要的cpu频率 23 | 24 | # 处理任务的时间 = 任务所需的cpu频率/设备的计算能力 25 | 26 | self.UE_f = np.random.randint(1.5 * self.GHz, 2 * self.GHz) # UE的计算能力 27 | self.MEC_f = np.random.randint(5 * self.GHz, 7 * self.GHz) # MEC的计算能力 28 | 29 | # 能耗 30 | self.J = 1 31 | self.mJ = 1000 * 1000 * self.J 32 | 33 | self.tr_energy = 1 * self.J # 传输1s的能耗 34 | self.w = 10**(-28) # 能耗系数 35 | 36 | """ 37 | 进行简化 38 | 设置传输速率为:14 Mbps 39 | 传输时计算的是任务大小 40 | """ 41 | self.r = 293 * self.MB 42 | 43 | print(40 * math.log2(1 + (16 * 10))) 44 | -------------------------------------------------------------------------------- /env.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | 4 | import numpy as np 5 | 6 | class ENV(): 7 | def __init__(self, UEs, MECs, k, lam): 8 | self.UEs = UEs 9 | self.MECs = MECs 10 | self.k = k 11 | 12 | q = np.full((k, 1), 0.) 13 | p = np.linspace(0, 1, k).reshape((k, 1)) 14 | # 创建动作 15 | for i in range(MECs - 1): 16 | a = np.full((k, 1), float(i + 1)) 17 | b = np.linspace(0, 1, k).reshape((k, 1)) 18 | q = np.append(q, a, axis=0) 19 | p = np.append(p, b, axis=0) 20 | 21 | self.actions = np.hstack((q, p)) 22 | self.n_actions = len(self.actions) 23 | self.n_features = 3 + MECs * 3 24 | self.discount = 0.01 25 | 26 | # 基本参数 27 | # 频率 28 | self.Hz = 1 29 | self.kHz = 1000 * self.Hz 30 | self.mHz = 1000 * self.kHz 31 | self.GHz = 1000 * self.mHz 32 | self.nor = 10**(-7) 33 | self.nor1 = 10**19 34 | 35 | # 数据大小 36 | self.bit = 1 37 | self.B = 8 * self.bit 38 | self.KB = 1024 * self.B 39 | self.MB = 1024 * self.KB 40 | 41 | 42 | # self.task_cpu_cycle = np.random.randint(2 * 10**9, 3* 10**9) 43 | 44 | self.UE_f = np.random.randint(1.5 * self.GHz * self.nor, 2 * self.GHz * self.nor) # UE的计算能力 45 | self.MEC_f = np.random.randint(5 * self.GHz * self.nor, 7 * self.GHz * self.nor) # MEC的计算能力 46 | # self.UE_f = 500 * self.mHz # UE的计算能力 47 | # self.MEC_f = np.random.randint(5.2 * self.GHz, 24.3 * self.GHz) # MEC的计算能力 48 | self.tr_energy = 1 # 传输能耗 49 | self.r = 40 * math.log2(1 + (16 * 10)) * self.MB * self.nor # 传输速率 50 | # self.r = 800 # 传输速率 51 | self.ew, self.lw = 10**(-26), 3 * 10**(-26)# 能耗系数 52 | # self.ew, self.lw = 0.3, 0.15 # 能耗系数 53 | self.et, self.lt = 1, 1 54 | self.local_core_max, self.local_core_min = 1.3 * self.UE_f, 0.7 * self.UE_f 55 | self.server_core_max, self.server_core_min = 1.3 * self.MEC_f, 0.7 * self.MEC_f 56 | self.uplink_max, self.uplink_min = 1.3 * self.r, 0.7 * self.r 57 | self.downlink_max, self.downlink_min = 1.3 * self.r, 0.7 * self.r 58 | self.lam = lam 59 | self.e = 1 60 | 61 | 62 | def reset(self): 63 | obs = [] 64 | servers_cap = [] 65 | new_cap = True 66 | for i in range(self.UEs): 67 | uplink, downlink = [], [] 68 | # np.random.seed(np.random.randint(1, 1000)) 69 | # task_size = np.random.randint(2 * 10**8 * self.nor, 3 * 10**8 * self.nor) # 任务大小 70 | task_size = np.random.randint(1.5 
* self.mHz, 2 * self.mHz) # 任务大小 71 | # self.task_size = self.task_size * self.task_cpu_cycle # 处理一个任务所需要的cpu频率 72 | # task_cpu_cycle = np.random.randint(2 * 10**9 * self.nor, 3 * 10**9 * self.nor) 73 | task_cpu_cycle = np.random.randint(10**3, 10**5) 74 | local_comp = np.random.randint(0.9 * self.UE_f, 1.1 * self.UE_f) # UE的计算能力 75 | for i in range(self.MECs): 76 | up = np.random.randint(0.9 * self.r, 1.1 * self.r) 77 | down = np.random.randint(0.9 * self.r, 1.1 * self.r) 78 | if new_cap: 79 | cap = np.random.randint(0.9 * self.MEC_f, 1.1 * self.MEC_f) # MEC计算能力 80 | servers_cap.append(cap) 81 | uplink.append(up) 82 | downlink.append(down) 83 | observation = np.array([task_size, task_cpu_cycle, local_comp]) 84 | observation = np.hstack((observation, servers_cap, uplink, downlink)) 85 | obs.append(observation) 86 | new_cap = False 87 | return obs 88 | 89 | def choose_action(self, prob): 90 | """ 91 | 根据概率选择动作 92 | :param env: 93 | :param prob: 94 | :return: [[target_server, percentage]] 95 | """ 96 | action_choice = np.linspace(0, 1, self.k) 97 | actions = [] 98 | for i in range(self.UEs): 99 | a = np.random.choice(a=(self.MECs * self.k), p=prob[i]) # 在数组p中从a个数字中以概率p选中一个 100 | target_server = int(a / self.k) 101 | percen = action_choice[a % self.k] 102 | action = [target_server, percen] 103 | actions.append(action) 104 | return actions 105 | 106 | def step(self, observation, actions_prob, is_prob=True, is_compared=True): 107 | if is_prob: 108 | actions = self.choose_action(actions_prob) 109 | else: actions = actions_prob 110 | new_cap = False 111 | obs_ = [] 112 | rew, local, ran, mec = [], [], [], [] 113 | dpg_times, local_times, ran_times, mec_times = [], [], [], [] 114 | dpg_energys, local_energys, ran_energys, mec_energys = [], [], [], [] 115 | total = [] 116 | a, b, c, d = 0, 0, 0, 0 117 | for i in range(self.UEs): 118 | if i == self.UEs - 1: new_cap = True 119 | # 提取信息 120 | task_size, task_cpu_cycle, local_comp, servers_cap, uplink, downlink = \ 121 | observation[i][0], observation[i][1], observation[i][2], observation[i][3:3+self.MECs], observation[i][3+self.MECs:3+self.MECs*2], observation[i][3+self.MECs*2:3+self.MECs*3] 122 | # wait_local, wait_server = np.random.randint(0, 2), np.random.randint(0, 3) 123 | 124 | action = actions[i] 125 | target_server, percen = int(action[0]), action[1] 126 | 127 | # 计算奖励 128 | # 本地和服务器上都有 129 | tr_time = (percen * task_size) / uplink[target_server] + self.discount * ( percen * task_size) / downlink[target_server] 130 | tr_energy = (self.tr_energy * percen * task_size) / uplink[target_server] + self.discount * (self.tr_energy * percen * task_size) / downlink[target_server] 131 | 132 | 133 | comp_local_time = task_cpu_cycle * (1 - percen) / (local_comp) 134 | comp_local_energy = self.lw * task_cpu_cycle * (1 - percen) * local_comp**2 135 | # comp_local_energy = task_size * (1 - percen) * local_comp 136 | 137 | 138 | comp_mec_time = (percen * task_cpu_cycle) / servers_cap[target_server] 139 | comp_mec_energy =self.ew * percen * task_cpu_cycle * servers_cap[target_server]**2 140 | # comp_mec_energy =percen * task_size * servers_cap[target_server] 141 | 142 | comp_time = max(comp_local_time, comp_mec_time) 143 | time_cost = (comp_time + tr_time) * self.et 144 | energy_cost = (tr_energy + comp_local_energy + comp_mec_energy) * self.e 145 | 146 | total_cost = self.lam * time_cost + (1 - self.lam) * energy_cost 147 | 148 | # reward = -total_cost 149 | 150 | # 全本地 151 | local_only_time = task_cpu_cycle/(local_comp) * self.et 152 | local_only_energy = 
self.lw * task_cpu_cycle * local_comp**2 * self.e 153 | # local_only_energy = task_size * local_comp 154 | local_only = self.lam * local_only_time + (1 - self.lam) * local_only_energy 155 | # print("task_cpu_cycle:", task_cpu_cycle) 156 | # print("local_comp", local_comp) 157 | # print("local_only_time:", local_only_time) 158 | # print("local_only_energy:", local_only_energy) 159 | # print("local_only:", local_only) 160 | 161 | # 全边缘 162 | mec_only_tr_time = task_size / uplink[target_server] + self.discount * task_size / downlink[target_server] 163 | mec_only_tr_energy = self.tr_energy * task_size / uplink[target_server] + self.discount * self.tr_energy * task_size / downlink[target_server] 164 | # print("mec_only_tr_time:", mec_only_tr_time) 165 | # print("mec_only_tr_energy:", mec_only_tr_energy) 166 | 167 | 168 | mec_only_comp_time = task_cpu_cycle / servers_cap[target_server] 169 | mec_only_comp_energy = self.ew * task_cpu_cycle * servers_cap[target_server]**2 170 | # mec_only_comp_energy = task_size * servers_cap[target_server] 171 | # print("mec_only_comp_time:", mec_only_comp_time) 172 | # print("mec_only_comp_energy:", mec_only_comp_energy) 173 | 174 | mec_only_time_cost = (mec_only_tr_time + mec_only_comp_time) * self.et 175 | mec_only_energy_cost = (mec_only_tr_energy + mec_only_comp_energy) * self.e 176 | 177 | mec_only = self.lam * mec_only_time_cost + (1 - self.lam) * mec_only_energy_cost 178 | # print("mec_only_time_cost:", mec_only_time_cost) 179 | # print("mec_only_energy_cost:", mec_only_energy_cost) 180 | # print("----------------------------:", servers_cap[target_server]) 181 | 182 | 183 | # 随机卸载 184 | percen_ran = np.random.uniform() # 随机卸载比例 185 | mec_ran = np.random.randint(self.MECs) # 随机选择一个服务器进行卸载 186 | 187 | random_tr_time = (percen_ran * task_size) / uplink[mec_ran] + (self.discount * percen_ran * task_size) / downlink[mec_ran] 188 | random_tr_energy = (self.tr_energy * percen_ran * task_size) / uplink[mec_ran] + self.discount * (self.tr_energy * percen_ran * task_size) / downlink[mec_ran] 189 | 190 | random_comp_local_time = (1 - percen_ran) * task_cpu_cycle / local_comp 191 | random_comp_local_energy = self.lw * (1 - percen_ran) * task_cpu_cycle * local_comp**2 192 | # random_comp_local_energy = (1 - percen_ran) * task_size * local_comp 193 | 194 | random_comp_mec_time = percen_ran * task_cpu_cycle / servers_cap[mec_ran] 195 | random_comp_mec_energy = self.ew * percen_ran * task_cpu_cycle * servers_cap[mec_ran]**2 196 | # random_comp_mec_energy = percen_ran * task_size * servers_cap[mec_ran] 197 | 198 | random_comp_time = max(random_comp_local_time, random_comp_mec_time) 199 | random_time_cost = (random_comp_time + random_tr_time) * self.et 200 | random_energy_cost = (random_tr_energy + random_comp_local_energy + random_comp_mec_energy) * self.e 201 | 202 | 203 | random_total = self.lam * random_time_cost + (1 - self.lam) * random_energy_cost 204 | random_total_cost2 = random_energy_cost 205 | 206 | # if total_cost < random_total or total_cost < mec_only or total_cost < local_only: 207 | # reward = -total_cost 208 | # else: 209 | # print("惩罚") 210 | # reward = -1999 211 | 212 | reward = -total_cost 213 | 214 | # a += total_cost 215 | # b += mec_only 216 | # c += local_only 217 | # d += random_total 218 | 219 | # 得到下一个observation 220 | x = np.random.uniform() 221 | y = 0.5 222 | if (x > y): 223 | local_comp = min(local_comp + np.random.randint(0, 0.2 * self.UE_f), self.local_core_max) 224 | for j in range(self.MECs): 225 | cap = min(servers_cap[j] + 
np.random.randint(0, 0.3 * self.UE_f), self.server_core_max) 226 | # MEC容量保持一致 227 | if new_cap: 228 | for x in range(self.UEs): 229 | observation[x][2 + j] = cap 230 | downlink[j] = min(downlink[j] + np.random.randint(0, 0.2 * self.r), self.downlink_max) 231 | uplink[j] = min(uplink[j] + np.random.randint(0, 0.2 * self.r), self.uplink_max) 232 | else: 233 | local_comp = max(local_comp + np.random.randint(-0.2 * self.UE_f, 0), self.local_core_min) 234 | for j in range(self.MECs): 235 | # MEC容量保持一致 236 | if new_cap: 237 | cap = max(servers_cap[j] + np.random.randint(0, 0.3 * self.UE_f), self.server_core_max) 238 | for x in range(self.UEs): 239 | observation[x][2 + j] = cap 240 | downlink[j] = max(downlink[j] - np.random.randint(0, 0.2 * self.r), self.downlink_min) 241 | uplink[j] = max(uplink[j] - np.random.randint(0, 0.2 * self.r), self.uplink_min) 242 | 243 | task_size = np.random.randint(10, 50) 244 | task_cpu_cycle = np.random.randint(10**3, 10**5) # 处理任务所需要的CPU频率 245 | observation_ = np.array([task_size, task_cpu_cycle, local_comp]) 246 | observation_ = np.hstack((observation_, servers_cap, uplink, downlink)) 247 | obs_.append(observation_) 248 | 249 | rew.append(reward) 250 | local.append(local_only) 251 | mec.append(mec_only) 252 | ran.append(random_total) 253 | 254 | dpg_times.append(time_cost) 255 | local_times.append(local_only_time) 256 | mec_times.append(mec_only_time_cost) 257 | ran_times.append(random_time_cost) 258 | 259 | dpg_energys.append(energy_cost) 260 | local_energys.append(local_only_energy) 261 | mec_energys.append(mec_only_energy_cost) 262 | ran_energys.append(random_energy_cost) 263 | 264 | total.append(total_cost) 265 | 266 | # if (a - b > 10 * self.UEs) or (a - c > 10 * self.UEs) or (a - d > 10 * self.UEs): 267 | # print("惩罚") 268 | # # print(a ,b, c, d) 269 | # for i in range(self.UEs): 270 | # rew[i] = -999 271 | # else: 272 | # pass 273 | 274 | if is_compared: 275 | return obs_, rew, local, mec, ran, dpg_times, local_times, mec_times, ran_times, dpg_energys, local_energys, mec_energys, ran_energys, total 276 | else: 277 | return obs_, rew, dpg_times, dpg_energys 278 | # return obs_, total 279 | 280 | -------------------------------------------------------------------------------- /replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | def __init__(self, size): 6 | self.storage = [] 7 | self.maxsize = int(size) 8 | self.next_idx = 0 9 | 10 | def __len__(self): 11 | return len(self.storage) 12 | 13 | def clear(self): 14 | self.storage = [] 15 | self.next_idx = 0 16 | 17 | def add(self, o, a, r, o_): 18 | data = (o, a, r, o_) 19 | 20 | if self.next_idx >= len(self.storage): 21 | self.storage.append(data) 22 | else: 23 | self.storage[self.next_idx] = data 24 | self.next_idx = (self.next_idx + 1) % self.maxsize 25 | 26 | # 提取每个agent在replay buffter中的值 27 | def encode_sample(self, idxes, agent_dix): 28 | observations, actions, rewards, observations_ = [], [], [], [] 29 | for i in idxes: 30 | data = self.storage[i] 31 | obs, act, rew, obs_ = data 32 | observations.append(np.concatenate(obs[:])) 33 | actions.append(act) 34 | # rewards.append(rew[agent_dix]) 35 | rewards.append(rew) 36 | observations_.append(np.concatenate(obs_[:])) 37 | return np.array(observations), np.array(actions), np.array(rewards), np.array(observations_) 38 | 39 | # 随机抽样 40 | def make_index(self, batch_size): 41 | return [random.randint(0, len(self.storage) - 1) for _ in 
range(batch_size)] 42 | 43 | def sample(self, batch_size, agent_dix): 44 | """Sample a batch of experiences. 45 | Parameters 46 | ---------- 47 | batch_size: int 48 | How many transitions to sample. 49 | Returns 50 | ------- 51 | obs_batch: np.array 52 | batch of observations 53 | act_batch: np.array 54 | batch of actions executed given obs_batch 55 | rew_batch: np.array 56 | rewards received as results of executing act_batch 57 | next_obs_batch: np.array 58 | next set of observations seen after executing act_batch 59 | the end of an episode and 0 otherwise. 60 | """ 61 | if batch_size > 0: 62 | idxes = self.make_index(batch_size) 63 | else: 64 | idxes = range(0, len(self.storage)) 65 | return self.encode_sample(idxes, agent_dix) 66 | 67 | # buffter = ReplayBuffter(100) 68 | # o = [] 69 | # obs = np.array([238, 212, 228, 202, 213, 168, 180, 171, 195, 191, 311, 306, 408, 384, 351]) 70 | # a = np.array([238, 212, 228, 202, 213, 175, 159, 165, 189, 189, 381, 416, 443, 488, 317]) 71 | # o.append(obs) 72 | # o.append(a) 73 | # a = [] 74 | # action1 = np.array([1, 0.1]) 75 | # action2 = np.array([2, 0.2]) 76 | # a.append(action1) 77 | # a.append(action2) 78 | # r = [1000, 2000] 79 | # o_ = [] 80 | # obs_1 = np.array([38, 212, 228, 202, 213, 175, 159, 165, 189, 189, 381, 416, 443, 488, 317]) 81 | # obs_2 = np.array([38, 212, 228, 202, 213, 175, 159, 165, 189, 189, 381, 416, 443, 488, 317]) 82 | # o_.append(obs_1) 83 | # o_.append(obs_2) 84 | # buffter.add(o, a, r, o_) 85 | # 86 | # buffter.sample(1, 1) -------------------------------------------------------------------------------- /run_this.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | from torch.utils.tensorboard import SummaryWriter 4 | import numpy as np 5 | 6 | from env import ENV 7 | from replay_buffer import ReplayBuffer 8 | from MADDPG import Maddpg 9 | from DQN import Double_DQN 10 | from D3QN import D3QN 11 | 12 | learning_start_step = 200 13 | learning_fre = 5 14 | batch_size = 64 15 | gamma = 0.9 16 | lr = 0.01 17 | max_grad_norm = 0.5 18 | save_model = 40 19 | save_dir = "models/simple_adversary" 20 | save_fer = 400 21 | tao = 0.01 22 | memory_size = 2000 23 | EPOCH = 350 24 | STEP = 200 25 | 26 | write = SummaryWriter(log_dir="logs") 27 | 28 | def train(ue=3, mec=7, k=11*3, lam=0.5): 29 | """step1:create the environment""" 30 | u = ue 31 | m = mec 32 | k = k 33 | lam = lam 34 | env = ENV(u, m, k, lam) # UE: MEC:, k: 35 | maddpg = Maddpg() 36 | dqn = Double_DQN(env) 37 | d3qn = D3QN(env) 38 | 39 | 40 | print('=============================') 41 | print('=1 Env {} is right ...') 42 | print('=============================') 43 | 44 | """step2:create agent""" 45 | obs_shape_n = [env.n_features for i in range(env.UEs)] 46 | action_shape_n = [env.n_actions for i in range(env.UEs)] 47 | actors_cur, critic_cur, actors_tar, critic_tar, optimizers_a, optimizers_c = \ 48 | maddpg.get_train(env, obs_shape_n, action_shape_n) 49 | memory_dpg = ReplayBuffer(memory_size) 50 | # memory_dqn = ReplayBuffer(memory_size) 51 | 52 | print('=2 The {} agents are inited ...'.format(env.UEs)) 53 | print('=============================') 54 | 55 | """step3: init the pars """ 56 | obs_size = [] 57 | action_size = [] 58 | game_step = 0 59 | update_cnt = 0 60 | episode_rewards, episode_dqn, episode_d3qn, episode_local, episode_mec, episode_ran = [0.0], [0.0], [0.0], [0.0], [0.0], [0.0] # sum of rewards for all agents 61 | episode_time_dpg, episode_time_dqn, episode_time_d3qn, 
episode_time_local, episode_time_ran, episode_time_mec = [0.0], [0.0], [0.0], [0.0], [0.0], [0.0] 62 | episode_energy_dpg, episode_energy_dqn, episode_energy_d3qn, episode_energy_local, episode_energy_ran, episode_energy_mec = [0.0], [0.0], [0.0], [0.0], [0.0], [0.0] 63 | episode_total_cost = [0.0] 64 | # agent_rewards = [[0.0] for _ in range(env.UEs)] # individual agent reward 65 | epoch_average_reward, epoch_average_dqn, epoch_average_d3qn, epoch_average_local, epoch_average_mec, epoch_average_ran= [], [], [], [], [], [] 66 | epoch_average_time_reward, epoch_average_time_dqn, epoch_average_time_d3qn, epoch_average_time_local, epoch_average_time_mec, epoch_average_time_ran= [], [], [], [], [], [] 67 | epoch_average_energy_reward, epoch_average_energy_dqn, epoch_average_energy_d3qn, epoch_average_energy_local, epoch_average_energy_mec, epoch_average_energy_ran= [], [], [], [], [], [] 68 | epoch_average_total_cost = [] 69 | 70 | head_o, head_a, end_o, end_a = 0, 0, 0, 0 71 | for obs_shape, action_shape in zip(obs_shape_n, action_shape_n): 72 | end_o = end_o + obs_shape 73 | end_a = end_a + action_shape 74 | range_o = (head_o, end_o) 75 | range_a = (head_a, end_a) 76 | obs_size.append(range_o) 77 | action_size.append(range_a) 78 | head_o = end_o 79 | head_a = end_a 80 | 81 | print('=3 starting iterations ...') 82 | print('=============================') 83 | 84 | for epoch in range(EPOCH): 85 | obs = env.reset() 86 | 87 | for time_1 in range(STEP): 88 | 89 | action_prob = [agent(torch.from_numpy(observation).to(torch.float)).detach().cpu().numpy() \ 90 | for agent, observation in zip(actors_cur, obs)] 91 | action_dqn = dqn.choose_action(obs) 92 | action_d3qn = d3qn.choose_action(obs) 93 | 94 | o1 = copy.deepcopy(obs) 95 | o2 = copy.deepcopy(obs) 96 | obs_old = copy.deepcopy(obs) 97 | obs_, rew, local, mec, ran, time_dpg, time_local, time_mec, time_ran, energy_dpg, energy_local, energy_mec, energy_ran, total_cost = env.step(obs, action_prob) 98 | obs_dqn, rew_dqn, time_dqn, energy_dqn = env.step(o1, action_dqn, is_prob=False, is_compared=False) 99 | obs_d3qn, rew_d3qn, time_d3qn, energy_d3qn = env.step(o2, action_d3qn, is_prob=False, is_compared=False) 100 | 101 | 102 | # save the expeeinece 103 | memory_dpg.add(obs_old, np.concatenate(action_prob), rew, obs_) 104 | dqn.store_memory(obs_old, action_dqn, rew_dqn, obs_dqn) 105 | d3qn.store_memory(obs_old, action_d3qn, rew_d3qn, obs_d3qn) 106 | 107 | episode_rewards[-1] += np.sum(rew) 108 | episode_dqn[-1] += np.sum(rew_dqn) 109 | episode_d3qn[-1] += np.sum(rew_d3qn) 110 | episode_local[-1] += np.sum(local) 111 | episode_mec[-1] += np.sum(mec) 112 | episode_ran[-1] += np.sum(ran) 113 | 114 | episode_time_dpg[-1] += np.sum(time_dpg) 115 | episode_time_dqn[-1] += np.sum(time_dqn) 116 | episode_time_d3qn[-1] += np.sum(time_d3qn) 117 | episode_time_local[-1] += np.sum(time_local) 118 | episode_time_ran[-1] += np.sum(time_ran) 119 | episode_time_mec[-1] += np.sum(time_mec) 120 | 121 | episode_energy_dpg[-1] += np.sum(energy_dpg) 122 | episode_energy_dqn[-1] += np.sum(energy_dqn) 123 | episode_energy_d3qn[-1] += np.sum(energy_d3qn) 124 | episode_energy_local[-1] += np.sum(energy_local) 125 | episode_energy_mec[-1] += np.sum(energy_mec) 126 | episode_energy_ran[-1] += np.sum(energy_ran) 127 | episode_total_cost[-1] += np.sum(total_cost) 128 | # for i, rew in enumerate(rew):agent_rewards[i][-1] += rew 129 | 130 | # train agent 131 | if game_step > 1000 and game_step % 100 == 0: 132 | update_cnt, actors_cur, actors_tar, critic_cur, critic_tar = 
maddpg.agents_train( 133 | game_step, update_cnt, memory_dpg, obs_size, action_size, 134 | actors_cur, actors_tar, critic_cur, critic_tar, optimizers_a, optimizers_c, write) 135 | dqn.learn(game_step, write) 136 | d3qn.learn(game_step, write) 137 | 138 | # update obs 139 | game_step += 1 140 | obs = obs_ 141 | epoch_average_reward.append(- episode_rewards[-1] / (env.UEs * STEP)) 142 | epoch_average_dqn.append(- episode_dqn[-1] / (env.UEs * STEP)) 143 | epoch_average_d3qn.append(- episode_d3qn[-1] / (env.UEs * STEP)) 144 | epoch_average_local.append(episode_local[-1] / (env.UEs * STEP)) 145 | epoch_average_mec.append(episode_mec[-1] / (env.UEs * STEP)) 146 | epoch_average_ran.append(episode_ran[-1] / (env.UEs * STEP)) 147 | 148 | epoch_average_time_reward.append(episode_time_dpg[-1] / (env.UEs * STEP)) 149 | epoch_average_time_dqn.append(episode_time_dqn[-1] / (env.UEs * STEP)) 150 | epoch_average_time_d3qn.append(episode_time_dqn[-1] / (env.UEs * STEP)) 151 | epoch_average_time_local.append(episode_time_local[-1] / (env.UEs * STEP)) 152 | epoch_average_time_mec.append(episode_time_mec[-1] / (env.UEs * STEP)) 153 | epoch_average_time_ran.append(episode_time_ran[-1] / (env.UEs * STEP)) 154 | 155 | epoch_average_energy_reward.append(episode_energy_dpg[-1] / (env.UEs * STEP)) 156 | epoch_average_energy_dqn.append(episode_energy_dqn[-1] / (env.UEs * STEP)) 157 | epoch_average_energy_d3qn.append(episode_energy_dqn[-1] / (env.UEs * STEP)) 158 | epoch_average_energy_local.append(episode_energy_local[-1] / (env.UEs * STEP)) 159 | epoch_average_energy_mec.append(episode_energy_mec[-1] / (env.UEs * STEP)) 160 | epoch_average_energy_ran.append(episode_energy_ran[-1] / (env.UEs * STEP)) 161 | epoch_average_total_cost.append(episode_total_cost[-1] / (env.UEs * STEP)) 162 | 163 | episode_rewards.append(0) 164 | episode_dqn.append(0) 165 | episode_d3qn.append(0) 166 | episode_local.append(0) 167 | episode_mec.append(0) 168 | episode_ran.append(0) 169 | 170 | episode_time_dpg.append(0) 171 | episode_time_dqn.append(0) 172 | episode_time_d3qn.append(0) 173 | episode_time_local.append(0) 174 | episode_time_mec.append(0) 175 | episode_time_ran.append(0) 176 | 177 | episode_energy_dpg.append(0) 178 | episode_energy_dqn.append(0) 179 | episode_energy_d3qn.append(0) 180 | episode_energy_local.append(0) 181 | episode_energy_mec.append(0) 182 | episode_energy_ran.append(0) 183 | 184 | episode_total_cost.append(0) 185 | # for a_r in agent_rewards: 186 | # a_r.append(0) 187 | # print("------reset-------") 188 | write.add_scalars("cost", {'MADDPG': epoch_average_total_cost[epoch], 189 | 'DQN': epoch_average_dqn[epoch], 190 | 'D3QN': epoch_average_d3qn[epoch], 191 | 'Local': epoch_average_local[epoch], 192 | 'Mec': epoch_average_mec[epoch], 193 | 'random': epoch_average_ran[epoch]}, epoch) 194 | # write.add_scalars("cost", {'MADDPG': - episode_rewards[-1] /STEP, 195 | # # 'DQN': epoch_average_dqn[epoch], 196 | # 'Local': episode_local[-1] / STEP, 197 | # 'Mec': episode_mec[-1] / STEP, 198 | # 'random': episode_ran[-1] / STEP}, epoch) 199 | write.add_scalars("cost/energy", {'MADDPG': epoch_average_energy_reward[epoch], 200 | 'DQN': epoch_average_energy_dqn[epoch], 201 | 'D3QN': epoch_average_energy_d3qn[epoch], 202 | 'Local': epoch_average_energy_local[epoch], 203 | 'Mec': epoch_average_energy_mec[epoch], 204 | 'random': epoch_average_energy_ran[epoch]}, epoch) 205 | write.add_scalars("cost/delay", {'MADDPG': epoch_average_time_reward[epoch], 206 | 'DQN': epoch_average_time_dqn[epoch], 207 | 'D3QN': 
epoch_average_time_d3qn[epoch], 208 | 'Local': epoch_average_time_local[epoch], 209 | 'Mec': epoch_average_time_mec[epoch], 210 | 'random': epoch_average_time_ran[epoch]}, epoch) 211 | # print("epoch:{},MADDPG:{}".format(epoch, epoch_average_total_cost[epoch])) 212 | # # print("epoch:{},DQN:{}".format(epoch, epoch_average_dqn[epoch])) 213 | # print("epoch:{},Local:{}".format(epoch, epoch_average_local[epoch])) 214 | # print("epoch:{},Mec:{}".format(epoch, epoch_average_mec[epoch])) 215 | # print("epoch:{},random:{}".format(epoch, epoch_average_ran[epoch])) 216 | # if epoch_average_mec[epoch] > epoch_average_reward[epoch]: 217 | # print("True") 218 | # print("---------------------------------------") 219 | # return a 220 | 221 | 222 | 223 | if __name__ == '__main__': 224 | # for i in range(5): 225 | # cost = train(i + 10) 226 | # print(i + 10, "cost:", cost) 227 | # write.add_scalar("cost", cost, i + 10) 228 | # write.close() 229 | train() 230 | --------------------------------------------------------------------------------
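Note on the action encoding (illustrative sketch, not part of the repository): ENV builds its discrete action table by pairing every MEC index with an offloading percentage taken from numpy.linspace(0, 1, k), and the learners in DQN.py / D3QN.py recover the flat index of a stored (server, percentage) pair as a[0] * self.k + a[1] / (1 / (self.k - 1)), i.e. server * k + percentage * (k - 1). The sketch below reproduces that round trip with hypothetical names (build_actions, flat_index); the recovered index is rounded rather than truncated here because the percentage passes through float32 tensors and may not divide exactly.

import numpy as np

def build_actions(mecs: int, k: int) -> np.ndarray:
    # Same layout as ENV.__init__: every (server, percentage) pair, percentage on a k-point grid.
    servers = np.repeat(np.arange(mecs, dtype=float), k).reshape(-1, 1)
    percents = np.tile(np.linspace(0, 1, k), mecs).reshape(-1, 1)
    return np.hstack((servers, percents))

def flat_index(action, k: int) -> int:
    # Inverse mapping used in learn(): server * k + percentage * (k - 1).
    server, percen = action
    return int(round(server * k + percen * (k - 1)))

if __name__ == "__main__":
    mecs, k = 3, 11  # small example; run_this.py defaults to MECs=7 and k=33
    actions = build_actions(mecs, k)
    assert all(flat_index(actions[i], k) == i for i in range(len(actions)))
    print("index round trip OK for", len(actions), "actions")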
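Note on the TD target (illustrative sketch, not part of the repository): in learn() of both DQN.py and D3QN.py, the greedy action for the bootstrap term is taken from the online network evaluated on the current observations (q = agent_eval(obs_n)), while the target network supplies q_next on the next observations. The textbook Double-DQN target instead selects the greedy action with the online network on the next observations and evaluates that action with the target network. The sketch below shows that textbook target for one agent, vectorized, under assumed tensor shapes; double_dqn_target, q_eval_next and q_target_next are hypothetical names that do not appear in the repository.

import torch

def double_dqn_target(rew, q_eval_next, q_target_next, gamma=0.9):
    # rew:           [batch]            rewards
    # q_eval_next:   [batch, n_actions] online (eval) network on the next observations
    # q_target_next: [batch, n_actions] target network on the next observations
    greedy_a = q_eval_next.argmax(dim=-1, keepdim=True)      # action selection by the online network
    q_next = q_target_next.gather(-1, greedy_a).squeeze(-1)  # action evaluation by the target network
    return rew + gamma * q_next.detach()                     # [batch] TD targets

# usage: targets = double_dqn_target(torch.randn(5), torch.randn(5, 33), torch.randn(5, 33))

Decoupling action selection (online network) from action evaluation (target network) is what curbs the overestimation bias of plain DQN; the dueling head in D3QN.py (v + a - mean(a)) is independent of this and only changes how the Q-values are parameterized.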