├── D3QN.py ├── DQN.py ├── MADDPG.py ├── README.md ├── Rl_net.py ├── base_env.py ├── env.py ├── replay_buffer.py └── run_this.py /D3QN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import copy 5 | 6 | from replay_buffer import ReplayBuffer 7 | from env import ENV 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | class Network(nn.Module): 11 | 12 | def __init__(self, n_features, n_actions): 13 | super().__init__() 14 | self.fc1 = nn.Linear(n_features, 16) 15 | self.fc1.weight.data.normal_(0, 0.3) 16 | self.fc1.bias.data.normal_(0.1) 17 | self.relu = nn.ReLU() 18 | self.fc2 = nn.Linear(16, 32) 19 | self.fc2.weight.data.normal_(0, 0.3) 20 | self.fc2.bias.data.normal_(0.1) 21 | self.fc3 = nn.Linear(32, 64) 22 | self.fc3.weight.data.normal_(0, 0.3) 23 | self.fc3.bias.data.normal_(0.1) 24 | self.out = nn.Linear(64, n_actions) 25 | self.out.weight.data.normal_(0, 0.3) 26 | self.out.bias.data.normal_(0.1) 27 | self.v = nn.Linear(64, 1) 28 | self.v.weight.data.normal_(0, 0.3) 29 | self.v.bias.data.normal_(0.1) 30 | 31 | 32 | def forward(self, x): 33 | x = self.fc1(x) 34 | x = self.relu(x) 35 | x = self.fc2(x) 36 | x = self.relu(x) 37 | x = self.fc3(x) 38 | x = self.relu(x) 39 | 40 | v = self.v(x) 41 | a = self.out(x) 42 | 43 | return v + a - torch.mean(a, dim=-1, keepdim=True) 44 | 45 | class D3QN: 46 | def __init__(self, 47 | env, 48 | learning_rate=0.01, 49 | reward_decay=0.9, 50 | e_greedy=0.9, 51 | replace_target_iter=300, 52 | memory_size=500, 53 | batch_size=5, 54 | e_greedy_increment=0.001, 55 | epoch=100 56 | ): 57 | # print("fea:", n_features) 58 | self.UEs = env.UEs 59 | self.n_actions = env.n_actions 60 | self.n_features = env.n_features 61 | self.actions = env.actions 62 | self.k = env.k 63 | self.learning_rate = learning_rate 64 | self.gama = reward_decay 65 | self.epsilon_max = e_greedy 66 | self.replace_target_iter = replace_target_iter 67 | self.memory_size = memory_size 68 | self.batch_size = batch_size 69 | self.epsilon_increment = e_greedy_increment 70 | self.epoch = epoch 71 | 72 | self.epsilon = 0 73 | self.learn_step_counter = 0 74 | 75 | # 初始化replay 76 | self.memory = ReplayBuffer(self.memory_size) 77 | 78 | self.cost_his = [] 79 | 80 | self.eval_net = [None for _ in range(self.UEs)] 81 | self.target_net = [None for _ in range(self.UEs)] 82 | self.optimizer = [None for _ in range(self.UEs)] 83 | 84 | for i in range(self.UEs): 85 | 86 | self.eval_net[i], self.target_net[i] = Network(self.n_features , self.n_actions), Network(self.n_features, 87 | self.n_actions) 88 | self.optimizer[i] = torch.optim.Adam(self.eval_net[i].parameters(), lr=learning_rate) 89 | 90 | self.loss_fun = nn.MSELoss() 91 | 92 | def store_memory(self, s, a, r, s_): 93 | self.memory.add(s, a, r, s_) 94 | 95 | def choose_action(self, observation): 96 | a = [] 97 | for i in range(self.UEs): 98 | # observation = np.array(observation[i]).reshape(1, self.n_features) 99 | obs = np.array(observation[i]).reshape(1, self.n_features) 100 | obs = torch.FloatTensor(obs[:]) # 增加一个维度 i.e[1,2,3,4,5]变成[[1,2,3,4,5]] 101 | if np.random.uniform() < self.epsilon: 102 | # 选择q值最大的动作 103 | actions_value = self.eval_net[i](obs) 104 | index = torch.max(actions_value, 1)[1].data.numpy() 105 | index = index[0] 106 | action = self.actions[index] 107 | else: 108 | index = np.random.randint(0, self.n_actions) 109 | action = self.actions[index] 110 | a.append(action) 111 | return a 112 | 113 | def learn(self, step, 
write): 114 | if self.learn_step_counter % self.replace_target_iter == 0: 115 | for i in range(self.UEs): 116 | self.target_net[i].load_state_dict(self.eval_net[i].state_dict()) # 直接赋值更新权重 117 | self.learn_step_counter += 1 118 | 119 | for agent_idx, (agent_eval, agent_target, opt) in \ 120 | enumerate(zip(self.eval_net, self.target_net, self.optimizer)): 121 | # 随机抽样 122 | obs, action, reward, obs_ = self.memory.sample(self.batch_size, agent_idx) 123 | actions_index = [] 124 | 125 | rew = torch.tensor(reward, dtype=torch.float) 126 | action_cur = torch.from_numpy(action).to(torch.float) 127 | for i in range(self.batch_size): 128 | for j in range(self.UEs): 129 | a = action_cur[i][j] 130 | action_index = a[0] * self.k + a[1] / (1 / (self.k - 1)) 131 | actions_index.append(int(action_index)) 132 | actions_index = torch.tensor(actions_index).reshape(self.batch_size, self.UEs, 1) 133 | obs_n = torch.from_numpy(obs).to(torch.float) 134 | obs_n_ = torch.from_numpy(obs_).to(torch.float) 135 | obs_n = obs_n.reshape(self.batch_size, self.UEs, self.n_features) 136 | obs_n_ = obs_n_.reshape(self.batch_size, self.UEs, self.n_features) 137 | 138 | q_target = torch.zeros((self.batch_size, self.UEs, 1)) 139 | q_eval = agent_eval(obs_n) 140 | q = q_eval 141 | 142 | q_eval = agent_eval(obs_n).gather(-1, actions_index) 143 | # q_eval = torch.gather(q_eval, dim=1, index=torch.unsqueeze(action_cur, 1)) 144 | q_next = agent_target(obs_n_).detach() 145 | 146 | for i in range(obs_n.shape[0]): 147 | for j in range(self.UEs): 148 | action = torch.argmax(q[i][j], 0).detach() 149 | q_target[i][j] = rew[i][j] + self.gama * q_next[i, j, action] 150 | 151 | loss = self.loss_fun(q_eval, q_target) 152 | write.add_scalar("Loss/DQN", loss, step) 153 | self.cost_his.append(loss) 154 | opt.zero_grad() 155 | loss.backward() 156 | opt.step() 157 | 158 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 159 | # 160 | # write = SummaryWriter(log_dir="logs") 161 | # # 162 | # env = ENV(3, 3, 11, 1) 163 | # # DQN = Double_DQN(env, env.n_actions, env.n_features*5) 164 | # DQN = Double_DQN(env) 165 | # epoch_reward = [0.0] 166 | # epoch_average_reward = [] 167 | # for epoch in range(1000): 168 | # observation = env.reset() 169 | # epoch_average_reward.append(epoch_reward[-1]/ (env.UEs * 100)) 170 | # epoch_reward.append(0) 171 | # print("epoch:{}, cost:{}".format(epoch, epoch_average_reward[epoch])) 172 | # # print("reset") 173 | # for step in range(100): 174 | # o1 = copy.deepcopy(observation) 175 | # o2 = copy.deepcopy(observation) 176 | # 177 | # action = DQN.choose_action(o1) 178 | # o_, reward = env.step(o2, action, is_prob=False, is_compared=False) 179 | # epoch_reward[-1] += np.sum(reward) 180 | # DQN.store_memory(o2, action, reward, o_) 181 | # DQN.learn(epoch, write) 182 | # observation = o_ 183 | # # print("action:", action) 184 | -------------------------------------------------------------------------------- /DQN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import copy 5 | 6 | from replay_buffer import ReplayBuffer 7 | from env import ENV 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | class Network(nn.Module): 11 | 12 | def __init__(self, n_features, n_actions): 13 | super().__init__() 14 | self.fc1 = nn.Linear(n_features, 16) 15 | self.fc1.weight.data.normal_(0, 0.3) 16 | self.fc1.bias.data.normal_(0.1) 17 | self.relu = nn.ReLU() 18 | 
self.fc2 = nn.Linear(16, 32) 19 | self.fc2.weight.data.normal_(0, 0.3) 20 | self.fc2.bias.data.normal_(0.1) 21 | self.fc3 = nn.Linear(32, 64) 22 | self.fc3.weight.data.normal_(0, 0.3) 23 | self.fc3.bias.data.normal_(0.1) 24 | self.out = nn.Linear(64, n_actions) 25 | self.out.weight.data.normal_(0, 0.3) 26 | self.out.bias.data.normal_(0.1) 27 | 28 | 29 | def forward(self, x): 30 | x = self.fc1(x) 31 | x = self.relu(x) 32 | x = self.fc2(x) 33 | x = self.relu(x) 34 | x = self.fc3(x) 35 | x = self.relu(x) 36 | return self.out(x) 37 | 38 | class Double_DQN: 39 | def __init__(self, 40 | env, 41 | learning_rate=0.01, 42 | reward_decay=0.9, 43 | e_greedy=0.9, 44 | replace_target_iter=300, 45 | memory_size=500, 46 | batch_size=5, 47 | e_greedy_increment=0.001, 48 | epoch=100 49 | ): 50 | # print("fea:", n_features) 51 | self.UEs = env.UEs 52 | self.n_actions = env.n_actions 53 | self.n_features = env.n_features 54 | self.actions = env.actions 55 | self.k = env.k 56 | self.learning_rate = learning_rate 57 | self.gama = reward_decay 58 | self.epsilon_max = e_greedy 59 | self.replace_target_iter = replace_target_iter 60 | self.memory_size = memory_size 61 | self.batch_size = batch_size 62 | self.epsilon_increment = e_greedy_increment 63 | self.epoch = epoch 64 | 65 | self.epsilon = 0 66 | self.learn_step_counter = 0 67 | 68 | # 初始化replay 69 | self.memory = ReplayBuffer(self.memory_size) 70 | 71 | self.cost_his = [] 72 | 73 | self.eval_net = [None for _ in range(self.UEs)] 74 | self.target_net = [None for _ in range(self.UEs)] 75 | self.optimizer = [None for _ in range(self.UEs)] 76 | 77 | for i in range(self.UEs): 78 | 79 | self.eval_net[i], self.target_net[i] = Network(self.n_features , self.n_actions), Network(self.n_features, 80 | self.n_actions) 81 | self.optimizer[i] = torch.optim.Adam(self.eval_net[i].parameters(), lr=learning_rate) 82 | 83 | self.loss_fun = nn.MSELoss() 84 | 85 | def store_memory(self, s, a, r, s_): 86 | self.memory.add(s, a, r, s_) 87 | 88 | def choose_action(self, observation): 89 | a = [] 90 | for i in range(self.UEs): 91 | # observation = np.array(observation[i]).reshape(1, self.n_features) 92 | obs = np.array(observation[i]).reshape(1, self.n_features) 93 | obs = torch.FloatTensor(obs[:]) # 增加一个维度 i.e[1,2,3,4,5]变成[[1,2,3,4,5]] 94 | if np.random.uniform() < self.epsilon: 95 | # 选择q值最大的动作 96 | actions_value = self.eval_net[i](obs) 97 | index = torch.max(actions_value, 1)[1].data.numpy() 98 | index = index[0] 99 | action = self.actions[index] 100 | else: 101 | index = np.random.randint(0, self.n_actions) 102 | action = self.actions[index] 103 | a.append(action) 104 | return a 105 | 106 | def learn(self, step, write): 107 | if self.learn_step_counter % self.replace_target_iter == 0: 108 | for i in range(self.UEs): 109 | self.target_net[i].load_state_dict(self.eval_net[i].state_dict()) # 直接赋值更新权重 110 | self.learn_step_counter += 1 111 | 112 | for agent_idx, (agent_eval, agent_target, opt) in \ 113 | enumerate(zip(self.eval_net, self.target_net, self.optimizer)): 114 | # 随机抽样 115 | obs, action, reward, obs_ = self.memory.sample(self.batch_size, agent_idx) 116 | actions_index = [] 117 | 118 | rew = torch.tensor(reward, dtype=torch.float) 119 | action_cur = torch.from_numpy(action).to(torch.float) 120 | for i in range(self.batch_size): 121 | for j in range(self.UEs): 122 | a = action_cur[i][j] 123 | action_index = a[0] * self.k + a[1] / (1 / (self.k - 1)) 124 | actions_index.append(int(action_index)) 125 | actions_index = torch.tensor(actions_index).reshape(self.batch_size, 
self.UEs, 1) 126 | obs_n = torch.from_numpy(obs).to(torch.float) 127 | obs_n_ = torch.from_numpy(obs_).to(torch.float) 128 | obs_n = obs_n.reshape(self.batch_size, self.UEs, self.n_features) 129 | obs_n_ = obs_n_.reshape(self.batch_size, self.UEs, self.n_features) 130 | 131 | q_target = torch.zeros((self.batch_size, self.UEs, 1)) 132 | q_eval = agent_eval(obs_n) 133 | q = q_eval 134 | 135 | q_eval = agent_eval(obs_n).gather(-1, actions_index) 136 | # q_eval = torch.gather(q_eval, dim=1, index=torch.unsqueeze(action_cur, 1)) 137 | q_next = agent_target(obs_n_).detach() 138 | 139 | for i in range(obs_n.shape[0]): 140 | for j in range(self.UEs): 141 | action = torch.argmax(q[i][j], 0).detach() 142 | q_target[i][j] = rew[i][j] + self.gama * q_next[i, j, action] 143 | 144 | loss = self.loss_fun(q_eval, q_target) 145 | write.add_scalar("Loss/DQN", loss, step) 146 | self.cost_his.append(loss) 147 | opt.zero_grad() 148 | loss.backward() 149 | opt.step() 150 | 151 | self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max 152 | # 153 | # write = SummaryWriter(log_dir="logs") 154 | # # 155 | # env = ENV(3, 3, 11, 1) 156 | # # DQN = Double_DQN(env, env.n_actions, env.n_features*5) 157 | # DQN = Double_DQN(env) 158 | # epoch_reward = [0.0] 159 | # epoch_average_reward = [] 160 | # for epoch in range(1000): 161 | # observation = env.reset() 162 | # epoch_average_reward.append(epoch_reward[-1]/ (env.UEs * 100)) 163 | # epoch_reward.append(0) 164 | # print("epoch:{}, cost:{}".format(epoch, epoch_average_reward[epoch])) 165 | # # print("reset") 166 | # for step in range(100): 167 | # o1 = copy.deepcopy(observation) 168 | # o2 = copy.deepcopy(observation) 169 | # 170 | # action = DQN.choose_action(o1) 171 | # o_, reward = env.step(o2, action, is_prob=False, is_compared=False) 172 | # epoch_reward[-1] += np.sum(reward) 173 | # DQN.store_memory(o2, action, reward, o_) 174 | # DQN.learn(epoch, write) 175 | # observation = o_ 176 | # # print("action:", action) 177 | -------------------------------------------------------------------------------- /MADDPG.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from Rl_net import actor, critic 5 | 6 | learning_start_step = 200 7 | learning_fre = 5 8 | batch_size = 64 9 | gamma = 0.9 10 | lr = 0.01 11 | max_grad_norm = 0.5 12 | save_model = 40 13 | save_dir = "models/simple_adversary" 14 | save_fer = 400 15 | tao = 0.01 16 | 17 | 18 | class Maddpg(object): 19 | 20 | def get_train(self, env, obs_shape_n, action_shape_n): 21 | actors_cur = [None for _ in range(env.UEs)] 22 | critics_cur = [None for _ in range(env.UEs)] 23 | actors_target = [None for _ in range(env.UEs)] 24 | critics_target = [None for _ in range(env.UEs)] 25 | optimizer_a = [None for _ in range(env.UEs)] 26 | optimizer_c = [None for _ in range(env.UEs)] 27 | 28 | 29 | for i in range(env.UEs): 30 | actors_cur[i] = actor(obs_shape_n[i], action_shape_n[i]) 31 | critics_cur[i] = critic(sum(obs_shape_n), sum(action_shape_n)) 32 | actors_target[i] = actor(obs_shape_n[i], action_shape_n[i]) 33 | critics_target[i] = critic(sum(obs_shape_n), sum(action_shape_n)) 34 | optimizer_a[i] = torch.optim.Adam(actors_cur[i].parameters(), lr=lr) 35 | optimizer_c[i] = torch.optim.Adam(critics_cur[i].parameters(), lr=lr) 36 | actors_tar = self.update_train(actors_cur, actors_target, 1.0) 37 | critics_tar = self.update_train(critics_cur, critics_target, 1.0) 38 | return actors_cur, 
critics_cur, actors_tar, critics_tar, optimizer_a, optimizer_c 39 | 40 | def update_train(self, agents_cur, agents_tar, tao): 41 | """ 42 | 用于更新target网络, 43 | 这个方法不同于直接复制,但结果一样 44 | out: 45 | |agents_tar: the agents with new par updated towards agents_current 46 | """ 47 | for agent_c, agent_t in zip(agents_cur, agents_tar): 48 | key_list = list(agent_c.state_dict().keys()) 49 | state_dict_t = agent_t.state_dict() 50 | state_dict_c = agent_c.state_dict() 51 | for key in key_list: 52 | state_dict_t[key] = state_dict_c[key] * tao + \ 53 | (1 - tao) * state_dict_t[key] 54 | agent_t.load_state_dict(state_dict_t) 55 | return agents_tar 56 | 57 | def agents_train(self, game_step, update_cnt, memory, obs_size, action_size, 58 | actors_cur, actors_tar, critics_cur, critics_tar, optimizers_a, optimizers_c, write): 59 | """ 60 | par: 61 | |input: the data for training 62 | |output: the data for next update 63 | """ 64 | 65 | # 训练 66 | if (game_step > learning_start_step) and (game_step % learning_fre == 0): 67 | if update_cnt == 0: print('\r=start training...' + ''*100) 68 | update_cnt += 1 69 | 70 | for agent_idx, (actor_c, actor_t, critic_c, critic_t, opt_a, opt_c) in \ 71 | enumerate(zip(actors_cur, actors_tar, critics_cur, critics_tar, optimizers_a, optimizers_c)): 72 | if opt_c == None: continue 73 | 74 | # 随机抽样 75 | rew = [] 76 | obs, action, reward, obs_ = memory.sample(batch_size, agent_idx) 77 | 78 | for i in range(batch_size): 79 | r = reward[i] 80 | ar = sum(r)/len(r) 81 | rew.append(ar) 82 | # update critic 83 | # rew = torch.tensor(reward, dtype=torch.float) 84 | rew = torch.tensor(rew, dtype=torch.float) 85 | action_cur = torch.from_numpy(action).to(torch.float) 86 | obs_n = torch.from_numpy(obs).to(torch.float) 87 | obs_n_ = torch.from_numpy(obs_).to(torch.float) 88 | action_tar = torch.cat([a_t(obs_n_[:, obs_size[idx][0]:obs_size[idx][1]]).detach() \ 89 | for idx, a_t in enumerate(actors_tar)], dim=1) 90 | q = critic_c(obs_n, action_cur).reshape(-1) # q 91 | q_ = critic_t(obs_n_, action_tar).reshape(-1) # q_ 92 | tar_value = q_ * gamma + rew 93 | loss_c = torch.nn.MSELoss()(q, tar_value) 94 | opt_c.zero_grad() 95 | loss_c.backward() 96 | nn.utils.clip_grad_norm_(critic_c.parameters(), max_grad_norm) 97 | opt_c.step() 98 | 99 | # update Actor 100 | # There is no need to cal other agent's action 101 | model_out, policy_c_new = actor_c( 102 | obs_n_[:, obs_size[agent_idx][0]:obs_size[agent_idx][1]], model_original_out=True) 103 | # update the action of this agent 104 | action_cur[:, action_size[agent_idx][0]:action_size[agent_idx][1]] = policy_c_new 105 | loss_pse = torch.mean(torch.pow(model_out, 2)) 106 | loss_a = torch.mul(-1, torch.mean(critic_c(obs_n, action_cur))) 107 | 108 | opt_a.zero_grad() 109 | loss_t = 1e-3 * loss_pse + loss_a 110 | loss_t.backward() 111 | nn.utils.clip_grad_norm_(actor_c.parameters(), max_grad_norm) 112 | opt_a.step() 113 | 114 | write.add_scalar("Loss/Actor", loss_t, game_step) 115 | write.add_scalar("Loss/Critic", loss_c, game_step) 116 | 117 | # # save model 118 | # if update_cnt > save_model and update_cnt % save_fer == 0: 119 | # time_now = time.strftime('%y%m_%d%H%M') 120 | # print('=time:{} step:{} save'.format(time_now, game_step)) 121 | # model_file_dir = os.path.join(save_dir, '{}_{}'.format(time_now, game_step)) 122 | # if not os.path.exists(model_file_dir): # make the path 123 | # os.makedirs(model_file_dir) 124 | # for agent_idx, (a_c, a_t, c_c, c_t) in \ 125 | # enumerate(zip(actors_cur, actors_tar, critics_cur, critics_tar)): 126 | # 
torch.save(a_c, os.path.join(model_file_dir, 'a_c_{}.pt'.format(agent_idx))) 127 | # torch.save(a_t, os.path.join(model_file_dir, 'a_t_{}.pt'.format(agent_idx))) 128 | # torch.save(c_c, os.path.join(model_file_dir, 'c_c_{}.pt'.format(agent_idx))) 129 | # torch.save(c_t, os.path.join(model_file_dir, 'c_t_{}.pt'.format(agent_idx))) 130 | 131 | # soft-update the target network parameters towards the current networks 132 | actors_tar = self.update_train(actors_cur, actors_tar, tao) 133 | critics_tar = self.update_train(critics_cur, critics_tar, tao) 134 | return update_cnt, actors_cur, actors_tar, critics_cur, critics_tar 135 | 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multi-Agent-Mec_Offloading-Use_DRL 2 | Uses DRL to solve the multi-agent computation-offloading problem. 3 | 4 | Compares the MADDPG and DQN algorithms. 5 | 6 | Environment reference paper: When Learning Joins Edge: Real-time Proportional Computation Offloading via Deep Reinforcement Learning (CCF-C). 7 | 8 | Current environment behaviour: every task is fully processed within its step; at the next step the uplink and downlink rates are randomly increased or decreased to change the state. The environment still has some issues and is being improved. 9 | 10 | Known issues: the task size and the computing capacities of the edge servers and the users are still being tuned. 11 | -------------------------------------------------------------------------------- /Rl_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class abstract_agent(nn.Module): 6 | 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def act(self, x): 11 | policy, value = self.forward(x) 12 | return policy, value 13 | 14 | class critic(abstract_agent): 15 | 16 | def __init__(self, obs_shape, act_shape): 17 | super().__init__() 18 | self.LRelu = nn.LeakyReLU(0.01) 19 | self.linear_c1 = nn.Linear(act_shape + obs_shape, 64) 20 | self.linear_c2 = nn.Linear(64, 64) 21 | self.linear_c = nn.Linear(64, 1) 22 | 23 | def reset_parameters(self): 24 | nn.init.xavier_uniform_(self.linear_c1.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 25 | nn.init.xavier_uniform_(self.linear_c2.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 26 | nn.init.xavier_uniform_(self.linear_c.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 27 | 28 | def forward(self, obs_input, act_input): 29 | x_cat = self.LRelu(self.linear_c1(torch.cat([obs_input, act_input], dim=1))) 30 | x = self.LRelu(self.linear_c2(x_cat)) 31 | x = self.linear_c(x) 32 | 33 | return x 34 | 35 | class actor(abstract_agent): 36 | 37 | def __init__(self, num_input, action_size): 38 | super().__init__() 39 | self.tanh = nn.Tanh() 40 | self.LRelu = nn.LeakyReLU(0.01) 41 | self.linear_a1 = nn.Linear(num_input, 64) 42 | self.linear_a2 = nn.Linear(64, 64) 43 | self.linear_a = nn.Linear(64, action_size) 44 | 45 | def reset_parameters(self): 46 | nn.init.xavier_uniform_(self.linear_a1.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 47 | nn.init.xavier_uniform_(self.linear_a2.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 48 | nn.init.xavier_uniform_(self.linear_a.weight, gain=nn.init.calculate_gain('leaky_relu')) # Xavier uniform initialization 49 | 50 | def forward(self, x, model_original_out=False): 51 | x = self.LRelu(self.linear_a1(x)) 52 | x = self.LRelu(self.linear_a2(x)) 53 | model_out = self.linear_a(x) 54 | u = torch.rand_like(model_out) 55 | policy = F.softmax(model_out - torch.log(-torch.log(u)), dim=-1) # Gumbel-softmax style sampling: perturb the logits with -log(-log(u)) noise 56 | if model_original_out: 57 | return model_out, policy 58 | return policy -------------------------------------------------------------------------------- /base_env.py:
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | 4 | 5 | 6 | class base(object): 7 | def __init__(self): 8 | # 频率 9 | self.Hz = 1 10 | self.kHz = 1000 * self.Hz 11 | self.mHz = 1000 * self.kHz 12 | self.GHz = 1000 * self.mHz 13 | 14 | # 数据大小 15 | self.bit = 1 16 | self.B = 8 * self.bit 17 | self.KB = 1024 * self.B 18 | self.MB = 1024 * self.KB 19 | 20 | self.task_cpu_cycle = np.random.randint(500, 1000) # 处理一bit任务所需要的CPU频率 21 | self.task_size = np.random.randint(2 * 10**9, 3 * 10**9) # 任务大小 22 | self.task_require_cycle = self.task_size * self.task_cpu_cycle # 处理一个任务所需要的cpu频率 23 | 24 | # 处理任务的时间 = 任务所需的cpu频率/设备的计算能力 25 | 26 | self.UE_f = np.random.randint(1.5 * self.GHz, 2 * self.GHz) # UE的计算能力 27 | self.MEC_f = np.random.randint(5 * self.GHz, 7 * self.GHz) # MEC的计算能力 28 | 29 | # 能耗 30 | self.J = 1 31 | self.mJ = 1000 * 1000 * self.J 32 | 33 | self.tr_energy = 1 * self.J # 传输1s的能耗 34 | self.w = 10**(-28) # 能耗系数 35 | 36 | """ 37 | 进行简化 38 | 设置传输速率为:14 Mbps 39 | 传输时计算的是任务大小 40 | """ 41 | self.r = 293 * self.MB 42 | 43 | print(40 * math.log2(1 + (16 * 10))) 44 | -------------------------------------------------------------------------------- /env.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | 4 | import numpy as np 5 | 6 | class ENV(): 7 | def __init__(self, UEs, MECs, k, lam): 8 | self.UEs = UEs 9 | self.MECs = MECs 10 | self.k = k 11 | 12 | q = np.full((k, 1), 0.) 13 | p = np.linspace(0, 1, k).reshape((k, 1)) 14 | # 创建动作 15 | for i in range(MECs - 1): 16 | a = np.full((k, 1), float(i + 1)) 17 | b = np.linspace(0, 1, k).reshape((k, 1)) 18 | q = np.append(q, a, axis=0) 19 | p = np.append(p, b, axis=0) 20 | 21 | self.actions = np.hstack((q, p)) 22 | self.n_actions = len(self.actions) 23 | self.n_features = 3 + MECs * 3 24 | self.discount = 0.01 25 | 26 | # 基本参数 27 | # 频率 28 | self.Hz = 1 29 | self.kHz = 1000 * self.Hz 30 | self.mHz = 1000 * self.kHz 31 | self.GHz = 1000 * self.mHz 32 | self.nor = 10**(-7) 33 | self.nor1 = 10**19 34 | 35 | # 数据大小 36 | self.bit = 1 37 | self.B = 8 * self.bit 38 | self.KB = 1024 * self.B 39 | self.MB = 1024 * self.KB 40 | 41 | 42 | # self.task_cpu_cycle = np.random.randint(2 * 10**9, 3* 10**9) 43 | 44 | self.UE_f = np.random.randint(1.5 * self.GHz * self.nor, 2 * self.GHz * self.nor) # UE的计算能力 45 | self.MEC_f = np.random.randint(5 * self.GHz * self.nor, 7 * self.GHz * self.nor) # MEC的计算能力 46 | # self.UE_f = 500 * self.mHz # UE的计算能力 47 | # self.MEC_f = np.random.randint(5.2 * self.GHz, 24.3 * self.GHz) # MEC的计算能力 48 | self.tr_energy = 1 # 传输能耗 49 | self.r = 40 * math.log2(1 + (16 * 10)) * self.MB * self.nor # 传输速率 50 | # self.r = 800 # 传输速率 51 | self.ew, self.lw = 10**(-26), 3 * 10**(-26)# 能耗系数 52 | # self.ew, self.lw = 0.3, 0.15 # 能耗系数 53 | self.et, self.lt = 1, 1 54 | self.local_core_max, self.local_core_min = 1.3 * self.UE_f, 0.7 * self.UE_f 55 | self.server_core_max, self.server_core_min = 1.3 * self.MEC_f, 0.7 * self.MEC_f 56 | self.uplink_max, self.uplink_min = 1.3 * self.r, 0.7 * self.r 57 | self.downlink_max, self.downlink_min = 1.3 * self.r, 0.7 * self.r 58 | self.lam = lam 59 | self.e = 1 60 | 61 | 62 | def reset(self): 63 | obs = [] 64 | servers_cap = [] 65 | new_cap = True 66 | for i in range(self.UEs): 67 | uplink, downlink = [], [] 68 | # np.random.seed(np.random.randint(1, 1000)) 69 | # task_size = np.random.randint(2 * 10**8 * self.nor, 3 * 10**8 * self.nor) # 任务大小 70 | task_size = np.random.randint(1.5 
* self.mHz, 2 * self.mHz) # 任务大小 71 | # self.task_size = self.task_size * self.task_cpu_cycle # 处理一个任务所需要的cpu频率 72 | # task_cpu_cycle = np.random.randint(2 * 10**9 * self.nor, 3 * 10**9 * self.nor) 73 | task_cpu_cycle = np.random.randint(10**3, 10**5) 74 | local_comp = np.random.randint(0.9 * self.UE_f, 1.1 * self.UE_f) # UE的计算能力 75 | for i in range(self.MECs): 76 | up = np.random.randint(0.9 * self.r, 1.1 * self.r) 77 | down = np.random.randint(0.9 * self.r, 1.1 * self.r) 78 | if new_cap: 79 | cap = np.random.randint(0.9 * self.MEC_f, 1.1 * self.MEC_f) # MEC计算能力 80 | servers_cap.append(cap) 81 | uplink.append(up) 82 | downlink.append(down) 83 | observation = np.array([task_size, task_cpu_cycle, local_comp]) 84 | observation = np.hstack((observation, servers_cap, uplink, downlink)) 85 | obs.append(observation) 86 | new_cap = False 87 | return obs 88 | 89 | def choose_action(self, prob): 90 | """ 91 | 根据概率选择动作 92 | :param env: 93 | :param prob: 94 | :return: [[target_server, percentage]] 95 | """ 96 | action_choice = np.linspace(0, 1, self.k) 97 | actions = [] 98 | for i in range(self.UEs): 99 | a = np.random.choice(a=(self.MECs * self.k), p=prob[i]) # 在数组p中从a个数字中以概率p选中一个 100 | target_server = int(a / self.k) 101 | percen = action_choice[a % self.k] 102 | action = [target_server, percen] 103 | actions.append(action) 104 | return actions 105 | 106 | def step(self, observation, actions_prob, is_prob=True, is_compared=True): 107 | if is_prob: 108 | actions = self.choose_action(actions_prob) 109 | else: actions = actions_prob 110 | new_cap = False 111 | obs_ = [] 112 | rew, local, ran, mec = [], [], [], [] 113 | dpg_times, local_times, ran_times, mec_times = [], [], [], [] 114 | dpg_energys, local_energys, ran_energys, mec_energys = [], [], [], [] 115 | total = [] 116 | a, b, c, d = 0, 0, 0, 0 117 | for i in range(self.UEs): 118 | if i == self.UEs - 1: new_cap = True 119 | # 提取信息 120 | task_size, task_cpu_cycle, local_comp, servers_cap, uplink, downlink = \ 121 | observation[i][0], observation[i][1], observation[i][2], observation[i][3:3+self.MECs], observation[i][3+self.MECs:3+self.MECs*2], observation[i][3+self.MECs*2:3+self.MECs*3] 122 | # wait_local, wait_server = np.random.randint(0, 2), np.random.randint(0, 3) 123 | 124 | action = actions[i] 125 | target_server, percen = int(action[0]), action[1] 126 | 127 | # 计算奖励 128 | # 本地和服务器上都有 129 | tr_time = (percen * task_size) / uplink[target_server] + self.discount * ( percen * task_size) / downlink[target_server] 130 | tr_energy = (self.tr_energy * percen * task_size) / uplink[target_server] + self.discount * (self.tr_energy * percen * task_size) / downlink[target_server] 131 | 132 | 133 | comp_local_time = task_cpu_cycle * (1 - percen) / (local_comp) 134 | comp_local_energy = self.lw * task_cpu_cycle * (1 - percen) * local_comp**2 135 | # comp_local_energy = task_size * (1 - percen) * local_comp 136 | 137 | 138 | comp_mec_time = (percen * task_cpu_cycle) / servers_cap[target_server] 139 | comp_mec_energy =self.ew * percen * task_cpu_cycle * servers_cap[target_server]**2 140 | # comp_mec_energy =percen * task_size * servers_cap[target_server] 141 | 142 | comp_time = max(comp_local_time, comp_mec_time) 143 | time_cost = (comp_time + tr_time) * self.et 144 | energy_cost = (tr_energy + comp_local_energy + comp_mec_energy) * self.e 145 | 146 | total_cost = self.lam * time_cost + (1 - self.lam) * energy_cost 147 | 148 | # reward = -total_cost 149 | 150 | # 全本地 151 | local_only_time = task_cpu_cycle/(local_comp) * self.et 152 | local_only_energy = 
self.lw * task_cpu_cycle * local_comp**2 * self.e 153 | # local_only_energy = task_size * local_comp 154 | local_only = self.lam * local_only_time + (1 - self.lam) * local_only_energy 155 | # print("task_cpu_cycle:", task_cpu_cycle) 156 | # print("local_comp", local_comp) 157 | # print("local_only_time:", local_only_time) 158 | # print("local_only_energy:", local_only_energy) 159 | # print("local_only:", local_only) 160 | 161 | # 全边缘 162 | mec_only_tr_time = task_size / uplink[target_server] + self.discount * task_size / downlink[target_server] 163 | mec_only_tr_energy = self.tr_energy * task_size / uplink[target_server] + self.discount * self.tr_energy * task_size / downlink[target_server] 164 | # print("mec_only_tr_time:", mec_only_tr_time) 165 | # print("mec_only_tr_energy:", mec_only_tr_energy) 166 | 167 | 168 | mec_only_comp_time = task_cpu_cycle / servers_cap[target_server] 169 | mec_only_comp_energy = self.ew * task_cpu_cycle * servers_cap[target_server]**2 170 | # mec_only_comp_energy = task_size * servers_cap[target_server] 171 | # print("mec_only_comp_time:", mec_only_comp_time) 172 | # print("mec_only_comp_energy:", mec_only_comp_energy) 173 | 174 | mec_only_time_cost = (mec_only_tr_time + mec_only_comp_time) * self.et 175 | mec_only_energy_cost = (mec_only_tr_energy + mec_only_comp_energy) * self.e 176 | 177 | mec_only = self.lam * mec_only_time_cost + (1 - self.lam) * mec_only_energy_cost 178 | # print("mec_only_time_cost:", mec_only_time_cost) 179 | # print("mec_only_energy_cost:", mec_only_energy_cost) 180 | # print("----------------------------:", servers_cap[target_server]) 181 | 182 | 183 | # 随机卸载 184 | percen_ran = np.random.uniform() # 随机卸载比例 185 | mec_ran = np.random.randint(self.MECs) # 随机选择一个服务器进行卸载 186 | 187 | random_tr_time = (percen_ran * task_size) / uplink[mec_ran] + (self.discount * percen_ran * task_size) / downlink[mec_ran] 188 | random_tr_energy = (self.tr_energy * percen_ran * task_size) / uplink[mec_ran] + self.discount * (self.tr_energy * percen_ran * task_size) / downlink[mec_ran] 189 | 190 | random_comp_local_time = (1 - percen_ran) * task_cpu_cycle / local_comp 191 | random_comp_local_energy = self.lw * (1 - percen_ran) * task_cpu_cycle * local_comp**2 192 | # random_comp_local_energy = (1 - percen_ran) * task_size * local_comp 193 | 194 | random_comp_mec_time = percen_ran * task_cpu_cycle / servers_cap[mec_ran] 195 | random_comp_mec_energy = self.ew * percen_ran * task_cpu_cycle * servers_cap[mec_ran]**2 196 | # random_comp_mec_energy = percen_ran * task_size * servers_cap[mec_ran] 197 | 198 | random_comp_time = max(random_comp_local_time, random_comp_mec_time) 199 | random_time_cost = (random_comp_time + random_tr_time) * self.et 200 | random_energy_cost = (random_tr_energy + random_comp_local_energy + random_comp_mec_energy) * self.e 201 | 202 | 203 | random_total = self.lam * random_time_cost + (1 - self.lam) * random_energy_cost 204 | random_total_cost2 = random_energy_cost 205 | 206 | # if total_cost < random_total or total_cost < mec_only or total_cost < local_only: 207 | # reward = -total_cost 208 | # else: 209 | # print("惩罚") 210 | # reward = -1999 211 | 212 | reward = -total_cost 213 | 214 | # a += total_cost 215 | # b += mec_only 216 | # c += local_only 217 | # d += random_total 218 | 219 | # 得到下一个observation 220 | x = np.random.uniform() 221 | y = 0.5 222 | if (x > y): 223 | local_comp = min(local_comp + np.random.randint(0, 0.2 * self.UE_f), self.local_core_max) 224 | for j in range(self.MECs): 225 | cap = min(servers_cap[j] + 
np.random.randint(0, 0.3 * self.UE_f), self.server_core_max) 226 | # MEC容量保持一致 227 | if new_cap: 228 | for x in range(self.UEs): 229 | observation[x][2 + j] = cap 230 | downlink[j] = min(downlink[j] + np.random.randint(0, 0.2 * self.r), self.downlink_max) 231 | uplink[j] = min(uplink[j] + np.random.randint(0, 0.2 * self.r), self.uplink_max) 232 | else: 233 | local_comp = max(local_comp + np.random.randint(-0.2 * self.UE_f, 0), self.local_core_min) 234 | for j in range(self.MECs): 235 | # MEC容量保持一致 236 | if new_cap: 237 | cap = max(servers_cap[j] + np.random.randint(0, 0.3 * self.UE_f), self.server_core_max) 238 | for x in range(self.UEs): 239 | observation[x][2 + j] = cap 240 | downlink[j] = max(downlink[j] - np.random.randint(0, 0.2 * self.r), self.downlink_min) 241 | uplink[j] = max(uplink[j] - np.random.randint(0, 0.2 * self.r), self.uplink_min) 242 | 243 | task_size = np.random.randint(10, 50) 244 | task_cpu_cycle = np.random.randint(10**3, 10**5) # 处理任务所需要的CPU频率 245 | observation_ = np.array([task_size, task_cpu_cycle, local_comp]) 246 | observation_ = np.hstack((observation_, servers_cap, uplink, downlink)) 247 | obs_.append(observation_) 248 | 249 | rew.append(reward) 250 | local.append(local_only) 251 | mec.append(mec_only) 252 | ran.append(random_total) 253 | 254 | dpg_times.append(time_cost) 255 | local_times.append(local_only_time) 256 | mec_times.append(mec_only_time_cost) 257 | ran_times.append(random_time_cost) 258 | 259 | dpg_energys.append(energy_cost) 260 | local_energys.append(local_only_energy) 261 | mec_energys.append(mec_only_energy_cost) 262 | ran_energys.append(random_energy_cost) 263 | 264 | total.append(total_cost) 265 | 266 | # if (a - b > 10 * self.UEs) or (a - c > 10 * self.UEs) or (a - d > 10 * self.UEs): 267 | # print("惩罚") 268 | # # print(a ,b, c, d) 269 | # for i in range(self.UEs): 270 | # rew[i] = -999 271 | # else: 272 | # pass 273 | 274 | if is_compared: 275 | return obs_, rew, local, mec, ran, dpg_times, local_times, mec_times, ran_times, dpg_energys, local_energys, mec_energys, ran_energys, total 276 | else: 277 | return obs_, rew, dpg_times, dpg_energys 278 | # return obs_, total 279 | 280 | -------------------------------------------------------------------------------- /replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class ReplayBuffer(object): 5 | def __init__(self, size): 6 | self.storage = [] 7 | self.maxsize = int(size) 8 | self.next_idx = 0 9 | 10 | def __len__(self): 11 | return len(self.storage) 12 | 13 | def clear(self): 14 | self.storage = [] 15 | self.next_idx = 0 16 | 17 | def add(self, o, a, r, o_): 18 | data = (o, a, r, o_) 19 | 20 | if self.next_idx >= len(self.storage): 21 | self.storage.append(data) 22 | else: 23 | self.storage[self.next_idx] = data 24 | self.next_idx = (self.next_idx + 1) % self.maxsize 25 | 26 | # 提取每个agent在replay buffter中的值 27 | def encode_sample(self, idxes, agent_dix): 28 | observations, actions, rewards, observations_ = [], [], [], [] 29 | for i in idxes: 30 | data = self.storage[i] 31 | obs, act, rew, obs_ = data 32 | observations.append(np.concatenate(obs[:])) 33 | actions.append(act) 34 | # rewards.append(rew[agent_dix]) 35 | rewards.append(rew) 36 | observations_.append(np.concatenate(obs_[:])) 37 | return np.array(observations), np.array(actions), np.array(rewards), np.array(observations_) 38 | 39 | # 随机抽样 40 | def make_index(self, batch_size): 41 | return [random.randint(0, len(self.storage) - 1) for _ in 
range(batch_size)] 42 | 43 | def sample(self, batch_size, agent_dix): 44 | """Sample a batch of experiences. 45 | Parameters 46 | ---------- 47 | batch_size: int 48 | How many transitions to sample. 49 | Returns 50 | ------- 51 | obs_batch: np.array 52 | batch of observations 53 | act_batch: np.array 54 | batch of actions executed given obs_batch 55 | rew_batch: np.array 56 | rewards received as results of executing act_batch 57 | next_obs_batch: np.array 58 | next set of observations seen after executing act_batch 59 | the end of an episode and 0 otherwise. 60 | """ 61 | if batch_size > 0: 62 | idxes = self.make_index(batch_size) 63 | else: 64 | idxes = range(0, len(self.storage)) 65 | return self.encode_sample(idxes, agent_dix) 66 | 67 | # buffter = ReplayBuffter(100) 68 | # o = [] 69 | # obs = np.array([238, 212, 228, 202, 213, 168, 180, 171, 195, 191, 311, 306, 408, 384, 351]) 70 | # a = np.array([238, 212, 228, 202, 213, 175, 159, 165, 189, 189, 381, 416, 443, 488, 317]) 71 | # o.append(obs) 72 | # o.append(a) 73 | # a = [] 74 | # action1 = np.array([1, 0.1]) 75 | # action2 = np.array([2, 0.2]) 76 | # a.append(action1) 77 | # a.append(action2) 78 | # r = [1000, 2000] 79 | # o_ = [] 80 | # obs_1 = np.array([38, 212, 228, 202, 213, 175, 159, 165, 189, 189, 381, 416, 443, 488, 317]) 81 | # obs_2 = np.array([38, 212, 228, 202, 213, 175, 159, 165, 189, 189, 381, 416, 443, 488, 317]) 82 | # o_.append(obs_1) 83 | # o_.append(obs_2) 84 | # buffter.add(o, a, r, o_) 85 | # 86 | # buffter.sample(1, 1) -------------------------------------------------------------------------------- /run_this.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | from torch.utils.tensorboard import SummaryWriter 4 | import numpy as np 5 | 6 | from env import ENV 7 | from replay_buffer import ReplayBuffer 8 | from MADDPG import Maddpg 9 | from DQN import Double_DQN 10 | from D3QN import D3QN 11 | 12 | learning_start_step = 200 13 | learning_fre = 5 14 | batch_size = 64 15 | gamma = 0.9 16 | lr = 0.01 17 | max_grad_norm = 0.5 18 | save_model = 40 19 | save_dir = "models/simple_adversary" 20 | save_fer = 400 21 | tao = 0.01 22 | memory_size = 2000 23 | EPOCH = 350 24 | STEP = 200 25 | 26 | write = SummaryWriter(log_dir="logs") 27 | 28 | def train(ue=3, mec=7, k=11*3, lam=0.5): 29 | """step1:create the environment""" 30 | u = ue 31 | m = mec 32 | k = k 33 | lam = lam 34 | env = ENV(u, m, k, lam) # UE: MEC:, k: 35 | maddpg = Maddpg() 36 | dqn = Double_DQN(env) 37 | d3qn = D3QN(env) 38 | 39 | 40 | print('=============================') 41 | print('=1 Env {} is right ...') 42 | print('=============================') 43 | 44 | """step2:create agent""" 45 | obs_shape_n = [env.n_features for i in range(env.UEs)] 46 | action_shape_n = [env.n_actions for i in range(env.UEs)] 47 | actors_cur, critic_cur, actors_tar, critic_tar, optimizers_a, optimizers_c = \ 48 | maddpg.get_train(env, obs_shape_n, action_shape_n) 49 | memory_dpg = ReplayBuffer(memory_size) 50 | # memory_dqn = ReplayBuffer(memory_size) 51 | 52 | print('=2 The {} agents are inited ...'.format(env.UEs)) 53 | print('=============================') 54 | 55 | """step3: init the pars """ 56 | obs_size = [] 57 | action_size = [] 58 | game_step = 0 59 | update_cnt = 0 60 | episode_rewards, episode_dqn, episode_d3qn, episode_local, episode_mec, episode_ran = [0.0], [0.0], [0.0], [0.0], [0.0], [0.0] # sum of rewards for all agents 61 | episode_time_dpg, episode_time_dqn, episode_time_d3qn, 
episode_time_local, episode_time_ran, episode_time_mec = [0.0], [0.0], [0.0], [0.0], [0.0], [0.0] 62 | episode_energy_dpg, episode_energy_dqn, episode_energy_d3qn, episode_energy_local, episode_energy_ran, episode_energy_mec = [0.0], [0.0], [0.0], [0.0], [0.0], [0.0] 63 | episode_total_cost = [0.0] 64 | # agent_rewards = [[0.0] for _ in range(env.UEs)] # individual agent reward 65 | epoch_average_reward, epoch_average_dqn, epoch_average_d3qn, epoch_average_local, epoch_average_mec, epoch_average_ran= [], [], [], [], [], [] 66 | epoch_average_time_reward, epoch_average_time_dqn, epoch_average_time_d3qn, epoch_average_time_local, epoch_average_time_mec, epoch_average_time_ran= [], [], [], [], [], [] 67 | epoch_average_energy_reward, epoch_average_energy_dqn, epoch_average_energy_d3qn, epoch_average_energy_local, epoch_average_energy_mec, epoch_average_energy_ran= [], [], [], [], [], [] 68 | epoch_average_total_cost = [] 69 | 70 | head_o, head_a, end_o, end_a = 0, 0, 0, 0 71 | for obs_shape, action_shape in zip(obs_shape_n, action_shape_n): 72 | end_o = end_o + obs_shape 73 | end_a = end_a + action_shape 74 | range_o = (head_o, end_o) 75 | range_a = (head_a, end_a) 76 | obs_size.append(range_o) 77 | action_size.append(range_a) 78 | head_o = end_o 79 | head_a = end_a 80 | 81 | print('=3 starting iterations ...') 82 | print('=============================') 83 | 84 | for epoch in range(EPOCH): 85 | obs = env.reset() 86 | 87 | for time_1 in range(STEP): 88 | 89 | action_prob = [agent(torch.from_numpy(observation).to(torch.float)).detach().cpu().numpy() \ 90 | for agent, observation in zip(actors_cur, obs)] 91 | action_dqn = dqn.choose_action(obs) 92 | action_d3qn = d3qn.choose_action(obs) 93 | 94 | o1 = copy.deepcopy(obs) 95 | o2 = copy.deepcopy(obs) 96 | obs_old = copy.deepcopy(obs) 97 | obs_, rew, local, mec, ran, time_dpg, time_local, time_mec, time_ran, energy_dpg, energy_local, energy_mec, energy_ran, total_cost = env.step(obs, action_prob) 98 | obs_dqn, rew_dqn, time_dqn, energy_dqn = env.step(o1, action_dqn, is_prob=False, is_compared=False) 99 | obs_d3qn, rew_d3qn, time_d3qn, energy_d3qn = env.step(o2, action_d3qn, is_prob=False, is_compared=False) 100 | 101 | 102 | # save the expeeinece 103 | memory_dpg.add(obs_old, np.concatenate(action_prob), rew, obs_) 104 | dqn.store_memory(obs_old, action_dqn, rew_dqn, obs_dqn) 105 | d3qn.store_memory(obs_old, action_d3qn, rew_d3qn, obs_d3qn) 106 | 107 | episode_rewards[-1] += np.sum(rew) 108 | episode_dqn[-1] += np.sum(rew_dqn) 109 | episode_d3qn[-1] += np.sum(rew_d3qn) 110 | episode_local[-1] += np.sum(local) 111 | episode_mec[-1] += np.sum(mec) 112 | episode_ran[-1] += np.sum(ran) 113 | 114 | episode_time_dpg[-1] += np.sum(time_dpg) 115 | episode_time_dqn[-1] += np.sum(time_dqn) 116 | episode_time_d3qn[-1] += np.sum(time_d3qn) 117 | episode_time_local[-1] += np.sum(time_local) 118 | episode_time_ran[-1] += np.sum(time_ran) 119 | episode_time_mec[-1] += np.sum(time_mec) 120 | 121 | episode_energy_dpg[-1] += np.sum(energy_dpg) 122 | episode_energy_dqn[-1] += np.sum(energy_dqn) 123 | episode_energy_d3qn[-1] += np.sum(energy_d3qn) 124 | episode_energy_local[-1] += np.sum(energy_local) 125 | episode_energy_mec[-1] += np.sum(energy_mec) 126 | episode_energy_ran[-1] += np.sum(energy_ran) 127 | episode_total_cost[-1] += np.sum(total_cost) 128 | # for i, rew in enumerate(rew):agent_rewards[i][-1] += rew 129 | 130 | # train agent 131 | if game_step > 1000 and game_step % 100 == 0: 132 | update_cnt, actors_cur, actors_tar, critic_cur, critic_tar = 
maddpg.agents_train( 133 | game_step, update_cnt, memory_dpg, obs_size, action_size, 134 | actors_cur, actors_tar, critic_cur, critic_tar, optimizers_a, optimizers_c, write) 135 | dqn.learn(game_step, write) 136 | d3qn.learn(game_step, write) 137 | 138 | # update obs 139 | game_step += 1 140 | obs = obs_ 141 | epoch_average_reward.append(- episode_rewards[-1] / (env.UEs * STEP)) 142 | epoch_average_dqn.append(- episode_dqn[-1] / (env.UEs * STEP)) 143 | epoch_average_d3qn.append(- episode_d3qn[-1] / (env.UEs * STEP)) 144 | epoch_average_local.append(episode_local[-1] / (env.UEs * STEP)) 145 | epoch_average_mec.append(episode_mec[-1] / (env.UEs * STEP)) 146 | epoch_average_ran.append(episode_ran[-1] / (env.UEs * STEP)) 147 | 148 | epoch_average_time_reward.append(episode_time_dpg[-1] / (env.UEs * STEP)) 149 | epoch_average_time_dqn.append(episode_time_dqn[-1] / (env.UEs * STEP)) 150 | epoch_average_time_d3qn.append(episode_time_dqn[-1] / (env.UEs * STEP)) 151 | epoch_average_time_local.append(episode_time_local[-1] / (env.UEs * STEP)) 152 | epoch_average_time_mec.append(episode_time_mec[-1] / (env.UEs * STEP)) 153 | epoch_average_time_ran.append(episode_time_ran[-1] / (env.UEs * STEP)) 154 | 155 | epoch_average_energy_reward.append(episode_energy_dpg[-1] / (env.UEs * STEP)) 156 | epoch_average_energy_dqn.append(episode_energy_dqn[-1] / (env.UEs * STEP)) 157 | epoch_average_energy_d3qn.append(episode_energy_dqn[-1] / (env.UEs * STEP)) 158 | epoch_average_energy_local.append(episode_energy_local[-1] / (env.UEs * STEP)) 159 | epoch_average_energy_mec.append(episode_energy_mec[-1] / (env.UEs * STEP)) 160 | epoch_average_energy_ran.append(episode_energy_ran[-1] / (env.UEs * STEP)) 161 | epoch_average_total_cost.append(episode_total_cost[-1] / (env.UEs * STEP)) 162 | 163 | episode_rewards.append(0) 164 | episode_dqn.append(0) 165 | episode_d3qn.append(0) 166 | episode_local.append(0) 167 | episode_mec.append(0) 168 | episode_ran.append(0) 169 | 170 | episode_time_dpg.append(0) 171 | episode_time_dqn.append(0) 172 | episode_time_d3qn.append(0) 173 | episode_time_local.append(0) 174 | episode_time_mec.append(0) 175 | episode_time_ran.append(0) 176 | 177 | episode_energy_dpg.append(0) 178 | episode_energy_dqn.append(0) 179 | episode_energy_d3qn.append(0) 180 | episode_energy_local.append(0) 181 | episode_energy_mec.append(0) 182 | episode_energy_ran.append(0) 183 | 184 | episode_total_cost.append(0) 185 | # for a_r in agent_rewards: 186 | # a_r.append(0) 187 | # print("------reset-------") 188 | write.add_scalars("cost", {'MADDPG': epoch_average_total_cost[epoch], 189 | 'DQN': epoch_average_dqn[epoch], 190 | 'D3QN': epoch_average_d3qn[epoch], 191 | 'Local': epoch_average_local[epoch], 192 | 'Mec': epoch_average_mec[epoch], 193 | 'random': epoch_average_ran[epoch]}, epoch) 194 | # write.add_scalars("cost", {'MADDPG': - episode_rewards[-1] /STEP, 195 | # # 'DQN': epoch_average_dqn[epoch], 196 | # 'Local': episode_local[-1] / STEP, 197 | # 'Mec': episode_mec[-1] / STEP, 198 | # 'random': episode_ran[-1] / STEP}, epoch) 199 | write.add_scalars("cost/energy", {'MADDPG': epoch_average_energy_reward[epoch], 200 | 'DQN': epoch_average_energy_dqn[epoch], 201 | 'D3QN': epoch_average_energy_d3qn[epoch], 202 | 'Local': epoch_average_energy_local[epoch], 203 | 'Mec': epoch_average_energy_mec[epoch], 204 | 'random': epoch_average_energy_ran[epoch]}, epoch) 205 | write.add_scalars("cost/delay", {'MADDPG': epoch_average_time_reward[epoch], 206 | 'DQN': epoch_average_time_dqn[epoch], 207 | 'D3QN': 
epoch_average_time_d3qn[epoch], 208 | 'Local': epoch_average_time_local[epoch], 209 | 'Mec': epoch_average_time_mec[epoch], 210 | 'random': epoch_average_time_ran[epoch]}, epoch) 211 | # print("epoch:{},MADDPG:{}".format(epoch, epoch_average_total_cost[epoch])) 212 | # # print("epoch:{},DQN:{}".format(epoch, epoch_average_dqn[epoch])) 213 | # print("epoch:{},Local:{}".format(epoch, epoch_average_local[epoch])) 214 | # print("epoch:{},Mec:{}".format(epoch, epoch_average_mec[epoch])) 215 | # print("epoch:{},random:{}".format(epoch, epoch_average_ran[epoch])) 216 | # if epoch_average_mec[epoch] > epoch_average_reward[epoch]: 217 | # print("True") 218 | # print("---------------------------------------") 219 | # return a 220 | 221 | 222 | 223 | if __name__ == '__main__': 224 | # for i in range(5): 225 | # cost = train(i + 10) 226 | # print(i + 10, "cost:", cost) 227 | # write.add_scalar("cost", cost, i + 10) 228 | # write.close() 229 | train() 230 | --------------------------------------------------------------------------------
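Note on the action encoding (illustrative sketch, not part of the repository): ENV builds its discrete action table by pairing every MEC index with an offloading percentage taken from numpy.linspace(0, 1, k), and the learners in DQN.py / D3QN.py recover the flat index of a stored (server, percentage) pair as a[0] * self.k + a[1] / (1 / (self.k - 1)), i.e. server * k + percentage * (k - 1). The sketch below reproduces that round trip with hypothetical names (build_actions, flat_index); the recovered index is rounded rather than truncated here because the percentage passes through float32 tensors and may not divide exactly.

import numpy as np

def build_actions(mecs: int, k: int) -> np.ndarray:
    # Same layout as ENV.__init__: every (server, percentage) pair, percentage on a k-point grid.
    servers = np.repeat(np.arange(mecs, dtype=float), k).reshape(-1, 1)
    percents = np.tile(np.linspace(0, 1, k), mecs).reshape(-1, 1)
    return np.hstack((servers, percents))

def flat_index(action, k: int) -> int:
    # Inverse mapping used in learn(): server * k + percentage * (k - 1).
    server, percen = action
    return int(round(server * k + percen * (k - 1)))

if __name__ == "__main__":
    mecs, k = 3, 11  # small example; run_this.py defaults to MECs=7 and k=33
    actions = build_actions(mecs, k)
    assert all(flat_index(actions[i], k) == i for i in range(len(actions)))
    print("index round trip OK for", len(actions), "actions")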
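Note on the TD target (illustrative sketch, not part of the repository): in learn() of both DQN.py and D3QN.py, the greedy action for the bootstrap term is taken from the online network evaluated on the current observations (q = agent_eval(obs_n)), while the target network supplies q_next on the next observations. The textbook Double-DQN target instead selects the greedy action with the online network on the next observations and evaluates that action with the target network. The sketch below shows that textbook target for one agent, vectorized, under assumed tensor shapes; double_dqn_target, q_eval_next and q_target_next are hypothetical names that do not appear in the repository.

import torch

def double_dqn_target(rew, q_eval_next, q_target_next, gamma=0.9):
    # rew:           [batch]            rewards
    # q_eval_next:   [batch, n_actions] online (eval) network on the next observations
    # q_target_next: [batch, n_actions] target network on the next observations
    greedy_a = q_eval_next.argmax(dim=-1, keepdim=True)      # action selection by the online network
    q_next = q_target_next.gather(-1, greedy_a).squeeze(-1)  # action evaluation by the target network
    return rew + gamma * q_next.detach()                     # [batch] TD targets

# usage: targets = double_dqn_target(torch.randn(5), torch.randn(5, 33), torch.randn(5, 33))

Decoupling action selection (online network) from action evaluation (target network) is what curbs the overestimation bias of plain DQN; the dueling head in D3QN.py (v + a - mean(a)) is independent of this and only changes how the Q-values are parameterized.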