├── README.md ├── distributions.py ├── env_setting.py ├── environment.py ├── generate_data.py ├── model.py ├── model_test.py ├── ppo_agent.py ├── storage.py ├── train.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # DRL-SIM 2 | This is the code accompanying the paper "Social-Aware Incentive Mechanism for Vehicular Crowdsensing by Deep Reinforcement Learning" by Yinuo Zhao and Chi Harold Liu, published in IEEE Transactions on Intelligent Transportation Systems (TITS). 3 | 4 | ## Description 5 | This simplified code implements a DRL-based social-aware incentive mechanism that learns the optimal sensing strategy for all vehicles in vehicular crowdsensing. 6 | 7 | ## Dependencies 8 | 9 | You only need to install **torch** and numpy (via pip or conda); random, csv, time, json, and argparse are part of the Python standard library. 10 | 11 | ## Usage 12 | 13 | To generate the E-R social graph, first configure the variables `mu` and `user_number` in `generate_data.py`, and then run the following command: 14 | 15 | ``` 16 | python generate_data.py 17 | ``` 18 | 19 | Then, copy the printed E-R social graph values into `self.V['relationship']` in `env_setting.py`, and configure the other environment parameters there. 20 | 21 | After that, run the training and testing process with: 22 | 23 | ``` 24 | python train.py --root-path [PATH to where to save results file and model] --user-num [USER NUMBER] 25 | ``` 26 | 27 | Finally, find the training and testing results under the directory specified by `--root-path`. 28 | 29 | ## Contact 30 | If you have any questions, please email `ynzhao@bit.edu.cn`. 31 | 32 | ## Paper 33 | 34 | If you are interested in our work, please cite our paper as: 35 | ``` 36 | @article{zhao2020social, 37 | title={Social-aware incentive mechanism for vehicular crowdsensing by deep reinforcement learning}, 38 | author={Zhao, Yinuo and Liu, Chi Harold}, 39 | journal={IEEE Transactions on Intelligent Transportation Systems}, 40 | volume={22}, 41 | number={4}, 42 | pages={2314--2325}, 43 | year={2020}, 44 | publisher={IEEE} 45 | } 46 | ``` 47 | -------------------------------------------------------------------------------- /distributions.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.nn.functional as F 4 | from utils import init 5 | from torch.utils.data import WeightedRandomSampler 6 | from torch.distributions import Categorical 7 | 8 | class _Categorical(Categorical): 9 | """ 10 | a subclass of torch.distributions.Categorical 11 | it adds a gumbel softmax sampling method (gumbel_softmax_sample) 12 | and a mode method for greedy (argmax) sampling 13 | """ 14 | 15 | def __init__(self, _logits): 16 | super(_Categorical, self).__init__(logits=_logits) 17 | self._logits = self.logits 18 | self.weighted_sampler = WeightedRandomSampler 19 | 20 | def gumbel_softmax_sample(self, tau, device): 21 | dist = F.gumbel_softmax(self._logits, tau=tau, hard=False) 22 | action = torch.tensor(list(self.weighted_sampler(dist, 1, replacement=False))).to(device) 23 | return action.squeeze(-1) 24 | 25 | def mode(self): 26 | return torch.argmax(self._logits, dim=-1, keepdim=False) 27 | 28 | 29 | class MultiHeadCategorical(nn.Module): 30 | """ 31 | define a multi-head Categorical distribution, one head per discrete action 32 | --init: 33 | num_inputs: input feature dim 34 | dim_vec: a list for dim of each action space, e.g. 
[2,3,5], 2-dim for action1, 3-dim for action2, 5-dim for action3 35 | device: running device 36 | --forward: 37 | inputs: flatten input feature 38 | """ 39 | 40 | # @torchsnooper.snoop() 41 | def __init__(self, num_inputs, action_num, action_dim, device): 42 | super(MultiHeadCategorical, self).__init__() 43 | init_ = lambda m: init(m, 44 | nn.init.orthogonal_, 45 | lambda x: nn.init.constant_(x, 0), 46 | gain=0.01) 47 | self.action_num = action_dim 48 | self.linear_list = torch.nn.ModuleList( 49 | [init_(nn.Linear(num_inputs, action_dim).to(device)) for _ in range(action_num)]) 50 | self.action_num = action_num 51 | self.logits_head = [] 52 | self.weight_sample = WeightedRandomSampler 53 | self.device = device 54 | self.categorical_list = [] 55 | self.train() 56 | 57 | def forward(self, inputs): 58 | self.categorical_list = [_Categorical(linear(inputs)) for linear in self.linear_list] 59 | 60 | def gumbel_softmax_sample(self, tau): 61 | action = torch.cat([p.gumbel_softmax_sample(tau, self.device) for p in self.categorical_list]) 62 | return action 63 | 64 | @property 65 | def probs(self): 66 | if self.action_num == 1: 67 | return self.categorical_list[0].probs 68 | else: 69 | return torch.cat([p.probs.unsqueeze(-1) for p in self.categorical_list], dim=-1) 70 | 71 | def log_probs(self, action): 72 | if self.action_num == 1: 73 | return self.categorical_list[0].log_prob(action) 74 | else: 75 | return torch.cat([p.log_prob(a).unsqueeze(-1) for a, p in zip(action, self.categorical_list)], dim=-1) 76 | 77 | def mode(self): 78 | if self.action_num == 1: 79 | return self.categorical_list[0].mode() 80 | else: 81 | return torch.cat([p.mode() for p in self.categorical_list]) 82 | 83 | def sample(self): 84 | if self.action_num == 1: 85 | return self.categorical_list[0].sample() 86 | else: 87 | return torch.cat([p.sample() for p in self.categorical_list]) 88 | 89 | def entropy(self): 90 | if self.action_num == 1: 91 | return self.categorical_list[0].entropy() 92 | else: 93 | return torch.cat([p.entropy() for p in self.categorical_list]) 94 | -------------------------------------------------------------------------------- /env_setting.py: -------------------------------------------------------------------------------- 1 | class Setting(object): 2 | def __init__(self): 3 | self.V = { 4 | 'relationship': [ 5 | [0.11, 0.0, 0.11, 0.11, 0.11, 0.0, 0.22, 0.11, 0.22, 0.0, ], 6 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], 7 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], 8 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], 9 | [0.11, 0.0, 0.11, 0.11, 0.11, 0.0, 0.22, 0.11, 0.22, 0.0, ], 10 | [0.11, 0.0, 0.11, 0.11, 0.11, 0.0, 0.22, 0.11, 0.22, 0.0, ], 11 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], 12 | [0.22, 0.0, 0.22, 0.22, 0.22, 0.0, 0.44, 0.22, 0.44, 0.0, ], 13 | [0.11, 0.0, 0.11, 0.11, 0.11, 0.0, 0.22, 0.11, 0.22, 0.0, ], 14 | [0.33, 0.0, 0.33, 0.33, 0.33, 0.0, 0.67, 0.33, 0.67, 0.0, ], 15 | ], 16 | 'cost': [5, 7, 9, 11, 13, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9], 17 | 'R': [15, 15.5, 16, 16.5, 17, 17.5, 18, 18.5, 19, 19.5, 20, 20.5, 21, 21.5], 18 | 'prob': [0.1, 0.2, 0.3, 0.4], 19 | 'task_num': 4, 20 | 'quality': [10, 20, 30, 40, 50, 10, 20, 30, 40, 50, 10, 20, 30, 40, 50, 10, 20, 30, 40, 50, 10, 20, 30, 40, 21 | 50], 22 | 'user_battery_budget': [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 23 | 50, 50, 50, 50, 50, 50, 50, 50, 50] 24 | } 25 | 
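Since `env_setting.py` is filled in by hand (the README asks you to paste the generated E-R graph into `self.V['relationship']`), a quick consistency check before training can catch size mismatches early. The sketch below is a hypothetical helper, not part of the original repository; it only relies on how `environment.py` indexes the `Setting` fields.

```python
# check_setting.py -- hypothetical helper, not part of the original repo.
# Verifies that the hand-edited Setting covers the chosen --user-num before training.
from env_setting import Setting


def check_setting(user_num):
    v = Setting().V
    rel = v['relationship']
    # environment.py reads relationship[i][j], cost[i] and user_battery_budget[i] for i, j < user_num
    assert len(rel) >= user_num and all(len(row) >= user_num for row in rel), \
        "'relationship' must be at least user_num x user_num"
    assert len(v['cost']) >= user_num, "need a unit cost per user"
    assert len(v['user_battery_budget']) >= user_num, "need a battery budget per user"
    # tasks are sampled with np.random.choice(task_num, p=prob), so the probabilities must sum to 1
    task_num = v['task_num']
    assert len(v['prob']) >= task_num and abs(sum(v['prob'][:task_num]) - 1.0) < 1e-6, \
        "task probabilities must cover task_num entries and sum to 1"
    assert len(v['R']) >= task_num and len(v['quality']) >= task_num, \
        "'R' and 'quality' must provide one entry per task"
    print('Setting looks consistent for', user_num, 'users')


if __name__ == '__main__':
    check_setting(10)
```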
-------------------------------------------------------------------------------- /environment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | class Env(object): 6 | def __init__(self, user_num, state_dim, device, env_args): 7 | self.device = device 8 | 9 | self.user_num = user_num 10 | self.action_dim = user_num 11 | self.state_dim = state_dim 12 | 13 | relationship = env_args.V['relationship'] 14 | unit_cost = env_args.V['cost'] 15 | 16 | self.battery_budget = env_args.V['user_battery_budget'] 17 | self.task_num = env_args.V['task_num'] 18 | self.task_budget = env_args.V['R'] 19 | prob = env_args.V['prob'] 20 | self.prob = np.zeros(self.task_num) 21 | for i in range(self.task_num): 22 | self.prob[i] = prob[i] 23 | 24 | self.unit_cost = np.zeros(user_num) 25 | for i in range(user_num): 26 | self.unit_cost[i] = unit_cost[i] 27 | 28 | self.remain_energy = np.zeros(user_num) 29 | for i in range(user_num): 30 | self.remain_energy[i] = self.battery_budget[i] 31 | 32 | self.relationship = np.zeros((user_num, user_num)) 33 | for i in range(user_num): 34 | for j in range(user_num): 35 | self.relationship[i][j] = relationship[i][j] 36 | 37 | self.server_reward = 0 38 | self.total_server_reward = [] 39 | self.beta = np.zeros(user_num) 40 | beta = env_args.V['quality'] 41 | for i in range(self.task_num): 42 | self.beta[i] = beta[i] 43 | 44 | self.R = 0 45 | self.task_index = 0 46 | self.epoch = 0 47 | self.total_obtain_sensing_data = 0 48 | # self.max_completion_ratio = 0 49 | 50 | self.complete_task = 0 51 | self.total_task = 0 52 | 53 | self.task_cnt = np.zeros(self.task_num) 54 | self.obtain_sensing_data = np.zeros(self.task_num) 55 | 56 | self.final_contrib_data = 0 57 | 58 | self.intrinsic_reward = 0 59 | self.extrinsic_reward = 0 60 | 61 | def get_collected_data(self): 62 | return self.final_contrib_data 63 | 64 | def close(self): 65 | return None 66 | 67 | def plot_server_reward(self, episode): 68 | server_reward = self.server_reward 69 | self.server_reward = 0 70 | return server_reward 71 | 72 | def plot_complete_ratio(self, episode): 73 | obtain_sensing_data_list = [] 74 | for i in range(self.task_num): 75 | obtain_sensing_data = self.obtain_sensing_data[i] / self.task_cnt[i] 76 | obtain_sensing_data_list.append(obtain_sensing_data) 77 | self.total_obtain_sensing_data = 0 78 | self.epoch = 0 79 | self.obtain_sensing_data = np.zeros(self.task_num) 80 | self.task_cnt = np.zeros(self.task_num) 81 | 82 | def reset(self): 83 | for i in range(self.user_num): 84 | self.remain_energy[i] = self.battery_budget[i] 85 | 86 | self.task_index = np.random.choice(self.task_num, 1, False, self.prob)[0] 87 | # self.task_index = 0 88 | self.R = self.task_budget[self.task_index] 89 | 90 | state = np.zeros((self.user_num, self.state_dim)) 91 | for i in range(self.user_num): 92 | state[i, self.user_num:self.user_num + 1] = self.unit_cost[i] / 10 93 | state[i, self.user_num + 1:self.user_num + 2] = self.remain_energy[i] / 50 94 | state[i, self.user_num + 2:self.user_num + 3] = self.R / 10 95 | 96 | return torch.from_numpy(state).float().to(self.device) 97 | 98 | def get_completion_ratio(self): 99 | completion_ratio = self.complete_task / self.total_task 100 | self.complete_task = 0 101 | self.total_task = 0 102 | return completion_ratio 103 | 104 | def get_reward(self): 105 | extrinsic_reward = self.extrinsic_reward / self.user_num 106 | intrinsic_reward = self.intrinsic_reward / self.user_num 107 | self.extrinsic_reward = 0 
108 | self.intrinsic_reward = 0 109 | return extrinsic_reward, intrinsic_reward 110 | 111 | def step(self, action): 112 | action = 0.2 * action.float().numpy() 113 | 114 | # -------standard action---------------------- 115 | for i in range(self.user_num): 116 | if action[i] > self.remain_energy[i] / self.unit_cost[i]: 117 | action[i] = self.remain_energy[i] / self.unit_cost[i] 118 | phi = np.zeros(self.user_num, 'float') 119 | 120 | for i in range(self.user_num): 121 | for j in range(self.user_num): 122 | phi[i] += self.relationship[i][j] * action[i] * action[j] 123 | total_sensing = action.sum() 124 | 125 | sensing_data = total_sensing 126 | self.total_obtain_sensing_data += sensing_data 127 | self.task_cnt[self.task_index] += 1 128 | self.obtain_sensing_data[self.task_index] += sensing_data 129 | self.final_contrib_data += total_sensing / self.R 130 | self.epoch += 1 131 | 132 | quality_sensing_data = action 133 | total_quality_sensing_data = np.sum(quality_sensing_data) 134 | # print(np.shape(action)) 135 | reward = np.zeros(self.user_num) 136 | self.server_reward += total_quality_sensing_data * self.beta[self.task_index] - self.R 137 | intrinsic_reward = 0 138 | extrinsic_reward = 0 139 | self.total_task += 1 140 | if total_sensing > 0.001: 141 | self.complete_task += 1 142 | for i in range(self.user_num): 143 | # reward[i] = action[i] / total_sensing * self.R - self.unit_cost[i] * action[i] + phi[i] 144 | reward[i] = quality_sensing_data[i] / total_quality_sensing_data * self.R - self.unit_cost[i] * action[ 145 | i] + phi[i] 146 | extrinsic_reward += quality_sensing_data[i] / total_quality_sensing_data * self.R - self.unit_cost[i] * \ 147 | action[i] 148 | intrinsic_reward += phi[i] 149 | self.remain_energy[i] -= self.unit_cost[i] * action[i] 150 | if self.remain_energy[i] <= 0.0001: 151 | self.remain_energy[i] = 0 152 | self.intrinsic_reward += intrinsic_reward 153 | self.extrinsic_reward += extrinsic_reward 154 | self.task_index = np.random.choice(self.task_num, 1, False, self.prob)[0] 155 | # self.task_index = 0 156 | self.R = self.task_budget[self.task_index] 157 | 158 | state = np.zeros((self.user_num, self.state_dim)) 159 | for i in range(self.user_num): 160 | state[i, 0:self.user_num] = action 161 | state[i, self.user_num:self.user_num + 1] = self.unit_cost[i] / 10 162 | state[i, self.user_num + 1:self.user_num + 2] = self.remain_energy[i] / 50 163 | state[i, self.user_num + 2:self.user_num + 3] = self.R / 10 164 | # reward = np.mean(reward, keepdims=True) 165 | 166 | done = False 167 | 168 | return torch.from_numpy(state).float().to(self.device), torch.from_numpy(reward).float(), done 169 | -------------------------------------------------------------------------------- /generate_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | mu = 0.9 4 | user_number = 10 5 | G = np.zeros((user_number, user_number), dtype=float)  # np.float was removed from recent NumPy; the builtin float is equivalent here 6 | for i in range(user_number): 7 | for j in range(user_number): 8 | if i == j: 9 | G[i][j] = 0 10 | else: 11 | p = np.random.random() 12 | if p <= mu: 13 | G[i][j] = 1 14 | else: 15 | G[i][j] = 0 16 | for i in range(user_number): 17 | print('[', end='') 18 | for j in range(user_number): 19 | print(G[i][j], end=', ') 20 | print('], ') 21 | # G = np.array([ 22 | # [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, ], 23 | # [1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, ], 24 | # [1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ], 25 | # [1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 
], 26 | # [0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ], 27 | # [1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, ], 28 | # [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, ], 29 | # [0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, ], 30 | # [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, ], 31 | # [1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, ], 32 | # ]) 33 | 34 | I = np.ones((user_number, 1)) 35 | K = np.dot(G, I) 36 | L = np.dot(G.transpose(), I) 37 | g = np.sum(G) 38 | d = g / user_number 39 | H = np.dot(K, L.transpose()) / g 40 | for i in range(user_number): 41 | print('[', end='') 42 | for j in range(user_number): 43 | # if i == j: 44 | # print('0.0', end=', ') 45 | # else: 46 | print(np.round(H[i][j], 2), end=', ') 47 | print('],') -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from distributions import MultiHeadCategorical 2 | import torch 3 | from utils import init 4 | import torch.nn as nn 5 | 6 | 7 | class Flatten(nn.Module): 8 | def forward(self, x): 9 | return x.view(x.size(0), -1) 10 | 11 | 12 | class Model(nn.Module): 13 | def __init__(self, state_dim, action_dim, device, trainable=True, hidsize=128): 14 | super(Model, self).__init__() 15 | init_ = lambda m: init(m, 16 | nn.init.orthogonal_, 17 | lambda x: nn.init.constant_(x, 0)) 18 | # feature extract 19 | self.base = nn.Sequential( 20 | init_(nn.Linear(state_dim, 128)), 21 | nn.ReLU(), 22 | init_(nn.Linear(128, hidsize)), 23 | nn.ReLU() 24 | ).to(device) 25 | 26 | # actor 27 | self.dist = MultiHeadCategorical(hidsize, 1, action_dim, device) 28 | # # critic 29 | # self.critic = nn.Sequential( 30 | # init_(nn.Linear(hidsize, 1)) 31 | # ).to(device) 32 | # critic 33 | self.q_network = nn.Sequential( 34 | init_(nn.Linear(hidsize, action_dim)), 35 | ).to(device) 36 | self.device = device 37 | self.identity = torch.eye(action_dim).to(device) 38 | if trainable: 39 | self.train() 40 | else: 41 | self.eval() 42 | 43 | # @torchsnooper.snoop() 44 | def act(self, inputs): 45 | with torch.no_grad(): 46 | obs_feature = self.base(inputs) 47 | 48 | # value = self.critic(obs_feature) 49 | self.dist(obs_feature) 50 | action = self.dist.sample() 51 | action_log_probs = self.dist.log_probs(action) 52 | action_log_probs = action_log_probs.mean(-1, keepdim=True) 53 | 54 | q_value = self.q_network(obs_feature) 55 | # mean 56 | value = torch.sum(self.dist.probs * q_value, -1, keepdim=True) 57 | return value, action.squeeze(), action_log_probs 58 | 59 | def get_value(self, inputs): 60 | obs_feature = self.base(inputs) 61 | # value = self.critic(obs_feature) 62 | self.dist(obs_feature) 63 | q_value = self.q_network(obs_feature) 64 | value = torch.sum(self.dist.probs * q_value, -1, keepdim=True) 65 | return value 66 | 67 | def evaluate_actions(self, inputs, action): 68 | obs_features = self.base(inputs) 69 | # value = self.critic(obs_features) 70 | q_value = self.q_network(obs_features) 71 | index = self.identity[action.squeeze(-1)] 72 | value = torch.sum(q_value * index, -1).unsqueeze(-1) 73 | 74 | self.dist(obs_features) 75 | 76 | action_log_probs = self.dist.log_probs(action).mean(-1, keepdim=True) 77 | 78 | dist_entropy = self.dist.entropy().mean() 79 | 80 | return value, action_log_probs, dist_entropy 81 | 82 | def print_grad(self): 83 | for name, p in self.named_parameters(): 84 | print('name: ', name, ' value: ', p.grad.mean(), 'p.requires_grad', p.requires_grad) 85 | 86 | 87 | 
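`model.py` couples a softmax policy head with a Q-value head and reports the state value as the probability-weighted average of the Q outputs, i.e. V(s) = sum_a pi(a|s) * Q(s, a), which is what `act()` and `get_value()` return. Below is a minimal usage sketch, not taken from the repository: it assumes a CPU device and `user_num = 10` (so `state_dim = user_num + 3 = 13`), and mirrors how `ppo_agent.py` instantiates `Model` with 6 discrete actions, which `environment.step()` later scales by 0.2 into sensing levels in [0, 1].

```python
# Minimal usage sketch (assumptions: CPU device, user_num = 10 -> state_dim = 13).
import torch
from model import Model

state_dim, action_dim = 13, 6              # 6 sensing levels, as in PPOAgent
net = Model(state_dim, action_dim, 'cpu')

obs = torch.zeros(state_dim)               # one flattened per-vehicle observation
value, action, log_prob = net.act(obs)     # value = sum_a pi(a|s) * Q(s, a)
print(float(value), int(action), float(log_prob))
```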
-------------------------------------------------------------------------------- /model_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import torch 4 | from environment import Env 5 | from utils import seed_torch 6 | from ppo_agent import PPOAgent 7 | import csv 8 | 9 | 10 | def model_test(args, env_args): 11 | seed_torch(args.seed) 12 | 13 | print('in test process') 14 | os.environ['OMP_NUM_THREADS'] = '1' 15 | 16 | if args.test_device_num == -1: 17 | test_device_name = 'cpu' 18 | 19 | else: 20 | test_device_name = 'cuda:' + str(args.test_device_num) 21 | torch.cuda.set_device(args.test_device_num) 22 | 23 | # -------------get environment information------------ 24 | ppo_agent = [] 25 | for i in range(args.user_num): 26 | ppo_agent.append( 27 | PPOAgent(args.state_dim, test_device_name, args.lr, args.exploration_steps, 28 | args.mini_batch_num, args.use_gae, args.gamma, args.gae_param, args.ppo_epoch, 29 | args.clip, args.value_coeff, args.clip_coeff, args.ent_coeff)) 30 | 31 | ori_device_name = 'cuda:' + str(args.device_num) 32 | model_path = os.path.join(args.root_path, 'model') 33 | for i, agent in enumerate(ppo_agent): 34 | ppo_model_path = os.path.join(model_path, 'ppo_model' + str(i) + '.pt') 35 | agent.load_model(ppo_model_path, ori_device_name, test_device_name) 36 | agent.local_ppo_model.eval() 37 | 38 | done_time = 0 39 | episode_length = 0 40 | 41 | user_num = args.user_num 42 | env = Env(user_num, args.state_dim, test_device_name, env_args) 43 | 44 | action = torch.zeros(user_num, dtype=torch.long) 45 | final_av_reward = 0 46 | final_av_server_reward = 0 47 | test_file_path = os.path.join(args.root_path, 'test_file') 48 | if not os.path.exists(test_file_path): 49 | os.mkdir(test_file_path) 50 | test_result_profile = open(test_file_path + '/test_result.csv', 'w', newline='') 51 | test_writer = csv.writer(test_result_profile) 52 | 53 | av_ext_reward = 0 54 | av_int_rewards = 0 55 | 56 | av_completion_ratio = 0 57 | 58 | result_path = test_file_path + '/test_result.npz' 59 | # ----------------------------------------- 60 | 61 | all_remaining_energy = [] 62 | while True: 63 | if episode_length >= args.max_test_length: 64 | print('training over') 65 | break 66 | 67 | print('---------------in episode ', episode_length, '-----------------------') 68 | 69 | step = 0 70 | done = True 71 | av_reward = 0 72 | av_action = torch.zeros(user_num) 73 | 74 | obs = env.reset() 75 | interact_time = 0 76 | remaining_energy = [] 77 | remaining_energy.append(env.remain_energy.mean()) 78 | while step < args.exploration_steps: 79 | interact_time += 1 80 | # ----------------sample actions(no grad)------------------------ 81 | with torch.no_grad(): 82 | for i, agent in enumerate(ppo_agent): 83 | _, action[i], _ = agent.act(obs[i]) 84 | obs, reward, done = env.step(action) 85 | remaining_energy.append(env.remain_energy.mean()) 86 | av_reward += np.mean(reward.numpy()) 87 | av_action += 0.2 * action.float() 88 | 89 | step = step + 1 90 | done = interact_time >= args.max_interact_time 91 | if done: 92 | # env.draw_remain_energy(done_time) 93 | done_time += 1 94 | interact_time = 0 95 | obs = env.reset() 96 | if len(remaining_energy) == args.max_interact_time + 1 and len(all_remaining_energy) < 100: 97 | all_remaining_energy.append(remaining_energy) 98 | remaining_energy = [] 99 | remaining_energy.append(env.remain_energy.mean()) 100 | for i, agent in enumerate(ppo_agent): 101 | agent.reset(obs[i]) 102 | 103 | continue 104 
| av_reward /= args.exploration_steps 105 | ext_reward, int_reward = env.get_reward() 106 | 107 | av_ext_reward += ext_reward / args.exploration_steps 108 | av_int_rewards += int_reward / args.exploration_steps 109 | 110 | completion_ratio = env.get_completion_ratio() 111 | av_completion_ratio += completion_ratio 112 | 113 | final_av_reward += av_reward 114 | final_av_server_reward += env.plot_server_reward(episode_length) / args.exploration_steps 115 | episode_length += 1 116 | 117 | test_writer.writerow( 118 | ['vehicle reward', 'server reward', 'extrinsic reward', 'intrinsic reward', 'completion ratio']) 119 | test_writer.writerow([final_av_reward / args.max_test_length, final_av_server_reward / args.max_test_length, 120 | av_ext_reward / args.max_test_length, av_int_rewards / args.max_test_length, 121 | av_completion_ratio / args.max_test_length]) 122 | test_result_profile.close() 123 | np.savez(result_path, np.asarray(all_remaining_energy)) 124 | print('Finish! Results saved in ', args.root_path) 125 | 126 | -------------------------------------------------------------------------------- /ppo_agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from storage import RolloutStorage 3 | from model import Model 4 | import torch.optim as optim 5 | 6 | 7 | class PPOAgent(): 8 | def __init__(self, state_dim, device, lr, exploration_steps, mini_batch_num, use_gae, gamma, 9 | gae_param, ppo_epoch, clip, value_coeff, clip_coeff, ent_coeff): 10 | self.local_ppo_model = Model(state_dim, 6, device) 11 | self.optimizer = optim.Adam(list(self.local_ppo_model.parameters()), lr=lr) 12 | self.rollout = RolloutStorage(exploration_steps, mini_batch_num, state_dim) 13 | self.rollout.to(device) 14 | self.use_gae = use_gae 15 | self.gamma = gamma 16 | self.gae_param = gae_param 17 | self.ppo_epoch = ppo_epoch 18 | self.clip = clip 19 | self.value_coeff = value_coeff 20 | self.clip_coeff = clip_coeff 21 | self.ent_coeff = ent_coeff 22 | 23 | def act(self, obs): 24 | value, action, action_log_probs = self.local_ppo_model.act(obs) 25 | return value, action, action_log_probs 26 | 27 | def insert(self, obs, action, action_log_probs, value, reward, masks): 28 | self.rollout.insert(obs, action, action_log_probs, value, reward, masks) 29 | 30 | def after_update(self, obs): 31 | self.rollout.after_update(obs) 32 | 33 | def load_model(self, path, device, test_device): 34 | self.local_ppo_model.load_state_dict(torch.load(path, map_location={device: test_device})) 35 | 36 | def reset(self, obs): 37 | self.rollout.reset(obs) 38 | 39 | def update(self, done): 40 | beta = 0.2 41 | with torch.no_grad(): 42 | if done: 43 | next_value = torch.zeros(1) 44 | else: 45 | next_value = self.local_ppo_model.get_value(self.rollout.obs[-1:]) 46 | 47 | self.rollout.compute_returns(next_value.detach(), self.use_gae, self.gamma, self.gae_param) 48 | 49 | advantages = self.rollout.returns[:-1] - self.rollout.value_preds[:-1] 50 | 51 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 52 | 53 | av_value_loss = 0 54 | av_policy_loss = 0 55 | av_ent_loss = 0 56 | loss_cnt = 0 57 | 58 | for _ in range(self.ppo_epoch): 59 | data_generator = self.rollout.feed_forward_generator(advantages) 60 | for samples in data_generator: 61 | # signal_init = traffic_light.get() 62 | torch.cuda.empty_cache() 63 | obs_batch, next_obs_batch, action_batch, old_values, return_batch, masks_batch, \ 64 | old_action_log_probs, advantages_batch = samples 65 | 66 | cur_values, 
cur_action_log_probs, dist_entropy = self.local_ppo_model.evaluate_actions(obs_batch, 67 | action_batch) 68 | 69 | # ----------use ppo clip to compute loss------------------------ 70 | ratio = torch.exp(cur_action_log_probs - old_action_log_probs) 71 | surr1 = ratio * advantages_batch 72 | surr2 = torch.clamp(ratio, 1.0 - self.clip, 1.0 + self.clip) * advantages_batch 73 | 74 | action_loss = -torch.min(surr1, surr2).mean() 75 | 76 | value_pred_clipped = old_values + (cur_values - old_values).clamp(-self.clip, self.clip) 77 | value_losses = (cur_values - return_batch).pow(2) 78 | value_losses_clipped = (value_pred_clipped - return_batch).pow(2) 79 | value_loss = 0.5 * torch.max(value_losses, value_losses_clipped).mean() 80 | # value_loss = torch.mean((return_batch - cur_values)**2) 81 | 82 | value_loss = value_loss * self.value_coeff 83 | action_loss = action_loss * self.clip_coeff 84 | ent_loss = dist_entropy * self.ent_coeff 85 | # ------------------ for curiosity driven-------------------------- 86 | total_loss = value_loss + action_loss - ent_loss 87 | self.local_ppo_model.zero_grad() 88 | self.optimizer.zero_grad() 89 | total_loss.backward() 90 | self.optimizer.step() 91 | 92 | av_value_loss += float(value_loss) 93 | av_policy_loss += float(action_loss) 94 | av_ent_loss += float(ent_loss) 95 | loss_cnt += 1 96 | 97 | return av_value_loss / loss_cnt, av_policy_loss / loss_cnt, av_ent_loss / loss_cnt 98 | -------------------------------------------------------------------------------- /storage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 3 | 4 | 5 | def _flatten_helper(T, N, _tensor): 6 | return _tensor.view(T * N, *_tensor.size()[2:]) 7 | 8 | 9 | class RolloutStorage(object): 10 | def __init__(self, num_steps, mini_batch_num, obs_shape): 11 | 12 | self.mini_batch_num = mini_batch_num 13 | self.obs = torch.zeros(num_steps + 1, obs_shape) 14 | self.rewards = torch.zeros(num_steps, 1) 15 | self.value_preds = torch.zeros(num_steps + 1, 1) 16 | self.returns = torch.zeros(num_steps + 1, 1) 17 | self.action_log_probs = torch.zeros(num_steps, 1) 18 | self.action = torch.zeros(num_steps, 1, dtype=torch.long) 19 | self.masks = torch.ones(num_steps + 1, 1) 20 | self.num_steps = num_steps 21 | self.step = 0 22 | 23 | def to(self, device): 24 | self.obs = self.obs.to(device) 25 | self.rewards = self.rewards.to(device) 26 | self.value_preds = self.value_preds.to(device) 27 | self.returns = self.returns.to(device) 28 | self.action_log_probs = self.action_log_probs.to(device) 29 | self.action = self.action.to(device) 30 | self.masks = self.masks.to(device) 31 | 32 | def reset(self, obs): 33 | self.obs[self.step].copy_(obs.squeeze(0)) 34 | self.masks[self.step].copy_(torch.zeros(1)) 35 | 36 | def insert(self, obs, actions, action_log_probs, value_preds, rewards, masks): 37 | self.action[self.step].copy_(actions) 38 | self.action_log_probs[self.step].copy_(action_log_probs.squeeze()) 39 | self.value_preds[self.step].copy_(value_preds.squeeze()) 40 | self.rewards[self.step].copy_(rewards.squeeze()) 41 | self.obs[self.step + 1].copy_(obs.squeeze()) 42 | self.masks[self.step + 1].copy_(masks.squeeze()) 43 | 44 | self.step = self.step + 1 45 | 46 | def update_reward(self, intrinsic_reward): 47 | intrinsic_reward = intrinsic_reward.clamp(-1, 1) 48 | num_steps = intrinsic_reward.size()[0] 49 | # print('self.rewards', self.rewards, 'intrinsic_reward', intrinsic_reward) 50 | for i 
in range(num_steps): 51 | self.rewards[i] = self.rewards[i] + intrinsic_reward[i] 52 | # self.rewards = self.rewards.clamp(-1, 1) 53 | 54 | def icm_tuple(self): 55 | obs = self.obs[:-1].clone().detach() 56 | next_obs = self.obs[1:].clone().detach() 57 | action = self.action.clone().detach() 58 | return obs, next_obs, action 59 | 60 | def after_update(self, obs): 61 | self.step = 0 62 | self.obs[0].copy_(obs.squeeze()) 63 | self.masks[0].copy_(torch.zeros(1)) 64 | 65 | def compute_returns(self, next_value, use_gae, gamma, tau): 66 | if use_gae: 67 | self.value_preds[-1] = next_value 68 | gae = 0 69 | for step in reversed(range(self.rewards.size(0))): 70 | delta = self.rewards[step] + gamma * self.value_preds[step + 1] * self.masks[step + 1] - \ 71 | self.value_preds[step] 72 | gae = delta + gamma * tau * self.masks[step + 1] * gae 73 | self.returns[step] = gae + self.value_preds[step] 74 | else: 75 | self.returns[-1] = next_value 76 | for step in reversed(range(self.rewards.size(0))): 77 | self.returns[step] = self.returns[step + 1] * gamma * self.masks[step + 1] + self.rewards[step] 78 | 79 | def feed_forward_generator(self, advantages): 80 | mini_batch_size = self.num_steps // self.mini_batch_num 81 | sampler = BatchSampler(SubsetRandomSampler(range(self.num_steps)), mini_batch_size, drop_last=False) 82 | for indices in sampler: 83 | next_indices = [indice + 1 for indice in indices] 84 | obs_batch = self.obs[indices] 85 | next_obs_batch = self.obs[next_indices] 86 | action_batch = self.action[indices] 87 | value_pred_batch = self.value_preds[indices] 88 | return_batch = self.returns[indices] 89 | old_action_log_probs_batch = self.action_log_probs[indices] 90 | advantages_batch = advantages[indices] 91 | masks_batch = self.masks[indices] 92 | yield obs_batch, next_obs_batch, action_batch, value_pred_batch, return_batch, \ 93 | masks_batch, old_action_log_probs_batch, advantages_batch 94 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import torch 4 | from environment import Env 5 | from utils import seed_torch 6 | from ppo_agent import PPOAgent 7 | import time 8 | from env_setting import Setting 9 | import json 10 | from model_test import model_test 11 | import argparse 12 | 13 | 14 | def main(args, env_args): 15 | seed_torch(args.seed) 16 | os.environ['OMP_NUM_THREADS'] = '1' 17 | if args.use_cuda: 18 | torch.cuda.set_device(args.device_num) 19 | device = torch.device('cuda' if args.use_cuda else 'cpu') 20 | # -------------get environment information------------ 21 | 22 | ppo_agent = [] 23 | for i in range(args.user_num): 24 | ppo_agent.append( 25 | PPOAgent(args.state_dim, device, args.lr, args.exploration_steps, 26 | args.mini_batch_num, args.use_gae, args.gamma, args.gae_param, args.ppo_epoch, 27 | args.clip, args.value_coeff, args.clip_coeff, args.ent_coeff)) 28 | 29 | done_time = 0 30 | episode_length = 0 31 | 32 | user_num = args.user_num 33 | env = Env(user_num, args.state_dim, device, env_args) 34 | 35 | action = torch.zeros(user_num, dtype=torch.long) 36 | value = torch.zeros(user_num) 37 | action_log_probs = torch.zeros(user_num) 38 | file_path = os.path.join(args.root_path, 'file') 39 | result_path = file_path + '/result.npz' 40 | model_path = os.path.join(args.root_path, 'model') 41 | os.mkdir(model_path) 42 | rewards = [] 43 | server_rewards = [] 44 | user_rewards = [[] for _ in range(user_num)] 45 | 
completion_ratio = [[] for _ in range(env.task_num)] 46 | 47 | ext_rewards = [] 48 | int_rewards = [] 49 | while True: 50 | if episode_length >= args.max_episode_length: 51 | print('training over') 52 | break 53 | 54 | print('---------------in episode ', episode_length, '-----------------------') 55 | 56 | step = 0 57 | done = True 58 | av_reward = torch.zeros(user_num) 59 | av_action = torch.zeros(user_num) 60 | 61 | obs = env.reset() 62 | for i, agent in enumerate(ppo_agent): 63 | agent.after_update(obs[i]) 64 | 65 | interact_time = 0 66 | sum_reward = 0.0 67 | sum_user_reward = np.zeros(user_num) 68 | while step < args.exploration_steps: 69 | interact_time += 1 70 | # ----------------sample actions(no grad)------------------------ 71 | with torch.no_grad(): 72 | for i, agent in enumerate(ppo_agent): 73 | value[i], action[i], action_log_probs[i] = agent.act(obs[i]) 74 | obs, reward, done = env.step(action) 75 | sum_reward += reward.numpy().mean() 76 | sum_user_reward += reward.numpy() 77 | 78 | av_reward += reward 79 | av_action += 0.2 * action.float() 80 | done = interact_time >= args.max_interact_time 81 | # ---------judge if game over -------------------- 82 | masks = torch.tensor([[0.0] if done else [1.0]]) 83 | # ----------add to memory --------------------------- 84 | for i, agent in enumerate(ppo_agent): 85 | agent.insert(obs[i].detach(), action[i].detach(), action_log_probs[i].detach(), value[i].detach(), 86 | reward[i].detach(), masks.detach()) 87 | step = step + 1 88 | 89 | if done: 90 | done_time += 1 91 | interact_time = 0 92 | obs = env.reset() 93 | for i, agent in enumerate(ppo_agent): 94 | agent.reset(obs[i]) 95 | 96 | continue 97 | 98 | server_reward = env.plot_server_reward(episode_length) / args.exploration_steps 99 | server_rewards.append(server_reward) 100 | ext_reward, int_reward = env.get_reward() 101 | ext_rewards.append(ext_reward / args.exploration_steps) 102 | int_rewards.append(int_reward / args.exploration_steps) 103 | # av_completion_ratio = env.get_completion_ratio() / args.exploration_steps 104 | 105 | # for i in range(env.task_num): 106 | # completion_ratio[i].append(av_completion_ratio[i]) 107 | av_value_loss = 0 108 | av_policy_loss = 0 109 | av_ent_loss = 0 110 | 111 | rewards.append(sum_reward / args.exploration_steps) 112 | for i in range(user_num): 113 | user_rewards[i].append(sum_user_reward[i] / args.exploration_steps) 114 | 115 | for i, agent in enumerate(ppo_agent): 116 | value_loss, policy_loss, ent_loss = agent.update(done) 117 | av_value_loss += value_loss 118 | av_policy_loss += policy_loss 119 | av_ent_loss += ent_loss 120 | 121 | av_value_loss /= user_num 122 | av_policy_loss /= user_num 123 | av_ent_loss /= user_num 124 | av_reward /= args.exploration_steps 125 | av_action /= args.exploration_steps 126 | episode_length += 1 127 | 128 | np.savez(result_path, np.asarray(rewards), np.asarray(server_rewards), np.asarray(ext_rewards), 129 | np.asarray(int_rewards)) 130 | # reward_profile.close() 131 | for i, ppo in enumerate(ppo_agent): 132 | ppo_model_path = os.path.join(model_path, 'ppo_model' + str(i) + '.pt') 133 | torch.save(ppo.local_ppo_model.state_dict(), ppo_model_path) 134 | model_test(args, env_args) 135 | 136 | 137 | if __name__ == "__main__": 138 | parser = argparse.ArgumentParser(description='Hyper-parameter setting for DRL-SIM.') 139 | # ------------------------------------- parameters that must be configured --------------------------------- 140 | parser.add_argument('--root-path', type=str, required=True, help='the path to save 
your results and models') 141 | parser.add_argument('--user-num', type=int, required=True, help='number of participating vehicle users') 142 | 143 | # ------------------------------------- parameters that can be changed according to your need -------------- 144 | parser.add_argument('--use-cuda', type=bool, default=True, help='use cuda device to train models or not')  # note: argparse parses any non-empty value as True here 145 | parser.add_argument('--device-num', type=int, default=0, help='cuda device number for training') 146 | parser.add_argument('--test-device-num', type=int, default=0, help='cuda device number for testing') 147 | parser.add_argument('--max-episode-length', type=int, default=1000) 148 | parser.add_argument('--max-test-length', type=int, default=100) 149 | parser.add_argument('--exploration-steps', type=int, default=500) 150 | parser.add_argument('--mini-batch-num', type=int, default=4) 151 | parser.add_argument('--seed', type=int, default=1) 152 | parser.add_argument('--ppo-epoch', type=int, default=4) 153 | parser.add_argument('--max-interact-time', type=int, default=64) 154 | 155 | # ------------------------------------- parameters that are not recommended to be changed --------------------- 156 | parser.add_argument('--lr', type=float, default=0.0003, help='optimizer learning rate') 157 | parser.add_argument('--clip', type=float, default=0.1) 158 | parser.add_argument('--ent-coeff', type=float, default=0.01) 159 | parser.add_argument('--value-coeff', type=float, default=0.1) 160 | parser.add_argument('--clip-coeff', type=float, default=1.0) 161 | parser.add_argument('--use-gae', type=bool, default=True) 162 | parser.add_argument('--gamma', type=float, default=0.99) 163 | parser.add_argument('--gae_param', type=float, default=0.95) 164 | 165 | args = parser.parse_args() 166 | args.state_dim = args.user_num + 3 167 | local_time = str(time.strftime("%Y/%m-%d/%H-%M-%S", time.localtime())) 168 | args.root_path = os.path.join(args.root_path, local_time) 169 | file_path = os.path.join(args.root_path, 'file') 170 | if not os.path.exists(file_path): 171 | os.makedirs(file_path) 172 | 173 | with open(os.path.join(file_path, 'agent_args.txt'), 'a') as f: 174 | f.write(json.dumps(args.__dict__)) 175 | 176 | env_args = Setting() 177 | with open(os.path.join(file_path, 'env_args.txt'), 'a') as f: 178 | f.write(json.dumps(env_args.__dict__)) 179 | main(args, env_args) 180 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import random 4 | import numpy as np 5 | import os 6 | 7 | 8 | # Get a render function 9 | def get_render_func(venv): 10 | if hasattr(venv, 'envs'): 11 | return venv.envs[0].render 12 | elif hasattr(venv, 'venv'): 13 | return get_render_func(venv.venv) 14 | elif hasattr(venv, 'env'): 15 | return get_render_func(venv.env) 16 | 17 | return None 18 | 19 | 20 | 21 | # Necessary for my KFAC implementation. 
22 | class AddBias(nn.Module): 23 | def __init__(self, bias): 24 | super(AddBias, self).__init__() 25 | self._bias = nn.Parameter(bias.unsqueeze(1)) 26 | 27 | def forward(self, x): 28 | if x.dim() == 2: 29 | bias = self._bias.t().view(1, -1) 30 | else: 31 | bias = self._bias.t().view(1, -1, 1, 1) 32 | 33 | return x + bias 34 | 35 | 36 | def init(module, weight_init, bias_init, gain=1): 37 | weight_init(module.weight.data, gain=gain) 38 | bias_init(module.bias.data) 39 | return module 40 | 41 | 42 | # https://github.com/openai/baselines/blob/master/baselines/common/tf_util.py#L87 43 | def init_normc_(weight, gain=1): 44 | weight.normal_(0, 1) 45 | weight *= gain / torch.sqrt(weight.pow(2).sum(1, keepdim=True)) 46 | 47 | 48 | def seed_torch(seed=2019): 49 | # python & numpy 50 | random.seed(seed) 51 | np.random.seed(seed) 52 | os.environ['PYTHONHASHSEED'] = str(seed) 53 | 54 | # cpu & gpu 55 | torch.manual_seed(seed) # set seed for cpu 56 | torch.cuda.manual_seed(seed) # set seed for current GPU 57 | torch.cuda.manual_seed_all(seed) # set seed for all GPU 58 | 59 | torch.backends.cudnn.deterministic = True 60 | # torch.backends.benchmark = False 61 | torch.backends.benchmark = True 62 | 63 | --------------------------------------------------------------------------------
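The two helpers in `utils.py` that the rest of the code relies on are `seed_torch`, which seeds Python, NumPy and PyTorch for reproducibility, and `init`, which applies a weight initializer and a bias initializer to a layer and returns it. The following stand-alone sketch is hypothetical (not part of the repository) and simply shows how the two are combined, in the same way `model.py` and `distributions.py` build their orthogonally initialized layers.

```python
# Hypothetical stand-alone example of the utils.py helpers.
import torch.nn as nn
from utils import init, seed_torch

seed_torch(2019)                                 # reproducible weights across runs
layer = init(nn.Linear(16, 4),
             nn.init.orthogonal_,                # weight initializer
             lambda x: nn.init.constant_(x, 0),  # bias initializer
             gain=0.01)                          # small gain, as used for the policy heads
print(layer.weight.norm().item(), layer.bias.abs().sum().item())  # bias sum prints 0.0
```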