├── README.md ├── distributions.py ├── env_setting.py ├── environment.py ├── generate_data.py ├── model.py ├── model_test.py ├── ppo_agent.py ├── storage.py ├── train.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # DRL-SIM 2 | This is the code accompanying the paper "Social-Aware Incentive Mechanism for Vehicular Crowdsensing by Deep Reinforcement Learning" by Yinuo Zhao and Chi Harold Liu, published in IEEE Transactions on Intelligent Transportation Systems (TITS). 3 | 4 | ## Description 5 | This simplified code implements a DRL-based social-aware incentive mechanism that learns the optimal sensing strategy for all vehicles in vehicular crowdsensing. 6 | 7 | ## Dependencies 8 | 9 | You only need to install **torch** and numpy (via pip or conda); random, csv, time, json, and argparse are part of the Python standard library. 10 | 11 | ## Usage 12 | 13 | To generate the E-R social graph, first configure the variables `mu` and `user_number` in `generate_data.py`, and then run the following command: 14 | 15 | ``` 16 | python generate_data.py 17 | ``` 18 | 19 | Then, copy the printed E-R social graph values into `self.V['relationship']` in `env_setting.py`, and configure the other environment parameters there. 20 | 21 | After that, run the training and testing process with: 22 | 23 | ``` 24 | python train.py --root-path [PATH to where to save results file and model] --user-num [USER NUMBER] 25 | ``` 26 | 27 | Finally, find the training and testing results under the directory specified by `--root-path`. 28 | 29 | ## Contact 30 | If you have any questions, please email `ynzhao@bit.edu.cn`. 31 | 32 | ## Paper 33 | 34 | If you are interested in our work, please cite our paper as: 35 | ``` 36 | @article{zhao2020social, 37 | title={Social-aware incentive mechanism for vehicular crowdsensing by deep reinforcement learning}, 38 | author={Zhao, Yinuo and Liu, Chi Harold}, 39 | journal={IEEE Transactions on Intelligent Transportation Systems}, 40 | volume={22}, 41 | number={4}, 42 | pages={2314--2325}, 43 | year={2020}, 44 | publisher={IEEE} 45 | } 46 | ``` 47 | -------------------------------------------------------------------------------- /distributions.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import torch.nn.functional as F 4 | from utils import init 5 | from torch.utils.data import WeightedRandomSampler 6 | from torch.distributions import Categorical 7 | 8 | class _Categorical(Categorical): 9 | """ 10 | a subclass of torch.distributions.Categorical 11 | it adds a gumbel softmax sampling method (gumbel_softmax_sample) 12 | and a mode method for greedy (argmax) sampling 13 | """ 14 | 15 | def __init__(self, _logits): 16 | super(_Categorical, self).__init__(logits=_logits) 17 | self._logits = self.logits 18 | self.weighted_sampler = WeightedRandomSampler 19 | 20 | def gumbel_softmax_sample(self, tau, device): 21 | dist = F.gumbel_softmax(self._logits, tau=tau, hard=False) 22 | action = torch.tensor(list(self.weighted_sampler(dist, 1, replacement=False))).to(device) 23 | return action.squeeze(-1) 24 | 25 | def mode(self): 26 | return torch.argmax(self._logits, dim=-1, keepdim=False) 27 | 28 | 29 | class MultiHeadCategorical(nn.Module): 30 | """ 31 | define a multi-head Categorical distribution, one head per discrete action 32 | --init: 33 | num_inputs: input feature dim 34 | dim_vec: a list for dim of each action space, e.g. 
[2,3,5], 2-dim for action1, 3-dim for action2, 5-dim for action3 35 | device: running device 36 | --forward: 37 | inputs: flatten input feature 38 | """ 39 | 40 | # @torchsnooper.snoop() 41 | def __init__(self, num_inputs, action_num, action_dim, device): 42 | super(MultiHeadCategorical, self).__init__() 43 | init_ = lambda m: init(m, 44 | nn.init.orthogonal_, 45 | lambda x: nn.init.constant_(x, 0), 46 | gain=0.01) 47 | self.action_num = action_dim 48 | self.linear_list = torch.nn.ModuleList( 49 | [init_(nn.Linear(num_inputs, action_dim).to(device)) for _ in range(action_num)]) 50 | self.action_num = action_num 51 | self.logits_head = [] 52 | self.weight_sample = WeightedRandomSampler 53 | self.device = device 54 | self.categorical_list = [] 55 | self.train() 56 | 57 | def forward(self, inputs): 58 | self.categorical_list = [_Categorical(linear(inputs)) for linear in self.linear_list] 59 | 60 | def gumbel_softmax_sample(self, tau): 61 | action = torch.cat([p.gumbel_softmax_sample(tau, self.device) for p in self.categorical_list]) 62 | return action 63 | 64 | @property 65 | def probs(self): 66 | if self.action_num == 1: 67 | return self.categorical_list[0].probs 68 | else: 69 | return torch.cat([p.probs.unsqueeze(-1) for p in self.categorical_list], dim=-1) 70 | 71 | def log_probs(self, action): 72 | if self.action_num == 1: 73 | return self.categorical_list[0].log_prob(action) 74 | else: 75 | return torch.cat([p.log_prob(a).unsqueeze(-1) for a, p in zip(action, self.categorical_list)], dim=-1) 76 | 77 | def mode(self): 78 | if self.action_num == 1: 79 | return self.categorical_list[0].mode() 80 | else: 81 | return torch.cat([p.mode() for p in self.categorical_list]) 82 | 83 | def sample(self): 84 | if self.action_num == 1: 85 | return self.categorical_list[0].sample() 86 | else: 87 | return torch.cat([p.sample() for p in self.categorical_list]) 88 | 89 | def entropy(self): 90 | if self.action_num == 1: 91 | return self.categorical_list[0].entropy() 92 | else: 93 | return torch.cat([p.entropy() for p in self.categorical_list]) 94 | -------------------------------------------------------------------------------- /env_setting.py: -------------------------------------------------------------------------------- 1 | class Setting(object): 2 | def __init__(self): 3 | self.V = { 4 | 'relationship': [ 5 | [0.11, 0.0, 0.11, 0.11, 0.11, 0.0, 0.22, 0.11, 0.22, 0.0, ], 6 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], 7 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], 8 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], 9 | [0.11, 0.0, 0.11, 0.11, 0.11, 0.0, 0.22, 0.11, 0.22, 0.0, ], 10 | [0.11, 0.0, 0.11, 0.11, 0.11, 0.0, 0.22, 0.11, 0.22, 0.0, ], 11 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ], 12 | [0.22, 0.0, 0.22, 0.22, 0.22, 0.0, 0.44, 0.22, 0.44, 0.0, ], 13 | [0.11, 0.0, 0.11, 0.11, 0.11, 0.0, 0.22, 0.11, 0.22, 0.0, ], 14 | [0.33, 0.0, 0.33, 0.33, 0.33, 0.0, 0.67, 0.33, 0.67, 0.0, ], 15 | ], 16 | 'cost': [5, 7, 9, 11, 13, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9, 5, 6, 7, 8, 9], 17 | 'R': [15, 15.5, 16, 16.5, 17, 17.5, 18, 18.5, 19, 19.5, 20, 20.5, 21, 21.5], 18 | 'prob': [0.1, 0.2, 0.3, 0.4], 19 | 'task_num': 4, 20 | 'quality': [10, 20, 30, 40, 50, 10, 20, 30, 40, 50, 10, 20, 30, 40, 50, 10, 20, 30, 40, 50, 10, 20, 30, 40, 21 | 50], 22 | 'user_battery_budget': [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 23 | 50, 50, 50, 50, 50, 50, 50, 50, 50] 24 | } 25 | 
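Since `env_setting.py` is filled in by hand (the README asks you to paste the generated E-R graph into `self.V['relationship']`), a quick consistency check before training can catch size mismatches early. The sketch below is a hypothetical helper, not part of the original repository; it only relies on how `environment.py` indexes the `Setting` fields.

```python
# check_setting.py -- hypothetical helper, not part of the original repo.
# Verifies that the hand-edited Setting covers the chosen --user-num before training.
from env_setting import Setting


def check_setting(user_num):
    v = Setting().V
    rel = v['relationship']
    # environment.py reads relationship[i][j], cost[i] and user_battery_budget[i] for i, j < user_num
    assert len(rel) >= user_num and all(len(row) >= user_num for row in rel), \
        "'relationship' must be at least user_num x user_num"
    assert len(v['cost']) >= user_num, "need a unit cost per user"
    assert len(v['user_battery_budget']) >= user_num, "need a battery budget per user"
    # tasks are sampled with np.random.choice(task_num, p=prob), so the probabilities must sum to 1
    task_num = v['task_num']
    assert len(v['prob']) >= task_num and abs(sum(v['prob'][:task_num]) - 1.0) < 1e-6, \
        "task probabilities must cover task_num entries and sum to 1"
    assert len(v['R']) >= task_num and len(v['quality']) >= task_num, \
        "'R' and 'quality' must provide one entry per task"
    print('Setting looks consistent for', user_num, 'users')


if __name__ == '__main__':
    check_setting(10)
```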
-------------------------------------------------------------------------------- /environment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | class Env(object): 6 | def __init__(self, user_num, state_dim, device, env_args): 7 | self.device = device 8 | 9 | self.user_num = user_num 10 | self.action_dim = user_num 11 | self.state_dim = state_dim 12 | 13 | relationship = env_args.V['relationship'] 14 | unit_cost = env_args.V['cost'] 15 | 16 | self.battery_budget = env_args.V['user_battery_budget'] 17 | self.task_num = env_args.V['task_num'] 18 | self.task_budget = env_args.V['R'] 19 | prob = env_args.V['prob'] 20 | self.prob = np.zeros(self.task_num) 21 | for i in range(self.task_num): 22 | self.prob[i] = prob[i] 23 | 24 | self.unit_cost = np.zeros(user_num) 25 | for i in range(user_num): 26 | self.unit_cost[i] = unit_cost[i] 27 | 28 | self.remain_energy = np.zeros(user_num) 29 | for i in range(user_num): 30 | self.remain_energy[i] = self.battery_budget[i] 31 | 32 | self.relationship = np.zeros((user_num, user_num)) 33 | for i in range(user_num): 34 | for j in range(user_num): 35 | self.relationship[i][j] = relationship[i][j] 36 | 37 | self.server_reward = 0 38 | self.total_server_reward = [] 39 | self.beta = np.zeros(user_num) 40 | beta = env_args.V['quality'] 41 | for i in range(self.task_num): 42 | self.beta[i] = beta[i] 43 | 44 | self.R = 0 45 | self.task_index = 0 46 | self.epoch = 0 47 | self.total_obtain_sensing_data = 0 48 | # self.max_completion_ratio = 0 49 | 50 | self.complete_task = 0 51 | self.total_task = 0 52 | 53 | self.task_cnt = np.zeros(self.task_num) 54 | self.obtain_sensing_data = np.zeros(self.task_num) 55 | 56 | self.final_contrib_data = 0 57 | 58 | self.intrinsic_reward = 0 59 | self.extrinsic_reward = 0 60 | 61 | def get_collected_data(self): 62 | return self.final_contrib_data 63 | 64 | def close(self): 65 | return None 66 | 67 | def plot_server_reward(self, episode): 68 | server_reward = self.server_reward 69 | self.server_reward = 0 70 | return server_reward 71 | 72 | def plot_complete_ratio(self, episode): 73 | obtain_sensing_data_list = [] 74 | for i in range(self.task_num): 75 | obtain_sensing_data = self.obtain_sensing_data[i] / self.task_cnt[i] 76 | obtain_sensing_data_list.append(obtain_sensing_data) 77 | self.total_obtain_sensing_data = 0 78 | self.epoch = 0 79 | self.obtain_sensing_data = np.zeros(self.task_num) 80 | self.task_cnt = np.zeros(self.task_num) 81 | 82 | def reset(self): 83 | for i in range(self.user_num): 84 | self.remain_energy[i] = self.battery_budget[i] 85 | 86 | self.task_index = np.random.choice(self.task_num, 1, False, self.prob)[0] 87 | # self.task_index = 0 88 | self.R = self.task_budget[self.task_index] 89 | 90 | state = np.zeros((self.user_num, self.state_dim)) 91 | for i in range(self.user_num): 92 | state[i, self.user_num:self.user_num + 1] = self.unit_cost[i] / 10 93 | state[i, self.user_num + 1:self.user_num + 2] = self.remain_energy[i] / 50 94 | state[i, self.user_num + 2:self.user_num + 3] = self.R / 10 95 | 96 | return torch.from_numpy(state).float().to(self.device) 97 | 98 | def get_completion_ratio(self): 99 | completion_ratio = self.complete_task / self.total_task 100 | self.complete_task = 0 101 | self.total_task = 0 102 | return completion_ratio 103 | 104 | def get_reward(self): 105 | extrinsic_reward = self.extrinsic_reward / self.user_num 106 | intrinsic_reward = self.intrinsic_reward / self.user_num 107 | self.extrinsic_reward = 0 
108 | self.intrinsic_reward = 0 109 | return extrinsic_reward, intrinsic_reward 110 | 111 | def step(self, action): 112 | action = 0.2 * action.float().numpy() 113 | 114 | # -------standard action---------------------- 115 | for i in range(self.user_num): 116 | if action[i] > self.remain_energy[i] / self.unit_cost[i]: 117 | action[i] = self.remain_energy[i] / self.unit_cost[i] 118 | phi = np.zeros(self.user_num, 'float') 119 | 120 | for i in range(self.user_num): 121 | for j in range(self.user_num): 122 | phi[i] += self.relationship[i][j] * action[i] * action[j] 123 | total_sensing = action.sum() 124 | 125 | sensing_data = total_sensing 126 | self.total_obtain_sensing_data += sensing_data 127 | self.task_cnt[self.task_index] += 1 128 | self.obtain_sensing_data[self.task_index] += sensing_data 129 | self.final_contrib_data += total_sensing / self.R 130 | self.epoch += 1 131 | 132 | quality_sensing_data = action 133 | total_quality_sensing_data = np.sum(quality_sensing_data) 134 | # print(np.shape(action)) 135 | reward = np.zeros(self.user_num) 136 | self.server_reward += total_quality_sensing_data * self.beta[self.task_index] - self.R 137 | intrinsic_reward = 0 138 | extrinsic_reward = 0 139 | self.total_task += 1 140 | if total_sensing > 0.001: 141 | self.complete_task += 1 142 | for i in range(self.user_num): 143 | # reward[i] = action[i] / total_sensing * self.R - self.unit_cost[i] * action[i] + phi[i] 144 | reward[i] = quality_sensing_data[i] / total_quality_sensing_data * self.R - self.unit_cost[i] * action[ 145 | i] + phi[i] 146 | extrinsic_reward += quality_sensing_data[i] / total_quality_sensing_data * self.R - self.unit_cost[i] * \ 147 | action[i] 148 | intrinsic_reward += phi[i] 149 | self.remain_energy[i] -= self.unit_cost[i] * action[i] 150 | if self.remain_energy[i] <= 0.0001: 151 | self.remain_energy[i] = 0 152 | self.intrinsic_reward += intrinsic_reward 153 | self.extrinsic_reward += extrinsic_reward 154 | self.task_index = np.random.choice(self.task_num, 1, False, self.prob)[0] 155 | # self.task_index = 0 156 | self.R = self.task_budget[self.task_index] 157 | 158 | state = np.zeros((self.user_num, self.state_dim)) 159 | for i in range(self.user_num): 160 | state[i, 0:self.user_num] = action 161 | state[i, self.user_num:self.user_num + 1] = self.unit_cost[i] / 10 162 | state[i, self.user_num + 1:self.user_num + 2] = self.remain_energy[i] / 50 163 | state[i, self.user_num + 2:self.user_num + 3] = self.R / 10 164 | # reward = np.mean(reward, keepdims=True) 165 | 166 | done = False 167 | 168 | return torch.from_numpy(state).float().to(self.device), torch.from_numpy(reward).float(), done 169 | -------------------------------------------------------------------------------- /generate_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | mu = 0.9 4 | user_number = 10 5 | G = np.zeros((user_number, user_number), dtype=float)  # np.float was removed from recent NumPy; the builtin float is equivalent here 6 | for i in range(user_number): 7 | for j in range(user_number): 8 | if i == j: 9 | G[i][j] = 0 10 | else: 11 | p = np.random.random() 12 | if p <= mu: 13 | G[i][j] = 1 14 | else: 15 | G[i][j] = 0 16 | for i in range(user_number): 17 | print('[', end='') 18 | for j in range(user_number): 19 | print(G[i][j], end=', ') 20 | print('], ') 21 | # G = np.array([ 22 | # [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, ], 23 | # [1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, ], 24 | # [1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ], 25 | # [1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 
], 26 | # [0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ], 27 | # [1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, ], 28 | # [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, ], 29 | # [0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, ], 30 | # [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, ], 31 | # [1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, ], 32 | # ]) 33 | 34 | I = np.ones((user_number, 1)) 35 | K = np.dot(G, I) 36 | L = np.dot(G.transpose(), I) 37 | g = np.sum(G) 38 | d = g / user_number 39 | H = np.dot(K, L.transpose()) / g 40 | for i in range(user_number): 41 | print('[', end='') 42 | for j in range(user_number): 43 | # if i == j: 44 | # print('0.0', end=', ') 45 | # else: 46 | print(np.round(H[i][j], 2), end=', ') 47 | print('],') -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from distributions import MultiHeadCategorical 2 | import torch 3 | from utils import init 4 | import torch.nn as nn 5 | 6 | 7 | class Flatten(nn.Module): 8 | def forward(self, x): 9 | return x.view(x.size(0), -1) 10 | 11 | 12 | class Model(nn.Module): 13 | def __init__(self, state_dim, action_dim, device, trainable=True, hidsize=128): 14 | super(Model, self).__init__() 15 | init_ = lambda m: init(m, 16 | nn.init.orthogonal_, 17 | lambda x: nn.init.constant_(x, 0)) 18 | # feature extract 19 | self.base = nn.Sequential( 20 | init_(nn.Linear(state_dim, 128)), 21 | nn.ReLU(), 22 | init_(nn.Linear(128, hidsize)), 23 | nn.ReLU() 24 | ).to(device) 25 | 26 | # actor 27 | self.dist = MultiHeadCategorical(hidsize, 1, action_dim, device) 28 | # # critic 29 | # self.critic = nn.Sequential( 30 | # init_(nn.Linear(hidsize, 1)) 31 | # ).to(device) 32 | # critic 33 | self.q_network = nn.Sequential( 34 | init_(nn.Linear(hidsize, action_dim)), 35 | ).to(device) 36 | self.device = device 37 | self.identity = torch.eye(action_dim).to(device) 38 | if trainable: 39 | self.train() 40 | else: 41 | self.eval() 42 | 43 | # @torchsnooper.snoop() 44 | def act(self, inputs): 45 | with torch.no_grad(): 46 | obs_feature = self.base(inputs) 47 | 48 | # value = self.critic(obs_feature) 49 | self.dist(obs_feature) 50 | action = self.dist.sample() 51 | action_log_probs = self.dist.log_probs(action) 52 | action_log_probs = action_log_probs.mean(-1, keepdim=True) 53 | 54 | q_value = self.q_network(obs_feature) 55 | # mean 56 | value = torch.sum(self.dist.probs * q_value, -1, keepdim=True) 57 | return value, action.squeeze(), action_log_probs 58 | 59 | def get_value(self, inputs): 60 | obs_feature = self.base(inputs) 61 | # value = self.critic(obs_feature) 62 | self.dist(obs_feature) 63 | q_value = self.q_network(obs_feature) 64 | value = torch.sum(self.dist.probs * q_value, -1, keepdim=True) 65 | return value 66 | 67 | def evaluate_actions(self, inputs, action): 68 | obs_features = self.base(inputs) 69 | # value = self.critic(obs_features) 70 | q_value = self.q_network(obs_features) 71 | index = self.identity[action.squeeze(-1)] 72 | value = torch.sum(q_value * index, -1).unsqueeze(-1) 73 | 74 | self.dist(obs_features) 75 | 76 | action_log_probs = self.dist.log_probs(action).mean(-1, keepdim=True) 77 | 78 | dist_entropy = self.dist.entropy().mean() 79 | 80 | return value, action_log_probs, dist_entropy 81 | 82 | def print_grad(self): 83 | for name, p in self.named_parameters(): 84 | print('name: ', name, ' value: ', p.grad.mean(), 'p.requires_grad', p.requires_grad) 85 | 86 | 87 | 
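`model.py` couples a softmax policy head with a Q-value head and reports the state value as the probability-weighted average of the Q outputs, i.e. V(s) = sum_a pi(a|s) * Q(s, a), which is what `act()` and `get_value()` return. Below is a minimal usage sketch, not taken from the repository: it assumes a CPU device and `user_num = 10` (so `state_dim = user_num + 3 = 13`), and mirrors how `ppo_agent.py` instantiates `Model` with 6 discrete actions, which `environment.step()` later scales by 0.2 into sensing levels in [0, 1].

```python
# Minimal usage sketch (assumptions: CPU device, user_num = 10 -> state_dim = 13).
import torch
from model import Model

state_dim, action_dim = 13, 6              # 6 sensing levels, as in PPOAgent
net = Model(state_dim, action_dim, 'cpu')

obs = torch.zeros(state_dim)               # one flattened per-vehicle observation
value, action, log_prob = net.act(obs)     # value = sum_a pi(a|s) * Q(s, a)
print(float(value), int(action), float(log_prob))
```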
-------------------------------------------------------------------------------- /model_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import torch 4 | from environment import Env 5 | from utils import seed_torch 6 | from ppo_agent import PPOAgent 7 | import csv 8 | 9 | 10 | def model_test(args, env_args): 11 | seed_torch(args.seed) 12 | 13 | print('in test process') 14 | os.environ['OMP_NUM_THREADS'] = '1' 15 | 16 | if args.test_device_num == -1: 17 | test_device_name = 'cpu' 18 | 19 | else: 20 | test_device_name = 'cuda:' + str(args.test_device_num) 21 | torch.cuda.set_device(args.test_device_num) 22 | 23 | # -------------get environment information------------ 24 | ppo_agent = [] 25 | for i in range(args.user_num): 26 | ppo_agent.append( 27 | PPOAgent(args.state_dim, test_device_name, args.lr, args.exploration_steps, 28 | args.mini_batch_num, args.use_gae, args.gamma, args.gae_param, args.ppo_epoch, 29 | args.clip, args.value_coeff, args.clip_coeff, args.ent_coeff)) 30 | 31 | ori_device_name = 'cuda:' + str(args.device_num) 32 | model_path = os.path.join(args.root_path, 'model') 33 | for i, agent in enumerate(ppo_agent): 34 | ppo_model_path = os.path.join(model_path, 'ppo_model' + str(i) + '.pt') 35 | agent.load_model(ppo_model_path, ori_device_name, test_device_name) 36 | agent.local_ppo_model.eval() 37 | 38 | done_time = 0 39 | episode_length = 0 40 | 41 | user_num = args.user_num 42 | env = Env(user_num, args.state_dim, test_device_name, env_args) 43 | 44 | action = torch.zeros(user_num, dtype=torch.long) 45 | final_av_reward = 0 46 | final_av_server_reward = 0 47 | test_file_path = os.path.join(args.root_path, 'test_file') 48 | if not os.path.exists(test_file_path): 49 | os.mkdir(test_file_path) 50 | test_result_profile = open(test_file_path + '/test_result.csv', 'w', newline='') 51 | test_writer = csv.writer(test_result_profile) 52 | 53 | av_ext_reward = 0 54 | av_int_rewards = 0 55 | 56 | av_completion_ratio = 0 57 | 58 | result_path = test_file_path + '/test_result.npz' 59 | # ----------------------------------------- 60 | 61 | all_remaining_energy = [] 62 | while True: 63 | if episode_length >= args.max_test_length: 64 | print('training over') 65 | break 66 | 67 | print('---------------in episode ', episode_length, '-----------------------') 68 | 69 | step = 0 70 | done = True 71 | av_reward = 0 72 | av_action = torch.zeros(user_num) 73 | 74 | obs = env.reset() 75 | interact_time = 0 76 | remaining_energy = [] 77 | remaining_energy.append(env.remain_energy.mean()) 78 | while step < args.exploration_steps: 79 | interact_time += 1 80 | # ----------------sample actions(no grad)------------------------ 81 | with torch.no_grad(): 82 | for i, agent in enumerate(ppo_agent): 83 | _, action[i], _ = agent.act(obs[i]) 84 | obs, reward, done = env.step(action) 85 | remaining_energy.append(env.remain_energy.mean()) 86 | av_reward += np.mean(reward.numpy()) 87 | av_action += 0.2 * action.float() 88 | 89 | step = step + 1 90 | done = interact_time >= args.max_interact_time 91 | if done: 92 | # env.draw_remain_energy(done_time) 93 | done_time += 1 94 | interact_time = 0 95 | obs = env.reset() 96 | if len(remaining_energy) == args.max_interact_time + 1 and len(all_remaining_energy) < 100: 97 | all_remaining_energy.append(remaining_energy) 98 | remaining_energy = [] 99 | remaining_energy.append(env.remain_energy.mean()) 100 | for i, agent in enumerate(ppo_agent): 101 | agent.reset(obs[i]) 102 | 103 | continue 104 
| av_reward /= args.exploration_steps 105 | ext_reward, int_reward = env.get_reward() 106 | 107 | av_ext_reward += ext_reward / args.exploration_steps 108 | av_int_rewards += int_reward / args.exploration_steps 109 | 110 | completion_ratio = env.get_completion_ratio() 111 | av_completion_ratio += completion_ratio 112 | 113 | final_av_reward += av_reward 114 | final_av_server_reward += env.plot_server_reward(episode_length) / args.exploration_steps 115 | episode_length += 1 116 | 117 | test_writer.writerow( 118 | ['vehicle reward', 'server reward', 'extrinsic reward', 'intrinsic reward', 'completion ratio']) 119 | test_writer.writerow([final_av_reward / args.max_test_length, final_av_server_reward / args.max_test_length, 120 | av_ext_reward / args.max_test_length, av_int_rewards / args.max_test_length, 121 | av_completion_ratio / args.max_test_length]) 122 | test_result_profile.close() 123 | np.savez(result_path, np.asarray(all_remaining_energy)) 124 | print('Finish! Results saved in ', args.root_path) 125 | 126 | -------------------------------------------------------------------------------- /ppo_agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from storage import RolloutStorage 3 | from model import Model 4 | import torch.optim as optim 5 | 6 | 7 | class PPOAgent(): 8 | def __init__(self, state_dim, device, lr, exploration_steps, mini_batch_num, use_gae, gamma, 9 | gae_param, ppo_epoch, clip, value_coeff, clip_coeff, ent_coeff): 10 | self.local_ppo_model = Model(state_dim, 6, device) 11 | self.optimizer = optim.Adam(list(self.local_ppo_model.parameters()), lr=lr) 12 | self.rollout = RolloutStorage(exploration_steps, mini_batch_num, state_dim) 13 | self.rollout.to(device) 14 | self.use_gae = use_gae 15 | self.gamma = gamma 16 | self.gae_param = gae_param 17 | self.ppo_epoch = ppo_epoch 18 | self.clip = clip 19 | self.value_coeff = value_coeff 20 | self.clip_coeff = clip_coeff 21 | self.ent_coeff = ent_coeff 22 | 23 | def act(self, obs): 24 | value, action, action_log_probs = self.local_ppo_model.act(obs) 25 | return value, action, action_log_probs 26 | 27 | def insert(self, obs, action, action_log_probs, value, reward, masks): 28 | self.rollout.insert(obs, action, action_log_probs, value, reward, masks) 29 | 30 | def after_update(self, obs): 31 | self.rollout.after_update(obs) 32 | 33 | def load_model(self, path, device, test_device): 34 | self.local_ppo_model.load_state_dict(torch.load(path, map_location={device: test_device})) 35 | 36 | def reset(self, obs): 37 | self.rollout.reset(obs) 38 | 39 | def update(self, done): 40 | beta = 0.2 41 | with torch.no_grad(): 42 | if done: 43 | next_value = torch.zeros(1) 44 | else: 45 | next_value = self.local_ppo_model.get_value(self.rollout.obs[-1:]) 46 | 47 | self.rollout.compute_returns(next_value.detach(), self.use_gae, self.gamma, self.gae_param) 48 | 49 | advantages = self.rollout.returns[:-1] - self.rollout.value_preds[:-1] 50 | 51 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) 52 | 53 | av_value_loss = 0 54 | av_policy_loss = 0 55 | av_ent_loss = 0 56 | loss_cnt = 0 57 | 58 | for _ in range(self.ppo_epoch): 59 | data_generator = self.rollout.feed_forward_generator(advantages) 60 | for samples in data_generator: 61 | # signal_init = traffic_light.get() 62 | torch.cuda.empty_cache() 63 | obs_batch, next_obs_batch, action_batch, old_values, return_batch, masks_batch, \ 64 | old_action_log_probs, advantages_batch = samples 65 | 66 | cur_values, 
cur_action_log_probs, dist_entropy = self.local_ppo_model.evaluate_actions(obs_batch, 67 | action_batch) 68 | 69 | # ----------use ppo clip to compute loss------------------------ 70 | ratio = torch.exp(cur_action_log_probs - old_action_log_probs) 71 | surr1 = ratio * advantages_batch 72 | surr2 = torch.clamp(ratio, 1.0 - self.clip, 1.0 + self.clip) * advantages_batch 73 | 74 | action_loss = -torch.min(surr1, surr2).mean() 75 | 76 | value_pred_clipped = old_values + (cur_values - old_values).clamp(-self.clip, self.clip) 77 | value_losses = (cur_values - return_batch).pow(2) 78 | value_losses_clipped = (value_pred_clipped - return_batch).pow(2) 79 | value_loss = 0.5 * torch.max(value_losses, value_losses_clipped).mean() 80 | # value_loss = torch.mean((return_batch - cur_values)**2) 81 | 82 | value_loss = value_loss * self.value_coeff 83 | action_loss = action_loss * self.clip_coeff 84 | ent_loss = dist_entropy * self.ent_coeff 85 | # ------------------ for curiosity driven-------------------------- 86 | total_loss = value_loss + action_loss - ent_loss 87 | self.local_ppo_model.zero_grad() 88 | self.optimizer.zero_grad() 89 | total_loss.backward() 90 | self.optimizer.step() 91 | 92 | av_value_loss += float(value_loss) 93 | av_policy_loss += float(action_loss) 94 | av_ent_loss += float(ent_loss) 95 | loss_cnt += 1 96 | 97 | return av_value_loss / loss_cnt, av_policy_loss / loss_cnt, av_ent_loss / loss_cnt 98 | -------------------------------------------------------------------------------- /storage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 3 | 4 | 5 | def _flatten_helper(T, N, _tensor): 6 | return _tensor.view(T * N, *_tensor.size()[2:]) 7 | 8 | 9 | class RolloutStorage(object): 10 | def __init__(self, num_steps, mini_batch_num, obs_shape): 11 | 12 | self.mini_batch_num = mini_batch_num 13 | self.obs = torch.zeros(num_steps + 1, obs_shape) 14 | self.rewards = torch.zeros(num_steps, 1) 15 | self.value_preds = torch.zeros(num_steps + 1, 1) 16 | self.returns = torch.zeros(num_steps + 1, 1) 17 | self.action_log_probs = torch.zeros(num_steps, 1) 18 | self.action = torch.zeros(num_steps, 1, dtype=torch.long) 19 | self.masks = torch.ones(num_steps + 1, 1) 20 | self.num_steps = num_steps 21 | self.step = 0 22 | 23 | def to(self, device): 24 | self.obs = self.obs.to(device) 25 | self.rewards = self.rewards.to(device) 26 | self.value_preds = self.value_preds.to(device) 27 | self.returns = self.returns.to(device) 28 | self.action_log_probs = self.action_log_probs.to(device) 29 | self.action = self.action.to(device) 30 | self.masks = self.masks.to(device) 31 | 32 | def reset(self, obs): 33 | self.obs[self.step].copy_(obs.squeeze(0)) 34 | self.masks[self.step].copy_(torch.zeros(1)) 35 | 36 | def insert(self, obs, actions, action_log_probs, value_preds, rewards, masks): 37 | self.action[self.step].copy_(actions) 38 | self.action_log_probs[self.step].copy_(action_log_probs.squeeze()) 39 | self.value_preds[self.step].copy_(value_preds.squeeze()) 40 | self.rewards[self.step].copy_(rewards.squeeze()) 41 | self.obs[self.step + 1].copy_(obs.squeeze()) 42 | self.masks[self.step + 1].copy_(masks.squeeze()) 43 | 44 | self.step = self.step + 1 45 | 46 | def update_reward(self, intrinsic_reward): 47 | intrinsic_reward = intrinsic_reward.clamp(-1, 1) 48 | num_steps = intrinsic_reward.size()[0] 49 | # print('self.rewards', self.rewards, 'intrinsic_reward', intrinsic_reward) 50 | for i 
in range(num_steps): 51 | self.rewards[i] = self.rewards[i] + intrinsic_reward[i] 52 | # self.rewards = self.rewards.clamp(-1, 1) 53 | 54 | def icm_tuple(self): 55 | obs = self.obs[:-1].clone().detach() 56 | next_obs = self.obs[1:].clone().detach() 57 | action = self.action.clone().detach() 58 | return obs, next_obs, action 59 | 60 | def after_update(self, obs): 61 | self.step = 0 62 | self.obs[0].copy_(obs.squeeze()) 63 | self.masks[0].copy_(torch.zeros(1)) 64 | 65 | def compute_returns(self, next_value, use_gae, gamma, tau): 66 | if use_gae: 67 | self.value_preds[-1] = next_value 68 | gae = 0 69 | for step in reversed(range(self.rewards.size(0))): 70 | delta = self.rewards[step] + gamma * self.value_preds[step + 1] * self.masks[step + 1] - \ 71 | self.value_preds[step] 72 | gae = delta + gamma * tau * self.masks[step + 1] * gae 73 | self.returns[step] = gae + self.value_preds[step] 74 | else: 75 | self.returns[-1] = next_value 76 | for step in reversed(range(self.rewards.size(0))): 77 | self.returns[step] = self.returns[step + 1] * gamma * self.masks[step + 1] + self.rewards[step] 78 | 79 | def feed_forward_generator(self, advantages): 80 | mini_batch_size = self.num_steps // self.mini_batch_num 81 | sampler = BatchSampler(SubsetRandomSampler(range(self.num_steps)), mini_batch_size, drop_last=False) 82 | for indices in sampler: 83 | next_indices = [indice + 1 for indice in indices] 84 | obs_batch = self.obs[indices] 85 | next_obs_batch = self.obs[next_indices] 86 | action_batch = self.action[indices] 87 | value_pred_batch = self.value_preds[indices] 88 | return_batch = self.returns[indices] 89 | old_action_log_probs_batch = self.action_log_probs[indices] 90 | advantages_batch = advantages[indices] 91 | masks_batch = self.masks[indices] 92 | yield obs_batch, next_obs_batch, action_batch, value_pred_batch, return_batch, \ 93 | masks_batch, old_action_log_probs_batch, advantages_batch 94 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import torch 4 | from environment import Env 5 | from utils import seed_torch 6 | from ppo_agent import PPOAgent 7 | import time 8 | from env_setting import Setting 9 | import json 10 | from model_test import model_test 11 | import argparse 12 | 13 | 14 | def main(args, env_args): 15 | seed_torch(args.seed) 16 | os.environ['OMP_NUM_THREADS'] = '1' 17 | if args.use_cuda: 18 | torch.cuda.set_device(args.device_num) 19 | device = torch.device('cuda' if args.use_cuda else 'cpu') 20 | # -------------get environment information------------ 21 | 22 | ppo_agent = [] 23 | for i in range(args.user_num): 24 | ppo_agent.append( 25 | PPOAgent(args.state_dim, device, args.lr, args.exploration_steps, 26 | args.mini_batch_num, args.use_gae, args.gamma, args.gae_param, args.ppo_epoch, 27 | args.clip, args.value_coeff, args.clip_coeff, args.ent_coeff)) 28 | 29 | done_time = 0 30 | episode_length = 0 31 | 32 | user_num = args.user_num 33 | env = Env(user_num, args.state_dim, device, env_args) 34 | 35 | action = torch.zeros(user_num, dtype=torch.long) 36 | value = torch.zeros(user_num) 37 | action_log_probs = torch.zeros(user_num) 38 | file_path = os.path.join(args.root_path, 'file') 39 | result_path = file_path + '/result.npz' 40 | model_path = os.path.join(args.root_path, 'model') 41 | os.mkdir(model_path) 42 | rewards = [] 43 | server_rewards = [] 44 | user_rewards = [[] for _ in range(user_num)] 45 | 
completion_ratio = [[] for _ in range(env.task_num)] 46 | 47 | ext_rewards = [] 48 | int_rewards = [] 49 | while True: 50 | if episode_length >= args.max_episode_length: 51 | print('training over') 52 | break 53 | 54 | print('---------------in episode ', episode_length, '-----------------------') 55 | 56 | step = 0 57 | done = True 58 | av_reward = torch.zeros(user_num) 59 | av_action = torch.zeros(user_num) 60 | 61 | obs = env.reset() 62 | for i, agent in enumerate(ppo_agent): 63 | agent.after_update(obs[i]) 64 | 65 | interact_time = 0 66 | sum_reward = 0.0 67 | sum_user_reward = np.zeros(user_num) 68 | while step < args.exploration_steps: 69 | interact_time += 1 70 | # ----------------sample actions(no grad)------------------------ 71 | with torch.no_grad(): 72 | for i, agent in enumerate(ppo_agent): 73 | value[i], action[i], action_log_probs[i] = agent.act(obs[i]) 74 | obs, reward, done = env.step(action) 75 | sum_reward += reward.numpy().mean() 76 | sum_user_reward += reward.numpy() 77 | 78 | av_reward += reward 79 | av_action += 0.2 * action.float() 80 | done = interact_time >= args.max_interact_time 81 | # ---------judge if game over -------------------- 82 | masks = torch.tensor([[0.0] if done else [1.0]]) 83 | # ----------add to memory --------------------------- 84 | for i, agent in enumerate(ppo_agent): 85 | agent.insert(obs[i].detach(), action[i].detach(), action_log_probs[i].detach(), value[i].detach(), 86 | reward[i].detach(), masks.detach()) 87 | step = step + 1 88 | 89 | if done: 90 | done_time += 1 91 | interact_time = 0 92 | obs = env.reset() 93 | for i, agent in enumerate(ppo_agent): 94 | agent.reset(obs[i]) 95 | 96 | continue 97 | 98 | server_reward = env.plot_server_reward(episode_length) / args.exploration_steps 99 | server_rewards.append(server_reward) 100 | ext_reward, int_reward = env.get_reward() 101 | ext_rewards.append(ext_reward / args.exploration_steps) 102 | int_rewards.append(int_reward / args.exploration_steps) 103 | # av_completion_ratio = env.get_completion_ratio() / args.exploration_steps 104 | 105 | # for i in range(env.task_num): 106 | # completion_ratio[i].append(av_completion_ratio[i]) 107 | av_value_loss = 0 108 | av_policy_loss = 0 109 | av_ent_loss = 0 110 | 111 | rewards.append(sum_reward / args.exploration_steps) 112 | for i in range(user_num): 113 | user_rewards[i].append(sum_user_reward[i] / args.exploration_steps) 114 | 115 | for i, agent in enumerate(ppo_agent): 116 | value_loss, policy_loss, ent_loss = agent.update(done) 117 | av_value_loss += value_loss 118 | av_policy_loss += policy_loss 119 | av_ent_loss += ent_loss 120 | 121 | av_value_loss /= user_num 122 | av_policy_loss /= user_num 123 | av_ent_loss /= user_num 124 | av_reward /= args.exploration_steps 125 | av_action /= args.exploration_steps 126 | episode_length += 1 127 | 128 | np.savez(result_path, np.asarray(rewards), np.asarray(server_rewards), np.asarray(ext_rewards), 129 | np.asarray(int_rewards)) 130 | # reward_profile.close() 131 | for i, ppo in enumerate(ppo_agent): 132 | ppo_model_path = os.path.join(model_path, 'ppo_model' + str(i) + '.pt') 133 | torch.save(ppo.local_ppo_model.state_dict(), ppo_model_path) 134 | model_test(args, env_args) 135 | 136 | 137 | if __name__ == "__main__": 138 | parser = argparse.ArgumentParser(description='Hyper-parameter setting for DRL-SIM.') 139 | # ------------------------------------- parameters that must be configured --------------------------------- 140 | parser.add_argument('--root-path', type=str, required=True, help='the path to save 
your results and models') 141 | parser.add_argument('--user-num', type=int, required=True, help='number of participating vehicle users') 142 | 143 | # ------------------------------------- parameters that can be changed according to your need -------------- 144 | parser.add_argument('--use-cuda', type=bool, default=True, help='use cuda device to train models or not')  # note: argparse parses any non-empty value as True here 145 | parser.add_argument('--device-num', type=int, default=0, help='cuda device number for training') 146 | parser.add_argument('--test-device-num', type=int, default=0, help='cuda device number for testing') 147 | parser.add_argument('--max-episode-length', type=int, default=1000) 148 | parser.add_argument('--max-test-length', type=int, default=100) 149 | parser.add_argument('--exploration-steps', type=int, default=500) 150 | parser.add_argument('--mini-batch-num', type=int, default=4) 151 | parser.add_argument('--seed', type=int, default=1) 152 | parser.add_argument('--ppo-epoch', type=int, default=4) 153 | parser.add_argument('--max-interact-time', type=int, default=64) 154 | 155 | # ------------------------------------- parameters that are not recommended to be changed --------------------- 156 | parser.add_argument('--lr', type=float, default=0.0003, help='optimizer learning rate') 157 | parser.add_argument('--clip', type=float, default=0.1) 158 | parser.add_argument('--ent-coeff', type=float, default=0.01) 159 | parser.add_argument('--value-coeff', type=float, default=0.1) 160 | parser.add_argument('--clip-coeff', type=float, default=1.0) 161 | parser.add_argument('--use-gae', type=bool, default=True) 162 | parser.add_argument('--gamma', type=float, default=0.99) 163 | parser.add_argument('--gae_param', type=float, default=0.95) 164 | 165 | args = parser.parse_args() 166 | args.state_dim = args.user_num + 3 167 | local_time = str(time.strftime("%Y/%m-%d/%H-%M-%S", time.localtime())) 168 | args.root_path = os.path.join(args.root_path, local_time) 169 | file_path = os.path.join(args.root_path, 'file') 170 | if not os.path.exists(file_path): 171 | os.makedirs(file_path) 172 | 173 | with open(os.path.join(file_path, 'agent_args.txt'), 'a') as f: 174 | f.write(json.dumps(args.__dict__)) 175 | 176 | env_args = Setting() 177 | with open(os.path.join(file_path, 'env_args.txt'), 'a') as f: 178 | f.write(json.dumps(env_args.__dict__)) 179 | main(args, env_args) 180 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import random 4 | import numpy as np 5 | import os 6 | 7 | 8 | # Get a render function 9 | def get_render_func(venv): 10 | if hasattr(venv, 'envs'): 11 | return venv.envs[0].render 12 | elif hasattr(venv, 'venv'): 13 | return get_render_func(venv.venv) 14 | elif hasattr(venv, 'env'): 15 | return get_render_func(venv.env) 16 | 17 | return None 18 | 19 | 20 | 21 | # Necessary for my KFAC implementation. 
22 | class AddBias(nn.Module): 23 | def __init__(self, bias): 24 | super(AddBias, self).__init__() 25 | self._bias = nn.Parameter(bias.unsqueeze(1)) 26 | 27 | def forward(self, x): 28 | if x.dim() == 2: 29 | bias = self._bias.t().view(1, -1) 30 | else: 31 | bias = self._bias.t().view(1, -1, 1, 1) 32 | 33 | return x + bias 34 | 35 | 36 | def init(module, weight_init, bias_init, gain=1): 37 | weight_init(module.weight.data, gain=gain) 38 | bias_init(module.bias.data) 39 | return module 40 | 41 | 42 | # https://github.com/openai/baselines/blob/master/baselines/common/tf_util.py#L87 43 | def init_normc_(weight, gain=1): 44 | weight.normal_(0, 1) 45 | weight *= gain / torch.sqrt(weight.pow(2).sum(1, keepdim=True)) 46 | 47 | 48 | def seed_torch(seed=2019): 49 | # python & numpy 50 | random.seed(seed) 51 | np.random.seed(seed) 52 | os.environ['PYTHONHASHSEED'] = str(seed) 53 | 54 | # cpu & gpu 55 | torch.manual_seed(seed) # set seed for cpu 56 | torch.cuda.manual_seed(seed) # set seed for current GPU 57 | torch.cuda.manual_seed_all(seed) # set seed for all GPU 58 | 59 | torch.backends.cudnn.deterministic = True 60 | # torch.backends.benchmark = False 61 | torch.backends.benchmark = True 62 | 63 | --------------------------------------------------------------------------------
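The two helpers in `utils.py` that the rest of the code relies on are `seed_torch`, which seeds Python, NumPy and PyTorch for reproducibility, and `init`, which applies a weight initializer and a bias initializer to a layer and returns it. The following stand-alone sketch is hypothetical (not part of the repository) and simply shows how the two are combined, in the same way `model.py` and `distributions.py` build their orthogonally initialized layers.

```python
# Hypothetical stand-alone example of the utils.py helpers.
import torch.nn as nn
from utils import init, seed_torch

seed_torch(2019)                                 # reproducible weights across runs
layer = init(nn.Linear(16, 4),
             nn.init.orthogonal_,                # weight initializer
             lambda x: nn.init.constant_(x, 0),  # bias initializer
             gain=0.01)                          # small gain, as used for the policy heads
print(layer.weight.norm().item(), layer.bias.abs().sum().item())  # bias sum prints 0.0
```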