├── README.md ├── data └── process.py ├── Parameter.py ├── Environment.py ├── DDPG.py ├── D_DDPG.py └── DPDS.py /README.md: -------------------------------------------------------------------------------- 1 | # Deep PDS Learning for AoI Minimization 2 | Source code for paper [Age-Based Scheduling for Mobile Edge Computing: A Deep Reinforcement Learning Approach](https://ieeexplore.ieee.org/abstract/document/10449431), written in python and tensorflow. 3 | 4 | ## Usage 5 | ```bash 6 | python DPDS.py 7 | python DPDS.py --Alg='dpl' 8 | python DPDS.py --Alg='coo' 9 | python DPDS.py --Alg='lpo' 10 | ``` 11 | The running data are (i) recorded by the tf.summary module and can be viewed in real time by running tensorboard in the `logs` directory and (ii) written into matlab format files (`.mat`) in the `data` directory after the simulation is finished. 12 | 13 | ## Citation 14 | If you find our code helpful, please consider citing our paper. 15 | ``` 16 | @article{he2024age, 17 | title={Age-Based Scheduling for Mobile Edge Computing: A Deep Reinforcement Learning Approach}, 18 | author={He, Xingqiu and You, Chaoqun and Quek, Tony QS}, 19 | journal={IEEE Transactions on Mobile Computing}, 20 | year={2024}, 21 | publisher={IEEE} 22 | } 23 | ``` 24 | -------------------------------------------------------------------------------- /data/process.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from scipy.io import loadmat, savemat 4 | 5 | alglist = ['dpds', 'coo', 'dpl', 'lpo'] 6 | nlist = list(range(10, 21)) 7 | 8 | A_scale = {} 9 | E_scale = {} 10 | A_energy = {} 11 | E_energy = {} 12 | 13 | for alg in alglist: 14 | A_scale[alg] = [] 15 | E_scale[alg] = [] 16 | A_energy[alg] = [] 17 | E_energy[alg] = [] 18 | 19 | for alg in alglist: 20 | for n in nlist: 21 | filename = alg + '_scale_n' + str(n) + '_lam5000.mat' 22 | data = loadmat(filename) 23 | A_scale[alg].append(np.mean(data['A'])) 24 | E_scale[alg].append(np.mean(data['E'])) 25 | 26 | print(A_scale) 27 | print(E_scale) 28 | 29 | savedata = {'A': A_scale, 'E': E_scale} 30 | savemat('scale.mat', savedata) 31 | 32 | 33 | for alg in alglist: 34 | for n in nlist: 35 | filename = alg + '_energy' + str(n) + '_n15_lam5000.mat' 36 | data = loadmat(filename) 37 | A_energy[alg].append(np.mean(data['A'])) 38 | E_energy[alg].append(np.mean(data['E'])) 39 | 40 | print(A_energy) 41 | print(E_energy) 42 | 43 | savedata = {'A': A_energy, 'E': E_energy} 44 | savemat('energy.mat', savedata) 45 | -------------------------------------------------------------------------------- /Parameter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | 5 | ''' Parameter List 6 | N = 10 ; number of WDs 7 | T = 100 ; number of time slots 8 | d_t = 0.01 ; duration of each slot 9 | 10 | ; d ~ [d_min, d_max], uniform distribution 11 | ; d = {10, 20, 30, 40, 50}Kb 12 | d_min = 1Mb ; lower bound of data size 13 | d_max = 5Mb ; upper bound of data size 14 | d_int = 1Mb ; interval between neighboring data size 15 | d_sub = 5Kb ; data size of each subtask 16 | 17 | ; for the task interval Delta, we consider three different distributions 18 | pattern = {"geo", "map"} 19 | ; 1. geometric distribution (i.e. g_i(t) bernoulli) 20 | p_g = 0.2 ; P(Delta = k) = (1-p_g)^{k-1} * p_g 21 | ; 2. 
MAP 22 | TODO 23 | 24 | kappa = 1000 ; one bit data require kappa CPU cycles 25 | gamma = 10^{-27} ; energy-efficiency factor 26 | eta = gamma * kappa^3 / d_t^2 27 | h[i][t] = {3,6,9}*10^{-10} ; channel gain 28 | sigma^2 = 10^{-9} ; noise power 29 | 30 | W_max = 10MHz ; each WD share 1MHz (in expectation) 31 | E_max[i] = 0.5W 32 | f_max[i] = 1GHz 33 | P_max[i] = 1W 34 | ''' 35 | 36 | ''' Variable List 37 | f[i][t] ; CPU frequency of WD i at slot t 38 | d_l[i][t] ; locally-processed data 39 | E_l[i][t] ; energy consumption due to local processing 40 | 41 | r[i][t] ; wireless transmission rate 42 | W[i][t] ; wireless bandwidth 43 | P[i][t] ; transmission power 44 | d_o[i][t] ; offloaded data 45 | E_o[i][t] ; energy consumption due to offloading 46 | ''' 47 | 48 | class Parameter(object): 49 | def __init__(self, N=3, T=100, pattern = "geo"): 50 | super().__init__() 51 | 52 | self.N = N 53 | self.T = T 54 | self.d_t = 0.01 55 | 56 | self.pattern = pattern 57 | self.d_lb = 2e4 58 | self.d_ub = 5e4 59 | self.p_g = 0.3 60 | 61 | self.kappa = 1000.0 62 | self.gamma = 1e-28 63 | self.eta = self.gamma * self.kappa**3 / self.d_t**2 64 | self.sigma2 = 1e-11 65 | 66 | # channel model 67 | self.length = 100 68 | self.epsilon = 3.8 69 | self.BS_loc = (self.length/2, self.length/2) 70 | self.WD_loc_list = [(random.randint(0,self.length), random.randint(0,self.length)) for _ in range(self.N)] 71 | self.distance = [math.sqrt((self.BS_loc[0]-WD_loc[0])**2 + (self.BS_loc[1]-WD_loc[1])**2) for WD_loc in self.WD_loc_list] 72 | print(self.distance) 73 | self.distance = np.array(self.distance) 74 | 75 | self.W_max = 20e6 76 | self.E_max = [0.1*self.d_t for _ in range(N)] 77 | self.f_max = [2e9 for _ in range(N)] 78 | self.P_max = [1.0 for _ in range(N)] 79 | self.lam_max = [100000 for _ in range(N)] 80 | 81 | self.beta = lambda t: 1 / np.sqrt(t) 82 | #self.beta = lambda t: 0.0001 83 | self.theta = lambda t: min(100, 1000/np.log(t+10)) 84 | #self.theta = lambda t: max(10, 10/np.sqrt(t)) 85 | #self.theta = lambda t: 0 86 | self.lam_init = np.ones(N) * 5000 87 | #self.lam_init = np.zeros(N) 88 | -------------------------------------------------------------------------------- /Environment.py: -------------------------------------------------------------------------------- 1 | # + 2 | import numpy as np 3 | import sys 4 | import random 5 | import Parameter as Parameter 6 | 7 | import pdb 8 | 9 | 10 | # - 11 | 12 | class Environment(object): 13 | def __init__(self, param: Parameter): 14 | super().__init__() 15 | self.param = param 16 | # self.reset() 17 | 18 | def reset(self): 19 | param = self.param 20 | 21 | self.timer = 0 22 | self.nTask = 0 #number of generated tasks 23 | self.next_gen = [0 for _ in range(param.N)] 24 | self.last_datasize = 0 25 | self.last_delta = np.zeros(param.N, dtype=int) 26 | 27 | # states 28 | self.d_r = np.zeros(param.N) 29 | self.a = np.zeros(param.N, dtype=int) 30 | self.delta = np.zeros(param.N, dtype=int) 31 | self.q = np.zeros(param.N, dtype=int) 32 | self.h = self.new_channel_gain() 33 | self.g = np.zeros(param.N, dtype=int) 34 | self.q_set = [[] for _ in range(param.N)] 35 | self.acc_E = np.zeros(param.N) # accumulated energy consumption 36 | 37 | # statistics 38 | self.E_stat = [0] 39 | self.A_stat = [0] 40 | 41 | return np.transpose(np.vstack((self.d_r, self.a, self.q, self.h))) 42 | 43 | def step(self, action): 44 | f = action[:, 0] * self.param.f_max 45 | P = action[:, 1] * self.param.P_max 46 | W = action[:, 2] * self.param.W_max 47 | #assert np.isclose(np.sum(action[:,2]), 1.) 
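        # The agent outputs normalized decisions in [0, 1]; they are scaled here to the
        # physical CPU frequency f, transmit power P and bandwidth W. The bandwidth
        # fractions action[:, 2] come from a softmax in the agent, so they should sum
        # to 1 -- the check below drops into pdb when that numerically fails.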
48 | if not np.isclose(np.sum(action[:,2]), 1.): 49 | pdb.set_trace() 50 | 51 | if np.isnan(f).any() or np.isnan(P).any() or np.isnan(W).any(): 52 | pdb.set_trace() 53 | 54 | d = f * self.param.d_t / self.param.kappa + \ 55 | self.param.d_t * W * np.log2(1+P*self.h/self.param.sigma2) 56 | E = self.param.gamma * f**3 * self.param.d_t + P * self.param.d_t 57 | #pdb.set_trace() 58 | self.update(d, E) 59 | 60 | return np.transpose(np.vstack((self.d_r, self.a, self.q, self.h))), E 61 | 62 | def update(self, d, E): 63 | self.timer += 1 64 | # at the beginning of each slot, observe current channel gain and previous task generation 65 | self.h = self.new_channel_gain() 66 | self.g = self.new_task_generation(self.delta) 67 | # update system states according to previous decisions 68 | d = np.minimum(d, self.d_r) 69 | self.d_r = self.d_r - d 70 | self.acc_E = self.acc_E + E 71 | for i in range(self.param.N): 72 | assert self.q[i] == len(self.q_set[i]) 73 | if d[i] > 0 and self.d_r[i] == 0: # HOL task is completed 74 | self.q_set[i].pop(0) # remove the HOL task 75 | self.q[i] = self.q[i] - 1 76 | if self.q[i] > 0: 77 | self.d_r[i] = self.q_set[i][0].datasize 78 | self.a[i] = self.timer - self.q_set[i][0].generationTime 79 | else: 80 | self.a[i] = 0 81 | elif self.d_r[i] > 0: # HOL task is not completed 82 | self.a[i] = self.a[i] + 1 83 | else: # d_r[i] == 0 and d[i] == 0, then the queue must be empty in last slot 84 | #assert self.q[i] - self.g[i] == 0 85 | if self.q[i] - self.g[i] != 0: 86 | print(self.q[i]) 87 | print(self.g[i]) 88 | pdb.set_trace() 89 | assert False 90 | if self.q[i] > 0: 91 | self.d_r[i] = self.q_set[i][0].datasize 92 | self.a[i] = self.timer - self.q_set[i][0].generationTime 93 | else: 94 | self.a[i] = 0 95 | #self.d_r = round_and_check(self.d_r) 96 | self.E_stat.append(np.sum(E)/self.param.N) 97 | self.A_stat.append(np.sum(self.a)/self.param.N) 98 | 99 | def new_channel_gain(self): 100 | rand_expo = [random.expovariate(1.0) for _ in range(self.param.N)] 101 | rand_expo = np.array(rand_expo) 102 | h = 1e-3 * self.param.distance**(-self.param.epsilon) * rand_expo 103 | return h 104 | 105 | def new_task_generation(self, old_delta): 106 | # remember to minus the timer by 1, because we are observing the task generation of the previous slot 107 | timer = self.timer - 1 108 | g = np.zeros(self.param.N) 109 | for i in range(self.param.N): 110 | if self.next_gen[i] == timer: 111 | g[i] = 1 112 | self.last_delta[i] = self.delta[i] 113 | self.delta[i] = 1 # reset delta 114 | # generate new task 115 | self.last_datasize = random.randint(self.param.d_lb, self.param.d_ub) 116 | new_task = Task(self.nTask, timer, self.last_datasize, i) 117 | self.nTask += 1 118 | # remember to update q[i] and q_set[i] 119 | self.q_set[i].append(new_task) 120 | self.q[i] += 1 121 | if self.param.pattern == "geo": 122 | interval = np.random.geometric(self.param.p_g) 123 | elif self.param.pattern == "map": 124 | sys.exit("MAP not implemented yet!") 125 | else: 126 | sys.exit("arrival pattern not implemented yet!") 127 | assert interval > 0 128 | self.next_gen[i] += interval 129 | else: 130 | self.delta[i] += 1 131 | return g 132 | 133 | class Task(object): 134 | def __init__(self, index, gt, size, wd): 135 | self.index = index 136 | self.generationTime = gt 137 | self.datasize = size 138 | self.generationDevice = wd 139 | -------------------------------------------------------------------------------- /DDPG.py: -------------------------------------------------------------------------------- 1 | # + 2 | from 
Environment import Environment 3 | from Parameter import Parameter 4 | from scipy.io import savemat 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow import keras 8 | import random 9 | import os 10 | import time 11 | import argparse 12 | import sys 13 | import time 14 | import pickle 15 | import pdb 16 | 17 | # + 18 | #################### params ########################### 19 | parser = argparse.ArgumentParser(description='Hyper_params') 20 | parser.add_argument('--Info', default='', type=str) # information added to log dir name 21 | 22 | parser.add_argument('--Seed', default=41, type=int) 23 | parser.add_argument('--Units', default=256, type=int) # hidden units num of NN 24 | parser.add_argument('--Lr', default=0.001, type=float) # learning rate 25 | parser.add_argument('--omega', default=0.005, type=float) # used to update target networks 26 | parser.add_argument('--Max_Epsilon', default=1.0, type=float) 27 | parser.add_argument('--Min_Epsilon', default=1.0, type=float) 28 | parser.add_argument('--Epsilon_Decay', default=1.0, type=float) 29 | parser.add_argument('--Batch_Size', default=256, type=int) 30 | parser.add_argument('--Memory_Size', default=1000000, type=int) # buffer size 31 | parser.add_argument('--Start_Size', default=0, type=int) # random action before start_size 32 | parser.add_argument('--Update_After', default=0, type=int) 33 | parser.add_argument('--Train_Interval', default=1, type=int) 34 | parser.add_argument('--load_weights', default=False, type=bool) 35 | parser.add_argument('--Alg', default='ddpg', type=str) 36 | parser.add_argument('--Gpu_Id', default="0", type=str) # -1 means CPU 37 | parser.add_argument('--N', default=15, type=int) # number of WDs 38 | parser.add_argument('--T', default=1000000, type=int) # number of simulated slots 39 | parser.add_argument('--batch_norm', default=True, type=bool) 40 | 41 | args = parser.parse_args() 42 | # - 43 | 44 | #################### seed ########################### 45 | os.environ['TF_DETERMINISTIC_OPS'] = '1' 46 | os.environ["CUDA_VISIBLE_DEVICES"] = args.Gpu_Id 47 | gpus = tf.config.experimental.list_physical_devices(device_type='GPU') 48 | for gpu in gpus: 49 | tf.config.experimental.set_memory_growth(gpu, True) 50 | print(tf.config.list_physical_devices()) 51 | tf.random.set_seed(args.Seed) 52 | np.random.seed(args.Seed) 53 | random.seed(args.Seed) 54 | 55 | #################### log ########################### 56 | # create log file 57 | time_str = time.strftime("%m-%d_%H-%M", time.localtime()) 58 | alg = args.Alg 59 | log_dir_name = 'logs/' + time_str + '_' + alg + args.Info + '_n' + \ 60 | str(args.N) + '_seed' + str(args.Seed) 61 | data_dir_name = 'data/' + alg + args.Info + '_n' + str(args.N) 62 | fw = tf.summary.create_file_writer(log_dir_name) # log file witer 63 | 64 | # create dir to save model 65 | if not os.path.exists(log_dir_name + '/models'): 66 | os.makedirs(log_dir_name + '/models') 67 | 68 | # save params to a .txt file 69 | prams_file = open(log_dir_name + '/prams_table.txt', 'w') 70 | prams_file.writelines(f'{i:50} {v}\n' for i, v in args.__dict__.items()) 71 | prams_file.close() 72 | 73 | ###################### env ############################### 74 | param = Parameter(args.N, args.T) 75 | param.lam_init = np.ones(param.N) * 1e4 76 | param.theta = lambda t: 0 77 | param.beta = lambda t: 0.01 78 | env = Environment(param) 79 | if args.load_weights: 80 | with open('models/v.pickle', 'rb') as f: 81 | Initial_v = pickle.load(f) 82 | else: 83 | Initial_v = 0 # initial average reward 
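# Each state returned by the environment has shape (N, 4): per WD it stacks the
# remaining data of the head-of-line task d_r, the age of information a, the
# queue length q and the channel gain h (see Environment.reset/step). The energy
# constraint is handled through the Lagrange multipliers lam (updated with step
# size theta), and beta is the step size of the average-cost estimate v.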
84 | 85 | ###################### others ############################### 86 | W_Initializer = tf.initializers.he_normal(args.Seed) # NN initializer 87 | Epsilon_Decay_Rate = (args.Min_Epsilon - args.Max_Epsilon) / (args.T) * args.Epsilon_Decay # factor of decay 88 | TENSOR_FLOAT_TYPE = tf.dtypes.float32 89 | TENSOR_INT_TYPE = tf.dtypes.int32 90 | 91 | def softmax(x): 92 | return np.exp(x) / np.sum(np.exp(x)) 93 | 94 | # + 95 | class ReplayBuffer: 96 | def __init__(self, buffer_capacity = 100000): 97 | self.buffer_capacity = buffer_capacity 98 | self.buffer_counter = 0 99 | 100 | # dim(action) = N * 3 101 | # dim(state) = N * 4 102 | # dim(pds) = N * 4 103 | buffer_a_dim = (buffer_capacity, param.N, 3) 104 | buffer_s_dim = (buffer_capacity, param.N, 4) 105 | 106 | self.s_buffer = np.empty(buffer_s_dim, dtype=np.float32) 107 | self.a_buffer = np.empty(buffer_a_dim, dtype=np.float32) 108 | self.r_buffer = np.empty((buffer_capacity,), dtype=np.float32) 109 | self.next_s_buffer = np.empty(buffer_s_dim, dtype=np.float32) 110 | 111 | def store(self, exp): 112 | index = self.buffer_counter % self.buffer_capacity 113 | 114 | s, a, r, next_s = exp 115 | self.s_buffer[index] = s 116 | self.a_buffer[index] = a 117 | self.r_buffer[index] = r 118 | self.next_s_buffer[index] = next_s 119 | 120 | self.buffer_counter += 1 121 | 122 | def sample(self, batch_size): 123 | sampling_range = min(self.buffer_counter, self.buffer_capacity) 124 | idx = np.random.randint(0, sampling_range, batch_size) 125 | 126 | batch_s = tf.convert_to_tensor(self.s_buffer[idx]) 127 | batch_a = tf.convert_to_tensor(self.a_buffer[idx]) 128 | batch_r = tf.convert_to_tensor(self.r_buffer[idx]) 129 | batch_next_s = tf.convert_to_tensor(self.next_s_buffer[idx]) 130 | 131 | return batch_s, batch_a, batch_r, batch_next_s 132 | 133 | class OUActionNoise: 134 | def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None): 135 | self.theta = theta 136 | self.mean = mean 137 | self.std_dev = std_deviation 138 | self.dt = dt 139 | self.x_initial = x_initial 140 | self.reset() 141 | 142 | def __call__(self): 143 | # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process. 
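        # Euler-Maruyama discretization of the OU process dx = theta*(mu - x)*dt + sigma*dW:
        #   x_{t+1} = x_t + theta*(mean - x_t)*dt + std_dev*sqrt(dt)*N(0, 1)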
144 | x = ( 145 | self.x_prev 146 | + self.theta * (self.mean - self.x_prev) * self.dt 147 | + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape) 148 | ) 149 | # Store x into x_prev 150 | # Makes next noise dependent on current one 151 | self.x_prev = x 152 | return x 153 | 154 | def reset(self): 155 | if self.x_initial is not None: 156 | self.x_prev = self.x_initial 157 | else: 158 | self.x_prev = np.zeros_like(self.mean) 159 | # - 160 | 161 | class DPDS: 162 | def __init__(self, batch_size, memory_size, max_epsilon): 163 | 164 | def build_actor(): 165 | inputs = keras.Input(shape=(param.N, 4)) 166 | x = keras.layers.Flatten()(inputs) 167 | x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(x) 168 | if args.batch_norm: 169 | x = keras.layers.BatchNormalization()(x) 170 | x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(x) 171 | if args.batch_norm: 172 | x = keras.layers.BatchNormalization()(x) 173 | x = keras.layers.Dense(param.N*3, activation='sigmoid', kernel_initializer=W_Initializer)(x) 174 | outputs = keras.layers.Reshape((param.N, 3))(x) 175 | model = keras.Model(inputs=inputs, outputs=outputs) 176 | return model 177 | 178 | def build_critic(): 179 | state_input = keras.layers.Input(shape=(param.N, 4)) 180 | state_x = keras.layers.Flatten()(state_input) 181 | #state_x = keras.layers.Dense(args.Critic_Units, activation='relu', kernel_initializer=W_Initializer)(state_x) 182 | #state_x = keras.layers.Dense(2*args.Critic_Units, activation='relu', kernel_initializer=W_Initializer)(state_x) 183 | 184 | action_input = keras.layers.Input(shape=(param.N, 3)) 185 | action_x = keras.layers.Flatten()(action_input) 186 | #action_x = keras.layers.Dense(2*args.Critic_Units, activation='relu', kernel_initializer=W_Initializer)(action_x) 187 | 188 | concat = keras.layers.Concatenate()([state_x, action_x]) 189 | x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(concat) 190 | #if args.batch_norm: 191 | # x = keras.layers.BatchNormalization()(x) 192 | x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(x) 193 | #if args.batch_norm: 194 | # x = keras.layers.BatchNormalization()(x) 195 | outputs = keras.layers.Dense(1)(x) 196 | model = keras.Model([state_input, action_input], outputs) 197 | return model 198 | 199 | if 'ddpg' in alg: 200 | self.actor = build_actor() 201 | self.critic = build_critic() 202 | self.target_actor = build_actor() 203 | self.target_critic = build_critic() 204 | 205 | if args.load_weights: 206 | self.critic.load_weights("models/critic") 207 | self.actor.load_weights("models/actor") 208 | print("load weight") 209 | 210 | self.target_actor.set_weights(self.actor.get_weights()) 211 | self.target_critic.set_weights(self.critic.get_weights()) 212 | else: 213 | raise NotImplementedError("alg not implemented") 214 | 215 | self.actor_optimizer = tf.optimizers.Adam(args.Lr) 216 | self.critic_optimizer = tf.optimizers.Adam(args.Lr*2) 217 | self.epsilon = max_epsilon 218 | self.batch_size = batch_size 219 | self.buffer = ReplayBuffer(memory_size) 220 | self.alg = alg 221 | self.v = Initial_v # average reward 222 | self.target_v = Initial_v 223 | self.lam = param.lam_init 224 | 225 | # transform some parameters to tensors 226 | self.f_max = tf.convert_to_tensor(param.f_max) 227 | self.P_max = tf.convert_to_tensor(param.P_max) 228 | self.W_max = tf.convert_to_tensor(param.W_max) 229 | self.E_max = 
tf.convert_to_tensor(param.E_max) 230 | self.d_t = tf.convert_to_tensor(param.d_t) 231 | self.kappa = tf.convert_to_tensor(param.kappa) 232 | self.sigma2 = tf.convert_to_tensor(param.sigma2) 233 | self.gamma = tf.convert_to_tensor(param.gamma) 234 | 235 | def random_action(self, s): 236 | action = np.random.rand(param.N, 3) 237 | # apply softmax to W so that its sum equals 1 238 | action[:,2] = softmax(action[:,2]) 239 | return action 240 | 241 | def _choose_action(self, s): 242 | action = self.actor(s[None, :])[0].numpy() 243 | action[:,2] = softmax(action[:,2]) 244 | return action 245 | 246 | def choose_action(self, s, noise_object, epsilon): 247 | action = self.actor(s[None, :])[0].numpy() 248 | noise = noise_object() 249 | # Adding noise to action 250 | action = action + epsilon * noise 251 | 252 | # We make sure action is within bounds 253 | legal_action = np.clip(action, 0, 1) 254 | legal_action[:,2] = softmax(legal_action[:,2]) 255 | 256 | return legal_action 257 | 258 | @tf.function(jit_compile=True) 259 | def f_k(self, batch_state, batch_action): 260 | f = batch_action[:, :, 0] * self.f_max 261 | P = batch_action[:, :, 1] * self.P_max 262 | W = tf.nn.softmax(batch_action[:, :, 2]) * self.W_max 263 | d_r = batch_state[:, :, 0] 264 | a = batch_state[:, :, 1] 265 | q = batch_state[:, :, 2] 266 | h = batch_state[:, :, 3] 267 | 268 | d = f * self.d_t / self.kappa + \ 269 | self.d_t * W * (tf.math.log(1+P*h/self.sigma2)/tf.math.log(2.)) 270 | b = ((d > 0) & (d_r > 0) & (d >= d_r)) 271 | b = tf.where(b, 1.0, 0.0) 272 | pds_q = q - b 273 | pds_h = h 274 | pds_d_r = tf.maximum(tf.constant(0, dtype=tf.float32), d_r - d) 275 | pds_a = a - 5 * b 276 | 277 | s = tf.stack([pds_d_r, pds_a, pds_q, pds_h], axis=2) 278 | return s 279 | 280 | @tf.function(jit_compile=True) 281 | def cost(self, batch_state, batch_action): 282 | f = batch_action[:, :, 0] * self.f_max 283 | P = batch_action[:, :, 1] * self.P_max 284 | W = tf.nn.softmax(batch_action[:, :, 2]) * self.W_max 285 | E = self.gamma * f**3 * self.d_t + P * self.d_t 286 | h = batch_state[:, :, 3] 287 | d = f * self.d_t / self.kappa + \ 288 | self.d_t * W * (tf.math.log(1+P*h/self.sigma2)/tf.math.log(2.)) 289 | # in expectation, completing a task reduces aoi by 1/p_g 290 | aoi_per_bit = 1 / param.p_g / ((param.d_lb+param.d_ub)/2) 291 | cost = tf.reduce_sum(batch_state[:, :, 1] - d*aoi_per_bit + self.lam * tf.math.maximum(0.0, E - self.E_max), axis=1) 292 | #cost = tf.reduce_sum(self.lam * tf.math.maximum(0.0, E - self.E_max), axis=1) 293 | #cost = tf.reduce_sum(batch_state[:, :, 1] + self.lam * tf.math.maximum(0.0, E - self.E_max), axis=1) 294 | #cost = tf.reduce_sum(batch_state[:, :, 1] + self.lam * (E - self.E_max), axis=1) 295 | return cost 296 | 297 | @tf.function(jit_compile=True) 298 | def train(self, s, a, r, s_next): 299 | # update critic network 300 | with tf.GradientTape() as tape: 301 | # calculate target y 302 | target_a_next = self.target_actor(s_next, training=True) 303 | target_y = self.cost(s,a) + self.target_critic([s_next, target_a_next], training=True) - self.target_v 304 | critic_value = self.critic([s, a], training=True) 305 | td = critic_value - target_y 306 | critic_loss = tf.math.reduce_mean(tf.math.abs(td)) 307 | 308 | critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables) 309 | #critic_grad = [tf.clip_by_norm(grad, 10.0) for grad in critic_grad] 310 | self.critic_optimizer.apply_gradients( zip(critic_grad, self.critic.trainable_variables) ) 311 | 312 | # update actor network 313 | with 
tf.GradientTape() as tape: 314 | actions = self.actor(s, training=True) 315 | critic_value = self.critic([s, actions], training=True) 316 | actor_loss = tf.math.reduce_mean(critic_value) 317 | 318 | actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables) 319 | #actor_grad = [tf.clip_by_norm(grad, 10.0) for grad in actor_grad] 320 | self.actor_optimizer.apply_gradients( zip(actor_grad, self.actor.trainable_variables) ) 321 | 322 | # td is returned to update self.v 323 | # we do not update self.v in this function because it leaks the local tensor 'td', which is prohibited by tensorflow 324 | return (td, critic_loss, actor_loss, actor_grad, critic_grad, critic_value) 325 | 326 | def save_model(self, dir=log_dir_name + '/models'): 327 | self.actor.save_weights(dir + '/' + self.alg + '_actor') 328 | self.critic.save_weights(dir + '/' + self.alg + '_critic') 329 | self.actor.save_weights('models/actor') 330 | self.critic.save_weights('models/critic') 331 | with open(dir + '/' + self.alg + '_v.pickle', 'wb') as f: 332 | pickle.dump(self.v, f) 333 | with open('models/v.pickle', 'wb') as f: 334 | pickle.dump(self.v, f) 335 | 336 | 337 | @tf.function(jit_compile=True) 338 | def update_target(target_weights, weights, omega): 339 | for (a, b) in zip(target_weights, weights): 340 | a.assign(b * omega + a * (1 - omega)) 341 | 342 | def train(T): 343 | agent = DPDS(args.Batch_Size, args.Memory_Size, args.Max_Epsilon) 344 | print("============" + agent.alg + "============") 345 | 346 | state = env.reset() 347 | std_dev = 0.01 348 | ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1)) 349 | acc_E = np.zeros(param.N) 350 | acc_A = np.zeros(param.N) 351 | timer = 1 352 | 353 | acc_interaction_time = 0 354 | acc_inference_time = 0 355 | acc_training_time = 0 356 | 357 | while timer <= T: 358 | if timer % 10000 == 0: 359 | print(timer) 360 | 361 | if timer <= args.Start_Size: 362 | action = agent.random_action(state) 363 | else: 364 | inference_begin = time.time() 365 | action = agent.choose_action(state, ou_noise, agent.epsilon) 366 | inference_end = time.time() 367 | acc_inference_time += inference_end - inference_begin 368 | interaction_begin = time.time() 369 | next_state, E = env.step(action) 370 | interaction_end = time.time() 371 | acc_interaction_time += interaction_end - interaction_begin 372 | #cost = np.sum(state[:,1] + agent.lam * (E - param.E_max)) 373 | cost = np.sum(agent.lam * (E - param.E_max)) 374 | 375 | agent.buffer.store((state, action, cost, next_state)) 376 | 377 | # train 378 | if timer > args.Update_After and timer % args.Train_Interval == 0: 379 | training_begin = time.time() 380 | # sample from buffer 381 | s, a, r, s_next = agent.buffer.sample(args.Batch_Size) 382 | td, critic_loss, actor_loss, actor_grad, critic_grad, critic_value = agent.train(s, a, r, s_next) 383 | #tf.print(actor_grad) 384 | #tf.print(critic_grad) 385 | agent.v = agent.v - param.beta(timer) * tf.reduce_mean(td) 386 | training_end = time.time() 387 | acc_training_time += training_end - training_begin 388 | 389 | # update target networks 390 | update_target(agent.target_actor.variables, agent.actor.variables, args.omega) 391 | update_target(agent.target_critic.variables, agent.critic.variables, args.omega) 392 | agent.target_v = args.omega * agent.v + (1 - args.omega) * agent.target_v 393 | 394 | # update lambda 395 | agent.lam = agent.lam + param.theta(timer) * (E - param.E_max) 396 | agent.lam = np.maximum(0, agent.lam) 397 | agent.lam = np.minimum(param.lam_max, 
agent.lam) 398 | 399 | with fw.as_default(): 400 | tf.summary.scalar('critic_loss', critic_loss, step = timer) 401 | tf.summary.scalar('actor_loss', actor_loss, step = timer) 402 | 403 | # epsilon decay 404 | agent.epsilon = max(Epsilon_Decay_Rate * timer + args.Max_Epsilon, args.Min_Epsilon) 405 | 406 | timer += 1 407 | 408 | # log 409 | acc_E += E 410 | acc_A += state[:,1] 411 | with fw.as_default(): 412 | tf.summary.scalar('cost', cost, step=timer) 413 | tf.summary.scalar('aoi', np.sum(state[:,1])/args.N, step=timer) 414 | tf.summary.scalar('average aoi', np.sum(acc_A)/timer/args.N, step=timer) 415 | tf.summary.scalar('energy', np.sum(E)/args.N, step=timer) 416 | tf.summary.scalar('v', agent.v, step=timer) 417 | tf.summary.scalar('epsilon', agent.epsilon, step=timer) 418 | tf.summary.scalar('lambda', np.sum(agent.lam)/args.N, step=timer) 419 | tf.summary.scalar('average energy', np.sum(acc_E)/timer/args.N, step=timer) 420 | 421 | state = next_state 422 | 423 | agent.save_model() 424 | print("Average interaction time:", acc_interaction_time / args.T) 425 | print("Average inference time:", acc_inference_time / (args.T - args.Start_Size)) 426 | print("Average training time:", acc_training_time / (args.T - args.Update_After)) 427 | print("Average number of tasks:", env.nTask / timer /args.N) 428 | 429 | data = {'N': param.N, 'E': env.E_stat, 'A': env.A_stat} 430 | savemat(data_dir_name + '.mat', data) 431 | 432 | if __name__ == "__main__": 433 | train(args.T) 434 | 435 | print(tf.__version__) 436 | 437 | 438 | -------------------------------------------------------------------------------- /D_DDPG.py: -------------------------------------------------------------------------------- 1 | # + 2 | from Environment import Environment 3 | from Parameter import Parameter 4 | from scipy.io import savemat 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow import keras 8 | import random 9 | import os 10 | import time 11 | import argparse 12 | import sys 13 | import time 14 | import pickle 15 | import pdb 16 | 17 | # + 18 | #################### params ########################### 19 | parser = argparse.ArgumentParser(description='Hyper_params') 20 | parser.add_argument('--Info', default='', type=str) # information added to log dir name 21 | 22 | parser.add_argument('--Seed', default=41, type=int) 23 | parser.add_argument('--Units', default=256, type=int) # hidden units num of NN 24 | parser.add_argument('--Lr', default=0.001, type=float) # learning rate 25 | parser.add_argument('--omega', default=0.005, type=float) # used to update target networks 26 | parser.add_argument('--Max_Epsilon', default=1.0, type=float) 27 | parser.add_argument('--Min_Epsilon', default=1.0, type=float) 28 | parser.add_argument('--Epsilon_Decay', default=1.0, type=float) 29 | parser.add_argument('--Batch_Size', default=256, type=int) 30 | parser.add_argument('--Memory_Size', default=1000000, type=int) # buffer size 31 | parser.add_argument('--Start_Size', default=0, type=int) # random action before start_size 32 | parser.add_argument('--Update_After', default=0, type=int) 33 | parser.add_argument('--Train_Interval', default=1, type=int) 34 | parser.add_argument('--load_weights', default=False, type=bool) 35 | parser.add_argument('--Alg', default='d_ddpg', type=str) 36 | parser.add_argument('--Gpu_Id', default="0", type=str) # -1 means CPU 37 | parser.add_argument('--N', default=15, type=int) # number of WDs 38 | parser.add_argument('--T', default=1000000, type=int) # number of simulated slots 39 | 
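# D_DDPG is the discounted-cost DDPG baseline: unlike DDPG.py and DPDS.py it does
# not subtract an average-cost term in the TD target, but scales the bootstrapped
# value with the discount factor defined below.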
parser.add_argument('--batch_norm', default=True, type=bool) 40 | parser.add_argument('--discount', default=0.99, type=float) 41 | 42 | args = parser.parse_args() 43 | # - 44 | 45 | #################### seed ########################### 46 | os.environ['TF_DETERMINISTIC_OPS'] = '1' 47 | os.environ["CUDA_VISIBLE_DEVICES"] = args.Gpu_Id 48 | gpus = tf.config.experimental.list_physical_devices(device_type='GPU') 49 | for gpu in gpus: 50 | tf.config.experimental.set_memory_growth(gpu, True) 51 | print(tf.config.list_physical_devices()) 52 | tf.random.set_seed(args.Seed) 53 | np.random.seed(args.Seed) 54 | random.seed(args.Seed) 55 | 56 | #################### log ########################### 57 | # create log file 58 | time_str = time.strftime("%m-%d_%H-%M", time.localtime()) 59 | alg = args.Alg 60 | log_dir_name = 'logs/' + time_str + '_' + alg + args.Info + '_n' + \ 61 | str(args.N) + '_seed' + str(args.Seed) 62 | data_dir_name = 'data/' + alg + args.Info + '_n' + str(args.N) 63 | fw = tf.summary.create_file_writer(log_dir_name) # log file witer 64 | 65 | # create dir to save model 66 | if not os.path.exists(log_dir_name + '/models'): 67 | os.makedirs(log_dir_name + '/models') 68 | 69 | # save params to a .txt file 70 | prams_file = open(log_dir_name + '/prams_table.txt', 'w') 71 | prams_file.writelines(f'{i:50} {v}\n' for i, v in args.__dict__.items()) 72 | prams_file.close() 73 | 74 | ###################### env ############################### 75 | param = Parameter(args.N, args.T) 76 | param.lam_init = np.ones(param.N) * 10000 77 | param.theta = lambda t: 0 78 | #param.theta = lambda t: min(10, 100/np.log(t+10)) 79 | param.beta = lambda t: 0.01 80 | env = Environment(param) 81 | if args.load_weights: 82 | with open('models/v.pickle', 'rb') as f: 83 | Initial_v = pickle.load(f) 84 | else: 85 | Initial_v = 0 # initial average reward 86 | 87 | ###################### others ############################### 88 | W_Initializer = tf.initializers.he_normal(args.Seed) # NN initializer 89 | Epsilon_Decay_Rate = (args.Min_Epsilon - args.Max_Epsilon) / (args.T) * args.Epsilon_Decay # factor of decay 90 | TENSOR_FLOAT_TYPE = tf.dtypes.float32 91 | TENSOR_INT_TYPE = tf.dtypes.int32 92 | 93 | def softmax(x): 94 | return np.exp(x) / np.sum(np.exp(x)) 95 | 96 | # + 97 | class ReplayBuffer: 98 | def __init__(self, buffer_capacity = 100000): 99 | self.buffer_capacity = buffer_capacity 100 | self.buffer_counter = 0 101 | 102 | # dim(action) = N * 3 103 | # dim(state) = N * 4 104 | # dim(pds) = N * 4 105 | buffer_a_dim = (buffer_capacity, param.N, 3) 106 | buffer_s_dim = (buffer_capacity, param.N, 4) 107 | 108 | self.s_buffer = np.empty(buffer_s_dim, dtype=np.float32) 109 | self.a_buffer = np.empty(buffer_a_dim, dtype=np.float32) 110 | self.r_buffer = np.empty((buffer_capacity,), dtype=np.float32) 111 | self.next_s_buffer = np.empty(buffer_s_dim, dtype=np.float32) 112 | 113 | def store(self, exp): 114 | index = self.buffer_counter % self.buffer_capacity 115 | 116 | s, a, r, next_s = exp 117 | self.s_buffer[index] = s 118 | self.a_buffer[index] = a 119 | self.r_buffer[index] = r 120 | self.next_s_buffer[index] = next_s 121 | 122 | self.buffer_counter += 1 123 | 124 | def sample(self, batch_size): 125 | sampling_range = min(self.buffer_counter, self.buffer_capacity) 126 | idx = np.random.randint(0, sampling_range, batch_size) 127 | 128 | batch_s = tf.convert_to_tensor(self.s_buffer[idx]) 129 | batch_a = tf.convert_to_tensor(self.a_buffer[idx]) 130 | batch_r = tf.convert_to_tensor(self.r_buffer[idx]) 131 | 
batch_next_s = tf.convert_to_tensor(self.next_s_buffer[idx]) 132 | 133 | return batch_s, batch_a, batch_r, batch_next_s 134 | 135 | class OUActionNoise: 136 | def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None): 137 | self.theta = theta 138 | self.mean = mean 139 | self.std_dev = std_deviation 140 | self.dt = dt 141 | self.x_initial = x_initial 142 | self.reset() 143 | 144 | def __call__(self): 145 | # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process. 146 | x = ( 147 | self.x_prev 148 | + self.theta * (self.mean - self.x_prev) * self.dt 149 | + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape) 150 | ) 151 | # Store x into x_prev 152 | # Makes next noise dependent on current one 153 | self.x_prev = x 154 | return x 155 | 156 | def reset(self): 157 | if self.x_initial is not None: 158 | self.x_prev = self.x_initial 159 | else: 160 | self.x_prev = np.zeros_like(self.mean) 161 | # - 162 | 163 | class DPDS: 164 | def __init__(self, batch_size, memory_size, max_epsilon): 165 | 166 | def build_actor(): 167 | inputs = keras.Input(shape=(param.N, 4)) 168 | x = keras.layers.Flatten()(inputs) 169 | x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(x) 170 | if args.batch_norm: 171 | x = keras.layers.BatchNormalization()(x) 172 | x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(x) 173 | if args.batch_norm: 174 | x = keras.layers.BatchNormalization()(x) 175 | x = keras.layers.Dense(param.N*3, activation='sigmoid', kernel_initializer=W_Initializer)(x) 176 | outputs = keras.layers.Reshape((param.N, 3))(x) 177 | model = keras.Model(inputs=inputs, outputs=outputs) 178 | return model 179 | 180 | def build_critic(): 181 | state_input = keras.layers.Input(shape=(param.N, 4)) 182 | state_x = keras.layers.Flatten()(state_input) 183 | #state_x = keras.layers.Dense(args.Critic_Units, activation='relu', kernel_initializer=W_Initializer)(state_x) 184 | #state_x = keras.layers.Dense(2*args.Critic_Units, activation='relu', kernel_initializer=W_Initializer)(state_x) 185 | 186 | action_input = keras.layers.Input(shape=(param.N, 3)) 187 | action_x = keras.layers.Flatten()(action_input) 188 | #action_x = keras.layers.Dense(2*args.Critic_Units, activation='relu', kernel_initializer=W_Initializer)(action_x) 189 | 190 | concat = keras.layers.Concatenate()([state_x, action_x]) 191 | x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(concat) 192 | if args.batch_norm: 193 | x = keras.layers.BatchNormalization()(x) 194 | x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(x) 195 | if args.batch_norm: 196 | x = keras.layers.BatchNormalization()(x) 197 | outputs = keras.layers.Dense(1)(x) 198 | model = keras.Model([state_input, action_input], outputs) 199 | return model 200 | 201 | if 'd_ddpg' in alg: 202 | self.actor = build_actor() 203 | self.critic = build_critic() 204 | self.target_actor = build_actor() 205 | self.target_critic = build_critic() 206 | 207 | if args.load_weights: 208 | self.critic.load_weights("models/critic") 209 | self.actor.load_weights("models/actor") 210 | print("load weight") 211 | 212 | self.target_actor.set_weights(self.actor.get_weights()) 213 | self.target_critic.set_weights(self.critic.get_weights()) 214 | else: 215 | raise NotImplementedError("alg not implemented") 216 | 217 | self.actor_optimizer = tf.optimizers.Adam(args.Lr) 218 | self.critic_optimizer = 
tf.optimizers.Adam(args.Lr*2) 219 | self.epsilon = max_epsilon 220 | self.batch_size = batch_size 221 | self.buffer = ReplayBuffer(memory_size) 222 | self.alg = alg 223 | self.v = Initial_v # average reward 224 | self.target_v = Initial_v 225 | self.lam = param.lam_init 226 | 227 | # transform some parameters to tensors 228 | self.f_max = tf.convert_to_tensor(param.f_max) 229 | self.P_max = tf.convert_to_tensor(param.P_max) 230 | self.W_max = tf.convert_to_tensor(param.W_max) 231 | self.E_max = tf.convert_to_tensor(param.E_max) 232 | self.d_t = tf.convert_to_tensor(param.d_t) 233 | self.kappa = tf.convert_to_tensor(param.kappa) 234 | self.sigma2 = tf.convert_to_tensor(param.sigma2) 235 | self.gamma = tf.convert_to_tensor(param.gamma) 236 | 237 | def random_action(self, s): 238 | action = np.random.rand(param.N, 3) 239 | # apply softmax to W so that its sum equals 1 240 | action[:,2] = softmax(action[:,2]) 241 | return action 242 | 243 | def _choose_action(self, s): 244 | action = self.actor(s[None, :])[0].numpy() 245 | action[:,2] = softmax(action[:,2]) 246 | return action 247 | 248 | def choose_action(self, s, noise_object, epsilon): 249 | action = self.actor(s[None, :])[0].numpy() 250 | noise = noise_object() 251 | # Adding noise to action 252 | action = action + epsilon * noise 253 | 254 | # We make sure action is within bounds 255 | legal_action = np.clip(action, 0, 1) 256 | legal_action[:,2] = softmax(legal_action[:,2]) 257 | 258 | return legal_action 259 | 260 | @tf.function(jit_compile=True) 261 | def f_k(self, batch_state, batch_action): 262 | f = batch_action[:, :, 0] * self.f_max 263 | P = batch_action[:, :, 1] * self.P_max 264 | W = tf.nn.softmax(batch_action[:, :, 2]) * self.W_max 265 | d_r = batch_state[:, :, 0] 266 | a = batch_state[:, :, 1] 267 | q = batch_state[:, :, 2] 268 | h = batch_state[:, :, 3] 269 | 270 | d = f * self.d_t / self.kappa + \ 271 | self.d_t * W * (tf.math.log(1+P*h/self.sigma2)/tf.math.log(2.)) 272 | b = ((d > 0) & (d_r > 0) & (d >= d_r)) 273 | b = tf.where(b, 1.0, 0.0) 274 | pds_q = q - b 275 | pds_h = h 276 | pds_d_r = tf.maximum(tf.constant(0, dtype=tf.float32), d_r - d) 277 | pds_a = a - 5 * b 278 | 279 | s = tf.stack([pds_d_r, pds_a, pds_q, pds_h], axis=2) 280 | return s 281 | 282 | @tf.function(jit_compile=True) 283 | def cost(self, batch_state, batch_action): 284 | f = batch_action[:, :, 0] * self.f_max 285 | P = batch_action[:, :, 1] * self.P_max 286 | W = tf.nn.softmax(batch_action[:, :, 2]) * self.W_max 287 | E = self.gamma * f**3 * self.d_t + P * self.d_t 288 | h = batch_state[:, :, 3] 289 | d = f * self.d_t / self.kappa + \ 290 | self.d_t * W * (tf.math.log(1+P*h/self.sigma2)/tf.math.log(2.)) 291 | # in expectation, completing a task reduces aoi by 1/p_g 292 | aoi_per_bit = 1 / param.p_g / ((param.d_lb+param.d_ub)/2) 293 | cost = tf.reduce_sum(batch_state[:, :, 1] - d*aoi_per_bit + self.lam * tf.math.maximum(0.0, E - self.E_max), axis=1) 294 | #cost = tf.reduce_sum(-d*aoi_per_bit + self.lam * tf.math.maximum(0.0, E - self.E_max), axis=1) 295 | #cost = tf.reduce_sum(self.lam * tf.math.maximum(0.0, E - self.E_max), axis=1) 296 | #cost = tf.reduce_sum(batch_state[:, :, 1] + self.lam * tf.math.maximum(0.0, E - self.E_max), axis=1) 297 | #cost = tf.reduce_sum(batch_state[:, :, 1] + self.lam * (E - self.E_max), axis=1) 298 | return cost 299 | 300 | def norm(self, s): 301 | s[:, :, 0] = s[:, :, 0] / 2000 302 | s[:, :, 3] = -tf.math.log(s[:, :, 3] * 1000) 303 | return s 304 | 305 | @tf.function(jit_compile=True) 306 | def train(self, s, a, r, 
s_next): 307 | # update critic network 308 | with tf.GradientTape() as tape: 309 | # calculate target y 310 | target_a_next = self.target_actor(s_next, training=True) 311 | target_a_next[:,:,2] = tf.nn.softmax(target_a_next[:,:,2]) 312 | target_y = self.cost(s,a) + args.discount * self.target_critic([s_next, target_a_next], training=True) 313 | critic_value = self.critic([s, a], training=True) 314 | td = critic_value - target_y 315 | critic_loss = tf.math.reduce_mean(tf.math.abs(td)) 316 | 317 | critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables) 318 | #critic_grad = [tf.clip_by_norm(grad, 10.0) for grad in critic_grad] 319 | self.critic_optimizer.apply_gradients( zip(critic_grad, self.critic.trainable_variables) ) 320 | 321 | # update actor network 322 | with tf.GradientTape() as tape: 323 | actions = self.actor(s, training=True) 324 | actions[:,:,2] = tf.nn.softmax(actions[:,:,2]) 325 | critic_value = self.critic([s, actions], training=True) 326 | actor_loss = tf.math.reduce_mean(critic_value) 327 | 328 | actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables) 329 | actor_grad = [tf.clip_by_norm(grad, 1.0) for grad in actor_grad] 330 | self.actor_optimizer.apply_gradients( zip(actor_grad, self.actor.trainable_variables) ) 331 | 332 | # td is returned to update self.v 333 | # we do not update self.v in this function because it leaks the local tensor 'td', which is prohibited by tensorflow 334 | return (td, critic_loss, actor_loss, actor_grad, critic_grad, critic_value) 335 | 336 | def save_model(self, dir=log_dir_name + '/models'): 337 | self.actor.save_weights(dir + '/' + self.alg + '_actor') 338 | self.critic.save_weights(dir + '/' + self.alg + '_critic') 339 | self.actor.save_weights('models/actor') 340 | self.critic.save_weights('models/critic') 341 | with open(dir + '/' + self.alg + '_v.pickle', 'wb') as f: 342 | pickle.dump(self.v, f) 343 | with open('models/v.pickle', 'wb') as f: 344 | pickle.dump(self.v, f) 345 | 346 | 347 | @tf.function(jit_compile=True) 348 | def update_target(target_weights, weights, omega): 349 | for (a, b) in zip(target_weights, weights): 350 | a.assign(b * omega + a * (1 - omega)) 351 | 352 | def train(T): 353 | agent = DPDS(args.Batch_Size, args.Memory_Size, args.Max_Epsilon) 354 | print("============" + agent.alg + "============") 355 | 356 | state = env.reset() 357 | std_dev = 0.01 358 | ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1)) 359 | acc_E = np.zeros(param.N) 360 | acc_A = np.zeros(param.N) 361 | timer = 1 362 | 363 | acc_interaction_time = 0 364 | acc_inference_time = 0 365 | acc_training_time = 0 366 | 367 | while timer <= T: 368 | if timer % 10000 == 0: 369 | print(timer) 370 | 371 | if timer <= args.Start_Size: 372 | action = agent.random_action(state) 373 | else: 374 | inference_begin = time.time() 375 | action = agent.choose_action(state, ou_noise, agent.epsilon) 376 | inference_end = time.time() 377 | acc_inference_time += inference_end - inference_begin 378 | interaction_begin = time.time() 379 | next_state, E = env.step(action) 380 | interaction_end = time.time() 381 | acc_interaction_time += interaction_end - interaction_begin 382 | #cost = np.sum(state[:,1] + agent.lam * (E - param.E_max)) 383 | cost = np.sum(agent.lam * (E - param.E_max)) 384 | 385 | agent.buffer.store((state, action, cost, next_state)) 386 | 387 | # train 388 | if timer > args.Update_After and timer % args.Train_Interval == 0: 389 | training_begin = time.time() 390 | # sample from buffer 391 | 
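            # note: the stored cost r is sampled together with (s, a, s_next), but
            # agent.train recomputes the cost from (s, a) when forming the TD target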
s, a, r, s_next = agent.buffer.sample(args.Batch_Size) 392 | td, critic_loss, actor_loss, actor_grad, critic_grad, critic_value = agent.train(s, a, r, s_next) 393 | #tf.print(actor_grad) 394 | #tf.print(critic_grad) 395 | agent.v = agent.v - param.beta(timer) * tf.reduce_mean(td) 396 | training_end = time.time() 397 | acc_training_time += training_end - training_begin 398 | 399 | # update target networks 400 | update_target(agent.target_actor.variables, agent.actor.variables, args.omega) 401 | update_target(agent.target_critic.variables, agent.critic.variables, args.omega) 402 | agent.target_v = args.omega * agent.v + (1 - args.omega) * agent.target_v 403 | 404 | # update lambda 405 | agent.lam = agent.lam + param.theta(timer) * (E - param.E_max) 406 | agent.lam = np.maximum(0, agent.lam) 407 | agent.lam = np.minimum(param.lam_max, agent.lam) 408 | 409 | with fw.as_default(): 410 | tf.summary.scalar('critic_loss', critic_loss, step = timer) 411 | tf.summary.scalar('actor_loss', actor_loss, step = timer) 412 | 413 | # epsilon decay 414 | agent.epsilon = max(Epsilon_Decay_Rate * timer + args.Max_Epsilon, args.Min_Epsilon) 415 | 416 | timer += 1 417 | 418 | # log 419 | acc_E += E 420 | acc_A += state[:,1] 421 | with fw.as_default(): 422 | tf.summary.scalar('cost', cost, step=timer) 423 | tf.summary.scalar('aoi', np.sum(state[:,1])/args.N, step=timer) 424 | tf.summary.scalar('average aoi', np.sum(acc_A)/timer/args.N, step=timer) 425 | tf.summary.scalar('energy', np.sum(E)/args.N, step=timer) 426 | tf.summary.scalar('v', agent.v, step=timer) 427 | tf.summary.scalar('epsilon', agent.epsilon, step=timer) 428 | tf.summary.scalar('lambda', np.sum(agent.lam)/args.N, step=timer) 429 | tf.summary.scalar('average energy', np.sum(acc_E)/timer/args.N, step=timer) 430 | 431 | state = next_state 432 | 433 | agent.save_model() 434 | print("Average interaction time:", acc_interaction_time / args.T) 435 | print("Average inference time:", acc_inference_time / (args.T - args.Start_Size)) 436 | print("Average training time:", acc_training_time / (args.T - args.Update_After)) 437 | print("Average number of tasks:", env.nTask / timer /args.N) 438 | 439 | data = {'N': param.N, 'E': env.E_stat, 'A': env.A_stat} 440 | savemat(data_dir_name + '.mat', data) 441 | 442 | if __name__ == "__main__": 443 | train(args.T) 444 | 445 | print(tf.__version__) 446 | 447 | 448 | -------------------------------------------------------------------------------- /DPDS.py: -------------------------------------------------------------------------------- 1 | # + 2 | from Environment import Environment 3 | from Parameter import Parameter 4 | from scipy.io import savemat 5 | import numpy as np 6 | import tensorflow as tf 7 | from tensorflow import keras 8 | import random 9 | import os 10 | import time 11 | import argparse 12 | import sys 13 | import time 14 | import pickle 15 | import pdb 16 | 17 | # + 18 | #################### params ########################### 19 | parser = argparse.ArgumentParser(description='Hyper_params') 20 | parser.add_argument('--Info', default='', type=str) # information added to log dir name 21 | 22 | parser.add_argument('--Seed', default=41, type=int) 23 | parser.add_argument('--Units', default=128, type=int) # hidden units num of critic NN 24 | parser.add_argument('--Lr', default=0.001, type=float) # learning rate 25 | parser.add_argument('--omega', default=0.005, type=float) # used to update target networks 26 | parser.add_argument('--Max_Epsilon', default=0.0, type=float) 27 | 
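# with Max_Epsilon = Min_Epsilon = 0.0, no OU exploration noise is added to the
# actions chosen by DPDS (see choose_action below)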
parser.add_argument('--Min_Epsilon', default=0.0, type=float) 28 | parser.add_argument('--Epsilon_Decay', default=1.0, type=float) 29 | parser.add_argument('--Batch_Size', default=256, type=int) 30 | parser.add_argument('--Memory_Size', default=200000, type=int) # buffer size 31 | parser.add_argument('--Start_Size', default=0, type=int) # random action before start_size 32 | parser.add_argument('--Update_After', default=0, type=int) 33 | parser.add_argument('--Train_Interval', default=1, type=int) 34 | parser.add_argument('--load_weights', default=False, type=bool) 35 | parser.add_argument('--Alg', default='dpds', type=str) 36 | parser.add_argument('--Gpu_Id', default="0", type=str) # -1 means CPU 37 | parser.add_argument('--E_max', default=0.1, type=float) 38 | parser.add_argument('--N', default=15, type=int) # number of WDs 39 | parser.add_argument('--T', default=100000, type=int) # number of simulated slots 40 | parser.add_argument('--batch_norm', default=True, type=bool) 41 | 42 | args = parser.parse_args() 43 | # - 44 | 45 | #################### seed ########################### 46 | os.environ['TF_DETERMINISTIC_OPS'] = '1' 47 | os.environ["CUDA_VISIBLE_DEVICES"] = args.Gpu_Id 48 | gpus = tf.config.experimental.list_physical_devices(device_type='GPU') 49 | for gpu in gpus: 50 | tf.config.experimental.set_memory_growth(gpu, True) 51 | print(tf.config.list_physical_devices()) 52 | tf.random.set_seed(args.Seed) 53 | np.random.seed(args.Seed) 54 | random.seed(args.Seed) 55 | 56 | param = Parameter(args.N, args.T) 57 | param.E_max = [args.E_max*param.d_t for _ in range(param.N)] 58 | if args.Alg == 'lpo': 59 | # for local processing only, we set the wireless bandwidth to zero 60 | param.W_max = 0.0 61 | param.E_max = [3*args.E_max*param.d_t for _ in range(param.N)] 62 | #param.lam_init = np.ones(param.N) * 1000 63 | #param.theta = lambda t: 0 64 | if args.Alg == 'coo': 65 | # for computation offloading only, we set the local CPU frequency to zero 66 | param.f_max = [0.0 for _ in range(param.N)] 67 | param.E_max = [2*args.E_max*param.d_t for _ in range(param.N)] 68 | env = Environment(param) 69 | if args.load_weights: 70 | with open('models/v.pickle', 'rb') as f: 71 | Initial_v = pickle.load(f) 72 | else: 73 | Initial_v = 0 # initial average reward 74 | 75 | #################### log ########################### 76 | # create log file 77 | time_str = time.strftime("%m-%d_%H-%M", time.localtime()) 78 | alg = args.Alg 79 | log_dir_name = 'logs/' + time_str + '_' + alg + args.Info + '_n' + \ 80 | str(args.N) + '_seed' + str(args.Seed) + '_lam' + str(int(param.lam_init[0])) 81 | data_dir_name = 'data/' + alg + '_' + args.Info + '_n' + \ 82 | str(args.N) + '_lam' + str(int(param.lam_init[0])) 83 | fw = tf.summary.create_file_writer(log_dir_name) # log file witer 84 | 85 | # create dir to save model 86 | if not os.path.exists(log_dir_name + '/models'): 87 | os.makedirs(log_dir_name + '/models') 88 | 89 | # save params to a .txt file 90 | prams_file = open(log_dir_name + '/prams_table.txt', 'w') 91 | prams_file.writelines(f'{i:50} {v}\n' for i, v in args.__dict__.items()) 92 | prams_file.close() 93 | 94 | ###################### others ############################### 95 | W_Initializer = tf.initializers.he_normal(args.Seed) # NN initializer 96 | Epsilon_Decay_Rate = (args.Min_Epsilon - args.Max_Epsilon) / (args.T) * args.Epsilon_Decay # factor of decay 97 | TENSOR_FLOAT_TYPE = tf.dtypes.float32 98 | TENSOR_INT_TYPE = tf.dtypes.int32 99 | 100 | def softmax(x): 101 | return np.exp(x) / np.sum(np.exp(x)) 
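# The softmax above can overflow for large inputs. A numerically stable variant
# (a sketch only; stable_softmax is a hypothetical helper not used by the rest of
# this file) subtracts the maximum before exponentiation:
def stable_softmax(x):
    z = np.exp(x - np.max(x))
    return z / np.sum(z)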
102 | 103 | # + 104 | class ReplayBuffer: 105 | def __init__(self, buffer_capacity = 100000): 106 | self.buffer_capacity = buffer_capacity 107 | self.buffer_counter = 0 108 | 109 | # dim(action) = N * 3 110 | # dim(state) = N * 4 111 | # dim(pds) = N * 4 112 | buffer_a_dim = (buffer_capacity, param.N, 3) 113 | buffer_s_dim = (buffer_capacity, param.N, 4) 114 | 115 | self.s_buffer = np.empty(buffer_s_dim, dtype=np.float32) 116 | self.a_buffer = np.empty(buffer_a_dim, dtype=np.float32) 117 | self.r_buffer = np.empty((buffer_capacity,), dtype=np.float32) 118 | self.next_s_buffer = np.empty(buffer_s_dim, dtype=np.float32) 119 | 120 | def store(self, exp): 121 | index = self.buffer_counter % self.buffer_capacity 122 | 123 | s, a, r, next_s = exp 124 | self.s_buffer[index] = s 125 | self.a_buffer[index] = a 126 | self.r_buffer[index] = r 127 | self.next_s_buffer[index] = next_s 128 | 129 | self.buffer_counter += 1 130 | 131 | def sample(self, batch_size): 132 | sampling_range = min(self.buffer_counter, self.buffer_capacity) 133 | idx = np.random.randint(0, sampling_range, batch_size) 134 | 135 | batch_s = tf.convert_to_tensor(self.s_buffer[idx]) 136 | batch_a = tf.convert_to_tensor(self.a_buffer[idx]) 137 | batch_r = tf.convert_to_tensor(self.r_buffer[idx]) 138 | batch_next_s = tf.convert_to_tensor(self.next_s_buffer[idx]) 139 | 140 | return batch_s, batch_a, batch_r, batch_next_s 141 | 142 | class OUActionNoise: 143 | def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None): 144 | self.theta = theta 145 | self.mean = mean 146 | self.std_dev = std_deviation 147 | self.dt = dt 148 | self.x_initial = x_initial 149 | self.reset() 150 | 151 | def __call__(self): 152 | # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process. 
153 |         x = (
154 |             self.x_prev
155 |             + self.theta * (self.mean - self.x_prev) * self.dt
156 |             + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
157 |         )
158 |         # Store x into x_prev
159 |         # Makes next noise dependent on current one
160 |         self.x_prev = x
161 |         return x
162 | 
163 |     def reset(self):
164 |         if self.x_initial is not None:
165 |             self.x_prev = self.x_initial
166 |         else:
167 |             self.x_prev = np.zeros_like(self.mean)
168 | # -
169 | 
170 | class DPDS:
171 |     def __init__(self, batch_size, memory_size, max_epsilon):
172 | 
173 |         def build_actor():
174 |             inputs = keras.Input(shape=(param.N, 4))
175 |             x = keras.layers.Flatten()(inputs)
176 |             x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(x)
177 |             if args.batch_norm:
178 |                 x = keras.layers.BatchNormalization()(x)
179 |             x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(x)
180 |             if args.batch_norm:
181 |                 x = keras.layers.BatchNormalization()(x)
182 |             x = keras.layers.Dense(param.N*3, activation='sigmoid', kernel_initializer=W_Initializer)(x)
183 |             outputs = keras.layers.Reshape((param.N, 3))(x)
184 |             model = keras.Model(inputs=inputs, outputs=outputs)
185 |             return model
186 | 
187 |         def build_value():
188 |             inputs = keras.Input(shape=(param.N, 4))
189 |             x = keras.layers.Flatten()(inputs)
190 |             x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(x)
191 |             if args.batch_norm:
192 |                 x = keras.layers.BatchNormalization()(x)
193 |             x = keras.layers.Dense(args.Units, activation='relu', kernel_initializer=W_Initializer)(x)
194 |             if args.batch_norm:
195 |                 x = keras.layers.BatchNormalization()(x)
196 |             outputs = keras.layers.Dense(1)(x)
197 |             model = keras.Model(inputs=inputs, outputs=outputs)
198 |             return model
199 | 
200 |         self.actor = build_actor()
201 |         self.value = build_value()
202 |         self.target_actor = build_actor()
203 |         self.target_value = build_value()
204 |         if args.load_weights:
205 |             self.value.load_weights("models/value")
206 |             self.actor.load_weights("models/actor")
207 |             print("load weight")
208 |         self.target_actor.set_weights(self.actor.get_weights())
209 |         self.target_value.set_weights(self.value.get_weights())
210 | 
211 |         self.actor_optimizer = tf.optimizers.Adam(args.Lr)
212 |         self.value_optimizer = tf.optimizers.Adam(args.Lr*2)
213 |         self.epsilon = max_epsilon
214 |         self.batch_size = batch_size
215 |         self.buffer = ReplayBuffer(memory_size)
216 |         self.alg = alg
217 |         self.v = Initial_v # average reward
218 |         self.target_v = Initial_v
219 |         self.lam = param.lam_init
220 | 
221 |         # transform some parameters to tensors
222 |         self.f_max = tf.convert_to_tensor(param.f_max)
223 |         self.P_max = tf.convert_to_tensor(param.P_max)
224 |         self.W_max = tf.convert_to_tensor(param.W_max)
225 |         self.E_max = tf.convert_to_tensor(param.E_max)
226 |         self.d_t = tf.convert_to_tensor(param.d_t)
227 |         self.kappa = tf.convert_to_tensor(param.kappa)
228 |         self.sigma2 = tf.convert_to_tensor(param.sigma2)
229 |         self.gamma = tf.convert_to_tensor(param.gamma)
230 | 
231 |     def random_action(self, s):
232 |         action = np.random.rand(param.N, 3)
233 |         # apply softmax to W so that its sum equals 1
234 |         action[:,2] = softmax(action[:,2])
235 |         return action
236 | 
237 |     def _choose_action(self, s):
238 |         action = self.actor(s[None, :])[0].numpy()
239 |         action[:,2] = softmax(action[:,2])
240 |         return action
241 | 
242 |     def choose_action(self, s, noise_object, epsilon):
243 |         action = self.actor(s[None, :])[0].numpy()
244 |         noise = noise_object()
245 |         # Adding noise to action
246 |         action = action + epsilon * noise
247 | 
248 |         # We make sure action is within bounds
249 |         legal_action = np.clip(action, 0, 1)
250 |         legal_action[:,2] = softmax(legal_action[:,2])
251 | 
252 |         return legal_action
253 | 
254 |     @tf.function(jit_compile=True)
255 |     def f_k(self, batch_state, batch_action):
256 |         f = batch_action[:, :, 0] * self.f_max
257 |         P = batch_action[:, :, 1] * self.P_max
258 |         W = tf.nn.softmax(batch_action[:, :, 2]) * self.W_max
259 |         d_r = batch_state[:, :, 0]
260 |         a = batch_state[:, :, 1]
261 |         q = batch_state[:, :, 2]
262 |         h = batch_state[:, :, 3]
263 | 
264 |         d = f * self.d_t / self.kappa + \
265 |             self.d_t * W * (tf.math.log(1+P*h/self.sigma2)/tf.math.log(2.))
266 |         b = ((d > 0) & (d_r > 0) & (d >= d_r))
267 |         b = tf.where(b, 1.0, 0.0)
268 |         pds_q = q - b
269 |         pds_h = h
270 |         pds_d_r = tf.maximum(tf.constant(0, dtype=tf.float32), d_r - d)
271 |         # explanation of the following update formula can be found here: https://github.com/XingqiuHe/DPDS/issues/2
272 |         pds_a = a - 3.3 * b
273 | 
274 |         s = tf.stack([pds_d_r, pds_a, pds_q, pds_h], axis=2)
275 |         return s
276 | 
277 |     @tf.function(jit_compile=True)
278 |     def cost(self, batch_state, batch_action):
279 |         f = batch_action[:, :, 0] * self.f_max
280 |         P = batch_action[:, :, 1] * self.P_max
281 |         W = tf.nn.softmax(batch_action[:, :, 2]) * self.W_max
282 |         E = self.gamma * f**3 * self.d_t + P * self.d_t
283 |         h = batch_state[:, :, 3]
284 |         d = f * self.d_t / self.kappa + \
285 |             self.d_t * W * (tf.math.log(1+P*h/self.sigma2)/tf.math.log(2.))
286 |         # in expectation, completing a task reduces aoi by 1/p_g
287 |         aoi_per_bit = 1 / param.p_g / ((param.d_lb+param.d_ub)/2)
288 |         if args.Alg == 'dpl':
289 |             # for the delay-based algorithm, the total delay equals the sum of the queue lengths
290 |             cost = tf.reduce_sum(batch_state[:, :, 2] - d/((param.d_lb+param.d_ub)/2) + self.lam * tf.math.maximum(0.0, E - self.E_max), axis=1)
291 |         else:
292 |             cost = tf.reduce_sum(batch_state[:, :, 1] - d*aoi_per_bit + self.lam * tf.math.maximum(0.0, E - self.E_max), axis=1)
293 |             #cost = tf.reduce_sum(batch_state[:, :, 1] + self.lam * tf.math.maximum(0.0, E - self.E_max), axis=1)
294 |             #cost = tf.reduce_sum(batch_state[:, :, 1] + self.lam * (E - self.E_max), axis=1)
295 |         return cost
296 | 
297 |     @tf.function(jit_compile=True)
298 |     def train(self, s, a, r, s_next):
299 |         # update value network
300 |         with tf.GradientTape() as tape:
301 |             # calculate target y
302 |             target_a_next = self.target_actor(s_next, training=True)
303 |             target_pds_next = self.f_k(s_next, target_a_next)
304 |             target_y = self.cost(s_next, target_a_next) + \
305 |                 self.target_value(target_pds_next, training=True) - self.target_v
306 |             pds = self.f_k(s,a)
307 |             pds_value = self.value(pds, training=True)
308 |             td = pds_value - target_y
309 |             value_loss = tf.math.reduce_mean(tf.math.abs(td))
310 | 
311 |         value_grad = tape.gradient(value_loss, self.value.trainable_variables)
312 |         #value_grad = [tf.clip_by_norm(grad, 10.0) for grad in value_grad]
313 |         self.value_optimizer.apply_gradients( zip(value_grad, self.value.trainable_variables) )
314 | 
315 |         # update actor network
316 |         with tf.GradientTape() as tape:
317 |             actions = self.actor(s, training=True)
318 |             pds = self.f_k(s, actions)
319 |             #critic_value = cost(s, actions) + self.value(pds, training=True) - self.v
320 |             c = self.cost(s, actions)
321 |             value = self.value(pds, training=True)
322 |             critic_value = c + value
323 |             actor_loss = tf.math.reduce_mean(critic_value)
324 | 
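        # Deterministic policy-gradient step: actor_loss is the sampled estimate of the
        # one-step cost plus the value of the resulting post-decision state, both of which
        # depend on the actor's actions; descending its gradient below pushes the policy
        # toward actions with lower estimated long-term cost.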
325 |         actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
326 |         #actor_grad = [tf.clip_by_norm(grad, 10.0) for grad in actor_grad]
327 |         self.actor_optimizer.apply_gradients( zip(actor_grad, self.actor.trainable_variables) )
328 | 
329 |         # td is returned so that self.v can be updated outside this function
330 |         # updating self.v here would make the tf.function leak the local tensor 'td', which TensorFlow prohibits
331 |         return (td, value_loss, actor_loss, c, value)
332 | 
333 |     def save_model(self, dir=log_dir_name + '/models'):
334 |         self.actor.save_weights(dir + '/' + self.alg + '_actor')
335 |         self.value.save_weights(dir + '/' + self.alg + '_value')
336 |         self.actor.save_weights('models/actor')
337 |         self.value.save_weights('models/value')
338 |         with open(dir + '/' + self.alg + '_v.pickle', 'wb') as f:
339 |             pickle.dump(self.v, f)
340 |         with open('models/v.pickle', 'wb') as f:
341 |             pickle.dump(self.v, f)
342 | 
343 | 
344 | @tf.function(jit_compile=True)
345 | def update_target(target_weights, weights, omega):
346 |     for (a, b) in zip(target_weights, weights):
347 |         a.assign(b * omega + a * (1 - omega))
348 | 
349 | def train(T):
350 |     agent = DPDS(args.Batch_Size, args.Memory_Size, args.Max_Epsilon)
351 |     print("============" + agent.alg + "============")
352 | 
353 |     state = env.reset()
354 |     std_dev = 0.01
355 |     ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))
356 |     acc_E = np.zeros(param.N)
357 |     acc_A = np.zeros(param.N)
358 |     stat_W = []
359 |     timer = 1
360 | 
361 |     acc_interaction_time = 0
362 |     acc_inference_time = 0
363 |     acc_training_time = 0
364 | 
365 |     while timer <= T:
366 |         if timer % 10000 == 0:
367 |             print(timer)
368 | 
369 |         if timer <= args.Start_Size:
370 |             action = agent.random_action(state)
371 |         else:
372 |             inference_begin = time.time()
373 |             action = agent.choose_action(state, ou_noise, agent.epsilon)
374 |             inference_end = time.time()
375 |             acc_inference_time += inference_end - inference_begin
376 |         interaction_begin = time.time()
377 |         next_state, E = env.step(action)
378 |         interaction_end = time.time()
379 |         acc_interaction_time += interaction_end - interaction_begin
380 |         cost = np.sum(state[:,1] + agent.lam * (E - param.E_max))
381 | 
382 |         agent.buffer.store((state, action, cost, next_state))
383 | 
384 |         # train
385 |         if timer > args.Update_After and timer % args.Train_Interval == 0:
386 |             training_begin = time.time()
387 |             # sample from buffer
388 |             s, a, r, s_next = agent.buffer.sample(args.Batch_Size)
389 |             td, value_loss, actor_loss, c, v = agent.train(s, a, r, s_next)
390 |             agent.v = agent.v - param.beta(timer) * tf.reduce_mean(td)
391 |             training_end = time.time()
392 |             acc_training_time += training_end - training_begin
393 | 
394 |             # update target networks
395 |             update_target(agent.target_actor.variables, agent.actor.variables, args.omega)
396 |             update_target(agent.target_value.variables, agent.value.variables, args.omega)
397 |             agent.target_v = args.omega * agent.v + (1 - args.omega) * agent.target_v
398 | 
399 |             # update lambda
400 |             agent.lam = agent.lam + param.theta(timer) * (E - param.E_max)
401 |             agent.lam = np.maximum(0, agent.lam)
402 |             agent.lam = np.minimum(param.lam_max, agent.lam)
403 | 
404 |             with fw.as_default():
405 |                 tf.summary.scalar('value_loss', value_loss, step = timer)
406 |                 tf.summary.scalar('actor_loss', actor_loss, step = timer)
407 |                 tf.summary.scalar('cost1', tf.math.reduce_mean(c), step = timer)
408 |                 tf.summary.scalar('value', tf.math.reduce_mean(v), step = timer)
409 | 
410 |         # epsilon decay
411 |         agent.epsilon = max(Epsilon_Decay_Rate * timer + args.Max_Epsilon, args.Min_Epsilon)
412 | 
413 |         timer += 1
414 | 
415 |         """
416 |         TODO
417 |         we need to explain in the paper that constraint (10b) is guaranteed by
418 |         applying softmax to W
419 |         also need to specify that we use values in (0,1) to represent the control variables
420 |         also need to discuss why we do not need to enforce constraint (7)
421 |         """
422 | 
423 |         # log
424 |         acc_E += E
425 |         acc_A += state[:,1]
426 |         stat_W.append(action[:,2])
427 |         with fw.as_default():
428 |             tf.summary.scalar('cost', cost, step=timer)
429 |             tf.summary.scalar('aoi', np.sum(state[:,1])/args.N, step=timer)
430 |             tf.summary.scalar('average aoi', np.sum(acc_A)/timer/args.N, step=timer)
431 |             tf.summary.scalar('energy', np.sum(E)/args.N, step=timer)
432 |             tf.summary.scalar('v', agent.v, step=timer)
433 |             tf.summary.scalar('epsilon', agent.epsilon, step=timer)
434 |             tf.summary.scalar('lambda', np.sum(agent.lam)/args.N, step=timer)
435 |             tf.summary.scalar('average energy', np.sum(acc_E)/timer/args.N, step=timer)
436 | 
437 |         state = next_state
438 | 
439 |     agent.save_model()
440 |     print("Average interaction time:", acc_interaction_time / args.T)
441 |     print("Average inference time:", acc_inference_time / (args.T - args.Start_Size))
442 |     print("Average training time:", acc_training_time / (args.T - args.Update_After))
443 |     print("Average number of tasks:", env.nTask / timer / args.N)
444 | 
445 |     data = {'N': param.N, 'locations': param.WD_loc_list, 'distances': param.distance, 'E': env.E_stat, 'A': env.A_stat}
446 |     savemat(data_dir_name + '.mat', data)
447 | 
448 | if __name__ == "__main__":
449 |     train(args.T)
450 | 
451 | print(tf.__version__)
452 | 
453 | 
454 | 
--------------------------------------------------------------------------------
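As a usage note: the checkpoints written by `save_model()` (e.g. `models/actor`) can be reloaded for greedy evaluation. Below is a minimal sketch of such a script; it is not part of the repository, and the values of `N` and `UNITS` (as well as the batch-norm setting) are placeholders that must match the `Parameter`/`args` configuration used during training.

```python
# Hypothetical evaluation sketch -- not part of the repository.
# Rebuilds the actor with the same layer structure as DPDS.build_actor
# (assuming batch_norm was disabled during training) and loads the
# checkpoint that save_model() writes to 'models/actor'.
import numpy as np
from tensorflow import keras
from scipy.special import softmax

N = 10        # number of WDs; must match the training run
UNITS = 512   # hidden width; must match args.Units used during training

def build_actor(n, units):
    inputs = keras.Input(shape=(n, 4))
    x = keras.layers.Flatten()(inputs)
    x = keras.layers.Dense(units, activation='relu')(x)
    x = keras.layers.Dense(units, activation='relu')(x)
    x = keras.layers.Dense(n * 3, activation='sigmoid')(x)
    outputs = keras.layers.Reshape((n, 3))(x)
    return keras.Model(inputs=inputs, outputs=outputs)

actor = build_actor(N, UNITS)
actor.load_weights("models/actor")

def greedy_action(state):
    """state: (N, 4) array of (d_r, a, q, h); returns an (N, 3) action in [0, 1]."""
    action = actor(state[None, :])[0].numpy()
    action[:, 2] = softmax(action[:, 2])   # bandwidth shares sum to one, as in choose_action
    return action

# Example: act greedily on an all-zero state (replace with states from Environment).
print(greedy_action(np.zeros((N, 4), dtype=np.float32)))
```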