├── .gitignore ├── README.md ├── components ├── buffer.py └── episodebuffer.py ├── env └── environment.py ├── main.py ├── modules ├── agents │ ├── baselines.py │ ├── gatagent.py │ ├── gcnagent.py │ ├── mlpagent.py │ └── tomagent.py └── tom │ └── observer.py ├── plot.py ├── train_gat_agent.py ├── train_gcn_agent.py ├── train_mlp_agent.py ├── train_tom_agent.py └── utils ├── policy.py └── scheduler.py /.gitignore: -------------------------------------------------------------------------------- 1 | /saved 2 | **/__pycache__ 3 | **/*.sh -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Description 2 | Deep Reinforcement Learning-based Dependent Task Offloading with Theory of Mind 3 | * environment 4 | * Task 5 | - data size: $[8\times10^5, 1.6\times10^6]$ 6 | - cpu cycles: $[2\times10^8, 2\times10^9]$ 7 | * Local 8 | - cpu: $[1\times10^8, 2\times10^8]$ 9 | - storage: $[1\times10^8, 2\times10^8]$ 10 | * Edge 11 | - cpu: $[1\times10^9, 2\times10^9]$ 12 | - trans rate: $[2\times10^6, 4\times10^6]$ 13 | - storage: $[1\times10^9, 2\times10^9]$ 14 | * Cloud 15 | - trans rate: $[2.4\times10^6, 4.8\times10^6]$ 16 | - fixed trans time: $3$ 17 | * agents 18 | - [x] theory of mind + dqn 19 | - [x] gat + dqn 20 | - [x] dqn 21 | - [x] all local, all edge, all cloud, random, greedy 22 | * buffer 23 | - [x] episode buffer 24 | 25 | ### MDP 26 | * state 27 | - state space: [task_idx, task_info, dev_info] 28 | * action 29 | - action space: M + 2 (M edge servers + local + cloud) 30 | - 0~M-1: edge 31 | - M: local 32 | - M+1: cloud 33 | * reward 34 | - DVR (deadline violation ratio) 35 | 36 | ### Train and run 37 | * train 38 | ```bash 39 | python train_tom_agent.py 40 | ``` 41 | * evaluate 42 | ```bash 43 | python main.py 44 | ``` -------------------------------------------------------------------------------- /components/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | 4 | from env.environment import Environment 5 | 6 | class ReplayBuffer: 7 | def __init__(self, buffer_size, batch_size, env): 8 | self.buffer_size = buffer_size 9 | self.batch_size = batch_size 10 | self.env = env 11 | self.n_state = env.get_state_size() 12 | self.n_action = env.get_action_size() 13 | 14 | self.state = deque(maxlen=buffer_size) 15 | self.action = deque(maxlen=buffer_size) 16 | self.reward = deque(maxlen=buffer_size) 17 | self.next_state = deque(maxlen=buffer_size) 18 | self.avail_action = deque(maxlen=buffer_size) 19 | self.IDs = deque(maxlen=buffer_size) 20 | 21 | def can_sample(self): 22 | return self.batch_size <= len(self.state) 23 | 24 | def sample(self): 25 | if self.can_sample(): 26 | indices = np.random.choice(len(self.state), self.batch_size, replace=False) 27 | batch_state = [self.state[i] for i in indices] 28 | batch_action = [self.action[i] for i in indices] 29 | batch_reward = [self.reward[i] for i in indices] 30 | batch_next_state = [self.next_state[i] for i in indices] 31 | batch_avail_action = [self.avail_action[i] for i in indices] 32 | batch_IDs = [self.IDs[i] for i in indices] 33 | 34 | return self.env.decode_batch_state(np.array(batch_state)), \ 35 | np.array(batch_action), \ 36 | np.array(batch_reward), \ 37 | self.env.decode_batch_state(np.array(batch_next_state)), \ 38 | np.array(batch_avail_action), \ 39 | np.array(batch_IDs) 40 | return None 41 | 42 | def store(self, state, action, reward, next_state, 
avail_action): 43 | self.state.append(self.env.encode_state(state)) 44 | self.action.append(action) 45 | self.reward.append(reward) 46 | self.next_state.append(self.env.encode_state(next_state)) 47 | self.avail_action.append(avail_action) 48 | self.IDs.append(self.env.ID) -------------------------------------------------------------------------------- /components/episodebuffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Episode: 4 | def __init__(self, env): 5 | self.n_state = env.get_state_size() 6 | self.n_action = env.get_action_size() 7 | 8 | def update(self, states, actions, rewards, next_states, avail_actions, ID): 9 | self.states = states 10 | self.actions = actions 11 | self.rewards = rewards 12 | self.next_states = next_states 13 | self.avail_actions = avail_actions 14 | self.ID = ID 15 | 16 | class ReplayBuffer: 17 | def __init__(self, buffer_size, batch_size): 18 | self.buffer_size = buffer_size 19 | self.batch_size = batch_size 20 | self.episode_in_buffer = 0 21 | self.buffer = [] 22 | 23 | def can_sample(self): 24 | return self.batch_size < self.episode_in_buffer 25 | 26 | def sample(self): 27 | if self.can_sample(): 28 | indices = np.random.choice(self.episode_in_buffer, self.batch_size, replace=False) 29 | episodes = [self.buffer[i] for i in indices] 30 | return episodes 31 | return None 32 | 33 | def insert_an_episode(self, episode): 34 | self.buffer.append(episode) 35 | self.episode_in_buffer = self.episode_in_buffer + 1 if self.episode_in_buffer < self.buffer_size else self.buffer_size 36 | 37 | def get_IDs(self): 38 | return [episode.ID for episode in self.buffer] -------------------------------------------------------------------------------- /env/environment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as stats 3 | import matplotlib.pyplot as plt 4 | import networkx as nx 5 | import random 6 | import time 7 | 8 | class Constant: 9 | c_max = 5e10 # maximum CPU frequency 10 | r_max = 8e6 # maximum transmission rate 11 | s_max = 5e10 # maximum storage capacity 12 | 13 | 14 | def normalize(data): 15 | assert type(data) == np.ndarray, "data must be numpy ndarray" 16 | return (data - np.min(data)) / (np.max(data) - np.min(data)) 17 | 18 | 19 | class Environment: 20 | 21 | def __init__(self): 22 | self.ID = 0 # unique environment identifier 23 | self.adjs = {} # adjacency matrices of all task graphs 24 | 25 | # self.min_num_nodes = 80 # minimum number of nodes 26 | # self.max_num_nodes = 100 # maximum number of nodes 27 | # self.min_num_edges = 200 # minimum number of edges 28 | # self.max_num_edges = 250 # maximum number of edges 29 | self.min_num_nodes = 100 30 | self.max_num_nodes = 100 31 | self.min_num_edges = 250 32 | self.max_num_edges = 250 33 | self.M = 5 # number of edge base stations 34 | pass 35 | 36 | def generate_dag(self, num_nodes, num_edges): 37 | # Create a directed acyclic graph 38 | G = nx.DiGraph() 39 | G.add_nodes_from(range(num_nodes)) 40 | while G.number_of_edges() < num_edges: 41 | a, b = np.random.randint(0, num_nodes, size=2) 42 | if a != b and not G.has_edge(a, b): 43 | G.add_edge(a, b) 44 | if not nx.is_directed_acyclic_graph(G): 45 | G.remove_edge(a, b) 46 | 47 | return G 48 | 49 | def generate_tolerance(self, G, queue): 50 | # truncated normal distribution 51 | lower, upper = 1, 6 52 | mu, sigma = 4, 1 53 | X = stats.truncnorm((lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma) 54 | # plot 55 | # plt.hist(X.rvs(1000), bins=100, density=True) 56 | tolerance = np.zeros(len(queue)) 57 | for task_idx in queue: 58 | tolerance[task_idx] = X.rvs() 59 | dependencies = list(G.predecessors(task_idx))
60 | if len(dependencies) > 0: 61 | tolerance[task_idx] = tolerance[task_idx] + max([tolerance[dep] for dep in dependencies]) 62 | sorted_tolerance = np.array([tolerance[task_idx] for task_idx in queue]) 63 | return sorted_tolerance 64 | 65 | 66 | def generate_task(self): 67 | # task topology graph (DAG) 68 | self.G = self.generate_dag(self.num_nodes, self.num_edges) 69 | self.adjs[self.ID] = np.pad(nx.to_numpy_array(self.G, dtype=int), \ 70 | ((0, self.max_num_nodes - self.num_nodes), (0, self.max_num_nodes - self.num_nodes))) 71 | 72 | # scheduling queue 73 | self.queue = np.array(list(nx.topological_sort(self.G))) 74 | 75 | # task attributes 76 | self.data_size = np.random.uniform(8e5, 1.6e6, self.num_nodes) 77 | self.cpu_cycles = np.random.uniform(2e8, 2e9, self.num_nodes) 78 | self.tolerance = self.generate_tolerance(self.G, self.queue) 79 | 80 | # task status information 81 | self.current_idx = 0 82 | self.completed = [False] * self.num_nodes # whether each task is completed 83 | self.T_com = np.zeros(self.num_nodes) # completion time of each task 84 | self.free = [False] * self.num_nodes # whether each task has released its resources 85 | self.off_dev = [-1] * self.num_nodes # device each task is offloaded to 86 | pass 87 | 88 | 89 | def init_cluster(self): 90 | # TODO: the data design is not reasonable 91 | 92 | # local 93 | self.local_cpu_cycles = np.random.uniform(1e8, 2e8) # CPU frequency for processing tasks locally 94 | self.local_storage = np.random.uniform(1e8, 2e8) # local storage capacity 95 | # edge 96 | self.edge_cpu_cycles = np.zeros(self.M) # CPU frequency of each base station 97 | self.edge_trans_rate = np.zeros(self.M) # transmission rate of each base station 98 | self.edge_storage = np.zeros(self.M) # storage capacity of each base station 99 | delta_edge_cpu_cycles = (2e9 - 1e9) / self.M 100 | delta_edge_trans_rate = (4e6 - 2e6) / self.M 101 | delta_edge_storage = (2e9 - 1e9) / self.M 102 | for i in range(self.M): 103 | self.edge_cpu_cycles[i] = np.random.uniform(1e9 + i*delta_edge_cpu_cycles, 1e9 + (i+1)*delta_edge_cpu_cycles) 104 | self.edge_trans_rate[i] = np.random.uniform(2e6 + i*delta_edge_trans_rate, 2e6 + (i+1)*delta_edge_trans_rate) 105 | self.edge_storage[i] = np.random.uniform(1e9 + i*delta_edge_storage, 1e9 + (i+1)*delta_edge_storage) 106 | # cloud 107 | self.cloud_trans_rate = np.random.uniform(2.4e6, 4.8e6) # cloud transmission rate 108 | self.cloud_fixed_time = 3 # fixed transmission delay 109 | pass 110 | 111 | def reset(self, seed=0): 112 | # to reduce training complexity: e.g., the number of nodes has 5 candidate values and the number of edges has 5 candidate values 113 | random.seed(seed) 114 | np.random.seed(seed) 115 | 116 | self.ID = self.ID + 1 117 | self.num_nodes = random.choice(range(self.min_num_nodes, self.max_num_nodes + 5, 5)) 118 | self.num_edges = random.choice(range(self.min_num_edges, self.max_num_edges + 10, 10)) 119 | 120 | self.generate_task() 121 | self.init_cluster() 122 | self.done = False 123 | 124 | return self.get_state() 125 | 126 | def step(self, action): 127 | """ 128 | action: [0, 1, 2, ... 
M, M+1] 129 | edge: 0, 1, 2, ..., M-1 130 | local: M 131 | cloud: M+1 132 | """ 133 | task_idx = self.current_idx 134 | 135 | # local 136 | if action == self.M: 137 | if task_idx == 0: 138 | self.T_com[task_idx] = self.cpu_cycles[task_idx] / self.local_cpu_cycles 139 | else: 140 | # previous task in the scheduling queue 141 | last_T = self.T_com[task_idx-1] 142 | # predecessor tasks in the graph 143 | pred_T = max(self.T_com[:task_idx]) 144 | self.T_com[task_idx] = max(self.cpu_cycles[task_idx] / self.local_cpu_cycles + pred_T, last_T) 145 | self.local_storage = self.local_storage - self.data_size[task_idx] 146 | self.off_dev[task_idx] = self.M # the local device is recorded as index M (edge devices use 0..M-1) 147 | # cloud 148 | elif action == self.M + 1: 149 | if task_idx == 0: 150 | self.T_com[task_idx] = self.data_size[task_idx] / self.cloud_trans_rate + self.cloud_fixed_time 151 | else: 152 | last_T = self.T_com[task_idx-1] 153 | pred_T = max(self.T_com[:task_idx]) 154 | self.T_com[task_idx] = max(self.data_size[task_idx] / self.cloud_trans_rate + self.cloud_fixed_time + pred_T, last_T) 155 | self.off_dev[task_idx] = self.M + 1 156 | # edge 157 | else: 158 | edge_idx = action 159 | if task_idx == 0: 160 | self.T_com[task_idx] = self.cpu_cycles[task_idx] / self.edge_cpu_cycles[edge_idx] + self.data_size[task_idx] / self.edge_trans_rate[edge_idx] 161 | else: 162 | last_T = self.T_com[task_idx-1] 163 | pred_T = max(self.T_com[:task_idx]) 164 | self.T_com[task_idx] = max(self.cpu_cycles[task_idx] / self.edge_cpu_cycles[edge_idx] + self.data_size[task_idx] / self.edge_trans_rate[edge_idx] + pred_T, last_T) 165 | self.edge_storage[edge_idx] = self.edge_storage[edge_idx] - self.data_size[task_idx] 166 | self.off_dev[task_idx] = edge_idx 167 | 168 | # update task status and release the storage held by finished tasks 169 | for idx in range(task_idx + 1): 170 | if self.T_com[idx] > 0: 171 | self.completed[idx] = True 172 | for idx in range(task_idx + 1): 173 | if self.completed[idx] and not self.free[idx]: 174 | if self.off_dev[idx] == self.M: 175 | self.local_storage = self.local_storage + self.data_size[idx] 176 | elif self.off_dev[idx] == self.M + 1: 177 | pass 178 | else: 179 | edge_idx = self.off_dev[idx] 180 | self.edge_storage[edge_idx] = self.edge_storage[edge_idx] + self.data_size[idx] 181 | self.free[idx] = True 182 | 183 | self.current_idx = self.current_idx + 1 184 | if self.current_idx == len(self.G.nodes): 185 | self.done = True 186 | return self.get_state(), self.get_reward(), self.done 187 | 188 | 189 | def get_state(self): 190 | task_idx = np.zeros(self.max_num_nodes) 191 | if self.current_idx < self.num_nodes: 192 | task_idx[self.current_idx] = 1.0 193 | task_info_padding = np.stack([np.pad(normalize(self.data_size), (0, self.max_num_nodes - self.num_nodes)), \ 194 | np.pad(normalize(self.cpu_cycles), (0, self.max_num_nodes - self.num_nodes)), \ 195 | np.pad(normalize(self.tolerance), (0, self.max_num_nodes - self.num_nodes))], axis=1) 196 | 197 | dev_cpu_cycles = np.append(np.append(self.local_cpu_cycles, self.edge_cpu_cycles), Constant.c_max) 198 | dev_trans_rate = np.append(np.append(Constant.r_max, self.edge_trans_rate), self.cloud_trans_rate) 199 | dev_storage = np.append(np.append(self.local_storage, self.edge_storage), Constant.s_max) 200 | dev_info = np.stack([normalize(dev_cpu_cycles), normalize(dev_trans_rate), normalize(dev_storage)], axis=1) 201 | 202 | state = (task_idx, task_info_padding, dev_info) 203 | return state 204 | 205 | def get_reward(self): 206 | # TODO: the reward design is not reasonable and is hard to use for reinforcement learning training 207 | # DVR 208 | # note: this uses self.current_idx - 1, not self.current_idx 209 | task_idx = self.current_idx - 1 210 | r1 = (self.tolerance[task_idx] - self.T_com[task_idx]) / 
sum(self.tolerance) 211 | return r1 212 | 213 | def get_state_size(self): 214 | """ 215 | task_idx + task_info + dev_info 216 | """ 217 | return self.max_num_nodes + self.max_num_nodes * 3 + (self.M + 2) * 3 218 | 219 | def get_action_size(self): 220 | return self.M + 2 221 | 222 | def encode_state(self, state): 223 | task_idx, task_info, dev_info = state 224 | return np.hstack((task_idx.flatten(), task_info.flatten(), dev_info.flatten())) 225 | 226 | def encode_batch_state(self, batch_state): 227 | task_idx, task_info, dev_info = batch_state 228 | batch_size = task_idx.shape[0] 229 | return np.hstack((task_idx.reshape(batch_size, -1), task_info.reshape((batch_size, -1)), dev_info.reshape((batch_size, -1)))) 230 | 231 | def decode_batch_state(self, batch_state): 232 | max_num_nodes = self.max_num_nodes 233 | M = self.M 234 | 235 | task_idx_dim = max_num_nodes 236 | task_info_dim = max_num_nodes * 3 237 | dev_info_dim = (M + 2) * 3 238 | 239 | task_idx = np.array([item.reshape(max_num_nodes, 1) for item in batch_state[:, :task_idx_dim]]) 240 | task_info = np.array([item.reshape(max_num_nodes, 3) for item in batch_state[:, task_idx_dim:task_idx_dim+task_info_dim]]) 241 | dev_info = np.array([item.reshape((M+2), 3) for item in batch_state[:, task_idx_dim+task_info_dim:task_idx_dim+task_info_dim+dev_info_dim]]) 242 | return (task_idx, task_info, dev_info) 243 | 244 | def get_avail_actions(self): 245 | avail_actions = np.ones(self.M + 2) 246 | task_idx = self.current_idx 247 | if self.data_size[task_idx] > self.local_storage: 248 | avail_actions[self.M] = 0 # local execution (action M) is unavailable when local storage is insufficient 249 | for edge_idx in range(self.M): 250 | if self.data_size[task_idx] > self.edge_storage[edge_idx]: 251 | avail_actions[edge_idx] = 0 # edge m (action m) is unavailable when its storage is insufficient 252 | return avail_actions 253 | 254 | def get_metric(self): 255 | task_idx = self.current_idx 256 | dvr_count = 0 257 | for idx in range(task_idx): 258 | if self.T_com[idx] > self.tolerance[idx]: 259 | dvr_count = dvr_count + 1 260 | return dvr_count / len(self.G.nodes) 261 | 262 | def update_adjs(self, IDs): 263 | self.adjs = {key: value for key, value in self.adjs.items() if key in IDs} 264 | 265 | def log(self): 266 | # tasks 267 | print("scheduling order:", self.queue) 268 | print("task data sizes:", self.data_size) 269 | print("task CPU cycles:", self.cpu_cycles) 270 | print("task tolerance times:", self.tolerance) 271 | # devices 272 | print("local CPU frequency:", self.local_cpu_cycles) 273 | print("local storage capacity:", self.local_storage) 274 | print("edge base station CPU frequencies:", self.edge_cpu_cycles) 275 | print("edge base station transmission rates:", self.edge_trans_rate) 276 | print("edge base station storage capacities:", self.edge_storage) 277 | print("cloud transmission rate:", self.cloud_trans_rate) 278 | print("fixed transmission delay:", self.cloud_fixed_time) 279 | 280 | def plot_task(self): 281 | plt.figure(figsize=(8, 6)) 282 | pos = nx.spring_layout(self.G) # Positions for all nodes 283 | nx.draw(self.G, pos, with_labels=True, node_color='skyblue', edge_color='k', node_size=500, font_size=16, arrows=True) 284 | plt.title("Directed Acyclic Graph (DAG)") 285 | plt.show() 286 | 287 | 288 | def main(): 289 | env = Environment() 290 | env.reset() 291 | env.log() 292 | env.plot_task() 293 | 294 | if __name__ == "__main__": 295 | main() -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import time 2 | from env.environment import Environment 3 | from modules.agents.mlpagent import MLPAgent 4 | from modules.agents.gatagent import GATAgent 5 | from modules.agents.tomagent import ToMAgent 6 | from modules.agents.baselines import LocalAgent, EdgeAgent, 
CloudAgent, RandomAgent, GreedyAgent 7 | 8 | def main(name="greedy", seed=0): 9 | env = Environment() 10 | 11 | if name == "local": 12 | agent = LocalAgent() 13 | elif name == "edge": 14 | agent = EdgeAgent() 15 | elif name == "cloud": 16 | agent = CloudAgent() 17 | elif name == "random": 18 | agent = RandomAgent() 19 | elif name == "greedy": 20 | agent = GreedyAgent() 21 | elif name == "mlp": 22 | agent = MLPAgent(env) 23 | # TODO: set your model path 24 | path = "" 25 | if path != "": 26 | agent.load_models(path) 27 | elif name == "gat": 28 | agent = GATAgent(env) 29 | path = "" 30 | if path != "": 31 | agent.load_models(path) 32 | elif name == "tom": 33 | agent = ToMAgent(env) 34 | path = "" 35 | if path != "": 36 | agent.load_models(path) 37 | agent.init_hidden() 38 | 39 | state = env.reset(seed) 40 | ep_reward = 0 41 | while not env.done: 42 | avail_action = env.get_avail_actions() 43 | action = agent.choose_action(state, avail_action, evaluate=True) 44 | 45 | state, reward, done = env.step(action) 46 | 47 | ep_reward = ep_reward + reward 48 | return env.get_metric(), ep_reward 49 | 50 | if __name__ == "__main__": 51 | start_time = time.time() 52 | for name in ["local", "edge", "cloud", "random", "greedy", "mlp", "gat", "tom"]: 53 | episodes = 100 54 | dvr_rate_mean = 0 55 | ep_reward_mean = 0 56 | for i in range(episodes): 57 | dvr_rate, ep_reward = main(name, int(start_time)+i) 58 | dvr_rate_mean += dvr_rate / episodes 59 | ep_reward_mean += ep_reward / episodes 60 | print("agent: {}\t, dvr_rate_mean: {}\t\t, ep_reward_mean: {}".format(name, dvr_rate_mean, ep_reward_mean)) -------------------------------------------------------------------------------- /modules/agents/baselines.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class LocalAgent: 4 | def __init__(self): 5 | pass 6 | 7 | def choose_action(self, state, avail_action, evaluate=False): 8 | action = len(avail_action) - 2 9 | return action 10 | 11 | class EdgeAgent: 12 | def __init__(self): 13 | pass 14 | 15 | def choose_action(self, state, avail_action, evaluate=False): 16 | M = len(avail_action) - 2 17 | avail_action = avail_action[:M] 18 | action = np.random.choice(len(avail_action), p=avail_action/sum(avail_action)) 19 | return action 20 | 21 | class CloudAgent: 22 | def __init__(self): 23 | pass 24 | 25 | def choose_action(self, state, avail_action, evaluate=False): 26 | action = len(avail_action) - 1 27 | return action 28 | 29 | class RandomAgent: 30 | def __init__(self): 31 | pass 32 | 33 | def choose_action(self, state, avail_action, evaluate=False): 34 | action = np.random.choice(len(avail_action), p=avail_action/sum(avail_action)) 35 | return action 36 | 37 | class GreedyAgent: 38 | def __init__(self): 39 | pass 40 | 41 | def choose_action(self, state, avail_action, evaluate=False): 42 | M = len(avail_action) - 2 43 | task_idx, task_info, dev_info = state 44 | w_comp = 0.8 45 | w_trans = 0.2 46 | v_comp = dev_info[:M, 0] / sum(dev_info[:M, 0]) 47 | v_trans = dev_info[:M, 1] / sum(dev_info[:M, 1]) 48 | action = np.random.choice(M, p=w_comp*v_comp+w_trans*v_trans) 49 | return action -------------------------------------------------------------------------------- /modules/agents/gatagent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import os 6 | import time 7 | 8 | from utils.scheduler import LinearSchedule 9 
| from utils.policy import GATPolicy 10 | from components.buffer import ReplayBuffer 11 | 12 | class GATAgent: 13 | def __init__(self, env): 14 | super(GATAgent, self).__init__() 15 | self.env = env 16 | self.n_state = self.env.get_state_size() 17 | self.n_action = self.env.get_action_size() 18 | 19 | # TODO: hyper-parameters should be fine-tuned 20 | self.buffer_size = 50000 # 500 episodes * 100 nodes 21 | self.batch_size = 256 22 | self.lr = 0.0005 23 | self.gamma = 0.99 24 | self.epsilon_start = 0.0 25 | self.epsilon_finish = 0.99 26 | self.epsilon_time_length = 5000 # 50 episodes * 100 nodes 27 | self.epsilon_schedule = LinearSchedule(self.epsilon_start, self.epsilon_finish, self.epsilon_time_length) 28 | self.target_update_interval = 2000 # update target network every 20 episodes 29 | self.grad_norm_clip = 10 # avoid gradient explode 30 | 31 | self.net = GATPolicy(3, self.n_action, self.env.max_num_nodes, self.env.M) 32 | self.target_net = GATPolicy(3, self.n_action, self.env.max_num_nodes, self.env.M) 33 | 34 | self.learn_step_counter = 0 35 | self.buffer = ReplayBuffer(self.buffer_size, self.batch_size, self.env) 36 | self.params = list(self.net.parameters()) 37 | self.optimizer = torch.optim.RMSprop(params=self.params, lr=self.lr) 38 | 39 | 40 | def choose_action(self, state, avail_action, t=0, evaluate=False): 41 | if evaluate: 42 | epsilon = 1.0 43 | else: 44 | epsilon = self.epsilon_schedule.eval(t) 45 | task_idx, task_info, dev_info = state 46 | adj = self.env.adjs[self.env.ID] 47 | task_idx = torch.unsqueeze(torch.FloatTensor(task_idx), 0) 48 | task_info = torch.unsqueeze(torch.FloatTensor(task_info), 0) 49 | dev_info = torch.unsqueeze(torch.FloatTensor(dev_info), 0) 50 | adj = torch.unsqueeze(torch.FloatTensor(adj), 0) 51 | action_value = self.net.forward(task_idx, task_info, dev_info, adj) 52 | 53 | action_value = action_value.squeeze() 54 | action_value[avail_action == 0] = -9999999 55 | if np.random.randn() <= epsilon: # greedy policy 56 | action = torch.max(action_value, dim=0)[1].data.numpy() 57 | else: # random policy 58 | action = np.random.choice(self.n_action, p=avail_action/sum(avail_action)) 59 | return action 60 | 61 | 62 | def learn(self): 63 | 64 | #update target parameters 65 | if self.learn_step_counter % self.target_update_interval ==0: 66 | self.target_net.load_state_dict(self.net.state_dict()) 67 | self.learn_step_counter+=1 68 | 69 | # sample from replay buffer 70 | batch_state, batch_action, batch_reward, batch_next_state, batch_avail_action, batch_IDs = self.buffer.sample() 71 | idx, x, y = batch_state 72 | target_idx, target_x, target_y = batch_next_state 73 | adj = np.array([self.env.adjs[ID] for ID in batch_IDs]) 74 | batch_action = torch.LongTensor(batch_action.astype(int)) 75 | batch_reward = torch.FloatTensor(batch_reward) 76 | batch_avail_action = torch.FloatTensor(batch_avail_action) 77 | q = torch.gather(self.net(torch.FloatTensor(idx), torch.FloatTensor(x), torch.FloatTensor(y), torch.FloatTensor(adj)), dim=1, index=batch_action.unsqueeze(1)) 78 | q_next = self.target_net(torch.FloatTensor(target_idx), torch.FloatTensor(target_x), torch.FloatTensor(target_y), torch.FloatTensor(adj)).detach() 79 | q_next[batch_avail_action == 0] = -9999999 80 | q_target = batch_reward.view(self.batch_size, 1) + self.gamma * q_next.max(1)[0].view(self.batch_size, 1) 81 | 82 | loss = F.mse_loss(q, q_target) 83 | 84 | # update parameters 85 | self.optimizer.zero_grad() 86 | grad_norm = torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.grad_norm_clip) 87 | 
if grad_norm > 0: 88 | print("grad_norm:", grad_norm) 89 | loss.backward() 90 | self.optimizer.step() 91 | 92 | 93 | def store_transition(self, state, action, reward, next_state, avail_action): 94 | self.buffer.store(state, action, reward, next_state, avail_action) 95 | 96 | 97 | def save_models(self, path): 98 | if not os.path.exists(path): 99 | os.makedirs(path) 100 | torch.save(self.net.state_dict(), "{}/net.th".format(path)) 101 | torch.save(self.optimizer.state_dict(), "{}/opt.th".format(path)) 102 | 103 | 104 | def load_models(self, path): 105 | self.net.load_state_dict(torch.load("{}/net.th".format(path), map_location=lambda storage, loc: storage)) 106 | self.optimizer.load_state_dict(torch.load("{}/opt.th".format(path), map_location=lambda storage, loc: storage)) -------------------------------------------------------------------------------- /modules/agents/gcnagent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import os 6 | import time 7 | 8 | from utils.scheduler import LinearSchedule 9 | from utils.policy import GCNPolicy 10 | from components.buffer import ReplayBuffer 11 | 12 | class GCNAgent: 13 | def __init__(self, env): 14 | super(GCNAgent, self).__init__() 15 | self.env = env 16 | self.n_state = self.env.get_state_size() 17 | self.n_action = self.env.get_action_size() 18 | 19 | # TODO: hyper-parameters should be fine-tuned 20 | self.buffer_size = 50000 # 500 episodes * 100 nodes 21 | self.batch_size = 256 22 | self.lr = 0.0005 23 | self.gamma = 0.99 24 | self.epsilon_start = 0.0 25 | self.epsilon_finish = 0.99 26 | self.epsilon_time_length = 5000 # 50 episodes * 100 nodes 27 | self.epsilon_schedule = LinearSchedule(self.epsilon_start, self.epsilon_finish, self.epsilon_time_length) 28 | self.target_update_interval = 2000 # update target network every 20 episodes 29 | self.grad_norm_clip = 10 # avoid gradient explode 30 | 31 | self.net = GCNPolicy(3, self.n_action, self.env.max_num_nodes, self.env.M) 32 | self.target_net = GCNPolicy(3, self.n_action, self.env.max_num_nodes, self.env.M) 33 | 34 | self.learn_step_counter = 0 35 | self.buffer = ReplayBuffer(self.buffer_size, self.batch_size, self.env) 36 | self.params = list(self.net.parameters()) 37 | self.optimizer = torch.optim.RMSprop(params=self.params, lr=self.lr) 38 | 39 | 40 | def choose_action(self, state, avail_action, t=0, evaluate=False): 41 | if evaluate: 42 | epsilon = 1.0 43 | else: 44 | epsilon = self.epsilon_schedule.eval(t) 45 | task_idx, task_info, dev_info = state 46 | adj = self.env.adjs[self.env.ID] 47 | task_idx = torch.unsqueeze(torch.FloatTensor(task_idx), 0) 48 | task_info = torch.unsqueeze(torch.FloatTensor(task_info), 0) 49 | dev_info = torch.unsqueeze(torch.FloatTensor(dev_info), 0) 50 | adj = torch.unsqueeze(torch.FloatTensor(adj), 0) 51 | action_value = self.net.forward(task_idx, task_info, dev_info, adj) 52 | 53 | action_value = action_value.squeeze() 54 | action_value[avail_action == 0] = -9999999 55 | if np.random.randn() <= epsilon: # greedy policy 56 | action = torch.max(action_value, dim=0)[1].data.numpy() 57 | else: # random policy 58 | action = np.random.choice(self.n_action, p=avail_action/sum(avail_action)) 59 | return action 60 | 61 | 62 | def learn(self): 63 | 64 | #update target parameters 65 | if self.learn_step_counter % self.target_update_interval ==0: 66 | self.target_net.load_state_dict(self.net.state_dict()) 67 | 
self.learn_step_counter+=1 68 | 69 | # sample from replay buffer 70 | batch_state, batch_action, batch_reward, batch_next_state, batch_avail_action, batch_IDs = self.buffer.sample() 71 | idx, x, y = batch_state 72 | target_idx, target_x, target_y = batch_next_state 73 | adj = np.array([self.env.adjs[ID] for ID in batch_IDs]) 74 | batch_action = torch.LongTensor(batch_action.astype(int)) 75 | batch_reward = torch.FloatTensor(batch_reward) 76 | batch_avail_action = torch.FloatTensor(batch_avail_action) 77 | q = torch.gather(self.net(torch.FloatTensor(idx), torch.FloatTensor(x), torch.FloatTensor(y), torch.FloatTensor(adj)), dim=1, index=batch_action.unsqueeze(1)) 78 | q_next = self.target_net(torch.FloatTensor(target_idx), torch.FloatTensor(target_x), torch.FloatTensor(target_y), torch.FloatTensor(adj)).detach() 79 | q_next[batch_avail_action == 0] = -9999999 80 | q_target = batch_reward.view(self.batch_size, 1) + self.gamma * q_next.max(1)[0].view(self.batch_size, 1) 81 | 82 | loss = F.mse_loss(q, q_target) 83 | 84 | # update parameters 85 | self.optimizer.zero_grad() 86 | grad_norm = torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.grad_norm_clip) 87 | if grad_norm > 0: 88 | print("grad_norm:", grad_norm) 89 | loss.backward() 90 | self.optimizer.step() 91 | 92 | 93 | def store_transition(self, state, action, reward, next_state, avail_action): 94 | self.buffer.store(state, action, reward, next_state, avail_action) 95 | 96 | 97 | def save_models(self, path): 98 | if not os.path.exists(path): 99 | os.makedirs(path) 100 | torch.save(self.net.state_dict(), "{}/net.th".format(path)) 101 | torch.save(self.optimizer.state_dict(), "{}/opt.th".format(path)) 102 | 103 | 104 | def load_models(self, path): 105 | self.net.load_state_dict(torch.load("{}/net.th".format(path), map_location=lambda storage, loc: storage)) 106 | self.optimizer.load_state_dict(torch.load("{}/opt.th".format(path), map_location=lambda storage, loc: storage)) -------------------------------------------------------------------------------- /modules/agents/mlpagent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import os 6 | import time 7 | 8 | from utils.scheduler import LinearSchedule 9 | from utils.policy import MLPPolicy 10 | from components.buffer import ReplayBuffer 11 | 12 | class MLPAgent: 13 | def __init__(self, env): 14 | super(MLPAgent, self).__init__() 15 | self.env = env 16 | self.n_state = self.env.get_state_size() 17 | self.n_action = self.env.get_action_size() 18 | 19 | # TODO: hyper-parameters should be fine-tuned 20 | self.buffer_size = 50000 # 500 episodes * 100 nodes 21 | self.batch_size = 256 22 | self.lr = 0.0005 23 | self.gamma = 0.99 24 | self.epsilon_start = 0.0 25 | self.epsilon_finish = 0.99 26 | self.epsilon_time_length = 5000 # 50 episodes * 100 nodes 27 | self.epsilon_schedule = LinearSchedule(self.epsilon_start, self.epsilon_finish, self.epsilon_time_length) 28 | self.target_update_interval = 2000 # update target network every 20 episodes 29 | self.grad_norm_clip = 10 # avoid gradient explode 30 | 31 | self.net = MLPPolicy(self.n_state, self.n_action) 32 | self.target_net = MLPPolicy(self.n_state, self.n_action) 33 | 34 | self.learn_step_counter = 0 35 | self.buffer = ReplayBuffer(self.buffer_size, self.batch_size, self.env) 36 | self.params = list(self.net.parameters()) 37 | self.optimizer = torch.optim.RMSprop(params=self.params, lr=self.lr) 38 | 
39 | 40 | def choose_action(self, state, avail_action, t=0, evaluate=False): 41 | if evaluate: 42 | epsilon = 1.0 43 | else: 44 | epsilon = self.epsilon_schedule.eval(t) 45 | inputs = torch.FloatTensor(self.env.encode_state(state)).unsqueeze(0) 46 | action_value = self.net.forward(inputs) 47 | action_value = action_value.squeeze() 48 | action_value[avail_action == 0] = -9999999 49 | if np.random.randn() <= epsilon: # greedy policy 50 | action = torch.max(action_value, dim=0)[1].item() 51 | else: # random policy 52 | action = int(np.random.choice(self.n_action, p=avail_action/sum(avail_action))) 53 | return action 54 | 55 | 56 | def learn(self): 57 | 58 | #update target parameters 59 | if self.learn_step_counter % self.target_update_interval ==0: 60 | self.target_net.load_state_dict(self.net.state_dict()) 61 | self.learn_step_counter+=1 62 | 63 | # sample from replay buffer 64 | batch_state, batch_action, batch_reward, batch_next_state, batch_avail_action, batch_IDs = self.buffer.sample() 65 | 66 | batch_state = torch.FloatTensor(self.env.encode_batch_state(batch_state)) 67 | batch_next_state = torch.FloatTensor(self.env.encode_batch_state(batch_next_state)) 68 | batch_action = torch.LongTensor(batch_action.astype(int)) 69 | batch_reward = torch.FloatTensor(batch_reward) 70 | batch_avail_action = torch.FloatTensor(batch_avail_action) 71 | 72 | # calculate loss 73 | q = torch.gather(self.net(batch_state), dim=1, index=batch_action.unsqueeze(1)) 74 | q_next = self.target_net(batch_next_state).detach() 75 | q_next[batch_avail_action == 0] = -9999999 76 | q_target = batch_reward.view(self.batch_size, 1) + self.gamma * q_next.max(1)[0].view(self.batch_size, 1) 77 | loss = F.mse_loss(q, q_target) 78 | 79 | # update parameters 80 | self.optimizer.zero_grad() 81 | grad_norm = torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.grad_norm_clip) 82 | if grad_norm > 0: 83 | print("grad_norm:", grad_norm) 84 | loss.backward() 85 | self.optimizer.step() 86 | 87 | 88 | def store_transition(self, state, action, reward, next_state, avail_action): 89 | self.buffer.store(state, action, reward, next_state, avail_action) 90 | 91 | 92 | def save_models(self, path): 93 | if not os.path.exists(path): 94 | os.makedirs(path) 95 | torch.save(self.net.state_dict(), "{}/net.th".format(path)) 96 | torch.save(self.optimizer.state_dict(), "{}/opt.th".format(path)) 97 | 98 | 99 | def load_models(self, path): 100 | self.net.load_state_dict(torch.load("{}/net.th".format(path), map_location=lambda storage, loc: storage)) 101 | self.optimizer.load_state_dict(torch.load("{}/opt.th".format(path), map_location=lambda storage, loc: storage)) -------------------------------------------------------------------------------- /modules/agents/tomagent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import os 6 | import time 7 | 8 | from utils.scheduler import LinearSchedule 9 | from utils.policy import MLPPolicy 10 | from components.episodebuffer import ReplayBuffer 11 | from modules.tom.observer import Observer 12 | 13 | class ToMAgent: 14 | def __init__(self, env): 15 | super(ToMAgent, self).__init__() 16 | self.env = env 17 | self.n_state = self.env.get_state_size() 18 | self.n_action = self.env.get_action_size() 19 | self.observer = Observer(env) 20 | self.character_dim = self.observer.cnet.character_dim 21 | self.mental_dim = self.observer.mnet.mental_dim 22 | 23 | # TODO: 
hyper-parameters should be fine-tuned 24 | self.buffer_size = 5000 # 5000 episodes 25 | self.batch_size = 64 26 | self.lr = 0.0005 27 | self.gamma = 0.99 28 | self.epsilon_start = 0.0 29 | self.epsilon_finish = 0.99 30 | self.epsilon_time_length = 10000 # 100 episodes * 100 nodes 31 | self.epsilon_schedule = LinearSchedule(self.epsilon_start, self.epsilon_finish, self.epsilon_time_length) 32 | self.target_update_interval = 50 # update target network every 50 episodes 33 | self.grad_norm_clip = 10 # avoid gradient explode 34 | 35 | self.net = MLPPolicy(self.n_state + self.character_dim + self.mental_dim, self.n_action) 36 | self.target_net = MLPPolicy(self.n_state + self.character_dim + self.mental_dim, self.n_action) 37 | 38 | self.learn_step_counter = 0 39 | self.buffer = ReplayBuffer(self.buffer_size, self.batch_size) 40 | self.params = list(self.net.parameters()) + list(self.observer.cnet.parameters()) + list(self.observer.mnet.parameters()) 41 | self.optimizer = torch.optim.RMSprop(params=self.params, lr=self.lr) 42 | 43 | self.e_character = None 44 | self.e_mental = None 45 | self.hidden_state = None 46 | 47 | 48 | def choose_action(self, state, avail_action, t=0, evaluate=False): 49 | if self.e_character is None: 50 | self.e_character = torch.zeros(1, self.observer.cnet.character_dim) 51 | if self.e_mental is None: 52 | self.e_mental = torch.zeros(1, self.observer.mnet.mental_dim) 53 | if evaluate: 54 | epsilon = 1.0 55 | else: 56 | epsilon = self.epsilon_schedule.eval(t) 57 | 58 | inputs = torch.cat([torch.FloatTensor(self.env.encode_state(state)).unsqueeze(0), self.e_character, self.e_mental], dim=-1) 59 | action_value = self.net.forward(inputs) 60 | action_value = action_value.squeeze() 61 | action_value[avail_action == 0] = -9999999 62 | if np.random.randn() <= epsilon: # greedy policy 63 | action = torch.max(action_value, dim=0)[1].item() 64 | else: # random policy 65 | action = int(np.random.choice(self.n_action, p=avail_action/sum(avail_action))) 66 | 67 | # calculate mental embedding 68 | self.e_mental, self.hidden_state = self.observer.calc_mental(state, action, self.e_character, self.hidden_state) 69 | 70 | return action 71 | 72 | 73 | def learn(self): 74 | 75 | #update target parameters 76 | if self.learn_step_counter % self.target_update_interval ==0: 77 | self.target_net.load_state_dict(self.net.state_dict()) 78 | self.learn_step_counter+=1 79 | 80 | # sample from replay buffer 81 | episodes = self.buffer.sample() 82 | 83 | # calculate character embedding 84 | self.e_character = self.observer.calc_character(episodes) 85 | 86 | # get relevant quantities 87 | states, actions, rewards, next_states, avail_actions = [], [], [], [], [] 88 | e_mentals, next_e_mentals = [], [] 89 | self.init_hidden() 90 | for episode in episodes: 91 | timesteps = len(episode.states) 92 | for t in range(timesteps): 93 | self.e_mental, self.hidden_state = self.observer.calc_mental(episode.states[t], episode.actions[t], self.e_character, self.hidden_state) 94 | if t < timesteps - 1: 95 | states.append(self.env.encode_state(episode.states[t])) 96 | actions.append(episode.actions[t]) 97 | rewards.append(episode.rewards[t]) 98 | next_states.append(self.env.encode_state(episode.next_states[t])) 99 | avail_actions.append(episode.avail_actions[t]) 100 | e_mentals.append(self.e_mental.squeeze()) 101 | if t > 0: 102 | next_e_mentals.append(self.e_mental.squeeze()) 103 | states = torch.FloatTensor(np.array(states)) 104 | actions = torch.LongTensor(np.array(actions)) 105 | rewards = 
torch.FloatTensor(np.array(rewards)) 106 | next_states = torch.FloatTensor(np.array(next_states)) 107 | avail_actions = torch.FloatTensor(np.array(avail_actions)) 108 | e_mentals = torch.stack(e_mentals, dim=0) 109 | e_characters = self.e_character.expand_as(e_mentals) 110 | 111 | # calculate loss 112 | inputs = torch.cat([states, e_characters, e_mentals], dim=1) 113 | next_inputs = torch.cat([next_states, e_characters, e_mentals], dim=1) 114 | q = torch.gather(self.net(inputs), dim=1, index=actions.unsqueeze(1)) 115 | q_next = self.target_net(next_inputs).detach() 116 | q_next[avail_actions == 0] = -9999999 117 | q_target = (rewards + self.gamma * q_next.max(1)[0]).unsqueeze(1) 118 | loss = F.mse_loss(q, q_target) 119 | 120 | # update parameters 121 | self.optimizer.zero_grad() 122 | grad_norm = torch.nn.utils.clip_grad_norm_(self.net.parameters(), self.grad_norm_clip) 123 | if grad_norm > 0: 124 | print("grad_norm:", grad_norm) 125 | loss.backward() 126 | self.optimizer.step() 127 | 128 | 129 | def init_hidden(self): 130 | self.hidden_state = self.observer.mnet.init_hidden() 131 | 132 | 133 | def store_episode(self, episode): 134 | self.buffer.insert_an_episode(episode) 135 | 136 | 137 | def save_models(self, path): 138 | if not os.path.exists(path): 139 | os.makedirs(path) 140 | torch.save(self.net.state_dict(), "{}/net.th".format(path)) 141 | torch.save(self.observer.cnet.state_dict(), "{}/cnet.th".format(path)) 142 | torch.save(self.observer.mnet.state_dict(), "{}/mnet.th".format(path)) 143 | torch.save(self.optimizer.state_dict(), "{}/opt.th".format(path)) 144 | 145 | 146 | def load_models(self, path): 147 | self.net.load_state_dict(torch.load("{}/net.th".format(path), map_location=lambda storage, loc: storage)) 148 | self.observer.cnet.load_state_dict(torch.load("{}/cnet.th".format(path), map_location=lambda storage, loc: storage)) 149 | self.observer.mnet.load_state_dict(torch.load("{}/mnet.th".format(path), map_location=lambda storage, loc: storage)) 150 | self.optimizer.load_state_dict(torch.load("{}/opt.th".format(path), map_location=lambda storage, loc: storage)) -------------------------------------------------------------------------------- /modules/tom/observer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | class CharacterNetwork(nn.Module): 7 | def __init__(self, input_shape): 8 | super(CharacterNetwork, self).__init__() 9 | self.input_shape = input_shape 10 | self.rnn_hidden_dim = 32 11 | self.character_dim = 8 12 | 13 | self.fc1 = nn.Linear(input_shape, self.rnn_hidden_dim) 14 | self.rnn = nn.GRUCell(self.rnn_hidden_dim, self.rnn_hidden_dim) 15 | self.fc2 = nn.Linear(self.rnn_hidden_dim, self.character_dim) 16 | 17 | 18 | def init_hidden(self): 19 | return self.fc1.weight.new(1, self.rnn_hidden_dim).zero_() 20 | 21 | 22 | def forward(self, inputs, hidden_state): 23 | x = F.relu(self.fc1(inputs)) 24 | hidden_state = self.rnn(x, hidden_state) 25 | e_character = self.fc2(hidden_state) 26 | return e_character, hidden_state 27 | 28 | 29 | class MentalNetwork(nn.Module): 30 | def __init__(self, input_shape): 31 | super(MentalNetwork, self).__init__() 32 | self.input_shape = input_shape 33 | self.rnn_hidden_dim = 32 34 | self.mental_dim = 8 35 | 36 | self.fc1 = nn.Linear(self.input_shape, self.rnn_hidden_dim) 37 | self.rnn = nn.GRUCell(self.rnn_hidden_dim, self.rnn_hidden_dim) 38 | self.fc2 = nn.Linear(self.rnn_hidden_dim, 
self.mental_dim) 39 | 40 | 41 | def init_hidden(self): 42 | return self.fc1.weight.new(1, self.rnn_hidden_dim).zero_() 43 | 44 | 45 | def forward(self, inputs, hidden_state): 46 | x = F.relu(self.fc1(inputs)) 47 | hidden_state = self.rnn(x, hidden_state) 48 | e_mental = self.fc2(hidden_state) 49 | return e_mental, hidden_state 50 | 51 | 52 | class Observer: 53 | def __init__(self, env): 54 | self.env = env 55 | self.n_state = env.get_state_size() 56 | self.n_action = env.get_action_size() 57 | 58 | self.cnet = CharacterNetwork(self.n_state + self.n_action) 59 | self.mnet = MentalNetwork(self.n_state + self.n_action + self.cnet.character_dim) 60 | 61 | 62 | def calc_character(self, episodes): 63 | e_character_sum = torch.zeros(1, self.cnet.character_dim) 64 | for episode in episodes: 65 | timesteps = len(episode.states) 66 | hidden_state = self.cnet.init_hidden() 67 | for t in range(timesteps): 68 | state, action = episode.states[t], episode.actions[t] 69 | state = torch.FloatTensor(self.env.encode_state(state)).unsqueeze(0) 70 | action_onehot = torch.LongTensor(np.eye(self.n_action)[action]).unsqueeze(0) 71 | inputs = torch.cat([state, action_onehot], dim=1) 72 | e_character, hidden_state = self.cnet(inputs, hidden_state) 73 | e_character_sum += e_character 74 | return e_character_sum 75 | 76 | 77 | def calc_mental(self, state, action, e_character, hidden_state): 78 | if e_character is None: 79 | e_character = torch.zeros(1, self.cnet.character_dim) 80 | state = torch.FloatTensor(self.env.encode_state(state)).unsqueeze(0) 81 | action_onehot = torch.LongTensor(np.eye(self.n_action)[action]).unsqueeze(0) 82 | inputs = torch.cat([state, action_onehot, e_character], dim=1) 83 | e_mental, hidden_state = self.mnet(inputs, hidden_state) 84 | return e_mental, hidden_state -------------------------------------------------------------------------------- /plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import os 4 | 5 | def moving_average(data, window_size): 6 | return np.convolve(data, np.ones(window_size)/window_size, mode='valid') 7 | 8 | def load_data(file_path): 9 | with open(file_path, 'r') as f: 10 | data = eval(f.read()) 11 | return np.array(data) 12 | 13 | def plot(data): 14 | window_sizes = [10, 50, 100] 15 | plt.figure(figsize=(10, 6)) 16 | for window_size in window_sizes: 17 | smoothed_data = moving_average(data, window_size) 18 | plt.plot(np.arange(window_size-1, len(data)), smoothed_data, label=f'Window size = {window_size}') 19 | plt.title('Episode Reward') 20 | plt.xlabel('Episodes') 21 | plt.ylabel('Reward') 22 | plt.legend() 23 | plt.show() 24 | 25 | if __name__ == '__main__': 26 | file_path = os.path.join('saved', 'off', 'gat', '1736146194.1024332', 'ep_reward.txt') 27 | data = load_data(file_path) 28 | plot(data) 29 | -------------------------------------------------------------------------------- /train_gat_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import os 4 | 5 | from env.environment import Environment 6 | from components.episodebuffer import Episode, ReplayBuffer 7 | from modules.agents.gatagent import GATAgent 8 | 9 | def save(dirname, filename, data): 10 | if not os.path.exists(dirname): 11 | os.makedirs(dirname) 12 | with open(os.path.join(dirname, filename), 'w') as f: 13 | f.write(str(data)) 14 | 15 | def train(): 16 | start_time = time.time() 17 | env = Environment() 18 | 
agent = GATAgent(env) 19 | # episodes = 50001 20 | episodes = 5001 21 | dvr_list = [] 22 | reward_list = [] 23 | t = 0 24 | for i in range(episodes): 25 | state = env.reset(seed=int(start_time)+i) 26 | ep_reward = 0 27 | done = False 28 | while not done: 29 | avail_action = env.get_avail_actions() 30 | action = agent.choose_action(state, avail_action, t) 31 | next_state, reward, done = env.step(action) 32 | agent.store_transition(state, action, reward, next_state, avail_action) 33 | 34 | ep_reward += reward 35 | 36 | if agent.buffer.can_sample(): 37 | agent.learn() 38 | if done: 39 | print("episode: {} , the episode reward is {}".format(i, round(ep_reward, 3))) 40 | break 41 | state = next_state 42 | t = t + 1 43 | dvr_rate = env.get_metric() 44 | dvr_list.append(dvr_rate) 45 | reward_list.append(ep_reward) 46 | 47 | if i % 500 == 0: 48 | env.update_adjs(set(agent.buffer.IDs)) 49 | 50 | if i % 500 == 0: 51 | agent.save_models("./saved/off/gat/{}/{}".format(start_time, i)) 52 | save("./saved/off/gat/{}".format(start_time), "dvr.txt", dvr_list) 53 | save("./saved/off/gat/{}".format(start_time), "ep_reward.txt", reward_list) 54 | 55 | 56 | if __name__ == '__main__': 57 | train() -------------------------------------------------------------------------------- /train_gcn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import os 4 | 5 | from env.environment import Environment 6 | from components.episodebuffer import Episode, ReplayBuffer 7 | from modules.agents.gcnagent import GCNAgent 8 | 9 | def save(dirname, filename, data): 10 | if not os.path.exists(dirname): 11 | os.makedirs(dirname) 12 | with open(os.path.join(dirname, filename), 'w') as f: 13 | f.write(str(data)) 14 | 15 | def train(): 16 | start_time = time.time() 17 | env = Environment() 18 | agent = GCNAgent(env) 19 | # episodes = 50001 20 | episodes = 5001 21 | dvr_list = [] 22 | reward_list = [] 23 | t = 0 24 | for i in range(episodes): 25 | state = env.reset(seed=int(start_time)+i) 26 | ep_reward = 0 27 | done = False 28 | while not done: 29 | avail_action = env.get_avail_actions() 30 | action = agent.choose_action(state, avail_action, t) 31 | next_state, reward, done = env.step(action) 32 | agent.store_transition(state, action, reward, next_state, avail_action) 33 | 34 | ep_reward += reward 35 | 36 | if agent.buffer.can_sample(): 37 | agent.learn() 38 | if done: 39 | print("episode: {} , the episode reward is {}".format(i, round(ep_reward, 3))) 40 | break 41 | state = next_state 42 | t = t + 1 43 | dvr_rate = env.get_metric() 44 | dvr_list.append(dvr_rate) 45 | reward_list.append(ep_reward) 46 | 47 | if i % 500 == 0: 48 | env.update_adjs(set(agent.buffer.IDs)) 49 | 50 | if i % 500 == 0: 51 | agent.save_models("./saved/off/gcn/{}/{}".format(start_time, i)) 52 | save("./saved/off/gcn/{}".format(start_time), "dvr.txt", dvr_list) 53 | save("./saved/off/gcn/{}".format(start_time), "ep_reward.txt", reward_list) 54 | 55 | 56 | if __name__ == '__main__': 57 | train() -------------------------------------------------------------------------------- /train_mlp_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import os 4 | 5 | from env.environment import Environment 6 | from components.episodebuffer import Episode, ReplayBuffer 7 | from modules.agents.mlpagent import MLPAgent 8 | 9 | def save(dirname, filename, data): 10 | if not os.path.exists(dirname): 11 | 
os.makedirs(dirname) 12 | with open(os.path.join(dirname, filename), 'w') as f: 13 | f.write(str(data)) 14 | 15 | def train(): 16 | start_time = time.time() 17 | env = Environment() 18 | agent = MLPAgent(env) 19 | # episodes = 50001 20 | episodes = 5001 21 | dvr_list = [] 22 | reward_list = [] 23 | t = 0 24 | for i in range(episodes): 25 | state = env.reset(seed=int(start_time)+i) 26 | ep_reward = 0 27 | done = False 28 | while not done: 29 | avail_action = env.get_avail_actions() 30 | action = agent.choose_action(state, avail_action, t) 31 | next_state, reward, done = env.step(action) 32 | agent.store_transition(state, action, reward, next_state, avail_action) 33 | 34 | ep_reward += reward 35 | 36 | if agent.buffer.can_sample(): 37 | agent.learn() 38 | if done: 39 | print("episode: {} , the episode reward is {}".format(i, round(ep_reward, 3))) 40 | break 41 | state = next_state 42 | t = t + 1 43 | dvr_rate = env.get_metric() 44 | dvr_list.append(dvr_rate) 45 | reward_list.append(ep_reward) 46 | 47 | if i % 500 == 0: 48 | env.update_adjs(set(agent.buffer.IDs)) 49 | 50 | if i % 500 == 0: 51 | agent.save_models("./saved/off/mlp/{}/{}".format(start_time, i)) 52 | save("./saved/off/mlp/{}".format(start_time), "dvr.txt", dvr_list) 53 | save("./saved/off/mlp/{}".format(start_time), "ep_reward.txt", reward_list) 54 | 55 | 56 | if __name__ == '__main__': 57 | train() -------------------------------------------------------------------------------- /train_tom_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import os 4 | 5 | from env.environment import Environment 6 | from components.episodebuffer import Episode, ReplayBuffer 7 | from modules.agents.tomagent import ToMAgent 8 | 9 | def save(dirname, filename, data): 10 | if not os.path.exists(dirname): 11 | os.makedirs(dirname) 12 | with open(os.path.join(dirname, filename), 'w') as f: 13 | f.write(str(data)) 14 | 15 | def main(): 16 | start_time = time.time() 17 | env = Environment() 18 | agent = ToMAgent(env) 19 | # episodes = 50001 20 | episodes = 5001 21 | dvr_list = [] 22 | reward_list = [] 23 | t = 0 24 | for i in range(episodes): 25 | episode = Episode(env) 26 | states, actions, rewards, next_states, avail_actions = [], [], [], [], [] 27 | agent.init_hidden() 28 | 29 | # sample 30 | state = env.reset(seed=int(start_time)+i) 31 | ep_reward = 0 32 | done = False 33 | while not done: 34 | # select action 35 | avail_action = env.get_avail_actions() 36 | action = agent.choose_action(state, avail_action, t) 37 | 38 | # step 39 | next_state, reward, done = env.step(action) 40 | 41 | ep_reward += reward 42 | 43 | if done: 44 | print("episode: {} , the episode reward is {}".format(i, round(ep_reward, 3))) 45 | break 46 | 47 | states.append(state) 48 | actions.append(action) 49 | rewards.append(reward) 50 | next_states.append(next_state) 51 | avail_actions.append(avail_action) 52 | state = next_state 53 | t = t + 1 54 | 55 | # store 56 | episode.update(states, actions, rewards, next_states, avail_actions, env.ID) 57 | agent.store_episode(episode) 58 | 59 | # learn 60 | if agent.buffer.can_sample(): 61 | agent.learn() 62 | 63 | dvr_rate = env.get_metric() 64 | dvr_list.append(dvr_rate) 65 | reward_list.append(ep_reward) 66 | 67 | if i % 500 == 0: 68 | env.update_adjs(set(agent.buffer.get_IDs())) 69 | 70 | if i % 500 == 0: 71 | agent.save_models("./saved/off/tom/{}/{}".format(start_time, i)) 72 | save("./saved/off/tom/{}".format(start_time), "dvr.txt", dvr_list) 73 | 
save("./saved/off/tom/{}".format(start_time), "ep_reward.txt", reward_list) 74 | 75 | 76 | if __name__ == '__main__': 77 | main() -------------------------------------------------------------------------------- /utils/policy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | # pip install torch_geometric 5 | from torch_geometric.nn import GCNConv, GATConv 6 | from torch_geometric.data import Data, Batch 7 | from torch_geometric.utils import dense_to_sparse 8 | 9 | class MLPPolicy(nn.Module): 10 | def __init__(self, input_shape, n_action): 11 | super(MLPPolicy, self).__init__() 12 | self.name = "mlp" 13 | 14 | self.hid_size = 128 15 | self.fc1 = nn.Linear(input_shape, self.hid_size) 16 | self.fc2 = nn.Linear(self.hid_size, n_action) 17 | 18 | def forward(self, x): 19 | """ 20 | x: [batch_size, input_shape] 21 | input_shape = n_state + character_dim + mental_dim 22 | """ 23 | x = F.relu(self.fc1(x)) 24 | action_prob = self.fc2(x) 25 | return action_prob 26 | 27 | 28 | class GCNPolicy(nn.Module): 29 | def __init__(self, n_feature, n_action, num_nodes, M): 30 | super(GCNPolicy, self).__init__() 31 | self.name = "gcn" 32 | 33 | self.gnn_hid_size = 3 34 | self.gnn_out_dim = 3 35 | self.hid_size = 128 36 | self.gcn1 = GCNConv(n_feature, self.gnn_hid_size) 37 | self.gcn2 = GCNConv(self.gnn_hid_size, self.gnn_out_dim) 38 | self.fc1 = nn.Linear(num_nodes + num_nodes * self.gnn_out_dim + (M + 2) * 3, self.hid_size) 39 | self.fc2 = nn.Linear(self.hid_size, n_action) 40 | 41 | def forward(self, idx, x, y, adj): 42 | """ 43 | idx: [batch_size, num_nodes] 44 | x: [batch_size, num_nodes, 3] 45 | y: [batch_size, M+2, 3] 46 | adj: [batch_size, num_nodes, num_nodes] 47 | """ 48 | batch_size = x.shape[0] 49 | 50 | data_list = [Data(x=x[i], edge_index=dense_to_sparse(adj[i])[0]) for i in range(batch_size)] 51 | batch = Batch.from_data_list(data_list) 52 | x = F.relu(self.gcn1(batch.x, batch.edge_index)) # [batch_size * num_nodes, gnn_hid_size] 53 | x = self.gcn2(x, batch.edge_index) # [batch_size * num_nodes, gnn_out_dim] 54 | 55 | x = x.reshape(batch_size, -1) # [batch_size, num_nodes * gnn_out_dim] 56 | idx = idx.reshape(batch_size, -1) # [batch_size, num_nodes * 1] 57 | y = y.reshape(batch_size, -1) # [batch_size, (M+2) * 3] 58 | x = F.relu(self.fc1(torch.cat([idx, x, y], dim=-1))) 59 | action_prob = self.fc2(x) 60 | return action_prob 61 | 62 | 63 | class GATPolicy(nn.Module): 64 | def __init__(self, n_feature, n_action, num_nodes, M): 65 | super(GATPolicy, self).__init__() 66 | self.name = "gat" 67 | 68 | self.heads = 2 69 | self.gnn_hid_size = 3 70 | self.gnn_out_dim = 3 71 | self.hid_size = 128 72 | self.gat1 = GATConv(n_feature, self.gnn_hid_size, heads=self.heads) 73 | self.gat2 = GATConv(self.gnn_hid_size * self.heads, self.gnn_out_dim) 74 | self.fc1 = nn.Linear(num_nodes + num_nodes * self.gnn_out_dim + (M + 2) * 3, self.hid_size) 75 | self.fc2 = nn.Linear(self.hid_size, n_action) 76 | 77 | def forward(self, idx, x, y, adj): 78 | """ 79 | idx: [batch_size, num_nodes] 80 | x: [batch_size, num_nodes, 3] 81 | y: [batch_size, M+2, 3] 82 | adj: [batch_size, num_nodes, num_nodes] 83 | """ 84 | batch_size = x.shape[0] 85 | 86 | data_list = [Data(x=x[i], edge_index=dense_to_sparse(adj[i])[0]) for i in range(batch_size)] 87 | batch = Batch.from_data_list(data_list) 88 | x = F.relu(self.gat1(batch.x, batch.edge_index)) # [batch_size * num_nodes, gnn_hid_size * heads] 89 | x = self.gat2(x, 
batch.edge_index) # [batch_size * num_nodes, gnn_out_dim] 90 | 91 | x = x.reshape(batch_size, -1) # [batch_size, num_nodes * gnn_out_dim] 92 | idx = idx.reshape(batch_size, -1) # [batch_size, num_nodes * 1] 93 | y = y.reshape(batch_size, -1) # [batch_size, (M+2) * 3] 94 | x = F.relu(self.fc1(torch.cat([idx, x, y], dim=-1))) 95 | action_prob = self.fc2(x) 96 | return action_prob -------------------------------------------------------------------------------- /utils/scheduler.py: -------------------------------------------------------------------------------- 1 | class LinearSchedule: 2 | def __init__(self, 3 | start, 4 | finish, 5 | time_length): 6 | 7 | self.start = start 8 | self.finish = finish 9 | self.time_length = time_length 10 | self.delta = (self.finish - self.start) / self.time_length 11 | 12 | def eval(self, T): 13 | return min(self.finish, self.start + self.delta * T) --------------------------------------------------------------------------------
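
The sketch below is not part of the repository; it illustrates how the DQN-style agents combine `LinearSchedule` with the action mask from `Environment.get_avail_actions()`: `eval(t)` anneals epsilon linearly from `epsilon_start` to `epsilon_finish` over `epsilon_time_length` steps, and epsilon is then used as the threshold between taking the greedy masked action and sampling a random available action (the sketch uses `np.random.rand()` for that draw). The schedule values mirror the hyper-parameters in `mlpagent.py`; `q_values` and `avail_action` are made-up placeholders rather than real network outputs or environment state.

```python
# Standalone sketch: epsilon scheduling plus action masking.
# q_values and avail_action are hypothetical placeholders (M = 5, so M + 2 = 7 actions).
import numpy as np
from utils.scheduler import LinearSchedule

schedule = LinearSchedule(start=0.0, finish=0.99, time_length=5000)
q_values = np.array([0.1, 0.4, 0.2, 0.3, 0.0, 0.6, 0.5])
avail_action = np.array([1, 1, 0, 1, 1, 1, 1])  # action 2 is masked out

for t in [0, 2500, 5000, 10000]:
    epsilon = schedule.eval(t)  # grows linearly, capped at finish
    masked_q = np.where(avail_action == 0, -9999999, q_values)
    if np.random.rand() <= epsilon:  # act greedily with probability epsilon
        action = int(np.argmax(masked_q))
    else:  # otherwise sample uniformly among available actions
        action = int(np.random.choice(len(avail_action), p=avail_action / avail_action.sum()))
    print("t={}, epsilon={:.3f}, action={}".format(t, epsilon, action))
```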