├── DPP-v1 ├── .gitkeep ├── MyNN.py └── Test_Model.py ├── Initial_workin_code.py ├── README.md ├── dynamic_obstacles.gif ├── empty-env.png └── gridworld_dynamic_obstacles.py /DPP-v1/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DPP-v1/MyNN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import gym_minigrid 3 | import math 4 | import random 5 | import numpy as np 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | from collections import namedtuple 9 | from itertools import count 10 | from PIL import Image 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.optim as optim 15 | import torch.nn.functional as F 16 | import torchvision.transforms as T 17 | 18 | 19 | env = gym.make('MiniGrid-Dynamic-Obstacles-8x8-v0')#'MiniGrid-Empty-5x5-v0') 20 | 21 | print(env.observation_space) 22 | print(env.action_space) 23 | 24 | 25 | 26 | Transition = namedtuple('Transition', 27 | ('state', 'action', 'next_state', 'reward')) 28 | 29 | 30 | class ReplayMemory(object): 31 | 32 | 33 | 34 | 35 | def __init__(self, capacity): 36 | self.capacity = capacity 37 | self.memory = [] 38 | self.position = 0 39 | 40 | def push(self, *args): 41 | """Saves a transition.""" 42 | if len(self.memory) < self.capacity: 43 | self.memory.append(None) 44 | self.memory[self.position] = Transition(*args) 45 | self.position = (self.position + 1) % self.capacity 46 | 47 | def sample(self, batch_size): 48 | return random.sample(self.memory, batch_size) 49 | 50 | def __len__(self): 51 | return len(self.memory) 52 | 53 | class DQN(nn.Module): 54 | 55 | def __init__(self, h, w, outputs): 56 | super(DQN, self).__init__() 57 | self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1) 58 | self.bn1 = nn.BatchNorm2d(32) 59 | self.conv2 = nn.Conv2d(32,64, kernel_size=3, stride=1) 60 | self.bn2 = nn.BatchNorm2d(64) 61 | self.conv3 = nn.Conv2d(64, 64, kernel_size=2, stride=1) 62 | self.bn3 = nn.BatchNorm2d(64) 63 | 64 | # Number of Linear input connections depends on output of conv2d layers 65 | # and therefore the input image size, so compute it. 66 | def conv2d_size_out(size, kernel_size = 5, stride = 2): 67 | return (size - (kernel_size - 1) - 1) // stride + 1 68 | convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) 69 | convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) 70 | linear_input_size = convw * convh * 64 71 | self.head = nn.Linear(linear_input_size, outputs) 72 | 73 | # Called with either one element to determine next action, or a batch 74 | # during optimization. Returns tensor([[left0exp,right0exp]...]). 75 | def forward(self, x): 76 | x = F.relu(self.bn1(self.conv1(x))) 77 | x = F.relu(self.bn2(self.conv2(x))) 78 | x = F.relu(self.bn3(self.conv3(x))) 79 | return self.head(x.view(x.size(0), -1)) 80 | 81 | BATCH_SIZE = 256 #128 82 | GAMMA = 0.999 83 | EPS_START = 1 84 | EPS_END = 0.05 85 | 86 | EPS_DECAY = 4000 #200 for 50 epi 87 | num_episodes = 500 88 | TARGET_UPDATE = 30 89 | #PATH ='./logs/' 90 | 91 | # Get screen size so that we can initialize layers correctly based on shape 92 | # returned from AI gym. 
Typical dimensions at this point are close to 3x40x90 93 | # which is the result of a clamped and down-scaled render buffer in get_screen() 94 | 95 | 96 | init_screen = env.reset() 97 | screen_height =7 98 | screen_width = 7 99 | device = 'cpu' 100 | render_status = False 101 | save_model = True 102 | PATH = './logs/' 103 | 104 | # Get number of actions from gym action space 105 | n_actions = env.action_space.n 106 | 107 | policy_net = DQN(screen_height, screen_width, n_actions).to(device) 108 | target_net = DQN(screen_height, screen_width, n_actions).to(device) 109 | target_net.load_state_dict(policy_net.state_dict()) 110 | target_net.eval() 111 | 112 | optimizer = optim.Adam(policy_net.parameters()) 113 | memory = ReplayMemory(10000) 114 | 115 | 116 | episode_durations = [] 117 | eps_of_episode = [] 118 | reward_hist=[] 119 | steps_done = 0 120 | 121 | 122 | def save_logs(Network , grap_plot ,Test_name , ith_sample): 123 | torch.save(Network,PATH+Test_name+ith_sample) 124 | plt.savefig(PATH+ith_sample+'.png') 125 | 126 | 127 | def get_state(x): 128 | #print(x) 129 | x.unsqueeze_(0) 130 | 131 | x.transpose_(1,3) 132 | #print(x) 133 | return x 134 | 135 | def select_action(state): 136 | global steps_done 137 | sample = random.random() 138 | eps_threshold = EPS_END + (EPS_START - EPS_END) * \ 139 | math.exp(-1. * steps_done / EPS_DECAY) 140 | 141 | 142 | steps_done += 1 143 | eps_of_episode.append(eps_threshold) 144 | if sample > eps_threshold: 145 | with torch.no_grad(): 146 | # t.max(1) will return largest column value of each row. 147 | # second column on max result is index of where max element was 148 | # found, so we pick action with the larger expected reward. 149 | return policy_net(state).max(1)[1].view(1, 1) 150 | else: 151 | return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long) 152 | 153 | 154 | 155 | 156 | def plot_durations(): 157 | plt.figure(2) 158 | plt.clf() 159 | durations_t = torch.tensor(episode_durations, dtype=torch.float) 160 | plt.subplot(221) 161 | plt.title('Training...') 162 | plt.xlabel('Episode') 163 | plt.ylabel('Duration_alive') 164 | plt.plot(durations_t.numpy()) 165 | plt.subplot(222) 166 | #plt.title('Training...') 167 | plt.xlabel('Time_Steps') 168 | plt.ylabel('Epsilon') 169 | plt.plot(torch.tensor(eps_of_episode,dtype=torch.float).numpy()) 170 | plt.subplot(223) 171 | plt.title('Training...') 172 | plt.xlabel('Episode') 173 | plt.ylabel('Total_reward') 174 | rew_hist_tensor = torch.tensor(reward_hist,dtype=torch.float) 175 | plt.plot(rew_hist_tensor.numpy()) 176 | #plt.show() 177 | 178 | # Take 100 episode averages and plot them too 179 | if len(durations_t) >= 100: 180 | 181 | plt.subplot(221) 182 | means = durations_t.unfold(0, 100, 1).mean(1).view(-1) 183 | means = torch.cat((torch.zeros(99), means)) 184 | plt.plot(means.numpy()) 185 | plt.subplot(223) 186 | means = rew_hist_tensor.unfold(0, 100, 1).mean(1).view(-1) 187 | means = torch.cat((torch.zeros(99), means)) 188 | plt.plot(means.numpy()) 189 | 190 | 191 | plt.pause(0.001) # pause a bit so that plots are updated 192 | # if is_ipython: 193 | # display.clear_output(wait=True) 194 | # display.display(plt.gcf()) 195 | 196 | def optimize_model(): 197 | if len(memory) < BATCH_SIZE: 198 | return 199 | transitions = memory.sample(BATCH_SIZE) 200 | # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for 201 | # detailed explanation). This converts batch-array of Transitions 202 | # to Transition of batch-arrays. 
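# For example, zip(*[Transition(s1,a1,ns1,r1), Transition(s2,a2,ns2,r2)]) yields
# (s1,s2), (a1,a2), (ns1,ns2), (r1,r2), so the next line builds a single Transition
# whose fields are tuples over the whole batch (states, actions, next_states, rewards).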
203 | batch = Transition(*zip(*transitions)) 204 | 205 | # Compute a mask of non-final states and concatenate the batch elements 206 | # (a final state would've been the one after which simulation ended) 207 | non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, 208 | batch.next_state)), device=device, dtype=torch.bool) 209 | non_final_next_states = torch.cat([s for s in batch.next_state 210 | if s is not None]) 211 | state_batch = torch.cat(batch.state) 212 | action_batch = torch.cat(batch.action) 213 | reward_batch = torch.cat(batch.reward) 214 | 215 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 216 | # columns of actions taken. These are the actions which would've been taken 217 | # for each batch state according to policy_net 218 | state_action_values = policy_net(state_batch).gather(1, action_batch) 219 | 220 | # Compute V(s_{t+1}) for all next states. 221 | # Expected values of actions for non_final_next_states are computed based 222 | # on the "older" target_net; selecting their best reward with max(1)[0]. 223 | # This is merged based on the mask, such that we'll have either the expected 224 | # state value or 0 in case the state was final. 225 | next_state_values = torch.zeros(BATCH_SIZE, device=device) 226 | next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach() 227 | # Compute the expected Q values 228 | expected_state_action_values = (next_state_values * GAMMA) + reward_batch 229 | 230 | # Compute Huber loss 231 | loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) 232 | 233 | # Optimize the model 234 | optimizer.zero_grad() 235 | loss.backward() 236 | for param in policy_net.parameters(): 237 | param.grad.data.clamp_(-1, 1) 238 | optimizer.step() 239 | 240 | def train(): 241 | 242 | for i_episode in range(num_episodes): 243 | # Initialize the environment and state 244 | env.reset() 245 | #last_screen = env.render() 246 | 247 | #current_screen = env.render() 248 | 249 | state_dic,_,_,_= env.step(env.action_space.sample())#env.render() 250 | state = get_state(torch.tensor(state_dic['image'],dtype =torch.float)) 251 | #print(state.size()) 252 | #print(state) 253 | reward_epi = 0 254 | for t in count(): 255 | # Select and perform an action 256 | if render_status: 257 | env.render() 258 | action = select_action(state) 259 | observation, reward, done, _ = env.step(action.item()) 260 | 261 | reward = torch.tensor([reward],dtype=torch.long, device=device) 262 | 263 | 264 | # Observe new state 265 | 266 | 267 | if not done: 268 | next_state =get_state(torch.tensor(observation['image'],dtype =torch.float)) #env.render()# current_screen - last_screen 269 | else: 270 | next_state = None 271 | 272 | # Store the transition in memory 273 | memory.push(state, action, next_state, reward) 274 | 275 | # Move to the next state 276 | state = next_state 277 | 278 | # Perform one step of the optimization (on the target network) 279 | reward_epi = reward_epi+reward.item() 280 | optimize_model() 281 | 282 | if done: 283 | if t != 0: 284 | reward_epi = reward_epi/t 285 | reward_hist.append(reward_epi) 286 | episode_durations.append(t + 1) 287 | plot_durations() 288 | break 289 | 290 | # Update the target network, copying all weights and biases in DQN 291 | if i_episode % TARGET_UPDATE == 0: 292 | target_net.load_state_dict(policy_net.state_dict()) 293 | 294 | print('Complete') 295 | #env.render() 296 | env.close() 297 | #plt.ioff() 298 | #plt.show() 299 | 300 | 301 | if __name__ == '__main__': 302 | 
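# Entry point: train for num_episodes episodes; if save_model is True, save_logs()
# stores the trained policy_net and the current training figure under ./logs/.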
print("Training :") 303 | train() 304 | Folder_name = 'test_(8x8)' 305 | if save_model == True : 306 | save_logs(policy_net,plt,Folder_name,'2') 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | # for i_episode in range(20): 319 | # observation = env.reset() 320 | # for t in range(100): 321 | # env.render() 322 | # print(observation) 323 | # action = env.action_space.sample() 324 | # observation, reward, done, info = env.step(action) 325 | # if done: 326 | # print("Episode finished after {} timesteps".format(t+1)) 327 | # break 328 | # env.close() -------------------------------------------------------------------------------- /DPP-v1/Test_Model.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import gym_minigrid 3 | from itertools import count 4 | import time 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | env = gym.make('MiniGrid-Dynamic-Obstacles-8x8-v0')#'MiniGrid-Empty-5x5-v0') 11 | Fil_name='test_1' 12 | PATH = './logs/'+Fil_name 13 | print(env.observation_space) 14 | print(env.action_space) 15 | 16 | 17 | class DQN(nn.Module): 18 | 19 | def __init__(self, h, w, outputs): 20 | super(DQN, self).__init__() 21 | self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1) 22 | self.bn1 = nn.BatchNorm2d(32) 23 | self.conv2 = nn.Conv2d(32,64, kernel_size=3, stride=1) 24 | self.bn2 = nn.BatchNorm2d(64) 25 | self.conv3 = nn.Conv2d(64, 64, kernel_size=2, stride=1) 26 | self.bn3 = nn.BatchNorm2d(64) 27 | 28 | # Number of Linear input connections depends on output of conv2d layers 29 | # and therefore the input image size, so compute it. 30 | def conv2d_size_out(size, kernel_size = 5, stride = 2): 31 | return (size - (kernel_size - 1) - 1) // stride + 1 32 | convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) 33 | convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) 34 | linear_input_size = convw * convh * 64 35 | self.head = nn.Linear(linear_input_size, outputs) 36 | 37 | # Called with either one element to determine next action, or a batch 38 | # during optimization. Returns tensor([[left0exp,right0exp]...]). 
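# x is expected as a (N, 3, 7, 7) float tensor; get_state() below builds it from the
# 7x7x3 'image' field of the MiniGrid observation by adding a batch dim and swapping axes 1 and 3.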
39 | def forward(self, x): 40 | x = F.relu(self.bn1(self.conv1(x))) 41 | x = F.relu(self.bn2(self.conv2(x))) 42 | x = F.relu(self.bn3(self.conv3(x))) 43 | return self.head(x.view(x.size(0), -1)) 44 | def get_state(x): 45 | #print(x) 46 | x.unsqueeze_(0) 47 | 48 | x.transpose_(1,3) 49 | #print(x) 50 | return x 51 | 52 | 53 | model = torch.load(PATH) 54 | model.eval() 55 | device = 'cpu' 56 | def test_model(num_of_episodes,): 57 | 58 | 59 | for i in range(num_of_episodes): 60 | env.reset() 61 | 62 | state_dic,_,_,_= env.step(env.action_space.sample())#env.render() 63 | state = get_state(torch.tensor(state_dic['image'],dtype =torch.float)) 64 | reward_this_epi = 0 65 | for t in count(): 66 | env.render() 67 | time.sleep(0.1) 68 | action = model(state).max(1)[1].view(1, 1) 69 | observation,reward ,done,_ =env.step(action.item()) 70 | reward = torch.tensor([reward],dtype=torch.long, device=device) 71 | reward_this_epi = reward_this_epi+ reward.item() 72 | state =get_state(torch.tensor(observation['image'],dtype =torch.float)) 73 | if done: 74 | reward_this_epi =reward_this_epi/(t+1) 75 | print('Episode:',i,'Episode_Score:',reward_this_epi,'Steps_alive',t+1) 76 | break 77 | 78 | 79 | if __name__ == '__main__': 80 | print('Testing..') 81 | test_model(10) 82 | -------------------------------------------------------------------------------- /Initial_workin_code.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Created on Thu Sep 28 20:52:25 2017 4 | @author: Administrator 5 | """ 6 | import tensorflow as tf 7 | import numpy as np 8 | import time 9 | import random 10 | from math import sqrt,cos,sin,atan2 11 | import pygame 12 | from pygame.locals import * 13 | from sys import exit 14 | import os 15 | import matplotlib.pyplot as plt 16 | # import tensorflow.compat.v1 as tf1 17 | # tf1.disable_v2_behavior() 18 | #tf.compat.v1.disable_v2_behavior() 19 | 20 | 21 | pygame.init() 22 | fpsClock = pygame.time.Clock() 23 | XDIM = 640 24 | YDIM = 480 25 | SCREEN_SIZE = (XDIM, YDIM) 26 | screen = pygame.display.set_mode(SCREEN_SIZE, 0, 32) 27 | pygame.display.set_caption("NAVIGATION!") 28 | show_data = True 29 | 30 | test_mode = True #False 31 | load_model = False 32 | 33 | class vector(object): 34 | def __init__(self,x=0,y=0): 35 | self.x = x 36 | self.y = y 37 | def cal_vector(self,p1,p2): 38 | self.x = p2[0]-p1[0] 39 | self.y = p2[1]-p1[1] 40 | return (self.x,self.y) 41 | def get_magnitude(self): 42 | return sqrt(self.x**2+self.y**2) 43 | 44 | def get_point(vector,p1): 45 | return (vector[0]+p1[0],vector[1]+p1[1]) 46 | 47 | def get_distance(p1,p2): 48 | return sqrt((p1[0]-p2[0])**2+(p1[1]-p2[1])**2) 49 | 50 | class lidar(object): 51 | def __init__(self,p,distance=200,angle=0): 52 | self.x = p[0] 53 | self.y = p[1] 54 | self.distance = distance 55 | self.angle = 0 56 | self.lidar =(0,0) 57 | self.data = [] 58 | self.data_2 = [] 59 | def scan(self): 60 | dis = [] 61 | for i in range(0,360,1): 62 | for j in range(1,40): 63 | x1 = int(self.x + self.distance*cos(i*3.14/180)*j/40) 64 | y1 = int(self.y + self.distance*sin(i*3.14/180)*j/40) 65 | #print(x1,y1) 66 | pix = (0,0,0) 67 | if(x1>0 and x10 and y1=XDIM or y1<=0 or y1>=YDIM): 72 | break 73 | elif(pix[0]>100 and pix[1]>100 and pix[2]>100 ): 74 | break 75 | else: 76 | dis = [x1,y1] 77 | if len(self.data)<360: 78 | self.data.append(dis) 79 | else: 80 | self.data[i] = dis 81 | #print(self.data[i]) 82 | if(self.x>0 and self.x0 and self.y0 and self.miny>0 and self.maxx0 and self.miny+y>0 and self.maxx+xlength//2 
and max_xlength//2 and max_x>=XDIM-length//2: 231 | x = random.randint(min_x+1,XDIM-length//2-1) 232 | elif min_x<=length//2 and max_x>=XDIM-length//2: 233 | x = random.randint(length//2+1,XDIM-length//2-1) 234 | 235 | if(min_y>width//2 and max_ywidth//2 and max_y>=YDIM-width//2: 240 | y = random.randint(min_y+1,YDIM-width//2-1) 241 | elif min_y<=width//2 and max_y>=YDIM-width//2: 242 | y = random.randint(width//2+1,YDIM-width//2-1) 243 | #print(x,y) 244 | flag = self.point_collision_detect(x,y,length,width) 245 | count +=1 246 | if(count > 200): 247 | print("over count") 248 | break 249 | return x,y,length,width 250 | 251 | 252 | def goal_point(self,length=5,width=5): 253 | flag = False 254 | while (not flag): 255 | x = random.randint(int(length//2+1),int(XDIM-length//2-1)) 256 | y = random.randint(int(width//2+1),int(YDIM-width//2-1)) 257 | flag = self.point_collision_detect(x,y,length,width) 258 | #print(not flag) 259 | pygame.draw.circle(screen, (0,255,0),(x,y),length) 260 | self.goal = x,y,length,width 261 | return x,y,length,width 262 | 263 | def goal_point_notrandom(self,x,y,length=5,wigth=5): 264 | self.goal = x,y,length,wigth 265 | pygame.draw.circle(screen, (0, 255, 0), (x, y), length) 266 | return x,y,length,wigth 267 | 268 | def check_goal(self,p,size=[20,20]): 269 | dis = get_distance(p,(self.goal[0],self.goal[1])) 270 | if self.point_collision_detect(p[0],p[1],size[0],size[1]): 271 | if dis<=self.scope: 272 | self.reward = 1 273 | if(test_mode ==True): 274 | return self.reward, True 275 | return self.reward,False 276 | else: 277 | self.reward = -0.004 278 | return self.reward,False 279 | else: 280 | self.reward = -1 281 | return self.reward,False 282 | 283 | 284 | def reset(self,level=3): 285 | screen.fill((20,20,20)) 286 | self.init_obstacles(level) 287 | pygame.draw.circle(screen, (0, 255, 0), (self.goal[0], self.goal[1]), self.goal[2]) 288 | if(test_mode == True): 289 | self.draw_old_goalpoint() 290 | self.draw_part_path() 291 | self.draw_global_path() 292 | 293 | 294 | def add_old_goalpoint(self,node): 295 | self.old_goalpoint.append(node) 296 | 297 | def clear_old_goalpoint(self): 298 | self.old_goalpoint = [] 299 | 300 | def draw_old_goalpoint(self): 301 | if(len(self.old_goalpoint)>2): 302 | for goal_point in self.old_goalpoint: 303 | pygame.draw.circle(screen, (0, 0, 255), goal_point, 5) 304 | 305 | def draw_global_path(self): 306 | path_start = [] 307 | if (len(self.old_goalpoint) > 2): 308 | path_start = self.old_goalpoint[0] 309 | for path_end in self.old_goalpoint: 310 | pygame.draw.line(screen, (180, 180, 0), path_start, path_end, 2) 311 | path_start = path_end 312 | 313 | def add_node(self,node): 314 | self.node.append(node) 315 | 316 | def clear_node(self): 317 | self.node=[] 318 | 319 | def draw_part_path(self): 320 | node_start =[] 321 | if(len(self.node)>2): 322 | node_start = self.node[0] 323 | for node_end in self.node: 324 | pygame.draw.line(screen, (255,0,0), node_start, node_end, 2) 325 | node_start = node_end 326 | 327 | def init_obstacles(self,configNum,movespeed=1): 328 | rectObs = [] 329 | 330 | if(self.dyenv >100 and self.obstacle_flag == 0): 331 | self.obstacle_flag = 1 332 | elif(self.obstacle_flag == 1 and self.dyenv>0): 333 | movespeed =-1 334 | else: 335 | self.obstacle_flag = 0 336 | movespeed = 1 337 | self.dyenv += movespeed 338 | #print("config "+ str(configNum)) 339 | if (configNum == 0): 340 | rectObs.append(pygame.Rect((640 / 2.0 - 50, 480/ 2.0 - 100),(100,200))) 341 | if (configNum == 1): 342 | 
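# Layout 1: fixed wall segments spread over the 640x480 arena, three 20x20 obstacles
# whose positions drift back and forth with self.dyenv, and four border walls around the screen edge.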
#rectObs.append(pygame.Rect((40,20),(20,200))) 343 | rectObs.append(pygame.Rect((120,280),(20,200))) 344 | rectObs.append(pygame.Rect((100,100),(80,20))) 345 | #rectObs.append(pygame.Rect((60,300),(50,20))) 346 | rectObs.append(pygame.Rect((140,0),(20,120))) 347 | rectObs.append(pygame.Rect((140,300),(80,20))) 348 | #rectObs.append(pygame.Rect((200,400),(150,20))) 349 | #rectObs.append(pygame.Rect((280,200),(20,200))) 350 | #rectObs.append(pygame.Rect((300,420),(250,20))) 351 | rectObs.append(pygame.Rect((350,0),(20,100))) 352 | rectObs.append(pygame.Rect((350,400),(20,100))) 353 | rectObs.append(pygame.Rect((400,340),(100,20))) 354 | rectObs.append(pygame.Rect((450,200),(150,20))) 355 | rectObs.append(pygame.Rect((500,0),(20,140))) 356 | rectObs.append(pygame.Rect((550,350),(20,500))) 357 | rectObs.append(pygame.Rect((620,50),(80,20))) 358 | rectObs.append(pygame.Rect((620,300),(80,20))) 359 | rectObs.append(pygame.Rect((220, 150 + self.dyenv), (20, 20))) 360 | rectObs.append(pygame.Rect((300+ self.dyenv, 240 ), (20, 20))) 361 | rectObs.append(pygame.Rect((40 + self.dyenv, 240), (20, 20))) 362 | #rectObs.append(pygame.Rect((700,50),(20,270))) 363 | rectObs.append(pygame.Rect((140,0),(600,20))) 364 | rectObs.append(pygame.Rect((0, 0), (20, 460))) 365 | rectObs.append(pygame.Rect((0, 0), (620, 20))) 366 | rectObs.append(pygame.Rect((620, 0), (20, 460))) 367 | rectObs.append(pygame.Rect((0, 460), (640, 20))) 368 | if (configNum == 2): 369 | rectObs.append(pygame.Rect((200,80),(20,200))) 370 | rectObs.append(pygame.Rect((220,80),(200,20))) 371 | rectObs.append(pygame.Rect((200,350),(200,20))) 372 | rectObs.append(pygame.Rect((400, 200), (60, 60))) 373 | rectObs.append(pygame.Rect((80, 100), (40, 40))) 374 | rectObs.append(pygame.Rect((80, 200+self.dyenv), (20, 20))) 375 | rectObs.append(pygame.Rect((280+self.dyenv, 200 ), (20, 20))) 376 | rectObs.append(pygame.Rect((500, 300 + self.dyenv), (20, 20))) 377 | 378 | rectObs.append(pygame.Rect((0,0),(20,460))) 379 | rectObs.append(pygame.Rect((0,0),(620,20))) 380 | rectObs.append(pygame.Rect((620,0),(20,460))) 381 | rectObs.append(pygame.Rect((0,460),(640,20))) 382 | # rectObs.append(pygame.Rect((40,10),(100,200))) 383 | if (configNum == 3): 384 | rectObs.append(pygame.Rect((40,40),(40,40))) 385 | rectObs.append(pygame.Rect((140, 140), (80, 80))) 386 | rectObs.append(pygame.Rect((350, 400), (40, 40))) 387 | rectObs.append(pygame.Rect((500, 160), (40, 40))) 388 | rectObs.append(pygame.Rect((380, 100), (40, 40))) 389 | rectObs.append(pygame.Rect((300, 340), (80, 40))) 390 | rectObs.append(pygame.Rect((80+ self.dyenv, 300 ), (20, 20))) 391 | rectObs.append(pygame.Rect((280 + self.dyenv, 240), (20, 20))) 392 | rectObs.append(pygame.Rect((500, 300 + self.dyenv), (20, 20))) 393 | rectObs.append(pygame.Rect((400, 250 + self.dyenv), (20, 20))) 394 | 395 | rectObs.append(pygame.Rect((0, 0), (20, 460))) 396 | rectObs.append(pygame.Rect((0, 0), (620, 20))) 397 | rectObs.append(pygame.Rect((620, 0), (20, 460))) 398 | rectObs.append(pygame.Rect((0, 460), (640, 20))) 399 | 400 | 401 | for rect in rectObs: 402 | pygame.draw.rect(screen, (255,255,255), rect) 403 | 404 | 405 | Env = NavigationEnv() 406 | px,py,pl,pw = Env.random_point() 407 | car_A = car(px,py,pl,pw) 408 | lidar_A = lidar(car_A.positon()) 409 | 410 | def observe(): 411 | state = lidar_A.state_2() 412 | if len(state) < 400: 413 | for i in range(20): 414 | state.append([Env.goal[0]-car_A.positon()[0],Env.goal[1]-car_A.positon()[1]]) 415 | for i in range(20): 416 | 
state.append([Env.goal[0]-car_A.positon()[0],Env.goal[1]-car_A.positon()[1]]) 417 | else : 418 | for i in range(20): 419 | state[360+i] = [Env.goal[0]-car_A.positon()[0],Env.goal[1]-car_A.positon()[1]] 420 | for i in range(20): 421 | state[380+i] = [Env.goal[0]-car_A.positon()[0],Env.goal[1]-car_A.positon()[1]] 422 | reward,done = Env.check_goal(car_A.positon(),car_A.size()) 423 | #for s in state: 424 | #print (s) 425 | return state,reward,done 426 | 427 | 428 | 429 | class Qnetwork(object): 430 | def __init__(self,size): 431 | self.Input=tf.compat.v1.placeholder(shape=[None,800],dtype=tf.float32) 432 | self.imageIn=tf.reshape(self.Input,shape=[-1,20,20,2]) 433 | self.conv1=tf.contrib.layers.convolution2d(inputs=self.imageIn,num_outputs=16,kernel_size=[2,2],stride=[2,2],padding='VALID',biases_initializer=None) 434 | self.conv2=tf.contrib.layers.convolution2d(inputs=self.conv1,num_outputs=32,kernel_size=[2,2],stride=[2,2],padding='VALID',biases_initializer=None) 435 | self.conv3=tf.contrib.layers.convolution2d(inputs=self.conv2,num_outputs=256,kernel_size=[5,5],stride=[1,1],padding='VALID',biases_initializer=None) 436 | self.fullconnect1 = tf.reshape(self.conv3,shape=[-1,256]) 437 | self.W1=tf.Variable(tf.random_normal([256,size])) 438 | self.b1=tf.Variable(tf.constant(0.1,shape=[size])) 439 | self.layer1=tf.matmul(self.fullconnect1,self.W1)+self.b1 440 | self.W2=tf.Variable(tf.random_normal([size,size])) 441 | self.b2=tf.Variable(tf.constant(0.1,shape=[size])) 442 | self.layer2=tf.nn.relu(tf.matmul(self.layer1,self.W2)+self.b2) 443 | self.layerAC,self.layerVC=tf.split(self.layer2,2,1) 444 | self.AW=tf.Variable(tf.random_normal([size//2,Env.actions])) 445 | self.VW=tf.Variable(tf.random_normal([size//2,1])) 446 | self.Advantage=tf.matmul(self.layerAC,self.AW) 447 | self.Value=tf.matmul(self.layerVC,self.VW) 448 | 449 | self.Qout=self.Value+tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,reduction_indices=1,keep_dims=True)) 450 | self.predict=tf.argmax(self.Qout,1) 451 | 452 | self.targetQ=tf.placeholder(shape=[None],dtype=tf.float32) 453 | self.actions=tf.placeholder(shape=[None],dtype=tf.int32) 454 | self.actions_onehot=tf.one_hot(self.actions, Env.actions, dtype=tf.float32) 455 | self.Q=tf.reduce_sum(tf.multiply(self.Qout,self.actions_onehot),reduction_indices=1) 456 | 457 | self.td_error=tf.square(self.targetQ-self.Q) 458 | self.loss=tf.reduce_mean(self.td_error) 459 | self.trainer=tf.train.AdamOptimizer(learning_rate=0.0001) #0.0001 460 | self.updateModel=self.trainer.minimize(self.loss) 461 | 462 | class experience_buffer(): 463 | def __init__(self,buffer_size =80000): 464 | self.buffer=[] 465 | self.buffer_size=buffer_size 466 | 467 | def add(self,experience): 468 | if len(self.buffer)+len(experience) >=self.buffer_size: 469 | self.buffer[0:(len(experience)+len(self.buffer))-self.buffer_size]=[] 470 | self.buffer.extend(experience) 471 | 472 | def sample(self,size): 473 | return np.reshape(np.array(random.sample(self.buffer,size)),[size,5]) 474 | 475 | def processState(states): 476 | return np.reshape(states,[800]) 477 | 478 | def updateTargetGraph(tfVars,tau): 479 | total_vars=len(tfVars) 480 | op_holder=[] 481 | for idx,var in enumerate(tfVars[0:total_vars//2]): 482 | op_holder.append(tfVars[idx+total_vars//2].assign((var.value()*tau)+((1-tau)*tfVars[idx+total_vars//2].value()))) 483 | return op_holder 484 | 485 | def updateTarget(op_holder,sess): 486 | for op in op_holder: 487 | sess.run(op) 488 | 489 | batch_size=32 490 | updata_freq=4 491 | y=0.95 492 | startE=0.4 493 | 
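# Exploration epsilon is annealed linearly from startE down to endE, decreasing by
# stepDrop = (startE - endE) / num_episodes after every episode.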
endE=0.05 494 | anneling_steps=80000 495 | num_episodes=20000 496 | pre_train_steps=5000 497 | max_epLength=150 498 | 499 | path="./dqn" 500 | h_size=512 501 | tau=0.001 502 | 503 | mainQN=Qnetwork(h_size) 504 | targetQN=Qnetwork(h_size) 505 | 506 | init=tf.global_variables_initializer() 507 | 508 | trainables=tf.trainable_variables() 509 | print(len(trainables)) 510 | if(len(trainables)>18): 511 | exit() 512 | targetOps=updateTargetGraph(trainables,tau) 513 | #print(targetOps) 514 | 515 | 516 | myBuffer=experience_buffer() 517 | e=startE 518 | stepDrop=(startE-endE)/num_episodes 519 | 520 | rList=[] 521 | total_steps=0 522 | everage_reward = [] 523 | 524 | font = pygame.font.SysFont("arial", 16); 525 | font_height = font.get_linesize() 526 | event_text = [] 527 | 528 | 529 | plt_targetQ=[] 530 | plt_mainQ=[] 531 | plt_tderr=[] 532 | 533 | saver=tf.train.Saver() 534 | if not os.path.exists(path): 535 | os.makedirs(path) 536 | 537 | config = tf.ConfigProto() 538 | config.gpu_options.per_process_gpu_memory_fraction = 0.8 539 | #session = tf.Session() 540 | with tf.Session(config=config) as sess: 541 | if load_model ==True: 542 | print('Loading Model...') 543 | ckpt=tf.train.get_checkpoint_state(path) 544 | saver.restore(sess,ckpt.model_checkpoint_path) 545 | else: 546 | sess.run(init) #if load_model=false 547 | updateTarget(targetOps,sess) 548 | for i in range(num_episodes+1): 549 | #if(i%400==0): 550 | #max_epLength +=1 551 | episodeBuffer=experience_buffer() 552 | #px,py,pl,pw = Env.random_point() 553 | if(test_mode == False): 554 | map_level = np.random.randint(2,4) 555 | else: 556 | map_level = 3 557 | if test_mode ==True: 558 | if (i == 0): 559 | car_A.move_to([80, 120]) 560 | Env.add_old_goalpoint([60, 120]) 561 | if(i<=5): 562 | Env.goal_point_notrandom(110+80*i,200+30*i) 563 | Env.add_old_goalpoint([110+80*i,200+30*i]) 564 | else: 565 | Env.goal_point_notrandom(570, 300) 566 | Env.add_old_goalpoint([570, 300]) 567 | px,py=car_A.positon() 568 | 569 | #px, py, pl, pw = Env.close_to_goalpoint(150, 150) # 300 570 | else: 571 | Env.goal_point() 572 | Env.reset(map_level) 573 | px,py,pl,pw = Env.close_to_goalpoint(50+i//400,50+i//400) #300 574 | car_A = car(px, py, pl, pw) 575 | car_A.show() 576 | Env.add_node(car_A.positon()) 577 | lidar_A = lidar(car_A.positon()) 578 | lidar_A.scan() 579 | s,r,d = observe() 580 | s = processState(s) 581 | d=False 582 | rALL=0 583 | j=0 584 | while j < max_epLength: 585 | j += 1 586 | if test_mode ==True: 587 | if np.random.rand(1) < 0.05 : 588 | a=np.random.randint(0,8) 589 | else: 590 | a=sess.run(mainQN.predict,feed_dict={mainQN.Input:[s]})[0] 591 | b_out=sess.run(mainQN.Qout,feed_dict={mainQN.Input:[s]}) 592 | b_Q=max(b_out.ravel()) 593 | plt_mainQ.append(b_Q) 594 | if(len(plt_mainQ)>2000): 595 | del plt_mainQ[0] 596 | if(show_data == True): 597 | #b_out=sess.run(mainQN.Qout,feed_dict={mainQN.Input:[s]}) 598 | print('a:',a) 599 | print('b_out',b_out) 600 | print('s',s[1],s[91],s[181],s[271]) 601 | print('b_Q',b_Q) 602 | if(len(plt_mainQ)>1000): 603 | plt.plot(np.arange(len(plt_mainQ)), plt_mainQ) 604 | plt.ylabel('plt_mainQ') 605 | plt.xlabel('training steps') 606 | plt.show() 607 | else: 608 | if np.random.rand(1) < e or total_steps < pre_train_steps: 609 | a=np.random.randint(0,8) 610 | else: 611 | a=sess.run(mainQN.predict,feed_dict={mainQN.Input:[s]})[0] 612 | #print(a) 613 | car_A.step(a) 614 | #lidar_A = lidar(car_A.positon()) 615 | if test_mode == True: 616 | Env.add_node(car_A.positon()) 617 | lidar_A.pos_change(car_A.positon()) 618 | 
lidar_A.scan() 619 | s1,r,d = observe() 620 | s1 = processState(s1) 621 | total_steps+=1 622 | if(show_data == True): 623 | print('r',r) 624 | episodeBuffer.add(np.reshape(np.array([s,a,r,s1,d]),[1,5])) 625 | 626 | if total_steps > pre_train_steps: 627 | 628 | if total_steps %(updata_freq)==0: 629 | trainBatch=myBuffer.sample(batch_size) 630 | A=sess.run(mainQN.predict,feed_dict={mainQN.Input:np.vstack(trainBatch[:,3])}) 631 | Q=sess.run(targetQN.Qout,feed_dict={targetQN.Input:np.vstack(trainBatch[:,3])}) 632 | doubleQ=Q[range(batch_size),A] 633 | targetQ=trainBatch[:,2]+y*doubleQ 634 | if(i%10==0 and j%100 ==0): 635 | plt_targetQ.append(targetQ[10]) 636 | if(len(plt_targetQ)>4000): 637 | del plt_targetQ[0] 638 | td_error = sess.run(mainQN.loss,feed_dict={mainQN.Input:np.vstack(trainBatch[:,0]),mainQN.targetQ:targetQ,mainQN.actions:trainBatch[:,1]}) 639 | if(i%10==0 and j%100 ==0): 640 | plt_tderr.append(td_error) 641 | if (len(plt_tderr) > 4000): 642 | del plt_tderr[0] 643 | if(show_data == True): 644 | if(len(plt_tderr)>10): 645 | plt.plot(np.arange(len(plt_tderr)), plt_tderr) 646 | plt.ylabel('plt_tderr') 647 | plt.xlabel('training steps') 648 | plt.show() 649 | print('td_error',td_error) 650 | print('doubleQ[10]',doubleQ[10]) 651 | print('targetQ[10]',targetQ[10]) 652 | if(len(plt_targetQ)>10): 653 | plt.plot(np.arange(len(plt_targetQ)), plt_targetQ) 654 | plt.ylabel('plt_targetQ') 655 | plt.xlabel('training steps') 656 | plt.show() 657 | if(len(everage_reward)>5): 658 | plt.plot(np.arange(len(everage_reward)), everage_reward) 659 | plt.ylabel('everage_reward') 660 | plt.xlabel('training steps /100') 661 | plt.show() 662 | _ =sess.run(mainQN.updateModel,feed_dict={mainQN.Input:np.vstack(trainBatch[:,0]),mainQN.targetQ:targetQ,mainQN.actions:trainBatch[:,1]}) 663 | 664 | updateTarget(targetOps,sess) 665 | 666 | rALL += r 667 | s = s1 668 | 669 | if d ==True: 670 | break 671 | 672 | 673 | for event in pygame.event.get(): 674 | if event.type == QUIT: 675 | #exit() 676 | print('quit') 677 | elif event.type == MOUSEBUTTONDOWN : 678 | if(show_data == False): 679 | show_data = True 680 | else: 681 | show_data = False 682 | #x,y = pygame.mouse.get_pos() 683 | #car_A.move_to((x,y)) 684 | #car_A.show() 685 | #lidar_A.pos_change((x,y)) 686 | #lidar_A.scan() 687 | #x1,y1 = pygame.mouse.get_pos() 688 | #car_A.move_to((x1,y1)) 689 | #bound = car_A.show() 690 | #if bound == True: 691 | #lidar_A.pos_change((x1,y1)) 692 | #lidar_A.scan() 693 | 694 | 695 | Env.reset(map_level) 696 | car_A.show() 697 | lidar_A.show() 698 | #pygame.display.update() 699 | #fpsClock.tick(10) 700 | if e>endE: 701 | e-=stepDrop 702 | myBuffer.add(episodeBuffer.buffer) 703 | rList.append(rALL) 704 | if i>0 and i % 25==0: 705 | print('episode',i,',average reward of last 25 episode',np.mean(rList[-25:])) 706 | if i>0 and i % 100==0: 707 | everage_reward.append(np.mean(rList[-100:])) 708 | 709 | if i>0 and i % 2000==0: 710 | saver.save(sess,path+'/model-'+str(i)+'.ckpt') 711 | print("Saved Model") 712 | saver.save(sess,path+'/model-'+str(i)+'.ckpt') 713 | 714 | 715 | 716 | 717 | 718 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DRLPathPlanner 2 | 3 | -A framework for path-planing and obstacle avoidance using Deep Reinforcement Learning Techniques. 
4 | 5 | ## Introduction: 6 | The absence of a robust and generalized path planner has been one of the greatest hindrances to making cyber-physical systems part of our daily lives, coexisting with us and improving our standard of living and efficiency. Recent years have seen Reinforcement Learning emerge as a viable solution for tasks involving sequential decision making. In this project, we treat the problem as a learning problem rather than a traditional planning problem, since an agent must learn to generalize and adapt to the unpredictability and uncertainty posed by real-world environments. We therefore propose to build a path-planning agent based on Deep Reinforcement Learning that can handle dynamic obstacles and navigate a non-stationary environment to reach a given goal state. We believe such an agent could serve as a high-level motion planner in a variety of robotics and navigation tasks. 7 | 8 | ## A Brief Overview: 9 | ### Our Environment: 10 | We selected a seemingly simple grid world to first check the feasibility of our approach. Our needs were well met by a third-party Gym environment named “MiniGrid”. 11 |
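A minimal sketch of how the environment is created and stepped, mirroring the calls made in `DPP-v1/MyNN.py` (this assumes the older `gym`/`gym-minigrid` API that the scripts rely on, where `reset()` returns the observation and `step()` returns a 4-tuple; the `image` entry of the observation dict is the 7×7×3 egocentric view fed to the DQN):

```python
import gym
import gym_minigrid  # importing this package registers the MiniGrid environments with gym

# Same environment id used by DPP-v1/MyNN.py.
env = gym.make('MiniGrid-Dynamic-Obstacles-8x8-v0')

obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
print(obs['image'].shape)  # (7, 7, 3) -- the partial grid view the DQN consumes
env.close()
```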
12 | ![Empty MiniGrid environment](empty-env.png) 13 | 14 | ![Agent training with dynamic obstacles](dynamic_obstacles.gif) 15 |
18 | 19 | ## On Going work: 20 | * The overall problem was divided into subtasks and were solved by individual Deep Q-Networks. 21 | * Experimenting with applying a hierarchical structure to the sub tasks and combine with Hierarhial Reinforcement Learning. 22 | * Also simulataneously developing Evolutionary Strategy based solution so as to solve the above problem from a Evolutionary Reinforcement Learning based approach. 23 | -------------------------------------------------------------------------------- /dynamic_obstacles.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lok-i/DRLPathPlanner/5bb6694f3fbf53a2aa79b9195cdfe89caee597ea/dynamic_obstacles.gif -------------------------------------------------------------------------------- /empty-env.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lok-i/DRLPathPlanner/5bb6694f3fbf53a2aa79b9195cdfe89caee597ea/empty-env.png -------------------------------------------------------------------------------- /gridworld_dynamic_obstacles.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import gym_minigrid 3 | import math 4 | import random 5 | import numpy as np 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | from collections import namedtuple 9 | from itertools import count 10 | from PIL import Image 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.optim as optim 15 | import torch.nn.functional as F 16 | import torchvision.transforms as T 17 | 18 | 19 | env = gym.make('MiniGrid-Dynamic-Obstacles-16x16-v0')#'MiniGrid-Empty-5x5-v0') 20 | 21 | print(env.observation_space) 22 | print(env.action_space) 23 | 24 | 25 | 26 | Transition = namedtuple('Transition', 27 | ('state', 'action', 'next_state', 'reward')) 28 | 29 | 30 | class ReplayMemory(object): 31 | 32 | 33 | 34 | 35 | def __init__(self, capacity): 36 | self.capacity = capacity 37 | self.memory = [] 38 | self.position = 0 39 | 40 | def push(self, *args): 41 | """Saves a transition.""" 42 | if len(self.memory) < self.capacity: 43 | self.memory.append(None) 44 | self.memory[self.position] = Transition(*args) 45 | self.position = (self.position + 1) % self.capacity 46 | 47 | def sample(self, batch_size): 48 | return random.sample(self.memory, batch_size) 49 | 50 | def __len__(self): 51 | return len(self.memory) 52 | 53 | class DQN(nn.Module): 54 | 55 | def __init__(self, h, w, outputs): 56 | super(DQN, self).__init__() 57 | self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1) 58 | self.bn1 = nn.BatchNorm2d(16) 59 | self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1) 60 | self.bn2 = nn.BatchNorm2d(32) 61 | self.conv3 = nn.Conv2d(32, 32, kernel_size=2, stride=1) 62 | self.bn3 = nn.BatchNorm2d(32) 63 | 64 | # Number of Linear input connections depends on output of conv2d layers 65 | # and therefore the input image size, so compute it. 66 | def conv2d_size_out(size, kernel_size = 5, stride = 2): 67 | return (size - (kernel_size - 1) - 1) // stride + 1 68 | convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) 69 | convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) 70 | linear_input_size = convw * convh * 32 71 | self.head = nn.Linear(linear_input_size, outputs) 72 | 73 | # Called with either one element to determine next action, or a batch 74 | # during optimization. Returns tensor([[left0exp,right0exp]...]). 
75 | def forward(self, x): 76 | x = F.relu(self.bn1(self.conv1(x))) 77 | x = F.relu(self.bn2(self.conv2(x))) 78 | x = F.relu(self.bn3(self.conv3(x))) 79 | return self.head(x.view(x.size(0), -1)) 80 | 81 | BATCH_SIZE = 256 82 | GAMMA = 0.999 83 | EPS_START = 0.9 84 | eps_of_episode = [] 85 | 86 | EPS_END = 0.05 87 | EPS_DECAY = 100 88 | TARGET_UPDATE = 10 89 | 90 | # Get screen size so that we can initialize layers correctly based on shape 91 | # returned from AI gym. Typical dimensions at this point are close to 3x40x90 92 | # which is the result of a clamped and down-scaled render buffer in get_screen() 93 | 94 | 95 | init_screen = env.reset() 96 | screen_height =7 97 | screen_width = 7 98 | device = 'cpu' 99 | # Get number of actions from gym action space 100 | n_actions = env.action_space.n 101 | 102 | policy_net = DQN(screen_height, screen_width, n_actions).to(device) 103 | target_net = DQN(screen_height, screen_width, n_actions).to(device) 104 | target_net.load_state_dict(policy_net.state_dict()) 105 | target_net.eval() 106 | 107 | optimizer = optim.RMSprop(policy_net.parameters()) 108 | memory = ReplayMemory(10000) 109 | 110 | 111 | steps_done = 0 112 | 113 | def get_state(x): 114 | #print(x) 115 | x.unsqueeze_(0) 116 | 117 | x.transpose_(1,3) 118 | #print(x) 119 | return x 120 | 121 | def select_action(state): 122 | global steps_done 123 | sample = random.random() 124 | eps_threshold = EPS_END + (EPS_START - EPS_END) * \ 125 | math.exp(-1. * steps_done / EPS_DECAY) 126 | 127 | 128 | steps_done += 1 129 | eps_of_episode.append(eps_threshold) 130 | if sample > eps_threshold: 131 | with torch.no_grad(): 132 | # t.max(1) will return largest column value of each row. 133 | # second column on max result is index of where max element was 134 | # found, so we pick action with the larger expected reward. 135 | return policy_net(state).max(1)[1].view(1, 1) 136 | else: 137 | return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long) 138 | 139 | 140 | episode_durations = [] 141 | 142 | 143 | def plot_durations(): 144 | plt.figure(2) 145 | plt.clf() 146 | durations_t = torch.tensor(episode_durations, dtype=torch.float) 147 | plt.subplot(211) 148 | plt.title('Training...') 149 | plt.xlabel('Episode') 150 | plt.ylabel('Score') 151 | plt.plot(durations_t.numpy()) 152 | plt.subplot(212) 153 | plt.title('Training...') 154 | plt.xlabel('Episode') 155 | plt.ylabel('Epsilon') 156 | plt.plot(torch.tensor(eps_of_episode,dtype=torch.float).numpy()) 157 | #plt.show() 158 | 159 | # Take 100 episode averages and plot them too 160 | if len(durations_t) >= 100: 161 | means = durations_t.unfold(0, 100, 1).mean(1).view(-1) 162 | means = torch.cat((torch.zeros(99), means)) 163 | plt.plot(means.numpy()) 164 | 165 | plt.pause(0.001) # pause a bit so that plots are updated 166 | # if is_ipython: 167 | # display.clear_output(wait=True) 168 | # display.display(plt.gcf()) 169 | 170 | def optimize_model(): 171 | if len(memory) < BATCH_SIZE: 172 | return 173 | transitions = memory.sample(BATCH_SIZE) 174 | # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for 175 | # detailed explanation). This converts batch-array of Transitions 176 | # to Transition of batch-arrays. 
177 | batch = Transition(*zip(*transitions)) 178 | 179 | # Compute a mask of non-final states and concatenate the batch elements 180 | # (a final state would've been the one after which simulation ended) 181 | non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, 182 | batch.next_state)), device=device, dtype=torch.bool) 183 | non_final_next_states = torch.cat([s for s in batch.next_state 184 | if s is not None]) 185 | state_batch = torch.cat(batch.state) 186 | action_batch = torch.cat(batch.action) 187 | reward_batch = torch.cat(batch.reward) 188 | 189 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 190 | # columns of actions taken. These are the actions which would've been taken 191 | # for each batch state according to policy_net 192 | state_action_values = policy_net(state_batch).gather(1, action_batch) 193 | 194 | # Compute V(s_{t+1}) for all next states. 195 | # Expected values of actions for non_final_next_states are computed based 196 | # on the "older" target_net; selecting their best reward with max(1)[0]. 197 | # This is merged based on the mask, such that we'll have either the expected 198 | # state value or 0 in case the state was final. 199 | next_state_values = torch.zeros(BATCH_SIZE, device=device) 200 | next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach() 201 | # Compute the expected Q values 202 | expected_state_action_values = (next_state_values * GAMMA) + reward_batch 203 | 204 | # Compute Huber loss 205 | loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) 206 | 207 | # Optimize the model 208 | optimizer.zero_grad() 209 | loss.backward() 210 | for param in policy_net.parameters(): 211 | param.grad.data.clamp_(-1, 1) 212 | optimizer.step() 213 | 214 | def train(): 215 | num_episodes = 500 216 | for i_episode in range(num_episodes): 217 | # Initialize the environment and state 218 | env.reset() 219 | #last_screen = env.render() 220 | 221 | #current_screen = env.render() 222 | 223 | state_dic,_,_,_= env.step(env.action_space.sample())#env.render() 224 | state = get_state(torch.tensor(state_dic['image'],dtype =torch.float)) 225 | #print(state.size()) 226 | #print(state) 227 | for t in count(): 228 | # Select and perform an action 229 | env.render() 230 | action = select_action(state) 231 | observation, reward, done, _ = env.step(action.item()) 232 | 233 | reward = torch.tensor([reward],dtype=torch.long, device=device) 234 | 235 | 236 | # Observe new state 237 | 238 | 239 | if not done: 240 | next_state =get_state(torch.tensor(observation['image'],dtype =torch.float)) #env.render()# current_screen - last_screen 241 | else: 242 | next_state = None 243 | 244 | # Store the transition in memory 245 | memory.push(state, action, next_state, reward) 246 | 247 | # Move to the next state 248 | state = next_state 249 | 250 | # Perform one step of the optimization (on the target network) 251 | optimize_model() 252 | if done: 253 | episode_durations.append(t + 1) 254 | plot_durations() 255 | break 256 | # Update the target network, copying all weights and biases in DQN 257 | if i_episode % TARGET_UPDATE == 0: 258 | target_net.load_state_dict(policy_net.state_dict()) 259 | 260 | print('Complete') 261 | env.render() 262 | env.close() 263 | plt.ioff() 264 | plt.show() 265 | 266 | 267 | if __name__ == '__main__': 268 | print("Training :") 269 | train() 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | # for i_episode in range(20): 280 | # observation = env.reset() 281 | # for 
t in range(100): 282 | # env.render() 283 | # print(observation) 284 | # action = env.action_space.sample() 285 | # observation, reward, done, info = env.step(action) 286 | # if done: 287 | # print("Episode finished after {} timesteps".format(t+1)) 288 | # break 289 | # env.close() 290 | --------------------------------------------------------------------------------