├── DPP-v1 ├── .gitkeep ├── MyNN.py └── Test_Model.py ├── Initial_workin_code.py ├── README.md ├── dynamic_obstacles.gif ├── empty-env.png └── gridworld_dynamic_obstacles.py /DPP-v1/.gitkeep: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DPP-v1/MyNN.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import gym_minigrid 3 | import math 4 | import random 5 | import numpy as np 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | from collections import namedtuple 9 | from itertools import count 10 | from PIL import Image 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.optim as optim 15 | import torch.nn.functional as F 16 | import torchvision.transforms as T 17 | 18 | 19 | env = gym.make('MiniGrid-Dynamic-Obstacles-8x8-v0')#'MiniGrid-Empty-5x5-v0') 20 | 21 | print(env.observation_space) 22 | print(env.action_space) 23 | 24 | 25 | 26 | Transition = namedtuple('Transition', 27 | ('state', 'action', 'next_state', 'reward')) 28 | 29 | 30 | class ReplayMemory(object): 31 | 32 | 33 | 34 | 35 | def __init__(self, capacity): 36 | self.capacity = capacity 37 | self.memory = [] 38 | self.position = 0 39 | 40 | def push(self, *args): 41 | """Saves a transition.""" 42 | if len(self.memory) < self.capacity: 43 | self.memory.append(None) 44 | self.memory[self.position] = Transition(*args) 45 | self.position = (self.position + 1) % self.capacity 46 | 47 | def sample(self, batch_size): 48 | return random.sample(self.memory, batch_size) 49 | 50 | def __len__(self): 51 | return len(self.memory) 52 | 53 | class DQN(nn.Module): 54 | 55 | def __init__(self, h, w, outputs): 56 | super(DQN, self).__init__() 57 | self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1) 58 | self.bn1 = nn.BatchNorm2d(32) 59 | self.conv2 = nn.Conv2d(32,64, kernel_size=3, stride=1) 60 | self.bn2 = nn.BatchNorm2d(64) 61 | self.conv3 = nn.Conv2d(64, 64, kernel_size=2, stride=1) 62 | self.bn3 = nn.BatchNorm2d(64) 63 | 64 | # Number of Linear input connections depends on output of conv2d layers 65 | # and therefore the input image size, so compute it. 66 | def conv2d_size_out(size, kernel_size = 5, stride = 2): 67 | return (size - (kernel_size - 1) - 1) // stride + 1 68 | convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) 69 | convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) 70 | linear_input_size = convw * convh * 64 71 | self.head = nn.Linear(linear_input_size, outputs) 72 | 73 | # Called with either one element to determine next action, or a batch 74 | # during optimization. Returns tensor([[left0exp,right0exp]...]). 75 | def forward(self, x): 76 | x = F.relu(self.bn1(self.conv1(x))) 77 | x = F.relu(self.bn2(self.conv2(x))) 78 | x = F.relu(self.bn3(self.conv3(x))) 79 | return self.head(x.view(x.size(0), -1)) 80 | 81 | BATCH_SIZE = 256 #128 82 | GAMMA = 0.999 83 | EPS_START = 1 84 | EPS_END = 0.05 85 | 86 | EPS_DECAY = 4000 #200 for 50 epi 87 | num_episodes = 500 88 | TARGET_UPDATE = 30 89 | #PATH ='./logs/' 90 | 91 | # Get screen size so that we can initialize layers correctly based on shape 92 | # returned from AI gym. 
Typical dimensions at this point are close to 3x40x90 93 | # which is the result of a clamped and down-scaled render buffer in get_screen() 94 | 95 | 96 | init_screen = env.reset() 97 | screen_height =7 98 | screen_width = 7 99 | device = 'cpu' 100 | render_status = False 101 | save_model = True 102 | PATH = './logs/' 103 | 104 | # Get number of actions from gym action space 105 | n_actions = env.action_space.n 106 | 107 | policy_net = DQN(screen_height, screen_width, n_actions).to(device) 108 | target_net = DQN(screen_height, screen_width, n_actions).to(device) 109 | target_net.load_state_dict(policy_net.state_dict()) 110 | target_net.eval() 111 | 112 | optimizer = optim.Adam(policy_net.parameters()) 113 | memory = ReplayMemory(10000) 114 | 115 | 116 | episode_durations = [] 117 | eps_of_episode = [] 118 | reward_hist=[] 119 | steps_done = 0 120 | 121 | 122 | def save_logs(Network , grap_plot ,Test_name , ith_sample): 123 | torch.save(Network,PATH+Test_name+ith_sample) 124 | plt.savefig(PATH+ith_sample+'.png') 125 | 126 | 127 | def get_state(x): 128 | #print(x) 129 | x.unsqueeze_(0) 130 | 131 | x.transpose_(1,3) 132 | #print(x) 133 | return x 134 | 135 | def select_action(state): 136 | global steps_done 137 | sample = random.random() 138 | eps_threshold = EPS_END + (EPS_START - EPS_END) * \ 139 | math.exp(-1. * steps_done / EPS_DECAY) 140 | 141 | 142 | steps_done += 1 143 | eps_of_episode.append(eps_threshold) 144 | if sample > eps_threshold: 145 | with torch.no_grad(): 146 | # t.max(1) will return largest column value of each row. 147 | # second column on max result is index of where max element was 148 | # found, so we pick action with the larger expected reward. 149 | return policy_net(state).max(1)[1].view(1, 1) 150 | else: 151 | return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long) 152 | 153 | 154 | 155 | 156 | def plot_durations(): 157 | plt.figure(2) 158 | plt.clf() 159 | durations_t = torch.tensor(episode_durations, dtype=torch.float) 160 | plt.subplot(221) 161 | plt.title('Training...') 162 | plt.xlabel('Episode') 163 | plt.ylabel('Duration_alive') 164 | plt.plot(durations_t.numpy()) 165 | plt.subplot(222) 166 | #plt.title('Training...') 167 | plt.xlabel('Time_Steps') 168 | plt.ylabel('Epsilon') 169 | plt.plot(torch.tensor(eps_of_episode,dtype=torch.float).numpy()) 170 | plt.subplot(223) 171 | plt.title('Training...') 172 | plt.xlabel('Episode') 173 | plt.ylabel('Total_reward') 174 | rew_hist_tensor = torch.tensor(reward_hist,dtype=torch.float) 175 | plt.plot(rew_hist_tensor.numpy()) 176 | #plt.show() 177 | 178 | # Take 100 episode averages and plot them too 179 | if len(durations_t) >= 100: 180 | 181 | plt.subplot(221) 182 | means = durations_t.unfold(0, 100, 1).mean(1).view(-1) 183 | means = torch.cat((torch.zeros(99), means)) 184 | plt.plot(means.numpy()) 185 | plt.subplot(223) 186 | means = rew_hist_tensor.unfold(0, 100, 1).mean(1).view(-1) 187 | means = torch.cat((torch.zeros(99), means)) 188 | plt.plot(means.numpy()) 189 | 190 | 191 | plt.pause(0.001) # pause a bit so that plots are updated 192 | # if is_ipython: 193 | # display.clear_output(wait=True) 194 | # display.display(plt.gcf()) 195 | 196 | def optimize_model(): 197 | if len(memory) < BATCH_SIZE: 198 | return 199 | transitions = memory.sample(BATCH_SIZE) 200 | # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for 201 | # detailed explanation). This converts batch-array of Transitions 202 | # to Transition of batch-arrays. 
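# For example, zip(*[Transition(s1,a1,ns1,r1), Transition(s2,a2,ns2,r2)]) yields
# (s1,s2), (a1,a2), (ns1,ns2), (r1,r2), so the next line builds a single Transition
# whose fields are tuples over the whole batch (states, actions, next_states, rewards).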
203 | batch = Transition(*zip(*transitions)) 204 | 205 | # Compute a mask of non-final states and concatenate the batch elements 206 | # (a final state would've been the one after which simulation ended) 207 | non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, 208 | batch.next_state)), device=device, dtype=torch.bool) 209 | non_final_next_states = torch.cat([s for s in batch.next_state 210 | if s is not None]) 211 | state_batch = torch.cat(batch.state) 212 | action_batch = torch.cat(batch.action) 213 | reward_batch = torch.cat(batch.reward) 214 | 215 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 216 | # columns of actions taken. These are the actions which would've been taken 217 | # for each batch state according to policy_net 218 | state_action_values = policy_net(state_batch).gather(1, action_batch) 219 | 220 | # Compute V(s_{t+1}) for all next states. 221 | # Expected values of actions for non_final_next_states are computed based 222 | # on the "older" target_net; selecting their best reward with max(1)[0]. 223 | # This is merged based on the mask, such that we'll have either the expected 224 | # state value or 0 in case the state was final. 225 | next_state_values = torch.zeros(BATCH_SIZE, device=device) 226 | next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach() 227 | # Compute the expected Q values 228 | expected_state_action_values = (next_state_values * GAMMA) + reward_batch 229 | 230 | # Compute Huber loss 231 | loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) 232 | 233 | # Optimize the model 234 | optimizer.zero_grad() 235 | loss.backward() 236 | for param in policy_net.parameters(): 237 | param.grad.data.clamp_(-1, 1) 238 | optimizer.step() 239 | 240 | def train(): 241 | 242 | for i_episode in range(num_episodes): 243 | # Initialize the environment and state 244 | env.reset() 245 | #last_screen = env.render() 246 | 247 | #current_screen = env.render() 248 | 249 | state_dic,_,_,_= env.step(env.action_space.sample())#env.render() 250 | state = get_state(torch.tensor(state_dic['image'],dtype =torch.float)) 251 | #print(state.size()) 252 | #print(state) 253 | reward_epi = 0 254 | for t in count(): 255 | # Select and perform an action 256 | if render_status: 257 | env.render() 258 | action = select_action(state) 259 | observation, reward, done, _ = env.step(action.item()) 260 | 261 | reward = torch.tensor([reward],dtype=torch.long, device=device) 262 | 263 | 264 | # Observe new state 265 | 266 | 267 | if not done: 268 | next_state =get_state(torch.tensor(observation['image'],dtype =torch.float)) #env.render()# current_screen - last_screen 269 | else: 270 | next_state = None 271 | 272 | # Store the transition in memory 273 | memory.push(state, action, next_state, reward) 274 | 275 | # Move to the next state 276 | state = next_state 277 | 278 | # Perform one step of the optimization (on the target network) 279 | reward_epi = reward_epi+reward.item() 280 | optimize_model() 281 | 282 | if done: 283 | if t != 0: 284 | reward_epi = reward_epi/t 285 | reward_hist.append(reward_epi) 286 | episode_durations.append(t + 1) 287 | plot_durations() 288 | break 289 | 290 | # Update the target network, copying all weights and biases in DQN 291 | if i_episode % TARGET_UPDATE == 0: 292 | target_net.load_state_dict(policy_net.state_dict()) 293 | 294 | print('Complete') 295 | #env.render() 296 | env.close() 297 | #plt.ioff() 298 | #plt.show() 299 | 300 | 301 | if __name__ == '__main__': 302 | 
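# Entry point: train for num_episodes episodes; if save_model is True, save_logs()
# stores the trained policy_net and the current training figure under ./logs/.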
print("Training :") 303 | train() 304 | Folder_name = 'test_(8x8)' 305 | if save_model == True : 306 | save_logs(policy_net,plt,Folder_name,'2') 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | # for i_episode in range(20): 319 | # observation = env.reset() 320 | # for t in range(100): 321 | # env.render() 322 | # print(observation) 323 | # action = env.action_space.sample() 324 | # observation, reward, done, info = env.step(action) 325 | # if done: 326 | # print("Episode finished after {} timesteps".format(t+1)) 327 | # break 328 | # env.close() -------------------------------------------------------------------------------- /DPP-v1/Test_Model.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import gym_minigrid 3 | from itertools import count 4 | import time 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | env = gym.make('MiniGrid-Dynamic-Obstacles-8x8-v0')#'MiniGrid-Empty-5x5-v0') 11 | Fil_name='test_1' 12 | PATH = './logs/'+Fil_name 13 | print(env.observation_space) 14 | print(env.action_space) 15 | 16 | 17 | class DQN(nn.Module): 18 | 19 | def __init__(self, h, w, outputs): 20 | super(DQN, self).__init__() 21 | self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1) 22 | self.bn1 = nn.BatchNorm2d(32) 23 | self.conv2 = nn.Conv2d(32,64, kernel_size=3, stride=1) 24 | self.bn2 = nn.BatchNorm2d(64) 25 | self.conv3 = nn.Conv2d(64, 64, kernel_size=2, stride=1) 26 | self.bn3 = nn.BatchNorm2d(64) 27 | 28 | # Number of Linear input connections depends on output of conv2d layers 29 | # and therefore the input image size, so compute it. 30 | def conv2d_size_out(size, kernel_size = 5, stride = 2): 31 | return (size - (kernel_size - 1) - 1) // stride + 1 32 | convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) 33 | convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) 34 | linear_input_size = convw * convh * 64 35 | self.head = nn.Linear(linear_input_size, outputs) 36 | 37 | # Called with either one element to determine next action, or a batch 38 | # during optimization. Returns tensor([[left0exp,right0exp]...]). 
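# x is expected as a (N, 3, 7, 7) float tensor; get_state() below builds it from the
# 7x7x3 'image' field of the MiniGrid observation by adding a batch dim and swapping axes 1 and 3.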
39 | def forward(self, x): 40 | x = F.relu(self.bn1(self.conv1(x))) 41 | x = F.relu(self.bn2(self.conv2(x))) 42 | x = F.relu(self.bn3(self.conv3(x))) 43 | return self.head(x.view(x.size(0), -1)) 44 | def get_state(x): 45 | #print(x) 46 | x.unsqueeze_(0) 47 | 48 | x.transpose_(1,3) 49 | #print(x) 50 | return x 51 | 52 | 53 | model = torch.load(PATH) 54 | model.eval() 55 | device = 'cpu' 56 | def test_model(num_of_episodes,): 57 | 58 | 59 | for i in range(num_of_episodes): 60 | env.reset() 61 | 62 | state_dic,_,_,_= env.step(env.action_space.sample())#env.render() 63 | state = get_state(torch.tensor(state_dic['image'],dtype =torch.float)) 64 | reward_this_epi = 0 65 | for t in count(): 66 | env.render() 67 | time.sleep(0.1) 68 | action = model(state).max(1)[1].view(1, 1) 69 | observation,reward ,done,_ =env.step(action.item()) 70 | reward = torch.tensor([reward],dtype=torch.long, device=device) 71 | reward_this_epi = reward_this_epi+ reward.item() 72 | state =get_state(torch.tensor(observation['image'],dtype =torch.float)) 73 | if done: 74 | reward_this_epi =reward_this_epi/(t+1) 75 | print('Episode:',i,'Episode_Score:',reward_this_epi,'Steps_alive',t+1) 76 | break 77 | 78 | 79 | if __name__ == '__main__': 80 | print('Testing..') 81 | test_model(10) 82 | -------------------------------------------------------------------------------- /Initial_workin_code.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Created on Thu Sep 28 20:52:25 2017 4 | @author: Administrator 5 | """ 6 | import tensorflow as tf 7 | import numpy as np 8 | import time 9 | import random 10 | from math import sqrt,cos,sin,atan2 11 | import pygame 12 | from pygame.locals import * 13 | from sys import exit 14 | import os 15 | import matplotlib.pyplot as plt 16 | # import tensorflow.compat.v1 as tf1 17 | # tf1.disable_v2_behavior() 18 | #tf.compat.v1.disable_v2_behavior() 19 | 20 | 21 | pygame.init() 22 | fpsClock = pygame.time.Clock() 23 | XDIM = 640 24 | YDIM = 480 25 | SCREEN_SIZE = (XDIM, YDIM) 26 | screen = pygame.display.set_mode(SCREEN_SIZE, 0, 32) 27 | pygame.display.set_caption("NAVIGATION!") 28 | show_data = True 29 | 30 | test_mode = True #False 31 | load_model = False 32 | 33 | class vector(object): 34 | def __init__(self,x=0,y=0): 35 | self.x = x 36 | self.y = y 37 | def cal_vector(self,p1,p2): 38 | self.x = p2[0]-p1[0] 39 | self.y = p2[1]-p1[1] 40 | return (self.x,self.y) 41 | def get_magnitude(self): 42 | return sqrt(self.x**2+self.y**2) 43 | 44 | def get_point(vector,p1): 45 | return (vector[0]+p1[0],vector[1]+p1[1]) 46 | 47 | def get_distance(p1,p2): 48 | return sqrt((p1[0]-p2[0])**2+(p1[1]-p2[1])**2) 49 | 50 | class lidar(object): 51 | def __init__(self,p,distance=200,angle=0): 52 | self.x = p[0] 53 | self.y = p[1] 54 | self.distance = distance 55 | self.angle = 0 56 | self.lidar =(0,0) 57 | self.data = [] 58 | self.data_2 = [] 59 | def scan(self): 60 | dis = [] 61 | for i in range(0,360,1): 62 | for j in range(1,40): 63 | x1 = int(self.x + self.distance*cos(i*3.14/180)*j/40) 64 | y1 = int(self.y + self.distance*sin(i*3.14/180)*j/40) 65 | #print(x1,y1) 66 | pix = (0,0,0) 67 | if(x1>0 and x10 and y1=XDIM or y1<=0 or y1>=YDIM): 72 | break 73 | elif(pix[0]>100 and pix[1]>100 and pix[2]>100 ): 74 | break 75 | else: 76 | dis = [x1,y1] 77 | if len(self.data)<360: 78 | self.data.append(dis) 79 | else: 80 | self.data[i] = dis 81 | #print(self.data[i]) 82 | if(self.x>0 and self.x0 and self.y0 and self.miny>0 and self.maxx0 and self.miny+y>0 and self.maxx+xlength//2 
and max_xlength//2 and max_x>=XDIM-length//2: 231 | x = random.randint(min_x+1,XDIM-length//2-1) 232 | elif min_x<=length//2 and max_x>=XDIM-length//2: 233 | x = random.randint(length//2+1,XDIM-length//2-1) 234 | 235 | if(min_y>width//2 and max_ywidth//2 and max_y>=YDIM-width//2: 240 | y = random.randint(min_y+1,YDIM-width//2-1) 241 | elif min_y<=width//2 and max_y>=YDIM-width//2: 242 | y = random.randint(width//2+1,YDIM-width//2-1) 243 | #print(x,y) 244 | flag = self.point_collision_detect(x,y,length,width) 245 | count +=1 246 | if(count > 200): 247 | print("over count") 248 | break 249 | return x,y,length,width 250 | 251 | 252 | def goal_point(self,length=5,width=5): 253 | flag = False 254 | while (not flag): 255 | x = random.randint(int(length//2+1),int(XDIM-length//2-1)) 256 | y = random.randint(int(width//2+1),int(YDIM-width//2-1)) 257 | flag = self.point_collision_detect(x,y,length,width) 258 | #print(not flag) 259 | pygame.draw.circle(screen, (0,255,0),(x,y),length) 260 | self.goal = x,y,length,width 261 | return x,y,length,width 262 | 263 | def goal_point_notrandom(self,x,y,length=5,wigth=5): 264 | self.goal = x,y,length,wigth 265 | pygame.draw.circle(screen, (0, 255, 0), (x, y), length) 266 | return x,y,length,wigth 267 | 268 | def check_goal(self,p,size=[20,20]): 269 | dis = get_distance(p,(self.goal[0],self.goal[1])) 270 | if self.point_collision_detect(p[0],p[1],size[0],size[1]): 271 | if dis<=self.scope: 272 | self.reward = 1 273 | if(test_mode ==True): 274 | return self.reward, True 275 | return self.reward,False 276 | else: 277 | self.reward = -0.004 278 | return self.reward,False 279 | else: 280 | self.reward = -1 281 | return self.reward,False 282 | 283 | 284 | def reset(self,level=3): 285 | screen.fill((20,20,20)) 286 | self.init_obstacles(level) 287 | pygame.draw.circle(screen, (0, 255, 0), (self.goal[0], self.goal[1]), self.goal[2]) 288 | if(test_mode == True): 289 | self.draw_old_goalpoint() 290 | self.draw_part_path() 291 | self.draw_global_path() 292 | 293 | 294 | def add_old_goalpoint(self,node): 295 | self.old_goalpoint.append(node) 296 | 297 | def clear_old_goalpoint(self): 298 | self.old_goalpoint = [] 299 | 300 | def draw_old_goalpoint(self): 301 | if(len(self.old_goalpoint)>2): 302 | for goal_point in self.old_goalpoint: 303 | pygame.draw.circle(screen, (0, 0, 255), goal_point, 5) 304 | 305 | def draw_global_path(self): 306 | path_start = [] 307 | if (len(self.old_goalpoint) > 2): 308 | path_start = self.old_goalpoint[0] 309 | for path_end in self.old_goalpoint: 310 | pygame.draw.line(screen, (180, 180, 0), path_start, path_end, 2) 311 | path_start = path_end 312 | 313 | def add_node(self,node): 314 | self.node.append(node) 315 | 316 | def clear_node(self): 317 | self.node=[] 318 | 319 | def draw_part_path(self): 320 | node_start =[] 321 | if(len(self.node)>2): 322 | node_start = self.node[0] 323 | for node_end in self.node: 324 | pygame.draw.line(screen, (255,0,0), node_start, node_end, 2) 325 | node_start = node_end 326 | 327 | def init_obstacles(self,configNum,movespeed=1): 328 | rectObs = [] 329 | 330 | if(self.dyenv >100 and self.obstacle_flag == 0): 331 | self.obstacle_flag = 1 332 | elif(self.obstacle_flag == 1 and self.dyenv>0): 333 | movespeed =-1 334 | else: 335 | self.obstacle_flag = 0 336 | movespeed = 1 337 | self.dyenv += movespeed 338 | #print("config "+ str(configNum)) 339 | if (configNum == 0): 340 | rectObs.append(pygame.Rect((640 / 2.0 - 50, 480/ 2.0 - 100),(100,200))) 341 | if (configNum == 1): 342 | 
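# Layout 1: fixed wall segments spread over the 640x480 arena, three 20x20 obstacles
# whose positions drift back and forth with self.dyenv, and four border walls around the screen edge.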
#rectObs.append(pygame.Rect((40,20),(20,200))) 343 | rectObs.append(pygame.Rect((120,280),(20,200))) 344 | rectObs.append(pygame.Rect((100,100),(80,20))) 345 | #rectObs.append(pygame.Rect((60,300),(50,20))) 346 | rectObs.append(pygame.Rect((140,0),(20,120))) 347 | rectObs.append(pygame.Rect((140,300),(80,20))) 348 | #rectObs.append(pygame.Rect((200,400),(150,20))) 349 | #rectObs.append(pygame.Rect((280,200),(20,200))) 350 | #rectObs.append(pygame.Rect((300,420),(250,20))) 351 | rectObs.append(pygame.Rect((350,0),(20,100))) 352 | rectObs.append(pygame.Rect((350,400),(20,100))) 353 | rectObs.append(pygame.Rect((400,340),(100,20))) 354 | rectObs.append(pygame.Rect((450,200),(150,20))) 355 | rectObs.append(pygame.Rect((500,0),(20,140))) 356 | rectObs.append(pygame.Rect((550,350),(20,500))) 357 | rectObs.append(pygame.Rect((620,50),(80,20))) 358 | rectObs.append(pygame.Rect((620,300),(80,20))) 359 | rectObs.append(pygame.Rect((220, 150 + self.dyenv), (20, 20))) 360 | rectObs.append(pygame.Rect((300+ self.dyenv, 240 ), (20, 20))) 361 | rectObs.append(pygame.Rect((40 + self.dyenv, 240), (20, 20))) 362 | #rectObs.append(pygame.Rect((700,50),(20,270))) 363 | rectObs.append(pygame.Rect((140,0),(600,20))) 364 | rectObs.append(pygame.Rect((0, 0), (20, 460))) 365 | rectObs.append(pygame.Rect((0, 0), (620, 20))) 366 | rectObs.append(pygame.Rect((620, 0), (20, 460))) 367 | rectObs.append(pygame.Rect((0, 460), (640, 20))) 368 | if (configNum == 2): 369 | rectObs.append(pygame.Rect((200,80),(20,200))) 370 | rectObs.append(pygame.Rect((220,80),(200,20))) 371 | rectObs.append(pygame.Rect((200,350),(200,20))) 372 | rectObs.append(pygame.Rect((400, 200), (60, 60))) 373 | rectObs.append(pygame.Rect((80, 100), (40, 40))) 374 | rectObs.append(pygame.Rect((80, 200+self.dyenv), (20, 20))) 375 | rectObs.append(pygame.Rect((280+self.dyenv, 200 ), (20, 20))) 376 | rectObs.append(pygame.Rect((500, 300 + self.dyenv), (20, 20))) 377 | 378 | rectObs.append(pygame.Rect((0,0),(20,460))) 379 | rectObs.append(pygame.Rect((0,0),(620,20))) 380 | rectObs.append(pygame.Rect((620,0),(20,460))) 381 | rectObs.append(pygame.Rect((0,460),(640,20))) 382 | # rectObs.append(pygame.Rect((40,10),(100,200))) 383 | if (configNum == 3): 384 | rectObs.append(pygame.Rect((40,40),(40,40))) 385 | rectObs.append(pygame.Rect((140, 140), (80, 80))) 386 | rectObs.append(pygame.Rect((350, 400), (40, 40))) 387 | rectObs.append(pygame.Rect((500, 160), (40, 40))) 388 | rectObs.append(pygame.Rect((380, 100), (40, 40))) 389 | rectObs.append(pygame.Rect((300, 340), (80, 40))) 390 | rectObs.append(pygame.Rect((80+ self.dyenv, 300 ), (20, 20))) 391 | rectObs.append(pygame.Rect((280 + self.dyenv, 240), (20, 20))) 392 | rectObs.append(pygame.Rect((500, 300 + self.dyenv), (20, 20))) 393 | rectObs.append(pygame.Rect((400, 250 + self.dyenv), (20, 20))) 394 | 395 | rectObs.append(pygame.Rect((0, 0), (20, 460))) 396 | rectObs.append(pygame.Rect((0, 0), (620, 20))) 397 | rectObs.append(pygame.Rect((620, 0), (20, 460))) 398 | rectObs.append(pygame.Rect((0, 460), (640, 20))) 399 | 400 | 401 | for rect in rectObs: 402 | pygame.draw.rect(screen, (255,255,255), rect) 403 | 404 | 405 | Env = NavigationEnv() 406 | px,py,pl,pw = Env.random_point() 407 | car_A = car(px,py,pl,pw) 408 | lidar_A = lidar(car_A.positon()) 409 | 410 | def observe(): 411 | state = lidar_A.state_2() 412 | if len(state) < 400: 413 | for i in range(20): 414 | state.append([Env.goal[0]-car_A.positon()[0],Env.goal[1]-car_A.positon()[1]]) 415 | for i in range(20): 416 | 
state.append([Env.goal[0]-car_A.positon()[0],Env.goal[1]-car_A.positon()[1]]) 417 | else : 418 | for i in range(20): 419 | state[360+i] = [Env.goal[0]-car_A.positon()[0],Env.goal[1]-car_A.positon()[1]] 420 | for i in range(20): 421 | state[380+i] = [Env.goal[0]-car_A.positon()[0],Env.goal[1]-car_A.positon()[1]] 422 | reward,done = Env.check_goal(car_A.positon(),car_A.size()) 423 | #for s in state: 424 | #print (s) 425 | return state,reward,done 426 | 427 | 428 | 429 | class Qnetwork(object): 430 | def __init__(self,size): 431 | self.Input=tf.compat.v1.placeholder(shape=[None,800],dtype=tf.float32) 432 | self.imageIn=tf.reshape(self.Input,shape=[-1,20,20,2]) 433 | self.conv1=tf.contrib.layers.convolution2d(inputs=self.imageIn,num_outputs=16,kernel_size=[2,2],stride=[2,2],padding='VALID',biases_initializer=None) 434 | self.conv2=tf.contrib.layers.convolution2d(inputs=self.conv1,num_outputs=32,kernel_size=[2,2],stride=[2,2],padding='VALID',biases_initializer=None) 435 | self.conv3=tf.contrib.layers.convolution2d(inputs=self.conv2,num_outputs=256,kernel_size=[5,5],stride=[1,1],padding='VALID',biases_initializer=None) 436 | self.fullconnect1 = tf.reshape(self.conv3,shape=[-1,256]) 437 | self.W1=tf.Variable(tf.random_normal([256,size])) 438 | self.b1=tf.Variable(tf.constant(0.1,shape=[size])) 439 | self.layer1=tf.matmul(self.fullconnect1,self.W1)+self.b1 440 | self.W2=tf.Variable(tf.random_normal([size,size])) 441 | self.b2=tf.Variable(tf.constant(0.1,shape=[size])) 442 | self.layer2=tf.nn.relu(tf.matmul(self.layer1,self.W2)+self.b2) 443 | self.layerAC,self.layerVC=tf.split(self.layer2,2,1) 444 | self.AW=tf.Variable(tf.random_normal([size//2,Env.actions])) 445 | self.VW=tf.Variable(tf.random_normal([size//2,1])) 446 | self.Advantage=tf.matmul(self.layerAC,self.AW) 447 | self.Value=tf.matmul(self.layerVC,self.VW) 448 | 449 | self.Qout=self.Value+tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,reduction_indices=1,keep_dims=True)) 450 | self.predict=tf.argmax(self.Qout,1) 451 | 452 | self.targetQ=tf.placeholder(shape=[None],dtype=tf.float32) 453 | self.actions=tf.placeholder(shape=[None],dtype=tf.int32) 454 | self.actions_onehot=tf.one_hot(self.actions, Env.actions, dtype=tf.float32) 455 | self.Q=tf.reduce_sum(tf.multiply(self.Qout,self.actions_onehot),reduction_indices=1) 456 | 457 | self.td_error=tf.square(self.targetQ-self.Q) 458 | self.loss=tf.reduce_mean(self.td_error) 459 | self.trainer=tf.train.AdamOptimizer(learning_rate=0.0001) #0.0001 460 | self.updateModel=self.trainer.minimize(self.loss) 461 | 462 | class experience_buffer(): 463 | def __init__(self,buffer_size =80000): 464 | self.buffer=[] 465 | self.buffer_size=buffer_size 466 | 467 | def add(self,experience): 468 | if len(self.buffer)+len(experience) >=self.buffer_size: 469 | self.buffer[0:(len(experience)+len(self.buffer))-self.buffer_size]=[] 470 | self.buffer.extend(experience) 471 | 472 | def sample(self,size): 473 | return np.reshape(np.array(random.sample(self.buffer,size)),[size,5]) 474 | 475 | def processState(states): 476 | return np.reshape(states,[800]) 477 | 478 | def updateTargetGraph(tfVars,tau): 479 | total_vars=len(tfVars) 480 | op_holder=[] 481 | for idx,var in enumerate(tfVars[0:total_vars//2]): 482 | op_holder.append(tfVars[idx+total_vars//2].assign((var.value()*tau)+((1-tau)*tfVars[idx+total_vars//2].value()))) 483 | return op_holder 484 | 485 | def updateTarget(op_holder,sess): 486 | for op in op_holder: 487 | sess.run(op) 488 | 489 | batch_size=32 490 | updata_freq=4 491 | y=0.95 492 | startE=0.4 493 | 
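# Exploration epsilon is annealed linearly from startE down to endE, decreasing by
# stepDrop = (startE - endE) / num_episodes after every episode.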
endE=0.05 494 | anneling_steps=80000 495 | num_episodes=20000 496 | pre_train_steps=5000 497 | max_epLength=150 498 | 499 | path="./dqn" 500 | h_size=512 501 | tau=0.001 502 | 503 | mainQN=Qnetwork(h_size) 504 | targetQN=Qnetwork(h_size) 505 | 506 | init=tf.global_variables_initializer() 507 | 508 | trainables=tf.trainable_variables() 509 | print(len(trainables)) 510 | if(len(trainables)>18): 511 | exit() 512 | targetOps=updateTargetGraph(trainables,tau) 513 | #print(targetOps) 514 | 515 | 516 | myBuffer=experience_buffer() 517 | e=startE 518 | stepDrop=(startE-endE)/num_episodes 519 | 520 | rList=[] 521 | total_steps=0 522 | everage_reward = [] 523 | 524 | font = pygame.font.SysFont("arial", 16); 525 | font_height = font.get_linesize() 526 | event_text = [] 527 | 528 | 529 | plt_targetQ=[] 530 | plt_mainQ=[] 531 | plt_tderr=[] 532 | 533 | saver=tf.train.Saver() 534 | if not os.path.exists(path): 535 | os.makedirs(path) 536 | 537 | config = tf.ConfigProto() 538 | config.gpu_options.per_process_gpu_memory_fraction = 0.8 539 | #session = tf.Session() 540 | with tf.Session(config=config) as sess: 541 | if load_model ==True: 542 | print('Loading Model...') 543 | ckpt=tf.train.get_checkpoint_state(path) 544 | saver.restore(sess,ckpt.model_checkpoint_path) 545 | else: 546 | sess.run(init) #if load_model=false 547 | updateTarget(targetOps,sess) 548 | for i in range(num_episodes+1): 549 | #if(i%400==0): 550 | #max_epLength +=1 551 | episodeBuffer=experience_buffer() 552 | #px,py,pl,pw = Env.random_point() 553 | if(test_mode == False): 554 | map_level = np.random.randint(2,4) 555 | else: 556 | map_level = 3 557 | if test_mode ==True: 558 | if (i == 0): 559 | car_A.move_to([80, 120]) 560 | Env.add_old_goalpoint([60, 120]) 561 | if(i<=5): 562 | Env.goal_point_notrandom(110+80*i,200+30*i) 563 | Env.add_old_goalpoint([110+80*i,200+30*i]) 564 | else: 565 | Env.goal_point_notrandom(570, 300) 566 | Env.add_old_goalpoint([570, 300]) 567 | px,py=car_A.positon() 568 | 569 | #px, py, pl, pw = Env.close_to_goalpoint(150, 150) # 300 570 | else: 571 | Env.goal_point() 572 | Env.reset(map_level) 573 | px,py,pl,pw = Env.close_to_goalpoint(50+i//400,50+i//400) #300 574 | car_A = car(px, py, pl, pw) 575 | car_A.show() 576 | Env.add_node(car_A.positon()) 577 | lidar_A = lidar(car_A.positon()) 578 | lidar_A.scan() 579 | s,r,d = observe() 580 | s = processState(s) 581 | d=False 582 | rALL=0 583 | j=0 584 | while j < max_epLength: 585 | j += 1 586 | if test_mode ==True: 587 | if np.random.rand(1) < 0.05 : 588 | a=np.random.randint(0,8) 589 | else: 590 | a=sess.run(mainQN.predict,feed_dict={mainQN.Input:[s]})[0] 591 | b_out=sess.run(mainQN.Qout,feed_dict={mainQN.Input:[s]}) 592 | b_Q=max(b_out.ravel()) 593 | plt_mainQ.append(b_Q) 594 | if(len(plt_mainQ)>2000): 595 | del plt_mainQ[0] 596 | if(show_data == True): 597 | #b_out=sess.run(mainQN.Qout,feed_dict={mainQN.Input:[s]}) 598 | print('a:',a) 599 | print('b_out',b_out) 600 | print('s',s[1],s[91],s[181],s[271]) 601 | print('b_Q',b_Q) 602 | if(len(plt_mainQ)>1000): 603 | plt.plot(np.arange(len(plt_mainQ)), plt_mainQ) 604 | plt.ylabel('plt_mainQ') 605 | plt.xlabel('training steps') 606 | plt.show() 607 | else: 608 | if np.random.rand(1) < e or total_steps < pre_train_steps: 609 | a=np.random.randint(0,8) 610 | else: 611 | a=sess.run(mainQN.predict,feed_dict={mainQN.Input:[s]})[0] 612 | #print(a) 613 | car_A.step(a) 614 | #lidar_A = lidar(car_A.positon()) 615 | if test_mode == True: 616 | Env.add_node(car_A.positon()) 617 | lidar_A.pos_change(car_A.positon()) 618 | 
lidar_A.scan() 619 | s1,r,d = observe() 620 | s1 = processState(s1) 621 | total_steps+=1 622 | if(show_data == True): 623 | print('r',r) 624 | episodeBuffer.add(np.reshape(np.array([s,a,r,s1,d]),[1,5])) 625 | 626 | if total_steps > pre_train_steps: 627 | 628 | if total_steps %(updata_freq)==0: 629 | trainBatch=myBuffer.sample(batch_size) 630 | A=sess.run(mainQN.predict,feed_dict={mainQN.Input:np.vstack(trainBatch[:,3])}) 631 | Q=sess.run(targetQN.Qout,feed_dict={targetQN.Input:np.vstack(trainBatch[:,3])}) 632 | doubleQ=Q[range(batch_size),A] 633 | targetQ=trainBatch[:,2]+y*doubleQ 634 | if(i%10==0 and j%100 ==0): 635 | plt_targetQ.append(targetQ[10]) 636 | if(len(plt_targetQ)>4000): 637 | del plt_targetQ[0] 638 | td_error = sess.run(mainQN.loss,feed_dict={mainQN.Input:np.vstack(trainBatch[:,0]),mainQN.targetQ:targetQ,mainQN.actions:trainBatch[:,1]}) 639 | if(i%10==0 and j%100 ==0): 640 | plt_tderr.append(td_error) 641 | if (len(plt_tderr) > 4000): 642 | del plt_tderr[0] 643 | if(show_data == True): 644 | if(len(plt_tderr)>10): 645 | plt.plot(np.arange(len(plt_tderr)), plt_tderr) 646 | plt.ylabel('plt_tderr') 647 | plt.xlabel('training steps') 648 | plt.show() 649 | print('td_error',td_error) 650 | print('doubleQ[10]',doubleQ[10]) 651 | print('targetQ[10]',targetQ[10]) 652 | if(len(plt_targetQ)>10): 653 | plt.plot(np.arange(len(plt_targetQ)), plt_targetQ) 654 | plt.ylabel('plt_targetQ') 655 | plt.xlabel('training steps') 656 | plt.show() 657 | if(len(everage_reward)>5): 658 | plt.plot(np.arange(len(everage_reward)), everage_reward) 659 | plt.ylabel('everage_reward') 660 | plt.xlabel('training steps /100') 661 | plt.show() 662 | _ =sess.run(mainQN.updateModel,feed_dict={mainQN.Input:np.vstack(trainBatch[:,0]),mainQN.targetQ:targetQ,mainQN.actions:trainBatch[:,1]}) 663 | 664 | updateTarget(targetOps,sess) 665 | 666 | rALL += r 667 | s = s1 668 | 669 | if d ==True: 670 | break 671 | 672 | 673 | for event in pygame.event.get(): 674 | if event.type == QUIT: 675 | #exit() 676 | print('quit') 677 | elif event.type == MOUSEBUTTONDOWN : 678 | if(show_data == False): 679 | show_data = True 680 | else: 681 | show_data = False 682 | #x,y = pygame.mouse.get_pos() 683 | #car_A.move_to((x,y)) 684 | #car_A.show() 685 | #lidar_A.pos_change((x,y)) 686 | #lidar_A.scan() 687 | #x1,y1 = pygame.mouse.get_pos() 688 | #car_A.move_to((x1,y1)) 689 | #bound = car_A.show() 690 | #if bound == True: 691 | #lidar_A.pos_change((x1,y1)) 692 | #lidar_A.scan() 693 | 694 | 695 | Env.reset(map_level) 696 | car_A.show() 697 | lidar_A.show() 698 | #pygame.display.update() 699 | #fpsClock.tick(10) 700 | if e>endE: 701 | e-=stepDrop 702 | myBuffer.add(episodeBuffer.buffer) 703 | rList.append(rALL) 704 | if i>0 and i % 25==0: 705 | print('episode',i,',average reward of last 25 episode',np.mean(rList[-25:])) 706 | if i>0 and i % 100==0: 707 | everage_reward.append(np.mean(rList[-100:])) 708 | 709 | if i>0 and i % 2000==0: 710 | saver.save(sess,path+'/model-'+str(i)+'.ckpt') 711 | print("Saved Model") 712 | saver.save(sess,path+'/model-'+str(i)+'.ckpt') 713 | 714 | 715 | 716 | 717 | 718 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DRLPathPlanner 2 | 3 | -A framework for path-planing and obstacle avoidance using Deep Reinforcement Learning Techniques. 
4 | 5 | ## Introduction: 6 | The absence of a robust and generalized path planner has been one of the greatest hindrances to making cyber-physical systems part of our daily lives, coexisting with us and improving our standard of living and efficiency. Recent years have seen Reinforcement Learning emerge as a viable solution for tasks involving sequential decision making. In this project, we treat the problem as a learning problem rather than a traditional planning problem, since an agent must learn to generalize and adapt to the unpredictability and uncertainty posed by real-world environments. We therefore propose to build a path-planning agent based on Deep Reinforcement Learning that can handle dynamic obstacles and navigate a non-stationary environment to reach a given goal state. We believe such an agent could serve as a high-level motion planner in a variety of robotics and navigation tasks. 7 | 8 | ## A Brief Overview: 9 | ### Our Environment: 10 | We selected a seemingly simple grid world to first check the feasibility of our approach. Our needs were well met by a third-party Gym environment named “MiniGrid”. 11 |
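A minimal sketch of how the environment is created and stepped, mirroring the calls made in `DPP-v1/MyNN.py` (this assumes the older `gym`/`gym-minigrid` API that the scripts rely on, where `reset()` returns the observation and `step()` returns a 4-tuple; the `image` entry of the observation dict is the 7×7×3 egocentric view fed to the DQN):

```python
import gym
import gym_minigrid  # importing this package registers the MiniGrid environments with gym

# Same environment id used by DPP-v1/MyNN.py.
env = gym.make('MiniGrid-Dynamic-Obstacles-8x8-v0')

obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
print(obs['image'].shape)  # (7, 7, 3) -- the partial grid view the DQN consumes
env.close()
```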
12 | ![Empty MiniGrid environment](empty-env.png) 13 | 14 | ![Agent training with dynamic obstacles](dynamic_obstacles.gif) 15 |
18 | 19 | ## On Going work: 20 | * The overall problem was divided into subtasks and were solved by individual Deep Q-Networks. 21 | * Experimenting with applying a hierarchical structure to the sub tasks and combine with Hierarhial Reinforcement Learning. 22 | * Also simulataneously developing Evolutionary Strategy based solution so as to solve the above problem from a Evolutionary Reinforcement Learning based approach. 23 | -------------------------------------------------------------------------------- /dynamic_obstacles.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lok-i/DRLPathPlanner/5bb6694f3fbf53a2aa79b9195cdfe89caee597ea/dynamic_obstacles.gif -------------------------------------------------------------------------------- /empty-env.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lok-i/DRLPathPlanner/5bb6694f3fbf53a2aa79b9195cdfe89caee597ea/empty-env.png -------------------------------------------------------------------------------- /gridworld_dynamic_obstacles.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import gym_minigrid 3 | import math 4 | import random 5 | import numpy as np 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | from collections import namedtuple 9 | from itertools import count 10 | from PIL import Image 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.optim as optim 15 | import torch.nn.functional as F 16 | import torchvision.transforms as T 17 | 18 | 19 | env = gym.make('MiniGrid-Dynamic-Obstacles-16x16-v0')#'MiniGrid-Empty-5x5-v0') 20 | 21 | print(env.observation_space) 22 | print(env.action_space) 23 | 24 | 25 | 26 | Transition = namedtuple('Transition', 27 | ('state', 'action', 'next_state', 'reward')) 28 | 29 | 30 | class ReplayMemory(object): 31 | 32 | 33 | 34 | 35 | def __init__(self, capacity): 36 | self.capacity = capacity 37 | self.memory = [] 38 | self.position = 0 39 | 40 | def push(self, *args): 41 | """Saves a transition.""" 42 | if len(self.memory) < self.capacity: 43 | self.memory.append(None) 44 | self.memory[self.position] = Transition(*args) 45 | self.position = (self.position + 1) % self.capacity 46 | 47 | def sample(self, batch_size): 48 | return random.sample(self.memory, batch_size) 49 | 50 | def __len__(self): 51 | return len(self.memory) 52 | 53 | class DQN(nn.Module): 54 | 55 | def __init__(self, h, w, outputs): 56 | super(DQN, self).__init__() 57 | self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1) 58 | self.bn1 = nn.BatchNorm2d(16) 59 | self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1) 60 | self.bn2 = nn.BatchNorm2d(32) 61 | self.conv3 = nn.Conv2d(32, 32, kernel_size=2, stride=1) 62 | self.bn3 = nn.BatchNorm2d(32) 63 | 64 | # Number of Linear input connections depends on output of conv2d layers 65 | # and therefore the input image size, so compute it. 66 | def conv2d_size_out(size, kernel_size = 5, stride = 2): 67 | return (size - (kernel_size - 1) - 1) // stride + 1 68 | convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w))) 69 | convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h))) 70 | linear_input_size = convw * convh * 32 71 | self.head = nn.Linear(linear_input_size, outputs) 72 | 73 | # Called with either one element to determine next action, or a batch 74 | # during optimization. Returns tensor([[left0exp,right0exp]...]). 
75 | def forward(self, x): 76 | x = F.relu(self.bn1(self.conv1(x))) 77 | x = F.relu(self.bn2(self.conv2(x))) 78 | x = F.relu(self.bn3(self.conv3(x))) 79 | return self.head(x.view(x.size(0), -1)) 80 | 81 | BATCH_SIZE = 256 82 | GAMMA = 0.999 83 | EPS_START = 0.9 84 | eps_of_episode = [] 85 | 86 | EPS_END = 0.05 87 | EPS_DECAY = 100 88 | TARGET_UPDATE = 10 89 | 90 | # Get screen size so that we can initialize layers correctly based on shape 91 | # returned from AI gym. Typical dimensions at this point are close to 3x40x90 92 | # which is the result of a clamped and down-scaled render buffer in get_screen() 93 | 94 | 95 | init_screen = env.reset() 96 | screen_height =7 97 | screen_width = 7 98 | device = 'cpu' 99 | # Get number of actions from gym action space 100 | n_actions = env.action_space.n 101 | 102 | policy_net = DQN(screen_height, screen_width, n_actions).to(device) 103 | target_net = DQN(screen_height, screen_width, n_actions).to(device) 104 | target_net.load_state_dict(policy_net.state_dict()) 105 | target_net.eval() 106 | 107 | optimizer = optim.RMSprop(policy_net.parameters()) 108 | memory = ReplayMemory(10000) 109 | 110 | 111 | steps_done = 0 112 | 113 | def get_state(x): 114 | #print(x) 115 | x.unsqueeze_(0) 116 | 117 | x.transpose_(1,3) 118 | #print(x) 119 | return x 120 | 121 | def select_action(state): 122 | global steps_done 123 | sample = random.random() 124 | eps_threshold = EPS_END + (EPS_START - EPS_END) * \ 125 | math.exp(-1. * steps_done / EPS_DECAY) 126 | 127 | 128 | steps_done += 1 129 | eps_of_episode.append(eps_threshold) 130 | if sample > eps_threshold: 131 | with torch.no_grad(): 132 | # t.max(1) will return largest column value of each row. 133 | # second column on max result is index of where max element was 134 | # found, so we pick action with the larger expected reward. 135 | return policy_net(state).max(1)[1].view(1, 1) 136 | else: 137 | return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long) 138 | 139 | 140 | episode_durations = [] 141 | 142 | 143 | def plot_durations(): 144 | plt.figure(2) 145 | plt.clf() 146 | durations_t = torch.tensor(episode_durations, dtype=torch.float) 147 | plt.subplot(211) 148 | plt.title('Training...') 149 | plt.xlabel('Episode') 150 | plt.ylabel('Score') 151 | plt.plot(durations_t.numpy()) 152 | plt.subplot(212) 153 | plt.title('Training...') 154 | plt.xlabel('Episode') 155 | plt.ylabel('Epsilon') 156 | plt.plot(torch.tensor(eps_of_episode,dtype=torch.float).numpy()) 157 | #plt.show() 158 | 159 | # Take 100 episode averages and plot them too 160 | if len(durations_t) >= 100: 161 | means = durations_t.unfold(0, 100, 1).mean(1).view(-1) 162 | means = torch.cat((torch.zeros(99), means)) 163 | plt.plot(means.numpy()) 164 | 165 | plt.pause(0.001) # pause a bit so that plots are updated 166 | # if is_ipython: 167 | # display.clear_output(wait=True) 168 | # display.display(plt.gcf()) 169 | 170 | def optimize_model(): 171 | if len(memory) < BATCH_SIZE: 172 | return 173 | transitions = memory.sample(BATCH_SIZE) 174 | # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for 175 | # detailed explanation). This converts batch-array of Transitions 176 | # to Transition of batch-arrays. 
177 | batch = Transition(*zip(*transitions)) 178 | 179 | # Compute a mask of non-final states and concatenate the batch elements 180 | # (a final state would've been the one after which simulation ended) 181 | non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, 182 | batch.next_state)), device=device, dtype=torch.bool) 183 | non_final_next_states = torch.cat([s for s in batch.next_state 184 | if s is not None]) 185 | state_batch = torch.cat(batch.state) 186 | action_batch = torch.cat(batch.action) 187 | reward_batch = torch.cat(batch.reward) 188 | 189 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 190 | # columns of actions taken. These are the actions which would've been taken 191 | # for each batch state according to policy_net 192 | state_action_values = policy_net(state_batch).gather(1, action_batch) 193 | 194 | # Compute V(s_{t+1}) for all next states. 195 | # Expected values of actions for non_final_next_states are computed based 196 | # on the "older" target_net; selecting their best reward with max(1)[0]. 197 | # This is merged based on the mask, such that we'll have either the expected 198 | # state value or 0 in case the state was final. 199 | next_state_values = torch.zeros(BATCH_SIZE, device=device) 200 | next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach() 201 | # Compute the expected Q values 202 | expected_state_action_values = (next_state_values * GAMMA) + reward_batch 203 | 204 | # Compute Huber loss 205 | loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1)) 206 | 207 | # Optimize the model 208 | optimizer.zero_grad() 209 | loss.backward() 210 | for param in policy_net.parameters(): 211 | param.grad.data.clamp_(-1, 1) 212 | optimizer.step() 213 | 214 | def train(): 215 | num_episodes = 500 216 | for i_episode in range(num_episodes): 217 | # Initialize the environment and state 218 | env.reset() 219 | #last_screen = env.render() 220 | 221 | #current_screen = env.render() 222 | 223 | state_dic,_,_,_= env.step(env.action_space.sample())#env.render() 224 | state = get_state(torch.tensor(state_dic['image'],dtype =torch.float)) 225 | #print(state.size()) 226 | #print(state) 227 | for t in count(): 228 | # Select and perform an action 229 | env.render() 230 | action = select_action(state) 231 | observation, reward, done, _ = env.step(action.item()) 232 | 233 | reward = torch.tensor([reward],dtype=torch.long, device=device) 234 | 235 | 236 | # Observe new state 237 | 238 | 239 | if not done: 240 | next_state =get_state(torch.tensor(observation['image'],dtype =torch.float)) #env.render()# current_screen - last_screen 241 | else: 242 | next_state = None 243 | 244 | # Store the transition in memory 245 | memory.push(state, action, next_state, reward) 246 | 247 | # Move to the next state 248 | state = next_state 249 | 250 | # Perform one step of the optimization (on the target network) 251 | optimize_model() 252 | if done: 253 | episode_durations.append(t + 1) 254 | plot_durations() 255 | break 256 | # Update the target network, copying all weights and biases in DQN 257 | if i_episode % TARGET_UPDATE == 0: 258 | target_net.load_state_dict(policy_net.state_dict()) 259 | 260 | print('Complete') 261 | env.render() 262 | env.close() 263 | plt.ioff() 264 | plt.show() 265 | 266 | 267 | if __name__ == '__main__': 268 | print("Training :") 269 | train() 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | # for i_episode in range(20): 280 | # observation = env.reset() 281 | # for 
t in range(100): 282 | # env.render() 283 | # print(observation) 284 | # action = env.action_space.sample() 285 | # observation, reward, done, info = env.step(action) 286 | # if done: 287 | # print("Episode finished after {} timesteps".format(t+1)) 288 | # break 289 | # env.close() 290 | --------------------------------------------------------------------------------