├── README.md ├── ai.py ├── car.kv ├── gui.py ├── last_brain.pth └── output.jpg /README.md: -------------------------------------------------------------------------------- 1 | # Optimal Path Planning: Deep Reinforcement Learning 2 | Optimal Path Planning with Deep Reinforcement Learning 3 | 4 | Basic concepts of Q learning algorithm, markov Decision Processes, Temporal Difference, and Deep Q Networks are used 5 | to train a tiny car find the optimal path from top left corner to bottom right corner. 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /ai.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Mon May 7 20:12:40 2018 3 | 4 | @author: nader 5 | """ 6 | 7 | import numpy as np 8 | import random 9 | #for loading the brain 10 | import os 11 | import torch 12 | #module for implementing neural network 13 | import torch.nn as nn 14 | #contains functions for implementing neural network 15 | import torch.nn.functional as F 16 | #for optimizing stochastic gradient decent 17 | import torch.optim as optim 18 | #distencer to tensor and gradient 19 | #import torch.autograd as autograd 20 | from torch.autograd import Variable 21 | 22 | # Neural Network 23 | class Network(nn.Module): 24 | 25 | def __init__(self, input_size, nb_action): 26 | super(Network, self).__init__() 27 | self.input_size = input_size 28 | self.nb_action = nb_action 29 | # nn.Linear => all neuran in input layer are connected to hidden layer 30 | # fc1 , fc2 => full connections 31 | self.fc1 = nn.Linear(input_size, 30) 32 | self.fc2 = nn.Linear(30, nb_action) 33 | 34 | def forward(self, state):# function for propagation 35 | # we apply rectifier fuction to hidden neurons and we should give our input connection input states 36 | x = F.relu(self.fc1(state)) 37 | q_values = self.fc2(x) 38 | return q_values 39 | 40 | #Experience Replay 41 | class ReplayMemory(object): 42 | 43 | def __init__(self, capacity): 44 | #max number of transitions in memory event 45 | self.capacity = capacity 46 | #list of capacity events 47 | self.memory = [] 48 | 49 | def push(self, event): 50 | self.memory.append(event) 51 | if len(self.memory) > self.capacity: 52 | del self.memory[0] 53 | 54 | def sample(self, batch_size): 55 | #(state1,action1,reward1),(state2,acition2,reward2) => (state1,state2),(action1,action2),(reward1,reward2) 56 | #we are getting random objects from memory that have size equal to batch_size 57 | samples = zip(*random.sample(self.memory, batch_size)) 58 | #we finally put each of this batch to a pytorch variable whitch each one will recieve a gradient 59 | #we should concatenate each batch in sample with respect with first dimenstion so that in each row state action and reward corespond to same time t 60 | return map(lambda x: Variable(torch.cat(x, 0)), samples) 61 | 62 | 63 | class Dqn(): 64 | 65 | def __init__(self, input_size, nb_action, gamma): 66 | self.gamma = gamma 67 | self.reward_window = [] 68 | self.model = Network(input_size, nb_action) 69 | self.memory = ReplayMemory(100000) 70 | #parameters of our model and the learning rate 71 | self.optimizer = optim.Adam(self.model.parameters(), lr = 0.001) 72 | #for pytorch it not only has to be torch tensor and one more dimension that corresponds to the batch 73 | self.last_state = torch.Tensor(input_size).unsqueeze(0) 74 | #go straight 75 | self.last_action = 0 76 | self.last_reward = 0 77 | 78 | def select_action(self, state): 79 | #we feed input state to network and we get outputs which are q value for each action 80 | #and use softmax to get final action 81 | # softmax => we want to go to best action but we want to go to others too(probability for each q value) 82 | # state should be torch var so we wrap this which is a tensor to torch var 83 | #with volatile=true we wont include gradient assosiated with this state to the graph 84 | probs = F.softmax(self.model(Variable(state, volatile = True))*100) # T=100 85 | action = probs.multinomial() 86 | #cause multinomial returns a pytorch var with fake batch we need to select index [0,0] 87 | 88 | # m = torch.distributions.Categorical(probs) 89 | # action=m.sample() 90 | return action.data[0,0] 91 | 92 | def learn(self, batch_state, batch_next_state, batch_reward, batch_action): 93 | #we need gather result of action which were played 94 | outputs = self.model(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1) 95 | #we get max q value in next_state(0) according to all actions(1) 96 | next_outputs = self.model(batch_next_state).detach().max(1)[0] 97 | target = self.gamma*next_outputs + batch_reward 98 | #we use hubble loss 99 | td_loss = F.smooth_l1_loss(outputs, target) 100 | # zero_grad reinitializes the optimizer in every iteration of loop 101 | self.optimizer.zero_grad() 102 | #free memory because we go several times on the loss 103 | td_loss.backward(retain_variables = True) 104 | #.step updates th weights by backpropagating 105 | self.optimizer.step() 106 | 107 | def update(self, reward, new_signal): 108 | #convert signal to Tensor(float) since it is input of neural network and add dimention according to batch 109 | new_state = torch.Tensor(new_signal).float().unsqueeze(0) 110 | #torch.longtensor converts int to long in tensor 111 | self.memory.push((self.last_state, new_state, torch.LongTensor([int(self.last_action)]), torch.Tensor([self.last_reward]))) 112 | action = self.select_action(new_state) 113 | if len(self.memory.memory) > 100: 114 | #we get 100 from each of them 115 | batch_state, batch_next_state, batch_action, batch_reward = self.memory.sample(100) 116 | self.learn(batch_state, batch_next_state, batch_reward, batch_action) 117 | self.last_action = action 118 | self.last_state = new_state 119 | self.last_reward = reward 120 | self.reward_window.append(reward) 121 | if len(self.reward_window) > 1000: 122 | del self.reward_window[0] 123 | return action 124 | 125 | def score(self): 126 | return sum(self.reward_window)/(len(self.reward_window)+1.) 127 | 128 | def save(self): 129 | torch.save({'state_dict': self.model.state_dict(), 130 | 'optimizer' : self.optimizer.state_dict(), 131 | }, 'last_brain.pth') 132 | 133 | def load(self): 134 | if os.path.isfile('last_brain.pth'): 135 | print("=> loading checkpoint... ") 136 | checkpoint = torch.load('last_brain.pth') 137 | self.model.load_state_dict(checkpoint['state_dict']) 138 | self.optimizer.load_state_dict(checkpoint['optimizer']) 139 | print("done !") 140 | else: 141 | print("no checkpoint found...") -------------------------------------------------------------------------------- /car.kv: -------------------------------------------------------------------------------- 1 | #:kivy 1.0.9 2 | # ref: https://kivy.org/docs/tutorials/pong.html 3 | 4 | : 5 | size: 20, 10 6 | canvas: 7 | PushMatrix 8 | Rotate: 9 | angle: self.angle 10 | origin: self.center 11 | Rectangle: 12 | pos: self.pos 13 | size: self.size 14 | PopMatrix 15 | 16 | : 17 | size: 10,10 18 | canvas: 19 | Color: 20 | rgba: 1,1,0,1 21 | Ellipse: 22 | pos: self.pos 23 | size: self.size 24 | : 25 | size: 10,10 26 | canvas: 27 | Color: 28 | rgba: 1,1,0,1 29 | Ellipse: 30 | pos: self.pos 31 | size: self.size 32 | 33 | : 34 | size: 10,10 35 | canvas: 36 | Color: 37 | rgba: 1,1,0,1 38 | Ellipse: 39 | pos: self.pos 40 | size: self.size 41 | 42 | : 43 | car: game_car 44 | ball1: game_ball1 45 | ball2: game_ball2 46 | ball3: game_ball3 47 | 48 | Car: 49 | id: game_car 50 | center: self.parent.center 51 | Ball1: 52 | id: game_ball1 53 | center: self.parent.center 54 | Ball2: 55 | id: game_ball2 56 | center: self.parent.center 57 | Ball3: 58 | id: game_ball3 59 | center: self.parent.center 60 | -------------------------------------------------------------------------------- /gui.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Mon May 7 20:13:21 2018 3 | 4 | @author: nader 5 | """ 6 | 7 | import numpy as np 8 | from random import random, randint 9 | import matplotlib.pyplot as plt 10 | #import time 11 | 12 | from kivy.app import App 13 | from kivy.uix.widget import Widget 14 | from kivy.uix.button import Button 15 | from kivy.graphics import Color, Ellipse, Line 16 | from kivy.config import Config 17 | from kivy.properties import NumericProperty, ReferenceListProperty, ObjectProperty 18 | from kivy.vector import Vector 19 | from kivy.clock import Clock 20 | 21 | # Importing the AI 22 | from ai import Dqn 23 | 24 | # We use the code below to stop adding red cirles by right clicking 25 | Config.set('input', 'mouse', 'mouse,multitouch_on_demand') 26 | 27 | # we use last_x and last_y to keep track of last point we draw on canvas 28 | last_x = 0 29 | last_y = 0 30 | n_points = 0 31 | length = 0 32 | 33 | # Initialization of Brain with 5 sensors and 3 actions and 0.9 discount factor 34 | brain = Dqn(5,3,0.9) 35 | action2rotation = [0,20,-20] 36 | last_reward = 0 37 | scores = [] 38 | 39 | # map initialization 40 | first_update = True 41 | def init(): 42 | #sand is an array with size of screen pixel size 43 | global sand 44 | global goal_x 45 | global goal_y 46 | global first_update 47 | #longueu,largeu 48 | #sand = np.zeros((length,width)) 49 | # initializing sand array with 0 50 | sand=np.zeros((length,width)) 51 | goal_x = 20 52 | goal_y = width - 20 53 | first_update = False 54 | 55 | last_distance = 0 56 | 57 | 58 | class Car(Widget): 59 | 60 | # initializing the angle of the car 61 | angle = NumericProperty(0) 62 | # initializing rotation of the car 63 | rotation = NumericProperty(0) 64 | # initializing speed in x-vector 65 | velocity_x = NumericProperty(0) 66 | # initializing speed in y-vector 67 | velocity_y = NumericProperty(0) 68 | # speed vector 69 | velocity = ReferenceListProperty(velocity_x, velocity_y) 70 | # initializing x of forward sensor 71 | sensor1_x = NumericProperty(0) 72 | # initializing y of forward sensor 73 | sensor1_y = NumericProperty(0) 74 | # forward sensor vector 75 | sensor1 = ReferenceListProperty(sensor1_x, sensor1_y) 76 | # initializing x of left sensor 77 | sensor2_x = NumericProperty(0) 78 | # initializing y of left sensor 79 | sensor2_y = NumericProperty(0) 80 | # left sensor vector 81 | sensor2 = ReferenceListProperty(sensor2_x, sensor2_y) 82 | # initializing x of right sensor 83 | sensor3_x = NumericProperty(0) 84 | # initializing y of right sensor 85 | sensor3_y = NumericProperty(0) 86 | # right sensor vector 87 | sensor3 = ReferenceListProperty(sensor3_x, sensor3_y) 88 | # initializing signal recieved from f-sensor 89 | signal1 = NumericProperty(0) 90 | # initializing signal recieved from l-sensor 91 | signal2 = NumericProperty(0) 92 | # initializing signal recieved from r-sensor 93 | signal3 = NumericProperty(0) 94 | 95 | def move(self, rotation): 96 | # update position of car according to its last position and speed 97 | self.pos = Vector(*self.velocity) + self.pos 98 | self.rotation = rotation 99 | self.angle = self.angle + self.rotation 100 | 101 | # updating position of sensors 102 | self.sensor1 = Vector(30, 0).rotate(self.angle) + self.pos 103 | self.sensor2 = Vector(30, 0).rotate((self.angle+30)%360) + self.pos 104 | self.sensor3 = Vector(30, 0).rotate((self.angle-30)%360) + self.pos 105 | 106 | # getting signals recieved from sensors => density of wall or sand aside it 107 | self.signal1 = int(np.sum(sand[int(self.sensor1_x)-10:int(self.sensor1_x)+10, int(self.sensor1_y)-10:int(self.sensor1_y)+10]))/400. 108 | self.signal2 = int(np.sum(sand[int(self.sensor2_x)-10:int(self.sensor2_x)+10, int(self.sensor2_y)-10:int(self.sensor2_y)+10]))/400. 109 | self.signal3 = int(np.sum(sand[int(self.sensor3_x)-10:int(self.sensor3_x)+10, int(self.sensor3_y)-10:int(self.sensor3_y)+10]))/400. 110 | 111 | # checking if any sensor has detected full density of wall or sand 112 | if self.sensor1_x>length-10 or self.sensor1_x<10 or self.sensor1_y>width-10 or self.sensor1_y<10: 113 | self.signal1 = 1. 114 | if self.sensor2_x>length-10 or self.sensor2_x<10 or self.sensor2_y>width-10 or self.sensor2_y<10: 115 | self.signal2 = 1. 116 | if self.sensor3_x>length-10 or self.sensor3_x<10 or self.sensor3_y>width-10 or self.sensor3_y<10: 117 | self.signal3 = 1. 118 | 119 | # sensors 120 | class Ball1(Widget): 121 | pass 122 | class Ball2(Widget): 123 | pass 124 | class Ball3(Widget): 125 | pass 126 | 127 | # the main class 128 | class Game(Widget): 129 | 130 | # getting objects from kivy file 131 | car = ObjectProperty(None) 132 | ball1 = ObjectProperty(None) 133 | ball2 = ObjectProperty(None) 134 | ball3 = ObjectProperty(None) 135 | 136 | def serve_car(self): 137 | # car starts in the center of screen going right with speed of 6 138 | self.car.center = self.center 139 | self.car.velocity = Vector(6, 0) 140 | 141 | def update(self, dt):# update function for updating everything in new 142 | 143 | global brain 144 | global last_reward 145 | global scores 146 | global last_distance 147 | global goal_x 148 | global goal_y 149 | global length 150 | global width 151 | 152 | length = self.width 153 | width = self.height 154 | 155 | # to initialize map only once 156 | if first_update: 157 | init() 158 | 159 | diffrence_x = goal_x - self.car.x 160 | diffrence_y = goal_y - self.car.y 161 | # setting orientation of the agent according to goal 162 | orientation = Vector(*self.car.velocity).angle((diffrence_x,diffrence_y))/180. 163 | # our input state according to sensors and orientation 164 | last_signal = [self.car.signal1, self.car.signal2, self.car.signal3, orientation, -orientation] 165 | # getting action from ai 166 | action = brain.update(last_reward, last_signal) 167 | # appending new score to **score window** 168 | scores.append(brain.score()) 169 | rotation = action2rotation[action] 170 | # moving car according to rotation 171 | self.car.move(rotation) 172 | # setting new distance 173 | distance = np.sqrt((self.car.x - goal_x)**2 + (self.car.y - goal_y)**2) 174 | # updating sensors new position 175 | self.ball1.pos = self.car.sensor1 176 | self.ball2.pos = self.car.sensor2 177 | self.ball3.pos = self.car.sensor3 178 | 179 | if sand[int(self.car.x),int(self.car.y)] > 0:# changing speed when going into walls or sand 180 | self.car.velocity = Vector(1, 0).rotate(self.car.angle) 181 | # getting a very bad reward => -1 182 | last_reward = -0.5 183 | else: 184 | self.car.velocity = Vector(6, 0).rotate(self.car.angle) 185 | # normal move . reward => -0.2 186 | last_reward = -0.2 187 | if distance < last_distance: 188 | # if in correct direction get a little positive reward 189 | last_reward = 0.3 190 | 191 | if self.car.x < 10:# if car goes to left border of screen 192 | self.car.x = 10 193 | last_reward = -1 194 | if self.car.x > self.width - 10:# if car goes to right side border of screen 195 | self.car.x = self.width - 10 196 | last_reward = -1 197 | if self.car.y < 10:# if car goes to top border of screen 198 | self.car.y = 10 199 | last_reward = -1 200 | if self.car.y > self.height - 10:# if car goes to bottom border of screen 201 | self.car.y = self.height - 10 202 | last_reward = -1 203 | 204 | if distance < 100: 205 | goal_x = self.width-goal_x 206 | goal_y = self.height-goal_y 207 | # updating last distance to goal 208 | last_distance = distance 209 | 210 | 211 | class MyPaintWidget(Widget): 212 | 213 | def on_touch_down(self, touch): 214 | global length, n_points, last_x, last_y 215 | with self.canvas: 216 | Color(0.6,0.5,0.1) 217 | d = 10. 218 | touch.ud['line'] = Line(points = (touch.x, touch.y), width = 10) 219 | last_x = int(touch.x) 220 | last_y = int(touch.y) 221 | n_points = 0 222 | length = 0 223 | sand[int(touch.x),int(touch.y)] = 1 224 | 225 | def on_touch_move(self, touch): 226 | global length, n_points, last_x, last_y 227 | if touch.button == 'left': 228 | touch.ud['line'].points += [touch.x, touch.y] 229 | x = int(touch.x) 230 | y = int(touch.y) 231 | length += np.sqrt(max((x - last_x)**2 + (y - last_y)**2, 2)) 232 | n_points += 1. 233 | density = n_points/(length) 234 | touch.ud['line'].width = int(20 * density + 1) 235 | sand[int(touch.x) - 10 : int(touch.x) + 10, int(touch.y) - 10 : int(touch.y) + 10] = 1 236 | last_x = x 237 | last_y = y 238 | 239 | 240 | class CarApp(App): 241 | 242 | def build(self): 243 | parent = Game() 244 | parent.serve_car() 245 | Clock.schedule_interval(parent.update, 1.0/60.0) 246 | self.painter = MyPaintWidget() 247 | clearbtn = Button(text = 'clear') 248 | savebtn = Button(text = 'save', pos = (parent.width, 0)) 249 | loadbtn = Button(text = 'load', pos = (2 * parent.width, 0)) 250 | clearbtn.bind(on_release = self.clear_canvas) 251 | savebtn.bind(on_release = self.save) 252 | loadbtn.bind(on_release = self.load) 253 | parent.add_widget(self.painter) 254 | parent.add_widget(clearbtn) 255 | parent.add_widget(savebtn) 256 | parent.add_widget(loadbtn) 257 | return parent 258 | 259 | def clear_canvas(self, obj): 260 | global sand 261 | self.painter.canvas.clear() 262 | sand = np.zeros((length,width)) 263 | 264 | def save(self, obj): 265 | print("saving brain...") 266 | brain.save() 267 | plt.plot(scores) 268 | plt.show() 269 | 270 | def load(self, obj): 271 | print("loading last saved brain...") 272 | brain.load() 273 | 274 | if __name__ == '__main__': 275 | CarApp().run() 276 | -------------------------------------------------------------------------------- /last_brain.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naderAsadi/Optimal-Path-Planning-Deep-Reinforcement-Learning/e38452c00f2f9e15bd8f6762c6e4e7a3111c85d7/last_brain.pth -------------------------------------------------------------------------------- /output.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naderAsadi/Optimal-Path-Planning-Deep-Reinforcement-Learning/e38452c00f2f9e15bd8f6762c6e4e7a3111c85d7/output.jpg --------------------------------------------------------------------------------