├── .gitignore ├── 1.png ├── 2.GIF ├── 2.png ├── DRLCar.py ├── README.md └── environment.yml /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.local 3 | -------------------------------------------------------------------------------- /1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fdevmsy/Reinforcement-Learning-Based-Self-Driving-Car/e362ea7b3b84f7e8eb3040c000bd74b4234fb5be/1.png -------------------------------------------------------------------------------- /2.GIF: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fdevmsy/Reinforcement-Learning-Based-Self-Driving-Car/e362ea7b3b84f7e8eb3040c000bd74b4234fb5be/2.GIF -------------------------------------------------------------------------------- /2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fdevmsy/Reinforcement-Learning-Based-Self-Driving-Car/e362ea7b3b84f7e8eb3040c000bd74b4234fb5be/2.png -------------------------------------------------------------------------------- /DRLCar.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import base64 4 | import json 5 | 6 | import socketio 7 | import eventlet 8 | import eventlet.wsgi 9 | import time 10 | from PIL import Image 11 | from PIL import ImageOps 12 | from flask import Flask, render_template 13 | from io import BytesIO 14 | 15 | # Modules for DQN 16 | import tensorflow as tf 17 | import math 18 | import cv2 19 | import random 20 | import numpy as np 21 | import copy 22 | import matplotlib.pyplot as plt 23 | import datetime 24 | import os 25 | 26 | # Unity connection 27 | sio = socketio.Server() 28 | app = Flask(__name__) 29 | 30 | # DQN Parameters 31 | algorithm = 'DQN' 32 | 33 | Num_action = 5 34 | Gamma = 0.99 35 | Learning_rate = 0.00025 36 | 37 | First_epsilon = 1.0 38 | Final_epsilon = 0.01 39 | Epsilon = First_epsilon 40 | 41 | Num_replay_memory = 50000 42 | Num_start_training = 25000 43 | Num_training = 500000 44 | Num_update = 5000 45 | Num_batch = 32 46 | Num_skipFrame = 4 47 | Num_stackFrame = 4 48 | Num_colorChannel = 1 49 | Num_MapChannel = 1 50 | 51 | img_size = 80 52 | map_size = 81 53 | 54 | Num_step_save = 50000 55 | Num_step_plot = 100 56 | 57 | # Parameters for Network 58 | first_conv_img = [8,8, Num_colorChannel * Num_stackFrame * 2,32] 59 | first_conv_map = [8, 8, Num_stackFrame, 32] 60 | second_conv = [4,4,32,64] 61 | third_conv = [3,3,64,64] 62 | first_dense_img = [10*10*64, 1024] 63 | first_dense_map = [11*11*64, 1024] 64 | # first_dense = [10*10*64 + 11*11*64, 512] 65 | first_dense = [10*10*64, 512] 66 | second_dense = [512, 256] 67 | third_dense = [256, Num_action] 68 | 69 | # Initialize weights and bias 70 | def weight_variable(shape): 71 | return tf.Variable(xavier_initializer(shape)) 72 | 73 | def bias_variable(shape): 74 | return tf.Variable(xavier_initializer(shape)) 75 | 76 | # Xavier Weights initializer 77 | def xavier_initializer(shape): 78 | dim_sum = np.sum(shape) 79 | if len(shape) == 1: 80 | dim_sum += 1 81 | bound = np.sqrt(2.0 / dim_sum) 82 | return tf.random_uniform(shape, minval=-bound, maxval=bound) 83 | 84 | # Convolution and pooling 85 | def conv2d(x,w, stride): 86 | return tf.nn.conv2d(x,w,strides=[1, stride, stride, 1], padding='SAME') 87 | 88 | def max_pool_2x2(x): 89 | return tf.nn.max_pool(x, ksize=[1,2,2,1], 
strides=[1,2,2,1], padding='SAME') 90 | 91 | # Assign network variables to target networks 92 | def assign_network_to_target(): 93 | # Get trainable variables 94 | trainable_variables = tf.trainable_variables() 95 | # network lstm variables 96 | trainable_variables_network = [var for var in trainable_variables if var.name.startswith('network')] 97 | 98 | # target lstm variables 99 | trainable_variables_target = [var for var in trainable_variables if var.name.startswith('target')] 100 | 101 | for i in range(len(trainable_variables_network)): 102 | sess.run(tf.assign(trainable_variables_target[i], trainable_variables_network[i])) 103 | 104 | # Code for tensorboard 105 | def setup_summary(): 106 | episode_score = tf.Variable(0.) 107 | 108 | tf.summary.scalar('Total Reward/' + str(Num_step_plot) + ' steps', episode_score) 109 | 110 | summary_vars = [episode_score] 111 | summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))] 112 | update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))] 113 | summary_op = tf.summary.merge_all() 114 | return summary_placeholders, update_ops, summary_op 115 | 116 | # Input 117 | x_img = tf.placeholder(tf.float32, shape = [None, img_size, img_size, 2 * Num_colorChannel * Num_stackFrame]) 118 | # x_map = tf.placeholder(tf.float32, shape = [None, map_size, map_size, Num_stackFrame]) 119 | 120 | # Normalize input 121 | x_img = (x_img - (255.0/2)) / (255.0/2) 122 | ###################################### Image Network ###################################### 123 | with tf.variable_scope('network'): 124 | # Convolution variables 125 | w_conv1_img = weight_variable(first_conv_img) 126 | b_conv1_img = bias_variable([first_conv_img[3]]) 127 | 128 | w_conv2_img = weight_variable(second_conv) 129 | b_conv2_img = bias_variable([second_conv[3]]) 130 | 131 | w_conv3_img = weight_variable(third_conv) 132 | b_conv3_img = bias_variable([third_conv[3]]) 133 | 134 | w_fc1 = weight_variable(first_dense) 135 | b_fc1 = bias_variable([first_dense[1]]) 136 | 137 | w_fc2 = weight_variable(second_dense) 138 | b_fc2 = bias_variable([second_dense[1]]) 139 | 140 | w_fc3 = weight_variable(third_dense) 141 | b_fc3 = bias_variable([third_dense[1]]) 142 | 143 | ###################################### Image Network ###################################### 144 | h_conv1_img = tf.nn.relu(conv2d(x_img, w_conv1_img, 4) + b_conv1_img) 145 | h_conv2_img = tf.nn.relu(conv2d(h_conv1_img, w_conv2_img, 2) + b_conv2_img) 146 | h_conv3_img = tf.nn.relu(conv2d(h_conv2_img, w_conv3_img, 1) + b_conv3_img) 147 | 148 | h_pool3_flat_img = tf.reshape(h_conv3_img, [-1, first_dense_img[0]]) 149 | 150 | h_flat = h_pool3_flat_img 151 | h_fc1 = tf.nn.relu(tf.matmul(h_flat, w_fc1)+b_fc1) 152 | h_fc2 = tf.nn.relu(tf.matmul(h_fc1, w_fc2)+b_fc2) 153 | 154 | output = tf.matmul(h_fc2, w_fc3) + b_fc3 155 | 156 | ###################################### Image Target Network ###################################### 157 | with tf.variable_scope('target'): 158 | # Convolution variables target 159 | w_conv1_target_img = weight_variable(first_conv_img) 160 | b_conv1_target_img = bias_variable([first_conv_img[3]]) 161 | 162 | w_conv2_target_img = weight_variable(second_conv) 163 | b_conv2_target_img = bias_variable([second_conv[3]]) 164 | 165 | w_conv3_target_img = weight_variable(third_conv) 166 | b_conv3_target_img = bias_variable([third_conv[3]]) 167 | 168 | # Densely connect layer variables target 169 | w_fc1_target = weight_variable(first_dense) 170 | b_fc1_target = 
bias_variable([first_dense[1]])
171 |
172 |     w_fc2_target = weight_variable(second_dense)
173 |     b_fc2_target = bias_variable([second_dense[1]])
174 |
175 |     w_fc3_target = weight_variable(third_dense)
176 |     b_fc3_target = bias_variable([third_dense[1]])
177 |
178 |     # img Target Network
179 |     h_conv1_target_img = tf.nn.relu(conv2d(x_img, w_conv1_target_img, 4) + b_conv1_target_img)
180 |     h_conv2_target_img = tf.nn.relu(conv2d(h_conv1_target_img, w_conv2_target_img, 2) + b_conv2_target_img)
181 |     h_conv3_target_img = tf.nn.relu(conv2d(h_conv2_target_img, w_conv3_target_img, 1) + b_conv3_target_img)
182 |
183 |     h_pool3_flat_target_img = tf.reshape(h_conv3_target_img, [-1, first_dense_img[0]])
184 |
185 |     h_flat_target = h_pool3_flat_target_img  # use the target network's own conv features
186 |     h_fc1_target = tf.nn.relu(tf.matmul(h_flat_target, w_fc1_target)+b_fc1_target)
187 |     h_fc2_target = tf.nn.relu(tf.matmul(h_fc1_target, w_fc2_target)+b_fc2_target)
188 |
189 |     output_target = tf.matmul(h_fc2_target, w_fc3_target) + b_fc3_target
190 |
191 | ###################################### Calculate Loss & Train ######################################
192 | # Loss function and Train
193 | action_target = tf.placeholder(tf.float32, shape = [None, Num_action])
194 | y_prediction = tf.placeholder(tf.float32, shape = [None])
195 |
196 | y_target = tf.reduce_sum(tf.multiply(output, action_target), reduction_indices = 1)
197 | Loss = tf.reduce_mean(tf.square(y_prediction - y_target))
198 | train_step = tf.train.AdamOptimizer(learning_rate = Learning_rate, epsilon = 1e-02).minimize(Loss)
199 |
200 | # Initialize variables
201 | config = tf.ConfigProto()
202 | config.gpu_options.per_process_gpu_memory_fraction = 0.4
203 |
204 | sess = tf.InteractiveSession(config=config)
205 |
206 | # date - hour - minute of training time
207 | date_time = str(datetime.date.today()) + '_' + str(datetime.datetime.now().hour) + '_' + str(datetime.datetime.now().minute)
208 |
209 | # Make folder for save data
210 | os.makedirs('saved_networks/' + date_time)
211 |
212 | # Summary for tensorboard
213 | summary_placeholders, update_ops, summary_op = setup_summary()
214 | summary_writer = tf.summary.FileWriter('saved_networks/' + date_time, sess.graph)
215 |
216 | init = tf.global_variables_initializer()
217 | sess.run(init)
218 |
219 | # Load the file if the saved file exists
220 | saver = tf.train.Saver()
221 | # check_save = 1
222 | check_save = int(input('Is there any saved data?(1=y/2=n): '))
223 |
224 | if check_save == 1:
225 |     checkpoint = tf.train.get_checkpoint_state('saved_networks/' + date_time)
226 |     if checkpoint and checkpoint.model_checkpoint_path:
227 |         saver.restore(sess, checkpoint.model_checkpoint_path)
228 |         print("Successfully loaded:", checkpoint.model_checkpoint_path)
229 |     else:
230 |         print("Could not find old network weights")
231 |
232 | # Initial parameters
233 | Replay_memory = []
234 | step = 1
235 | Init = 0
236 | state = 'Observing'
237 | episode = 0
238 | score = 0
239 |
240 | observation_in_img = 0
241 | observation_in_map = 0
242 | img_front_old = 0
243 |
244 | Is_connect = False
245 | terminal_connect = 0
246 |
247 | reward_x = []
248 | reward_y = []
249 |
250 | observation_set_img = []
251 | observation_set_map = []
252 |
253 | action_old = np.array([1, 0, 0, 0, 0])
254 | speed_old = 20
255 | Was_left_changing = False
256 | Was_right_changing = False
257 |
258 | Vehicle_z_old = 0
259 | # Communication with Unity
260 | @sio.on('telemetry')
261 | def telemetry(sid, data):
262 |     if data:
263 |         # print("received!\n")
264 |         global step, Replay_memory,
observation_in_img, observation_in_map, Epsilon, terminal_connect, img_front_old, reward_x, reward_y, \ 265 | observation_set_img, observation_set_map, TD_list, action_old, speed_old, Init, Was_left_changing, Was_right_changing, Vehicle_z_old,\ 266 | episode, score 267 | # print(data) 268 | current_time = time.time() 269 | 270 | Is_right_lane_changing = float(data["user/angle"])>0 271 | Is_left_lane_changing = float(data["user/angle"])<0 272 | 273 | Is_lane_changing = False 274 | 275 | if Is_right_lane_changing == 1 or Is_left_lane_changing == 1: 276 | Is_lane_changing = True 277 | else: 278 | Is_lane_changing = False 279 | 280 | Vehicle_z = float(data["collide"]) 281 | 282 | # The current image from the camera of the car (front) 283 | imgString_front = data["camforward/image_array"] 284 | image_front = Image.open(BytesIO(base64.b64decode(imgString_front))) 285 | image_array_front = np.asarray(image_front) 286 | # ---------------------- Image transformation ---------------------- 287 | #image_array_front = image_array_front[55:130, 60:260,:] 288 | image_trans_front = cv2.resize(image_array_front, (img_size, img_size)) 289 | 290 | if Num_colorChannel == 1: 291 | image_trans_front = cv2.cvtColor(image_trans_front, cv2.COLOR_RGB2GRAY) 292 | image_trans_front = np.reshape(image_trans_front, (img_size, img_size, 1)) 293 | 294 | #image_trans_front = (image_trans_front - (255./2.)) / (255./2.) 295 | 296 | # ------------------------------------------------------------------ 297 | # The current image from the camera of the car (rear) 298 | imgString_rear = data["camback/image_array"] 299 | image_rear = Image.open(BytesIO(base64.b64decode(imgString_rear))) 300 | image_array_rear = np.asarray(image_rear) 301 | # ---------------------- Image transformation ---------------------- 302 | # image_array_rear = image_array_rear[55:130, 60:260,:] 303 | 304 | image_trans_rear = cv2.resize(image_array_rear, (img_size, img_size)) 305 | 306 | if Num_colorChannel == 1: 307 | image_trans_rear = cv2.cvtColor(image_trans_rear, cv2.COLOR_RGB2GRAY) 308 | image_trans_rear = np.reshape(image_trans_rear, (img_size, img_size, 1)) 309 | 310 | # image_trans_rear = (image_trans_rear - (255./2.)) / (255./2.) 
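        # At this point image_trans_front / image_trans_rear hold the current
        # grayscale 80x80 frames from the front and rear cameras. Below, they are
        # pushed into a rolling buffer: the network input stacks Num_stackFrame (4)
        # front/rear frame pairs, sampled every Num_skipFrame (4) steps, giving an
        # observation tensor of shape (img_size, img_size, 2 * Num_stackFrame) = (80, 80, 8).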
311 |         # ------------------------------------------------------------------
312 |
313 |         # Initialization
314 |         if Init == 0:
315 |             observation_next_img = np.zeros([img_size, img_size, 2])
316 |
317 |             observation_in_img = np.zeros([img_size, img_size, 1])
318 |
319 |             for i in range(Num_stackFrame):
320 |                 observation_in_img = np.insert(observation_in_img, [1], image_trans_front, axis = 2)
321 |                 observation_in_img = np.insert(observation_in_img, [1], image_trans_rear , axis = 2)
322 |
323 |             observation_in_img = np.delete(observation_in_img, [0], axis = 2)
324 |
325 |             # Making observation set for img
326 |             for i in range(Num_skipFrame * Num_stackFrame):
327 |                 observation_set_img.insert(0, observation_in_img[:,:,:2])
328 |
329 |             Vehicle_z_old = Vehicle_z
330 |
331 |
332 |             Init = 1
333 |             print('Initialization is Finished!')
334 |
335 |         # Processing input data
336 |         observation_next_img = np.zeros([img_size, img_size, 1])
337 |         observation_next_img = np.insert(observation_next_img, [1], image_trans_front, axis = 2)
338 |         observation_next_img = np.insert(observation_next_img, [1], image_trans_rear , axis = 2)
339 |         observation_next_img = np.delete(observation_next_img, [0], axis = 2)
340 |         # print("mark1")
341 |
342 |         del observation_set_img[0]
343 |         observation_set_img.append(observation_next_img)
344 |         observation_next_in_img = np.zeros([img_size, img_size, 1])
345 |
346 |         for stack_frame in range(Num_stackFrame):
347 |             observation_next_in_img = np.insert(observation_next_in_img, [1], observation_set_img[-1 - (Num_skipFrame * stack_frame)], axis = 2)
348 |
349 |         observation_next_in_img = np.delete(observation_next_in_img, [0], axis = 2)
350 |
351 |         # Get data from Unity
352 |         # reward = float(data["reward"])
353 |         ###### Q MARK #########
354 |         # print("mark2")
355 |         throttle = float(data['user/throttle'])
356 |         angle = float(data['user/angle'])
357 |         dist_path = float(data['distance/path'])
358 |         # dist_path = float(data['distance/path'])
359 |         print("distance: ", dist_path)
360 |
361 |         action_vehicle = 0  # 0: nothing, 1: acc, 2: dec, 3: left, 4: right
362 |
363 |         if throttle > 0:
364 |             action_vehicle = 1.00
365 |         if throttle < 0:
366 |             action_vehicle = 2.00
367 |         if angle > 0:
368 |             action_vehicle = 4.00
369 |         if angle < 0:
370 |             action_vehicle = 3.00
371 |
372 |         # action_vehicle = float(data["Action_vehicle"])
373 |         speed_vehicle = float(data["speed"])
374 |
375 |         # According to the last action, get reward.
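        # Reward shaping used below: + speed_vehicle / 10, minus 10 * |distance from path|,
        # +1 for accelerating, -5 for decelerating, -1 for steering left or right,
        # and reward_bad (-500) replaces the reward when a collision ends the episode.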
376 | action_old_index = np.argmax(action_old) 377 | 378 | reward = speed_vehicle / 10 379 | reward_bad = -500 380 | reward -= abs(dist_path) * 10 381 | ###### Q Mark ########## 382 | if action_old_index == 1: 383 | reward += 1 384 | elif action_old_index == 2: 385 | reward -= 5 386 | elif action_old_index == 3: 387 | reward -= 1 388 | elif action_old_index == 4: 389 | reward -= 1 390 | 391 | # Get action with string 392 | action_str = '' 393 | 394 | if action_old_index == 0: 395 | action_str = 'Nothing' 396 | elif action_old_index == 1: 397 | action_str = 'Acc' 398 | elif action_old_index == 2: 399 | action_str = 'Dec' 400 | elif action_old_index == 3: 401 | action_str = 'Left' 402 | elif action_old_index == 4: 403 | action_str = 'Right' 404 | 405 | # If terminal is 1 ( = Collision), then reward is -100 406 | # terminal = terminal_connect 407 | terminal = 0 408 | # print("mark3") 409 | # print(data["collide"]) 410 | ####### terminal ############## 411 | if data["collide"] == "1.0000": 412 | # if abs(Vehicle_z - Vehicle_z_old) > 1 and Vehicle_z_old < 21: 413 | print('Terminal!!') 414 | terminal = 1 415 | 416 | # send_control(2) 417 | # print("Going Back") 418 | # send_control(2) 419 | 420 | if terminal == 1 and step != 1: 421 | reward = reward_bad 422 | 423 | if len(Replay_memory) > 15: 424 | # Replay_memory[-1][3] = reward_bad 425 | 426 | RM_index = list(range(-15, 0)) 427 | RM_index.reverse() 428 | RM_index_crash = -1 429 | 430 | right_action = np.zeros([5]) 431 | right_action[4] = 1 432 | 433 | left_action = np.zeros([5]) 434 | left_action[3] = 1 435 | 436 | if Was_right_changing == 1: 437 | for i_RM in RM_index: 438 | if np.argmax(Replay_memory[i_RM][2]) == 4: 439 | RM_index_crash = i_RM 440 | break 441 | 442 | Replay_memory[RM_index_crash][3] = reward_bad 443 | 444 | if Was_left_changing == 1: 445 | for i_RM in RM_index: 446 | if np.argmax(Replay_memory[i_RM][2]) == 4: 447 | RM_index_crash = i_RM 448 | break 449 | 450 | Replay_memory[RM_index_crash][3] = reward_bad 451 | 452 | # It shows action which is decided by random or Q network while training 453 | Action_from = '' 454 | 455 | # If step is less than Num_start_training, store replay memory 456 | if step <= Num_start_training: 457 | state = 'Observing' 458 | print("observing") 459 | 460 | action = np.zeros([Num_action]) 461 | action[random.randint(0, Num_action - 1)] = 1.0 462 | 463 | elif step <= Num_start_training + Num_training: 464 | state = 'Training' 465 | print("training") 466 | 467 | # Get action 468 | if random.random() < Epsilon: 469 | # print("using random") 470 | action = np.zeros([Num_action]) 471 | action[random.randint(0, Num_action - 1)] = 1.0 472 | Action_from = 'Random' 473 | else: 474 | Q_value = output.eval(feed_dict={x_img: [observation_in_img]}) 475 | # print("using prediction") 476 | action = np.zeros([Num_action]) 477 | action[np.argmax(Q_value)] = 1 478 | Action_from = 'Q_network' 479 | 480 | # Select minibatch 481 | minibatch = random.sample(Replay_memory, Num_batch) 482 | 483 | # Save the each batch data 484 | observation_batch_img = [batch[0] for batch in minibatch] 485 | action_batch = [batch[1] for batch in minibatch] 486 | reward_batch = [batch[2] for batch in minibatch] 487 | observation_next_batch_img = [batch[3] for batch in minibatch] 488 | terminal_batch = [batch[4] for batch in minibatch] 489 | 490 | # Update target network according to the Num_update value 491 | if step % Num_update == 0: 492 | assign_network_to_target() 493 | 494 | # Get Target value 495 | y_batch = [] 496 | # 
print(len(observation_next_batch_img)) 497 | # print('end') 498 | # Q_batch = output_target.eval(feed_dict = {x_img: observation_next_batch_img, x_map: observation_next_batch_map}) 499 | try: 500 | Q_batch = output_target.eval(feed_dict = {x_img: observation_next_batch_img}) 501 | # print("got q batch") 502 | 503 | 504 | for i in range(len(minibatch)): 505 | if terminal_batch[i] == True: 506 | y_batch.append(reward_batch[i]) 507 | else: 508 | y_batch.append(reward_batch[i] + Gamma * np.max(Q_batch[i])) 509 | 510 | train_step.run(feed_dict = {action_target: action_batch, y_prediction: y_batch, x_img: observation_batch_img}) 511 | 512 | # save progress every certain steps 513 | if step % Num_step_save == 0: 514 | saver.save(sess, 'saved_networks/' + date_time + '/' + algorithm) 515 | print('Model is saved!!!') 516 | except: 517 | # print("error") 518 | pass 519 | else: 520 | print("testing") 521 | # Testing code 522 | state = 'Testing' 523 | Q_value = output.eval(feed_dict={x_img: [observation_in_img]}) 524 | print(Q_value) 525 | action = np.zeros([Num_action]) 526 | action[np.argmax(Q_value)] = 1 527 | 528 | Epsilon = 0 529 | # print("mark4") 530 | ## Saving the camera image 531 | # i_front = Image.fromarray(image_array_front, mode='RGB') 532 | # i_front.save("./Image_front/" + str(step) + '.jpg') 533 | 534 | # i_rear = Image.fromarray(image_array_rear, mode='RGB') 535 | # i_rear.save("./Image_rear/" + str(step) + '.jpg') 536 | 537 | # If replay memory is more than Num_replay_memory than erase one 538 | if state != 'Testing': 539 | if len(Replay_memory) > Num_replay_memory: 540 | del Replay_memory[0] 541 | 542 | observation_in_img = np.uint8(observation_in_img) 543 | # observation_in_map = np.int8(observation_in_map) 544 | observation_next_in_img = np.uint8(observation_next_in_img) 545 | # observation_next_in_map = np.int8(observation_next_in_map) 546 | 547 | # Save experience to the Replay memory and TD_list 548 | Replay_memory.append([observation_in_img, action_old, reward, \ 549 | observation_next_in_img, terminal]) 550 | # Send action to Unity 551 | # print("action: ", action) 552 | action_in = np.argmax(action) 553 | # print("action_in: ", action_in) 554 | send_control(action_in) 555 | # print("mark5") 556 | if state != 'Observing': 557 | score += reward 558 | 559 | if step % Num_step_plot == 0 and step != Num_start_training: 560 | tensorboard_info = [score / Num_step_plot] 561 | for i in range(len(tensorboard_info)): 562 | sess.run(update_ops[i], feed_dict = {summary_placeholders[i]: float(tensorboard_info[i])}) 563 | summary_str = sess.run(summary_op) 564 | summary_writer.add_summary(summary_str, step) 565 | score = 0 566 | 567 | # Print information mark 568 | # print('Step: ' + str(step) + ' / ' + 'Episode: ' + str(episode) + ' / ' + 'State: ' + state + ' / ' + 'Action: ' + action_str + ' / ' + 569 | # 'Reward: ' + str(reward) + ' / ' + 'Epsilon: ' + str(Epsilon) + ' / ' + 'Action from: ' + Action_from + '\n') 570 | 571 | if terminal == 1: 572 | if state != 'Observing': 573 | episode += 1 574 | 575 | # Get current variables to old vatiables 576 | observation_in_img = observation_next_in_img 577 | # observation_in_map = observation_next_in_map 578 | # print('mark5.1') 579 | action_old = action 580 | speed_old = speed_vehicle 581 | img_front_old = image_array_front 582 | Was_left_changing = Is_left_lane_changing 583 | Was_right_changing = Is_right_lane_changing 584 | 585 | Vehicle_z_old = Vehicle_z 586 | # Update step number and decrease epsilon 587 | step += 1 588 | if Epsilon > 
Final_epsilon and state == 'Training':
589 |             Epsilon -= First_epsilon / Num_training
590 |         # print('mark5.2')
591 |     else:
592 |         # NOTE: DON'T EDIT THIS.
593 |         # self.sio.emit('manual', data={}, skip_sid=True)
594 |         print("nothing received!")
595 |         # print("mark6")
596 | # Connection with Unity
597 | @sio.on('connect')
598 | def connect(sid, environ):
599 |     print("connect ", sid)
600 |     send_control(-1)
601 |
602 | # # Disconnect with Unity
603 | # @sio.on('disconnect')
604 | # def disconnect(sid):
605 | #     print('Client disconnected')
606 |
607 | # Send control to Unity
608 | num_connection = 0
609 | def send_control(action):
610 |     global num_connection
611 |     data = {}
612 |     if action == -1 or action == 0:
613 |         steering_angle_a = 0
614 |         throttle_a = 0
615 |         data = {'steering_angle':steering_angle_a.__str__(), 'throttle': throttle_a.__str__()}
616 |
617 |     # if action == 1:
618 |     #     data = {'user/throttle':'0.8'}
619 |     # if action == 2:
620 |     #     data = {'user/throttle':'-0.8'}
621 |     # if action == 3:
622 |     #     data = {'user/angle':'-8'}
623 |     # if action == 4:
624 |     #     data = {'user/angle':'8'}
625 |     elif action == 1:
626 |         data = {'throttle':'0.8', 'steering_angle':'0'}
627 |     elif action == 2:
628 |         data = {'throttle':'-0.8', 'steering_angle':'0'}
629 |     elif action == 3:
630 |         data = {'steering_angle':'-15', 'throttle':'0.8'}
631 |     elif action == 4:
632 |         data = {'steering_angle':'15', 'throttle':'0.8'}
633 |
634 |
635 |     if num_connection > 500:
636 |         num_connection = 0
637 |
638 |     sio.emit("steer", data, skip_sid=True)
639 |
640 |     # sio.emit("onsteer", data={
641 |     #     'action': action.__str__()
642 |     #     'num_connection': num_connection.__str__()
643 |
644 |
645 |
646 | if __name__ == '__main__':
647 |     # wrap Flask application with engineio's middleware
648 |     app = socketio.Middleware(sio, app)
649 |
650 |     # deploy as an eventlet WSGI server
651 |     eventlet.wsgi.server(eventlet.listen(('', 9090)), app)
652 |
653 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning Self-Driving Car
2 |
3 | ## Introduction
4 |
5 | This project implements a self-driving car in our simulator using reinforcement learning. The car is able to drive freely and stably in different scenes, with or without random barriers. We did not collect any dataset or train the model in a supervised way; instead, we defined actions and corresponding rewards for the car and let it learn by itself through exploration.
6 |
7 | ![Demo](https://github.com/Fdevmsy/Reinforcement-Learning-Based-Self-Driving-Car/blob/master/2.GIF)
8 |
9 | The simulator is being upgraded every day, with more scenes and functions being added. Localization and navigation will be added soon.
10 |
11 | ![](1.png)
12 |
13 | ## Installation
14 |
15 | ### Homebrew
16 | `ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"`
17 | ### OpenAI Gym dependencies
18 | `brew install cmake boost boost-python sdl2 swig wget`
19 | ### noti
20 | `(curl -L https://github.com/variadico/noti/releases/download/v2.5.0/noti2.5.0.darwin-amd64.tar.gz | tar -xz); sudo mv noti /usr/local/bin/`
21 | ### Node >= v7.0
22 | `brew install node`
23 |
24 | ### Project Dependencies
25 | `./bin/copy-config
26 | npm install; sudo npm install -g grunt-cli`
27 |
28 | `conda env create -f environment.yml
29 | source activate DRL`
30 |
31 |
32 | ## Background
33 |
34 | Reinforcement learning develops control patterns by providing feedback on a model’s selected actions, which encourages the model to select better actions in the future. At each time step, given some state s, the model selects an action a, and then observes the new state s' and a reward r based on some optimality criterion.
35 |
36 | We specifically used a method known as Q-learning, which approximates the maximum expected return for performing an action at a given state using an action-value (Q) function. Specifically, the return is the sum of the rewards until the game terminates, where the reward is discounted by a factor of γ at each time step. We formally define this as:
37 |
38 | ![alt-text](http://imgur.com/h7MJxSJ.png "(1)")
39 |
40 | We then define the action-value function:
41 |
42 | ![alt-text](http://imgur.com/05MxGxk.png "(2)")
43 |
44 | Note that if the optimal Q function is known for state s', we can write the optimal Q function at the preceding state s as the maximum expected value of ![alt-text](http://imgur.com/1RSOCHo.png "Sorry, no alt-text for this one"). This identity is known as the Bellman equation:
45 |
46 | ![alt-text](http://imgur.com/BERyjr2.png "(3)")
47 |
48 | The intuition behind reinforcement learning is to continually update the action-value function based on observations using the Bellman equation. It has been shown by Sutton et al. 1998 [2] that such update algorithms will converge on the optimal action-value function as time approaches infinity. Based on this, we can define Q as the output of a neural network with weights θ, and train this network by minimizing the following loss function at each iteration i:
49 |
50 | ![alt-text](http://imgur.com/3gFka35.png "(4)")
51 |
52 | where y_i represents the target value we want to approach during each iteration. It is defined as:
53 |
54 | ![alt-text](http://imgur.com/gKcXJfi.png "(5)")
55 |
56 | Note that when i is equal to the final iteration of an episode (colloquially, the end of a game), the Q function should be 0, since it is impossible to attain additional reward after the game has ended. Therefore, when i equals the terminal frame of an episode, we can simply write:
57 |
58 | ![alt-text](http://imgur.com/nU8qRJM.png "(6)")
59 |
60 | ## Definition
61 |
62 | Actions:
63 |
64 | - 0: Do nothing
65 | - 1: Accelerate
66 | - 2: Decelerate
67 | - 3: Turn Left
68 | - 4: Turn Right
69 |
70 | We train the network with these actions. To communicate with the simulator, we convert them to the car's throttle and steering-angle commands, as shown below.
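To make the two previous sections concrete, here is a minimal, self-contained sketch of the two ideas they describe: forming the Q-learning target y_i = r + γ·max Q(s', a') from the Background section, and picking one of the five actions ε-greedily during training. The discount factor (0.99) and the five-action layout match DRLCar.py, but the helper functions themselves are only illustrative and are not part of the project code.

~~~python
import random
import numpy as np

Num_action = 5   # [nothing, accelerate, decelerate, left, right]
Gamma = 0.99     # discount factor, as in DRLCar.py

def td_target(reward, next_q_values, terminal):
    """Q-learning target: y = r if the episode ended, else r + Gamma * max_a' Q(s', a')."""
    if terminal:
        return reward
    return reward + Gamma * np.max(next_q_values)

def choose_action(q_values, epsilon):
    """Epsilon-greedy selection, returning a one-hot action vector."""
    action = np.zeros(Num_action)
    if random.random() < epsilon:
        action[random.randint(0, Num_action - 1)] = 1.0   # explore
    else:
        action[np.argmax(q_values)] = 1.0                 # exploit the Q-network
    return action

# Example: target for a non-terminal transition, and one sampled action
print(td_target(2.0, np.array([0.5, 1.2, -0.3, 0.1, 0.9]), terminal=False))  # 2.0 + 0.99 * 1.2
print(choose_action(np.array([0.1, 0.9, -0.2, 0.3, 0.0]), epsilon=0.05))
~~~

The index of the chosen one-hot action is what gets converted to throttle and steering commands in the mapping below: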
71 |
72 | ~~~python
73 | if action == -1 or action == 0:
74 |     steering_angle_a = 0
75 |     throttle_a = 0
76 |     data = {'steering_angle':steering_angle_a.__str__(), 'throttle': throttle_a.__str__()}
77 |
78 | elif action == 1:
79 |     data = {'throttle':'0.8', 'steering_angle':'0'}
80 | elif action == 2:
81 |     data = {'throttle':'-0.8', 'steering_angle':'0'}
82 | elif action == 3:
83 |     data = {'steering_angle':'-15', 'throttle':'0.8'}
84 | elif action == 4:
85 |     data = {'steering_angle':'15', 'throttle':'0.8'}
86 |
87 | # Reward:
88 |
89 | reward = speed_vehicle / 10
90 | reward_bad = -500
91 | reward -= abs(dist_path) * 10
92 |
93 | ###### Q Mark ##########
94 | if action_old_index == 1:
95 |     reward += 1
96 | elif action_old_index == 2:
97 |     reward -= 5
98 | elif action_old_index == 3:
99 |     reward -= 1
100 | elif action_old_index == 4:
101 |     reward -= 1
102 |
103 |
104 | # If terminated (collision):
105 | reward = reward_bad
106 | ~~~
107 |
108 | Our goal is to build a self-driving car, but we want more than just driving without hitting the wall: stable and comfortable driving is also preferred. So every action such as turning or decelerating gets a small negative reward, while accelerating is preferred and gets +1. Hitting the wall is the most important thing to avoid, so it receives the large penalty reward_bad (-500).
109 |
110 | State:
111 | The state is a stack of the most recent front- and rear-camera images (converted to grayscale and resized to 80x80); the vehicle speed is used in the reward rather than as a network input.
112 |
113 | ## Usage
114 |
115 | 1. Set up the simulator
116 | 2. `python DRLCar.py`
117 |
118 | ## Reference
119 |
120 | This project is greatly inspired by MLJejuCamp2017's project:
121 | [https://github.com/MLJejuCamp2017/DRL_based_SelfDrivingCarControl](https://github.com/MLJejuCamp2017/DRL_based_SelfDrivingCarControl)
122 |
123 |
124 | ## Contact
125 |
126 | MakerColider
127 |
128 | Shiyu Mou
129 | shiyumou@usc.edu
130 |
131 |
132 |
133 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: DRL
2 | channels:
3 |   - conda-forge
4 | dependencies:
5 |   - python>=3.5
6 |   - anaconda
7 |   - six
8 |   - h5py
9 |   - matplotlib==1.4.3
10 |   - seaborn>=0.7.1
11 |   - Pillow>=3.3.1
12 |   - PyOpenGL>=3.1.0
13 |   - glances>=2.6.2
14 |   - pytest-cov>=2.3.1
15 |   - pytest-xdist>=1.15.0
16 |   - pip:
17 |     - codacy-coverage>=1.3.3
18 |     - mem_top==0.1.5
19 |     - atari_py>=0.0.18
20 |     - cmake==0.6.0
21 |     - tensorflow>=1.0.0
22 |     - Keras>=1.2.2,<2.0.0
23 |     - "--editable=git+https://github.com/openai/gym.git#egg=gym[all]"
24 |
--------------------------------------------------------------------------------