├── .gitignore
├── LICENSE
├── README.md
└── main.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__
tmp
logdir
saved_networks*
var

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 SunYeop Lee

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# breakout-v0-player
A DQN agent that plays Breakout-v0 from OpenAI Gym (gym.openai.com).

* https://www.youtube.com/watch?v=wH48jrxm_5Q
* https://www.youtube.com/watch?v=OiMM9lKmOlQ

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
import datetime
import threading
import random
import time
import sys
import gym
env = gym.make('Breakout-v0')

CPU_ONLY = False
TRAIN = True
BENCHMARK = False

# command-line flags:
#   eval      - run a saved network without training (renders the game)
#   cpu       - build the convolutions with the NHWC data format so they can run on a CPU
#   benchmark - report training throughput and skip autosaving
if 'eval' in sys.argv:
    TRAIN = False
if 'cpu' in sys.argv:
    CPU_ONLY = True
if 'benchmark' in sys.argv:
    BENCHMARK = True

NUM_AGENT_THREAD = 4
LOG_INTERVAL = 1000
SAVE_INTERVAL = 50000

# hyperparameter settings
GAMMA = .95
LEARNING_RATE = .0002
DECAY_RATE = .99
MOMENTUM = 0
EPSILON = 1e-6

BATCH_SIZE = 32
OBSERVE = 50000
ACTION_HISTORY_LENGTH = 4
MAX_EXPLORE_FRAMES = 1000000
MIN_EXPLORE_RATE = .10
MAX_D_SIZE = 1000000 # maximum size of replay queue
C = 10000 # target-network reset interval (in training steps)
SCREEN_DIMS = 84, 84

NUM_ACTIONS = env.action_space.n
ACTION_MEANINGS = env.get_action_meanings()

env = None # the temporary env above is only needed to read the action space

print('breakout-v0-player is running with TRAIN=%s' % TRAIN)

def conv2d(x, W, s, cpu_only=False):
    cpu_only = CPU_ONLY or cpu_only
    # stride layout follows the data format: NHWC -> [1, s, s, 1], NCHW -> [1, 1, s, s]
    return tf.nn.conv2d(x, W, strides=[1, s, s, 1] if cpu_only else [1, 1, s, s], padding='VALID', data_format='NHWC' if cpu_only else 'NCHW')

def weight_variable(shape, name=None):
    initial = tf.truncated_normal(shape, stddev=0.02)
    return tf.Variable(initial, name=name)

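# Illustration only (not used by the training code): a quick check of the 'VALID'
# convolution arithmetic used by create_q() below. With 84x84 inputs the three conv
# layers produce 20x20, 9x9 and 7x7 feature maps, so the flattened conv3 output has
# 64 * 7 * 7 = 3136 features.
def _valid_conv_output_size(size, kernel, stride):
    return (size - kernel) // stride + 1

assert _valid_conv_output_size(84, 8, 4) == 20
assert _valid_conv_output_size(20, 4, 2) == 9
assert _valid_conv_output_size(9, 3, 1) == 7
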
def bias_variable(shape, name=None):
    initial = tf.constant(0.01, shape=shape)
    return tf.Variable(initial, name=name)

def create_q(state, weights=None, cpu_only=False):
    cpu_only = CPU_ONLY or cpu_only

    if weights is not None:
        w_conv1, b_conv1, w_conv2, b_conv2, w_conv3, b_conv3, w_fc1, b_fc1, w_fc2, b_fc2 = weights

    if cpu_only:
        state = tf.transpose(state, perm=[0,2,3,1])

    # state: (x_1, x_2, ... x_n) of shape [-1, ACTION_HISTORY_LENGTH, HEIGHT, WIDTH]
    with tf.name_scope('conv1'):
        if weights is None:
            w_conv1 = weight_variable([8, 8, ACTION_HISTORY_LENGTH, 32], name='w_conv1')
            b_conv1 = bias_variable([32], name='b_conv1')
        h_conv1 = tf.nn.relu(tf.nn.bias_add(conv2d(state, w_conv1, 4, cpu_only), b_conv1, data_format='NHWC' if cpu_only else 'NCHW'))

    with tf.name_scope('conv2'):
        if weights is None:
            w_conv2 = weight_variable([4, 4, 32, 64], name='w_conv2')
            b_conv2 = bias_variable([64])
        h_conv2 = tf.nn.relu(tf.nn.bias_add(conv2d(h_conv1, w_conv2, 2, cpu_only), b_conv2, data_format='NHWC' if cpu_only else 'NCHW'))

    with tf.name_scope('conv3'):
        if weights is None:
            w_conv3 = weight_variable([3, 3, 64, 64], name='w_conv3')
            b_conv3 = bias_variable([64])
        h_conv3 = tf.nn.relu(tf.nn.bias_add(conv2d(h_conv2, w_conv3, 1, cpu_only), b_conv3, data_format='NHWC' if cpu_only else 'NCHW'))

    if cpu_only:
        h_conv3 = tf.transpose(h_conv3, perm=[0,3,1,2])

    shape = h_conv3.get_shape().as_list()
    H, W = shape[2], shape[3]
    h_conv3_flattened = tf.reshape(h_conv3, [-1, 64*H*W], name='h_conv3_flatten')

    with tf.name_scope('fc1'):
        if weights is None:
            w_fc1 = weight_variable([64*H*W, 512])
            b_fc1 = bias_variable([512])
        h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flattened, w_fc1) + b_fc1)

    with tf.name_scope('fc2'):
        if weights is None:
            w_fc2 = weight_variable([512, NUM_ACTIONS])
            b_fc2 = bias_variable([NUM_ACTIONS])
        h_fc2 = tf.matmul(h_fc1, w_fc2) + b_fc2

    return h_fc2, (w_conv1, b_conv1, w_conv2, b_conv2, w_conv3, b_conv3, w_fc1, b_fc1, w_fc2, b_fc2)

def create_predicted_action(q_values):
    return tf.argmax(q_values, 1)

def create_max_q(q_values):
    return tf.reduce_max(q_values, reduction_indices=1)

def create_q_reduced_by_action(q_values, a):
    one_hot_encoded_a = tf.one_hot(a, NUM_ACTIONS, 1., 0.)
    q_value = tf.reduce_sum(q_values * one_hot_encoded_a, reduction_indices=1)
    return q_value

def create_loss(q_values, y, a):
    q_value = create_q_reduced_by_action(q_values, a)
    loss = tf.reduce_mean(tf.square(y - q_value))
    return loss

def create_train_op(loss):
    return tf.train.RMSPropOptimizer(LEARNING_RATE, DECAY_RATE, MOMENTUM, EPSILON).minimize(loss)

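# Illustration only (not part of the graph): create_loss()/train_step() regress the
# chosen action's Q-value toward the standard Q-learning target
#     y = r                                    for terminal transitions,
#     y = r + GAMMA * max_a' Q_target(s', a')  otherwise,
# where Q_target is the periodically refreshed copy of the network (theta_m1 below).
def _example_td_target(reward, done, max_target_q_value):
    return reward if done else reward + GAMMA * max_target_q_value
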
def create_preprocess(x):
    grayscale = tf.image.rgb_to_grayscale(x)
    resized = tf.image.resize_images(grayscale, *SCREEN_DIMS)/255.
    return resized

def start_session():
    global global_step, ph_new_global_step, assign_global_step
    global ph_state, ph_x
    global _preprocess, predicted_action, q_values, max_q, predicted_action_cpu, q_values_cpu, max_q_cpu
    global gamma_max_target_q, reset_target_q, gamma_target_q_reduced_by_action, predict_by_double_dqn
    global ph_y, ph_a
    global loss, train_op
    global input_summary, ph_avg_reward, reward_summary, ph_avg_score_per_episode, score_per_episode_summary, ph_avg_loss, loss_summary, ph_avg_max_q_value, max_q_value_summary, ph_exploration_rate, exploration_rate_summary

    with tf.Graph().as_default() as g:
        global_step = tf.Variable(0, name='step', trainable=False)
        ph_new_global_step = tf.placeholder(tf.int32, shape=[], name='new_global_step')
        assign_global_step = tf.assign(global_step, ph_new_global_step, name='assign_global_step')

        with tf.name_scope('input'):
            # preprocessed state (x_1, x_2, ..., x_n)
            ph_x = tf.placeholder(tf.int32, shape=[210, 160, 3])
            ph_state = tf.placeholder(tf.float32, shape=[None, ACTION_HISTORY_LENGTH, *SCREEN_DIMS], name='state')
            ph_y = tf.placeholder(tf.float32, shape=[None], name='y') # y = r or r + gamma * max_a' Q_target(s', a')
            ph_a = tf.placeholder(tf.int64, shape=[None], name='a') # actions

        with tf.device('/gpu:0'):
            with tf.name_scope('Q'):
                q_values, theta = create_q(ph_state)

            with tf.name_scope('pi'):
                predicted_action = create_predicted_action(q_values)

            with tf.name_scope('max_Q'):
                max_q = create_max_q(q_values)

            with tf.name_scope('target_Q'):
                # target network; its weights (theta_m1) are refreshed from theta every C steps
                target_q_values, theta_m1 = create_q(ph_state)

            with tf.name_scope('target_Q_reduced_by_action'):
                target_q_reduced_by_action = create_q_reduced_by_action(target_q_values, ph_a)

            with tf.name_scope('gamma_target_Q_reduced_by_action'):
                gamma_target_q_reduced_by_action = GAMMA * target_q_reduced_by_action

            with tf.name_scope('predict_by_double_dqn'):
                # Double DQN bootstrap: the target network evaluated at the online network's greedy action
                predict_by_double_dqn = GAMMA * create_q_reduced_by_action(target_q_values, predicted_action)

            with tf.name_scope('max_target_Q'):
                max_target_q = create_max_q(target_q_values)

            with tf.name_scope('gamma_max_target_Q'):
                gamma_max_target_q = GAMMA * max_target_q

            with tf.name_scope('reset_target_Q'):
                # copy the online weights (theta) into the target network (theta_m1)
                reset_target_q = tf.group(*(tf.assign(lvalue, rvalue) for lvalue, rvalue in zip(theta_m1, theta)))

            with tf.name_scope('loss'):
                loss = create_loss(q_values, ph_y, ph_a)

            with tf.name_scope('train'):
                train_op = create_train_op(loss)

        with tf.device('/cpu:0'):
            with tf.name_scope('preprocess'):
                _preprocess = create_preprocess(ph_x)

            with tf.name_scope('Q_cpu'):
                # NHWC copy of the inference ops, sharing the online weights theta
                q_values_cpu, _ = create_q(ph_state, theta, cpu_only=True)

            with tf.name_scope('pi_cpu'):
                predicted_action_cpu = create_predicted_action(q_values_cpu)

            with tf.name_scope('max_Q_cpu'):
                max_q_cpu = create_max_q(q_values_cpu)

        # summaries (each scalar reports a running average that the workers accumulate and reset)
        input_summary = tf.image_summary('input', tf.reshape(tf.transpose(ph_state[0:1,:,:,:], perm=[1,2,3,0]), [-1, *SCREEN_DIMS, 1]), max_images=ACTION_HISTORY_LENGTH)

        # updated in the act/observe loop
        ph_avg_reward = tf.placeholder(tf.float32, shape=[], name='avg_reward')
        reward_summary = tf.scalar_summary('_reward', ph_avg_reward)

        # updated at the end of each episode
        ph_avg_score_per_episode = tf.placeholder(tf.float32, shape=[], name='avg_score_per_episode')
        score_per_episode_summary = tf.scalar_summary('_score_per_episode', ph_avg_score_per_episode)

        # updated in train_step()
        ph_avg_loss = tf.placeholder(tf.float32, shape=[], name='avg_loss')
        loss_summary = tf.scalar_summary('_loss', ph_avg_loss)

        # updated in train_step()
        ph_exploration_rate = tf.placeholder(tf.float32, shape=[], name='exploration_rate')
        exploration_rate_summary = tf.scalar_summary('_exploration_rate', ph_exploration_rate)

        # updated at inference
        ph_avg_max_q_value = tf.placeholder(tf.float32, shape=[], name='avg_max_q_value')
        max_q_value_summary = tf.scalar_summary('_max_q_value', ph_avg_max_q_value)

        # start session
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        initializers = (tf.initialize_all_variables(), reset_target_q)

        saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state("saved_networks")

        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
            import os
            assert os.path.isdir('saved_networks') # the save directory must already exist
            assert TRAIN # evaluation needs a saved network to load

            # initialize from scratch (only when no checkpoint was restored)
            for initializer in initializers:
                sess.run(initializer)

        g.finalize()

    return sess, saver

def save_networks(step):
    sess.run(assign_global_step, feed_dict={ph_new_global_step: step})
    saver.save(sess, 'saved_networks/' + 'network' + '-dqn', global_step=step)
    print('[%s] Successfully saved networks -' % datetime.datetime.now(), step)

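# Note on the schedule below (illustration only): the exploration rate anneals
# linearly from 1.0 at step 0 to MIN_EXPLORE_RATE (0.10) at MAX_EXPLORE_FRAMES
# (1,000,000) steps and stays there afterwards, e.g.
#   step       0 -> 1.00
#   step  500000 -> 1.0 + (0.10 - 1.0) * 0.5 = 0.55
#   step 1000000 -> 0.10 (clamped by max() from then on)
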
def get_exploration_rate():
    return max(MIN_EXPLORE_RATE, 1. + (MIN_EXPLORE_RATE - 1.) * step / MAX_EXPLORE_FRAMES)

def train_step():
    global step, st, ps
    global total_loss, cnt_loss

    minibatch = random.sample(D, BATCH_SIZE)

    state_batch = []
    action_batch = []
    y_batch = []
    undone_indices = []
    undone_state_p1 = []

    for i, (t_state, t_action, t_reward, t_state_p1, t_done) in enumerate(minibatch):
        state_batch.append(t_state)
        action_batch.append(t_action)
        y_batch.append(t_reward)

        if not t_done: # to calculate future rewards
            undone_indices.append(i)
            undone_state_p1.append(t_state_p1)

    # calculate future rewards (skipped if every sampled transition was terminal)
    if undone_state_p1:
        predicted_q_values = sess.run(gamma_max_target_q, feed_dict={ph_state: undone_state_p1})

        # double DQN
        #predicted_q_values = sess.run(predict_by_double_dqn, feed_dict={ph_state: undone_state_p1})

        for i, j in enumerate(undone_indices):
            y_batch[j] += predicted_q_values[i]

    # train
    _, current_loss = sess.run([train_op, loss], feed_dict={ph_y: y_batch, ph_state: state_batch, ph_a: action_batch})

    # log loss
    cnt_loss += 1
    total_loss += current_loss
    t_cnt_loss = cnt_loss

    if t_cnt_loss == (LOG_INTERVAL // 10): # TRAIN is always True when train_step() runs
        summary_writer.add_summary(sess.run(loss_summary, feed_dict={ph_avg_loss: total_loss/cnt_loss}), step)
        summary_writer.add_summary(sess.run(exploration_rate_summary, feed_dict={ph_exploration_rate: get_exploration_rate()}), step)

        total_loss = 0
        cnt_loss = 0

    step += 1

    if BENCHMARK and step % 100 == 0:
        print((step-ps)/(time.time()-st), 'iterations per second')
        st = time.time()
        ps = step

    if step % C == 0:
        sess.run(reset_target_q)

    if step % SAVE_INTERVAL == 0 and not BENCHMARK:
        print('Autosaving networks ...')
        save_networks(step)

def preprocess(x):
    return sess.run(_preprocess, feed_dict={ph_x: x})[:, :, 0]

def put_experience(s, a, r, s_p, t, D_lock=None):
    global D_index

    if D_lock:
        D_lock.acquire()

    new_exp = (s, a, r, s_p, t)

    if len(D) >= MAX_D_SIZE:
        # overwrite the oldest experience once the replay memory is full
        D[D_index] = new_exp
        D_index += 1
        if D_index == len(D):
            D_index = 0
    else:
        D.append(new_exp)

    if D_lock:
        D_lock.release()

def agent_worker(agent_coord, D_lock=None):
    assert OBSERVE <= MAX_D_SIZE

    global D, total_loss, cnt_loss, st, ps

    env = gym.make('Breakout-v0')
    # get_state(True) -> the ACTION_HISTORY_LENGTH most recent frames; get_state(False) -> the window one frame earlier
    get_state = lambda current: prev_ob_list[-ACTION_HISTORY_LENGTH:] if current else prev_ob_list[-ACTION_HISTORY_LENGTH-1:-1]

    total_reward = 0
    cnt_reward = 0

    total_score_per_episode = 0
    cnt_score_per_episode = 0

    total_max_q_value = 0
    cnt_max_q_value = 0

    total_loss = 0
    cnt_loss = 0

    # benchmark
    st = time.time()
    ps = step

    while not agent_coord.should_stop():
        # new episode
        observation = env.reset()
        done = None
        score = 0
        cnt_same_state = 0
        last_score = None

        prev_ob_list = [preprocess(observation)] * (ACTION_HISTORY_LENGTH - 1) # previous observations

        while not agent_coord.should_stop():
            prev_ob_list.append(preprocess(observation))

            if not TRAIN:
                env.render()

            if done is not None and TRAIN:
                put_experience(get_state(False), action, min(1, reward), get_state(True), done, D_lock) # rewards are clipped to at most 1

            if len(D) > (OBSERVE if not BENCHMARK else BATCH_SIZE):
                train_step()

            if done is not None and done:
                if not TRAIN:
                    print('score:', score)
                    time.sleep(1)
                break

            if TRAIN and (random.random() < get_exploration_rate()):
                action = env.action_space.sample()
            else:
                # evaluate
                ops = [predicted_action, max_q]

                if not TRAIN:
                    ops = [predicted_action, max_q, q_values]

                feed_dict = {ph_state: (get_state(True),)}

                if cnt_max_q_value == LOG_INTERVAL:
                    ops.extend([input_summary, max_q_value_summary])
                    feed_dict[ph_avg_max_q_value] = total_max_q_value / cnt_max_q_value
                    total_max_q_value = 0
                    cnt_max_q_value = 0

                ret = sess.run(ops, feed_dict=feed_dict)
                action = ret[0][0]

                # prevent the agent from doing nothing
                if not TRAIN:
                    if last_score == score:
                        cnt_same_state += 1

                        if cnt_same_state >= 50:
                            action = 1 # FIRE
                            cnt_same_state = 0
                    else:
                        cnt_same_state = 0

                    last_score = score

                if len(D) >= OBSERVE:
                    total_max_q_value += ret[1][0]
                    cnt_max_q_value += 1

                if TRAIN:
                    for summary in ret[2:]:
                        summary_writer.add_summary(summary, step)
                else:
                    print(ret[-1])
                    print(ACTION_MEANINGS[action], '\t' if len(ACTION_MEANINGS[action]) >= 8 else '\t\t', ret[1][0])

            observation, reward, done, info = env.step(action)
            score += reward

            if len(D) >= OBSERVE:
                total_reward += reward
                cnt_reward += 1

                if cnt_reward == (LOG_INTERVAL*10):
                    summary_writer.add_summary(sess.run(reward_summary, feed_dict={ph_avg_reward: total_reward/cnt_reward}), step)
                    total_reward = 0
                    cnt_reward = 0

        # episode done
        if len(D) >= OBSERVE:
            total_score_per_episode += score
            cnt_score_per_episode += 1

            if cnt_score_per_episode == (LOG_INTERVAL//10):
                summary_writer.add_summary(sess.run(score_per_episode_summary, feed_dict={ph_avg_score_per_episode: total_score_per_episode/cnt_score_per_episode}), step)
                total_score_per_episode = 0
                cnt_score_per_episode = 0

def main():
    global sess, saver, summary_writer, D, D_index, step
    sess, saver = start_session()
    step = sess.run(global_step)

    summary_writer = tf.train.SummaryWriter('logdir', sess.graph)
    coord = tf.train.Coordinator()

    D = [] # replay memory
    D_index = 0

    if TRAIN:
        D_lock = threading.Lock()

        agent_coord = tf.train.Coordinator()
        agent_threads = []

        for i in range(NUM_AGENT_THREAD):
            agent_thread = threading.Thread(target=agent_worker, args=(agent_coord, D_lock))
            agent_thread.start()
            agent_threads.append(agent_thread)

        print("Waiting for initial observation")

        while len(D) < (OBSERVE if not BENCHMARK else BATCH_SIZE):
            print("Current len(D):", len(D))
            time.sleep(1)

        agent_coord.request_stop()
        agent_coord.join(agent_threads)

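    # The worker threads above (TRAIN mode only) mainly pre-fill the replay memory up to
    # OBSERVE transitions (BATCH_SIZE when benchmarking); from here on a single
    # agent_worker runs in this thread and does the training when TRAIN is set.
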
    try:
        agent_worker(coord)
    except Exception as e:
        print(e)
        # Report exceptions to the coordinator.
        coord.request_stop(e)
    finally:
        coord.request_stop()

        if TRAIN and not BENCHMARK:
            print('Received should_stop - Saving networks ...')
            save_networks(step)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------