├── .gitignore
├── LICENSE
├── README.md
└── main.py

/.gitignore:
--------------------------------------------------------------------------------
__pycache__
tmp
logdir
saved_networks*
var

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 SunYeop Lee

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# breakout-v0-player
A DQN agent that plays Breakout-v0 from OpenAI Gym (gym.openai.com).

* https://www.youtube.com/watch?v=wH48jrxm_5Q
* https://www.youtube.com/watch?v=OiMM9lKmOlQ

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
import datetime
import threading
import random
import time
import sys
import gym
env = gym.make('Breakout-v0')

CPU_ONLY = False
TRAIN = True
BENCHMARK = False

# command-line flags:
#   eval      - run a saved network without training (renders the game)
#   cpu       - build the convolutions with the NHWC data format so they can run on a CPU
#   benchmark - report training throughput and skip autosaving
if 'eval' in sys.argv:
    TRAIN = False
if 'cpu' in sys.argv:
    CPU_ONLY = True
if 'benchmark' in sys.argv:
    BENCHMARK = True

NUM_AGENT_THREAD = 4
LOG_INTERVAL = 1000
SAVE_INTERVAL = 50000

# hyperparameter settings
GAMMA = .95
LEARNING_RATE = .0002
DECAY_RATE = .99
MOMENTUM = 0
EPSILON = 1e-6

BATCH_SIZE = 32
OBSERVE = 50000
ACTION_HISTORY_LENGTH = 4
MAX_EXPLORE_FRAMES = 1000000
MIN_EXPLORE_RATE = .10
MAX_D_SIZE = 1000000 # maximum size of replay queue
C = 10000 # target-network reset interval (in training steps)
SCREEN_DIMS = 84, 84

NUM_ACTIONS = env.action_space.n
ACTION_MEANINGS = env.get_action_meanings()

env = None # the temporary env above is only needed to read the action space

print('breakout-v0-player is running with TRAIN=%s' % TRAIN)

def conv2d(x, W, s, cpu_only=False):
    cpu_only = CPU_ONLY or cpu_only
    # stride layout follows the data format: NHWC -> [1, s, s, 1], NCHW -> [1, 1, s, s]
    return tf.nn.conv2d(x, W, strides=[1, s, s, 1] if cpu_only else [1, 1, s, s], padding='VALID', data_format='NHWC' if cpu_only else 'NCHW')

def weight_variable(shape, name=None):
    initial = tf.truncated_normal(shape, stddev=0.02)
    return tf.Variable(initial, name=name)

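# Illustration only (not used by the training code): a quick check of the 'VALID'
# convolution arithmetic used by create_q() below. With 84x84 inputs the three conv
# layers produce 20x20, 9x9 and 7x7 feature maps, so the flattened conv3 output has
# 64 * 7 * 7 = 3136 features.
def _valid_conv_output_size(size, kernel, stride):
    return (size - kernel) // stride + 1

assert _valid_conv_output_size(84, 8, 4) == 20
assert _valid_conv_output_size(20, 4, 2) == 9
assert _valid_conv_output_size(9, 3, 1) == 7
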
def bias_variable(shape, name=None):
    initial = tf.constant(0.01, shape=shape)
    return tf.Variable(initial, name=name)

def create_q(state, weights=None, cpu_only=False):
    cpu_only = CPU_ONLY or cpu_only

    if weights is not None:
        w_conv1, b_conv1, w_conv2, b_conv2, w_conv3, b_conv3, w_fc1, b_fc1, w_fc2, b_fc2 = weights

    if cpu_only:
        state = tf.transpose(state, perm=[0,2,3,1])

    # state: (x_1, x_2, ... x_n) of shape [-1, ACTION_HISTORY_LENGTH, HEIGHT, WIDTH]
    with tf.name_scope('conv1'):
        if weights is None:
            w_conv1 = weight_variable([8, 8, ACTION_HISTORY_LENGTH, 32], name='w_conv1')
            b_conv1 = bias_variable([32], name='b_conv1')
        h_conv1 = tf.nn.relu(tf.nn.bias_add(conv2d(state, w_conv1, 4, cpu_only), b_conv1, data_format='NHWC' if cpu_only else 'NCHW'))

    with tf.name_scope('conv2'):
        if weights is None:
            w_conv2 = weight_variable([4, 4, 32, 64], name='w_conv2')
            b_conv2 = bias_variable([64])
        h_conv2 = tf.nn.relu(tf.nn.bias_add(conv2d(h_conv1, w_conv2, 2, cpu_only), b_conv2, data_format='NHWC' if cpu_only else 'NCHW'))

    with tf.name_scope('conv3'):
        if weights is None:
            w_conv3 = weight_variable([3, 3, 64, 64], name='w_conv3')
            b_conv3 = bias_variable([64])
        h_conv3 = tf.nn.relu(tf.nn.bias_add(conv2d(h_conv2, w_conv3, 1, cpu_only), b_conv3, data_format='NHWC' if cpu_only else 'NCHW'))

    if cpu_only:
        h_conv3 = tf.transpose(h_conv3, perm=[0,3,1,2])

    shape = h_conv3.get_shape().as_list()
    H, W = shape[2], shape[3]
    h_conv3_flattened = tf.reshape(h_conv3, [-1, 64*H*W], name='h_conv3_flatten')

    with tf.name_scope('fc1'):
        if weights is None:
            w_fc1 = weight_variable([64*H*W, 512])
            b_fc1 = bias_variable([512])
        h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flattened, w_fc1) + b_fc1)

    with tf.name_scope('fc2'):
        if weights is None:
            w_fc2 = weight_variable([512, NUM_ACTIONS])
            b_fc2 = bias_variable([NUM_ACTIONS])
        h_fc2 = tf.matmul(h_fc1, w_fc2) + b_fc2

    return h_fc2, (w_conv1, b_conv1, w_conv2, b_conv2, w_conv3, b_conv3, w_fc1, b_fc1, w_fc2, b_fc2)

def create_predicted_action(q_values):
    return tf.argmax(q_values, 1)

def create_max_q(q_values):
    return tf.reduce_max(q_values, reduction_indices=1)

def create_q_reduced_by_action(q_values, a):
    one_hot_encoded_a = tf.one_hot(a, NUM_ACTIONS, 1., 0.)
    q_value = tf.reduce_sum(q_values * one_hot_encoded_a, reduction_indices=1)
    return q_value

def create_loss(q_values, y, a):
    q_value = create_q_reduced_by_action(q_values, a)
    loss = tf.reduce_mean(tf.square(y - q_value))
    return loss

def create_train_op(loss):
    return tf.train.RMSPropOptimizer(LEARNING_RATE, DECAY_RATE, MOMENTUM, EPSILON).minimize(loss)

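# Illustration only (not part of the graph): create_loss()/train_step() regress the
# chosen action's Q-value toward the standard Q-learning target
#     y = r                                    for terminal transitions,
#     y = r + GAMMA * max_a' Q_target(s', a')  otherwise,
# where Q_target is the periodically refreshed copy of the network (theta_m1 below).
def _example_td_target(reward, done, max_target_q_value):
    return reward if done else reward + GAMMA * max_target_q_value
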
def create_preprocess(x):
    grayscale = tf.image.rgb_to_grayscale(x)
    resized = tf.image.resize_images(grayscale, *SCREEN_DIMS)/255.
    return resized

def start_session():
    global global_step, ph_new_global_step, assign_global_step
    global ph_state, ph_x
    global _preprocess, predicted_action, q_values, max_q, predicted_action_cpu, q_values_cpu, max_q_cpu
    global gamma_max_target_q, reset_target_q, gamma_target_q_reduced_by_action, predict_by_double_dqn
    global ph_y, ph_a
    global loss, train_op
    global input_summary, ph_avg_reward, reward_summary, ph_avg_score_per_episode, score_per_episode_summary, ph_avg_loss, loss_summary, ph_avg_max_q_value, max_q_value_summary, ph_exploration_rate, exploration_rate_summary

    with tf.Graph().as_default() as g:
        global_step = tf.Variable(0, name='step', trainable=False)
        ph_new_global_step = tf.placeholder(tf.int32, shape=[], name='new_global_step')
        assign_global_step = tf.assign(global_step, ph_new_global_step, name='assign_global_step')

        with tf.name_scope('input'):
            # preprocessed state (x_1, x_2, ..., x_n)
            ph_x = tf.placeholder(tf.int32, shape=[210, 160, 3])
            ph_state = tf.placeholder(tf.float32, shape=[None, ACTION_HISTORY_LENGTH, *SCREEN_DIMS], name='state')
            ph_y = tf.placeholder(tf.float32, shape=[None], name='y') # y = r or r + gamma * max_a' Q_target(s', a')
            ph_a = tf.placeholder(tf.int64, shape=[None], name='a') # actions

        with tf.device('/gpu:0'):
            with tf.name_scope('Q'):
                q_values, theta = create_q(ph_state)

            with tf.name_scope('pi'):
                predicted_action = create_predicted_action(q_values)

            with tf.name_scope('max_Q'):
                max_q = create_max_q(q_values)

            with tf.name_scope('target_Q'):
                # target network; its weights (theta_m1) are refreshed from theta every C steps
                target_q_values, theta_m1 = create_q(ph_state)

            with tf.name_scope('target_Q_reduced_by_action'):
                target_q_reduced_by_action = create_q_reduced_by_action(target_q_values, ph_a)

            with tf.name_scope('gamma_target_Q_reduced_by_action'):
                gamma_target_q_reduced_by_action = GAMMA * target_q_reduced_by_action

            with tf.name_scope('predict_by_double_dqn'):
                # Double DQN bootstrap: the target network evaluated at the online network's greedy action
                predict_by_double_dqn = GAMMA * create_q_reduced_by_action(target_q_values, predicted_action)

            with tf.name_scope('max_target_Q'):
                max_target_q = create_max_q(target_q_values)

            with tf.name_scope('gamma_max_target_Q'):
                gamma_max_target_q = GAMMA * max_target_q

            with tf.name_scope('reset_target_Q'):
                # copy the online weights (theta) into the target network (theta_m1)
                reset_target_q = tf.group(*(tf.assign(lvalue, rvalue) for lvalue, rvalue in zip(theta_m1, theta)))

            with tf.name_scope('loss'):
                loss = create_loss(q_values, ph_y, ph_a)

            with tf.name_scope('train'):
                train_op = create_train_op(loss)

        with tf.device('/cpu:0'):
            with tf.name_scope('preprocess'):
                _preprocess = create_preprocess(ph_x)

            with tf.name_scope('Q_cpu'):
                # NHWC copy of the inference ops, sharing the online weights theta
                q_values_cpu, _ = create_q(ph_state, theta, cpu_only=True)

            with tf.name_scope('pi_cpu'):
                predicted_action_cpu = create_predicted_action(q_values_cpu)

            with tf.name_scope('max_Q_cpu'):
                max_q_cpu = create_max_q(q_values_cpu)

        # summaries (each scalar reports a running average that the workers accumulate and reset)
        input_summary = tf.image_summary('input', tf.reshape(tf.transpose(ph_state[0:1,:,:,:], perm=[1,2,3,0]), [-1, *SCREEN_DIMS, 1]), max_images=ACTION_HISTORY_LENGTH)

        # updated in the act/observe loop
        ph_avg_reward = tf.placeholder(tf.float32, shape=[], name='avg_reward')
        reward_summary = tf.scalar_summary('_reward', ph_avg_reward)

        # updated at the end of each episode
        ph_avg_score_per_episode = tf.placeholder(tf.float32, shape=[], name='avg_score_per_episode')
        score_per_episode_summary = tf.scalar_summary('_score_per_episode', ph_avg_score_per_episode)

        # updated in train_step()
        ph_avg_loss = tf.placeholder(tf.float32, shape=[], name='avg_loss')
        loss_summary = tf.scalar_summary('_loss', ph_avg_loss)

        # updated in train_step()
        ph_exploration_rate = tf.placeholder(tf.float32, shape=[], name='exploration_rate')
        exploration_rate_summary = tf.scalar_summary('_exploration_rate', ph_exploration_rate)

        # updated at inference
        ph_avg_max_q_value = tf.placeholder(tf.float32, shape=[], name='avg_max_q_value')
        max_q_value_summary = tf.scalar_summary('_max_q_value', ph_avg_max_q_value)

        # start session
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        initializers = (tf.initialize_all_variables(), reset_target_q)

        saver = tf.train.Saver()
        checkpoint = tf.train.get_checkpoint_state("saved_networks")

        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print("Successfully loaded:", checkpoint.model_checkpoint_path)
        else:
            print("Could not find old network weights")
            import os
            assert os.path.isdir('saved_networks') # the save directory must already exist
            assert TRAIN # evaluation needs a saved network to load

            # initialize from scratch (only when no checkpoint was restored)
            for initializer in initializers:
                sess.run(initializer)

        g.finalize()

    return sess, saver

def save_networks(step):
    sess.run(assign_global_step, feed_dict={ph_new_global_step: step})
    saver.save(sess, 'saved_networks/' + 'network' + '-dqn', global_step=step)
    print('[%s] Successfully saved networks -' % datetime.datetime.now(), step)

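# Note on the schedule below (illustration only): the exploration rate anneals
# linearly from 1.0 at step 0 to MIN_EXPLORE_RATE (0.10) at MAX_EXPLORE_FRAMES
# (1,000,000) steps and stays there afterwards, e.g.
#   step       0 -> 1.00
#   step  500000 -> 1.0 + (0.10 - 1.0) * 0.5 = 0.55
#   step 1000000 -> 0.10 (clamped by max() from then on)
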
def get_exploration_rate():
    return max(MIN_EXPLORE_RATE, 1. + (MIN_EXPLORE_RATE - 1.) * step / MAX_EXPLORE_FRAMES)

def train_step():
    global step, st, ps
    global total_loss, cnt_loss

    minibatch = random.sample(D, BATCH_SIZE)

    state_batch = []
    action_batch = []
    y_batch = []
    undone_indices = []
    undone_state_p1 = []

    for i, (t_state, t_action, t_reward, t_state_p1, t_done) in enumerate(minibatch):
        state_batch.append(t_state)
        action_batch.append(t_action)
        y_batch.append(t_reward)

        if not t_done: # to calculate future rewards
            undone_indices.append(i)
            undone_state_p1.append(t_state_p1)

    # calculate future rewards (skipped if every sampled transition was terminal)
    if undone_state_p1:
        predicted_q_values = sess.run(gamma_max_target_q, feed_dict={ph_state: undone_state_p1})

        # double DQN
        #predicted_q_values = sess.run(predict_by_double_dqn, feed_dict={ph_state: undone_state_p1})

        for i, j in enumerate(undone_indices):
            y_batch[j] += predicted_q_values[i]

    # train
    _, current_loss = sess.run([train_op, loss], feed_dict={ph_y: y_batch, ph_state: state_batch, ph_a: action_batch})

    # log loss
    cnt_loss += 1
    total_loss += current_loss
    t_cnt_loss = cnt_loss

    if t_cnt_loss == (LOG_INTERVAL // 10): # TRAIN is always True when train_step() runs
        summary_writer.add_summary(sess.run(loss_summary, feed_dict={ph_avg_loss: total_loss/cnt_loss}), step)
        summary_writer.add_summary(sess.run(exploration_rate_summary, feed_dict={ph_exploration_rate: get_exploration_rate()}), step)

        total_loss = 0
        cnt_loss = 0

    step += 1

    if BENCHMARK and step % 100 == 0:
        print((step-ps)/(time.time()-st), 'iterations per second')
        st = time.time()
        ps = step

    if step % C == 0:
        sess.run(reset_target_q)

    if step % SAVE_INTERVAL == 0 and not BENCHMARK:
        print('Autosaving networks ...')
        save_networks(step)

def preprocess(x):
    return sess.run(_preprocess, feed_dict={ph_x: x})[:, :, 0]

def put_experience(s, a, r, s_p, t, D_lock=None):
    global D_index

    if D_lock:
        D_lock.acquire()

    new_exp = (s, a, r, s_p, t)

    if len(D) >= MAX_D_SIZE:
        # overwrite the oldest experience once the replay memory is full
        D[D_index] = new_exp
        D_index += 1
        if D_index == len(D):
            D_index = 0
    else:
        D.append(new_exp)

    if D_lock:
        D_lock.release()

def agent_worker(agent_coord, D_lock=None):
    assert OBSERVE <= MAX_D_SIZE

    global D, total_loss, cnt_loss, st, ps

    env = gym.make('Breakout-v0')
    # get_state(True) -> the ACTION_HISTORY_LENGTH most recent frames; get_state(False) -> the window one frame earlier
    get_state = lambda current: prev_ob_list[-ACTION_HISTORY_LENGTH:] if current else prev_ob_list[-ACTION_HISTORY_LENGTH-1:-1]

    total_reward = 0
    cnt_reward = 0

    total_score_per_episode = 0
    cnt_score_per_episode = 0

    total_max_q_value = 0
    cnt_max_q_value = 0

    total_loss = 0
    cnt_loss = 0

    # benchmark
    st = time.time()
    ps = step

    while not agent_coord.should_stop():
        # new episode
        observation = env.reset()
        done = None
        score = 0
        cnt_same_state = 0
        last_score = None

        prev_ob_list = [preprocess(observation)] * (ACTION_HISTORY_LENGTH - 1) # previous observations

        while not agent_coord.should_stop():
            prev_ob_list.append(preprocess(observation))

            if not TRAIN:
                env.render()

            if done is not None and TRAIN:
                put_experience(get_state(False), action, min(1, reward), get_state(True), done, D_lock) # rewards are clipped to at most 1

            if len(D) > (OBSERVE if not BENCHMARK else BATCH_SIZE):
                train_step()

            if done is not None and done:
                if not TRAIN:
                    print('score:', score)
                    time.sleep(1)
                break

            if TRAIN and (random.random() < get_exploration_rate()):
                action = env.action_space.sample()
            else:
                # evaluate
                ops = [predicted_action, max_q]

                if not TRAIN:
                    ops = [predicted_action, max_q, q_values]

                feed_dict = {ph_state: (get_state(True),)}

                if cnt_max_q_value == LOG_INTERVAL:
                    ops.extend([input_summary, max_q_value_summary])
                    feed_dict[ph_avg_max_q_value] = total_max_q_value / cnt_max_q_value
                    total_max_q_value = 0
                    cnt_max_q_value = 0

                ret = sess.run(ops, feed_dict=feed_dict)
                action = ret[0][0]

                # prevent the agent from doing nothing
                if not TRAIN:
                    if last_score == score:
                        cnt_same_state += 1

                        if cnt_same_state >= 50:
                            action = 1 # FIRE
                            cnt_same_state = 0
                    else:
                        cnt_same_state = 0

                    last_score = score

                if len(D) >= OBSERVE:
                    total_max_q_value += ret[1][0]
                    cnt_max_q_value += 1

                if TRAIN:
                    for summary in ret[2:]:
                        summary_writer.add_summary(summary, step)
                else:
                    print(ret[-1])
                    print(ACTION_MEANINGS[action], '\t' if len(ACTION_MEANINGS[action]) >= 8 else '\t\t', ret[1][0])

            observation, reward, done, info = env.step(action)
            score += reward

            if len(D) >= OBSERVE:
                total_reward += reward
                cnt_reward += 1

                if cnt_reward == (LOG_INTERVAL*10):
                    summary_writer.add_summary(sess.run(reward_summary, feed_dict={ph_avg_reward: total_reward/cnt_reward}), step)
                    total_reward = 0
                    cnt_reward = 0

        # episode done
        if len(D) >= OBSERVE:
            total_score_per_episode += score
            cnt_score_per_episode += 1

            if cnt_score_per_episode == (LOG_INTERVAL//10):
                summary_writer.add_summary(sess.run(score_per_episode_summary, feed_dict={ph_avg_score_per_episode: total_score_per_episode/cnt_score_per_episode}), step)
                total_score_per_episode = 0
                cnt_score_per_episode = 0

def main():
    global sess, saver, summary_writer, D, D_index, step
    sess, saver = start_session()
    step = sess.run(global_step)

    summary_writer = tf.train.SummaryWriter('logdir', sess.graph)
    coord = tf.train.Coordinator()

    D = [] # replay memory
    D_index = 0

    if TRAIN:
        D_lock = threading.Lock()

        agent_coord = tf.train.Coordinator()
        agent_threads = []

        for i in range(NUM_AGENT_THREAD):
            agent_thread = threading.Thread(target=agent_worker, args=(agent_coord, D_lock))
            agent_thread.start()
            agent_threads.append(agent_thread)

        print("Waiting for initial observation")

        while len(D) < (OBSERVE if not BENCHMARK else BATCH_SIZE):
            print("Current len(D):", len(D))
            time.sleep(1)

        agent_coord.request_stop()
        agent_coord.join(agent_threads)

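    # The worker threads above (TRAIN mode only) mainly pre-fill the replay memory up to
    # OBSERVE transitions (BATCH_SIZE when benchmarking); from here on a single
    # agent_worker runs in this thread and does the training when TRAIN is set.
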
    try:
        agent_worker(coord)
    except Exception as e:
        print(e)
        # Report exceptions to the coordinator.
        coord.request_stop(e)
    finally:
        coord.request_stop()

        if TRAIN and not BENCHMARK:
            print('Received should_stop - Saving networks ...')
            save_networks(step)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------