├── .gitignore ├── Agents ├── PySC2_A3C_AtariNet.py ├── PySC2_A3C_FullyConv.py ├── PySC2_A3C_old.py └── _model_old │ ├── checkpoint │ ├── model-27500.cptk.data-00000-of-00001 │ ├── model-27500.cptk.index │ └── model-27500.cptk.meta ├── Images ├── angle.png ├── poster.jpg ├── stall.png └── start_game.png ├── LICENSE ├── Notes ├── List of Action Argument Types.txt ├── Running an Agent.txt └── Total Action Space.txt ├── README.md └── ResearchLog ├── 2017-11-01.md ├── 2017-11-06.md └── 2017-11-07.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Directories creates from PySC2_A3Cagent.py 4 | Agents/train_* 5 | Agents/model 6 | -------------------------------------------------------------------------------- /Agents/PySC2_A3C_AtariNet.py: -------------------------------------------------------------------------------- 1 | """ 2 | PySC2_A3C_AtariNet.py 3 | A script for training and running an A3C agent on the PySC2 environment, with reference to DeepMind's paper: 4 | [1] Vinyals, Oriol, et al. "Starcraft II: A new challenge for reinforcement learning." arXiv preprint arXiv:1708.04782 (2017). 5 | Advantage estimation uses generalized advantage estimation from: 6 | [2] Schulman, John, et al. "High-dimensional continuous control using generalized advantage estimation." arXiv preprint arXiv:1506.02438 (2015). 7 | 8 | Credit goes to Arthur Juliani for providing for reference an implementation of A3C for the VizDoom environment 9 | https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2 10 | https://github.com/awjuliani/DeepRL-Agents 11 | """ 12 | 13 | import threading 14 | import multiprocessing 15 | import psutil 16 | import numpy as np 17 | import tensorflow as tf 18 | import scipy.signal 19 | from time import sleep 20 | import os 21 | import sys 22 | from absl import flags 23 | from absl.flags import FLAGS 24 | 25 | from pysc2.env import sc2_env 26 | from pysc2.env import environment 27 | from pysc2.lib import actions 28 | from pysc2.maps import mini_games 29 | 30 | """ 31 | Use the following command to launch Tensorboard: 32 | tensorboard --logdir=worker_0:'./train_0',worker_1:'./train_1',worker_2:'./train_2',worker_3:'./train_3' 33 | """ 34 | 35 | 36 | ## HELPER FUNCTIONS 37 | 38 | # Copies one set of variables to another. 39 | # Used to set worker network parameters to those of global network. 40 | def update_target_graph(from_scope,to_scope): 41 | from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) 42 | to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) 43 | op_holder = [] 44 | for from_var,to_var in zip(from_vars,to_vars): 45 | op_holder.append(to_var.assign(from_var)) 46 | return op_holder 47 | 48 | # Processes PySC2 observations 49 | def process_observation(observation, action_spec, observation_spec): 50 | # reward 51 | reward = observation.reward 52 | # features 53 | features = observation.observation 54 | spatial_features = ['minimap', 'screen'] 55 | variable_features = ['cargo', 'multi_select', 'build_queue'] 56 | available_actions = ['available_actions'] 57 | # the shapes of some features depend on the state (eg. 
shape of multi_select depends on number of units) 58 | # since tf requires fixed input shapes, we set a maximum size then pad the input if it falls short 59 | max_no = {'available_actions': len(action_spec.functions), 'cargo': 500, 'multi_select': 500, 'build_queue': 10} 60 | nonspatial_stack = [] 61 | for feature_label, feature in observation.observation.items(): 62 | if feature_label not in spatial_features + variable_features + available_actions: 63 | nonspatial_stack = np.concatenate((nonspatial_stack, feature.reshape(-1))) 64 | elif feature_label in variable_features: 65 | padded_feature = np.concatenate((feature.reshape(-1), np.zeros(max_no[feature_label] * observation_spec['single_select'][1] - len(feature.reshape(-1))))) 66 | nonspatial_stack = np.concatenate((nonspatial_stack, padded_feature)) 67 | elif feature_label in available_actions: 68 | available_actions_feature = [1 if action_id in feature else 0 for action_id in np.arange(max_no['available_actions'])] 69 | nonspatial_stack = np.concatenate((nonspatial_stack, available_actions_feature)) 70 | nonspatial_stack = np.expand_dims(nonspatial_stack, axis=0) 71 | # spatial_minimap features 72 | minimap_stack = np.expand_dims(np.stack(features['minimap'], axis=2), axis=0) 73 | # spatial_screen features 74 | screen_stack = np.expand_dims(np.stack(features['screen'], axis=2), axis=0) 75 | # is episode over? 76 | episode_end = observation.step_type == environment.StepType.LAST 77 | return reward, nonspatial_stack, minimap_stack, screen_stack, episode_end 78 | 79 | # Discounting function used to calculate discounted returns. 80 | def discount(x, gamma): 81 | return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] 82 | 83 | # Used to initialize weights for policy and value output layers 84 | def normalized_columns_initializer(std=1.0): 85 | def _initializer(shape, dtype=None, partition_info=None): 86 | out = np.random.randn(*shape).astype(np.float32) 87 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 88 | return tf.constant(out) 89 | return _initializer 90 | 91 | # Sample from a given distribution 92 | def sample_dist(dist): 93 | sample = np.random.choice(dist[0],p=dist[0]) 94 | sample = np.argmax(dist == sample) 95 | return sample 96 | 97 | ## ACTOR-CRITIC NETWORK 98 | 99 | class AC_Network(): 100 | def __init__(self, scope, trainer, action_spec, observation_spec): 101 | with tf.variable_scope(scope): 102 | # get size of features from action_spec and observation_spec 103 | nonspatial_size = 0 104 | spatial_features = ['minimap', 'screen'] 105 | initially_zero_features = {'cargo': 500, 'multi_select': 500, 'build_queue': 10, 'single_select': 1} 106 | for feature_name, feature_dim in observation_spec.items(): 107 | if feature_name not in spatial_features: 108 | if feature_name == 'available_actions': 109 | feature_size = len(action_spec.functions) 110 | elif feature_name in initially_zero_features: 111 | feature_size = initially_zero_features[feature_name] * feature_dim[1] 112 | else: 113 | feature_size = 1 114 | for dim in feature_dim: 115 | feature_size *= dim 116 | nonspatial_size += feature_size 117 | screen_channels = observation_spec['screen'][0] 118 | minimap_channels = observation_spec['minimap'][0] 119 | 120 | # Architecture here follows Atari-net Agent described in [1] Section 4.3 121 | 122 | self.inputs_nonspatial = tf.placeholder(shape=[None,nonspatial_size], dtype=tf.float32) 123 | self.inputs_spatial_screen = 
tf.placeholder(shape=[None,observation_spec['screen'][1],observation_spec['screen'][2],screen_channels], dtype=tf.float32) 124 | self.inputs_spatial_minimap = tf.placeholder(shape=[None,observation_spec['minimap'][1],observation_spec['minimap'][2],minimap_channels], dtype=tf.float32) 125 | 126 | self.nonspatial_dense = tf.layers.dense( 127 | inputs=self.inputs_nonspatial, 128 | units=32, 129 | activation=tf.tanh) 130 | self.screen_conv1 = tf.layers.conv2d( 131 | inputs=self.inputs_spatial_screen, 132 | filters=16, 133 | kernel_size=[8,8], 134 | strides=[4,4], 135 | padding='valid', 136 | activation=tf.nn.relu) 137 | self.screen_conv2 = tf.layers.conv2d( 138 | inputs=self.screen_conv1, 139 | filters=32, 140 | kernel_size=[4,4], 141 | strides=[2,2], 142 | padding='valid', 143 | activation=tf.nn.relu) 144 | self.minimap_conv1 = tf.layers.conv2d( 145 | inputs=self.inputs_spatial_minimap, 146 | filters=16, 147 | kernel_size=[8,8], 148 | strides=[4,4], 149 | padding='valid', 150 | activation=tf.nn.relu) 151 | self.minimap_conv2 = tf.layers.conv2d( 152 | inputs=self.minimap_conv1, 153 | filters=32, 154 | kernel_size=[4,4], 155 | strides=[2,2], 156 | padding='valid', 157 | activation=tf.nn.relu) 158 | 159 | # According to [1]: "The results are concatenated and sent through a linear layer with a ReLU activation." 160 | 161 | screen_output_length = 1 162 | for dim in self.screen_conv2.get_shape().as_list()[1:]: 163 | screen_output_length *= dim 164 | minimap_output_length = 1 165 | for dim in self.minimap_conv2.get_shape().as_list()[1:]: 166 | minimap_output_length *= dim 167 | 168 | self.latent_vector = tf.layers.dense( 169 | inputs=tf.concat([self.nonspatial_dense, tf.reshape(self.screen_conv2,shape=[-1,screen_output_length]), tf.reshape(self.minimap_conv2,shape=[-1,minimap_output_length])], axis=1), 170 | units=256, 171 | activation=tf.nn.relu) 172 | 173 | # Output layers for policy and value estimations 174 | # 1 policy network for base actions 175 | # 16 policy networks for arguments 176 | # - All modeled independently 177 | # - Spatial arguments have the x and y values modeled independently as well 178 | # 1 value network 179 | self.policy_base_actions = tf.layers.dense( 180 | inputs=self.latent_vector, 181 | units=len(action_spec.functions), 182 | activation=tf.nn.softmax, 183 | kernel_initializer=normalized_columns_initializer(0.01)) 184 | self.policy_arg = dict() 185 | for arg in action_spec.types: 186 | self.policy_arg[arg.name] = dict() 187 | for dim, size in enumerate(arg.sizes): 188 | self.policy_arg[arg.name][dim] = tf.layers.dense( 189 | inputs=self.latent_vector, 190 | units=size, 191 | activation=tf.nn.softmax, 192 | kernel_initializer=normalized_columns_initializer(0.01)) 193 | self.value = tf.layers.dense( 194 | inputs=self.latent_vector, 195 | units=1, 196 | kernel_initializer=normalized_columns_initializer(1.0)) 197 | 198 | # Only the worker network need ops for loss functions and gradient updating. 
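# Added note (descriptive comment): the worker-only block below builds the A3C losses.
#   value_loss  = 0.5 * sum_t (R_t - V(s_t))^2
#   policy_loss = - sum_t log pi(a_t|s_t) * A_t, summed over the base-action head
#                 and every argument head (each head is modeled independently)
#   entropy     = sum of the entropies of all heads (exploration bonus)
#   loss        = 0.5 * value_loss + policy_loss - 0.01 * entropy
# The "responsible outputs" pick out pi(a_t|s_t) with a one-hot mask, e.g. for a
# 3-way head with policy [0.2, 0.5, 0.3] and chosen action 1:
#   reduce_sum([0.2, 0.5, 0.3] * [0., 1., 0.]) == 0.5
# so that step contributes -log(0.5) * A_t to the policy loss.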
199 | if scope != 'global': 200 | self.actions_base = tf.placeholder(shape=[None],dtype=tf.int32) 201 | self.actions_onehot_base = tf.one_hot(self.actions_base,524,dtype=tf.float32) 202 | 203 | self.actions_arg = dict() 204 | self.actions_onehot_arg = dict() 205 | for arg in action_spec.types: 206 | self.actions_arg[arg.name] = dict() 207 | self.actions_onehot_arg[arg.name] = dict() 208 | for dim, size in enumerate(arg.sizes): 209 | self.actions_arg[arg.name][dim] = tf.placeholder(shape=[None],dtype=tf.int32) 210 | self.actions_onehot_arg[arg.name][dim] = tf.one_hot(self.actions_arg[arg.name][dim],size,dtype=tf.float32) 211 | 212 | self.target_v = tf.placeholder(shape=[None],dtype=tf.float32) 213 | self.advantages = tf.placeholder(shape=[None],dtype=tf.float32) 214 | 215 | self.responsible_outputs_base = tf.reduce_sum(self.policy_base_actions * self.actions_onehot_base, [1]) 216 | 217 | self.responsible_outputs_arg = dict() 218 | for arg in action_spec.types: 219 | self.responsible_outputs_arg[arg.name] = dict() 220 | for dim, size in enumerate(arg.sizes): 221 | self.responsible_outputs_arg[arg.name][dim] = tf.reduce_sum(self.policy_arg[arg.name][dim] * self.actions_onehot_arg[arg.name][dim], [1]) 222 | 223 | # Loss functions 224 | self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1]))) 225 | 226 | self.entropy_base = - tf.reduce_sum(self.policy_base_actions * tf.log(tf.clip_by_value(self.policy_base_actions, 1e-20, 1.0))) # avoid NaN with clipping when value in policy becomes zero 227 | 228 | self.entropy_arg = dict() 229 | for arg in action_spec.types: 230 | self.entropy_arg[arg.name] = dict() 231 | for dim, size in enumerate(arg.sizes): 232 | self.entropy_arg[arg.name][dim] = - tf.reduce_sum(self.policy_arg[arg.name][dim] * tf.log(tf.clip_by_value(self.policy_arg[arg.name][dim], 1e-20, 1.))) 233 | 234 | self.entropy = self.entropy_base 235 | for arg in action_spec.types: 236 | for dim, size in enumerate(arg.sizes): 237 | self.entropy += self.entropy_arg[arg.name][dim] 238 | 239 | self.policy_loss_base = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_base, 1e-20, 1.0))*self.advantages) 240 | 241 | self.policy_loss_arg = dict() 242 | for arg in action_spec.types: 243 | self.policy_loss_arg[arg.name] = dict() 244 | for dim, size in enumerate(arg.sizes): 245 | self.policy_loss_arg[arg.name][dim] = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg[arg.name][dim], 1e-20, 1.0)) * self.advantages) 246 | 247 | self.policy_loss = self.policy_loss_base 248 | for arg in action_spec.types: 249 | for dim, size in enumerate(arg.sizes): 250 | self.policy_loss += self.policy_loss_arg[arg.name][dim] 251 | 252 | self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01 253 | 254 | # Get gradients from local network using local losses 255 | local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 256 | # self.gradients - gradients of loss wrt local_vars 257 | self.gradients = tf.gradients(self.loss,local_vars) 258 | self.var_norms = tf.global_norm(local_vars) 259 | grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0) 260 | 261 | # Apply local gradients to global network 262 | global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global') 263 | self.apply_grads = trainer.apply_gradients(zip(grads,global_vars)) 264 | 265 | ## WORKER AGENT 266 | 267 | class Worker(): 268 | def __init__(self,name,trainer,model_path,global_episodes, map_name, action_spec, observation_spec): 269 
| self.name = "worker_" + str(name) 270 | self.number = name 271 | self.model_path = model_path 272 | self.trainer = trainer 273 | self.global_episodes = global_episodes 274 | self.increment = self.global_episodes.assign_add(1) 275 | self.episode_rewards = [] 276 | self.episode_lengths = [] 277 | self.episode_mean_values = [] 278 | self.summary_writer = tf.summary.FileWriter("train_"+str(self.number)) 279 | 280 | # Create the local copy of the network and the tensorflow op to copy global paramters to local network 281 | self.local_AC = AC_Network(self.name,trainer,action_spec,observation_spec) 282 | self.update_local_ops = update_target_graph('global',self.name) 283 | 284 | print('Initializing environment #{}...'.format(self.number)) 285 | self.env = sc2_env.SC2Env(map_name=map_name) 286 | 287 | self.action_spec = action_spec 288 | self.observation_spec = observation_spec 289 | 290 | 291 | def train(self,rollout,sess,gamma,bootstrap_value): 292 | rollout = np.array(rollout) 293 | obs_screen = rollout[:,0] 294 | obs_minimap = rollout[:,1] 295 | obs_nonspatial = rollout[:,2] 296 | actions_base = rollout[:,3] 297 | actions_args = rollout[:,4] 298 | rewards = rollout[:,5] 299 | next_obs_screen = rollout[:,6] 300 | next_obs_minimap = rollout[:,7] 301 | next_obs_nonspatial = rollout[:,8] 302 | values = rollout[:,10] 303 | 304 | actions_arg_stack = dict() 305 | for actions_arg in actions_args: 306 | for arg_name,arg in actions_arg.items(): 307 | if arg_name not in actions_arg_stack: 308 | actions_arg_stack[arg_name] = dict() 309 | for dim, value in arg.items(): 310 | if dim not in actions_arg_stack[arg_name]: 311 | actions_arg_stack[arg_name][dim] = [] 312 | actions_arg_stack[arg_name][dim].append(value) 313 | 314 | # Here we take the rewards and values from the rollout, and use them to calculate the advantage and discounted returns 315 | # The advantage function uses generalized advantage estimation from [2] 316 | self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value]) 317 | discounted_rewards = discount(self.rewards_plus,gamma)[:-1] 318 | self.value_plus = np.asarray(values.tolist() + [bootstrap_value]) 319 | advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1] 320 | advantages = discount(advantages,gamma) 321 | 322 | # Update the global network using gradients from loss 323 | # Generate network statistics to periodically save 324 | feed_dict = {self.local_AC.target_v:discounted_rewards, 325 | self.local_AC.inputs_spatial_screen:np.stack(obs_screen).reshape(-1,64,64,17), 326 | self.local_AC.inputs_spatial_minimap:np.stack(obs_minimap).reshape(-1,64,64,7), 327 | self.local_AC.inputs_nonspatial:np.stack(obs_nonspatial).reshape(-1,7647), 328 | self.local_AC.actions_base:actions_base, 329 | self.local_AC.advantages:advantages} 330 | 331 | for arg_name, arg in actions_arg_stack.items(): 332 | for dim, value in arg.items(): 333 | feed_dict[self.local_AC.actions_arg[arg_name][dim]] = value 334 | 335 | v_l,p_l,e_l,g_n,v_n, _ = sess.run([self.local_AC.value_loss, 336 | self.local_AC.policy_loss, 337 | self.local_AC.entropy, 338 | self.local_AC.grad_norms, 339 | self.local_AC.var_norms, 340 | self.local_AC.apply_grads], 341 | feed_dict=feed_dict) 342 | return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n 343 | 344 | def work(self,max_episode_length,gamma,sess,coord,saver): 345 | episode_count = sess.run(self.global_episodes) 346 | total_steps = 0 347 | print ("Starting worker " + str(self.number)) 348 | with sess.as_default(), sess.graph.as_default(): 
349 | while not coord.should_stop(): 350 | # Download copy of parameters from global network 351 | sess.run(self.update_local_ops) 352 | 353 | episode_buffer = [] 354 | episode_values = [] 355 | episode_frames = [] 356 | episode_reward = 0 357 | episode_step_count = 0 358 | episode_end = False 359 | 360 | # Start new episode 361 | obs = self.env.reset() 362 | episode_frames.append(obs[0]) 363 | reward, nonspatial_stack, minimap_stack, screen_stack, episode_end = process_observation(obs[0], self.action_spec, self.observation_spec) 364 | s_screen = screen_stack 365 | s_minimap = minimap_stack 366 | s_nonspatial = nonspatial_stack 367 | 368 | while not episode_end: 369 | 370 | # Take an action using distributions from policy networks' outputs 371 | base_action_dist, arg_dist, v = sess.run([self.local_AC.policy_base_actions, self.local_AC.policy_arg, self.local_AC.value], 372 | feed_dict={self.local_AC.inputs_spatial_screen: screen_stack, 373 | self.local_AC.inputs_spatial_minimap: minimap_stack, 374 | self.local_AC.inputs_nonspatial: nonspatial_stack}) 375 | 376 | # Apply filter to remove unavailable actions and then renormalize 377 | for action_id, action_prob in enumerate(base_action_dist[0]): 378 | if action_id not in obs[0].observation['available_actions']: 379 | base_action_dist[0][action_id] = 0 380 | if np.sum(base_action_dist[0]) != 1: 381 | current_sum = np.sum(base_action_dist[0]) 382 | base_action_dist[0] /= current_sum 383 | 384 | base_action = sample_dist(base_action_dist) 385 | arg_sample = dict() 386 | for arg in arg_dist: 387 | arg_sample[arg] = dict() 388 | for dim in arg_dist[arg]: 389 | arg_sample[arg][dim] = sample_dist(arg_dist[arg][dim]) 390 | 391 | arguments = [] 392 | for arg in self.action_spec.functions[base_action].args: 393 | arg_value = [] 394 | for dim, size in enumerate(arg.sizes): 395 | arg_value.append(arg_sample[arg.name][dim]) 396 | arguments.append(arg_value) 397 | 398 | # Set unused arguments to -1 so that they won't be updated in the training 399 | # See documentation for tf.one_hot 400 | for arg_name, arg in arg_sample.items(): 401 | if arg_name not in self.action_spec.functions[base_action].args: 402 | for dim in arg: 403 | arg_sample[arg_name][dim] = -1 404 | 405 | a = actions.FunctionCall(base_action, arguments) 406 | obs = self.env.step(actions=[a]) 407 | r, nonspatial_stack, minimap_stack, screen_stack, episode_end = process_observation(obs[0], self.action_spec, self.observation_spec) 408 | 409 | if not episode_end: 410 | episode_frames.append(obs[0]) 411 | s1_screen = screen_stack 412 | s1_minimap = minimap_stack 413 | s1_nonspatial = nonspatial_stack 414 | else: 415 | s1_screen = s_screen 416 | s1_minimap = s_minimap 417 | s1_nonspatial = s_nonspatial 418 | 419 | # Append latest state to buffer 420 | episode_buffer.append([s_screen, s_minimap, s_nonspatial,base_action,arg_sample,r,s1_screen, s1_minimap, s1_nonspatial,episode_end,v[0,0]]) 421 | episode_values.append(v[0,0]) 422 | 423 | episode_reward += r 424 | s_screen = s1_screen 425 | s_minimap = s1_minimap 426 | s_nonspatial = s1_nonspatial 427 | total_steps += 1 428 | episode_step_count += 1 429 | 430 | # If the episode hasn't ended, but the experience buffer is full, then we make an update step using that experience rollout 431 | if len(episode_buffer) == 30 and not episode_end and episode_step_count != max_episode_length - 1: 432 | # Since we don't know what the true final return is, we "bootstrap" from our current value estimation 433 | v1 = sess.run(self.local_AC.value, 434 | 
feed_dict={self.local_AC.inputs_spatial_screen: screen_stack,self.local_AC.inputs_spatial_minimap: minimap_stack,self.local_AC.inputs_nonspatial: nonspatial_stack})[0,0] 435 | v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1) 436 | episode_buffer = [] 437 | sess.run(self.update_local_ops) 438 | if episode_end: 439 | break 440 | 441 | self.episode_rewards.append(episode_reward) 442 | self.episode_lengths.append(episode_step_count) 443 | self.episode_mean_values.append(np.mean(episode_values)) 444 | episode_count += 1 445 | 446 | global _max_score, _running_avg_score, _episodes, _steps 447 | if _max_score < episode_reward: 448 | _max_score = episode_reward 449 | _running_avg_score = (2.0 / 101) * (episode_reward - _running_avg_score) + _running_avg_score 450 | _episodes[self.number] = episode_count 451 | _steps[self.number] = total_steps 452 | 453 | print("{} Step #{} Episode #{} Reward: {}".format(self.name, total_steps, episode_count, episode_reward)) 454 | print("Total Steps: {}\tTotal Episodes: {}\tMax Score: {}\tAvg Score: {}".format(np.sum(_steps), np.sum(_episodes), _max_score, _running_avg_score)) 455 | 456 | # Update the network using the episode buffer at the end of the episode 457 | if len(episode_buffer) != 0: 458 | v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0) 459 | 460 | if episode_count % 5 == 0 and episode_count != 0: 461 | if episode_count % 250 == 0 and self.name == 'worker_0': 462 | saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk') 463 | print ("Saved Model") 464 | 465 | mean_reward = np.mean(self.episode_rewards[-5:]) 466 | mean_length = np.mean(self.episode_lengths[-5:]) 467 | mean_value = np.mean(self.episode_mean_values[-5:]) 468 | summary = tf.Summary() 469 | summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward)) 470 | summary.value.add(tag='Perf/Length', simple_value=float(mean_length)) 471 | summary.value.add(tag='Perf/Value', simple_value=float(mean_value)) 472 | summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l)) 473 | summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l)) 474 | summary.value.add(tag='Losses/Entropy', simple_value=float(e_l)) 475 | summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n)) 476 | summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n)) 477 | self.summary_writer.add_summary(summary, episode_count) 478 | 479 | self.summary_writer.flush() 480 | if self.name == 'worker_0': 481 | sess.run(self.increment) 482 | 483 | def main(): 484 | max_episode_length = 300 485 | gamma = .99 # Discount rate for advantage estimation and reward discounting 486 | load_model = False 487 | model_path = './model' 488 | map_name = FLAGS.map_name 489 | assert map_name in mini_games.mini_games 490 | 491 | print('Initializing temporary environment to retrive action_spec...') 492 | action_spec = sc2_env.SC2Env(map_name=map_name).action_spec() 493 | print('Initializing temporary environment to retrive observation_spec...') 494 | observation_spec = sc2_env.SC2Env(map_name=map_name).observation_spec() 495 | 496 | tf.reset_default_graph() 497 | 498 | if not os.path.exists(model_path): 499 | os.makedirs(model_path) 500 | 501 | with tf.device("/cpu:0"): 502 | global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False) 503 | trainer = tf.train.AdamOptimizer(learning_rate=1e-4) 504 | master_network = AC_Network('global',None, action_spec, observation_spec) # Generate global network 505 | #num_workers = multiprocessing.cpu_count() # 
Set workers to number of available CPU threads 506 | num_workers = psutil.cpu_count() # Set workers to number of available CPU threads 507 | global _max_score, _running_avg_score, _steps, _episodes 508 | _max_score = 0 509 | _running_avg_score = 0 510 | _steps = np.zeros(num_workers) 511 | _episodes = np.zeros(num_workers) 512 | workers = [] 513 | # Create worker classes 514 | for i in range(num_workers): 515 | workers.append(Worker(i,trainer,model_path,global_episodes, map_name, action_spec, observation_spec)) 516 | saver = tf.train.Saver(max_to_keep=5) 517 | 518 | with tf.Session() as sess: 519 | coord = tf.train.Coordinator() 520 | if load_model == True: 521 | print ('Loading Model...') 522 | ckpt = tf.train.get_checkpoint_state(model_path) 523 | saver.restore(sess,ckpt.model_checkpoint_path) 524 | else: 525 | sess.run(tf.global_variables_initializer()) 526 | 527 | # This is where the asynchronous magic happens 528 | # Start the "work" process for each worker in a separate thread 529 | worker_threads = [] 530 | for worker in workers: 531 | worker_work = lambda: worker.work(max_episode_length,gamma,sess,coord,saver) 532 | t = threading.Thread(target=(worker_work)) 533 | t.start() 534 | sleep(0.5) 535 | worker_threads.append(t) 536 | coord.join(worker_threads) 537 | 538 | 539 | if __name__ == '__main__': 540 | flags.DEFINE_string("map_name", "DefeatRoaches", "Name of the map/minigame") 541 | FLAGS(sys.argv) 542 | main() 543 | -------------------------------------------------------------------------------- /Agents/PySC2_A3C_FullyConv.py: -------------------------------------------------------------------------------- 1 | """ 2 | PySC2_A3C_AtariNetNew.py 3 | A script for training and running an A3C agent on the PySC2 environment, with reference to DeepMind's paper: 4 | [1] Vinyals, Oriol, et al. "Starcraft II: A new challenge for reinforcement learning." arXiv preprint arXiv:1708.04782 (2017). 5 | Advantage estimation uses generalized advantage estimation from: 6 | [2] Schulman, John, et al. "High-dimensional continuous control using generalized advantage estimation." arXiv preprint arXiv:1506.02438 (2015). 7 | 8 | Credit goes to Arthur Juliani for providing for reference an implementation of A3C for the VizDoom environment 9 | https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2 10 | https://github.com/awjuliani/DeepRL-Agents 11 | 12 | This follows the AtariNet implementation described in [1]. 13 | The agent takes as input all of the features and outputs a policy across all 524 actions, which makes it generalizable to any of the minigames supplied in SC2LE. 14 | """ 15 | 16 | import threading 17 | import multiprocessing 18 | import psutil 19 | import numpy as np 20 | import tensorflow as tf 21 | import scipy.signal 22 | from time import sleep 23 | import os 24 | import json 25 | import pickle 26 | 27 | from pysc2.env import sc2_env 28 | from pysc2.env import environment 29 | from pysc2.lib import actions 30 | from pysc2.maps import mini_games 31 | 32 | """ 33 | Use the following command to launch Tensorboard: 34 | tensorboard --logdir=worker_0:'./train_0',worker_1:'./train_1',worker_2:'./train_2',worker_3:'./train_3' 35 | """ 36 | 37 | ## HELPER FUNCTIONS 38 | 39 | # Copies one set of variables to another. 40 | # Used to set worker network parameters to those of global network. 
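# Added note (descriptive comment): update_target_graph returns a list of tf.assign
# ops that pair the trainable variables of `from_scope` and `to_scope` by position,
# so both scopes must build identical architectures in the same order. Typical usage
# (variable names here are illustrative only):
#   update_ops = update_target_graph('global', 'worker_0')
#   sess.run(update_ops)   # overwrite worker_0's weights with the global weights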
41 | def update_target_graph(from_scope,to_scope): 42 | from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) 43 | to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) 44 | op_holder = [] 45 | for from_var,to_var in zip(from_vars,to_vars): 46 | op_holder.append(to_var.assign(from_var)) 47 | return op_holder 48 | 49 | # Processes PySC2 observations 50 | def process_observation(observation, action_spec, observation_spec): 51 | # is episode over? 52 | episode_end = observation.step_type == environment.StepType.LAST 53 | # reward 54 | reward = observation.reward 55 | # features 56 | features = observation.observation 57 | variable_features = ['cargo', 'multi_select', 'build_queue'] 58 | max_no = {'available_actions': len(action_spec.functions), 'cargo': 100, 'multi_select': 100, 'build_queue': 10} 59 | # nonspatial features 60 | nonspatial_stack = [] 61 | nonspatial_stack = np.log(features['player'].reshape(-1) + 1.) 62 | nonspatial_stack = np.concatenate((nonspatial_stack, features['game_loop'].reshape(-1))) 63 | nonspatial_stack = np.expand_dims(nonspatial_stack, axis=0) 64 | # spatial_minimap features 65 | minimap_stack = np.stack((features['minimap']), axis=2) 66 | minimap_stack = np.expand_dims(minimap_stack, axis=0) 67 | # spatial_screen features 68 | screen_stack = np.stack((features['screen']), axis=2) 69 | screen_stack = np.expand_dims(screen_stack, axis=0) 70 | return reward, nonspatial_stack, minimap_stack, screen_stack, episode_end 71 | 72 | # Discounting function used to calculate discounted returns. 73 | def discount(x, gamma): 74 | return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] 75 | 76 | # Used to initialize weights for policy and value output layers 77 | def normalized_columns_initializer(std=1.0): 78 | def _initializer(shape, dtype=None, partition_info=None): 79 | out = np.random.randn(*shape).astype(np.float32) 80 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 81 | return tf.constant(out) 82 | return _initializer 83 | 84 | def sample_dist(dist): 85 | sample = np.random.choice(dist[0],p=dist[0]) 86 | sample = np.argmax(dist == sample) 87 | return sample 88 | 89 | ## ACTOR-CRITIC NETWORK 90 | 91 | class AC_Network(): 92 | def __init__(self, scope, trainer, action_spec, observation_spec): 93 | with tf.variable_scope(scope): 94 | # Architecture here follows Atari-net Agent described in [1] Section 4.3 95 | nonspatial_size = 12 96 | minimap_channels = 7 97 | screen_channels = 17 98 | 99 | self.inputs_nonspatial = tf.placeholder(shape=[None,nonspatial_size], dtype=tf.float32) 100 | self.inputs_spatial_minimap = tf.placeholder(shape=[None,64,64,minimap_channels], dtype=tf.float32) 101 | self.inputs_spatial_screen = tf.placeholder(shape=[None,64,64,screen_channels], dtype=tf.float32) 102 | self.nonspatial_dense = tf.layers.dense( 103 | inputs=self.inputs_nonspatial, 104 | units=32, 105 | activation=tf.tanh) 106 | self.screen_conv1 = tf.layers.conv2d( 107 | inputs=self.inputs_spatial_screen, 108 | filters=16, 109 | kernel_size=[5,5], 110 | strides=[1,1], 111 | padding='same', 112 | activation=tf.nn.relu) 113 | self.screen_conv2 = tf.layers.conv2d( 114 | inputs=self.screen_conv1, 115 | filters=32, 116 | kernel_size=[3,3], 117 | strides=[1,1], 118 | padding='same', 119 | activation=tf.nn.relu) 120 | self.minimap_conv1 = tf.layers.conv2d( 121 | inputs=self.inputs_spatial_minimap, 122 | filters=16, 123 | kernel_size=[5,5], 124 | strides=[1,1], 125 | padding='same', 126 | activation=tf.nn.relu) 127 | 
self.minimap_conv2 = tf.layers.conv2d( 128 | inputs=self.minimap_conv1, 129 | filters=32, 130 | kernel_size=[3,3], 131 | strides=[1,1], 132 | padding='same', 133 | activation=tf.nn.relu) 134 | screen_output_length = 1 135 | for dim in self.screen_conv2.get_shape().as_list()[1:]: 136 | screen_output_length *= dim 137 | minimap_output_length = 1 138 | for dim in self.minimap_conv2.get_shape().as_list()[1:]: 139 | minimap_output_length *= dim 140 | self.latent_vector_nonspatial = tf.layers.dense( 141 | inputs=tf.concat([self.nonspatial_dense, tf.reshape(self.screen_conv2,shape=[-1,screen_output_length]), tf.reshape(self.minimap_conv2,shape=[-1,minimap_output_length])], axis=1), 142 | units=256, 143 | activation=tf.nn.relu) 144 | 145 | # Output layers for policy and value estimations 146 | # 12 policy networks for base actions and arguments 147 | # - All modeled independently 148 | # - Spatial arguments have the x and y values modeled independently as well 149 | # 1 value network 150 | spatial_arguments = ['screen', 'minimap', 'screen2'] 151 | self.policy_base_actions = tf.layers.dense( 152 | inputs=self.latent_vector_nonspatial, 153 | units=len(action_spec.functions), 154 | activation=tf.nn.softmax, 155 | kernel_initializer=normalized_columns_initializer(0.01)) 156 | self.policy_arg_nonspatial = dict() 157 | for arg in action_spec.types: 158 | if arg.name not in spatial_arguments: 159 | self.policy_arg_nonspatial[arg.name] = dict() 160 | for dim, size in enumerate(arg.sizes): 161 | if size == 2: 162 | self.policy_arg_nonspatial[arg.name][dim] = tf.layers.dense( 163 | inputs=self.latent_vector_nonspatial, 164 | units=size, 165 | activation=tf.nn.softmax, 166 | kernel_initializer=normalized_columns_initializer(1.0)) 167 | else: 168 | self.policy_arg_nonspatial[arg.name][dim] = tf.layers.dense( 169 | inputs=self.latent_vector_nonspatial, 170 | units=size, 171 | activation=tf.nn.softmax, 172 | kernel_initializer=normalized_columns_initializer(0.01)) 173 | self.policy_arg_spatial = dict() 174 | self.latent_vector_spatial = dict() 175 | for arg in spatial_arguments: 176 | self.latent_vector_spatial[arg] = tf.layers.conv2d( 177 | inputs=tf.concat([self.screen_conv2, self.minimap_conv2], axis=3), 178 | filters=1, 179 | kernel_size=[1,1], 180 | strides=[1,1], 181 | padding='same', 182 | activation=None) 183 | self.policy_arg_spatial[arg] = tf.nn.softmax(tf.reshape(self.latent_vector_spatial[arg], shape=[-1, 64 * 64])) 184 | self.value = tf.layers.dense( 185 | inputs=self.latent_vector_nonspatial, 186 | units=1, 187 | kernel_initializer=normalized_columns_initializer(1.0)) 188 | # Only the worker network need ops for loss functions and gradient updating. 
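# Added note (descriptive comment): the worker-only block below mirrors the AtariNet
# agent's loss, with an extra family of heads for the spatial arguments ('screen',
# 'minimap', 'screen2'). Each spatial argument is treated as a single 64*64 = 4096-way
# categorical over flattened pixel positions; a sampled index i is decoded to
# coordinates as x = i % 64, y = i // 64 when the action is executed.
# The total loss here is value_loss + policy_loss - 0.001 * entropy.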
189 | # calculates the losses 190 | # self.gradients - gradients of loss wrt local_vars 191 | # applies the gradients to update the global network 192 | if scope != 'global': 193 | self.actions_base = tf.placeholder(shape=[None], dtype=tf.int32) 194 | self.actions_onehot_base = tf.one_hot(self.actions_base, 524, dtype=tf.float32) 195 | self.actions_arg = dict() 196 | self.actions_onehot_arg = dict() 197 | for arg in action_spec.types: 198 | if arg.name not in spatial_arguments: 199 | arg_name = arg.name 200 | self.actions_arg[arg_name] = dict() 201 | self.actions_onehot_arg[arg_name] = dict() 202 | for dim, size in enumerate(arg.sizes): 203 | self.actions_arg[arg_name][dim] = tf.placeholder(shape=[None], dtype=tf.int32) 204 | self.actions_onehot_arg[arg_name][dim] = tf.one_hot(self.actions_arg[arg_name][dim], size, dtype=tf.float32) 205 | self.actions_arg_spatial = dict() 206 | self.actions_onehot_arg_spatial = dict() 207 | for arg in spatial_arguments: 208 | self.actions_arg_spatial[arg] = tf.placeholder(shape=[None],dtype=tf.int32) 209 | self.actions_onehot_arg_spatial[arg] = tf.one_hot(self.actions_arg_spatial[arg], 64 * 64,dtype=tf.float32) 210 | self.target_v = tf.placeholder(shape=[None], dtype=tf.float32) 211 | self.advantages = tf.placeholder(shape=[None], dtype=tf.float32) 212 | 213 | self.responsible_outputs_base = tf.reduce_sum(self.policy_base_actions * self.actions_onehot_base, [1]) 214 | self.responsible_outputs_arg = dict() 215 | for arg_name in self.policy_arg_nonspatial: 216 | self.responsible_outputs_arg[arg_name] = dict() 217 | for dim in self.policy_arg_nonspatial[arg_name]: 218 | self.responsible_outputs_arg[arg_name][dim] = tf.reduce_sum(self.policy_arg_nonspatial[arg_name][dim] * self.actions_onehot_arg[arg_name][dim], [1]) 219 | self.responsible_outputs_arg_spatial = dict() 220 | for arg in spatial_arguments: 221 | self.responsible_outputs_arg_spatial[arg] = tf.reduce_sum(self.policy_arg_spatial[arg] * self.actions_onehot_arg_spatial[arg], [1]) 222 | 223 | # Loss functions 224 | self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1]))) 225 | 226 | self.log_policy_base_actions = tf.log(tf.clip_by_value(self.policy_base_actions, 1e-20, 1.0)) # avoid NaN with clipping when value in policy becomes zero 227 | self.entropy_base = - tf.reduce_sum(self.policy_base_actions * self.log_policy_base_actions) 228 | self.entropy_arg = dict() 229 | for arg_name in self.policy_arg_nonspatial: 230 | self.entropy_arg[arg_name] = dict() 231 | for dim in self.policy_arg_nonspatial[arg_name]: 232 | self.entropy_arg[arg_name][dim] = - tf.reduce_sum(self.policy_arg_nonspatial[arg_name][dim] * tf.log(tf.clip_by_value(self.policy_arg_nonspatial[arg_name][dim], 1e-20, 1.0))) 233 | self.entropy_arg_spatial = dict() 234 | for arg in spatial_arguments: 235 | self.entropy_arg_spatial[arg] = - tf.reduce_sum(self.policy_arg_spatial[arg] * tf.log(tf.clip_by_value(self.policy_arg_spatial[arg], 1e-20, 1.))) 236 | self.entropy = self.entropy_base 237 | for arg_name in self.policy_arg_nonspatial: 238 | for dim in self.policy_arg_nonspatial[arg_name]: 239 | self.entropy += self.entropy_arg[arg_name][dim] 240 | for arg in spatial_arguments: 241 | self.entropy += self.entropy_arg_spatial[arg] 242 | 243 | self.policy_loss_base = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_base, 1e-20, 1.0)) * self.advantages) 244 | self.policy_loss_arg = dict() 245 | for arg_name in self.policy_arg_nonspatial: 246 | self.policy_loss_arg[arg_name] = dict() 247 | for 
dim in self.policy_arg_nonspatial[arg_name]: 248 | self.policy_loss_arg[arg_name][dim] = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg[arg_name][dim], 1e-20, 1.0)) * self.advantages) 249 | self.policy_loss_arg_spatial = dict() 250 | for arg in spatial_arguments: 251 | self.policy_loss_arg_spatial[arg] = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_spatial[arg], 1e-20, 1.0))*self.advantages) 252 | self.policy_loss = self.policy_loss_base 253 | for arg_name in self.policy_arg_nonspatial: 254 | for dim in self.policy_arg_nonspatial[arg_name]: 255 | self.policy_loss += self.policy_loss_arg[arg_name][dim] 256 | for arg in spatial_arguments: 257 | self.policy_loss += self.policy_loss_arg_spatial[arg] 258 | 259 | self.loss = self.value_loss + self.policy_loss - self.entropy * 0.001 260 | 261 | # Get gradients from local network using local losses 262 | local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 263 | self.gradients = tf.gradients(self.loss,local_vars) 264 | self.var_norms = tf.global_norm(local_vars) 265 | grads, self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0) 266 | 267 | # Apply local gradients to global network 268 | global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global') 269 | self.apply_grads = trainer.apply_gradients(zip(grads, global_vars)) 270 | 271 | ## WORKER AGENT 272 | 273 | class Worker(): 274 | def __init__(self, name, trainer, model_path, global_episodes, global_steps, map_name, action_spec, observation_spec): 275 | self.name = "worker_" + str(name) 276 | self.number = name 277 | self.model_path = model_path 278 | self.trainer = trainer 279 | self.global_episodes = global_episodes 280 | self.increment_global_episodes = self.global_episodes.assign_add(1) 281 | self.global_steps = global_steps 282 | self.increment_global_steps = self.global_steps.assign_add(1) 283 | self.episode_rewards = [] 284 | self.episode_lengths = [] 285 | self.episode_mean_values = [] 286 | self.summary_writer = tf.summary.FileWriter("train_" + str(self.number)) 287 | 288 | self.action_spec = action_spec 289 | self.observation_spec = observation_spec 290 | 291 | #Create the local copy of the network and the tensorflow op to copy global paramters to local network 292 | self.local_AC = AC_Network(self.name, trainer, action_spec, observation_spec) 293 | self.update_local_ops = update_target_graph('global', self.name) 294 | 295 | print('Initializing environment #{}...'.format(self.number)) 296 | self.env = sc2_env.SC2Env(map_name=map_name) 297 | 298 | 299 | def train(self, rollout, sess, gamma, bootstrap_value): 300 | rollout = np.array(rollout) 301 | obs_minimap = rollout[:,0] 302 | obs_screen = rollout[:,1] 303 | obs_nonspatial = rollout[:,2] 304 | actions_base = rollout[:,3] 305 | actions_args = rollout[:,4] 306 | actions_args_spatial = rollout[:,5] 307 | rewards = rollout[:,6] 308 | next_obs_minimap = rollout[:,7] 309 | next_obs_screen = rollout[:,8] 310 | next_obs_nonspatial = rollout[:,9] 311 | values = rollout[:,11] 312 | 313 | actions_arg_stack = dict() 314 | for actions_arg in actions_args: 315 | for arg_name in actions_arg: 316 | if arg_name not in actions_arg_stack: 317 | actions_arg_stack[arg_name] = dict() 318 | for dim in actions_arg[arg_name]: 319 | if dim not in actions_arg_stack[arg_name]: 320 | actions_arg_stack[arg_name][dim] = [actions_arg[arg_name][dim]] 321 | else: 322 | actions_arg_stack[arg_name][dim].append(actions_arg[arg_name][dim]) 323 | actions_arg_spatial_stack = dict() 324 | 
for actions_arg_spatial in actions_args_spatial: 325 | for arg_name,arg_value in actions_arg_spatial.items(): 326 | if arg_name not in actions_arg_spatial_stack: 327 | actions_arg_spatial_stack[arg_name] = [] 328 | actions_arg_spatial_stack[arg_name].append(arg_value) 329 | 330 | # Here we take the rewards and values from the rollout, and use them to calculate the advantage and discounted returns. 331 | # The advantage function uses generalized advantage estimation from [2] 332 | self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value]) 333 | discounted_rewards = discount(self.rewards_plus,gamma)[:-1] 334 | self.value_plus = np.asarray(values.tolist() + [bootstrap_value]) 335 | advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1] 336 | advantages = discount(advantages,gamma) 337 | 338 | # Update the global network using gradients from loss 339 | # Generate network statistics to periodically save 340 | feed_dict = {self.local_AC.target_v:discounted_rewards, 341 | self.local_AC.inputs_spatial_screen:np.stack(obs_screen).reshape(-1,64,64,17), 342 | self.local_AC.inputs_spatial_minimap:np.stack(obs_minimap).reshape(-1,64,64,7), 343 | self.local_AC.inputs_nonspatial:np.stack(obs_nonspatial).reshape(-1,12), 344 | self.local_AC.actions_base:actions_base, 345 | self.local_AC.advantages:advantages} 346 | for arg_name in actions_arg_stack: 347 | for dim in actions_arg_stack[arg_name]: 348 | feed_dict[self.local_AC.actions_arg[arg_name][dim]] = actions_arg_stack[arg_name][dim] 349 | for arg_name, value in actions_arg_spatial_stack.items(): 350 | feed_dict[self.local_AC.actions_arg_spatial[arg_name]] = value 351 | v_l,p_l,e_l,g_n,v_n, _ = sess.run([self.local_AC.value_loss, 352 | self.local_AC.policy_loss, 353 | self.local_AC.entropy, 354 | self.local_AC.grad_norms, 355 | self.local_AC.var_norms, 356 | self.local_AC.apply_grads], 357 | feed_dict=feed_dict) 358 | return v_l / len(rollout), p_l / len(rollout), e_l / len(rollout), g_n,v_n 359 | 360 | def work(self,max_episode_length,gamma,sess,coord,saver): 361 | episode_count = sess.run(self.global_episodes) 362 | total_steps = 0 363 | print ("Starting worker " + str(self.number)) 364 | with sess.as_default(), sess.graph.as_default(): 365 | while not coord.should_stop(): 366 | # Download copy of parameters from global network 367 | sess.run(self.update_local_ops) 368 | 369 | episode_buffer = [] 370 | episode_values = [] 371 | episode_frames = [] 372 | episode_reward = 0 373 | episode_step_count = 0 374 | 375 | # Start new episode 376 | obs = self.env.reset() 377 | episode_frames.append(obs[0]) 378 | reward, nonspatial_stack, minimap_stack, screen_stack, episode_end = process_observation(obs[0], self.action_spec, self.observation_spec) 379 | s_minimap = minimap_stack 380 | s_screen = screen_stack 381 | s_nonspatial = nonspatial_stack 382 | 383 | while not episode_end: 384 | # Take an action using distributions from policy networks' outputs. 
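# Added note (descriptive comment): the sampling below first masks the base-action
# distribution with obs.observation['available_actions'] and then renormalizes. The
# 1e-20 added to every probability keeps the renormalization well-defined even if all
# available actions were assigned (numerically) zero probability. For example, with
# policy [0.7, 0.2, 0.1] and only actions {0, 2} available, masking gives
# [0.7, 0.0, 0.1] and renormalizing gives [0.875, 0.0, 0.125].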
385 | base_action_dist, arg_spatial_dist, arg_nonspatial_dist, v = sess.run([ 386 | self.local_AC.policy_base_actions, 387 | self.local_AC.policy_arg_spatial, 388 | self.local_AC.policy_arg_nonspatial, 389 | self.local_AC.value], 390 | feed_dict={ 391 | self.local_AC.inputs_spatial_minimap: minimap_stack, 392 | self.local_AC.inputs_spatial_screen: screen_stack, 393 | self.local_AC.inputs_nonspatial: nonspatial_stack}) 394 | 395 | # Apply filter to remove unavailable actions and then renormalize 396 | base_action_dist[0] += 1e-20 397 | for action_id, action in enumerate(base_action_dist[0]): 398 | if action_id not in obs[0].observation['available_actions']: 399 | base_action_dist[0][action_id] = 0. 400 | base_action_dist[0] /= np.sum(base_action_dist[0]) 401 | 402 | action_id = sample_dist(base_action_dist) 403 | 404 | arg_sample = dict() 405 | for arg_name in arg_nonspatial_dist: 406 | arg_sample[arg_name] = dict() 407 | for dim in arg_nonspatial_dist[arg_name]: 408 | arg_sample[arg_name][dim] = sample_dist(arg_nonspatial_dist[arg_name][dim]) 409 | arg_sample_spatial = dict() 410 | arg_sample_spatial_abs = dict() 411 | for arg in arg_spatial_dist: 412 | arg_sample_spatial_abs[arg] = sample_dist(arg_spatial_dist[arg]) 413 | arg_sample_spatial[arg] = [arg_sample_spatial_abs[arg] % 64, arg_sample_spatial_abs[arg] / 64] 414 | 415 | arguments = [] 416 | spatial_arguments = ['screen', 'minimap', 'screen2'] 417 | for argument in self.action_spec.functions[action_id].args: 418 | name = argument.name 419 | if name not in spatial_arguments: 420 | argument_value = [] 421 | for dim, size in enumerate(argument.sizes): 422 | argument_value.append(arg_sample[name][dim]) 423 | else: 424 | argument_value = arg_sample_spatial[name] 425 | arguments.append(argument_value) 426 | 427 | # Set unused arguments to -1 so that they won't be updated in the training 428 | # See documentation for tf.one_hot 429 | for arg_name, argument in arg_sample.items(): 430 | if arg_name not in self.action_spec.functions[action_id].args: 431 | for dim in argument: 432 | arg_sample[arg_name][dim] = -1 433 | for arg_name, arg in arg_sample_spatial_abs.items(): 434 | if arg_name not in self.action_spec.functions[action_id].args: 435 | arg_sample_spatial_abs[arg_name] = -1 436 | 437 | a = actions.FunctionCall(action_id, arguments) 438 | obs = self.env.step(actions=[a]) 439 | r, nonspatial_stack, minimap_stack, screen_stack, episode_end = process_observation(obs[0], self.action_spec, self.observation_spec) 440 | 441 | if not episode_end: 442 | episode_frames.append(obs[0]) 443 | s1_minimap = minimap_stack 444 | s1_screen = screen_stack 445 | s1_nonspatial = nonspatial_stack 446 | else: 447 | s1_minimap = s_minimap 448 | s1_screen = s_screen 449 | s1_nonspatial = s_nonspatial 450 | 451 | # Append latest state to buffer 452 | episode_buffer.append([s_minimap, s_screen, s_nonspatial,action_id,arg_sample,arg_sample_spatial_abs,r,s1_minimap, s1_screen, s1_nonspatial,episode_end,v[0,0]]) 453 | episode_values.append(v[0,0]) 454 | 455 | episode_reward += r 456 | s_minimap = s1_minimap 457 | s_screen = s1_screen 458 | s_nonspatial = s1_nonspatial 459 | sess.run(self.increment_global_steps) 460 | total_steps += 1 461 | episode_step_count += 1 462 | 463 | # If the episode hasn't ended, but the experience buffer is full, then we make an update step using that experience rollout. 
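# Added note (descriptive comment): because this update happens mid-episode (buffer
# of 40 steps), the tail of the return is approximated with the critic ("bootstrapping"):
#   R_t = r_t + gamma * r_{t+1} + ... + gamma^(T-t) * V(s_T)
# where V(s_T) is the value estimate v1 computed below. At a true episode end the
# bootstrap value passed to self.train() is 0 instead.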
464 | if len(episode_buffer) == 40 and not episode_end and episode_step_count != max_episode_length - 1: 465 | # Since we don't know what the true final return is, we "bootstrap" from our current value estimation. 466 | v1 = sess.run(self.local_AC.value, 467 | feed_dict={self.local_AC.inputs_spatial_minimap: minimap_stack, self.local_AC.inputs_spatial_screen: screen_stack,self.local_AC.inputs_nonspatial: nonspatial_stack})[0,0] 468 | v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1) 469 | episode_buffer = [] 470 | sess.run(self.update_local_ops) 471 | if episode_end: 472 | break 473 | 474 | self.episode_rewards.append(episode_reward) 475 | self.episode_lengths.append(episode_step_count) 476 | self.episode_mean_values.append(np.mean(episode_values)) 477 | episode_count += 1 478 | 479 | episode_reward = obs[0].observation['score_cumulative'][0] 480 | 481 | global _max_score, _running_avg_score 482 | if _max_score < episode_reward: 483 | _max_score = episode_reward 484 | _running_avg_score = (2.0 / 101) * (episode_reward - _running_avg_score) + _running_avg_score 485 | 486 | print("{} Step #{} Episode #{} Reward: {}".format(self.name, total_steps, episode_count, episode_reward)) 487 | print("Total Steps: {}\tTotal Episodes: {}\tMax Score: {}\tAvg Score: {}".format(sess.run(self.global_steps), sess.run(self.global_episodes), _max_score, _running_avg_score)) 488 | 489 | # Update the network using the episode buffer at the end of the episode. 490 | if len(episode_buffer) != 0: 491 | v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.) 492 | 493 | if episode_count % 50 == 0 and episode_count != 0: 494 | if episode_count % 50 == 0 and self.name == 'worker_0': 495 | saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk') 496 | print ("Saved Model") 497 | 498 | mean_reward = np.mean(self.episode_rewards[-50:]) 499 | mean_length = np.mean(self.episode_lengths[-50:]) 500 | mean_value = np.mean(self.episode_mean_values[-50:]) 501 | summary = tf.Summary() 502 | summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward)) 503 | summary.value.add(tag='Perf/Length', simple_value=float(mean_length)) 504 | summary.value.add(tag='Perf/Value', simple_value=float(mean_value)) 505 | summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l)) 506 | summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l)) 507 | summary.value.add(tag='Losses/Entropy', simple_value=float(e_l)) 508 | summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n)) 509 | summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n)) 510 | self.summary_writer.add_summary(summary, episode_count) 511 | 512 | self.summary_writer.flush() 513 | 514 | sess.run(self.increment_global_episodes) 515 | 516 | def main(): 517 | max_episode_length = 300 518 | gamma = .99 # discount rate for advantage estimation and reward discounting 519 | load_model = FLAGS.load_model 520 | model_path = './model' 521 | map_name = FLAGS.map_name 522 | assert map_name in mini_games.mini_games 523 | 524 | global _max_score, _running_avg_score 525 | _max_score = 0 526 | _running_avg_score = 0 527 | 528 | print('Initializing temporary environment to retrive action_spec...') 529 | action_spec = sc2_env.SC2Env(map_name=map_name).action_spec() 530 | print('Initializing temporary environment to retrive observation_spec...') 531 | observation_spec = sc2_env.SC2Env(map_name=map_name).observation_spec() 532 | 533 | tf.reset_default_graph() 534 | 535 | if not os.path.exists(model_path): 536 | 
os.makedirs(model_path) 537 | 538 | with tf.device("/cpu:0"): 539 | global_episodes = tf.Variable(0, dtype=tf.int32, name='global_episodes', trainable=False) 540 | global_steps = tf.Variable(0, dtype=tf.int32, name='global_steps', trainable=False) 541 | trainer = tf.train.AdamOptimizer(learning_rate=3e-5) 542 | master_network = AC_Network('global', None, action_spec,observation_spec) # Generate global network 543 | if FLAGS.n_agents < 1: 544 | num_workers = psutil.cpu_count() # Set workers to number of available CPU threads 545 | else: 546 | num_workers = FLAGS.n_agents 547 | workers = [] 548 | # Create worker classes 549 | for i in range(num_workers): 550 | workers.append(Worker(i, trainer, model_path, global_episodes, global_steps, map_name, action_spec, observation_spec)) 551 | saver = tf.train.Saver(max_to_keep=5) 552 | 553 | with tf.Session() as sess: 554 | coord = tf.train.Coordinator() 555 | if load_model == True: 556 | print ('Loading Model...') 557 | ckpt = tf.train.get_checkpoint_state(model_path) 558 | saver.restore(sess, ckpt.model_checkpoint_path) 559 | else: 560 | sess.run(tf.global_variables_initializer()) 561 | 562 | # This is where the asynchronous magic happens. 563 | # Start the "work" process for each worker in a separate thread. 564 | worker_threads = [] 565 | for worker in workers: 566 | worker_work = lambda: worker.work(max_episode_length, gamma, sess, coord, saver) 567 | t = threading.Thread(target=(worker_work)) 568 | t.start() 569 | sleep(0.5) 570 | sleep(1.5) 571 | worker_threads.append(t) 572 | coord.join(worker_threads) 573 | 574 | if __name__ == '__main__': 575 | import sys 576 | from absl import flags 577 | flags.DEFINE_string(name="map_name", 578 | default="DefeatRoaches", 579 | help="Name of the map/minigame") 580 | flags.DEFINE_integer(name="n_agents", 581 | default=0, 582 | help="Number of agents; passing anything less than 1 will default to number of available CPU threads") 583 | flags.DEFINE_boolean(name="load_model", 584 | default=False, 585 | help="Load a saved model") 586 | FLAGS = flags.FLAGS 587 | FLAGS(sys.argv) 588 | main() 589 | -------------------------------------------------------------------------------- /Agents/PySC2_A3C_old.py: -------------------------------------------------------------------------------- 1 | """ 2 | PySC2_A3C_old.py 3 | A script for training and running an A3C agent on the PySC2 environment, with reference to DeepMind's paper: 4 | [1] Vinyals, Oriol, et al. "Starcraft II: A new challenge for reinforcement learning." arXiv preprint arXiv:1708.04782 (2017). 5 | Advantage estimation uses generalized advantage estimation from: 6 | [2] Schulman, John, et al. "High-dimensional continuous control using generalized advantage estimation." arXiv preprint arXiv:1506.02438 (2015). 
7 | 8 | Credit goes to Arthur Juliani for providing for reference an implementation of A3C for the VizDoom environment 9 | https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2 10 | https://github.com/awjuliani/DeepRL-Agents 11 | 12 | Note: 13 | Currently only works on the DefeatRoaches mini-game; work is in-progress for generalizing the script to run on all mini-games 14 | """ 15 | 16 | import threading 17 | import psutil 18 | import numpy as np 19 | import tensorflow as tf 20 | import scipy.signal 21 | from time import sleep 22 | import os 23 | 24 | from pysc2.env import sc2_env 25 | from pysc2.env import environment 26 | from pysc2.lib import actions 27 | 28 | """ 29 | Use the following command to launch Tensorboard: 30 | tensorboard --logdir=worker_0:'./train_0',worker_1:'./train_1',worker_2:'./train_2',worker_3:'./train_3' 31 | """ 32 | 33 | ## HELPER FUNCTIONS 34 | 35 | # Copies one set of variables to another. 36 | # Used to set worker network parameters to those of global network. 37 | def update_target_graph(from_scope,to_scope): 38 | from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, from_scope) 39 | to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, to_scope) 40 | op_holder = [] 41 | for from_var,to_var in zip(from_vars,to_vars): 42 | op_holder.append(to_var.assign(from_var)) 43 | return op_holder 44 | 45 | # Processes PySC2 observations 46 | def process_observation(observation): 47 | nonspatial_size = 727 48 | screen_channels = 7 49 | multi_select_max = 100 50 | # is episode over? 51 | episode_end = observation.step_type == environment.StepType.LAST 52 | # reward 53 | reward = observation.reward 54 | # features 55 | features = observation.observation 56 | # nonspatial features 57 | # TimeStep.observation['control_groups'](10,2) 58 | # TimeStep.observation['single_select'](1,7) 59 | # TimeStep.observation['multi_select'](n,7) 60 | nonspatial_stack = features['control_groups'].reshape(-1) 61 | nonspatial_stack = np.concatenate((nonspatial_stack, features['single_select'].reshape(-1))) 62 | multi_select = features['multi_select'].reshape(-1) 63 | # if multi_select has less than multi_select_max units, pad with zeros 64 | if len(multi_select) < multi_select_max * 7: 65 | multi_select = np.concatenate((multi_select, np.zeros(multi_select_max * 7 - len(multi_select)))) 66 | nonspatial_stack = np.concatenate((nonspatial_stack, multi_select)) 67 | # spatial_minimap features 68 | # not used for DefeatRoaches since no camera movement is required 69 | minimap_stack = None 70 | # spatial_screen features 71 | # TimeStep.observation['screen'][5] (player_relative) 72 | # TimeStep.observation['screen'][6] (unit_type) 73 | # TimeStep.observation['screen'][7] (selected) 74 | # TimeStep.observation['screen'][8] (unit_hit_points) 75 | # TimeStep.observation['screen'][9] (unit_hit_points_ratio) 76 | # TimeStep.observation['screen'][14] (unit_density) 77 | # TimeStep.observation['screen'][15] (unit_density_aa) 78 | screen_stack = np.stack((features['screen'][5], features['screen'][6], features['screen'][7], features['screen'][8], features['screen'][9], features['screen'][14], features['screen'][15]), axis=2) 79 | return reward, nonspatial_stack.reshape([-1,nonspatial_size]), minimap_stack, screen_stack.reshape([-1,64,64,screen_channels]), episode_end 80 | 81 | # Discounting function used to calculate discounted returns. 
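# Added note (descriptive comment): discount(x, gamma) returns y with
#   y[t] = sum_{k >= t} gamma**(k - t) * x[k],
# computed with a linear filter over the reversed sequence. For example:
#   discount([1., 1., 1.], 0.99)  ->  [2.9701, 1.99, 1.0]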
82 | def discount(x, gamma): 83 | return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1] 84 | 85 | # Used to initialize weights for policy and value output layers 86 | def normalized_columns_initializer(std=1.0): 87 | def _initializer(shape, dtype=None, partition_info=None): 88 | out = np.random.randn(*shape).astype(np.float32) 89 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 90 | return tf.constant(out) 91 | return _initializer 92 | 93 | # Sample from distribution of arguments 94 | def sample_dist(dist): 95 | sample = np.random.choice(dist[0],p=dist[0]) 96 | sample = np.argmax(dist == sample) 97 | return sample 98 | 99 | ## ACTOR-CRITIC NETWORK 100 | 101 | class AC_Network(): 102 | def __init__(self,scope,trainer): 103 | with tf.variable_scope(scope): 104 | # Architecture here follows Atari-net Agent described in [1] Section 4.3 105 | nonspatial_size = 727 106 | screen_channels = 7 107 | 108 | self.inputs_nonspatial = tf.placeholder(shape=[None,nonspatial_size], dtype=tf.float32) 109 | self.inputs_spatial_screen_reshaped = tf.placeholder(shape=[None,64,64,screen_channels], dtype=tf.float32) 110 | self.nonspatial_dense = tf.layers.dense( 111 | inputs=self.inputs_nonspatial, 112 | units=32, 113 | activation=tf.tanh) 114 | self.screen_conv1 = tf.layers.conv2d( 115 | inputs=self.inputs_spatial_screen_reshaped, 116 | filters=16, 117 | kernel_size=[8,8], 118 | strides=[4,4], 119 | padding='valid', 120 | activation=tf.nn.relu) 121 | self.screen_conv2 = tf.layers.conv2d( 122 | inputs=self.screen_conv1, 123 | filters=32, 124 | kernel_size=[4,4], 125 | strides=[2,2], 126 | padding='valid', 127 | activation=tf.nn.relu) 128 | # According to [1]: "The results are concatenated and sent through a linear layer with a ReLU activation." 
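# Added note (descriptive comment): the 6*6*32 reshape below follows from the
# 'valid' convolutions above:
#   64x64 screen -> conv 8x8 stride 4 -> 15x15 -> conv 4x4 stride 2 -> 6x6,
# with 32 filters in the second layer, i.e. 6*6*32 = 1152 features per frame.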
129 | self.latent_vector = tf.layers.dense( 130 | inputs=tf.concat([self.nonspatial_dense, tf.reshape(self.screen_conv2,shape=[-1,6*6*32])], axis=1), 131 | units=256, 132 | activation=tf.nn.relu) 133 | 134 | # Output layers for policy and value estimations 135 | # 12 policy networks for base actions and arguments 136 | # - All modeled independently 137 | # - Spatial arguments have the x and y values modeled independently as well 138 | # 1 value network 139 | self.policy_base_actions = tf.layers.dense( 140 | inputs=self.latent_vector, 141 | units=17, 142 | activation=tf.nn.softmax, 143 | kernel_initializer=normalized_columns_initializer(0.01)) 144 | self.policy_arg_select_add = tf.layers.dense( 145 | inputs=self.latent_vector, 146 | units=2, 147 | activation=tf.nn.softmax, 148 | kernel_initializer=normalized_columns_initializer(1.0)) 149 | self.policy_arg_queued = tf.layers.dense( 150 | inputs=self.latent_vector, 151 | units=2, 152 | activation=tf.nn.softmax, 153 | kernel_initializer=normalized_columns_initializer(1.0)) 154 | self.policy_arg_select_point_act = tf.layers.dense( 155 | inputs=self.latent_vector, 156 | units=4, 157 | activation=tf.nn.softmax, 158 | kernel_initializer=normalized_columns_initializer(0.01)) 159 | self.policy_arg_select_unit_act = tf.layers.dense( 160 | inputs=self.latent_vector, 161 | units=4, 162 | activation=tf.nn.softmax, 163 | kernel_initializer=normalized_columns_initializer(0.01)) 164 | self.policy_arg_control_group_act = tf.layers.dense( 165 | inputs=self.latent_vector, 166 | units=5, 167 | activation=tf.nn.softmax, 168 | kernel_initializer=normalized_columns_initializer(0.01)) 169 | self.policy_arg_control_group_id = tf.layers.dense( 170 | inputs=self.latent_vector, 171 | units=10, 172 | activation=tf.nn.softmax, 173 | kernel_initializer=normalized_columns_initializer(0.01)) 174 | self.policy_arg_select_unit_id = tf.layers.dense( 175 | inputs=self.latent_vector, 176 | units=500, 177 | activation=tf.nn.softmax, 178 | kernel_initializer=normalized_columns_initializer(0.01)) 179 | self.policy_arg_screen_x = tf.layers.dense( 180 | inputs=self.latent_vector, 181 | units=64, 182 | activation=tf.nn.softmax, 183 | kernel_initializer=normalized_columns_initializer(0.01)) 184 | self.policy_arg_screen_y = tf.layers.dense( 185 | inputs=self.latent_vector, 186 | units=64, 187 | activation=tf.nn.softmax, 188 | kernel_initializer=normalized_columns_initializer(0.01)) 189 | self.policy_arg_screen2_x = tf.layers.dense( 190 | inputs=self.latent_vector, 191 | units=64, 192 | activation=tf.nn.softmax, 193 | kernel_initializer=normalized_columns_initializer(0.01)) 194 | self.policy_arg_screen2_y = tf.layers.dense( 195 | inputs=self.latent_vector, 196 | units=64, 197 | activation=tf.nn.softmax, 198 | kernel_initializer=normalized_columns_initializer(0.01)) 199 | self.value = tf.layers.dense( 200 | inputs=self.latent_vector, 201 | units=1, 202 | kernel_initializer=normalized_columns_initializer(1.0)) 203 | # Only the worker network need ops for loss functions and gradient updating. 
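# Since the base action and every argument head are modeled as independent categorical
# policies (Atari-net style), log pi(a|s) decomposes into log pi_base(a_base|s) plus the
# sum of the argument heads' log-probabilities; the per-head policy losses defined below
# implement that sum over all heads, whether or not a given argument is consumed by the
# sampled base action.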
204 | # calculates the losses 205 | # self.gradients - gradients of loss wrt local_vars 206 | # applies the gradients to update the global network 207 | if scope != 'global': 208 | self.actions_base = tf.placeholder(shape=[None],dtype=tf.int32) 209 | self.actions_onehot_base = tf.one_hot(self.actions_base,17,dtype=tf.float32) 210 | self.actions_arg_screen_x = tf.placeholder(shape=[None],dtype=tf.int32) 211 | self.actions_onehot_arg_screen_x = tf.one_hot(self.actions_arg_screen_x,64,dtype=tf.float32) 212 | self.actions_arg_screen_y = tf.placeholder(shape=[None],dtype=tf.int32) 213 | self.actions_onehot_arg_screen_y = tf.one_hot(self.actions_arg_screen_y,64,dtype=tf.float32) 214 | self.actions_arg_screen2_x = tf.placeholder(shape=[None],dtype=tf.int32) 215 | self.actions_onehot_arg_screen2_x = tf.one_hot(self.actions_arg_screen2_x,64,dtype=tf.float32) 216 | self.actions_arg_screen2_y = tf.placeholder(shape=[None],dtype=tf.int32) 217 | self.actions_onehot_arg_screen2_y = tf.one_hot(self.actions_arg_screen2_y,64,dtype=tf.float32) 218 | self.actions_arg_select_point_act = tf.placeholder(shape=[None],dtype=tf.int32) 219 | self.actions_onehot_arg_select_point_act = tf.one_hot(self.actions_arg_select_point_act,4,dtype=tf.float32) 220 | self.actions_arg_select_add = tf.placeholder(shape=[None],dtype=tf.int32) 221 | self.actions_onehot_arg_select_add = tf.one_hot(self.actions_arg_select_add,2,dtype=tf.float32) 222 | self.actions_arg_control_group_act = tf.placeholder(shape=[None],dtype=tf.int32) 223 | self.actions_onehot_arg_control_group_act = tf.one_hot(self.actions_arg_control_group_act,5,dtype=tf.float32) 224 | self.actions_arg_control_group_id = tf.placeholder(shape=[None],dtype=tf.int32) 225 | self.actions_onehot_arg_control_group_id = tf.one_hot(self.actions_arg_control_group_id,10,dtype=tf.float32) 226 | self.actions_arg_select_unit_id = tf.placeholder(shape=[None],dtype=tf.int32) 227 | self.actions_onehot_arg_select_unit_id = tf.one_hot(self.actions_arg_select_unit_id,500,dtype=tf.float32) 228 | self.actions_arg_select_unit_act = tf.placeholder(shape=[None],dtype=tf.int32) 229 | self.actions_onehot_arg_select_unit_act = tf.one_hot(self.actions_arg_select_unit_act,4,dtype=tf.float32) 230 | self.actions_arg_queued = tf.placeholder(shape=[None],dtype=tf.int32) 231 | self.actions_onehot_arg_queued = tf.one_hot(self.actions_arg_queued,2,dtype=tf.float32) 232 | self.target_v = tf.placeholder(shape=[None],dtype=tf.float32) 233 | self.advantages = tf.placeholder(shape=[None],dtype=tf.float32) 234 | 235 | self.responsible_outputs_base = tf.reduce_sum(self.policy_base_actions * self.actions_onehot_base, [1]) 236 | self.responsible_outputs_arg_screen_x = tf.reduce_sum(self.policy_arg_screen_x * self.actions_onehot_arg_screen_x, [1]) 237 | self.responsible_outputs_arg_screen_y = tf.reduce_sum(self.policy_arg_screen_y * self.actions_onehot_arg_screen_y, [1]) 238 | self.responsible_outputs_arg_screen2_x = tf.reduce_sum(self.policy_arg_screen2_x * self.actions_onehot_arg_screen2_x, [1]) 239 | self.responsible_outputs_arg_screen2_y = tf.reduce_sum(self.policy_arg_screen2_y * self.actions_onehot_arg_screen2_y, [1]) 240 | self.responsible_outputs_arg_select_point_act = tf.reduce_sum(self.policy_arg_select_point_act * self.actions_onehot_arg_select_point_act, [1]) 241 | self.responsible_outputs_arg_select_add = tf.reduce_sum(self.policy_arg_select_add * self.actions_onehot_arg_select_add, [1]) 242 | self.responsible_outputs_arg_control_group_act = tf.reduce_sum(self.policy_arg_control_group_act * 
self.actions_onehot_arg_control_group_act, [1]) 243 | self.responsible_outputs_arg_control_group_id = tf.reduce_sum(self.policy_arg_control_group_id * self.actions_onehot_arg_control_group_id, [1]) 244 | self.responsible_outputs_arg_select_unit_id = tf.reduce_sum(self.policy_arg_select_unit_id * self.actions_onehot_arg_select_unit_id, [1]) 245 | self.responsible_outputs_arg_select_unit_act = tf.reduce_sum(self.policy_arg_select_unit_act * self.actions_onehot_arg_select_unit_act, [1]) 246 | self.responsible_outputs_arg_queued = tf.reduce_sum(self.policy_arg_queued * self.actions_onehot_arg_queued, [1]) 247 | 248 | # Loss functions 249 | self.value_loss = 0.5 * tf.reduce_sum(tf.square(self.target_v - tf.reshape(self.value,[-1]))) 250 | 251 | self.log_policy_base_actions = tf.log(tf.clip_by_value(self.policy_base_actions, 1e-20, 1.0)) # avoid NaN with clipping when value in policy becomes zero 252 | self.entropy_base = - tf.reduce_sum(self.policy_base_actions * self.log_policy_base_actions) 253 | self.entropy_arg_screen_x = - tf.reduce_sum(self.policy_arg_screen_x * tf.log(tf.clip_by_value(self.policy_arg_screen_x, 1e-20, 1.0))) 254 | self.entropy_arg_screen_y = - tf.reduce_sum(self.policy_arg_screen_y * tf.log(tf.clip_by_value(self.policy_arg_screen_y, 1e-20, 1.0))) 255 | self.entropy_arg_screen2_x = - tf.reduce_sum(self.policy_arg_screen2_x * tf.log(tf.clip_by_value(self.policy_arg_screen2_x, 1e-20, 1.0))) 256 | self.entropy_arg_screen2_y = - tf.reduce_sum(self.policy_arg_screen2_y * tf.log(tf.clip_by_value(self.policy_arg_screen2_y, 1e-20, 1.0))) 257 | self.entropy_arg_select_point_act = - tf.reduce_sum(self.policy_arg_select_point_act * tf.log(tf.clip_by_value(self.policy_arg_select_point_act, 1e-20, 1.0))) 258 | self.entropy_arg_select_add = - tf.reduce_sum(self.policy_arg_select_add * tf.log(tf.clip_by_value(self.policy_arg_select_add, 1e-20, 1.0))) 259 | self.entropy_arg_control_group_act = - tf.reduce_sum(self.policy_arg_control_group_act * tf.log(tf.clip_by_value(self.policy_arg_control_group_act, 1e-20, 1.0))) 260 | self.entropy_arg_control_group_id = - tf.reduce_sum(self.policy_arg_control_group_id * tf.log(tf.clip_by_value(self.policy_arg_control_group_id, 1e-20, 1.0))) 261 | self.entropy_arg_select_unit_id = - tf.reduce_sum(self.policy_arg_select_unit_id * tf.log(tf.clip_by_value(self.policy_arg_select_unit_id, 1e-20, 1.0))) 262 | self.entropy_arg_select_unit_act = - tf.reduce_sum(self.policy_arg_select_unit_act * tf.log(tf.clip_by_value(self.policy_arg_select_unit_act, 1e-20, 1.0))) 263 | self.entropy_arg_queued = - tf.reduce_sum(self.policy_arg_queued * tf.log(tf.clip_by_value(self.policy_arg_queued, 1e-20, 1.0))) 264 | self.entropy = self.entropy_base + self.entropy_arg_screen_x + self.entropy_arg_screen_y + self.entropy_arg_screen2_x + self.entropy_arg_screen2_y + self.entropy_arg_select_point_act + self.entropy_arg_select_add + self.entropy_arg_control_group_act + self.entropy_arg_control_group_id + self.entropy_arg_select_unit_id + self.entropy_arg_select_unit_act + self.entropy_arg_queued 265 | 266 | self.policy_loss_base = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_base, 1e-20, 1.0))*self.advantages) 267 | self.policy_loss_arg_screen_x = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_screen_x, 1e-20, 1.0))*self.advantages) 268 | self.policy_loss_arg_screen_y = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_screen_y, 1e-20, 1.0))*self.advantages) 269 | self.policy_loss_arg_screen2_x = - 
tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_screen2_x, 1e-20, 1.0))*self.advantages) 270 | self.policy_loss_arg_screen2_y = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_screen2_y, 1e-20, 1.0))*self.advantages) 271 | self.policy_loss_arg_select_point_act = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_select_point_act, 1e-20, 1.0))*self.advantages) 272 | self.policy_loss_arg_select_add = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_select_add, 1e-20, 1.0))*self.advantages) 273 | self.policy_loss_arg_control_group_act = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_control_group_act, 1e-20, 1.0))*self.advantages) 274 | self.policy_loss_arg_control_group_id = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_control_group_id, 1e-20, 1.0))*self.advantages) 275 | self.policy_loss_arg_select_unit_id = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_select_unit_id, 1e-20, 1.0))*self.advantages) 276 | self.policy_loss_arg_select_unit_act = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_select_unit_act, 1e-20, 1.0))*self.advantages) 277 | self.policy_loss_arg_queued = - tf.reduce_sum(tf.log(tf.clip_by_value(self.responsible_outputs_arg_queued, 1e-20, 1.0))*self.advantages) 278 | self.policy_loss = self.policy_loss_base + self.policy_loss_arg_screen_x + self.policy_loss_arg_screen_y + self.policy_loss_arg_screen2_x + self.policy_loss_arg_screen2_y + self.policy_loss_arg_select_point_act + self.policy_loss_arg_select_add + self.policy_loss_arg_control_group_act + self.policy_loss_arg_control_group_id + self.policy_loss_arg_select_unit_id + self.policy_loss_arg_select_unit_act + self.policy_loss_arg_queued 279 | 280 | self.loss = 0.5 * self.value_loss + self.policy_loss - self.entropy * 0.01 281 | 282 | # Get gradients from local network using local losses 283 | local_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 284 | self.gradients = tf.gradients(self.loss,local_vars) 285 | self.var_norms = tf.global_norm(local_vars) 286 | grads,self.grad_norms = tf.clip_by_global_norm(self.gradients,40.0) 287 | 288 | # Apply local gradients to global network 289 | global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global') 290 | self.apply_grads = trainer.apply_gradients(zip(grads,global_vars)) 291 | 292 | ## WORKER AGENT 293 | 294 | class Worker(): 295 | def __init__(self,name,trainer,model_path,global_episodes): 296 | self.name = "worker_" + str(name) 297 | self.number = name 298 | self.model_path = model_path 299 | self.trainer = trainer 300 | self.global_episodes = global_episodes 301 | self.increment = self.global_episodes.assign_add(1) 302 | self.episode_rewards = [] 303 | self.episode_lengths = [] 304 | self.episode_mean_values = [] 305 | self.summary_writer = tf.summary.FileWriter("train_"+str(self.number)) 306 | 307 | #Create the local copy of the network and the tensorflow op to copy global paramters to local network 308 | self.local_AC = AC_Network(self.name,trainer) 309 | self.update_local_ops = update_target_graph('global',self.name) 310 | 311 | self.env = sc2_env.SC2Env(map_name="DefeatRoaches") 312 | 313 | 314 | def train(self,rollout,sess,gamma,bootstrap_value): 315 | rollout = np.array(rollout) 316 | obs_screen = rollout[:,0] 317 | obs_nonspatial = rollout[:,1] 318 | actions_base = rollout[:,2] 319 | actions_arg_screen_x = rollout[:,3] 320 | actions_arg_screen_y = rollout[:,4] 321 | 
actions_arg_screen2_x = rollout[:,5] 322 | actions_arg_screen2_y = rollout[:,6] 323 | actions_arg_select_point_act = rollout[:,7] 324 | actions_arg_select_add = rollout[:,8] 325 | actions_arg_control_group_act = rollout[:,9] 326 | actions_arg_control_group_id = rollout[:,10] 327 | actions_arg_select_unit_id = rollout[:,11] 328 | actions_arg_select_unit_act = rollout[:,12] 329 | actions_arg_queued = rollout[:,13] 330 | rewards = rollout[:,14] 331 | next_obs_screen = rollout[:,15] 332 | next_obs_nonspatial = rollout[:,16] 333 | values = rollout[:,18] 334 | 335 | # Here we take the rewards and values from the rollout, and use them to calculate the advantage and discounted returns. 336 | # The advantage function uses generalized advantage estimation from [2] 337 | self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value]) 338 | discounted_rewards = discount(self.rewards_plus,gamma)[:-1] 339 | self.value_plus = np.asarray(values.tolist() + [bootstrap_value]) 340 | advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1] 341 | advantages = discount(advantages,gamma) 342 | 343 | # Update the global network using gradients from loss 344 | # Generate network statistics to periodically save 345 | feed_dict = {self.local_AC.target_v:discounted_rewards, 346 | self.local_AC.inputs_spatial_screen_reshaped:np.stack(obs_screen).reshape(-1,64,64,7), 347 | self.local_AC.inputs_nonspatial:np.stack(obs_nonspatial).reshape(-1,727), 348 | self.local_AC.actions_base:actions_base, 349 | self.local_AC.actions_arg_screen_x:actions_arg_screen_x, 350 | self.local_AC.actions_arg_screen_y:actions_arg_screen_y, 351 | self.local_AC.actions_arg_screen2_x:actions_arg_screen2_x, 352 | self.local_AC.actions_arg_screen2_y:actions_arg_screen2_y, 353 | self.local_AC.actions_arg_select_point_act:actions_arg_select_point_act, 354 | self.local_AC.actions_arg_select_add:actions_arg_select_add, 355 | self.local_AC.actions_arg_control_group_act:actions_arg_control_group_act, 356 | self.local_AC.actions_arg_control_group_id:actions_arg_control_group_id, 357 | self.local_AC.actions_arg_select_unit_id:actions_arg_select_unit_id, 358 | self.local_AC.actions_arg_select_unit_act:actions_arg_select_unit_act, 359 | self.local_AC.actions_arg_queued:actions_arg_queued, 360 | self.local_AC.advantages:advantages} 361 | v_l,p_l,e_l,g_n,v_n, _ = sess.run([self.local_AC.value_loss, 362 | self.local_AC.policy_loss, 363 | self.local_AC.entropy, 364 | self.local_AC.grad_norms, 365 | self.local_AC.var_norms, 366 | self.local_AC.apply_grads], 367 | feed_dict=feed_dict) 368 | return v_l / len(rollout),p_l / len(rollout),e_l / len(rollout), g_n,v_n 369 | 370 | def work(self,max_episode_length,gamma,sess,coord,saver): 371 | episode_count = sess.run(self.global_episodes) 372 | total_steps = 0 373 | print ("Starting worker " + str(self.number)) 374 | with sess.as_default(), sess.graph.as_default(): 375 | while not coord.should_stop(): 376 | # Download copy of parameters from global network 377 | sess.run(self.update_local_ops) 378 | 379 | episode_buffer = [] 380 | episode_values = [] 381 | episode_frames = [] 382 | episode_reward = 0 383 | episode_step_count = 0 384 | d = False 385 | 386 | # Start new episode 387 | obs = self.env.reset() 388 | episode_frames.append(obs[0]) 389 | reward, nonspatial_stack, minimap_stack, screen_stack, episode_end = process_observation(obs[0]) 390 | s_screen = screen_stack 391 | s_nonspatial = nonspatial_stack 392 | 393 | while not episode_end: 394 | # Take an action using distributions from policy 
networks' outputs. 395 | base_action_dist, screen_x_dist, screen_y_dist, screen2_x_dist, screen2_y_dist, select_point_act_dist,select_add_dist,control_group_act_dist,control_group_id_dist,select_unit_id_dist,select_unit_act_dist,queued_dist,v = sess.run([ 396 | self.local_AC.policy_base_actions, 397 | self.local_AC.policy_arg_screen_x, 398 | self.local_AC.policy_arg_screen_y, 399 | self.local_AC.policy_arg_screen2_x, 400 | self.local_AC.policy_arg_screen2_y, 401 | self.local_AC.policy_arg_select_point_act, 402 | self.local_AC.policy_arg_select_add, 403 | self.local_AC.policy_arg_control_group_act, 404 | self.local_AC.policy_arg_control_group_id, 405 | self.local_AC.policy_arg_select_unit_id, 406 | self.local_AC.policy_arg_select_unit_act, 407 | self.local_AC.policy_arg_queued, 408 | self.local_AC.value], 409 | feed_dict={self.local_AC.inputs_spatial_screen_reshaped: screen_stack, 410 | self.local_AC.inputs_nonspatial: nonspatial_stack}) 411 | 412 | # Apply filter to remove unavailable actions and then renormalize 413 | index2action_id = {0:0, 1:1, 2:2, 3:3, 4:4, 5:5, 6:7, 7:12, 8:13, 9:274, 10:331, 11:332, 12:333, 13:334, 14:451, 15:452, 16:453} 414 | for index, action in enumerate(base_action_dist[0]): 415 | action_id = index2action_id[index] 416 | if action_id not in obs[0].observation['available_actions']: 417 | base_action_dist[0][index] = 0 418 | if np.sum(base_action_dist[0]) != 1: 419 | current_sum = np.sum(base_action_dist[0]) 420 | base_action_dist[0] /= current_sum 421 | 422 | base_action = sample_dist(base_action_dist) 423 | arg_screen_x = sample_dist(screen_x_dist) 424 | arg_screen_y = sample_dist(screen_y_dist) 425 | arg_screen2_x = sample_dist(screen2_x_dist) 426 | arg_screen2_y = sample_dist(screen2_y_dist) 427 | arg_select_point_act = sample_dist(select_point_act_dist) 428 | arg_select_add = sample_dist(select_add_dist) 429 | arg_control_group_act = sample_dist(control_group_act_dist) 430 | arg_control_group_id = sample_dist(control_group_id_dist) 431 | arg_select_unit_id = sample_dist(select_unit_id_dist) 432 | arg_select_unit_act = sample_dist(select_unit_act_dist) 433 | arg_queued = sample_dist(queued_dist) 434 | 435 | # 17 relevant base actions 436 | if base_action == 0: 437 | # 0/no_op 438 | action_id = 0 439 | arguments = [] 440 | elif base_action == 1: 441 | # 1/move_camera 442 | action_id = 1 443 | arguments = [[arg_screen_x, arg_screen_y]] 444 | elif base_action == 2: 445 | # 2/select_point 446 | action_id = 2 447 | arguments = [[arg_select_point_act],[arg_screen_x, arg_screen_y]] 448 | elif base_action == 3: 449 | # 3/select_rect 450 | action_id = 3 451 | arguments = [[arg_select_add],[arg_screen_x, arg_screen_y],[arg_screen2_x, arg_screen2_y]] 452 | elif base_action == 4: 453 | # 4/select_control_group 454 | action_id = 4 455 | arguments = [[arg_control_group_act],[arg_control_group_id]] 456 | elif base_action == 5: 457 | # 5/select_unit 458 | action_id = 5 459 | arguments = [[arg_select_unit_act],[arg_select_unit_id]] 460 | elif base_action == 6: 461 | # 7/select_army 462 | action_id = 7 463 | arguments = [[arg_select_add]] 464 | elif base_action == 7: 465 | # 12/Attack_screen 466 | action_id = 12 467 | arguments = [[arg_queued],[arg_screen_x, arg_screen_y]] 468 | elif base_action == 8: 469 | # 13/Attack_minimap 470 | action_id = 13 471 | arguments = [[arg_queued],[arg_screen_x, arg_screen_y]] 472 | elif base_action == 9: 473 | # 274/HoldPosition_quick 474 | action_id = 274 475 | arguments = [[arg_queued]] 476 | elif base_action == 10: 477 | # 331/Move_screen 
478 | action_id = 331 479 | arguments = [[arg_queued],[arg_screen_x, arg_screen_y]] 480 | elif base_action == 11: 481 | # 332/Move_minimap 482 | action_id = 332 483 | arguments = [[arg_queued],[arg_screen_x, arg_screen_y]] 484 | elif base_action == 12: 485 | # 333/Patrol_screen 486 | action_id = 333 487 | arguments = [[arg_queued],[arg_screen_x, arg_screen_y]] 488 | elif base_action == 13: 489 | # 334/Patrol_minimap 490 | action_id = 334 491 | arguments = [[arg_queued],[arg_screen_x, arg_screen_y]] 492 | elif base_action == 14: 493 | # 451/Smart_screen 494 | action_id = 451 495 | arguments = [[arg_queued],[arg_screen_x, arg_screen_y]] 496 | elif base_action == 15: 497 | # 452/Smart_minimap 498 | action_id = 452 499 | arguments = [[arg_queued],[arg_screen_x, arg_screen_y]] 500 | elif base_action == 16: 501 | # 453/Stop_quick 502 | action_id = 453 503 | arguments = [[arg_queued]] 504 | 505 | a = actions.FunctionCall(action_id, arguments) 506 | obs = self.env.step(actions=[a]) 507 | r, nonspatial_stack, minimap_stack, screen_stack, episode_end = process_observation(obs[0]) 508 | 509 | if not episode_end: 510 | episode_frames.append(obs[0]) 511 | s1_screen = screen_stack 512 | s1_nonspatial = nonspatial_stack 513 | else: 514 | s1_screen = s_screen 515 | s1_nonspatial = s_nonspatial 516 | 517 | # Append latest state to buffer 518 | episode_buffer.append([s_screen, s_nonspatial,base_action,arg_screen_x,arg_screen_y,arg_screen2_x,arg_screen2_y,arg_select_point_act,arg_select_add,arg_control_group_act,arg_control_group_id,arg_select_unit_id,arg_select_unit_act,arg_queued,r,s1_screen, s1_nonspatial,d,v[0,0]]) 519 | episode_values.append(v[0,0]) 520 | 521 | episode_reward += r 522 | s_screen = s1_screen 523 | s_nonspatial = s1_nonspatial 524 | total_steps += 1 525 | episode_step_count += 1 526 | 527 | global _steps 528 | _steps += 1 529 | 530 | # If the episode hasn't ended, but the experience buffer is full, then we make an update step using that experience rollout. 531 | if len(episode_buffer) == 30 and not episode_end and episode_step_count != max_episode_length - 1: 532 | # Since we don't know what the true final return is, we "bootstrap" from our current value estimation. 533 | v1 = sess.run(self.local_AC.value, 534 | feed_dict={self.local_AC.inputs_spatial_screen_reshaped: screen_stack,self.local_AC.inputs_nonspatial: nonspatial_stack})[0,0] 535 | v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,v1) 536 | episode_buffer = [] 537 | sess.run(self.update_local_ops) 538 | if episode_end: 539 | break 540 | 541 | self.episode_rewards.append(episode_reward) 542 | self.episode_lengths.append(episode_step_count) 543 | self.episode_mean_values.append(np.mean(episode_values)) 544 | episode_count += 1 545 | 546 | global _max_score, _running_avg_score, _episodes 547 | if _max_score < episode_reward: 548 | _max_score = episode_reward 549 | _running_avg_score = (2.0 / 101) * (episode_reward - _running_avg_score) + _running_avg_score 550 | _episodes += 1 551 | 552 | print("{} Step #{} Episode #{} Reward: {}".format(self.name, total_steps, episode_count, episode_reward)) 553 | print("Total Steps: {}\tTotal Episodes: {}\tMax Score: {}\tAvg Score: {}".format(_steps, _episodes, _max_score, _running_avg_score)) 554 | 555 | # Update the network using the episode buffer at the end of the episode. 
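# A terminal state has no future return, so the end-of-episode update bootstraps with 0.0,
# unlike the mid-episode update above, which bootstraps from the current value estimate.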
556 | if len(episode_buffer) != 0: 557 | v_l,p_l,e_l,g_n,v_n = self.train(episode_buffer,sess,gamma,0.0) 558 | 559 | if episode_count % 5 == 0 and episode_count != 0: 560 | if self.name == 'worker_0' and episode_count % 25 == 0: 561 | time_per_step = 0.05 562 | images = np.array(episode_frames) 563 | if episode_count % 250 == 0 and self.name == 'worker_0': 564 | saver.save(sess,self.model_path+'/model-'+str(episode_count)+'.cptk') 565 | print ("Saved Model") 566 | 567 | mean_reward = np.mean(self.episode_rewards[-5:]) 568 | mean_length = np.mean(self.episode_lengths[-5:]) 569 | mean_value = np.mean(self.episode_mean_values[-5:]) 570 | summary = tf.Summary() 571 | summary.value.add(tag='Perf/Reward', simple_value=float(mean_reward)) 572 | summary.value.add(tag='Perf/Length', simple_value=float(mean_length)) 573 | summary.value.add(tag='Perf/Value', simple_value=float(mean_value)) 574 | summary.value.add(tag='Losses/Value Loss', simple_value=float(v_l)) 575 | summary.value.add(tag='Losses/Policy Loss', simple_value=float(p_l)) 576 | summary.value.add(tag='Losses/Entropy', simple_value=float(e_l)) 577 | summary.value.add(tag='Losses/Grad Norm', simple_value=float(g_n)) 578 | summary.value.add(tag='Losses/Var Norm', simple_value=float(v_n)) 579 | self.summary_writer.add_summary(summary, episode_count) 580 | 581 | self.summary_writer.flush() 582 | if self.name == 'worker_0': 583 | sess.run(self.increment) 584 | 585 | def main(): 586 | max_episode_length = 300 587 | gamma = .99 # discount rate for advantage estimation and reward discounting 588 | load_model = True 589 | model_path = './_model_old' 590 | 591 | global _max_score, _running_avg_score, _steps, _episodes 592 | _max_score = -9 593 | _running_avg_score = -9 594 | _steps = 0 595 | _episodes = 0 596 | 597 | tf.reset_default_graph() 598 | 599 | if not os.path.exists(model_path): 600 | os.makedirs(model_path) 601 | 602 | with tf.device("/cpu:0"): 603 | global_episodes = tf.Variable(0,dtype=tf.int32,name='global_episodes',trainable=False) 604 | trainer = tf.train.AdamOptimizer(learning_rate=1e-4) 605 | master_network = AC_Network('global',None) # Generate global network 606 | num_workers = psutil.cpu_count() # Set workers to number of available CPU threads 607 | num_workers = 1 608 | workers = [] 609 | # Create worker classes 610 | for i in range(num_workers): 611 | workers.append(Worker(i,trainer,model_path,global_episodes)) 612 | saver = tf.train.Saver(max_to_keep=5) 613 | 614 | with tf.Session() as sess: 615 | coord = tf.train.Coordinator() 616 | if load_model == True: 617 | print ('Loading Model...') 618 | ckpt = tf.train.get_checkpoint_state(model_path) 619 | saver.restore(sess,ckpt.model_checkpoint_path) 620 | else: 621 | sess.run(tf.global_variables_initializer()) 622 | 623 | # This is where the asynchronous magic happens. 624 | # Start the "work" process for each worker in a separate thread. 
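# The workers run as Python threads sharing a single TensorFlow session and graph;
# each worker owns its own SC2Env instance and periodically syncs its local network
# from the 'global' scope via update_local_ops.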
625 | worker_threads = [] 626 | for worker in workers: 627 | worker_work = lambda: worker.work(max_episode_length,gamma,sess,coord,saver) 628 | t = threading.Thread(target=(worker_work)) 629 | t.start() 630 | sleep(0.5) 631 | worker_threads.append(t) 632 | coord.join(worker_threads) 633 | 634 | if __name__ == '__main__': 635 | import sys 636 | from absl import flags 637 | FLAGS = flags.FLAGS 638 | FLAGS(sys.argv) 639 | main() -------------------------------------------------------------------------------- /Agents/_model_old/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model-27500.cptk" 2 | all_model_checkpoint_paths: "model-27500.cptk" 3 | -------------------------------------------------------------------------------- /Agents/_model_old/model-27500.cptk.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greentfrapp/pysc2-RLagents/8bbb30d2dd44de31d2a28b3611a013b38da825b7/Agents/_model_old/model-27500.cptk.data-00000-of-00001 -------------------------------------------------------------------------------- /Agents/_model_old/model-27500.cptk.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greentfrapp/pysc2-RLagents/8bbb30d2dd44de31d2a28b3611a013b38da825b7/Agents/_model_old/model-27500.cptk.index -------------------------------------------------------------------------------- /Agents/_model_old/model-27500.cptk.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greentfrapp/pysc2-RLagents/8bbb30d2dd44de31d2a28b3611a013b38da825b7/Agents/_model_old/model-27500.cptk.meta -------------------------------------------------------------------------------- /Images/angle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greentfrapp/pysc2-RLagents/8bbb30d2dd44de31d2a28b3611a013b38da825b7/Images/angle.png -------------------------------------------------------------------------------- /Images/poster.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greentfrapp/pysc2-RLagents/8bbb30d2dd44de31d2a28b3611a013b38da825b7/Images/poster.jpg -------------------------------------------------------------------------------- /Images/stall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greentfrapp/pysc2-RLagents/8bbb30d2dd44de31d2a28b3611a013b38da825b7/Images/stall.png -------------------------------------------------------------------------------- /Images/start_game.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greentfrapp/pysc2-RLagents/8bbb30d2dd44de31d2a28b3611a013b38da825b7/Images/start_game.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Lim Swee Kiat 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Notes/List of Action Argument Types.txt: -------------------------------------------------------------------------------- 1 | ================================================== 2 | List of Argument Types for Actions/Functions 3 | ================================================== 4 | 5 | [0] screen 6 | # A point on the screen, by default accepts 2 integers [0, 84) 7 | 8 | [1] minimap 9 | # A point on the minimap, by default accepts 2 integers [0, 64) 10 | 11 | [2] screen2 12 | # Same as screen(id=0), used when two screen points need to be selected 13 | 14 | [3] queued 15 | # Whether the action should be taken immediately or after previous actions, boolean 16 | 17 | [4] control_group_act 18 | # Reference for each of the five actions related to control groups, scalar integer [0, 5) 19 | # 0 - Recall 20 | # 1 - Set 21 | # 2 - Append 22 | # 3 - SetAndSteal (set selected units to control group and remove from other control groups) 23 | # 4 - AppendAndSteal (add selected units to control group and remove from other control groups) 24 | 25 | [5] control_group_id 26 | # Reference for each of the ten control groups that can be assigned, scalar integer [0, 10) 27 | 28 | [6] select_point_act 29 | # Reference for each of the four actions related to selecting a point, scalar integer [0, 4) 30 | # 0 - Select 31 | # 1 - Toggle (add or subtract from selection) 32 | # 2 - AllType (select all units of the same type) 33 | # 3 - AddAllType (add all units of the same type to selection) 34 | 35 | [7] select_add 36 | # Whether to select or add to existing selection, boolean 37 | 38 | [8] select_unit_act 39 | # Reference for each of the four actions related to selecting a unit from the multipanel, scalar integer [0, 4) 40 | # 0 - SingleSelect 41 | # 1 - DeselectUnit 42 | # 2 - SelectAllOfType 43 | # 3 - DeselectAllOfType 44 | 45 | [9] select_unit_id 46 | # Reference for each unit that can be selected from the current multipanel, scalar integer [0, 500) 47 | 48 | [10] select_worker 49 | # Reference for each of the four actions related to selecting an idle work, scalar integer [0, 4) 50 | # 0 - Set (select) 51 | # 1 - Add (add to selection) 52 | # 2 - All (select all idle workers) 53 | # 3 - AddAll (add all idle workers to selection) 54 | 55 | [11] build_queue_id 56 | # Reference for each unit in a build queue (max 10), scalar integer [0, 10) 57 | 58 | [12] unload_id 59 | # Reference for each loaded unit in a cargo, scalar integer [0, 500) 60 | -------------------------------------------------------------------------------- /Notes/Running an Agent.txt: -------------------------------------------------------------------------------- 1 | 
================================================== 2 | Running an Agent in pysc2.env.sc2_env.SC2Env 3 | ================================================== 4 | 5 | At each timestep, the agent.step function is called with an observation in the form of a TimeStep object, i.e. 6 | 7 | agent.step(obs) 8 | 9 | where obs is a TimeStep object. 10 | 11 | agent.step(obs) is then expected to return an action with subaction arguments, i.e. 12 | 13 | return actions.FunctionCall(_NOOP, []) 14 | 15 | obs is a TimeStep object with the following properties 16 | 17 | TimeStep.step_type 18 | step_type=StepType.FIRST if this is the first step 19 | step_type=StepType.MID if this is neither the first nor the last step 20 | step_type=StepType.LAST if this is the last step 21 | 22 | TimeStep.reward 23 | a scalar value that denotes the reward for the TimeStep 24 | 25 | TimeStep.discount 26 | a scalar value in range [0, 1] 27 | 28 | TimeStep.observation 29 | a dictionary with the following key-value pairs 30 | 'cargo': (n, 7) tensor, for each unit in a transport, refer to 'single_select' 31 | 'minimap': (7, 64, 64) tensor, in the following order 32 | [0]: height_map 33 | [1]: visibility_map 34 | [2]: creep 35 | [3]: camera 36 | [4]: player_id 37 | [5]: player_relative 38 | [6]: selected 39 | 'game_loop': (1,) tensor showing game step 40 | 'available_actions': (n,) tensor showing ids of valid actions 41 | 'screen': (17, 84, 84) tensor, in the following order 42 | [0]: height_map 43 | [1]: visibility_map 44 | [2]: creep 45 | [3]: power 46 | [4]: player_id 47 | [5]: player_relative 48 | [6]: unit_type 49 | [7]: selected 50 | [8]: unit_hit_points 51 | [9]: unit_hit_points_ratio 52 | [10]: unit_energy 53 | [11]: unit_energy_ratio 54 | [12]: unit_shields 55 | [13]: unit_shields_ratio 56 | [14]: unit_density 57 | [15]: unit_density_aa 58 | [16]: effects 59 | 'control_groups': (10, 2) tensor, for each of the control groups 60 | [0]: unit leader type 61 | [1]: count 62 | 'multi_select': (n, 7) tensor, for each currently selected unit, refer to 'single_select' 63 | 'cargo_slots_available': (1,) tensor showing number of slots available in transport 64 | 'player': (11,) tensor, for general information 65 | [0]: player_id 66 | [1]: minerals 67 | [2]: vespene 68 | [3]: food used / supply 69 | [4]: food cap 70 | [5]: food used by army 71 | [6]: food used by workers 72 | [7]: idle worker count 73 | [8]: army count 74 | [9]: warp gate count 75 | [10]: larva count 76 | 'single_select': (1, 7) tensor, for the currently selected unit 77 | [0]: unit type 78 | [1]: player_relative 79 | [2]: health 80 | [3]: shields 81 | [4]: energy 82 | [5]: transport slot taken if in transport 83 | [6]: build progress (%) if being built 84 | 'build_queue': (n, 7) tensor, for each unit in a build queue, refer to 'single_select' 85 | 'score_cumulative': (13,) tensor, showing score information 86 | [0]: overall score 87 | [1]: idle production time 88 | [2]: idle worker time 89 | [3]: total value of own units 90 | [4]: total value of own structures 91 | [5]: total value of killed units 92 | [6]: total value of destroyed structures 93 | [7]: collected minerals 94 | [8]: collected vespene 95 | [9]: mineral collection rate 96 | [10]: vespene collection rate 97 | [11]: spent minerals 98 | [12]: spent vespene 99 | -------------------------------------------------------------------------------- /Notes/Total Action Space.txt: -------------------------------------------------------------------------------- 1 | ================================================== 2 | Total Action Space 3 | 
================================================== 4 | 5 | Total base actions: 524 6 | Total possible actions (flattened): 101938719 7 | 8 | 0/no_op () 9 | 1/move_camera (1/minimap [64, 64]) 10 | 2/select_point (6/select_point_act [4]; 0/screen [84, 84]) 11 | 3/select_rect (7/select_add [2]; 0/screen [84, 84]; 2/screen2 [84, 84]) 12 | 4/select_control_group (4/control_group_act [5]; 5/control_group_id [10]) 13 | 5/select_unit (8/select_unit_act [4]; 9/select_unit_id [500]) 14 | 6/select_idle_worker (10/select_worker [4]) 15 | 7/select_army (7/select_add [2]) 16 | 8/select_warp_gates (7/select_add [2]) 17 | 9/select_larva () 18 | 10/unload (12/unload_id [500]) 19 | 11/build_queue (11/build_queue_id [10]) 20 | 12/Attack_screen (3/queued [2]; 0/screen [84, 84]) 21 | 13/Attack_minimap (3/queued [2]; 1/minimap [64, 64]) 22 | 14/Attack_Attack_screen (3/queued [2]; 0/screen [84, 84]) 23 | 15/Attack_Attack_minimap (3/queued [2]; 1/minimap [64, 64]) 24 | 16/Attack_AttackBuilding_screen (3/queued [2]; 0/screen [84, 84]) 25 | 17/Attack_AttackBuilding_minimap (3/queued [2]; 1/minimap [64, 64]) 26 | 18/Attack_Redirect_screen (3/queued [2]; 0/screen [84, 84]) 27 | 19/Scan_Move_screen (3/queued [2]; 0/screen [84, 84]) 28 | 20/Scan_Move_minimap (3/queued [2]; 1/minimap [64, 64]) 29 | 21/Behavior_BuildingAttackOff_quick (3/queued [2]) 30 | 22/Behavior_BuildingAttackOn_quick (3/queued [2]) 31 | 23/Behavior_CloakOff_quick (3/queued [2]) 32 | 24/Behavior_CloakOff_Banshee_quick (3/queued [2]) 33 | 25/Behavior_CloakOff_Ghost_quick (3/queued [2]) 34 | 26/Behavior_CloakOn_quick (3/queued [2]) 35 | 27/Behavior_CloakOn_Banshee_quick (3/queued [2]) 36 | 28/Behavior_CloakOn_Ghost_quick (3/queued [2]) 37 | 29/Behavior_GenerateCreepOff_quick (3/queued [2]) 38 | 30/Behavior_GenerateCreepOn_quick (3/queued [2]) 39 | 31/Behavior_HoldFireOff_quick (3/queued [2]) 40 | 32/Behavior_HoldFireOff_Ghost_quick (3/queued [2]) 41 | 33/Behavior_HoldFireOff_Lurker_quick (3/queued [2]) 42 | 34/Behavior_HoldFireOn_quick (3/queued [2]) 43 | 35/Behavior_HoldFireOn_Ghost_quick (3/queued [2]) 44 | 36/Behavior_HoldFireOn_Lurker_quick (3/queued [2]) 45 | 37/Behavior_PulsarBeamOff_quick (3/queued [2]) 46 | 38/Behavior_PulsarBeamOn_quick (3/queued [2]) 47 | 39/Build_Armory_screen (3/queued [2]; 0/screen [84, 84]) 48 | 40/Build_Assimilator_screen (3/queued [2]; 0/screen [84, 84]) 49 | 41/Build_BanelingNest_screen (3/queued [2]; 0/screen [84, 84]) 50 | 42/Build_Barracks_screen (3/queued [2]; 0/screen [84, 84]) 51 | 43/Build_Bunker_screen (3/queued [2]; 0/screen [84, 84]) 52 | 44/Build_CommandCenter_screen (3/queued [2]; 0/screen [84, 84]) 53 | 45/Build_CreepTumor_screen (3/queued [2]; 0/screen [84, 84]) 54 | 46/Build_CreepTumor_Queen_screen (3/queued [2]; 0/screen [84, 84]) 55 | 47/Build_CreepTumor_Tumor_screen (3/queued [2]; 0/screen [84, 84]) 56 | 48/Build_CyberneticsCore_screen (3/queued [2]; 0/screen [84, 84]) 57 | 49/Build_DarkShrine_screen (3/queued [2]; 0/screen [84, 84]) 58 | 50/Build_EngineeringBay_screen (3/queued [2]; 0/screen [84, 84]) 59 | 51/Build_EvolutionChamber_screen (3/queued [2]; 0/screen [84, 84]) 60 | 52/Build_Extractor_screen (3/queued [2]; 0/screen [84, 84]) 61 | 53/Build_Factory_screen (3/queued [2]; 0/screen [84, 84]) 62 | 54/Build_FleetBeacon_screen (3/queued [2]; 0/screen [84, 84]) 63 | 55/Build_Forge_screen (3/queued [2]; 0/screen [84, 84]) 64 | 56/Build_FusionCore_screen (3/queued [2]; 0/screen [84, 84]) 65 | 57/Build_Gateway_screen (3/queued [2]; 0/screen [84, 84]) 66 | 58/Build_GhostAcademy_screen (3/queued [2]; 
0/screen [84, 84]) 67 | 59/Build_Hatchery_screen (3/queued [2]; 0/screen [84, 84]) 68 | 60/Build_HydraliskDen_screen (3/queued [2]; 0/screen [84, 84]) 69 | 61/Build_InfestationPit_screen (3/queued [2]; 0/screen [84, 84]) 70 | 62/Build_Interceptors_quick (3/queued [2]) 71 | 63/Build_Interceptors_autocast () 72 | 64/Build_MissileTurret_screen (3/queued [2]; 0/screen [84, 84]) 73 | 65/Build_Nexus_screen (3/queued [2]; 0/screen [84, 84]) 74 | 66/Build_Nuke_quick (3/queued [2]) 75 | 67/Build_NydusNetwork_screen (3/queued [2]; 0/screen [84, 84]) 76 | 68/Build_NydusWorm_screen (3/queued [2]; 0/screen [84, 84]) 77 | 69/Build_PhotonCannon_screen (3/queued [2]; 0/screen [84, 84]) 78 | 70/Build_Pylon_screen (3/queued [2]; 0/screen [84, 84]) 79 | 71/Build_Reactor_quick (3/queued [2]) 80 | 72/Build_Reactor_screen (3/queued [2]; 0/screen [84, 84]) 81 | 73/Build_Reactor_Barracks_quick (3/queued [2]) 82 | 74/Build_Reactor_Barracks_screen (3/queued [2]; 0/screen [84, 84]) 83 | 75/Build_Reactor_Factory_quick (3/queued [2]) 84 | 76/Build_Reactor_Factory_screen (3/queued [2]; 0/screen [84, 84]) 85 | 77/Build_Reactor_Starport_quick (3/queued [2]) 86 | 78/Build_Reactor_Starport_screen (3/queued [2]; 0/screen [84, 84]) 87 | 79/Build_Refinery_screen (3/queued [2]; 0/screen [84, 84]) 88 | 80/Build_RoachWarren_screen (3/queued [2]; 0/screen [84, 84]) 89 | 81/Build_RoboticsBay_screen (3/queued [2]; 0/screen [84, 84]) 90 | 82/Build_RoboticsFacility_screen (3/queued [2]; 0/screen [84, 84]) 91 | 83/Build_SensorTower_screen (3/queued [2]; 0/screen [84, 84]) 92 | 84/Build_SpawningPool_screen (3/queued [2]; 0/screen [84, 84]) 93 | 85/Build_SpineCrawler_screen (3/queued [2]; 0/screen [84, 84]) 94 | 86/Build_Spire_screen (3/queued [2]; 0/screen [84, 84]) 95 | 87/Build_SporeCrawler_screen (3/queued [2]; 0/screen [84, 84]) 96 | 88/Build_Stargate_screen (3/queued [2]; 0/screen [84, 84]) 97 | 89/Build_Starport_screen (3/queued [2]; 0/screen [84, 84]) 98 | 90/Build_StasisTrap_screen (3/queued [2]; 0/screen [84, 84]) 99 | 91/Build_SupplyDepot_screen (3/queued [2]; 0/screen [84, 84]) 100 | 92/Build_TechLab_quick (3/queued [2]) 101 | 93/Build_TechLab_screen (3/queued [2]; 0/screen [84, 84]) 102 | 94/Build_TechLab_Barracks_quick (3/queued [2]) 103 | 95/Build_TechLab_Barracks_screen (3/queued [2]; 0/screen [84, 84]) 104 | 96/Build_TechLab_Factory_quick (3/queued [2]) 105 | 97/Build_TechLab_Factory_screen (3/queued [2]; 0/screen [84, 84]) 106 | 98/Build_TechLab_Starport_quick (3/queued [2]) 107 | 99/Build_TechLab_Starport_screen (3/queued [2]; 0/screen [84, 84]) 108 | 100/Build_TemplarArchive_screen (3/queued [2]; 0/screen [84, 84]) 109 | 101/Build_TwilightCouncil_screen (3/queued [2]; 0/screen [84, 84]) 110 | 102/Build_UltraliskCavern_screen (3/queued [2]; 0/screen [84, 84]) 111 | 103/BurrowDown_quick (3/queued [2]) 112 | 104/BurrowDown_Baneling_quick (3/queued [2]) 113 | 105/BurrowDown_Drone_quick (3/queued [2]) 114 | 106/BurrowDown_Hydralisk_quick (3/queued [2]) 115 | 107/BurrowDown_Infestor_quick (3/queued [2]) 116 | 108/BurrowDown_InfestorTerran_quick (3/queued [2]) 117 | 109/BurrowDown_Lurker_quick (3/queued [2]) 118 | 110/BurrowDown_Queen_quick (3/queued [2]) 119 | 111/BurrowDown_Ravager_quick (3/queued [2]) 120 | 112/BurrowDown_Roach_quick (3/queued [2]) 121 | 113/BurrowDown_SwarmHost_quick (3/queued [2]) 122 | 114/BurrowDown_Ultralisk_quick (3/queued [2]) 123 | 115/BurrowDown_WidowMine_quick (3/queued [2]) 124 | 116/BurrowDown_Zergling_quick (3/queued [2]) 125 | 117/BurrowUp_quick (3/queued [2]) 126 | 118/BurrowUp_autocast () 
127 | 119/BurrowUp_Baneling_quick (3/queued [2]) 128 | 120/BurrowUp_Baneling_autocast () 129 | 121/BurrowUp_Drone_quick (3/queued [2]) 130 | 122/BurrowUp_Hydralisk_quick (3/queued [2]) 131 | 123/BurrowUp_Hydralisk_autocast () 132 | 124/BurrowUp_Infestor_quick (3/queued [2]) 133 | 125/BurrowUp_InfestorTerran_quick (3/queued [2]) 134 | 126/BurrowUp_InfestorTerran_autocast () 135 | 127/BurrowUp_Lurker_quick (3/queued [2]) 136 | 128/BurrowUp_Queen_quick (3/queued [2]) 137 | 129/BurrowUp_Queen_autocast () 138 | 130/BurrowUp_Ravager_quick (3/queued [2]) 139 | 131/BurrowUp_Ravager_autocast () 140 | 132/BurrowUp_Roach_quick (3/queued [2]) 141 | 133/BurrowUp_Roach_autocast () 142 | 134/BurrowUp_SwarmHost_quick (3/queued [2]) 143 | 135/BurrowUp_Ultralisk_quick (3/queued [2]) 144 | 136/BurrowUp_Ultralisk_autocast () 145 | 137/BurrowUp_WidowMine_quick (3/queued [2]) 146 | 138/BurrowUp_Zergling_quick (3/queued [2]) 147 | 139/BurrowUp_Zergling_autocast () 148 | 140/Cancel_quick (3/queued [2]) 149 | 141/Cancel_AdeptPhaseShift_quick (3/queued [2]) 150 | 142/Cancel_AdeptShadePhaseShift_quick (3/queued [2]) 151 | 143/Cancel_BarracksAddOn_quick (3/queued [2]) 152 | 144/Cancel_BuildInProgress_quick (3/queued [2]) 153 | 145/Cancel_CreepTumor_quick (3/queued [2]) 154 | 146/Cancel_FactoryAddOn_quick (3/queued [2]) 155 | 147/Cancel_GravitonBeam_quick (3/queued [2]) 156 | 148/Cancel_LockOn_quick (3/queued [2]) 157 | 149/Cancel_MorphBroodlord_quick (3/queued [2]) 158 | 150/Cancel_MorphGreaterSpire_quick (3/queued [2]) 159 | 151/Cancel_MorphHive_quick (3/queued [2]) 160 | 152/Cancel_MorphLair_quick (3/queued [2]) 161 | 153/Cancel_MorphLurker_quick (3/queued [2]) 162 | 154/Cancel_MorphLurkerDen_quick (3/queued [2]) 163 | 155/Cancel_MorphMothership_quick (3/queued [2]) 164 | 156/Cancel_MorphOrbital_quick (3/queued [2]) 165 | 157/Cancel_MorphOverlordTransport_quick (3/queued [2]) 166 | 158/Cancel_MorphOverseer_quick (3/queued [2]) 167 | 159/Cancel_MorphPlanetaryFortress_quick (3/queued [2]) 168 | 160/Cancel_MorphRavager_quick (3/queued [2]) 169 | 161/Cancel_MorphThorExplosiveMode_quick (3/queued [2]) 170 | 162/Cancel_NeuralParasite_quick (3/queued [2]) 171 | 163/Cancel_Nuke_quick (3/queued [2]) 172 | 164/Cancel_SpineCrawlerRoot_quick (3/queued [2]) 173 | 165/Cancel_SporeCrawlerRoot_quick (3/queued [2]) 174 | 166/Cancel_StarportAddOn_quick (3/queued [2]) 175 | 167/Cancel_StasisTrap_quick (3/queued [2]) 176 | 168/Cancel_Last_quick (3/queued [2]) 177 | 169/Cancel_HangarQueue5_quick (3/queued [2]) 178 | 170/Cancel_Queue1_quick (3/queued [2]) 179 | 171/Cancel_Queue5_quick (3/queued [2]) 180 | 172/Cancel_QueueAddOn_quick (3/queued [2]) 181 | 173/Cancel_QueueCancelToSelection_quick (3/queued [2]) 182 | 174/Cancel_QueuePasive_quick (3/queued [2]) 183 | 175/Cancel_QueuePassiveCancelToSelection_quick (3/queued [2]) 184 | 176/Effect_Abduct_screen (3/queued [2]; 0/screen [84, 84]) 185 | 177/Effect_AdeptPhaseShift_screen (3/queued [2]; 0/screen [84, 84]) 186 | 178/Effect_AutoTurret_screen (3/queued [2]; 0/screen [84, 84]) 187 | 179/Effect_BlindingCloud_screen (3/queued [2]; 0/screen [84, 84]) 188 | 180/Effect_Blink_screen (3/queued [2]; 0/screen [84, 84]) 189 | 181/Effect_Blink_Stalker_screen (3/queued [2]; 0/screen [84, 84]) 190 | 182/Effect_ShadowStride_screen (3/queued [2]; 0/screen [84, 84]) 191 | 183/Effect_CalldownMULE_screen (3/queued [2]; 0/screen [84, 84]) 192 | 184/Effect_CausticSpray_screen (3/queued [2]; 0/screen [84, 84]) 193 | 185/Effect_Charge_screen (3/queued [2]; 0/screen [84, 84]) 194 | 186/Effect_Charge_autocast 
() 195 | 187/Effect_ChronoBoost_screen (3/queued [2]; 0/screen [84, 84]) 196 | 188/Effect_Contaminate_screen (3/queued [2]; 0/screen [84, 84]) 197 | 189/Effect_CorrosiveBile_screen (3/queued [2]; 0/screen [84, 84]) 198 | 190/Effect_EMP_screen (3/queued [2]; 0/screen [84, 84]) 199 | 191/Effect_Explode_quick (3/queued [2]) 200 | 192/Effect_Feedback_screen (3/queued [2]; 0/screen [84, 84]) 201 | 193/Effect_ForceField_screen (3/queued [2]; 0/screen [84, 84]) 202 | 194/Effect_FungalGrowth_screen (3/queued [2]; 0/screen [84, 84]) 203 | 195/Effect_GhostSnipe_screen (3/queued [2]; 0/screen [84, 84]) 204 | 196/Effect_GravitonBeam_screen (3/queued [2]; 0/screen [84, 84]) 205 | 197/Effect_GuardianShield_quick (3/queued [2]) 206 | 198/Effect_Heal_screen (3/queued [2]; 0/screen [84, 84]) 207 | 199/Effect_Heal_autocast () 208 | 200/Effect_HunterSeekerMissile_screen (3/queued [2]; 0/screen [84, 84]) 209 | 201/Effect_ImmortalBarrier_quick (3/queued [2]) 210 | 202/Effect_ImmortalBarrier_autocast () 211 | 203/Effect_InfestedTerrans_screen (3/queued [2]; 0/screen [84, 84]) 212 | 204/Effect_InjectLarva_screen (3/queued [2]; 0/screen [84, 84]) 213 | 205/Effect_KD8Charge_screen (3/queued [2]; 0/screen [84, 84]) 214 | 206/Effect_LockOn_screen (3/queued [2]; 0/screen [84, 84]) 215 | 207/Effect_LocustSwoop_screen (3/queued [2]; 0/screen [84, 84]) 216 | 208/Effect_MassRecall_screen (3/queued [2]; 0/screen [84, 84]) 217 | 209/Effect_MassRecall_Mothership_screen (3/queued [2]; 0/screen [84, 84]) 218 | 210/Effect_MassRecall_MothershipCore_screen (3/queued [2]; 0/screen [84, 84]) 219 | 211/Effect_MedivacIgniteAfterburners_quick (3/queued [2]) 220 | 212/Effect_NeuralParasite_screen (3/queued [2]; 0/screen [84, 84]) 221 | 213/Effect_NukeCalldown_screen (3/queued [2]; 0/screen [84, 84]) 222 | 214/Effect_OracleRevelation_screen (3/queued [2]; 0/screen [84, 84]) 223 | 215/Effect_ParasiticBomb_screen (3/queued [2]; 0/screen [84, 84]) 224 | 216/Effect_PhotonOvercharge_screen (3/queued [2]; 0/screen [84, 84]) 225 | 217/Effect_PointDefenseDrone_screen (3/queued [2]; 0/screen [84, 84]) 226 | 218/Effect_PsiStorm_screen (3/queued [2]; 0/screen [84, 84]) 227 | 219/Effect_PurificationNova_screen (3/queued [2]; 0/screen [84, 84]) 228 | 220/Effect_Repair_screen (3/queued [2]; 0/screen [84, 84]) 229 | 221/Effect_Repair_autocast () 230 | 222/Effect_Repair_Mule_screen (3/queued [2]; 0/screen [84, 84]) 231 | 223/Effect_Repair_Mule_autocast () 232 | 224/Effect_Repair_SCV_screen (3/queued [2]; 0/screen [84, 84]) 233 | 225/Effect_Repair_SCV_autocast () 234 | 226/Effect_Salvage_quick (3/queued [2]) 235 | 227/Effect_Scan_screen (3/queued [2]; 0/screen [84, 84]) 236 | 228/Effect_SpawnChangeling_quick (3/queued [2]) 237 | 229/Effect_SpawnLocusts_screen (3/queued [2]; 0/screen [84, 84]) 238 | 230/Effect_Spray_screen (3/queued [2]; 0/screen [84, 84]) 239 | 231/Effect_Spray_Protoss_screen (3/queued [2]; 0/screen [84, 84]) 240 | 232/Effect_Spray_Terran_screen (3/queued [2]; 0/screen [84, 84]) 241 | 233/Effect_Spray_Zerg_screen (3/queued [2]; 0/screen [84, 84]) 242 | 234/Effect_Stim_quick (3/queued [2]) 243 | 235/Effect_Stim_Marauder_quick (3/queued [2]) 244 | 236/Effect_Stim_Marauder_Redirect_quick (3/queued [2]) 245 | 237/Effect_Stim_Marine_quick (3/queued [2]) 246 | 238/Effect_Stim_Marine_Redirect_quick (3/queued [2]) 247 | 239/Effect_SupplyDrop_screen (3/queued [2]; 0/screen [84, 84]) 248 | 240/Effect_TacticalJump_screen (3/queued [2]; 0/screen [84, 84]) 249 | 241/Effect_TimeWarp_screen (3/queued [2]; 0/screen [84, 84]) 250 | 
242/Effect_Transfusion_screen (3/queued [2]; 0/screen [84, 84]) 251 | 243/Effect_ViperConsume_screen (3/queued [2]; 0/screen [84, 84]) 252 | 244/Effect_VoidRayPrismaticAlignment_quick (3/queued [2]) 253 | 245/Effect_WidowMineAttack_screen (3/queued [2]; 0/screen [84, 84]) 254 | 246/Effect_WidowMineAttack_autocast () 255 | 247/Effect_YamatoGun_screen (3/queued [2]; 0/screen [84, 84]) 256 | 248/Hallucination_Adept_quick (3/queued [2]) 257 | 249/Hallucination_Archon_quick (3/queued [2]) 258 | 250/Hallucination_Colossus_quick (3/queued [2]) 259 | 251/Hallucination_Disruptor_quick (3/queued [2]) 260 | 252/Hallucination_HighTemplar_quick (3/queued [2]) 261 | 253/Hallucination_Immortal_quick (3/queued [2]) 262 | 254/Hallucination_Oracle_quick (3/queued [2]) 263 | 255/Hallucination_Phoenix_quick (3/queued [2]) 264 | 256/Hallucination_Probe_quick (3/queued [2]) 265 | 257/Hallucination_Stalker_quick (3/queued [2]) 266 | 258/Hallucination_VoidRay_quick (3/queued [2]) 267 | 259/Hallucination_WarpPrism_quick (3/queued [2]) 268 | 260/Hallucination_Zealot_quick (3/queued [2]) 269 | 261/Halt_quick (3/queued [2]) 270 | 262/Halt_Building_quick (3/queued [2]) 271 | 263/Halt_TerranBuild_quick (3/queued [2]) 272 | 264/Harvest_Gather_screen (3/queued [2]; 0/screen [84, 84]) 273 | 265/Harvest_Gather_Drone_screen (3/queued [2]; 0/screen [84, 84]) 274 | 266/Harvest_Gather_Mule_screen (3/queued [2]; 0/screen [84, 84]) 275 | 267/Harvest_Gather_Probe_screen (3/queued [2]; 0/screen [84, 84]) 276 | 268/Harvest_Gather_SCV_screen (3/queued [2]; 0/screen [84, 84]) 277 | 269/Harvest_Return_quick (3/queued [2]) 278 | 270/Harvest_Return_Drone_quick (3/queued [2]) 279 | 271/Harvest_Return_Mule_quick (3/queued [2]) 280 | 272/Harvest_Return_Probe_quick (3/queued [2]) 281 | 273/Harvest_Return_SCV_quick (3/queued [2]) 282 | 274/HoldPosition_quick (3/queued [2]) 283 | 275/Land_screen (3/queued [2]; 0/screen [84, 84]) 284 | 276/Land_Barracks_screen (3/queued [2]; 0/screen [84, 84]) 285 | 277/Land_CommandCenter_screen (3/queued [2]; 0/screen [84, 84]) 286 | 278/Land_Factory_screen (3/queued [2]; 0/screen [84, 84]) 287 | 279/Land_OrbitalCommand_screen (3/queued [2]; 0/screen [84, 84]) 288 | 280/Land_Starport_screen (3/queued [2]; 0/screen [84, 84]) 289 | 281/Lift_quick (3/queued [2]) 290 | 282/Lift_Barracks_quick (3/queued [2]) 291 | 283/Lift_CommandCenter_quick (3/queued [2]) 292 | 284/Lift_Factory_quick (3/queued [2]) 293 | 285/Lift_OrbitalCommand_quick (3/queued [2]) 294 | 286/Lift_Starport_quick (3/queued [2]) 295 | 287/Load_screen (3/queued [2]; 0/screen [84, 84]) 296 | 288/Load_Bunker_screen (3/queued [2]; 0/screen [84, 84]) 297 | 289/Load_Medivac_screen (3/queued [2]; 0/screen [84, 84]) 298 | 290/Load_NydusNetwork_screen (3/queued [2]; 0/screen [84, 84]) 299 | 291/Load_NydusWorm_screen (3/queued [2]; 0/screen [84, 84]) 300 | 292/Load_Overlord_screen (3/queued [2]; 0/screen [84, 84]) 301 | 293/Load_WarpPrism_screen (3/queued [2]; 0/screen [84, 84]) 302 | 294/LoadAll_quick (3/queued [2]) 303 | 295/LoadAll_CommandCenter_quick (3/queued [2]) 304 | 296/Morph_Archon_quick (3/queued [2]) 305 | 297/Morph_BroodLord_quick (3/queued [2]) 306 | 298/Morph_Gateway_quick (3/queued [2]) 307 | 299/Morph_GreaterSpire_quick (3/queued [2]) 308 | 300/Morph_Hellbat_quick (3/queued [2]) 309 | 301/Morph_Hellion_quick (3/queued [2]) 310 | 302/Morph_Hive_quick (3/queued [2]) 311 | 303/Morph_Lair_quick (3/queued [2]) 312 | 304/Morph_LiberatorAAMode_quick (3/queued [2]) 313 | 305/Morph_LiberatorAGMode_screen (3/queued [2]; 0/screen [84, 84]) 314 | 
306/Morph_Lurker_quick (3/queued [2]) 315 | 307/Morph_LurkerDen_quick (3/queued [2]) 316 | 308/Morph_Mothership_quick (3/queued [2]) 317 | 309/Morph_OrbitalCommand_quick (3/queued [2]) 318 | 310/Morph_OverlordTransport_quick (3/queued [2]) 319 | 311/Morph_Overseer_quick (3/queued [2]) 320 | 312/Morph_PlanetaryFortress_quick (3/queued [2]) 321 | 313/Morph_Ravager_quick (3/queued [2]) 322 | 314/Morph_Root_screen (3/queued [2]; 0/screen [84, 84]) 323 | 315/Morph_SpineCrawlerRoot_screen (3/queued [2]; 0/screen [84, 84]) 324 | 316/Morph_SporeCrawlerRoot_screen (3/queued [2]; 0/screen [84, 84]) 325 | 317/Morph_SiegeMode_quick (3/queued [2]) 326 | 318/Morph_SupplyDepot_Lower_quick (3/queued [2]) 327 | 319/Morph_SupplyDepot_Raise_quick (3/queued [2]) 328 | 320/Morph_ThorExplosiveMode_quick (3/queued [2]) 329 | 321/Morph_ThorHighImpactMode_quick (3/queued [2]) 330 | 322/Morph_Unsiege_quick (3/queued [2]) 331 | 323/Morph_Uproot_quick (3/queued [2]) 332 | 324/Morph_SpineCrawlerUproot_quick (3/queued [2]) 333 | 325/Morph_SporeCrawlerUproot_quick (3/queued [2]) 334 | 326/Morph_VikingAssaultMode_quick (3/queued [2]) 335 | 327/Morph_VikingFighterMode_quick (3/queued [2]) 336 | 328/Morph_WarpGate_quick (3/queued [2]) 337 | 329/Morph_WarpPrismPhasingMode_quick (3/queued [2]) 338 | 330/Morph_WarpPrismTransportMode_quick (3/queued [2]) 339 | 331/Move_screen (3/queued [2]; 0/screen [84, 84]) 340 | 332/Move_minimap (3/queued [2]; 1/minimap [64, 64]) 341 | 333/Patrol_screen (3/queued [2]; 0/screen [84, 84]) 342 | 334/Patrol_minimap (3/queued [2]; 1/minimap [64, 64]) 343 | 335/Rally_Units_screen (3/queued [2]; 0/screen [84, 84]) 344 | 336/Rally_Units_minimap (3/queued [2]; 1/minimap [64, 64]) 345 | 337/Rally_Building_screen (3/queued [2]; 0/screen [84, 84]) 346 | 338/Rally_Building_minimap (3/queued [2]; 1/minimap [64, 64]) 347 | 339/Rally_Hatchery_Units_screen (3/queued [2]; 0/screen [84, 84]) 348 | 340/Rally_Hatchery_Units_minimap (3/queued [2]; 1/minimap [64, 64]) 349 | 341/Rally_Morphing_Unit_screen (3/queued [2]; 0/screen [84, 84]) 350 | 342/Rally_Morphing_Unit_minimap (3/queued [2]; 1/minimap [64, 64]) 351 | 343/Rally_Workers_screen (3/queued [2]; 0/screen [84, 84]) 352 | 344/Rally_Workers_minimap (3/queued [2]; 1/minimap [64, 64]) 353 | 345/Rally_CommandCenter_screen (3/queued [2]; 0/screen [84, 84]) 354 | 346/Rally_CommandCenter_minimap (3/queued [2]; 1/minimap [64, 64]) 355 | 347/Rally_Hatchery_Workers_screen (3/queued [2]; 0/screen [84, 84]) 356 | 348/Rally_Hatchery_Workers_minimap (3/queued [2]; 1/minimap [64, 64]) 357 | 349/Rally_Nexus_screen (3/queued [2]; 0/screen [84, 84]) 358 | 350/Rally_Nexus_minimap (3/queued [2]; 1/minimap [64, 64]) 359 | 351/Research_AdeptResonatingGlaives_quick (3/queued [2]) 360 | 352/Research_AdvancedBallistics_quick (3/queued [2]) 361 | 353/Research_BansheeCloakingField_quick (3/queued [2]) 362 | 354/Research_BansheeHyperflightRotors_quick (3/queued [2]) 363 | 355/Research_BattlecruiserWeaponRefit_quick (3/queued [2]) 364 | 356/Research_Blink_quick (3/queued [2]) 365 | 357/Research_Burrow_quick (3/queued [2]) 366 | 358/Research_CentrifugalHooks_quick (3/queued [2]) 367 | 359/Research_Charge_quick (3/queued [2]) 368 | 360/Research_ChitinousPlating_quick (3/queued [2]) 369 | 361/Research_CombatShield_quick (3/queued [2]) 370 | 362/Research_ConcussiveShells_quick (3/queued [2]) 371 | 363/Research_DrillingClaws_quick (3/queued [2]) 372 | 364/Research_ExtendedThermalLance_quick (3/queued [2]) 373 | 365/Research_GlialRegeneration_quick (3/queued [2]) 374 | 
366/Research_GraviticBooster_quick (3/queued [2]) 375 | 367/Research_GraviticDrive_quick (3/queued [2]) 376 | 368/Research_GroovedSpines_quick (3/queued [2]) 377 | 369/Research_HiSecAutoTracking_quick (3/queued [2]) 378 | 370/Research_HighCapacityFuelTanks_quick (3/queued [2]) 379 | 371/Research_InfernalPreigniter_quick (3/queued [2]) 380 | 372/Research_InterceptorGravitonCatapult_quick (3/queued [2]) 381 | 373/Research_MagFieldLaunchers_quick (3/queued [2]) 382 | 374/Research_MuscularAugments_quick (3/queued [2]) 383 | 375/Research_NeosteelFrame_quick (3/queued [2]) 384 | 376/Research_NeuralParasite_quick (3/queued [2]) 385 | 377/Research_PathogenGlands_quick (3/queued [2]) 386 | 378/Research_PersonalCloaking_quick (3/queued [2]) 387 | 379/Research_PhoenixAnionPulseCrystals_quick (3/queued [2]) 388 | 380/Research_PneumatizedCarapace_quick (3/queued [2]) 389 | 381/Research_ProtossAirArmor_quick (3/queued [2]) 390 | 382/Research_ProtossAirArmorLevel1_quick (3/queued [2]) 391 | 383/Research_ProtossAirArmorLevel2_quick (3/queued [2]) 392 | 384/Research_ProtossAirArmorLevel3_quick (3/queued [2]) 393 | 385/Research_ProtossAirWeapons_quick (3/queued [2]) 394 | 386/Research_ProtossAirWeaponsLevel1_quick (3/queued [2]) 395 | 387/Research_ProtossAirWeaponsLevel2_quick (3/queued [2]) 396 | 388/Research_ProtossAirWeaponsLevel3_quick (3/queued [2]) 397 | 389/Research_ProtossGroundArmor_quick (3/queued [2]) 398 | 390/Research_ProtossGroundArmorLevel1_quick (3/queued [2]) 399 | 391/Research_ProtossGroundArmorLevel2_quick (3/queued [2]) 400 | 392/Research_ProtossGroundArmorLevel3_quick (3/queued [2]) 401 | 393/Research_ProtossGroundWeapons_quick (3/queued [2]) 402 | 394/Research_ProtossGroundWeaponsLevel1_quick (3/queued [2]) 403 | 395/Research_ProtossGroundWeaponsLevel2_quick (3/queued [2]) 404 | 396/Research_ProtossGroundWeaponsLevel3_quick (3/queued [2]) 405 | 397/Research_ProtossShields_quick (3/queued [2]) 406 | 398/Research_ProtossShieldsLevel1_quick (3/queued [2]) 407 | 399/Research_ProtossShieldsLevel2_quick (3/queued [2]) 408 | 400/Research_ProtossShieldsLevel3_quick (3/queued [2]) 409 | 401/Research_PsiStorm_quick (3/queued [2]) 410 | 402/Research_RavenCorvidReactor_quick (3/queued [2]) 411 | 403/Research_RavenRecalibratedExplosives_quick (3/queued [2]) 412 | 404/Research_ShadowStrike_quick (3/queued [2]) 413 | 405/Research_Stimpack_quick (3/queued [2]) 414 | 406/Research_TerranInfantryArmor_quick (3/queued [2]) 415 | 407/Research_TerranInfantryArmorLevel1_quick (3/queued [2]) 416 | 408/Research_TerranInfantryArmorLevel2_quick (3/queued [2]) 417 | 409/Research_TerranInfantryArmorLevel3_quick (3/queued [2]) 418 | 410/Research_TerranInfantryWeapons_quick (3/queued [2]) 419 | 411/Research_TerranInfantryWeaponsLevel1_quick (3/queued [2]) 420 | 412/Research_TerranInfantryWeaponsLevel2_quick (3/queued [2]) 421 | 413/Research_TerranInfantryWeaponsLevel3_quick (3/queued [2]) 422 | 414/Research_TerranShipWeapons_quick (3/queued [2]) 423 | 415/Research_TerranShipWeaponsLevel1_quick (3/queued [2]) 424 | 416/Research_TerranShipWeaponsLevel2_quick (3/queued [2]) 425 | 417/Research_TerranShipWeaponsLevel3_quick (3/queued [2]) 426 | 418/Research_TerranStructureArmorUpgrade_quick (3/queued [2]) 427 | 419/Research_TerranVehicleAndShipPlating_quick (3/queued [2]) 428 | 420/Research_TerranVehicleAndShipPlatingLevel1_quick (3/queued [2]) 429 | 421/Research_TerranVehicleAndShipPlatingLevel2_quick (3/queued [2]) 430 | 422/Research_TerranVehicleAndShipPlatingLevel3_quick (3/queued [2]) 431 | 
423/Research_TerranVehicleWeapons_quick (3/queued [2]) 432 | 424/Research_TerranVehicleWeaponsLevel1_quick (3/queued [2]) 433 | 425/Research_TerranVehicleWeaponsLevel2_quick (3/queued [2]) 434 | 426/Research_TerranVehicleWeaponsLevel3_quick (3/queued [2]) 435 | 427/Research_TunnelingClaws_quick (3/queued [2]) 436 | 428/Research_WarpGate_quick (3/queued [2]) 437 | 429/Research_ZergFlyerArmor_quick (3/queued [2]) 438 | 430/Research_ZergFlyerArmorLevel1_quick (3/queued [2]) 439 | 431/Research_ZergFlyerArmorLevel2_quick (3/queued [2]) 440 | 432/Research_ZergFlyerArmorLevel3_quick (3/queued [2]) 441 | 433/Research_ZergFlyerAttack_quick (3/queued [2]) 442 | 434/Research_ZergFlyerAttackLevel1_quick (3/queued [2]) 443 | 435/Research_ZergFlyerAttackLevel2_quick (3/queued [2]) 444 | 436/Research_ZergFlyerAttackLevel3_quick (3/queued [2]) 445 | 437/Research_ZergGroundArmor_quick (3/queued [2]) 446 | 438/Research_ZergGroundArmorLevel1_quick (3/queued [2]) 447 | 439/Research_ZergGroundArmorLevel2_quick (3/queued [2]) 448 | 440/Research_ZergGroundArmorLevel3_quick (3/queued [2]) 449 | 441/Research_ZergMeleeWeapons_quick (3/queued [2]) 450 | 442/Research_ZergMeleeWeaponsLevel1_quick (3/queued [2]) 451 | 443/Research_ZergMeleeWeaponsLevel2_quick (3/queued [2]) 452 | 444/Research_ZergMeleeWeaponsLevel3_quick (3/queued [2]) 453 | 445/Research_ZergMissileWeapons_quick (3/queued [2]) 454 | 446/Research_ZergMissileWeaponsLevel1_quick (3/queued [2]) 455 | 447/Research_ZergMissileWeaponsLevel2_quick (3/queued [2]) 456 | 448/Research_ZergMissileWeaponsLevel3_quick (3/queued [2]) 457 | 449/Research_ZerglingAdrenalGlands_quick (3/queued [2]) 458 | 450/Research_ZerglingMetabolicBoost_quick (3/queued [2]) 459 | 451/Smart_screen (3/queued [2]; 0/screen [84, 84]) 460 | 452/Smart_minimap (3/queued [2]; 1/minimap [64, 64]) 461 | 453/Stop_quick (3/queued [2]) 462 | 454/Stop_Building_quick (3/queued [2]) 463 | 455/Stop_Redirect_quick (3/queued [2]) 464 | 456/Stop_Stop_quick (3/queued [2]) 465 | 457/Train_Adept_quick (3/queued [2]) 466 | 458/Train_Baneling_quick (3/queued [2]) 467 | 459/Train_Banshee_quick (3/queued [2]) 468 | 460/Train_Battlecruiser_quick (3/queued [2]) 469 | 461/Train_Carrier_quick (3/queued [2]) 470 | 462/Train_Colossus_quick (3/queued [2]) 471 | 463/Train_Corruptor_quick (3/queued [2]) 472 | 464/Train_Cyclone_quick (3/queued [2]) 473 | 465/Train_DarkTemplar_quick (3/queued [2]) 474 | 466/Train_Disruptor_quick (3/queued [2]) 475 | 467/Train_Drone_quick (3/queued [2]) 476 | 468/Train_Ghost_quick (3/queued [2]) 477 | 469/Train_Hellbat_quick (3/queued [2]) 478 | 470/Train_Hellion_quick (3/queued [2]) 479 | 471/Train_HighTemplar_quick (3/queued [2]) 480 | 472/Train_Hydralisk_quick (3/queued [2]) 481 | 473/Train_Immortal_quick (3/queued [2]) 482 | 474/Train_Infestor_quick (3/queued [2]) 483 | 475/Train_Liberator_quick (3/queued [2]) 484 | 476/Train_Marauder_quick (3/queued [2]) 485 | 477/Train_Marine_quick (3/queued [2]) 486 | 478/Train_Medivac_quick (3/queued [2]) 487 | 479/Train_MothershipCore_quick (3/queued [2]) 488 | 480/Train_Mutalisk_quick (3/queued [2]) 489 | 481/Train_Observer_quick (3/queued [2]) 490 | 482/Train_Oracle_quick (3/queued [2]) 491 | 483/Train_Overlord_quick (3/queued [2]) 492 | 484/Train_Phoenix_quick (3/queued [2]) 493 | 485/Train_Probe_quick (3/queued [2]) 494 | 486/Train_Queen_quick (3/queued [2]) 495 | 487/Train_Raven_quick (3/queued [2]) 496 | 488/Train_Reaper_quick (3/queued [2]) 497 | 489/Train_Roach_quick (3/queued [2]) 498 | 490/Train_SCV_quick (3/queued [2]) 499 | 
491/Train_Sentry_quick (3/queued [2]) 500 | 492/Train_SiegeTank_quick (3/queued [2]) 501 | 493/Train_Stalker_quick (3/queued [2]) 502 | 494/Train_SwarmHost_quick (3/queued [2]) 503 | 495/Train_Tempest_quick (3/queued [2]) 504 | 496/Train_Thor_quick (3/queued [2]) 505 | 497/Train_Ultralisk_quick (3/queued [2]) 506 | 498/Train_VikingFighter_quick (3/queued [2]) 507 | 499/Train_Viper_quick (3/queued [2]) 508 | 500/Train_VoidRay_quick (3/queued [2]) 509 | 501/Train_WarpPrism_quick (3/queued [2]) 510 | 502/Train_WidowMine_quick (3/queued [2]) 511 | 503/Train_Zealot_quick (3/queued [2]) 512 | 504/Train_Zergling_quick (3/queued [2]) 513 | 505/TrainWarp_Adept_screen (3/queued [2]; 0/screen [84, 84]) 514 | 506/TrainWarp_DarkTemplar_screen (3/queued [2]; 0/screen [84, 84]) 515 | 507/TrainWarp_HighTemplar_screen (3/queued [2]; 0/screen [84, 84]) 516 | 508/TrainWarp_Sentry_screen (3/queued [2]; 0/screen [84, 84]) 517 | 509/TrainWarp_Stalker_screen (3/queued [2]; 0/screen [84, 84]) 518 | 510/TrainWarp_Zealot_screen (3/queued [2]; 0/screen [84, 84]) 519 | 511/UnloadAll_quick (3/queued [2]) 520 | 512/UnloadAll_Bunker_quick (3/queued [2]) 521 | 513/UnloadAll_CommandCenter_quick (3/queued [2]) 522 | 514/UnloadAll_NydasNetwork_quick (3/queued [2]) 523 | 515/UnloadAll_NydusWorm_quick (3/queued [2]) 524 | 516/UnloadAllAt_screen (3/queued [2]; 0/screen [84, 84]) 525 | 517/UnloadAllAt_minimap (3/queued [2]; 1/minimap [64, 64]) 526 | 518/UnloadAllAt_Medivac_screen (3/queued [2]; 0/screen [84, 84]) 527 | 519/UnloadAllAt_Medivac_minimap (3/queued [2]; 1/minimap [64, 64]) 528 | 520/UnloadAllAt_Overlord_screen (3/queued [2]; 0/screen [84, 84]) 529 | 521/UnloadAllAt_Overlord_minimap (3/queued [2]; 1/minimap [64, 64]) 530 | 522/UnloadAllAt_WarpPrism_screen (3/queued [2]; 0/screen [84, 84]) 531 | 523/UnloadAllAt_WarpPrism_minimap (3/queued [2]; 1/minimap [64, 64]) 532 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pysc2-RLagents 2 | Notes and scripts for SC2LE (the StarCraft II Learning Environment) released by DeepMind and Blizzard; more details [here](https://github.com/deepmind/pysc2). 3 | 4 | #### There seems to be a bug where the agent's performance drops drastically after prolonged training, e.g. in MoveToBeacon from 25 to 1. I'm still trying to work on this when I can spare the time.
5 | 6 | ## Important Links 7 | 8 | [Original SC2LE Paper](https://deepmind.com/documents/110/sc2le.pdf) 9 | 10 | [DeepMind blog post](https://deepmind.com/blog/deepmind-and-blizzard-open-starcraft-ii-ai-research-environment/) 11 | 12 | [Blizzard blog post](http://us.battle.net/sc2/en/blog/20944009) 13 | 14 | [PySC2 repo](https://github.com/deepmind/pysc2) 15 | 16 | [Blizzard's SC2 API](https://github.com/Blizzard/s2client-api) 17 | 18 | [Blizzard's SC2 API Protocol](https://github.com/Blizzard/s2client-proto) 19 | 20 | [Python library for SC2 API Protocol](https://pypi.python.org/pypi/s2clientprotocol/) 21 | 22 | ## Work by others 23 | 24 | Chris' [blog post](http://chris-chris.ai/2017/08/30/pysc2-tutorial1/) and [repo](https://github.com/chris-chris/pysc2-examples) 25 | 26 | Siraj's [YouTube tutorial](https://www.youtube.com/watch?v=URWXG5jRB-A&feature=youtu.be) and accompanying [code](https://github.com/llSourcell/A-Guide-to-DeepMinds-StarCraft-AI-Environment) 27 | 28 | Steven's Medium articles for [a simple scripted agent](https://chatbotslife.com/building-a-basic-pysc2-agent-b109cde1477c) and [one based on Q-tables](https://chatbotslife.com/building-a-smart-pysc2-agent-cdc269cb095d) 29 | 30 | pekaalto's [work](https://github.com/pekaalto/sc2atari) on adapting OpenAI's gym environment to SC2LE, with an implementation of the FullyConv algorithm and results on three minigames 31 | 32 | Arthur Juliani's [posts](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2) and [repo](https://github.com/awjuliani/DeepRL-Agents) for RL agents 33 | 34 | Not SC2LE-specific, but mentioned here because my agent scripts were built on Juliani's A3C implementation. 35 | 36 | Let me know if anyone else is also working on this and I'll add a link here! 37 | 38 | ## Notes 39 | 40 | Contains general notes on working with SC2LE. 41 | 42 | ### Total Action Space 43 | 44 | The entire unfiltered action space for an SC2LE agent. 45 | 46 | It contains 524 base actions / functions, corresponding to 101,938,719 possible actions given a minimap_resolution of (64, 64) and a screen_resolution of (84, 84). 47 | 48 | ### List of Action Argument Types 49 | 50 | The entire list of action argument types for use in the actions / functions. 51 | 52 | It contains 13 argument types with descriptions. 53 | 54 | ### Running an Agent 55 | 56 | Notes on running an agent in the `pysc2.env.sc2_env.SC2Env` environment. In particular, it details and briefly describes the TimeStep object (observation) that is fed to an agent's step function, i.e. the object returned from calling the environment's step function. 57 | 58 | ## ResearchLog 59 | 60 | Contains notes on developing RL agents for SC2LE. 61 | 62 | ## Agents 63 | 64 | Contains scripts for training and running RL agents in SC2LE. 65 | 66 | ### `PySC2_A3C_FullyConv.py` 67 | 68 | This script implements the A3C algorithm for SC2LE with the FullyConv architecture described in DeepMind's paper. The code is based on Arthur Juliani's A3C implementation for the VizDoom environment (see above). 69 | 70 | To run the script, use the following command: 71 | 72 | `python PySC2_A3C_FullyConv.py --map_name MoveToBeacon` 73 | 74 | If `--map_name` is not supplied, the script runs DefeatRoaches by default. 75 | 76 | ### `PySC2_A3C_AtariNet.py` 77 | 78 | This script implements the A3C algorithm for SC2LE with the Atari-net architecture described in DeepMind's paper. 
The code is based on Arthur Juliani's A3C implementation for the VizDoom environment (see above). 79 | 80 | This is a generalized version of PySC2_A3C_old.py that works for all minigames and also contains some bug fixes. 81 | 82 | To run the script, use the following command: 83 | 84 | `python PySC2_A3C_AtariNet.py --map_name MoveToBeacon` 85 | 86 | If `--map_name` is not supplied, the script runs DefeatRoaches by default. 87 | 88 | ### `PySC2_A3C_old.py` 89 | 90 | #### This is an initial script that only works for the DefeatRoaches minigame. There is also a model file in this repo that will load if you just run `python PySC2_A3C_old.py`. 91 | 92 | I initially focused on the DefeatRoaches minigame, so I only took in 7 screen features and 3 nonspatial features for the state space, and the action space is limited to 17 base actions and their relevant arguments. 93 | 94 | For the action space, I modeled the base actions and arguments independently. In addition, x and y coordinates are modeled independently for spatial arguments, to further reduce the effective action space. 95 | 96 | The agent currently samples actions from the distributions returned by the policy networks, instead of using an epsilon-greedy strategy. 97 | 98 | Also, the policy networks for the arguments are updated regardless of whether the argument was used (e.g. even if a no_op action is taken, the argument policies are still updated), which should probably be corrected. 99 | 100 | I will be updating this to work with all the minigames. 101 | 102 | As of ~10 million steps on DefeatRoaches, the agent achieved max and average scores of 338 and 65, compared to DeepMind's Atari-net agent, which achieved max and average scores of 351 and 101 after 600 million steps. 103 | -------------------------------------------------------------------------------- /ResearchLog/2017-11-01.md: -------------------------------------------------------------------------------- 1 | # SC2LE Research Log 2 | ## Entry #1 3 | ### 2017-11-01 4 | 5 | ## Current Progress: 6 | 7 | I am currently able to run an A3C script adapted from AWJuliani's [implementation](https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2). 8 | 9 | I limited the observation features to `TimeStep.observation['single_select']`, which is a (1,7) tensor. 10 | The actions are sampled from base_actions (17-dim), but the input action to `env.step` is fixed to `no_op`. 11 | The action returned to the A3C algorithm is also fixed to 0. 12 | 13 | For faster debugging I run only 1 thread, but the script also works with 4 threads. 14 | 15 | Applying a random reward results in corresponding changes in the value function and the action distribution / policy. 16 | 17 | ## Next Steps: 18 | 19 | I am planning to pass in the actual sampled action, with the arguments for the actions fixed. 20 | The action returned to the A3C algorithm should also come from this input. -------------------------------------------------------------------------------- /ResearchLog/2017-11-06.md: -------------------------------------------------------------------------------- 1 | # SC2LE Research Log 2 | ## Entry #2 3 | ### 2017-11-06 4 | 5 | ## Current Progress: 6 | 7 | As planned in the last research log, I have been able to sample the 17-dim action space for the DefeatRoaches minigame and pass the sampled action to be executed in the environment. 8 | I also fed the sampled action back into the update instead of returning a fixed action. 
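As a concrete illustration of that step, here is a minimal sketch (illustrative helper name, not the actual code in the agent script) of how a sampled base-action id can be wrapped into a pysc2 `FunctionCall` with every argument fixed to 0 before being passed to `env.step`; the exact `action_spec` / `env.step` signatures vary slightly between pysc2 versions:

```python
from pysc2.lib import actions

def make_fixed_arg_action(action_id, action_spec):
    """Build a FunctionCall for `action_id` with every argument zeroed out."""
    args = []
    for arg in action_spec.functions[action_id].args:
        # e.g. a 'queued' argument needs 1 value, a 'screen' argument needs 2
        args.append([0] * len(arg.sizes))
    return actions.FunctionCall(action_id, args)

# Inside the worker loop (env is an sc2_env.SC2Env instance):
#   action_id = id sampled from the policy over the 17 candidate base actions
#   obs = env.step([make_fixed_arg_action(action_id, env.action_spec())])
```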
9 | 10 | One thing to note: I had assumed that all 17 actions would be available throughout the episode, based on an initial test run where I printed the available actions from the environment. That assumption was quickly proven wrong when running the script: action `5/select_unit` was unavailable at certain steps, causing the script to terminate. 11 | 12 | To remedy that, I used a while loop to keep sampling until the sampled action is in the set of actions available in that state. On a side note, there might be more available actions than the 17 I'm currently using. 13 | 14 | Also, the arguments for the 17 actions are fixed (basically all 0s where a number is needed), so I effectively see the agent repeatedly moving the marines to the top left of the screen. This tells me two things: first, the arguments are passed correctly; second, the screen coordinates have their origin at the top left. 15 | 16 | Finally, at this point, the features used to represent the state are still only `TimeStep.observation['single_select']` (1,7). 17 | 18 | ## Next Steps: 19 | 20 | - Change the features to the `TimeStep.observation['screen'][5]` (player_relative) (84,84) tensor 21 | - Use a CNN to process the screen feature 22 | - If that works, use both the screen feature and `TimeStep.observation['single_select']` (1,7) 23 | - If that works, use all 7 relevant screen channels and 3 relevant nonspatial features 24 | - If that works, sample the spatial arguments -------------------------------------------------------------------------------- /ResearchLog/2017-11-07.md: -------------------------------------------------------------------------------- 1 | # SC2LE Research Log 2 | ## Entry #3 3 | ### 2017-11-07 4 | 5 | ## Current Progress: 6 | 7 | Replaced the while loop that filtered out unavailable actions with the following: 8 | 9 | - Set the probabilities of the unavailable actions to 0 10 | - Normalize the new probability distribution 11 | 12 | This follows the approach laid out in DeepMind's PySC2 paper. 13 | 14 | Incorporated all 7 screen features and the 3 nonspatial features that are relevant to the DefeatRoaches minigame. I did not include the minimap features since no screen movement is required in this case, i.e. the screen encompasses the entire map. 15 | 16 | One problem with incorporating the multi_select nonspatial feature is that the dimensions of the multi_select tensor are (n,7), where n is the number of units selected. Since the input size has to be fixed, I assume that n <= 100 and, if n < 100, pad the unrolled tensor with zeros. 17 | 18 | While adding the screen features, I realized that the screen tensor dimensions for the DefeatRoaches minigame were actually (64,64) instead of (84,84). 19 | 20 | I also modeled individual policies for base_actions and all relevant argument types (x and y coordinates are modeled independently as well). The entropy and policy_loss are calculated as the sum of the individual entropies and policy_losses of the individual policies. 21 | 22 | Added `tf.clip_by_value` to all `tf.log` calls to prevent numerical issues from taking the log of 0. 23 | 24 | Added `global` variables `_max_score` and `_running_avg_score` that record the maximum score and an exponentially-weighted moving average of the score, respectively. `running_avg_score` is calculated as 25 | `running_avg_score = (2.0 / 101) * (episode_reward - running_avg_score) + running_avg_score` so that the average is taken approximately over the last 100 episodes. 
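To make the mask-and-renormalize step at the top of this entry concrete, here is a minimal sketch (illustrative names, not the exact code in the agent script); `available_action_ids` would come from `TimeStep.observation['available_actions']`, and `candidate_action_ids` is whatever subset of base actions the script models:

```python
import numpy as np

def mask_unavailable_actions(action_dist, candidate_action_ids, available_action_ids):
    """Zero the probability of any candidate action that is not currently
    available, then renormalize so the probabilities sum to 1 again."""
    available = set(available_action_ids)
    mask = np.array([1.0 if a in available else 0.0 for a in candidate_action_ids])
    masked = action_dist * mask
    # assumes at least one candidate action is always available (e.g. no_op)
    return masked / masked.sum()
```

Sampling then proceeds from the renormalized distribution exactly as before, with no resampling loop needed.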
26 | 27 | Another thing to point out is that the argument policies are updated regardless of whether the argument was used. E.g. even if the no_op action is taken, all argument policy networks are updated even though no arguments were accepted. In that sense, there is a lot of noise in the updates to the argument policy networks. 28 | 29 | ## Next Steps: 30 | 31 | - Generalize the script to run for all minigames 32 | - Expand the accepted features and sampled actions/arguments --------------------------------------------------------------------------------