├── pong.bin ├── constants.pyc ├── game_state.pyc ├── game_ac_network.pyc ├── rmsprop_applier.pyc ├── a3c_training_thread.pyc ├── cont_action ├── constants.pyc ├── game_state.pyc ├── tmp │ └── a3c_log │ │ └── cur │ │ ├── checkpoint │ │ ├── model.ckpt-0.index │ │ ├── model.ckpt-0.meta │ │ ├── model.ckpt-0.data-00000-of-00001 │ │ └── events.out.tfevents.1514256902.jaesik-System-Product-Name ├── game_ac_network.pyc ├── rmsprop_applier.pyc ├── a3c_training_thread.pyc ├── auto_run.sh ├── constants.py ├── game_state_test.py ├── rmsprop_applier_test.py ├── game_state.py ├── rmsprop_applier.py ├── a3c.py ├── a3c_dist.py ├── a3c_training_thread.py └── game_ac_network.py ├── figures └── a3c_dist_tensor.PNG ├── auto_run.sh ├── constants.py ├── LICENSE ├── README.md ├── game_state.py ├── rmsprop_applier.py ├── a3c_dist.py ├── a3c_training_thread.py └── game_ac_network.py /pong.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/pong.bin -------------------------------------------------------------------------------- /constants.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/constants.pyc -------------------------------------------------------------------------------- /game_state.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/game_state.pyc -------------------------------------------------------------------------------- /game_ac_network.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/game_ac_network.pyc -------------------------------------------------------------------------------- /rmsprop_applier.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/rmsprop_applier.pyc -------------------------------------------------------------------------------- /a3c_training_thread.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/a3c_training_thread.pyc -------------------------------------------------------------------------------- /cont_action/constants.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/cont_action/constants.pyc -------------------------------------------------------------------------------- /cont_action/game_state.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/cont_action/game_state.pyc -------------------------------------------------------------------------------- /cont_action/tmp/a3c_log/cur/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt-0" 2 | all_model_checkpoint_paths: "model.ckpt-0" 3 | -------------------------------------------------------------------------------- /figures/a3c_dist_tensor.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/figures/a3c_dist_tensor.PNG -------------------------------------------------------------------------------- /cont_action/game_ac_network.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/cont_action/game_ac_network.pyc -------------------------------------------------------------------------------- /cont_action/rmsprop_applier.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/cont_action/rmsprop_applier.pyc -------------------------------------------------------------------------------- /cont_action/a3c_training_thread.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/cont_action/a3c_training_thread.pyc -------------------------------------------------------------------------------- /cont_action/tmp/a3c_log/cur/model.ckpt-0.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/cont_action/tmp/a3c_log/cur/model.ckpt-0.index -------------------------------------------------------------------------------- /cont_action/tmp/a3c_log/cur/model.ckpt-0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/cont_action/tmp/a3c_log/cur/model.ckpt-0.meta -------------------------------------------------------------------------------- /cont_action/tmp/a3c_log/cur/model.ckpt-0.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/cont_action/tmp/a3c_log/cur/model.ckpt-0.data-00000-of-00001 -------------------------------------------------------------------------------- /cont_action/tmp/a3c_log/cur/events.out.tfevents.1514256902.jaesik-System-Product-Name: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsikyoon/a3c-distributed_tensorflow/HEAD/cont_action/tmp/a3c_log/cur/events.out.tfevents.1514256902.jaesik-System-Product-Name -------------------------------------------------------------------------------- /auto_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ps_num=6 4 | worker_num=13 5 | 6 | for i in `eval echo {0..$ps_num}` 7 | do 8 | python a3c_dist.py --ps_hosts_num=$ps_num --worker_hosts_num=$worker_num --job_name=ps --task_index=$i & 9 | done 10 | 11 | for i in `eval echo {0..$worker_num}` 12 | do 13 | python a3c_dist.py --ps_hosts_num=$ps_num --worker_hosts_num=$worker_num --job_name=worker --task_index=$i & 14 | done 15 | -------------------------------------------------------------------------------- /cont_action/auto_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ps_num=0 4 | worker_num=0 5 | 6 | for i in `eval echo {0..$ps_num}` 7 | do 8 | python a3c_dist.py --ps_hosts_num=$ps_num --worker_hosts_num=$worker_num --job_name=ps --task_index=$i & 9 | done 10 | 11 | for i in `eval echo {0..$worker_num}` 12 | do 13 | python a3c_dist.py --ps_hosts_num=$ps_num 
--worker_hosts_num=$worker_num --job_name=worker --task_index=$i & 14 | done 15 | -------------------------------------------------------------------------------- /constants.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | LOCAL_T_MAX = 20 # repeat step size 4 | RMSP_ALPHA = 0.99 # decay parameter for RMSProp 5 | RMSP_EPSILON = 0.1 # epsilon parameter for RMSProp 6 | CHECKPOINT_DIR = 'checkpoints' 7 | LOG_FILE = '/tmp/a3c_log/cur' 8 | INITIAL_ALPHA_LOW = 1e-4 # log_uniform low limit for learning rate 9 | INITIAL_ALPHA_HIGH = 1e-2 # log_uniform high limit for learning rate 10 | 11 | PARALLEL_SIZE = 8 # parallel thread size 12 | ROM = "pong.bin" # action size = 3 13 | ACTION_SIZE = 3 # action size 14 | 15 | INITIAL_ALPHA_LOG_RATE = 0.4226 # log_uniform interpolate rate for learning rate (around 7 * 10^-4) 16 | GAMMA = 0.99 # discount factor for rewards 17 | ENTROPY_BETA = 0.01 # entropy regurarlization constant 18 | MAX_TIME_STEP = 10 * 10**7 19 | GRAD_NORM_CLIP = 40.0 # gradient norm clipping 20 | USE_GPU = False # To use GPU, set True 21 | USE_LSTM = True # True for A3C LSTM, False for A3C FF 22 | -------------------------------------------------------------------------------- /cont_action/constants.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | RMSP_ALPHA = 0.99 # decay parameter for RMSProp 4 | RMSP_EPSILON = 0.1 # epsilon parameter for RMSProp 5 | CHECKPOINT_DIR = 'checkpoints' 6 | LOG_FILE = 'tmp/a3c_log/cur' 7 | PINITIAL_ALPHA_LOW = 1e-4 # log_uniform low limit for learning rate 8 | PINITIAL_ALPHA_HIGH = 1e-4 # log_uniform high limit for learning rate 9 | VINITIAL_ALPHA_LOW = 1e-3 # log_uniform low limit for learning rate 10 | VINITIAL_ALPHA_HIGH = 1e-3 # log_uniform high limit for learning rate 11 | 12 | ENV_NAME="MountainCarContinuous-v0" 13 | 14 | INITIAL_ALPHA_LOG_RATE = 0.4226 # log_uniform interpolate rate for learning rate (around 7 * 10^-4) 15 | GAMMA = 0.99 # discount factor for rewards 16 | ENTROPY_BETA = 0.0001 # entropy regurarlization constant 17 | MAX_TIME_STEP = 10 * 10**7 18 | GRAD_NORM_CLIP = 40.0 # gradient norm clipping 19 | USE_GPU = False # To use GPU, set True 20 | USE_LSTM = False # True for A3C LSTM, False for A3C FF 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Jaesik Yoon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | a3c-distributed_tensorflow 2 | =========== 3 | 4 | Distributed TensorFlow implementation of A3C from Google DeepMind. 5 | 6 | The implementation is built on TensorFlow 1.0. 7 | 8 | http://arxiv.org/abs/1602.01783 9 | 10 | "We present asynchronous variants of four standard reinforcement learning algorithms and show that parallel actor-learners have a stabilizing effect on training allowing all four methods to successfully train neural network controllers." (from the paper) 11 | 12 | The implementation is based on miyosuda's repository: https://github.com/miyosuda/async_deep_reinforce 13 | 14 | The core implementation is almost the same as miyosuda's; only the parts needed for running with Distributed TensorFlow have been changed. 15 | 16 | Atari Game 17 | ------------------- 18 | 19 | ` 20 | ./auto_run.sh 21 | ` 22 | 23 | You need to set your hostname and port number in the a3c_dist.py code. The numbers of parameter servers and workers can be set in the auto_run.sh script. 24 | 25 | 26 | ### Settings 27 | Almost all setting values are the same as miyosuda's, except the number of workers: 14 workers and 7 parameter servers are used. 28 | 29 | ### Results 30 | 31 | ![alt tag](https://github.com/jaesik817/a3c-distributed_tensorflow/blob/master/figures/a3c_dist_tensor.PNG) 32 | 33 | The experiment is the Pong game with the numbers of workers and parameter servers given above. 34 | This run uses the CPU only. The server has 4 x Xeon E7-8880 v4 CPUs (176 threads with hyper-threading) and 1TB of memory; neither is fully used (about 50% of the CPU and 20GB of memory). 35 | As the result above shows, about 800~900 steps run per second (better than the GTX 980 Ti performance reported in miyosuda's repository), and the score saturates at about 35M steps. 
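For reference, auto_run.sh simply launches one a3c_dist.py process per parameter server and per worker. Below is a minimal sketch of the equivalent manual launch, assuming localhost and the default start port 2222 (a3c_dist.py adds one to both host counts internally, so passing 6/13 yields 7 parameter servers and 14 workers): ` python a3c_dist.py --hostname=localhost --st_port_num=2222 --ps_hosts_num=6 --worker_hosts_num=13 --job_name=ps --task_index=0 & python a3c_dist.py --hostname=localhost --st_port_num=2222 --ps_hosts_num=6 --worker_hosts_num=13 --job_name=worker --task_index=0 & ` 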
36 | -------------------------------------------------------------------------------- /cont_action/game_state_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from game_state import GameState 5 | 6 | class TestSequenceFunctions(unittest.TestCase): 7 | 8 | def test_process(self): 9 | game_state = GameState(0) 10 | 11 | before_s_t = np.array( game_state.s_t ) 12 | 13 | for i in range(1000): 14 | bef1 = game_state.s_t[:,:,1] 15 | bef2 = game_state.s_t[:,:,2] 16 | bef3 = game_state.s_t[:,:,3] 17 | 18 | game_state.process(1) 19 | game_state.update() 20 | 21 | aft0 = game_state.s_t[:,:,0] 22 | aft1 = game_state.s_t[:,:,1] 23 | aft2 = game_state.s_t[:,:,2] 24 | 25 | # values should be shifted 26 | self.assertTrue( (bef1.flatten() == aft0.flatten()).all() ) 27 | self.assertTrue( (bef2.flatten() == aft1.flatten()).all() ) 28 | self.assertTrue( (bef3.flatten() == aft2.flatten()).all() ) 29 | 30 | # all elements should be within [0.0, 1.0] 31 | self.assertTrue( np.less_equal(bef1, 1.0).all() ) 32 | self.assertTrue( np.less_equal(bef2, 1.0).all() ) 33 | self.assertTrue( np.less_equal(bef3, 1.0).all() ) 34 | self.assertTrue( np.greater_equal(bef1, 0.0).all() ) 35 | self.assertTrue( np.greater_equal(bef2, 0.0).all() ) 36 | self.assertTrue( np.greater_equal(bef3, 0.0).all() ) 37 | 38 | self.assertTrue( np.less_equal(aft0, 1.0).all() ) 39 | self.assertTrue( np.less_equal(aft1, 1.0).all() ) 40 | self.assertTrue( np.less_equal(aft2, 1.0).all() ) 41 | self.assertTrue( np.greater_equal(aft0, 0.0).all() ) 42 | self.assertTrue( np.greater_equal(aft1, 0.0).all() ) 43 | self.assertTrue( np.greater_equal(aft2, 0.0).all() ) 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /cont_action/rmsprop_applier_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import math 5 | import tensorflow as tf 6 | import rmsprop_applier 7 | 8 | class RMSPropApplierTest(tf.test.TestCase): 9 | def testApply(self): 10 | with self.test_session(): 11 | var = tf.Variable([1.0, 2.0]) 12 | 13 | grad0 = tf.Variable([2.0, 4.0]) 14 | grad1 = tf.Variable([3.0, 6.0]) 15 | 16 | opt = rmsprop_applier.RMSPropApplier(learning_rate=2.0, 17 | decay=0.9, 18 | momentum=0.0, 19 | epsilon=1.0) 20 | 21 | apply_gradient0 = opt.apply_gradients([var], [grad0]) 22 | apply_gradient1 = opt.apply_gradients([var], [grad1]) 23 | 24 | tf.initialize_all_variables().run() 25 | 26 | # apply grad0 27 | apply_gradient0.run() 28 | 29 | ms_x = 1.0 30 | ms_y = 1.0 31 | 32 | x = 1.0 33 | y = 2.0 34 | dx = 2.0 35 | dy = 4.0 36 | ms_x = ms_x + (dx * dx - ms_x) * (1.0 - 0.9) 37 | ms_y = ms_y + (dy * dy - ms_y) * (1.0 - 0.9) 38 | x = x - (2.0 * dx / math.sqrt(ms_x+1.0)) 39 | y = y - (2.0 * dy / math.sqrt(ms_y+1.0)) 40 | 41 | self.assertAllClose(np.array([x, y]), var.eval()) 42 | 43 | # apply grad1 44 | apply_gradient1.run() 45 | 46 | dx = 3.0 47 | dy = 6.0 48 | ms_x = ms_x + (dx * dx - ms_x) * (1.0 - 0.9) 49 | ms_y = ms_y + (dy * dy - ms_y) * (1.0 - 0.9) 50 | x = x - (2.0 * dx / math.sqrt(ms_x+1.0)) 51 | y = y - (2.0 * dy / math.sqrt(ms_y+1.0)) 52 | 53 | self.assertAllClose(np.array([x, y]), var.eval()) 54 | 55 | if __name__ == "__main__": 56 | tf.test.main() 57 | -------------------------------------------------------------------------------- /game_state.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import numpy as np 4 | import cv2 5 | from ale_python_interface import ALEInterface 6 | 7 | from constants import ROM 8 | from constants import ACTION_SIZE 9 | 10 | class GameState(object): 11 | def __init__(self, rand_seed, display=False, no_op_max=7): 12 | self.ale = ALEInterface() 13 | self.ale.setInt(b'random_seed', rand_seed) 14 | self.ale.setFloat(b'repeat_action_probability', 0.0) 15 | self.ale.setBool(b'color_averaging', True) 16 | self.ale.setInt(b'frame_skip', 4) 17 | self._no_op_max = no_op_max 18 | 19 | if display: 20 | self._setup_display() 21 | 22 | self.ale.loadROM(ROM.encode('ascii')) 23 | 24 | # collect minimal action set 25 | self.real_actions = self.ale.getMinimalActionSet() 26 | 27 | # height=210, width=160 28 | self._screen = np.empty((210, 160, 1), dtype=np.uint8) 29 | 30 | self.reset() 31 | 32 | def _process_frame(self, action, reshape): 33 | reward = self.ale.act(action) 34 | terminal = self.ale.game_over() 35 | 36 | # screen shape is (210, 160, 1) 37 | self.ale.getScreenGrayscale(self._screen) 38 | 39 | # reshape it into (210, 160) 40 | reshaped_screen = np.reshape(self._screen, (210, 160)) 41 | 42 | # resize to height=110, width=84 43 | resized_screen = cv2.resize(reshaped_screen, (84, 110)) 44 | 45 | x_t = resized_screen[18:102,:] 46 | if reshape: 47 | x_t = np.reshape(x_t, (84, 84, 1)) 48 | x_t = x_t.astype(np.float32) 49 | x_t *= (1.0/255.0) 50 | return reward, terminal, x_t 51 | 52 | 53 | def _setup_display(self): 54 | if sys.platform == 'darwin': 55 | import pygame 56 | pygame.init() 57 | self.ale.setBool(b'sound', False) 58 | elif sys.platform.startswith('linux'): 59 | self.ale.setBool(b'sound', True) 60 | self.ale.setBool(b'display_screen', True) 61 | 62 | def reset(self): 63 | self.ale.reset_game() 64 | 65 | # randomize initial state 66 | if self._no_op_max > 0: 67 | no_op = np.random.randint(0, self._no_op_max + 1) 68 | for _ in range(no_op): 69 | self.ale.act(0) 70 | 71 | _, _, x_t = self._process_frame(0, False) 72 | 73 | self.reward = 0 74 | self.terminal = False 75 | self.s_t = np.stack((x_t, x_t, x_t, x_t), axis = 2) 76 | 77 | def process(self, action): 78 | # convert original 18 action index to minimal action set index 79 | real_action = self.real_actions[action] 80 | 81 | r, t, x_t1 = self._process_frame(real_action, True) 82 | 83 | self.reward = r 84 | self.terminal = t 85 | self.s_t1 = np.append(self.s_t[:,:,1:], x_t1, axis = 2) 86 | 87 | def update(self): 88 | self.s_t = self.s_t1 89 | -------------------------------------------------------------------------------- /cont_action/game_state.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import numpy as np 4 | import gym 5 | from gym import wrappers 6 | from constants import ENV_NAME 7 | 8 | env = gym.make(ENV_NAME); 9 | 10 | """ crate a new environment class with actions and states normalized to [-1,1] """ 11 | acsp = env.action_space 12 | obsp = env.observation_space 13 | if not type(acsp)==gym.spaces.box.Box: 14 | raise RuntimeError('Environment with continous action space (i.e. Box) required.') 15 | if not type(obsp)==gym.spaces.box.Box: 16 | raise RuntimeError('Environment with continous observation space (i.e. 
Box) required.') 17 | env_type = type(env) 18 | 19 | class GameState(env_type): 20 | def __init__(self): 21 | self.__dict__.update(env.__dict__) # transfer properties 22 | 23 | self.local_max_iter=env.spec.timestep_limit; 24 | 25 | # Observation space 26 | if np.any(obsp.high < 1e10): 27 | h = obsp.high 28 | l = obsp.low 29 | sc = h-l 30 | self.o_c = (h+l)/2. 31 | self.o_sc = sc / 2. 32 | else: 33 | self.o_c = np.zeros_like(obsp.high) 34 | self.o_sc = np.ones_like(obsp.high) 35 | 36 | self.state_size=len(self.o_sc); 37 | 38 | # Action space 39 | h = acsp.high 40 | l = acsp.low 41 | sc = (h-l) 42 | self.a_c = (h+l)/2. 43 | self.a_sc = sc / 2. 44 | 45 | self.action_size=len(self.a_sc); 46 | self.action_low=l; self.action_high=h; 47 | 48 | # Rewards 49 | self.r_sc = 1.0 50 | self.r_c = 0. 51 | # Special cases 52 | if ENV_NAME == "Reacher-v1": 53 | self.o_sc[6] = 40. 54 | self.o_sc[7] = 20. 55 | self.r_sc = 200. 56 | self.r_c = 0. 57 | # Check and assign transformed spaces 58 | self.observation_space = gym.spaces.Box(self.filter_observation(obsp.low),self.filter_observation(obsp.high)) 59 | self.action_space = gym.spaces.Box(-np.ones_like(acsp.high),np.ones_like(acsp.high)) 60 | def assertEqual(a,b): assert np.all(a == b), "{} != {}".format(a,b) 61 | assertEqual(self.filter_action(self.action_space.low), acsp.low) 62 | assertEqual(self.filter_action(self.action_space.high), acsp.high) 63 | 64 | def filter_observation(self,obs): 65 | return (obs-self.o_c) / self.o_sc 66 | def filter_action(self,action): 67 | return self.a_sc*action+self.a_c 68 | def filter_reward(self,reward): 69 | ''' has to be applied manually otherwise it makes the reward_threshold invalid ''' 70 | return self.r_sc*reward+self.r_c 71 | 72 | def process(self,action): 73 | ac_f = np.clip(self.filter_action(action),self.action_space.low,self.action_space.high) 74 | obs, reward, term, info = env_type.step(self,ac_f[0]) # super function 75 | reward=self.filter_reward(reward); 76 | obs_f = self.filter_observation(obs) 77 | #return obs_f, reward, term, info 78 | self.reward = reward; 79 | self.terminal = term 80 | self.s_t1=np.append(self.s_t[:,1:],np.reshape(obs_f,[self.state_size,1]),axis=1); 81 | 82 | def reset_gs(self,x_t): 83 | self.reward = 0 84 | self.terminal = False 85 | self.s_t = np.stack((x_t, x_t, x_t, x_t), axis = 1) 86 | 87 | def update(self): 88 | self.s_t = self.s_t1 89 | -------------------------------------------------------------------------------- /rmsprop_applier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | 4 | from tensorflow.python.training import training_ops 5 | from tensorflow.python.training import slot_creator 6 | 7 | class RMSPropApplier(object): 8 | 9 | def __init__(self, 10 | learning_rate, 11 | decay=0.9, 12 | momentum=0.0, 13 | epsilon=1e-10, 14 | clip_norm=40.0, 15 | device="/cpu:0", 16 | name="RMSPropApplier"): 17 | 18 | self._name = name 19 | self._learning_rate = learning_rate 20 | self._decay = decay 21 | self._momentum = momentum 22 | self._epsilon = epsilon 23 | self._clip_norm = clip_norm 24 | self._device = device 25 | 26 | # Tensors for learning rate and momentum. Created in _prepare. 27 | self._learning_rate_tensor = None 28 | self._decay_tensor = None 29 | self._momentum_tensor = None 30 | self._epsilon_tensor = None 31 | 32 | self._slots = {} 33 | 34 | def _create_slots(self, var_list): 35 | for v in var_list: 36 | # 'val' is Variable's intial value tensor. 
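# Each variable gets an "rms" slot built from this ones tensor plus a zeros-initialized "momentum" slot; both are later consumed by apply_rms_prop in _apply_dense. 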
37 | val = tf.constant(1.0, dtype=v.dtype, shape=v.get_shape()) 38 | self._get_or_make_slot(v, val, "rms", self._name) 39 | self._zeros_slot(v, "momentum", self._name) 40 | 41 | def _prepare(self): 42 | self._learning_rate_tensor = tf.convert_to_tensor(self._learning_rate, 43 | name="learning_rate") 44 | self._decay_tensor = tf.convert_to_tensor(self._decay, name="decay") 45 | self._momentum_tensor = tf.convert_to_tensor(self._momentum, 46 | name="momentum") 47 | self._epsilon_tensor = tf.convert_to_tensor(self._epsilon, 48 | name="epsilon") 49 | 50 | def _slot_dict(self, slot_name): 51 | named_slots = self._slots.get(slot_name, None) 52 | if named_slots is None: 53 | named_slots = {} 54 | self._slots[slot_name] = named_slots 55 | return named_slots 56 | 57 | def _get_or_make_slot(self, var, val, slot_name, op_name): 58 | named_slots = self._slot_dict(slot_name) 59 | if var not in named_slots: 60 | named_slots[var] = slot_creator.create_slot(var, val, op_name) 61 | return named_slots[var] 62 | 63 | def get_slot(self, var, name): 64 | named_slots = self._slots.get(name, None) 65 | if not named_slots: 66 | return None 67 | return named_slots.get(var, None) 68 | 69 | def _zeros_slot(self, var, slot_name, op_name): 70 | named_slots = self._slot_dict(slot_name) 71 | if var not in named_slots: 72 | named_slots[var] = slot_creator.create_zeros_slot(var, op_name) 73 | return named_slots[var] 74 | 75 | # TODO: in RMSProp native code, memcpy() (for CPU) and 76 | # cudaMemcpyAsync() (for GPU) are used when updating values, 77 | # and values might tend to be overwritten with results from other threads. 78 | # (Need to check the learning performance with replacing it) 79 | def _apply_dense(self, grad, var): 80 | rms = self.get_slot(var, "rms") 81 | mom = self.get_slot(var, "momentum") 82 | return training_ops.apply_rms_prop( 83 | var, rms, mom, 84 | self._learning_rate_tensor, 85 | self._decay_tensor, 86 | self._momentum_tensor, 87 | self._epsilon_tensor, 88 | grad, 89 | use_locking=False).op 90 | 91 | # Apply accumulated gradients to var. 92 | def apply_gradients(self, var_list, accum_grad_list, name=None): 93 | update_ops = [] 94 | 95 | with tf.device(self._device): 96 | with tf.control_dependencies(None): 97 | self._create_slots(var_list) 98 | 99 | with tf.name_scope(name, self._name, []) as name: 100 | self._prepare() 101 | for var, accum_grad in zip(var_list, accum_grad_list): 102 | with tf.name_scope("update_" + var.op.name), tf.device(var.device): 103 | clipped_accum_grad = tf.clip_by_norm(accum_grad, self._clip_norm) 104 | update_ops.append(self._apply_dense(clipped_accum_grad, var)) 105 | return tf.group(*update_ops, name=name) 106 | -------------------------------------------------------------------------------- /cont_action/rmsprop_applier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | 4 | from tensorflow.python.training import training_ops 5 | from tensorflow.python.training import slot_creator 6 | 7 | class RMSPropApplier(object): 8 | 9 | def __init__(self, 10 | learning_rate, 11 | decay=0.9, 12 | momentum=0.0, 13 | epsilon=1e-10, 14 | clip_norm=40.0, 15 | device="/cpu:0", 16 | name="RMSPropApplier"): 17 | 18 | self._name = name 19 | self._learning_rate = learning_rate 20 | self._decay = decay 21 | self._momentum = momentum 22 | self._epsilon = epsilon 23 | self._clip_norm = clip_norm 24 | self._device = device 25 | 26 | # Tensors for learning rate and momentum. Created in _prepare. 
27 | self._learning_rate_tensor = None 28 | self._decay_tensor = None 29 | self._momentum_tensor = None 30 | self._epsilon_tensor = None 31 | 32 | self._slots = {} 33 | 34 | def _create_slots(self, var_list): 35 | for v in var_list: 36 | # 'val' is Variable's intial value tensor. 37 | val = tf.constant(1.0, dtype=v.dtype, shape=v.get_shape()) 38 | self._get_or_make_slot(v, val, "rms", self._name) 39 | self._zeros_slot(v, "momentum", self._name) 40 | 41 | def _prepare(self): 42 | self._learning_rate_tensor = tf.convert_to_tensor(self._learning_rate, 43 | name="learning_rate") 44 | self._decay_tensor = tf.convert_to_tensor(self._decay, name="decay") 45 | self._momentum_tensor = tf.convert_to_tensor(self._momentum, 46 | name="momentum") 47 | self._epsilon_tensor = tf.convert_to_tensor(self._epsilon, 48 | name="epsilon") 49 | 50 | def _slot_dict(self, slot_name): 51 | named_slots = self._slots.get(slot_name, None) 52 | if named_slots is None: 53 | named_slots = {} 54 | self._slots[slot_name] = named_slots 55 | return named_slots 56 | 57 | def _get_or_make_slot(self, var, val, slot_name, op_name): 58 | named_slots = self._slot_dict(slot_name) 59 | if var not in named_slots: 60 | named_slots[var] = slot_creator.create_slot(var, val, op_name) 61 | return named_slots[var] 62 | 63 | def get_slot(self, var, name): 64 | named_slots = self._slots.get(name, None) 65 | if not named_slots: 66 | return None 67 | return named_slots.get(var, None) 68 | 69 | def _zeros_slot(self, var, slot_name, op_name): 70 | named_slots = self._slot_dict(slot_name) 71 | if var not in named_slots: 72 | named_slots[var] = slot_creator.create_zeros_slot(var, op_name) 73 | return named_slots[var] 74 | 75 | # TODO: in RMSProp native code, memcpy() (for CPU) and 76 | # cudaMemcpyAsync() (for GPU) are used when updating values, 77 | # and values might tend to be overwritten with results from other threads. 78 | # (Need to check the learning performance with replacing it) 79 | def _apply_dense(self, grad, var): 80 | rms = self.get_slot(var, "rms") 81 | mom = self.get_slot(var, "momentum") 82 | return training_ops.apply_rms_prop( 83 | var, rms, mom, 84 | self._learning_rate_tensor, 85 | self._decay_tensor, 86 | self._momentum_tensor, 87 | self._epsilon_tensor, 88 | grad, 89 | use_locking=False).op 90 | 91 | # Apply accumulated gradients to var. 
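# Slots are created lazily for var_list, each accumulated gradient is clipped per tensor with tf.clip_by_norm, and the returned op groups one apply_rms_prop update per variable. 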
92 | def apply_gradients(self, var_list, accum_grad_list, name=None): 93 | update_ops = [] 94 | 95 | with tf.device(self._device): 96 | with tf.control_dependencies(None): 97 | self._create_slots(var_list) 98 | 99 | with tf.name_scope(name, self._name, []) as name: 100 | self._prepare() 101 | for var, accum_grad in zip(var_list, accum_grad_list): 102 | with tf.name_scope("update_" + var.op.name), tf.device(var.device): 103 | clipped_accum_grad = tf.clip_by_norm(accum_grad, self._clip_norm) 104 | update_ops.append(self._apply_dense(clipped_accum_grad, var)) 105 | return tf.group(*update_ops, name=name) 106 | -------------------------------------------------------------------------------- /cont_action/a3c.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import threading 4 | import numpy as np 5 | 6 | import signal 7 | import random 8 | import math 9 | import os 10 | import time 11 | 12 | from game_ac_network import GameACFFNetwork, GameACLSTMNetwork 13 | from a3c_training_thread import A3CTrainingThread 14 | from rmsprop_applier import RMSPropApplier 15 | 16 | from constants import ACTION_SIZE 17 | from constants import PARALLEL_SIZE 18 | from constants import INITIAL_ALPHA_LOW 19 | from constants import INITIAL_ALPHA_HIGH 20 | from constants import INITIAL_ALPHA_LOG_RATE 21 | from constants import MAX_TIME_STEP 22 | from constants import CHECKPOINT_DIR 23 | from constants import LOG_FILE 24 | from constants import RMSP_EPSILON 25 | from constants import RMSP_ALPHA 26 | from constants import GRAD_NORM_CLIP 27 | from constants import USE_GPU 28 | from constants import USE_LSTM 29 | 30 | 31 | def log_uniform(lo, hi, rate): 32 | log_lo = math.log(lo) 33 | log_hi = math.log(hi) 34 | v = log_lo * (1-rate) + log_hi * rate 35 | return math.exp(v) 36 | 37 | device = "/cpu:0" 38 | if USE_GPU: 39 | device = "/gpu:0" 40 | 41 | initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, 42 | INITIAL_ALPHA_HIGH, 43 | INITIAL_ALPHA_LOG_RATE) 44 | 45 | global_t = 0 46 | 47 | stop_requested = False 48 | 49 | if USE_LSTM: 50 | global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device) 51 | else: 52 | global_network = GameACFFNetwork(ACTION_SIZE, -1, device) 53 | 54 | 55 | training_threads = [] 56 | 57 | learning_rate_input = tf.placeholder("float") 58 | 59 | grad_applier = RMSPropApplier(learning_rate = learning_rate_input, 60 | decay = RMSP_ALPHA, 61 | momentum = 0.0, 62 | epsilon = RMSP_EPSILON, 63 | clip_norm = GRAD_NORM_CLIP, 64 | device = device) 65 | 66 | for i in range(PARALLEL_SIZE): 67 | training_thread = A3CTrainingThread(i, global_network, initial_learning_rate, 68 | learning_rate_input, 69 | grad_applier, MAX_TIME_STEP, 70 | device = device) 71 | training_threads.append(training_thread) 72 | 73 | # prepare session 74 | sess = tf.Session(config=tf.ConfigProto(log_device_placement=False, 75 | allow_soft_placement=True)) 76 | 77 | init = tf.global_variables_initializer() 78 | sess.run(init) 79 | 80 | # summary for tensorboard 81 | score_input = tf.placeholder(tf.int32) 82 | tf.summary.scalar("score", score_input) 83 | 84 | summary_op = tf.summary.merge_all() 85 | summary_writer = tf.summary.FileWriter(LOG_FILE, sess.graph) 86 | 87 | # init or load checkpoint with saver 88 | saver = tf.train.Saver() 89 | checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR) 90 | if checkpoint and checkpoint.model_checkpoint_path: 91 | saver.restore(sess, checkpoint.model_checkpoint_path) 92 | print("checkpoint loaded:", 
checkpoint.model_checkpoint_path) 93 | tokens = checkpoint.model_checkpoint_path.split("-") 94 | # set global step 95 | global_t = int(tokens[1]) 96 | print(">>> global step set: ", global_t) 97 | # set wall time 98 | wall_t_fname = CHECKPOINT_DIR + '/' + 'wall_t.' + str(global_t) 99 | with open(wall_t_fname, 'r') as f: 100 | wall_t = float(f.read()) 101 | else: 102 | print("Could not find old checkpoint") 103 | # set wall time 104 | wall_t = 0.0 105 | 106 | 107 | def train_function(parallel_index): 108 | global global_t 109 | 110 | training_thread = training_threads[parallel_index] 111 | # set start_time 112 | start_time = time.time() - wall_t 113 | training_thread.set_start_time(start_time) 114 | idx=1;total_score=0; 115 | while True: 116 | if stop_requested: 117 | break 118 | if global_t > MAX_TIME_STEP: 119 | break 120 | 121 | diff_global_t,score = training_thread.process(sess, global_t, summary_writer, 122 | summary_op, score_input) 123 | if(idx==100): 124 | total_score+=score; 125 | total_score=total_score/100.; 126 | print("Score: "+str(total_score)); 127 | idx=1; 128 | else: 129 | total_score+=score; 130 | idx+=1; 131 | 132 | global_t += diff_global_t 133 | 134 | 135 | def signal_handler(signal, frame): 136 | global stop_requested 137 | print('You pressed Ctrl+C!') 138 | stop_requested = True 139 | 140 | train_threads = [] 141 | for i in range(PARALLEL_SIZE): 142 | train_threads.append(threading.Thread(target=train_function, args=(i,))) 143 | 144 | signal.signal(signal.SIGINT, signal_handler) 145 | 146 | # set start time 147 | start_time = time.time() - wall_t 148 | 149 | for t in train_threads: 150 | t.start() 151 | 152 | print('Press Ctrl+C to stop') 153 | signal.pause() 154 | 155 | print('Now saving data. Please wait') 156 | 157 | for t in train_threads: 158 | t.join() 159 | 160 | if not os.path.exists(CHECKPOINT_DIR): 161 | os.mkdir(CHECKPOINT_DIR) 162 | 163 | # write wall time 164 | wall_t = time.time() - start_time 165 | wall_t_fname = CHECKPOINT_DIR + '/' + 'wall_t.' 
+ str(global_t) 166 | with open(wall_t_fname, 'w') as f: 167 | f.write(str(wall_t)) 168 | 169 | saver.save(sess, CHECKPOINT_DIR + '/' + 'checkpoint', global_step = global_t) 170 | 171 | -------------------------------------------------------------------------------- /a3c_dist.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import threading 4 | import numpy as np 5 | 6 | import signal 7 | import random 8 | import math 9 | import os 10 | import time 11 | import sys 12 | 13 | from game_ac_network import GameACFFNetwork, GameACLSTMNetwork 14 | from a3c_training_thread import A3CTrainingThread 15 | from rmsprop_applier import RMSPropApplier 16 | 17 | from constants import ACTION_SIZE 18 | from constants import PARALLEL_SIZE 19 | from constants import INITIAL_ALPHA_LOW 20 | from constants import INITIAL_ALPHA_HIGH 21 | from constants import INITIAL_ALPHA_LOG_RATE 22 | from constants import MAX_TIME_STEP 23 | from constants import CHECKPOINT_DIR 24 | from constants import LOG_FILE 25 | from constants import RMSP_EPSILON 26 | from constants import RMSP_ALPHA 27 | from constants import GRAD_NORM_CLIP 28 | from constants import USE_LSTM 29 | 30 | import argparse 31 | 32 | FLAGS=None; 33 | log_dir=None; 34 | 35 | def log_uniform(lo, hi, rate): 36 | log_lo = math.log(lo) 37 | log_hi = math.log(hi) 38 | v = log_lo * (1-rate) + log_hi * rate 39 | return math.exp(v) 40 | 41 | def train(): 42 | #initial learning rate 43 | initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, 44 | INITIAL_ALPHA_HIGH, 45 | INITIAL_ALPHA_LOG_RATE) 46 | 47 | # parameter server and worker information 48 | ps_hosts = np.zeros(FLAGS.ps_hosts_num,dtype=object); 49 | worker_hosts = np.zeros(FLAGS.worker_hosts_num,dtype=object); 50 | port_num=FLAGS.st_port_num; 51 | for i in range(FLAGS.ps_hosts_num): 52 | ps_hosts[i]=str(FLAGS.hostname)+":"+str(port_num); 53 | port_num+=1; 54 | for i in range(FLAGS.worker_hosts_num): 55 | worker_hosts[i]=str(FLAGS.hostname)+":"+str(port_num); 56 | port_num+=1; 57 | ps_hosts=list(ps_hosts); 58 | worker_hosts=list(worker_hosts); 59 | # Create a cluster from the parameter server and worker hosts. 60 | cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) 61 | 62 | # Create and start a server for the local task. 
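# Every ps/worker process builds the identical ClusterSpec above from consecutive ports starting at st_port_num on one hostname; the Server below serves only this process's own job_name/task_index. 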
63 | server = tf.train.Server(cluster, 64 | job_name=FLAGS.job_name, 65 | task_index=FLAGS.task_index) 66 | 67 | 68 | if FLAGS.job_name == "ps": 69 | server.join(); 70 | elif FLAGS.job_name == "worker": 71 | device=tf.train.replica_device_setter( 72 | worker_device="/job:worker/task:%d" % FLAGS.task_index, 73 | cluster=cluster); 74 | 75 | """ 76 | # There are no global network 77 | if USE_LSTM: 78 | global_network = GameACLSTMNetwork(ACTION_SIZE, -1, device) 79 | else: 80 | global_network = GameACFFNetwork(ACTION_SIZE, -1, device) 81 | """ 82 | 83 | learning_rate_input = tf.placeholder("float") 84 | 85 | grad_applier = RMSPropApplier(learning_rate = learning_rate_input, 86 | decay = RMSP_ALPHA, 87 | momentum = 0.0, 88 | epsilon = RMSP_EPSILON, 89 | clip_norm = GRAD_NORM_CLIP, 90 | device = device) 91 | 92 | tf.set_random_seed(1); 93 | #There are no global network 94 | training_thread = A3CTrainingThread(0, "", initial_learning_rate, 95 | learning_rate_input, 96 | grad_applier, MAX_TIME_STEP, 97 | device = device,task_index=FLAGS.task_index) 98 | 99 | # prepare session 100 | with tf.device(tf.train.replica_device_setter( 101 | worker_device="/job:worker/task:%d" % FLAGS.task_index, 102 | cluster=cluster)): 103 | global_step = tf.get_variable('global_step',[],initializer=tf.constant_initializer(0),trainable=False); 104 | global_step_ph=tf.placeholder(global_step.dtype,shape=global_step.get_shape()); 105 | global_step_ops=global_step.assign(global_step_ph); 106 | score = tf.get_variable('score',[],initializer=tf.constant_initializer(-21),trainable=False); 107 | score_ph=tf.placeholder(score.dtype,shape=score.get_shape()); 108 | score_ops=score.assign(score_ph); 109 | init_op=tf.global_variables_initializer(); 110 | # summary for tensorboard 111 | tf.summary.scalar("score", score); 112 | summary_op = tf.summary.merge_all() 113 | saver = tf.train.Saver(); 114 | 115 | sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), 116 | global_step=global_step, 117 | logdir=LOG_FILE, 118 | summary_op=summary_op, 119 | saver=saver, 120 | init_op=init_op) 121 | 122 | with sv.managed_session(server.target) as sess: 123 | # set start_time 124 | wall_t=0.0; 125 | start_time = time.time() - wall_t 126 | training_thread.set_start_time(start_time) 127 | local_t=0; 128 | while True: 129 | if sess.run([global_step])[0] > MAX_TIME_STEP: 130 | break 131 | diff_global_t = training_thread.process(sess, sess.run([global_step])[0], "", 132 | summary_op, "",score_ph,score_ops) 133 | sess.run(global_step_ops,{global_step_ph:sess.run([global_step])[0]+diff_global_t}); 134 | print(str(FLAGS.task_index)+","+str(sess.run([global_step])[0])); 135 | local_t+=diff_global_t; 136 | 137 | sv.stop(); 138 | print("Done"); 139 | 140 | def main(_): 141 | os.system("rm -rf "+LOG_FILE+"/*"); 142 | FLAGS.ps_hosts_num+=1; 143 | FLAGS.worker_hosts_num+=1; 144 | train() 145 | 146 | if __name__ == '__main__': 147 | parser = argparse.ArgumentParser() 148 | parser.register("type", "bool", lambda v: v.lower() == "true") 149 | # Flags for defining the tf.train.ClusterSpec 150 | parser.add_argument( 151 | "--ps_hosts_num", 152 | type=int, 153 | default=5, 154 | help="The Number of Parameter Servers" 155 | ) 156 | parser.add_argument( 157 | "--worker_hosts_num", 158 | type=int, 159 | default=10, 160 | help="The Number of Workers" 161 | ) 162 | parser.add_argument( 163 | "--hostname", 164 | type=str, 165 | default="seltera46", 166 | help="The Hostname of the machine" 167 | ) 168 | parser.add_argument( 169 | "--st_port_num", 170 | type=int, 171 | 
default=2222, 172 | help="The start port number of ps and worker servers" 173 | ) 174 | parser.add_argument( 175 | "--job_name", 176 | type=str, 177 | default="", 178 | help="One of 'ps', 'worker'" 179 | ) 180 | # Flags for defining the tf.train.Server 181 | parser.add_argument( 182 | "--task_index", 183 | type=int, 184 | default=0, 185 | help="Index of task within the job" 186 | ) 187 | FLAGS, unparsed = parser.parse_known_args() 188 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 189 | -------------------------------------------------------------------------------- /cont_action/a3c_dist.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import threading 4 | import numpy as np 5 | 6 | import signal 7 | import random 8 | import math 9 | import os 10 | import time 11 | import sys 12 | 13 | from game_ac_network import GameACFFNetwork, GameACLSTMNetwork 14 | from a3c_training_thread import A3CTrainingThread 15 | from rmsprop_applier import RMSPropApplier 16 | 17 | from constants import PINITIAL_ALPHA_LOW 18 | from constants import PINITIAL_ALPHA_HIGH 19 | from constants import VINITIAL_ALPHA_LOW 20 | from constants import VINITIAL_ALPHA_HIGH 21 | from constants import INITIAL_ALPHA_LOG_RATE 22 | from constants import MAX_TIME_STEP 23 | from constants import CHECKPOINT_DIR 24 | from constants import LOG_FILE 25 | from constants import RMSP_EPSILON 26 | from constants import RMSP_ALPHA 27 | from constants import GRAD_NORM_CLIP 28 | from constants import USE_LSTM 29 | 30 | import argparse 31 | 32 | FLAGS=None; 33 | log_dir=None; 34 | 35 | def log_uniform(lo, hi, rate): 36 | log_lo = math.log(lo) 37 | log_hi = math.log(hi) 38 | v = log_lo * (1-rate) + log_hi * rate 39 | return math.exp(v) 40 | 41 | def train(): 42 | #initial learning rate 43 | pinitial_learning_rate = log_uniform(PINITIAL_ALPHA_LOW, 44 | PINITIAL_ALPHA_HIGH, 45 | INITIAL_ALPHA_LOG_RATE) 46 | vinitial_learning_rate = log_uniform(VINITIAL_ALPHA_LOW, 47 | VINITIAL_ALPHA_HIGH, 48 | INITIAL_ALPHA_LOG_RATE) 49 | 50 | # parameter server and worker information 51 | ps_hosts = np.zeros(FLAGS.ps_hosts_num,dtype=object); 52 | worker_hosts = np.zeros(FLAGS.worker_hosts_num,dtype=object); 53 | port_num=FLAGS.st_port_num; 54 | for i in range(FLAGS.ps_hosts_num): 55 | ps_hosts[i]=str(FLAGS.hostname)+":"+str(port_num); 56 | port_num+=1; 57 | for i in range(FLAGS.worker_hosts_num): 58 | worker_hosts[i]=str(FLAGS.hostname)+":"+str(port_num); 59 | port_num+=1; 60 | ps_hosts=list(ps_hosts); 61 | worker_hosts=list(worker_hosts); 62 | # Create a cluster from the parameter server and worker hosts. 63 | cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts}) 64 | 65 | # Create and start a server for the local task. 
66 | server = tf.train.Server(cluster, 67 | job_name=FLAGS.job_name, 68 | task_index=FLAGS.task_index) 69 | 70 | 71 | if FLAGS.job_name == "ps": 72 | server.join(); 73 | elif FLAGS.job_name == "worker": 74 | device=tf.train.replica_device_setter( 75 | worker_device="/job:worker/task:%d" % FLAGS.task_index, 76 | cluster=cluster); 77 | 78 | plearning_rate_input = tf.placeholder("float") 79 | vlearning_rate_input = tf.placeholder("float") 80 | 81 | pgrad_applier = RMSPropApplier(learning_rate = plearning_rate_input, 82 | decay = RMSP_ALPHA, 83 | momentum = 0.0, 84 | epsilon = RMSP_EPSILON, 85 | clip_norm = GRAD_NORM_CLIP, 86 | device = device) 87 | vgrad_applier = RMSPropApplier(learning_rate = vlearning_rate_input, 88 | decay = RMSP_ALPHA, 89 | momentum = 0.0, 90 | epsilon = RMSP_EPSILON, 91 | clip_norm = GRAD_NORM_CLIP, 92 | device = device) 93 | 94 | tf.set_random_seed(1); 95 | #There are no global network 96 | training_thread = A3CTrainingThread(0, "", 97 | pinitial_learning_rate,plearning_rate_input,pgrad_applier, 98 | vinitial_learning_rate,vlearning_rate_input,vgrad_applier, 99 | MAX_TIME_STEP, 100 | device = device,task_index=FLAGS.task_index) 101 | 102 | # prepare session 103 | with tf.device(tf.train.replica_device_setter( 104 | worker_device="/job:worker/task:%d" % FLAGS.task_index, 105 | cluster=cluster)): 106 | global_step = tf.get_variable('global_step',[],initializer=tf.constant_initializer(0),trainable=False); 107 | global_step_ph=tf.placeholder(global_step.dtype,shape=global_step.get_shape()); 108 | global_step_ops=global_step.assign(global_step_ph); 109 | score = tf.get_variable('score',[],initializer=tf.constant_initializer(-21),trainable=False); 110 | score_ph=tf.placeholder(score.dtype,shape=score.get_shape()); 111 | score_ops=score.assign(score_ph); 112 | init_op=tf.global_variables_initializer(); 113 | # summary for tensorboard 114 | tf.summary.scalar("score", score); 115 | summary_op = tf.summary.merge_all() 116 | saver = tf.train.Saver(); 117 | 118 | sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0), 119 | global_step=global_step, 120 | logdir=LOG_FILE, 121 | summary_op=summary_op, 122 | saver=saver, 123 | init_op=init_op) 124 | 125 | with sv.managed_session(server.target) as sess: 126 | # set start_time 127 | wall_t=0.0; 128 | start_time = time.time() - wall_t 129 | training_thread.set_start_time(start_time) 130 | local_t=0; 131 | while True: 132 | if sess.run([global_step])[0] > MAX_TIME_STEP: 133 | break 134 | diff_global_t = training_thread.process(sess, sess.run([global_step])[0], "", 135 | summary_op, "",score_ph,score_ops) 136 | sess.run(global_step_ops,{global_step_ph:sess.run([global_step])[0]+diff_global_t}); 137 | local_t+=diff_global_t; 138 | 139 | sv.stop(); 140 | print("Done"); 141 | 142 | def main(_): 143 | os.system("rm -rf "+LOG_FILE+"/*"); 144 | FLAGS.ps_hosts_num+=1; 145 | FLAGS.worker_hosts_num+=1; 146 | train() 147 | 148 | if __name__ == '__main__': 149 | parser = argparse.ArgumentParser() 150 | parser.register("type", "bool", lambda v: v.lower() == "true") 151 | # Flags for defining the tf.train.ClusterSpec 152 | parser.add_argument( 153 | "--ps_hosts_num", 154 | type=int, 155 | default=5, 156 | help="The Number of Parameter Servers" 157 | ) 158 | parser.add_argument( 159 | "--worker_hosts_num", 160 | type=int, 161 | default=10, 162 | help="The Number of Workers" 163 | ) 164 | parser.add_argument( 165 | "--hostname", 166 | type=str, 167 | default="localhost", 168 | help="The Hostname of the machine" 169 | ) 170 | parser.add_argument( 171 
| "--st_port_num", 172 | type=int, 173 | default=2222, 174 | help="The start port number of ps and worker servers" 175 | ) 176 | parser.add_argument( 177 | "--job_name", 178 | type=str, 179 | default="", 180 | help="One of 'ps', 'worker'" 181 | ) 182 | # Flags for defining the tf.train.Server 183 | parser.add_argument( 184 | "--task_index", 185 | type=int, 186 | default=0, 187 | help="Index of task within the job" 188 | ) 189 | FLAGS, unparsed = parser.parse_known_args() 190 | tf.app.run(main=main, argv=[sys.argv[0]] + unparsed) 191 | -------------------------------------------------------------------------------- /a3c_training_thread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import numpy as np 4 | import random 5 | import time 6 | import sys 7 | 8 | from game_state import GameState 9 | from game_state import ACTION_SIZE 10 | from game_ac_network import GameACFFNetwork, GameACLSTMNetwork 11 | 12 | from constants import GAMMA 13 | from constants import LOCAL_T_MAX 14 | from constants import ENTROPY_BETA 15 | from constants import USE_LSTM 16 | 17 | LOG_INTERVAL = 100 18 | PERFORMANCE_LOG_INTERVAL = 1000 19 | 20 | class A3CTrainingThread(object): 21 | def __init__(self, 22 | thread_index, 23 | global_network, 24 | initial_learning_rate, 25 | learning_rate_input, 26 | grad_applier, 27 | max_global_time_step, 28 | device,task_index=""): 29 | 30 | self.thread_index = thread_index 31 | self.learning_rate_input = learning_rate_input 32 | self.max_global_time_step = max_global_time_step 33 | 34 | if USE_LSTM: 35 | self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device) 36 | else: 37 | self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device) 38 | 39 | self.local_network.prepare_loss(ENTROPY_BETA) 40 | 41 | with tf.device(device): 42 | var_refs = [v._ref() for v in self.local_network.get_vars()] 43 | self.gradients = tf.gradients( 44 | self.local_network.total_loss, var_refs, 45 | gate_gradients=False, 46 | aggregation_method=None, 47 | colocate_gradients_with_ops=False) 48 | 49 | if(global_network): 50 | self.apply_gradients = grad_applier.apply_gradients( 51 | global_network.get_vars(), 52 | self.gradients ) 53 | self.sync = self.local_network.sync_from(global_network) 54 | self.mode="threading"; 55 | else: 56 | self.apply_gradients = grad_applier.apply_gradients( 57 | self.local_network.get_vars(), 58 | self.gradients ) 59 | self.mode="dist_tensor"; 60 | if not (task_index): 61 | self.game_state = GameState(113 * thread_index) 62 | else: 63 | self.game_state = GameState(113 * task_index) 64 | 65 | self.local_t = 0 66 | 67 | self.initial_learning_rate = initial_learning_rate 68 | 69 | self.episode_reward = 0 70 | 71 | # variable controling log output 72 | self.prev_local_t = 0 73 | 74 | def _anneal_learning_rate(self, global_time_step): 75 | learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step 76 | if learning_rate < 0.0: 77 | learning_rate = 0.0 78 | return learning_rate 79 | 80 | def choose_action(self, pi_values): 81 | return np.random.choice(range(len(pi_values)), p=pi_values) 82 | 83 | def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): 84 | summary_str = sess.run(summary_op, feed_dict={ 85 | score_input: score 86 | }) 87 | summary_writer.add_summary(summary_str, global_t) 88 | summary_writer.flush() 89 | 90 | def set_start_time(self, start_time): 91 | 
self.start_time = start_time 92 | 93 | def get_episode_reward(self): 94 | return self.episode_reward; 95 | 96 | def process(self, sess, global_t, summary_writer, summary_op, score_input,score_ph="",score_ops=""): 97 | states = [] 98 | actions = [] 99 | rewards = [] 100 | values = [] 101 | 102 | terminal_end = False 103 | 104 | # copy weights from shared to local 105 | # dist_tensor case not necessary 106 | if not (self.mode=="dist_tensor"): 107 | sess.run( self.sync ) 108 | 109 | start_local_t = self.local_t 110 | 111 | if USE_LSTM: 112 | start_lstm_state = self.local_network.lstm_state_out 113 | 114 | # t_max times loop 115 | for i in range(LOCAL_T_MAX): 116 | pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) 117 | action = self.choose_action(pi_) 118 | 119 | states.append(self.game_state.s_t) 120 | actions.append(action) 121 | values.append(value_) 122 | 123 | if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0): 124 | print("pi={}".format(pi_)) 125 | print(" V={}".format(value_)) 126 | 127 | # process game 128 | self.game_state.process(action) 129 | 130 | # receive game result 131 | reward = self.game_state.reward 132 | terminal = self.game_state.terminal 133 | 134 | self.episode_reward += reward 135 | 136 | # clip reward 137 | rewards.append( np.clip(reward, -1, 1) ) 138 | 139 | self.local_t += 1 140 | 141 | # s_t1 -> s_t 142 | self.game_state.update() 143 | 144 | if terminal: 145 | terminal_end = True 146 | print("score={}".format(self.episode_reward)) 147 | if summary_writer: 148 | self._record_score(sess, summary_writer, summary_op, score_input, 149 | self.episode_reward, global_t) 150 | else: 151 | sess.run(score_ops,{score_ph:self.episode_reward}); 152 | 153 | self.episode_reward = 0 154 | self.game_state.reset() 155 | if USE_LSTM: 156 | self.local_network.reset_state() 157 | break 158 | 159 | R = 0.0 160 | if not terminal_end: 161 | R = self.local_network.run_value(sess, self.game_state.s_t) 162 | 163 | actions.reverse() 164 | states.reverse() 165 | rewards.reverse() 166 | values.reverse() 167 | 168 | batch_si = [] 169 | batch_a = [] 170 | batch_td = [] 171 | batch_R = [] 172 | 173 | # compute and accmulate gradients 174 | for(ai, ri, si, Vi) in zip(actions, rewards, states, values): 175 | R = ri + GAMMA * R 176 | td = R - Vi 177 | a = np.zeros([ACTION_SIZE]) 178 | a[ai] = 1 179 | 180 | batch_si.append(si) 181 | batch_a.append(a) 182 | batch_td.append(td) 183 | batch_R.append(R) 184 | 185 | cur_learning_rate = self._anneal_learning_rate(global_t) 186 | 187 | if USE_LSTM: 188 | batch_si.reverse() 189 | batch_a.reverse() 190 | batch_td.reverse() 191 | batch_R.reverse() 192 | 193 | sess.run( self.apply_gradients, 194 | feed_dict = { 195 | self.local_network.s: batch_si, 196 | self.local_network.a: batch_a, 197 | self.local_network.td: batch_td, 198 | self.local_network.r: batch_R, 199 | self.local_network.initial_lstm_state: start_lstm_state, 200 | self.local_network.step_size : [len(batch_a)], 201 | self.learning_rate_input: cur_learning_rate } ) 202 | else: 203 | sess.run( self.apply_gradients, 204 | feed_dict = { 205 | self.local_network.s: batch_si, 206 | self.local_network.a: batch_a, 207 | self.local_network.td: batch_td, 208 | self.local_network.r: batch_R, 209 | self.learning_rate_input: cur_learning_rate} ) 210 | 211 | if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): 212 | self.prev_local_t += PERFORMANCE_LOG_INTERVAL 213 | elapsed_time = time.time() - self.start_time 214 | 
steps_per_sec = global_t / elapsed_time 215 | print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format( 216 | global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) 217 | 218 | # return advanced local step size 219 | diff_local_t = self.local_t - start_local_t 220 | return diff_local_t 221 | 222 | -------------------------------------------------------------------------------- /cont_action/a3c_training_thread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import numpy as np 4 | import random 5 | import time 6 | import sys 7 | 8 | from game_state import GameState 9 | from game_ac_network import GameACFFNetwork, GameACLSTMNetwork 10 | 11 | from constants import GAMMA 12 | from constants import ENTROPY_BETA 13 | from constants import USE_LSTM 14 | 15 | LOG_INTERVAL = 100 16 | PERFORMANCE_LOG_INTERVAL = 1000 17 | 18 | class A3CTrainingThread(object): 19 | def __init__(self, 20 | thread_index, 21 | global_network, 22 | pinitial_learning_rate, 23 | plearning_rate_input, 24 | pgrad_applier, 25 | vinitial_learning_rate, 26 | vlearning_rate_input, 27 | vgrad_applier, 28 | max_global_time_step, 29 | device,task_index=""): 30 | 31 | self.thread_index = thread_index 32 | self.plearning_rate_input = plearning_rate_input 33 | self.vlearning_rate_input = vlearning_rate_input 34 | self.max_global_time_step = max_global_time_step 35 | self.game_state = GameState() 36 | state=self.game_state.reset(); 37 | self.game_state.reset_gs(state); 38 | self.action_size=self.game_state.action_size; 39 | self.state_size=self.game_state.state_size; 40 | self.local_max_iter=self.game_state.local_max_iter; 41 | 42 | if USE_LSTM: 43 | self.local_network = GameACLSTMNetwork(self.action_size,self.state_size,self.game_state.action_low,self.game_state.action_high, thread_index, device) 44 | else: 45 | self.local_network = GameACFFNetwork(self.action_size,self.state_size,self.game_state.action_low,self.game_state.action_high, thread_index, device) 46 | 47 | self.local_network.prepare_loss(ENTROPY_BETA) 48 | 49 | with tf.device(device): 50 | pvar_refs = [v._ref() for v in self.local_network.get_pvars()] 51 | self.policy_gradients = tf.gradients( 52 | self.local_network.policy_loss, pvar_refs, 53 | gate_gradients=False, 54 | aggregation_method=None, 55 | colocate_gradients_with_ops=False) 56 | vvar_refs = [v._ref() for v in self.local_network.get_vvars()] 57 | self.value_gradients = tf.gradients( 58 | self.local_network.value_loss, vvar_refs, 59 | gate_gradients=False, 60 | aggregation_method=None, 61 | colocate_gradients_with_ops=False) 62 | 63 | self.apply_policy_gradients = pgrad_applier.apply_gradients( 64 | self.local_network.get_pvars(), 65 | self.policy_gradients ) 66 | self.apply_value_gradients = vgrad_applier.apply_gradients( 67 | self.local_network.get_vvars(), 68 | self.value_gradients ) 69 | 70 | self.local_t = 0 71 | 72 | self.pinitial_learning_rate = pinitial_learning_rate 73 | self.vinitial_learning_rate = vinitial_learning_rate 74 | 75 | self.episode_reward = 0 76 | 77 | # variable controling log output 78 | self.prev_local_t = 0 79 | 80 | def _panneal_learning_rate(self, global_time_step): 81 | learning_rate = self.pinitial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step 82 | if learning_rate < 0.0: 83 | learning_rate = 0.0 84 | return learning_rate 85 | 86 | def _vanneal_learning_rate(self, global_time_step): 87 | 
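# Linearly anneal the value-network learning rate to zero over max_global_time_step, using the same schedule as _panneal_learning_rate above. 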
learning_rate = self.vinitial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step 88 | if learning_rate < 0.0: 89 | learning_rate = 0.0 90 | return learning_rate 91 | 92 | def choose_action(self, pi_values): 93 | return np.random.choice(range(len(pi_values)), p=pi_values) 94 | 95 | def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): 96 | summary_str = sess.run(summary_op, feed_dict={ 97 | score_input: score 98 | }) 99 | summary_writer.add_summary(summary_str, global_t) 100 | summary_writer.flush() 101 | 102 | def set_start_time(self, start_time): 103 | self.start_time = start_time 104 | 105 | def process(self, sess, global_t, summary_writer, summary_op, score_input,score_ph="",score_ops=""): 106 | states = [] 107 | actions = [] 108 | rewards = [] 109 | values = [] 110 | 111 | terminal_end = False 112 | 113 | start_local_t = self.local_t 114 | 115 | if USE_LSTM: 116 | pstart_lstm_state = self.local_network.plstm_state_out 117 | vstart_lstm_state = self.local_network.vlstm_state_out 118 | 119 | # t_max times loop 120 | for i in range(self.local_max_iter): 121 | action, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t) 122 | states.append(self.game_state.s_t) 123 | actions.append(action) 124 | values.append(value_) 125 | 126 | # process game 127 | self.game_state.process(action) 128 | 129 | # receive game result 130 | reward = self.game_state.reward 131 | terminal = self.game_state.terminal 132 | 133 | self.episode_reward += reward 134 | 135 | # clip reward 136 | #rewards.append( np.clip(reward,-1,1) ) 137 | rewards.append(reward); 138 | 139 | self.local_t += 1 140 | 141 | # s_t1 -> s_t 142 | self.game_state.update() 143 | if terminal: 144 | terminal_end = True 145 | print("score={}".format(self.episode_reward/self.game_state.r_sc)) 146 | score=self.episode_reward/self.game_state.r_sc; 147 | if summary_writer: 148 | self._record_score(sess, summary_writer, summary_op, score_input, 149 | self.episode_reward/self.game_state.r_sc, global_t) 150 | else: 151 | sess.run(score_ops,{score_ph:self.episode_reward/self.game_state.r_sc}); 152 | 153 | self.episode_reward = 0 154 | state=self.game_state.reset() 155 | self.game_state.reset_gs(state); 156 | if USE_LSTM: 157 | self.local_network.reset_state() 158 | break 159 | 160 | R = 0.0 161 | if not terminal_end: 162 | R = self.local_network.run_value(sess, self.game_state.s_t) 163 | score=self.episode_reward/self.game_state.r_sc; 164 | 165 | actions.reverse() 166 | states.reverse() 167 | rewards.reverse() 168 | values.reverse() 169 | 170 | batch_si = [] 171 | batch_a = [] 172 | batch_td = [] 173 | batch_R = [] 174 | 175 | # compute and accmulate gradients 176 | for(ai, ri, si, Vi) in zip(actions, rewards, states, values): 177 | R = ri + GAMMA * R 178 | td = R - Vi 179 | 180 | batch_si.append(si) 181 | batch_R.append(R) 182 | batch_td.append(td); 183 | 184 | pcur_learning_rate = self._panneal_learning_rate(global_t) 185 | vcur_learning_rate = self._vanneal_learning_rate(global_t) 186 | 187 | if USE_LSTM: 188 | batch_si.reverse() 189 | batch_td.reverse() 190 | batch_R.reverse() 191 | 192 | sess.run( self.apply_policy_gradients, 193 | feed_dict = { 194 | self.local_network.s: batch_si, 195 | self.local_network.td: batch_td, 196 | self.local_network.r: batch_R, 197 | self.local_network.pinitial_lstm_state: pstart_lstm_state, 198 | self.local_network.pstep_size : [len(batch_a)], 199 | self.local_network.vinitial_lstm_state: vstart_lstm_state, 200 | 
self.local_network.vstep_size : [len(batch_a)], 201 | self.plearning_rate_input: pcur_learning_rate } ) 202 | sess.run( self.apply_value_gradients, 203 | feed_dict = { 204 | self.local_network.s: batch_si, 205 | self.local_network.td: batch_td, 206 | self.local_network.r: batch_R, 207 | self.local_network.pinitial_lstm_state: pstart_lstm_state, 208 | self.local_network.pstep_size : [len(batch_a)], 209 | self.local_network.vinitial_lstm_state: vstart_lstm_state, 210 | self.local_network.vstep_size : [len(batch_a)], 211 | self.vlearning_rate_input: vcur_learning_rate } ) 212 | else: 213 | sess.run( self.apply_policy_gradients, 214 | feed_dict = { 215 | self.local_network.s: batch_si, 216 | self.local_network.r: batch_R, 217 | self.local_network.td: batch_td, 218 | self.plearning_rate_input: pcur_learning_rate} ) 219 | sess.run( self.apply_value_gradients, 220 | feed_dict = { 221 | self.local_network.s: batch_si, 222 | self.local_network.r: batch_R, 223 | self.local_network.td: batch_td, 224 | self.vlearning_rate_input: vcur_learning_rate} ) 225 | 226 | if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): 227 | self.prev_local_t += PERFORMANCE_LOG_INTERVAL 228 | elapsed_time = time.time() - self.start_time 229 | steps_per_sec = global_t / elapsed_time 230 | #print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format( 231 | # global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) 232 | 233 | # return advanced local step size 234 | diff_local_t = self.local_t - start_local_t 235 | return diff_local_t 236 | 237 | -------------------------------------------------------------------------------- /game_ac_network.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | # Actor-Critic Network Base Class 6 | # (Policy network and Value network) 7 | class GameACNetwork(object): 8 | def __init__(self, 9 | action_size, 10 | thread_index, # -1 for global 11 | device="/cpu:0"): 12 | self._action_size = action_size 13 | self._thread_index = thread_index 14 | self._device = device 15 | 16 | def prepare_loss(self, entropy_beta): 17 | with tf.device(self._device): 18 | # taken action (input for policy) 19 | self.a = tf.placeholder("float", [None, self._action_size]) 20 | 21 | # temporary difference (R-V) (input for policy) 22 | self.td = tf.placeholder("float", [None]) 23 | 24 | # avoid NaN with clipping when value in pi becomes zero 25 | log_pi = tf.log(tf.clip_by_value(self.pi, 1e-20, 1.0)) 26 | 27 | # policy entropy 28 | entropy = -tf.reduce_sum(self.pi * log_pi, reduction_indices=1) 29 | 30 | # policy loss (output) (Adding minus, because the original paper's objective function is for gradient ascent, but we use gradient descent optimizer.) 
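# Written out, the objective assembled in the next lines is (summed over the batch):
#   policy_loss = -sum_t[ log pi(a_t|s_t) * td_t + entropy_beta * H(pi(.|s_t)) ]
#   value_loss  = 0.5 * l2_loss(R_t - V(s_t))   (tf.nn.l2_loss already halves the squared error)
#   total_loss  = policy_loss + value_loss
# where td_t = R_t - V(s_t) arrives via self.td and R_t via self.r.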
31 | policy_loss = - tf.reduce_sum( tf.reduce_sum( tf.multiply( log_pi, self.a ), reduction_indices=1 ) * self.td + entropy * entropy_beta ) 32 | 33 | # R (input for value) 34 | self.r = tf.placeholder("float", [None]) 35 | 36 | # value loss (output) 37 | # (Learning rate for Critic is half of Actor's, so multiply by 0.5) 38 | value_loss = 0.5 * tf.nn.l2_loss(self.r - self.v) 39 | 40 | # gradienet of policy and value are summed up 41 | self.total_loss = policy_loss + value_loss 42 | 43 | def run_policy_and_value(self, sess, s_t): 44 | raise NotImplementedError() 45 | 46 | def run_policy(self, sess, s_t): 47 | raise NotImplementedError() 48 | 49 | def run_value(self, sess, s_t): 50 | raise NotImplementedError() 51 | 52 | def get_vars(self): 53 | raise NotImplementedError() 54 | 55 | def sync_from(self, src_netowrk, name=None): 56 | src_vars = src_netowrk.get_vars() 57 | dst_vars = self.get_vars() 58 | 59 | sync_ops = [] 60 | 61 | with tf.device(self._device): 62 | with tf.name_scope(name, "GameACNetwork", []) as name: 63 | for(src_var, dst_var) in zip(src_vars, dst_vars): 64 | sync_op = tf.assign(dst_var, src_var) 65 | sync_ops.append(sync_op) 66 | 67 | return tf.group(*sync_ops, name=name) 68 | 69 | # weight initialization based on muupan's code 70 | # https://github.com/muupan/async-rl/blob/master/a3c_ale.py 71 | def _fc_variable(self, weight_shape): 72 | input_channels = weight_shape[0] 73 | output_channels = weight_shape[1] 74 | d = 1.0 / np.sqrt(input_channels) 75 | bias_shape = [output_channels] 76 | weight = tf.Variable(tf.random_uniform(weight_shape, minval=-d, maxval=d)) 77 | bias = tf.Variable(tf.random_uniform(bias_shape, minval=-d, maxval=d)) 78 | return weight, bias 79 | 80 | def _conv_variable(self, weight_shape): 81 | w = weight_shape[0] 82 | h = weight_shape[1] 83 | input_channels = weight_shape[2] 84 | output_channels = weight_shape[3] 85 | d = 1.0 / np.sqrt(input_channels * w * h) 86 | bias_shape = [output_channels] 87 | weight = tf.Variable(tf.random_uniform(weight_shape, minval=-d, maxval=d)) 88 | bias = tf.Variable(tf.random_uniform(bias_shape, minval=-d, maxval=d)) 89 | return weight, bias 90 | 91 | def _conv2d(self, x, W, stride): 92 | return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "VALID") 93 | 94 | # Actor-Critic FF Network 95 | class GameACFFNetwork(GameACNetwork): 96 | def __init__(self, 97 | action_size, 98 | thread_index, # -1 for global 99 | device="/cpu:0"): 100 | GameACNetwork.__init__(self, action_size, thread_index, device) 101 | 102 | scope_name = "net_" + str(self._thread_index) 103 | with tf.device(self._device), tf.variable_scope(scope_name) as scope: 104 | self.W_conv1, self.b_conv1 = self._conv_variable([8, 8, 4, 16]) # stride=4 105 | self.W_conv2, self.b_conv2 = self._conv_variable([4, 4, 16, 32]) # stride=2 106 | 107 | self.W_fc1, self.b_fc1 = self._fc_variable([2592, 256]) 108 | 109 | # weight for policy output layer 110 | self.W_fc2, self.b_fc2 = self._fc_variable([256, action_size]) 111 | 112 | # weight for value output layer 113 | self.W_fc3, self.b_fc3 = self._fc_variable([256, 1]) 114 | 115 | # state (input) 116 | self.s = tf.placeholder("float", [None, 84, 84, 4]) 117 | 118 | h_conv1 = tf.nn.relu(self._conv2d(self.s, self.W_conv1, 4) + self.b_conv1) 119 | h_conv2 = tf.nn.relu(self._conv2d(h_conv1, self.W_conv2, 2) + self.b_conv2) 120 | 121 | h_conv2_flat = tf.reshape(h_conv2, [-1, 2592]) 122 | h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, self.W_fc1) + self.b_fc1) 123 | 124 | # policy (output) 125 | self.pi = 
tf.nn.softmax(tf.matmul(h_fc1, self.W_fc2) + self.b_fc2) 126 | # value (output) 127 | v_ = tf.matmul(h_fc1, self.W_fc3) + self.b_fc3 128 | self.v = tf.reshape( v_, [-1] ) 129 | 130 | def run_policy_and_value(self, sess, s_t): 131 | pi_out, v_out = sess.run( [self.pi, self.v], feed_dict = {self.s : [s_t]} ) 132 | return (pi_out[0], v_out[0]) 133 | 134 | def run_policy(self, sess, s_t): 135 | pi_out = sess.run( self.pi, feed_dict = {self.s : [s_t]} ) 136 | return pi_out[0] 137 | 138 | def run_value(self, sess, s_t): 139 | v_out = sess.run( self.v, feed_dict = {self.s : [s_t]} ) 140 | return v_out[0] 141 | 142 | def get_vars(self): 143 | return [self.W_conv1, self.b_conv1, 144 | self.W_conv2, self.b_conv2, 145 | self.W_fc1, self.b_fc1, 146 | self.W_fc2, self.b_fc2, 147 | self.W_fc3, self.b_fc3] 148 | 149 | # Actor-Critic LSTM Network 150 | class GameACLSTMNetwork(GameACNetwork): 151 | def __init__(self, 152 | action_size, 153 | thread_index, # -1 for global 154 | device="/cpu:0" ): 155 | GameACNetwork.__init__(self, action_size, thread_index, device) 156 | 157 | scope_name = "net_" + str(self._thread_index) 158 | with tf.device(self._device), tf.variable_scope(scope_name) as scope: 159 | self.W_conv1, self.b_conv1 = self._conv_variable([8, 8, 4, 16]) # stride=4 160 | self.W_conv2, self.b_conv2 = self._conv_variable([4, 4, 16, 32]) # stride=2 161 | 162 | self.W_fc1, self.b_fc1 = self._fc_variable([2592, 256]) 163 | 164 | # lstm 165 | self.lstm = tf.contrib.rnn.BasicLSTMCell(256, state_is_tuple=True) 166 | 167 | # weight for policy output layer 168 | self.W_fc2, self.b_fc2 = self._fc_variable([256, action_size]) 169 | 170 | # weight for value output layer 171 | self.W_fc3, self.b_fc3 = self._fc_variable([256, 1]) 172 | 173 | # state (input) 174 | self.s = tf.placeholder("float", [None, 84, 84, 4]) 175 | 176 | h_conv1 = tf.nn.relu(self._conv2d(self.s, self.W_conv1, 4) + self.b_conv1) 177 | h_conv2 = tf.nn.relu(self._conv2d(h_conv1, self.W_conv2, 2) + self.b_conv2) 178 | 179 | h_conv2_flat = tf.reshape(h_conv2, [-1, 2592]) 180 | h_fc1 = tf.nn.relu(tf.matmul(h_conv2_flat, self.W_fc1) + self.b_fc1) 181 | # h_fc1 shape=(5,256) 182 | 183 | h_fc1_reshaped = tf.reshape(h_fc1, [1,-1,256]) 184 | # h_fc_reshaped = (1,5,256) 185 | 186 | # place holder for LSTM unrolling time step size. 187 | self.step_size = tf.placeholder(tf.float32, [1]) 188 | 189 | self.initial_lstm_state0 = tf.placeholder(tf.float32, [1, 256]) 190 | self.initial_lstm_state1 = tf.placeholder(tf.float32, [1, 256]) 191 | self.initial_lstm_state = tf.contrib.rnn.LSTMStateTuple(self.initial_lstm_state0, 192 | self.initial_lstm_state1) 193 | 194 | # Unrolling LSTM up to LOCAL_T_MAX time steps. (= 5time steps.) 195 | # When episode terminates unrolling time steps becomes less than LOCAL_TIME_STEP. 196 | # Unrolling step size is applied via self.step_size placeholder. 197 | # When forward propagating, step_size is 1. 198 | # (time_major = False, so output shape is [batch_size, max_time, cell.output_size]) 199 | lstm_outputs, self.lstm_state = tf.nn.dynamic_rnn(self.lstm, 200 | h_fc1_reshaped, 201 | initial_state = self.initial_lstm_state, 202 | sequence_length = self.step_size, 203 | time_major = False, 204 | scope = scope) 205 | 206 | # lstm_outputs: (1,5,256) for back prop, (1,1,256) for forward prop. 
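# The [1, T, 256] LSTM output is flattened to [T, 256] below so that the policy
# and value heads are applied to every unrolled time step independently.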
207 | 208 | lstm_outputs = tf.reshape(lstm_outputs, [-1,256]) 209 | 210 | # policy (output) 211 | self.pi = tf.nn.softmax(tf.matmul(lstm_outputs, self.W_fc2) + self.b_fc2) 212 | 213 | # value (output) 214 | v_ = tf.matmul(lstm_outputs, self.W_fc3) + self.b_fc3 215 | self.v = tf.reshape( v_, [-1] ) 216 | 217 | scope.reuse_variables() 218 | self.W_lstm = tf.get_variable("basic_lstm_cell/weights") 219 | self.b_lstm = tf.get_variable("basic_lstm_cell/biases") 220 | 221 | self.reset_state() 222 | 223 | def reset_state(self): 224 | self.lstm_state_out = tf.contrib.rnn.LSTMStateTuple(np.zeros([1, 256]), 225 | np.zeros([1, 256])) 226 | 227 | def run_policy_and_value(self, sess, s_t): 228 | # This run_policy_and_value() is used when forward propagating. 229 | # so the step size is 1. 230 | pi_out, v_out, self.lstm_state_out = sess.run( [self.pi, self.v, self.lstm_state], 231 | feed_dict = {self.s : [s_t], 232 | self.initial_lstm_state0 : self.lstm_state_out[0], 233 | self.initial_lstm_state1 : self.lstm_state_out[1], 234 | self.step_size : [1]} ) 235 | # pi_out: (1,3), v_out: (1) 236 | return (pi_out[0], v_out[0]) 237 | 238 | def run_policy(self, sess, s_t): 239 | # This run_policy() is used for displaying the result with display tool. 240 | pi_out, self.lstm_state_out = sess.run( [self.pi, self.lstm_state], 241 | feed_dict = {self.s : [s_t], 242 | self.initial_lstm_state0 : self.lstm_state_out[0], 243 | self.initial_lstm_state1 : self.lstm_state_out[1], 244 | self.step_size : [1]} ) 245 | 246 | return pi_out[0] 247 | 248 | def run_value(self, sess, s_t): 249 | # This run_value() is used for calculating V for bootstrapping at the 250 | # end of LOCAL_T_MAX time step sequence. 251 | # When next sequcen starts, V will be calculated again with the same state using updated network weights, 252 | # so we don't update LSTM state here. 
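# The scalar returned here is used by process() as the bootstrap value R when the
# LOCAL_T_MAX rollout ends without reaching a terminal state.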
253 | prev_lstm_state_out = self.lstm_state_out 254 | v_out, _ = sess.run( [self.v, self.lstm_state], 255 | feed_dict = {self.s : [s_t], 256 | self.initial_lstm_state0 : self.lstm_state_out[0], 257 | self.initial_lstm_state1 : self.lstm_state_out[1], 258 | self.step_size : [1]} ) 259 | 260 | # roll back lstm state 261 | self.lstm_state_out = prev_lstm_state_out 262 | return v_out[0] 263 | 264 | def get_vars(self): 265 | return [self.W_conv1, self.b_conv1, 266 | self.W_conv2, self.b_conv2, 267 | self.W_fc1, self.b_fc1, 268 | self.W_lstm, self.b_lstm, 269 | self.W_fc2, self.b_fc2, 270 | self.W_fc3, self.b_fc3] 271 | -------------------------------------------------------------------------------- /cont_action/game_ac_network.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import tensorflow as tf 3 | import numpy as np 4 | import math 5 | 6 | # Actor-Critic Network Base Class 7 | # (Policy network and Value network) 8 | class GameACNetwork(object): 9 | def __init__(self, 10 | action_size, 11 | state_size, 12 | thread_index, # -1 for global 13 | device="/cpu:0"): 14 | self._action_size = action_size 15 | self._state_size = state_size 16 | self._thread_index = thread_index 17 | self._device = device 18 | 19 | def prepare_loss(self, entropy_beta): 20 | with tf.device(self._device): 21 | 22 | # temporary difference (R-V) (input for policy) 23 | self.td = tf.placeholder("float", [None]) 24 | 25 | # policy loss 26 | self.policy_loss=policy_loss = -1*self.log_prob*self.td -1e-1*self.entropy; 27 | 28 | # R (input for value) 29 | self.r = tf.placeholder("float", [None]) 30 | 31 | # value loss (output) 32 | # (Learning rate for Critic is half of Actor's, so multiply by 0.5) 33 | self.value_loss=value_loss = tf.squared_difference(self.r, self.v) 34 | 35 | def run_policy_and_value(self, sess, s_t): 36 | raise NotImplementedError() 37 | 38 | def run_policy(self, sess, s_t): 39 | raise NotImplementedError() 40 | 41 | def run_value(self, sess, s_t): 42 | raise NotImplementedError() 43 | 44 | def get_vars(self): 45 | raise NotImplementedError() 46 | 47 | def sync_from(self, src_netowrk, name=None): 48 | src_vars = src_netowrk.get_vars() 49 | dst_vars = self.get_vars() 50 | 51 | sync_ops = [] 52 | 53 | with tf.device(self._device): 54 | with tf.name_scope(name, "GameACNetwork", []) as name: 55 | for(src_var, dst_var) in zip(src_vars, dst_vars): 56 | sync_op = tf.assign(dst_var, src_var) 57 | sync_ops.append(sync_op) 58 | 59 | return tf.group(*sync_ops, name=name) 60 | 61 | # weight initialization based on muupan's code 62 | # https://github.com/muupan/async-rl/blob/master/a3c_ale.py 63 | def _fc_variable(self, weight_shape): 64 | input_channels = weight_shape[0] 65 | output_channels = weight_shape[1] 66 | d = 1.0 / np.sqrt(input_channels) 67 | bias_shape = [output_channels] 68 | #weight = tf.Variable(tf.random_uniform(weight_shape, minval=-d, maxval=d)) 69 | #bias = tf.Variable(tf.random_uniform(bias_shape, minval=-d, maxval=d)) 70 | weight = tf.Variable(tf.random_uniform(weight_shape)) 71 | bias = tf.Variable(tf.random_uniform(bias_shape)) 72 | return weight, bias 73 | 74 | # Actor-Critic FF Network 75 | class GameACFFNetwork(GameACNetwork): 76 | def __init__(self, 77 | action_size, 78 | state_size, 79 | action_low, 80 | action_high, 81 | thread_index, # -1 for global 82 | device="/cpu:0"): 83 | GameACNetwork.__init__(self, action_size, state_size,thread_index, device) 84 | 85 | # state (input) 86 | self.s = tf.placeholder("float", [None, 
state_size, 4]) 87 | s2 = tf.reshape(self.s,[-1, state_size*4]); 88 | 89 | scope_name = "net_" + str(self._thread_index) + "_policy" 90 | with tf.device(self._device), tf.variable_scope(scope_name) as scope: 91 | # for mu 92 | self.pW_fc1, self.pb_fc1 = self._fc_variable([state_size*4, action_size]) 93 | # for sigma 94 | self.pW_fc2, self.pb_fc2 = self._fc_variable([state_size*4, action_size]) 95 | # policy (output) 96 | self.mu = tf.matmul(s2, self.pW_fc1) + self.pb_fc1 97 | self.sigma = tf.matmul(s2, self.pW_fc2) + self.pb_fc2 98 | self.sigma = tf.nn.softplus(self.sigma) 99 | #SoftPlus operation 100 | self.normal_dist=tf.contrib.distributions.Normal(self.mu,self.sigma); 101 | self.action = self.normal_dist._sample_n(1) 102 | self.action = tf.clip_by_value(self.action,action_low,action_high); 103 | self.log_prob = self.normal_dist.log_prob(self.action); 104 | self.entropy = self.normal_dist.entropy(); 105 | 106 | scope_name = "net_" + str(self._thread_index) + "_value" 107 | with tf.device(self._device), tf.variable_scope(scope_name) as scope: 108 | # for value 109 | self.vW_fc1, self.vb_fc1 = self._fc_variable([state_size*4, action_size]) 110 | v_ = tf.matmul(s2,self.vW_fc1)+self.vb_fc1; 111 | self.v = tf.reshape(v_,[-1]); 112 | 113 | def run_policy_and_value(self, sess, s_t): 114 | action_out, v_out = sess.run( [self.action, self.v], feed_dict = {self.s : [s_t]} ) 115 | return (action_out[0], v_out[0]) 116 | 117 | def run_policy(self, sess, s_t): 118 | action_out = sess.run([self.action], feed_dict = {self.s : [s_t]} ) 119 | return action_out[0] 120 | 121 | def run_value(self, sess, s_t): 122 | v_out = sess.run( self.v, feed_dict = {self.s : [s_t]} ) 123 | return v_out[0] 124 | 125 | def get_vars(self): 126 | return [self.pW_fc1, self.pb_fc1, 127 | self.pW_fc2, self.pb_fc2, 128 | self.vW_fc1, self.vb_fc1] 129 | def get_pvars(self): 130 | return [self.pW_fc1, self.pb_fc1, 131 | self.pW_fc2, self.pb_fc2] 132 | def get_vvars(self): 133 | return [self.vW_fc1, self.vb_fc1] 134 | 135 | # Actor-Critic LSTM Network 136 | class GameACLSTMNetwork(GameACNetwork): 137 | def __init__(self, 138 | action_size, 139 | state_size, 140 | action_low, 141 | action_high, 142 | thread_index, # -1 for global 143 | device="/cpu:0" ): 144 | GameACNetwork.__init__(self, action_size,state_size, thread_index, device) 145 | 146 | # state (input) 147 | self.s = tf.placeholder("float", [None, state_size, 4]) 148 | s2 = tf.reshape(self.s,[-1, state_size*4]); 149 | 150 | 151 | # place holder for LSTM unrolling time step size. 
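# Unlike the discrete-action GameACLSTMNetwork, the continuous-action variant builds
# two independent 200-unit LSTMs, one for the policy and one for the value function,
# each with its own step-size and initial-state placeholders; the policy branch comes first.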
152 | self.pstep_size = tf.placeholder(tf.float32, [1]) 153 | self.pinitial_lstm_state0 = tf.placeholder(tf.float32, [1, 200]) 154 | self.pinitial_lstm_state1 = tf.placeholder(tf.float32, [1, 200]) 155 | self.pinitial_lstm_state = tf.contrib.rnn.LSTMStateTuple(self.pinitial_lstm_state0,self.pinitial_lstm_state1) 156 | scope_name = "net_" + str(self._thread_index)+"_policy" 157 | with tf.device(self._device), tf.variable_scope(scope_name) as scope: 158 | ### policy weight 159 | 160 | self.pW_fc1, self.pb_fc1 = self._fc_variable([state_size*4, 200]) 161 | # lstm 162 | self.plstm = tf.contrib.rnn.BasicLSTMCell(200, state_is_tuple=True) 163 | # weight for policy output layer 164 | self.pW_fc2, self.pb_fc2 = self._fc_variable([200, action_size]) 165 | self.pW_fc3, self.pb_fc3 = self._fc_variable([200, action_size]) 166 | 167 | ### policy networks 168 | self.ph_fc1 = ph_fc1 = tf.nn.relu(tf.matmul(s2, self.pW_fc1) + self.pb_fc1) 169 | ph_fc1_reshaped = tf.reshape(ph_fc1, [1,-1,200]) 170 | plstm_outputs, self.plstm_state = tf.nn.dynamic_rnn(self.plstm, 171 | ph_fc1_reshaped, 172 | initial_state = self.pinitial_lstm_state, 173 | sequence_length = self.pstep_size, 174 | time_major = False, 175 | scope = scope) 176 | plstm_outputs = tf.reshape(plstm_outputs, [-1,200]) 177 | self.mu = tf.matmul(plstm_outputs, self.pW_fc2) + self.pb_fc2 178 | self.sigma = tf.matmul(plstm_outputs, self.pW_fc3) + self.pb_fc3 179 | self.sigma = tf.nn.softplus(self.sigma)+1e-5 180 | #SoftPlus operation 181 | self.normal_dist=tf.contrib.distributions.Normal(self.mu,self.sigma); 182 | self.action = self.normal_dist._sample_n(1) 183 | self.action = tf.clip_by_value(self.action,action_low,action_high); 184 | self.log_prob = self.normal_dist.log_prob(self.action); 185 | self.entropy = self.normal_dist.entropy(); 186 | scope.reuse_variables() 187 | self.pW_lstm = tf.get_variable("basic_lstm_cell/kernel") 188 | self.pb_lstm = tf.get_variable("basic_lstm_cell/bias") 189 | self.reset_state() 190 | 191 | # place holder for LSTM unrolling time step size. 
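# The value branch below mirrors the policy branch: its own 200-unit LSTM and state
# placeholders, ending in a single linear unit that produces the scalar V(s).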
192 | self.vstep_size = tf.placeholder(tf.float32, [1]) 193 | self.vinitial_lstm_state0 = tf.placeholder(tf.float32, [1, 200]) 194 | self.vinitial_lstm_state1 = tf.placeholder(tf.float32, [1, 200]) 195 | self.vinitial_lstm_state = tf.contrib.rnn.LSTMStateTuple(self.vinitial_lstm_state0,self.vinitial_lstm_state1) 196 | scope_name = "net_" + str(self._thread_index)+"_value" 197 | with tf.device(self._device), tf.variable_scope(scope_name) as scope: 198 | ### value weights 199 | self.vW_fc1, self.vb_fc1 = self._fc_variable([state_size*4, 200]) 200 | # lstm 201 | self.vlstm = tf.contrib.rnn.BasicLSTMCell(200, state_is_tuple=True) 202 | # weight for value output layer 203 | self.vW_fc2, self.vb_fc2 = self._fc_variable([200, 1]) 204 | ### value networks 205 | self.vh_fc1 = vh_fc1 = tf.nn.relu(tf.matmul(s2, self.vW_fc1) + self.vb_fc1) 206 | vh_fc1_reshaped = tf.reshape(vh_fc1, [1,-1,200]) 207 | vlstm_outputs, self.vlstm_state = tf.nn.dynamic_rnn(self.vlstm, 208 | vh_fc1_reshaped, 209 | initial_state = self.vinitial_lstm_state, 210 | sequence_length = self.vstep_size, 211 | time_major = False, 212 | scope = scope) 213 | vlstm_outputs = tf.reshape(vlstm_outputs, [-1,200]) 214 | v_ = tf.matmul(vlstm_outputs, self.vW_fc2) + self.vb_fc2 215 | self.v = tf.reshape( v_, [-1] ) 216 | scope.reuse_variables() 217 | self.vW_lstm = tf.get_variable("basic_lstm_cell/kernel") 218 | self.vb_lstm = tf.get_variable("basic_lstm_cell/bias") 219 | self.reset_state() 220 | 221 | def reset_state(self): 222 | self.plstm_state_out = tf.contrib.rnn.LSTMStateTuple(np.zeros([1, 200]), 223 | np.zeros([1, 200])) 224 | self.vlstm_state_out = tf.contrib.rnn.LSTMStateTuple(np.zeros([1, 200]), 225 | np.zeros([1, 200])) 226 | 227 | def run_policy_and_value(self, sess, s_t): 228 | # This run_policy_and_value() is used when forward propagating. 229 | # so the step size is 1. 230 | action_out, v_out = sess.run( [self.action, self.v], 231 | feed_dict = {self.s : [s_t], 232 | self.pinitial_lstm_state0 : self.plstm_state_out[0], 233 | self.pinitial_lstm_state1 : self.plstm_state_out[1], 234 | self.pstep_size : [1], 235 | self.vinitial_lstm_state0 : self.vlstm_state_out[0], 236 | self.vinitial_lstm_state1 : self.vlstm_state_out[1], 237 | self.vstep_size : [1]} ) 238 | # pi_out: (1,3), v_out: (1) 239 | return (action_out[0], v_out[0]) 240 | 241 | def run_policy(self, sess, s_t): 242 | # This run_policy() is used for displaying the result with display tool. 
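# Only self.action is fetched here, but the feed_dict still supplies the value
# branch's LSTM state and step-size placeholders along with the policy branch's.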
243 | action_out = sess.run( self.action, 244 | feed_dict = {self.s : [s_t], 245 | self.pinitial_lstm_state0 : self.plstm_state_out[0], 246 | self.pinitial_lstm_state1 : self.plstm_state_out[1], 247 | self.pstep_size : [1], 248 | self.vinitial_lstm_state0 : self.vlstm_state_out[0], 249 | self.vinitial_lstm_state1 : self.vlstm_state_out[1], 250 | self.vstep_size : [1]} ) 251 | 252 | return action_out[0] 253 | 254 | def run_value(self, sess, s_t): 255 | vprev_lstm_state_out = self.vlstm_state_out 256 | v_out = sess.run(self.v, 257 | feed_dict = {self.s : [s_t], 258 | self.pinitial_lstm_state0 : self.plstm_state_out[0], 259 | self.pinitial_lstm_state1 : self.plstm_state_out[1], 260 | self.pstep_size : [1], 261 | self.vinitial_lstm_state0 : self.vlstm_state_out[0], 262 | self.vinitial_lstm_state1 : self.vlstm_state_out[1], 263 | self.vstep_size : [1]} ) 264 | 265 | # roll back lstm state 266 | self.vlstm_state_out = vprev_lstm_state_out 267 | return v_out[0] 268 | 269 | def get_vars(self): 270 | return [self.pW_fc1, self.pb_fc1, 271 | self.pW_lstm, self.pb_lstm, 272 | self.pW_fc2, self.pb_fc2, 273 | self.pW_fc3, self.pb_fc3, 274 | self.vW_fc1, self.vb_fc1, 275 | self.vW_lstm, self.vb_lstm, 276 | self.vW_fc2, self.vb_fc2] 277 | 278 | def get_pvars(self): 279 | return [self.pW_fc1, self.pb_fc1, 280 | self.pW_lstm, self.pb_lstm, 281 | self.pW_fc2, self.pb_fc2, 282 | self.pW_fc3, self.pb_fc3] 283 | 284 | def get_vvars(self): 285 | return [self.vW_fc1, self.vb_fc1, 286 | self.vW_lstm, self.vb_lstm, 287 | self.vW_fc2, self.vb_fc2] 288 | --------------------------------------------------------------------------------
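Both a3c_training_thread.py variants derive their training targets the same way inside process(): the collected rewards and value estimates are walked backwards, the bootstrapped return R is discounted with GAMMA, and the advantage fed to the policy loss is td = R - V. The following is a minimal, framework-free sketch of that bookkeeping; the function name, the plain-list inputs, and the gamma=0.99 default are illustrative stand-ins (the repository reads GAMMA from constants.py), not code taken from the project.

# Minimal sketch of the n-step return / advantage computation done in process().
# Assumes: rewards and values are per-step lists from the rollout (oldest first),
# bootstrap_value is run_value(...) for the last state (0.0 if the episode ended),
# and gamma stands in for the GAMMA constant.
def compute_targets(rewards, values, bootstrap_value, gamma=0.99):
    batch_R, batch_td = [], []
    R = bootstrap_value
    # Walk the rollout backwards, matching the reversed lists in process().
    for r, v in zip(reversed(rewards), reversed(values)):
        R = r + gamma * R          # discounted n-step return
        batch_R.append(R)
        batch_td.append(R - v)     # advantage estimate fed to the policy loss
    # process() reverses the batches again before the LSTM update so that they
    # are back in time order; do the same here.
    batch_R.reverse()
    batch_td.reverse()
    return batch_R, batch_td

# Example: a 3-step rollout that did not terminate, so a bootstrap value is used.
R_batch, td_batch = compute_targets(
    rewards=[0.0, 0.0, 1.0],
    values=[0.1, 0.2, 0.3],
    bootstrap_value=0.5,
    gamma=0.99)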