├── README.md ├── eval.py ├── gym_wrapper.py ├── log.png ├── logger.py ├── paac-breakout-1280014.pkl ├── paac.py ├── train.py └── worker.py /README.md: -------------------------------------------------------------------------------- 1 | # PAAC.pytorch 2 | PyTorch implementation of the PAAC algorithm presented in "Efficient Parallel Methods for Deep Reinforcement Learning". PAAC stands for Parallel Advantage Actor-Critic. 3 | 4 | The current network does not use an LSTM, so evaluation results are limited for now. An LSTM version of PAAC is in progress (waiting for a new graphics card, since the current GPU does not have enough memory). 5 | 6 | The original paper is here: https://arxiv.org/abs/1705.04862 7 | 8 | # Requirements 9 | PAAC.pytorch requires torch, torchvision, PIL, and gym. 10 | 11 | Libraries used in this project: 12 | 13 | * torch==0.1.12+32e6665 14 | * torchvision==0.1.8 15 | * Pillow==4.1.1 16 | * gym@797a25d1b1a8823b305fdb575c4378a5c288b432 17 | 18 | # Result (BreakoutDeterministic-v4 training log) 19 | ![log](log.png) 20 | 21 | https://www.youtube.com/watch?v=6FMzNaL88wQ 22 | --------------------------------------------------------------------------------
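For reference, the update that `paac.py` and `train.py` implement is the standard n-step advantage actor-critic rule: Q-targets are bootstrapped backwards from the value of the last state, the policy term uses the detached advantage, and the squared value error is weighted by 0.25. The snippet below is a condensed sketch of that computation; it is not repository code, it assumes a recent PyTorch API, and the helper name `paac_losses` is made up. Shapes mirror `train.py` (T = t_max, N = n_e).

```python
import torch

def paac_losses(rewards, terminals, bootstrap_value, values, log_pi_a, gamma=0.99):
    """rewards, terminals, values, log_pi_a: (T, N); bootstrap_value: (N,)."""
    q_values = torch.zeros_like(rewards)
    next_value = bootstrap_value
    # n-step return: Q_t = r_t + gamma * (1 - terminal_t) * Q_{t+1}
    for t in reversed(range(rewards.size(0))):
        next_value = rewards[t] + (1. - terminals[t]) * gamma * next_value
        q_values[t] = next_value
    advantage = q_values - values
    # the policy gradient treats the advantage as a constant (detached), as in get_loss()
    loss_p = -(advantage.detach() * log_pi_a).mean()
    double_loss_v = advantage.pow(2).mean()   # "2 * MSE" in the repo's convention
    # note: train.py sums per-timestep losses instead of averaging over T,
    # and subtracts beta * entropy before backprop
    return loss_p, double_loss_v, loss_p + 0.25 * double_loss_v
```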
/eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | import gym_wrapper as gym 7 | from paac import PAACNet, INPUT_CHANNELS, INPUT_IMAGE_SIZE 8 | 9 | 10 | def get_args(): 11 | parser = argparse.ArgumentParser( 12 | description='Evaluate a PAAC model.', 13 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 14 | 15 | parser.add_argument('--env', type=str, default='Pong-v0') 16 | parser.add_argument('-f', '--filename', type=str, default='paac.pkl', 17 | help='filename to load the trained model from.') 18 | parser.add_argument('--no-cuda', action='store_true') 19 | parser.add_argument('-d', '--debug', action='store_true') 20 | parser.add_argument('--use-max', action='store_true') 21 | 22 | return parser.parse_args() 23 | 24 | 25 | def draw_state(state): 26 | import matplotlib.pyplot as plt 27 | plt.ion() 28 | for i in range(4): 29 | plt.subplot(141 + i) 30 | plt.imshow(PAACNet.to_pil_image(state[:, i]), cmap='gray') 31 | plt.pause(1e-30) 32 | 33 | 34 | if __name__ == '__main__': 35 | args = get_args() 36 | args.cuda = torch.cuda.is_available() and not args.no_cuda 37 | 38 | env = gym.make(args.env, hack='eval') 39 | ob = env.reset() 40 | 41 | num_actions = env.action_space.n 42 | print('num_actions:', num_actions) 43 | 44 | action_meanings = env.env.get_action_meanings() 45 | print('action_meanings:', action_meanings) 46 | 47 | paac = PAACNet(num_actions) 48 | checkpoint = torch.load(args.filename, 49 | map_location=lambda storage, loc: storage) 50 | 51 | try: 52 | iteration = checkpoint['iteration'] 53 | except KeyError: 54 | iteration = -1 55 | 56 | paac.load_state_dict(checkpoint['paac']) 57 | print('Loaded PAAC checkpoint (%d) from' % iteration, args.filename) 58 | 59 | paac.eval() 60 | 61 | state = torch.zeros(1, INPUT_CHANNELS, *INPUT_IMAGE_SIZE) 62 | score = 0 63 | 64 | if args.cuda: 65 | paac.cuda() 66 | state = state.pin_memory().cuda(async=True) 67 | 68 | while True: 69 | state[0, :-1] = state[0, 1:] 70 | state[0, -1] = PAACNet.preprocess(ob) 71 | env.render() 72 | 73 | # draw_state(state) 74 | 75 | policy, value = paac(Variable(state, volatile=True)) 76 | 77 | if args.use_max: 78 | action = policy.max(1)[1].cpu().data[0] 79 | else: 80 | action = policy.multinomial()[0].cpu().data[0] 81 | 82 | if args.debug: 83 | entropy = paac.entropy(policy, 1e-30) 84 | 85 | print('policy:', policy.data.numpy()) 86 | print('value:', value.data[0, 0]) 87 | print('entropy:', entropy.data[0]) 88 | print(action_meanings[action]) 89 | 90 | ob, reward, done, info = env.step(action) 91 | score += reward 92 | 93 | if done: 94 | print('score:', score) 95 | score = 0 96 | state.fill_(0) 97 | ob = env.reset() 98 | -------------------------------------------------------------------------------- /gym_wrapper.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | 4 | def make(env_id, hack=None): 5 | if 'Deterministic-v4' not in env_id: 6 | print('[Warning] Use a Deterministic-v4 environment ' 7 | 'to reproduce the results of the paper.') 8 | 9 | _env = env = gym.make(env_id) 10 | 11 | if hack: 12 | # Hack the gym env to output grayscale images 13 | if env.spec.timestep_limit is not None: 14 | from gym.wrappers.time_limit import TimeLimit 15 | 16 | if isinstance(env, TimeLimit): 17 | _env = env.env 18 | 19 | if hack == 'train': 20 | _env._get_image = _env.ale.getScreenGrayscale 21 | _env._get_obs = _env.ale.getScreenGrayscale 22 | elif hack == 'eval': 23 | _env._get_obs = _env.ale.getScreenGrayscale 24 | 25 | return env 26 | --------------------------------------------------------------------------------
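`gym_wrapper.make()` monkey-patches the wrapped ALE environment so observations come straight from `ale.getScreenGrayscale`, avoiding an RGB round trip. On newer gym releases a similar effect can be had with a standard `ObservationWrapper` (recent gym versions also ship a built-in grayscale wrapper). The sketch below is only an illustrative alternative under that assumption; it converts RGB frames on the Python side, which is slower than the ALE hack used here, and the class name is made up.

```python
import gym
import numpy as np

class GrayscaleObs(gym.ObservationWrapper):
    """Approximate the grayscale hack with a standard wrapper (RGB -> luma)."""
    def observation(self, observation):
        # observation is assumed to be an (H, W, 3) uint8 RGB frame
        gray = observation.astype(np.float32) @ np.array([0.299, 0.587, 0.114],
                                                         dtype=np.float32)
        return gray.astype(np.uint8)[..., None]  # keep an explicit channel axis

# env = GrayscaleObs(gym.make('BreakoutDeterministic-v4'))
```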
/log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qbx2/PAAC.pytorch/0de8a408d6ad0d05a360a8d9ee27efc9327dd1bd/log.png -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | from threading import Thread 2 | 3 | 4 | class Logger: 5 | exp = None 6 | 7 | @staticmethod 8 | def log(n, t_max, n_e, log_step, 9 | loss_p_sum, double_loss_v_sum, entropy_sum, 10 | scores, normalized_scores, **kwargs): 11 | iteration, timestep = n + 1, (n + 1) * t_max * n_e 12 | print('Iteration %d (Timestep %d)' % (iteration, timestep)) 13 | 14 | average_loss_p = loss_p_sum / log_step / t_max 15 | average_loss_v = double_loss_v_sum / 2. / log_step / t_max 16 | average_entropy = entropy_sum / log_step / t_max 17 | 18 | print('average loss_p:', average_loss_p) 19 | print('average loss_v:', average_loss_v) 20 | print('average entropy:', average_entropy) 21 | 22 | print('Episodes:', len(scores)) 23 | 24 | try: 25 | max_score = max(scores) 26 | min_score = min(scores) 27 | avg_score = sum(scores) / len(scores) 28 | 29 | print('Max_score:', max_score) 30 | print('Min_score:', min_score) 31 | print('Avg_score:', avg_score) 32 | except ValueError: 33 | pass 34 | 35 | try: 36 | max_norm_score = max(normalized_scores) 37 | min_norm_score = min(normalized_scores) 38 | avg_norm_score = sum(normalized_scores) / len(normalized_scores) 39 | 40 | print('Max_norm_score:', max_norm_score) 41 | print('Min_norm_score:', min_norm_score) 42 | print('Avg_norm_score:', avg_norm_score) 43 | except ValueError: 44 | pass 45 | 46 | print() 47 | 48 | if Logger.exp is not None: 49 | Thread(target=Logger.crayon_log, kwargs=dict(locals()), daemon=True).start() 50 | 51 | @staticmethod 52 | def init_crayon(hostname, experiment_name): 53 | try: 54 | from pycrayon import CrayonClient 55 | 56 | cc = CrayonClient(hostname) 57 | 58 | try: 59 | Logger.exp = cc.create_experiment(experiment_name) 60 | except ValueError as e: 61 | print(e) 62 | 63 | if input('Open the experiment (y/n)? ').lower() != 'y': 64 | raise 65 | 66 | Logger.exp = cc.open_experiment(experiment_name) 67 | except ImportError: 68 | print('Failed to import pycrayon. ' 69 | 'Some features of Logger will be disabled.') 70 | except ValueError as e: 71 | print(e) 72 | 73 | if input('continue (y/n)? ').lower() != 'y': 74 | raise 75 | 76 | @staticmethod 77 | def crayon_log(timestep, average_loss_p, average_loss_v, average_entropy, 78 | max_score=None, min_score=None, avg_score=None, 79 | max_norm_score=None, min_norm_score=None, 80 | avg_norm_score=None, **kwargs): 81 | import requests 82 | 83 | try: 84 | exp = Logger.exp 85 | exp.add_scalar_value("loss_p", average_loss_p, step=timestep) 86 | exp.add_scalar_value("loss_v", average_loss_v, step=timestep) 87 | exp.add_scalar_value("entropy", average_entropy, step=timestep) 88 | 89 | if max_score is not None: 90 | exp.add_scalar_value("score_max", max_score, step=timestep) 91 | 92 | if min_score is not None: 93 | exp.add_scalar_value("score_min", min_score, step=timestep) 94 | 95 | if avg_score is not None: 96 | exp.add_scalar_value("score_avg", avg_score, step=timestep) 97 | 98 | if max_norm_score is not None: 99 | exp.add_scalar_value("norm_score_max", max_norm_score, 100 | step=timestep) 101 | 102 | if min_norm_score is not None: 103 | exp.add_scalar_value("norm_score_min", min_norm_score, 104 | step=timestep) 105 | 106 | if avg_norm_score is not None: 107 | exp.add_scalar_value("norm_score_avg", avg_norm_score, 108 | step=timestep) 109 | 110 | except requests.ConnectionError as e: 111 | print(e) 112 | -------------------------------------------------------------------------------- /paac-breakout-1280014.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qbx2/PAAC.pytorch/0de8a408d6ad0d05a360a8d9ee27efc9327dd1bd/paac-breakout-1280014.pkl -------------------------------------------------------------------------------- /paac.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.autograd import Variable 3 | from PIL import Image 4 | from PIL.Image import BILINEAR 5 | from torchvision.transforms import ToTensor, ToPILImage 6 | 7 | INPUT_CHANNELS = 4 8 | INPUT_IMAGE_SIZE = (84, 84) 9 | 10 | 11 | class PAACNet(nn.Module): 12 | to_tensor = ToTensor() 13 | to_pil_image = ToPILImage() 14 | 15 | def __init__(self, num_actions): 16 | super().__init__() 17 | 18 | self.conv_layers = nn.Sequential( 19 | nn.Conv2d(INPUT_CHANNELS, 32, 8, 4), 20 | nn.ReLU(), 21 | nn.Conv2d(32, 64, 4, 2), 22 | nn.ReLU(), 23 | nn.Conv2d(64, 64, 3, 1), 24 | nn.ReLU() 25 | ) 26 | 27 | self.fc = nn.Linear(3136, 512) 28 | 29 | self.policy_output = nn.Sequential( 30 | nn.Linear(512, num_actions), 31 | nn.Softmax(1) 32 | ) 33 | 34 | self.value_output = nn.Linear(512, 1) 35 | 36 | # init weights and biases 37 | import torch.nn.init as init 38 | 39 | for m in self.modules(): 40 | if isinstance(m, nn.Conv2d): 41 | init.kaiming_normal(m.weight) 42 | m.bias.data.zero_() 43 | elif isinstance(m, nn.Linear): 44 | init.kaiming_normal(m.weight) 45 | m.bias.data.zero_() 46 | 47 | @classmethod 48 | def preprocess(cls, x): 49 | r"""preprocesses & converts the output of the gym environment 50 | 51 | :param x: grayscale array with shape (210, 160, 1) 52 | :return: preprocessed & converted tensor 53 | """ 54 | 55 | # TODO : support flickering games by picking max pixels 56 | x = Image.fromarray(x.squeeze(), 'L') 57 | x = x.resize(INPUT_IMAGE_SIZE, resample=BILINEAR) 58 | return cls.to_tensor(x) 59 | 60 | def forward(self, x): 61 | r"""calculates PAAC outputs 62 | 63 | :param x: preprocessed states with shape (N, C, H, W) 64 | :return: tuple (policy_output, value_output) 65 | """ 66 | x = self.conv_layers(x) 67 | # flatten 68 | x = x.view(x.size(0), -1) 69 | x = self.fc(x) 70 | return self.policy_output(x), self.value_output(x) 71 | 72 | def policy(self, x): 73 | x = self.conv_layers(x) 74 | x = x.view(x.size(0), -1) 75 | x = self.fc(x) 76 | return self.policy_output(x) 77 | 78 | def value(self, x): 79 | x = self.conv_layers(x) 80 | x = x.view(x.size(0), -1) 81 | x = self.fc(x) 82 | return self.value_output(x) 83 | 84 | @staticmethod 85 | def entropy(x, epsilon=0): 86 | r"""calculates entropy 87 | 88 | :param x: policy_output with shape (N, L) where L is NUM_ACTIONS 89 | :param epsilon: epsilon for numerical stability 90 | :return: entropy 91 | """ 92 | return -(x * (x + epsilon).log()).sum(0).sum(0) 93 | 94 | @staticmethod 95 | def log_and_negated_entropy(x, epsilon): 96 | log_x = (x + epsilon).log() 97 | return log_x, (x * log_x).sum(0).sum(0) 98 | 99 | @staticmethod 100 | def get_loss(q_values, values, log_a): 101 | r"""calculates policy loss and value loss 102 | 103 | :param q_values: Tensor with shape (T, N) 104 | :param values: Variable with shape (T, N) 105 | :param log_a: Variable with shape (T, N) 106 | :return: tuple (loss_p, double_loss_v, loss) 107 | """ 108 | diff = Variable(q_values) - values 109 | 110 | # policy loss 111 | loss_p = -(Variable(diff.data) * log_a).mean(0) 112 | # value loss 113 | # 2 * nn.MSELoss 114 | double_loss_v = diff.pow(2).mean(0) 115 | loss = loss_p + 0.25 * double_loss_v 116 | return loss_p, double_loss_v, loss 117 | --------------------------------------------------------------------------------
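A quick way to see where the 3136 in `self.fc = nn.Linear(3136, 512)` comes from: with 84x84 inputs and no padding, the three convolutions (kernel 8 stride 4, kernel 4 stride 2, kernel 3 stride 1) shrink the spatial size 84 -> 20 -> 9 -> 7, and 64 channels * 7 * 7 = 3136. The snippet below is a standalone sanity check, not repository code; it rebuilds just the conv stack and assumes a current PyTorch.

```python
import torch
import torch.nn as nn

# Rebuild only the conv stack from paac.py to check the flattened feature size.
conv = nn.Sequential(
    nn.Conv2d(4, 32, 8, 4), nn.ReLU(),
    nn.Conv2d(32, 64, 4, 2), nn.ReLU(),
    nn.Conv2d(64, 64, 3, 1), nn.ReLU(),
)
x = torch.zeros(2, 4, 84, 84)          # (N, C, H, W): a batch of stacked frames
feat = conv(x)
print(feat.shape)                      # torch.Size([2, 64, 7, 7])
print(feat.view(2, -1).shape)          # torch.Size([2, 3136]) -> nn.Linear(3136, 512)
```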
/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | 4 | import gym 5 | import torch 6 | import torch.nn 7 | from torch.autograd import Variable 8 | 9 | from paac import PAACNet, INPUT_CHANNELS, INPUT_IMAGE_SIZE 10 | from worker import Worker 11 | from logger import Logger 12 | 13 | 14 | class Master: 15 | def __init__(self, args): 16 | self.args = args 17 | 18 | print('Loading environment information ...') 19 | env = gym.make(args.env) 20 | 21 | self.num_actions = env.action_space.n 22 | print('num_actions:', self.num_actions) 23 | 24 | self.action_meanings = env.env.get_action_meanings() 25 | print('action_meanings:', self.action_meanings) 26 | 27 | self.no_op = None 28 | 29 | for i, v in enumerate(self.action_meanings): 30 | if v.upper() == 'NOOP': 31 | self.no_op = i 32 | print('Using action %d as NO-OP' % i) 33 | 34 | if self.no_op is None: 35 | self.no_op = 0 36 | print('NO-OP not found, using action 0') 37 | 38 | del env 39 | 40 | # create PAAC model 41 | self.paac = PAACNet(self.num_actions) 42 | 43 | if args.cuda: 44 | self.paac.cuda() 45 | 46 | if args.use_rmsprop: 47 | self.optim = torch.optim.RMSprop( 48 | self.paac.parameters(), args.learning_rate, args.alpha, 49 | args.rmsprop_epsilon 50 | ) 51 | else: 52 | self.optim = torch.optim.Adam( 53 | self.paac.parameters(), args.learning_rate, 54 | (args.beta1, args.beta2), args.adam_epsilon 55 | ) 56 | 57 | self.workers = [Worker(i, args) for i in range(args.n_w)] 58 | self.start = 0 59 | self.range_iter = None 60 | 61 | def __enter__(self): 62 | return self 63 | 64 | def __exit__(self, *exc_details): 65 | for worker in self.workers: 66 | worker.exit_event.set() 67 | worker.set_action_done() 68 
| worker.join() 69 | 70 | @staticmethod 71 | def get_starting_point(): 72 | return random.randint(args.min_starting_point, args.max_starting_point) 73 | 74 | def train(self): 75 | optim = self.optim 76 | workers = self.workers 77 | args = self.args 78 | model_params = self.paac.parameters() 79 | ( 80 | filename, cuda, n_e, t_max, n_max, 81 | gamma, beta, log_step, save_step, 82 | epsilon, clip 83 | ) = ( 84 | args.filename, args.cuda, args.n_e, args.t_max, args.n_max, 85 | args.gamma, args.beta, args.log_step, args.save_step, 86 | args.epsilon, args.clip 87 | ) 88 | log_step_1 = (self.start - 1) % log_step 89 | save_step_1 = (self.start - 1) % save_step 90 | del args 91 | 92 | # gpu (if possible) variables, will be wrapped by Variable later 93 | # policies = Variable(torch.zeros(t_max, n_e)) # unused at the moment 94 | values = torch.zeros(t_max, n_e) 95 | log_a = torch.zeros(t_max, n_e) 96 | negated_entropy_sum = torch.zeros(1) 97 | 98 | # gpu tensors 99 | # tensor to store states, updated at every timestep 100 | states = torch.zeros(n_e, INPUT_CHANNELS, *INPUT_IMAGE_SIZE) 101 | _states = torch.zeros(t_max, n_e, INPUT_CHANNELS, *INPUT_IMAGE_SIZE) 102 | q_values = torch.zeros(t_max + 1, n_e) 103 | 104 | # cpu tensors 105 | # tensors to store data for a backprop 106 | _actions = torch.zeros(n_e).long().share_memory_() 107 | obs = torch.zeros(n_e, *INPUT_IMAGE_SIZE).share_memory_() 108 | rewards = torch.zeros(t_max, n_e).share_memory_() 109 | terminals = torch.zeros(t_max, n_e).share_memory_() 110 | 111 | # accumulated rewards to calculate score 112 | rewards_accumulated = torch.zeros(n_e) 113 | normalized_rewards_accumulated = torch.zeros(n_e) 114 | 115 | # if current_frames < starting_points: action = no-op 116 | # else: action = policy() 117 | starting_points = [self.get_starting_point() for _ in range(n_e)] 118 | current_frames = [1] * n_e 119 | # list to store scores of episodes, 120 | # printed & flushed at every log_step 121 | scores = [] 122 | normalized_scores = [] 123 | # sum of loss_p & double_loss_v, printed & flushed at every log_step 124 | loss_p_sum = double_loss_v_sum = entropy_sum = 0 125 | 126 | if cuda: 127 | # policies = policies.pin_memory().cuda(async=True) 128 | values = values.cuda() 129 | log_a = log_a.cuda() 130 | negated_entropy_sum = negated_entropy_sum.cuda() 131 | 132 | states = states.cuda() 133 | _states = _states.cuda() 134 | q_values = q_values.cuda() 135 | 136 | # wrap variables 137 | # policies = Variable(policies) 138 | values = Variable(values) 139 | log_a = Variable(log_a) 140 | negated_entropy_sum = Variable(negated_entropy_sum) 141 | 142 | # start training 143 | self.paac.train() 144 | 145 | # send states 146 | for worker in workers: 147 | worker.put_shared_tensors(_actions, obs, rewards, terminals) 148 | worker.wait_step_done() 149 | 150 | self.range_iter = iter(range(self.start, n_max)) 151 | 152 | for n in self.range_iter: 153 | # policies = Variable(policies.data) 154 | values = Variable(values.data) 155 | log_a = Variable(log_a.data) 156 | negated_entropy_sum = Variable(negated_entropy_sum.data) 157 | 158 | negated_entropy_sum.data.zero_() 159 | 160 | for t in range(t_max): 161 | # yes, check terminals[-1] when t = 0 162 | nonzero_terminals = terminals[t - 1].nonzero() 163 | 164 | if len(nonzero_terminals.size()): 165 | for i in nonzero_terminals.squeeze(1): 166 | # reset done environments 167 | starting_points[i] = self.get_starting_point() 168 | current_frames[i] = 1 169 | 170 | scores.append(rewards_accumulated[i]) 171 | 
normalized_scores.append( 172 | normalized_rewards_accumulated[i]) 173 | 174 | rewards_accumulated[i] = 0 175 | normalized_rewards_accumulated[i] = 0 176 | 177 | states[i].zero_() 178 | 179 | # states must be cloned for gradient calculation 180 | _states[t].copy_(states) 181 | paac_p, paac_v = self.paac(Variable(_states[t])) 182 | # paac_p_max_values, paac_p_max_indices = paac_p.max(1) 183 | values[t] = paac_v 184 | 185 | log_paac_p, negated_h = self.paac.log_and_negated_entropy( 186 | paac_p, epsilon) 187 | negated_entropy_sum += negated_h 188 | 189 | actions = paac_p.multinomial().data 190 | 191 | # process no-op environments 192 | for i in range(n_e): 193 | if current_frames[i] < starting_points[i]: 194 | current_frames[i] += 1 195 | # policies[t, i] = paac_p[i, self.NOOP] 196 | actions[i, 0] = self.no_op 197 | 198 | log_a[t] = log_paac_p.gather(1, Variable(actions.clone())) 199 | 200 | # perform actions 201 | _actions.copy_(actions.squeeze(1)) 202 | 203 | for worker in workers: 204 | worker.set_action_done() 205 | 206 | # get new observations 207 | for worker in workers: 208 | worker.wait_step_done() 209 | 210 | states[:, :-1], states[:, -1] = states[:, 1:], obs 211 | rewards_accumulated += rewards[t] 212 | # normalize rewards 213 | rewards[t].clamp_(-1, 1) 214 | normalized_rewards_accumulated += rewards[t] 215 | 216 | entropy = -negated_entropy_sum / n_e 217 | entropy_sum += entropy.data[0] 218 | 219 | # values of new states 220 | q_values[t_max] = self.paac.value(Variable(states)).data 221 | 222 | loss_sum = 0 223 | 224 | if cuda: 225 | _rewards = rewards.cuda() 226 | _terminals = terminals.cuda() 227 | else: 228 | _rewards = rewards 229 | _terminals = terminals 230 | 231 | # calculate q_values 232 | for t in reversed(range(t_max)): 233 | q_values[t] = _rewards[t] + \ 234 | (1. 
- _terminals[t]) * gamma * q_values[t + 1] 235 | 236 | loss_p, double_loss_v, loss = self.paac.get_loss( 237 | q_values[t], values[t], log_a[t] 238 | ) 239 | 240 | loss_sum += loss 241 | loss_p_sum += loss_p 242 | double_loss_v_sum += double_loss_v 243 | 244 | # entropy term 245 | loss_sum -= beta * entropy 246 | 247 | optim.zero_grad() 248 | # loss scaling by t_max 249 | loss_sum.backward() 250 | torch.nn.utils.clip_grad_norm(model_params, clip) 251 | optim.step() 252 | 253 | if n % log_step == log_step_1: 254 | loss_p_sum = loss_p_sum.data[0] 255 | double_loss_v_sum = double_loss_v_sum.data[0] 256 | Logger.log(**locals()) 257 | 258 | # flush 259 | loss_p_sum = double_loss_v_sum = entropy_sum = 0 260 | scores.clear() 261 | normalized_scores.clear() 262 | 263 | if n % save_step == save_step_1: 264 | self.save(filename, n + 1) 265 | 266 | def load(self, filename): 267 | checkpoint = torch.load(filename) 268 | self.start = checkpoint['iteration'] 269 | self.paac.load_state_dict(checkpoint['paac']) 270 | self.optim.load_state_dict(checkpoint['optimizer']) 271 | print('Loaded PAAC checkpoint (%d) from' % self.start, filename) 272 | 273 | def save(self, filename, iteration=0): 274 | checkpoint = { 275 | 'iteration': iteration, 276 | 'paac': self.paac.state_dict(), 277 | 'optimizer': self.optim.state_dict() 278 | } 279 | 280 | torch.save(checkpoint, filename) 281 | print('Saved PAAC checkpoint (%d) into' % iteration, filename) 282 | 283 | 284 | def get_args(): 285 | parser = argparse.ArgumentParser( 286 | description='Train a PAAC model.', 287 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 288 | 289 | parser.add_argument('--env', type=str, default='Pong-v0') 290 | parser.add_argument('-f', '--filename', type=str, default='paac.pkl', 291 | help='filename to save the trained model into.') 292 | parser.add_argument('--no-cuda', action='store_true') 293 | parser.add_argument('-l', '--log-step', type=int, default=100) 294 | parser.add_argument('-s', '--save-step', type=int, default=1000) 295 | # WARNING: you should check if the agent can control the environment 296 | # in the starting point range (e. g. 
The agent cannot control 297 | # until 35th frame in SpaceInvadersDeterministic-v4) 298 | parser.add_argument('--min-starting-point', type=int, default=1) 299 | parser.add_argument('--max-starting-point', type=int, default=30) 300 | # crayon experiment name 301 | parser.add_argument('--crayon-host', type=str, default='localhost') 302 | parser.add_argument('--experiment-name', type=str, default='paac') 303 | 304 | # PAAC parameters 305 | parser.add_argument('-w', '--n_w', '--workers', type=int, 306 | default=8, metavar='N_W', 307 | help='Number of workers') 308 | parser.add_argument('-e', '--n_e', '--environments', type=int, 309 | default=32, metavar='N_E', 310 | help='Number of environments') 311 | parser.add_argument('-t', '--t-max', type=int, default=5, metavar='T_MAX', 312 | help='Max local steps') 313 | 314 | parser.add_argument('-n', '--n-max', type=int, default=int(1.15e8), 315 | metavar='N_MAX', 316 | help='Max global steps') 317 | parser.add_argument('-g', '--gamma', type=float, default=0.99) 318 | 319 | # Optimizer parameters 320 | parser.add_argument('--lr', '--learning-rate', type=float, default=0.00224, 321 | dest='learning_rate', help='Learning rate') 322 | parser.add_argument('--use-adam', dest='use_rmsprop', action='store_false') 323 | 324 | # RMSProp parameters 325 | parser.add_argument('--alpha', type=float, default=0.99, 326 | help='Alpha for the RMSProp optimizer') 327 | parser.add_argument('--rmsprop-epsilon', type=float, default=0.1, 328 | help='Epsilon for the RMSProp optimizer') 329 | 330 | # Adam parameters 331 | parser.add_argument('--beta1', type=float, default=0.9, 332 | help='Beta1 for the Adam optimizer') 333 | parser.add_argument('--beta2', type=float, default=0.999, 334 | help='Beta2 for the Adam optimizer') 335 | parser.add_argument('--adam-epsilon', type=float, default=1e-8) 336 | 337 | # Other parameters 338 | parser.add_argument('-b', '--beta', type=float, default=0.01, 339 | help='Strength of entropy regularization term') 340 | parser.add_argument('-E', '--epsilon', type=float, default=1e-30, 341 | help='Epsilon for numerical stability') 342 | parser.add_argument('-C', '--clip', type=float, default=40.0) 343 | 344 | args = parser.parse_args() 345 | args.cuda = torch.cuda.is_available() and not args.no_cuda 346 | return args 347 | 348 | 349 | if __name__ == '__main__': 350 | args = get_args() 351 | print(args) 352 | 353 | Logger.init_crayon(args.crayon_host, args.experiment_name) 354 | 355 | with Master(args) as master: 356 | try: 357 | master.load(args.filename) 358 | except FileNotFoundError as e: 359 | print(e) 360 | 361 | try: 362 | master.train() 363 | finally: 364 | try: 365 | n = next(master.range_iter) - 1 366 | except TypeError: 367 | n = master.start 368 | except StopIteration: 369 | n = args.n_max 370 | 371 | master.save(args.filename, n) 372 | -------------------------------------------------------------------------------- /worker.py: -------------------------------------------------------------------------------- 1 | from torch.multiprocessing import Process, Pipe, Event 2 | 3 | import gym_wrapper as gym 4 | from paac import PAACNet 5 | 6 | 7 | class Worker(Process): 8 | def __init__(self, worker_id, args): 9 | super().__init__() 10 | 11 | self.id = worker_id 12 | self.args = args 13 | # for master use, for worker use 14 | self.pipe_master, self.pipe_worker = Pipe() 15 | self.exit_event = Event() 16 | 17 | # determine n_e 18 | q, r = divmod(args.n_e, args.n_w) 19 | 20 | if r: 21 | print('Warning: n_e % n_w != 0') 22 | 23 | if worker_id == 
args.n_w - 1: 24 | self.n_e = n_e = q + r 25 | else: 26 | self.n_e = n_e = q 27 | 28 | print('Worker', self.id, '] n_e = %d' % n_e) 29 | 30 | self.env_start = worker_id * q 31 | self.env_slice = slice(self.env_start, self.env_start + n_e) 32 | self.env_range = range(self.env_start, self.env_start + n_e) 33 | self.envs = None 34 | 35 | self.start() 36 | 37 | def make_environments(self): 38 | envs = [] 39 | 40 | for _ in range(self.n_e): 41 | envs.append(gym.make(self.args.env, hack='train')) 42 | 43 | return envs 44 | 45 | def put_shared_tensors(self, actions, obs, rewards, terminals): 46 | assert (actions.is_shared() and obs.is_shared() and 47 | rewards.is_shared() and terminals.is_shared()) 48 | 49 | self.pipe_master.send((actions, obs, rewards, terminals)) 50 | 51 | def get_shared_tensors(self): 52 | actions, obs, rewards, terminals = self.pipe_worker.recv() 53 | assert (actions.is_shared() and obs.is_shared() and 54 | rewards.is_shared() and terminals.is_shared()) 55 | return actions, obs, rewards, terminals 56 | 57 | def set_step_done(self): 58 | self.pipe_worker.send_bytes(b'1') 59 | 60 | def wait_step_done(self): 61 | self.pipe_master.recv_bytes(1) 62 | 63 | def set_action_done(self): 64 | self.pipe_master.send_bytes(b'1') 65 | 66 | def wait_action_done(self): 67 | self.pipe_worker.recv_bytes(1) 68 | 69 | def run(self): 70 | preprocess = PAACNet.preprocess 71 | 72 | envs = self.envs = self.make_environments() 73 | env_start = self.env_start 74 | t_max = self.args.t_max 75 | t = 0 76 | dones = [False] * self.args.n_e 77 | 78 | # get shared tensor 79 | actions, obs, rewards, terminals = self.get_shared_tensors() 80 | 81 | for i, env in enumerate(envs, start=env_start): 82 | obs[i] = preprocess(env.reset()) 83 | 84 | self.set_step_done() 85 | 86 | while not self.exit_event.is_set(): 87 | self.wait_action_done() 88 | 89 | for i, env in enumerate(envs, start=env_start): 90 | if not dones[i]: 91 | ob, reward, done, info = env.step(actions[i]) 92 | else: 93 | ob, reward, done, info = env.reset(), 0, False, None 94 | 95 | obs[i] = preprocess(ob) 96 | rewards[t, i] = reward 97 | terminals[t, i] = dones[i] = done 98 | 99 | self.set_step_done() 100 | 101 | t += 1 102 | 103 | if t == t_max: 104 | t = 0 105 | --------------------------------------------------------------------------------
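For reference, `Worker.__init__` splits the `n_e` environments across `n_w` worker processes with a plain divmod: every worker owns `q = n_e // n_w` environments starting at `worker_id * q`, and the last worker also absorbs the remainder. The helper below is a small standalone illustration of that sharding; `env_shards` is a made-up name, and only the 32/8 case matches the defaults in train.py.

```python
def env_shards(n_e, n_w):
    """Mirror Worker.__init__: worker i owns environments [i*q, i*q + n_e_i)."""
    q, r = divmod(n_e, n_w)
    shards = []
    for worker_id in range(n_w):
        n = q + r if worker_id == n_w - 1 else q   # last worker takes the remainder
        start = worker_id * q
        shards.append(range(start, start + n))
    return shards

print(env_shards(32, 8))   # the repo defaults: 8 workers x 4 environments each
print(env_shards(34, 8))   # uneven split: the last worker gets 6 environments
```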