├── .gitignore ├── LICENSE ├── README.md ├── algo ├── __init__.py ├── a2c_acktr.py ├── kfac.py ├── ppo.py └── sil.py ├── arguments.py ├── distributions.py ├── enjoy.py ├── envs.py ├── imgs ├── a2c_beamrider.png ├── a2c_breakout.png ├── a2c_qbert.png ├── a2c_seaquest.png ├── acktr_beamrider.png ├── acktr_breakout.png ├── acktr_qbert.png ├── acktr_seaquest.png ├── ppo_halfcheetah.png ├── ppo_hopper.png ├── ppo_reacher.png └── ppo_walker.png ├── main.py ├── model.py ├── monitor.py ├── my_prosthetics_env.py ├── replay_storage.py ├── requirements.txt ├── rollout_storage.py ├── submit.py ├── utils.py └── visualize.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | trained_models/ 104 | .fuse_hidden* 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Ilya Kostrikov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Reinforcement Learning for OpenSim Environments 2 | 3 | This is my code for experimenting with the CrowdAI Prosthetics Challenge (https://www.crowdai.org/challenges/nips-2018-ai-for-prosthetics-challenge). 4 | 5 | The reinforcement learning codebase is based upon Ilya Kostrikov's awesome work (https://github.com/ikostrikov/pytorch-a2c-ppo-acktr). 6 | 7 | As this is part of my learning process for continuous control with deep reinforcement learning, there are likely to be some issues. 8 | 9 | All experiments were performed with PPO, or PPO with self-imitation learning (SIL), using 16 vectorized environments running in parallel. Keep in mind that the simulator is VERY slow, so expect to wait a long time (days) for decent results -- even if you happen to have a kick-ass machine. 10 | 11 | Added: 12 | * support for the OpenSim Gym-like environments with Ilya's RL codebase 13 | * custom 'MyProstheticsEnv' wrapper to allow easier experimentation with different observation projections, rewards, and other aspects 14 | * frame skipping support in the custom env 15 | * a beta distribution experiment for continuous control in the range [0, 1] (http://ri.cmu.edu/wp-content/uploads/2017/06/thesis-Chou.pdf) 16 | * tweaks to logging/folders/checkpoints and model resume for easier experimentation and tracking of results 17 | * an implementation of SIL (https://arxiv.org/abs/1806.05635), one variant of off-policy replay combined with on-policy methods (see the sketch after this list). It speeds up initial training but starts to falter later; further experiments with the loss weight and the decay of the other SIL parameters are needed.
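For reference, here is a minimal, self-contained sketch of the self-imitation losses as they are computed in `algo/sil.py` (the helper name and standalone form are mine for illustration; the actual update additionally applies prioritized-replay importance weights, an entropy bonus, and an overall loss weight):

```python
import torch


def sil_losses(action_log_probs, values, returns):
    """Sketch of the self-imitation losses from algo/sil.py.

    Only transitions whose stored discounted return R exceeds the current
    value estimate V(s) contribute, via the clipped advantage max(R - V, 0).
    """
    clipped_adv = torch.clamp(returns - values, min=0.0)
    # policy term: increase log-prob only where the replayed return beat V(s)
    action_loss = -action_log_probs * clipped_adv.detach()
    # value term: pull V(s) up toward R, never down (advantage is clipped at 0)
    value_loss = 0.5 * clipped_adv.pow(2)
    # average over the samples that actually contributed (avg_loss_by_valid_samples)
    num_valid = torch.clamp((clipped_adv > 0).float().sum(), min=1.0)
    return action_loss.sum() / num_valid, value_loss.sum() / num_valid
```

In `algo/sil.py` the two terms are then combined as `value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef`, scaled by `loss_weight`, and the clipped advantages are written back to the replay buffer as priorities.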
18 | 19 | 20 | ## Get Started 21 | 22 | Set up your environment as per https://github.com/stanfordnmbl/osim-rl#getting-started 23 | 24 | ## Give It a Go 25 | 26 | Unclipped actions -- trains much faster, but it's not clear what OpenSim does with the out-of-range values: 27 | `main.py --algo ppo --env-name osim.Prosthetics --lr 7e-4 --num-steps 1000 --use-gae --ppo-epoch 10` 28 | 29 | With actions clipped to [0, 1] and shifted so the mean is at 0.5: 30 | 31 | `main.py --algo ppo --env-name osim.Prosthetics --lr 1e-3 --num-steps 1000 --use-gae --ppo-epoch 10 --clip-action --shift-action` 32 | 33 | With a beta distribution over [0, 1]: 34 | 35 | `main.py --algo ppo --env-name osim.Prosthetics --lr 1e-3 --num-steps 1000 --use-gae --ppo-epoch 10 --beta-dist` 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /algo/__init__.py: -------------------------------------------------------------------------------- 1 | from .a2c_acktr import A2C_ACKTR 2 | from .ppo import PPO 3 | from .sil import SIL -------------------------------------------------------------------------------- /algo/a2c_acktr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | 5 | from .kfac import KFACOptimizer 6 | 7 | 8 | class A2C_ACKTR(): 9 | def __init__(self, 10 | actor_critic, 11 | value_loss_coef, 12 | entropy_coef, 13 | lr=None, 14 | lr_schedule=None, 15 | eps=None, 16 | alpha=None, 17 | max_grad_norm=None, 18 | acktr=False): 19 | 20 | self.actor_critic = actor_critic 21 | self.acktr = acktr 22 | 23 | self.value_loss_coef = value_loss_coef 24 | self.entropy_coef = entropy_coef 25 | 26 | self.max_grad_norm = max_grad_norm 27 | 28 | if acktr: 29 | self.optimizer = KFACOptimizer(actor_critic) 30 | self.scheduler = None 31 | else: 32 | self.optimizer = optim.RMSprop( 33 | actor_critic.parameters(), lr, eps=eps, alpha=alpha) 34 | if lr_schedule is not None: 35 | self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, lr_schedule) 36 | else: 37 | self.scheduler = None 38 | 39 | def update(self, rollouts, update_index, _replay=None): 40 | if self.scheduler is not None: 41 | self.scheduler.step(update_index) 42 | 43 | obs_shape = rollouts.obs.size()[2:] 44 | action_shape = rollouts.actions.size()[-1] 45 | num_steps, num_processes, _ = rollouts.rewards.size() 46 | 47 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 48 | rollouts.obs[:-1].view(-1, *obs_shape), 49 | rollouts.recurrent_hidden_states[0].view(-1, self.actor_critic.recurrent_hidden_state_size), 50 | rollouts.masks[:-1].view(-1, 1), 51 | rollouts.actions.view(-1, action_shape)) 52 | 53 | values = values.view(num_steps, num_processes, 1) 54 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 55 | 56 | advantages = rollouts.returns[:-1] - values 57 | value_loss = advantages.pow(2).mean() 58 | 59 | action_loss = -(advantages.detach() * action_log_probs).mean() 60 | 61 | if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0: 62 | # Sampled fisher, see Martens 2014 63 | self.actor_critic.zero_grad() 64 | pg_fisher_loss = -action_log_probs.mean() 65 | 66 | value_noise = torch.randn(values.size()) 67 | if values.is_cuda: 68 | value_noise = value_noise.cuda() 69 | 70 | sample_values = values + value_noise 71 | vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean() 72 | 73 | fisher_loss = pg_fisher_loss + vf_fisher_loss 74 | self.optimizer.acc_stats = True 75 | 
fisher_loss.backward(retain_graph=True) 76 | self.optimizer.acc_stats = False 77 | 78 | self.optimizer.zero_grad() 79 | (value_loss * self.value_loss_coef + action_loss - 80 | dist_entropy * self.entropy_coef).backward() 81 | 82 | if self.acktr == False: 83 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 84 | self.max_grad_norm) 85 | 86 | self.optimizer.step() 87 | 88 | return value_loss.item(), action_loss.item(), dist_entropy.item() 89 | -------------------------------------------------------------------------------- /algo/kfac.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | 8 | from utils import AddBias 9 | 10 | # TODO: In order to make this code faster: 11 | # 1) Implement _extract_patches as a single cuda kernel 12 | # 2) Compute QR decomposition in a separate process 13 | # 3) Actually make a general KFAC optimizer so it fits PyTorch 14 | 15 | 16 | def _extract_patches(x, kernel_size, stride, padding): 17 | if padding[0] + padding[1] > 0: 18 | x = F.pad(x, (padding[1], padding[1], padding[0], 19 | padding[0])).data # Actually check dims 20 | x = x.unfold(2, kernel_size[0], stride[0]) 21 | x = x.unfold(3, kernel_size[1], stride[1]) 22 | x = x.transpose_(1, 2).transpose_(2, 3).contiguous() 23 | x = x.view( 24 | x.size(0), x.size(1), x.size(2), 25 | x.size(3) * x.size(4) * x.size(5)) 26 | return x 27 | 28 | 29 | def compute_cov_a(a, classname, layer_info, fast_cnn): 30 | batch_size = a.size(0) 31 | 32 | if classname == 'Conv2d': 33 | if fast_cnn: 34 | a = _extract_patches(a, *layer_info) 35 | a = a.view(a.size(0), -1, a.size(-1)) 36 | a = a.mean(1) 37 | else: 38 | a = _extract_patches(a, *layer_info) 39 | a = a.view(-1, a.size(-1)).div_(a.size(1)).div_(a.size(2)) 40 | elif classname == 'AddBias': 41 | is_cuda = a.is_cuda 42 | a = torch.ones(a.size(0), 1) 43 | if is_cuda: 44 | a = a.cuda() 45 | 46 | return a.t() @ (a / batch_size) 47 | 48 | 49 | def compute_cov_g(g, classname, layer_info, fast_cnn): 50 | batch_size = g.size(0) 51 | 52 | if classname == 'Conv2d': 53 | if fast_cnn: 54 | g = g.view(g.size(0), g.size(1), -1) 55 | g = g.sum(-1) 56 | else: 57 | g = g.transpose(1, 2).transpose(2, 3).contiguous() 58 | g = g.view(-1, g.size(-1)).mul_(g.size(1)).mul_(g.size(2)) 59 | elif classname == 'AddBias': 60 | g = g.view(g.size(0), g.size(1), -1) 61 | g = g.sum(-1) 62 | 63 | g_ = g * batch_size 64 | return g_.t() @ (g_ / g.size(0)) 65 | 66 | 67 | def update_running_stat(aa, m_aa, momentum): 68 | # Do the trick to keep aa unchanged and not create any additional tensors 69 | m_aa *= momentum / (1 - momentum) 70 | m_aa += aa 71 | m_aa *= (1 - momentum) 72 | 73 | 74 | class SplitBias(nn.Module): 75 | def __init__(self, module): 76 | super(SplitBias, self).__init__() 77 | self.module = module 78 | self.add_bias = AddBias(module.bias.data) 79 | self.module.bias = None 80 | 81 | def forward(self, input): 82 | x = self.module(input) 83 | x = self.add_bias(x) 84 | return x 85 | 86 | 87 | class KFACOptimizer(optim.Optimizer): 88 | def __init__(self, 89 | model, 90 | lr=0.25, 91 | momentum=0.9, 92 | stat_decay=0.99, 93 | kl_clip=0.001, 94 | damping=1e-2, 95 | weight_decay=0, 96 | fast_cnn=False, 97 | Ts=1, 98 | Tf=10): 99 | defaults = dict() 100 | 101 | def split_bias(module): 102 | for mname, child in module.named_children(): 103 | if hasattr(child, 'bias') and child.bias is not None: 104 | module._modules[mname] = 
SplitBias(child) 105 | else: 106 | split_bias(child) 107 | 108 | split_bias(model) 109 | 110 | super(KFACOptimizer, self).__init__(model.parameters(), defaults) 111 | 112 | self.known_modules = {'Linear', 'Conv2d', 'AddBias'} 113 | 114 | self.modules = [] 115 | self.grad_outputs = {} 116 | 117 | self.model = model 118 | self._prepare_model() 119 | 120 | self.steps = 0 121 | 122 | self.m_aa, self.m_gg = {}, {} 123 | self.Q_a, self.Q_g = {}, {} 124 | self.d_a, self.d_g = {}, {} 125 | 126 | self.momentum = momentum 127 | self.stat_decay = stat_decay 128 | 129 | self.lr = lr 130 | self.kl_clip = kl_clip 131 | self.damping = damping 132 | self.weight_decay = weight_decay 133 | 134 | self.fast_cnn = fast_cnn 135 | 136 | self.Ts = Ts 137 | self.Tf = Tf 138 | 139 | self.optim = optim.SGD( 140 | model.parameters(), 141 | lr=self.lr * (1 - self.momentum), 142 | momentum=self.momentum) 143 | 144 | def _save_input(self, module, input): 145 | if torch.is_grad_enabled() and self.steps % self.Ts == 0: 146 | classname = module.__class__.__name__ 147 | layer_info = None 148 | if classname == 'Conv2d': 149 | layer_info = (module.kernel_size, module.stride, 150 | module.padding) 151 | 152 | aa = compute_cov_a(input[0].data, classname, layer_info, 153 | self.fast_cnn) 154 | 155 | # Initialize buffers 156 | if self.steps == 0: 157 | self.m_aa[module] = aa.clone() 158 | 159 | update_running_stat(aa, self.m_aa[module], self.stat_decay) 160 | 161 | def _save_grad_output(self, module, grad_input, grad_output): 162 | if self.acc_stats: 163 | classname = module.__class__.__name__ 164 | layer_info = None 165 | if classname == 'Conv2d': 166 | layer_info = (module.kernel_size, module.stride, 167 | module.padding) 168 | 169 | gg = compute_cov_g(grad_output[0].data, classname, layer_info, 170 | self.fast_cnn) 171 | 172 | # Initialize buffers 173 | if self.steps == 0: 174 | self.m_gg[module] = gg.clone() 175 | 176 | update_running_stat(gg, self.m_gg[module], self.stat_decay) 177 | 178 | def _prepare_model(self): 179 | for module in self.model.modules(): 180 | classname = module.__class__.__name__ 181 | if classname in self.known_modules: 182 | assert not ((classname in ['Linear', 'Conv2d']) and module.bias is not None), \ 183 | "You must have a bias as a separate layer" 184 | 185 | self.modules.append(module) 186 | module.register_forward_pre_hook(self._save_input) 187 | module.register_backward_hook(self._save_grad_output) 188 | 189 | def step(self): 190 | # Add weight decay 191 | if self.weight_decay > 0: 192 | for p in self.model.parameters(): 193 | p.grad.data.add_(self.weight_decay, p.data) 194 | 195 | updates = {} 196 | for i, m in enumerate(self.modules): 197 | assert len(list(m.parameters()) 198 | ) == 1, "Can handle only one parameter at the moment" 199 | classname = m.__class__.__name__ 200 | p = next(m.parameters()) 201 | 202 | la = self.damping + self.weight_decay 203 | 204 | if self.steps % self.Tf == 0: 205 | # My asynchronous implementation exists, I will add it later. 206 | # Experimenting with different ways to this in PyTorch. 
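# Every Tf steps, re-eigendecompose the running Kronecker factors m_aa (input
# covariance) and m_gg (backprop-gradient covariance). The block below then divides
# the gradient by the damped eigenvalue products d_g_i * d_a_j + la in that joint
# eigenbasis, i.e. it applies the inverse of the damped Kronecker-factored
# curvature estimate (small eigenvalues are truncated to zero first).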
207 | self.d_a[m], self.Q_a[m] = torch.symeig( 208 | self.m_aa[m], eigenvectors=True) 209 | self.d_g[m], self.Q_g[m] = torch.symeig( 210 | self.m_gg[m], eigenvectors=True) 211 | 212 | self.d_a[m].mul_((self.d_a[m] > 1e-6).float()) 213 | self.d_g[m].mul_((self.d_g[m] > 1e-6).float()) 214 | 215 | if classname == 'Conv2d': 216 | p_grad_mat = p.grad.data.view(p.grad.data.size(0), -1) 217 | else: 218 | p_grad_mat = p.grad.data 219 | 220 | v1 = self.Q_g[m].t() @ p_grad_mat @ self.Q_a[m] 221 | v2 = v1 / ( 222 | self.d_g[m].unsqueeze(1) * self.d_a[m].unsqueeze(0) + la) 223 | v = self.Q_g[m] @ v2 @ self.Q_a[m].t() 224 | 225 | v = v.view(p.grad.data.size()) 226 | updates[p] = v 227 | 228 | vg_sum = 0 229 | for p in self.model.parameters(): 230 | v = updates[p] 231 | vg_sum += (v * p.grad.data * self.lr * self.lr).sum() 232 | 233 | nu = min(1, math.sqrt(self.kl_clip / vg_sum)) 234 | 235 | for p in self.model.parameters(): 236 | v = updates[p] 237 | p.grad.data.copy_(v) 238 | p.grad.data.mul_(nu) 239 | 240 | self.optim.step() 241 | self.steps += 1 242 | -------------------------------------------------------------------------------- /algo/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | 7 | class PPO(): 8 | def __init__(self, 9 | actor_critic, 10 | clip_param, 11 | ppo_epoch, 12 | num_mini_batch, 13 | value_loss_coef, 14 | entropy_coef, 15 | lr=None, 16 | lr_schedule=None, 17 | eps=None, 18 | max_grad_norm=None): 19 | 20 | self.actor_critic = actor_critic 21 | 22 | self.clip_param = clip_param 23 | self.ppo_epoch = ppo_epoch 24 | self.num_mini_batch = num_mini_batch 25 | 26 | self.value_loss_coef = value_loss_coef 27 | self.entropy_coef = entropy_coef 28 | 29 | self.max_grad_norm = max_grad_norm 30 | 31 | self.optimizer = optim.Adam(actor_critic.parameters(), lr=lr, eps=eps) 32 | 33 | if lr_schedule is not None: 34 | self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, lr_schedule) 35 | else: 36 | self.scheduler = None 37 | 38 | def update(self, rollouts, update_index, _replay=None): 39 | if self.scheduler is not None: 40 | self.scheduler.step(update_index) 41 | 42 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] 43 | advantages = (advantages - advantages.mean()) / ( 44 | advantages.std() + 1e-5) 45 | 46 | value_loss_epoch = 0 47 | action_loss_epoch = 0 48 | dist_entropy_epoch = 0 49 | 50 | for e in range(self.ppo_epoch): 51 | if self.actor_critic.is_recurrent: 52 | data_generator = rollouts.recurrent_generator( 53 | advantages, self.num_mini_batch) 54 | else: 55 | data_generator = rollouts.feed_forward_generator( 56 | advantages, self.num_mini_batch) 57 | 58 | for sample in data_generator: 59 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 60 | return_batch, masks_batch, old_action_log_probs_batch, \ 61 | adv_targ = sample 62 | 63 | # Reshape to do in a single forward pass for all steps 64 | values, action_log_probs, dist_entropy, states = self.actor_critic.evaluate_actions( 65 | obs_batch, recurrent_hidden_states_batch, 66 | masks_batch, actions_batch) 67 | 68 | ratio = torch.exp(action_log_probs - old_action_log_probs_batch) 69 | surr1 = ratio * adv_targ 70 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 71 | 1.0 + self.clip_param) * adv_targ 72 | action_loss = -torch.min(surr1, surr2).mean() 73 | 74 | value_loss = F.mse_loss(return_batch, values) 75 | 76 | self.optimizer.zero_grad() 77 | (value_loss * 
self.value_loss_coef + action_loss - 78 | dist_entropy * self.entropy_coef).backward() 79 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 80 | self.max_grad_norm) 81 | self.optimizer.step() 82 | 83 | value_loss_epoch += value_loss.item() 84 | action_loss_epoch += action_loss.item() 85 | dist_entropy_epoch += dist_entropy.item() 86 | 87 | num_updates = self.ppo_epoch * self.num_mini_batch 88 | 89 | value_loss_epoch /= num_updates 90 | action_loss_epoch /= num_updates 91 | dist_entropy_epoch /= num_updates 92 | 93 | return value_loss_epoch, action_loss_epoch, dist_entropy_epoch 94 | -------------------------------------------------------------------------------- /algo/sil.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class SIL: 6 | def __init__( 7 | self, 8 | algo, 9 | update_ratio=1.0, 10 | epochs=1, 11 | batch_size=64, 12 | beta=0., 13 | value_loss_coef=0.5, 14 | entropy_coef=0.01, 15 | 16 | ): 17 | self.update_ratio = update_ratio 18 | self.epochs = epochs 19 | self.batch_size = batch_size 20 | self.beta = beta # FIXME should be on schedule? 21 | self.value_loss_coef = value_loss_coef 22 | self.entropy_coef = entropy_coef 23 | self.loss_weight = 0.1 24 | self.avg_loss_by_valid_samples = True 25 | 26 | self.algo = algo 27 | 28 | def _calc_num_updates(self, index): 29 | num_updates = 0 30 | if self.update_ratio < 1: 31 | if index % int(round(1 / self.update_ratio)) == 0: 32 | num_updates = 1 33 | else: 34 | num_updates = int(round(self.update_ratio)) 35 | return num_updates 36 | 37 | def update(self, rollouts, update_index, replay=None): 38 | value_loss, action_loss, dist_entropy = self.algo.update(rollouts, update_index) 39 | 40 | num_updates = self._calc_num_updates(update_index) 41 | if replay is not None and replay.num_steps > self.batch_size and num_updates: 42 | sil_value_loss, sil_action_loss = self.update_sil(replay, num_updates, self.epochs) 43 | print("SIL: value_loss = {:.5f}, action_loss = {:.5f}".format(sil_value_loss, sil_action_loss)) 44 | 45 | return value_loss, action_loss, dist_entropy 46 | 47 | def update_sil(self, replay, num_updates_per_epoch=1, num_epochs=1): 48 | value_loss_epoch = 0 49 | action_loss_epoch = 0 50 | num_updates = 0 51 | 52 | for _ in range(num_epochs): 53 | if self.algo.actor_critic.is_recurrent: 54 | assert False, "Not implemented" 55 | else: 56 | data_generator = replay.feed_forward_generator( 57 | self.batch_size, num_updates_per_epoch, beta=self.beta) 58 | 59 | for sample in data_generator: 60 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 61 | return_batch, masks_batch, weights_batch, indices_batch = sample 62 | 63 | values, action_log_probs, dist_entropy, _ = self.algo.actor_critic.evaluate_actions( 64 | obs_batch, recurrent_hidden_states_batch, 65 | masks_batch, actions_batch) 66 | 67 | advantages = (return_batch - values) 68 | clipped_advantages = torch.clamp(advantages, min=0.0) 69 | 70 | # FIXME this loss is what's described in the paper, but the author's TF implementation differs. 71 | # TODO Look into the TF implementation, it appears to be motivated by the author's 72 | # lower-bound-soft-Q-learning equivalence justification. 
73 | # https://github.com/junhyukoh/self-imitation-learning/blob/master/baselines/common/self_imitation.py 74 | 75 | action_loss = -action_log_probs * clipped_advantages.detach() 76 | value_loss = 0.5 * clipped_advantages.pow(2) 77 | 78 | # apply importance sampling (priority sampling bias correction) weights 79 | if weights_batch is not None: 80 | action_loss *= weights_batch 81 | value_loss *= weights_batch 82 | 83 | if self.avg_loss_by_valid_samples: 84 | num_valid_samples = torch.clamp(torch.sum(advantages > 0).float(), min=1.0) 85 | action_loss = action_loss.sum() / num_valid_samples 86 | value_loss = value_loss.sum() / num_valid_samples 87 | else: 88 | action_loss = action_loss.mean() 89 | value_loss = value_loss.mean() 90 | 91 | loss = value_loss * self.value_loss_coef + action_loss 92 | if self.entropy_coef: 93 | loss -= dist_entropy * self.entropy_coef 94 | 95 | loss *= self.loss_weight 96 | 97 | self.algo.optimizer.zero_grad() 98 | 99 | loss.backward() 100 | 101 | nn.utils.clip_grad_norm_( 102 | self.algo.actor_critic.parameters(), self.algo.max_grad_norm) 103 | 104 | self.algo.optimizer.step() 105 | 106 | replay.update_priorities(indices_batch, clipped_advantages) 107 | 108 | value_loss_epoch += value_loss.item() 109 | action_loss_epoch += action_loss.item() 110 | num_updates += 1 111 | 112 | if num_updates: 113 | value_loss_epoch /= num_updates 114 | action_loss_epoch /= num_updates 115 | 116 | return value_loss_epoch, action_loss_epoch 117 | 118 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser(description='RL') 8 | parser.add_argument('--algo', default='ppo', 9 | help='algorithm to use: a2c | ppo | acktr') 10 | parser.add_argument('--lr', type=float, default=7e-4, 11 | help='learning rate (default: 7e-4)') 12 | parser.add_argument('--lr-schedule', type=float, default=None, 13 | help='learning rate schedule (decay steps) (default: None)') 14 | parser.add_argument('--eps', type=float, default=1e-5, 15 | help='RMSprop optimizer epsilon (default: 1e-5)') 16 | parser.add_argument('--alpha', type=float, default=0.99, 17 | help='RMSprop optimizer apha (default: 0.99)') 18 | parser.add_argument('--gamma', type=float, default=0.99, 19 | help='discount factor for rewards (default: 0.99)') 20 | parser.add_argument('--use-gae', action='store_true', default=False, 21 | help='use generalized advantage estimation') 22 | parser.add_argument('--tau', type=float, default=0.95, 23 | help='gae parameter (default: 0.95)') 24 | parser.add_argument('--entropy-coef', type=float, default=0.0, 25 | help='entropy term coefficient (default: 0.00)') 26 | parser.add_argument('--value-loss-coef', type=float, default=0.5, 27 | help='value loss coefficient (default: 0.5)') 28 | parser.add_argument('--max-grad-norm', type=float, default=0.5, 29 | help='max norm of gradients (default: 0.5)') 30 | parser.add_argument('--beta-dist', action='store_true', default=False, 31 | help='use beta dist for continuous control') 32 | parser.add_argument('--seed', type=int, default=1, 33 | help='random seed (default: 1)') 34 | parser.add_argument('--num-processes', type=int, default=16, 35 | help='how many training CPU processes to use (default: 16)') 36 | parser.add_argument('--num-steps', type=int, default=5, 37 | help='number of forward steps in A2C (default: 5)') 38 | 
parser.add_argument('--ppo-epoch', type=int, default=4, 39 | help='number of ppo epochs (default: 4)') 40 | parser.add_argument('--num-mini-batch', type=int, default=32, 41 | help='number of batches for ppo (default: 32)') 42 | parser.add_argument('--clip-param', type=float, default=0.2, 43 | help='ppo clip parameter (default: 0.2)') 44 | parser.add_argument('--clip-action', action='store_true', default=False, 45 | help='clip actions') 46 | parser.add_argument('--shift-action', action='store_true', default=False, 47 | help='shift action to mean and rescale') 48 | parser.add_argument('--frame-skip', type=int, default=0, 49 | help='number of frames to skip (apply same action)') 50 | parser.add_argument('--sil-update-ratio', type=float, default=4.0, 51 | help='sil off-policy updates per on-policy updates (default: 4.0)') 52 | parser.add_argument('--sil-epochs', type=int, default=1, 53 | help='number of sil epochs (default: 1)') 54 | parser.add_argument('--sil-batch-size', type=int, default=512, 55 | help='sil batch size (default: 512)') 56 | parser.add_argument('--sil-alpha', type=float, default=0.6, 57 | help='sil replay priority alpha, priority queue disabled if 0. (default: 0.6)') 58 | parser.add_argument('--sil-beta', type=float, default=0.1, 59 | help='sil replay priority beta, importance sampling weights disabled if 0. (default: 0.1)') 60 | parser.add_argument('--sil-entropy-coef', type=float, default=0.01, 61 | help='entropy term coefficient (default: 0.0)') 62 | parser.add_argument('--sil-value-loss-coef', type=float, default=0.05, 63 | help='value loss coefficient (default: 0.5)') 64 | parser.add_argument('--log-interval', type=int, default=10, 65 | help='log interval, one log per n updates (default: 10)') 66 | parser.add_argument('--save-interval', type=int, default=100, 67 | help='save interval, one save per n updates (default: 100)') 68 | parser.add_argument('--eval-interval', type=int, default=None, 69 | help='eval interval, one eval per n updates (default: None)') 70 | parser.add_argument('--vis-interval', type=int, default=100, 71 | help='vis interval, one log per n updates (default: 100)') 72 | parser.add_argument('--num-frames', type=int, default=10e8, 73 | help='number of frames to train (default: 10e8)') 74 | parser.add_argument('--env-name', default='PongNoFrameskip-v4', 75 | help='environment to train on (default: PongNoFrameskip-v4)') 76 | parser.add_argument('--log-dir', default='/tmp/', 77 | help='directory to save agent logs (default: /tmp/)') 78 | parser.add_argument('--save-dir', default='./trained_models/', 79 | help='directory to save agent logs (default: ./trained_models/)') 80 | parser.add_argument('--no-cuda', action='store_true', default=False, 81 | help='disables CUDA training') 82 | parser.add_argument('--add-timestep', action='store_true', default=False, 83 | help='add timestep to observations') 84 | parser.add_argument('--recurrent-policy', action='store_true', default=False, 85 | help='use a recurrent policy') 86 | parser.add_argument('--vis', action='store_true', default=False, 87 | help='enable visdom visualization') 88 | parser.add_argument('--port', type=int, default=8097, 89 | help='port to run the server on (default: 8097)') 90 | parser.add_argument('--load-path', default='', 91 | help='directory to save agent logs (default: ') 92 | args = parser.parse_args() 93 | 94 | args.cuda = not args.no_cuda and torch.cuda.is_available() 95 | 96 | return args 97 | -------------------------------------------------------------------------------- 
/distributions.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from utils import AddBias, init, init_normc_ 8 | 9 | """ 10 | Modify standard PyTorch distributions so they are compatible with this code. 11 | """ 12 | 13 | FixedCategorical = torch.distributions.Categorical 14 | 15 | old_sample = FixedCategorical.sample 16 | FixedCategorical.sample = lambda self: old_sample(self).unsqueeze(-1) 17 | log_prob_cat = FixedCategorical.log_prob 18 | FixedCategorical.log_probs = lambda self, actions: log_prob_cat(self, actions.squeeze(-1)).unsqueeze(-1) 19 | FixedCategorical.mode = lambda self: self.probs.argmax(dim=1, keepdim=True) 20 | 21 | FixedNormal = torch.distributions.Normal 22 | log_prob_normal = FixedNormal.log_prob 23 | FixedNormal.log_probs = lambda self, actions: log_prob_normal(self, actions).sum(-1, keepdim=True) 24 | entropy_normal = FixedNormal.entropy 25 | FixedNormal.entropy = lambda self: entropy_normal(self).sum(-1) 26 | FixedNormal.mode = lambda self: self.mean 27 | 28 | FixedBeta = torch.distributions.Beta 29 | entropy_beta = FixedBeta.entropy 30 | FixedBeta.entropy = lambda self: entropy_beta(self).sum(-1) 31 | FixedBeta.mode = lambda self: (self.concentration1 - 1) / (self.concentration1 + self.concentration0 - 2) 32 | log_prob_beta = FixedBeta.log_prob 33 | FixedBeta.log_probs = lambda self, actions: log_prob_beta(self, actions).sum(-1, keepdim=True) 34 | 35 | 36 | class Categorical(nn.Module): 37 | def __init__(self, num_inputs, num_outputs): 38 | super(Categorical, self).__init__() 39 | 40 | init_ = lambda m: init( 41 | m, 42 | nn.init.orthogonal_, 43 | lambda x: nn.init.constant_(x, 0), 44 | gain=0.01) 45 | 46 | self.linear = init_(nn.Linear(num_inputs, num_outputs)) 47 | 48 | def forward(self, x): 49 | x = self.linear(x) 50 | return FixedCategorical(logits=x) 51 | 52 | 53 | class DiagGaussian(nn.Module): 54 | def __init__(self, num_inputs, num_outputs): 55 | super(DiagGaussian, self).__init__() 56 | 57 | init_ = lambda m: init( 58 | m, 59 | init_normc_, 60 | lambda x: nn.init.constant_(x, 0)) 61 | 62 | self.fc_mean = init_(nn.Linear(num_inputs, num_outputs)) 63 | self.logstd = AddBias(torch.zeros(num_outputs)) 64 | 65 | def forward(self, x): 66 | action_mean = self.fc_mean(x) 67 | 68 | # An ugly hack for my KFAC implementation. 69 | zeros = torch.zeros(action_mean.size()) 70 | if x.is_cuda: 71 | zeros = zeros.cuda() 72 | 73 | action_logstd = self.logstd(zeros) 74 | return FixedNormal(action_mean, action_logstd.exp()) 75 | 76 | 77 | class Beta(nn.Module): 78 | 79 | def __init__(self, num_inputs, num_outputs): 80 | super(Beta, self).__init__() 81 | 82 | init_ = lambda m: init( 83 | m, 84 | init_normc_, 85 | lambda x: nn.init.constant_(x, 0)) 86 | 87 | self.fc_a = init_(nn.Linear(num_inputs, num_outputs)) 88 | self.fc_b = init_(nn.Linear(num_inputs, num_outputs)) 89 | 90 | def forward(self, x): 91 | action_a = self.fc_a(x) 92 | action_b = self.fc_b(x) 93 | action_a = F.softplus(action_a) + 1. 94 | action_b = F.softplus(action_b) + 1. 
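# softplus(x) + 1 keeps both concentration parameters strictly greater than 1, so
# the Beta density on (0, 1) is unimodal and the deterministic action returned by
# FixedBeta.mode, (concentration1 - 1) / (concentration1 + concentration0 - 2),
# stays inside (0, 1).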
95 | 96 | return FixedBeta(action_a, action_b) 97 | 98 | -------------------------------------------------------------------------------- /enjoy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import gym 4 | import numpy as np 5 | import torch 6 | 7 | from envs import VecPyTorch, make_vec_envs 8 | from utils import get_render_func, get_vec_normalize 9 | 10 | 11 | parser = argparse.ArgumentParser(description='RL') 12 | parser.add_argument('--seed', type=int, default=1, 13 | help='random seed (default: 1)') 14 | parser.add_argument('--log-interval', type=int, default=10, 15 | help='log interval, one log per n updates (default: 10)') 16 | parser.add_argument('--env-name', default='PongNoFrameskip-v4', 17 | help='environment to train on (default: PongNoFrameskip-v4)') 18 | parser.add_argument('--load-path', default='', 19 | help='directory to save agent logs (default: ') 20 | parser.add_argument('--add-timestep', action='store_true', default=False, 21 | help='add timestep to observations') 22 | parser.add_argument('--clip-action', action='store_true', default=False, 23 | help='clip actions') 24 | args = parser.parse_args() 25 | 26 | env = make_vec_envs(args.env_name, args.seed + 1000, 1, 27 | None, None, args.add_timestep, device='cpu', 28 | allow_early_resets=False, 29 | visualize=True) 30 | 31 | # Get a render function 32 | render_func = get_render_func(env) 33 | 34 | # We need to use the same statistics for normalization as used in training 35 | actor_critic, ob_rms = torch.load(args.load_path) 36 | actor_critic.eval() 37 | 38 | vec_norm = get_vec_normalize(env) 39 | if vec_norm is not None: 40 | vec_norm.eval() 41 | vec_norm.ob_rms = ob_rms 42 | 43 | recurrent_hidden_states = torch.zeros(1, actor_critic.recurrent_hidden_state_size) 44 | masks = torch.zeros(1, 1) 45 | 46 | if render_func is not None: 47 | render_func('human') 48 | 49 | obs = env.reset() 50 | 51 | while True: 52 | with torch.no_grad(): 53 | value, action, _, recurrent_hidden_states = actor_critic.act( 54 | obs, recurrent_hidden_states, masks, deterministic=True) 55 | 56 | clipped_action = action 57 | if args.clip_action and isinstance(env.action_space, gym.spaces.Box): 58 | clipped_action = torch.max(torch.min( 59 | clipped_action, torch.from_numpy(env.action_space.high)), 60 | torch.from_numpy(env.action_space.low)) 61 | 62 | # Obser reward and next obs 63 | obs, reward, done, _ = env.step(clipped_action) 64 | 65 | masks.fill_(0.0 if done else 1.0) 66 | 67 | if render_func is not None: 68 | render_func('human') 69 | -------------------------------------------------------------------------------- /envs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import gym 4 | import numpy as np 5 | import torch 6 | from gym.spaces.box import Box 7 | 8 | #from baselines import bench 9 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 10 | from baselines.common.vec_env import VecEnvWrapper 11 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 12 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 13 | from baselines.common.vec_env.vec_normalize import VecNormalize as VecNormalize_ 14 | from monitor import Monitor 15 | 16 | try: 17 | import dm_control2gym 18 | except ImportError: 19 | pass 20 | 21 | try: 22 | import roboschool 23 | except ImportError: 24 | pass 25 | 26 | try: 27 | import pybullet_envs 28 | except ImportError: 29 | pass 30 | 31 | try: 32 | 
from osim.env import ProstheticsEnv, Arm2DEnv, L2RunEnv 33 | from my_prosthetics_env import MyProstheticsEnv 34 | except ImportError: 35 | pass 36 | 37 | 38 | def make_env(env_id, seed, rank, log_dir, add_timestep, allow_early_resets, **kwargs): 39 | def _thunk(): 40 | info_keywords = () 41 | if env_id.startswith("dm"): 42 | _, domain, task = env_id.split('.') 43 | env = dm_control2gym.make(domain_name=domain, task_name=task) 44 | elif env_id.startswith("osim"): 45 | info_keywords = ('rb',) 46 | # https://github.com/stanfordnmbl/osim-rl 47 | _, task = env_id.split('.') 48 | if task == "Prosthetics": 49 | env = MyProstheticsEnv(integrator_accuracy=1e-4, **kwargs) 50 | elif task == "Arm2D": 51 | env = Arm2DEnv(integrator_accuracy=1e-4, **kwargs) 52 | else: # task == "L2Run" 53 | assert task == "L2Run" 54 | env = L2RunEnv(integrator_accuracy=1e-4, **kwargs) 55 | else: 56 | env = gym.make(env_id) 57 | is_atari = hasattr(gym.envs, 'atari') and isinstance( 58 | env.unwrapped, gym.envs.atari.atari_env.AtariEnv) 59 | if is_atari: 60 | env = make_atari(env_id) 61 | env.seed(seed + rank) 62 | 63 | obs_shape = env.observation_space.shape 64 | 65 | if add_timestep and len( 66 | obs_shape) == 1 and str(env).find('TimeLimit') > -1: 67 | env = AddTimestep(env) 68 | 69 | if log_dir is not None: 70 | env = Monitor( 71 | env, os.path.join(log_dir, str(rank)), 72 | info_keywords=info_keywords, 73 | allow_early_resets=allow_early_resets) 74 | 75 | if is_atari: 76 | env = wrap_deepmind(env) 77 | 78 | # If the input has shape (W,H,3), wrap for PyTorch convolutions 79 | obs_shape = env.observation_space.shape 80 | if len(obs_shape) == 3 and obs_shape[2] in [1, 3]: 81 | env = TransposeImage(env) 82 | 83 | return env 84 | 85 | return _thunk 86 | 87 | 88 | def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, add_timestep, 89 | device, allow_early_resets=True, num_frame_stack=None, **kwargs): 90 | 91 | envs = [make_env(env_name, seed, i, log_dir, add_timestep, allow_early_resets, **kwargs) 92 | for i in range(num_processes)] 93 | 94 | if len(envs) > 1: 95 | envs = SubprocVecEnv(envs) 96 | else: 97 | envs = DummyVecEnv(envs) 98 | 99 | if len(envs.observation_space.shape) == 1: 100 | if gamma is None: 101 | envs = VecNormalize(envs, ret=False) 102 | else: 103 | envs = VecNormalize(envs, gamma=gamma) 104 | 105 | envs = VecPyTorch(envs, device) 106 | 107 | if num_frame_stack is not None: 108 | envs = VecPyTorchFrameStack(envs, num_frame_stack, device) 109 | elif len(envs.observation_space.shape) == 3: 110 | envs = VecPyTorchFrameStack(envs, 4, device) 111 | 112 | return envs 113 | 114 | 115 | # Can be used to test recurrent policies for Reacher-v2 116 | class MaskGoal(gym.ObservationWrapper): 117 | def observation(self, observation): 118 | if self.env._elapsed_steps > 0: 119 | observation[-2:0] = 0 120 | return observation 121 | 122 | 123 | class AddTimestep(gym.ObservationWrapper): 124 | def __init__(self, env=None): 125 | super(AddTimestep, self).__init__(env) 126 | self.observation_space = Box( 127 | self.observation_space.low[0], 128 | self.observation_space.high[0], 129 | [self.observation_space.shape[0] + 1], 130 | dtype=self.observation_space.dtype) 131 | 132 | def observation(self, observation): 133 | return np.concatenate((observation, [self.env._elapsed_steps])) 134 | 135 | 136 | class TransposeImage(gym.ObservationWrapper): 137 | def __init__(self, env=None): 138 | super(TransposeImage, self).__init__(env) 139 | obs_shape = self.observation_space.shape 140 | self.observation_space = Box( 141 | 
self.observation_space.low[0, 0, 0], 142 | self.observation_space.high[0, 0, 0], 143 | [obs_shape[2], obs_shape[1], obs_shape[0]], 144 | dtype=self.observation_space.dtype) 145 | 146 | def observation(self, observation): 147 | return observation.transpose(2, 0, 1) 148 | 149 | 150 | class VecPyTorch(VecEnvWrapper): 151 | def __init__(self, venv, device): 152 | """Return only every `skip`-th frame""" 153 | super(VecPyTorch, self).__init__(venv) 154 | self.device = device 155 | # TODO: Fix data types 156 | 157 | def reset(self): 158 | obs = self.venv.reset() 159 | obs = torch.from_numpy(obs).float().to(self.device) 160 | return obs 161 | 162 | def step_async(self, actions): 163 | actions = actions.squeeze(1).cpu().numpy() 164 | self.venv.step_async(actions) 165 | 166 | def step_wait(self): 167 | obs, reward, done, info = self.venv.step_wait() 168 | obs = torch.from_numpy(obs).float().to(self.device) 169 | reward = torch.from_numpy(reward).unsqueeze(dim=1).float() 170 | return obs, reward, done, info 171 | 172 | 173 | class VecNormalize(VecNormalize_): 174 | 175 | def __init__(self, *args, **kwargs): 176 | super(VecNormalize, self).__init__(*args, **kwargs) 177 | self.training = True 178 | 179 | def _obfilt(self, obs): 180 | if self.ob_rms: 181 | if self.training: 182 | self.ob_rms.update(obs) 183 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 184 | return obs 185 | else: 186 | return obs 187 | 188 | def train(self): 189 | self.training = True 190 | 191 | def eval(self): 192 | self.training = False 193 | 194 | 195 | # Derived from 196 | # https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_frame_stack.py 197 | class VecPyTorchFrameStack(VecEnvWrapper): 198 | def __init__(self, venv, nstack, device=None): 199 | self.venv = venv 200 | self.nstack = nstack 201 | 202 | wos = venv.observation_space # wrapped ob space 203 | self.shape_dim0 = wos.shape[0] 204 | 205 | low = np.repeat(wos.low, self.nstack, axis=0) 206 | high = np.repeat(wos.high, self.nstack, axis=0) 207 | 208 | if device is None: 209 | device = torch.device('cpu') 210 | self.stacked_obs = torch.zeros((venv.num_envs,) + low.shape).to(device) 211 | 212 | observation_space = gym.spaces.Box( 213 | low=low, high=high, dtype=venv.observation_space.dtype) 214 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 215 | 216 | def step_wait(self): 217 | obs, rews, news, infos = self.venv.step_wait() 218 | self.stacked_obs[:, :-self.shape_dim0] = \ 219 | self.stacked_obs[:, self.shape_dim0:] 220 | for (i, new) in enumerate(news): 221 | if new: 222 | self.stacked_obs[i] = 0 223 | self.stacked_obs[:, -self.shape_dim0:] = obs 224 | return self.stacked_obs, rews, news, infos 225 | 226 | def reset(self): 227 | obs = self.venv.reset() 228 | self.stacked_obs.zero_() 229 | self.stacked_obs[:, -self.shape_dim0:] = obs 230 | return self.stacked_obs 231 | 232 | def close(self): 233 | self.venv.close() 234 | -------------------------------------------------------------------------------- /imgs/a2c_beamrider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/a2c_beamrider.png -------------------------------------------------------------------------------- /imgs/a2c_breakout.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/a2c_breakout.png -------------------------------------------------------------------------------- /imgs/a2c_qbert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/a2c_qbert.png -------------------------------------------------------------------------------- /imgs/a2c_seaquest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/a2c_seaquest.png -------------------------------------------------------------------------------- /imgs/acktr_beamrider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/acktr_beamrider.png -------------------------------------------------------------------------------- /imgs/acktr_breakout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/acktr_breakout.png -------------------------------------------------------------------------------- /imgs/acktr_qbert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/acktr_qbert.png -------------------------------------------------------------------------------- /imgs/acktr_seaquest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/acktr_seaquest.png -------------------------------------------------------------------------------- /imgs/ppo_halfcheetah.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/ppo_halfcheetah.png -------------------------------------------------------------------------------- /imgs/ppo_hopper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/ppo_hopper.png -------------------------------------------------------------------------------- /imgs/ppo_reacher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/ppo_reacher.png -------------------------------------------------------------------------------- /imgs/ppo_walker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/ppo_walker.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import glob 3 | import os 4 | import time 5 | import datetime 6 | from collections import deque 7 | 8 | import gym 
9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | 15 | import algo 16 | from arguments import get_args 17 | from envs import make_vec_envs 18 | from model import Policy 19 | from rollout_storage import RolloutStorage 20 | from replay_storage import ReplayStorage 21 | from utils import get_vec_normalize 22 | from visualize import visdom_plot 23 | 24 | args = get_args() 25 | 26 | assert args.algo in ['a2c', 'a2c-sil', 'ppo', 'ppo-sil', 'acktr'] 27 | if args.recurrent_policy: 28 | assert args.algo in ['a2c', 'ppo'], \ 29 | 'Recurrent policy is not implemented for ACKTR' 30 | 31 | num_updates = int(args.num_frames) // args.num_steps // args.num_processes 32 | 33 | torch.manual_seed(args.seed) 34 | if args.cuda: 35 | torch.cuda.manual_seed(args.seed) 36 | 37 | 38 | def setup_dirs(experiment_name, log_dir, save_dir): 39 | log_dir = os.path.join(log_dir, experiment_name) 40 | os.makedirs(log_dir, exist_ok=True) 41 | 42 | eval_log_dir = args.log_dir + "_eval" 43 | os.makedirs(eval_log_dir, exist_ok=True) 44 | 45 | save_dir = os.path.join(save_dir, experiment_name) 46 | os.makedirs(save_dir, exist_ok=True) 47 | 48 | return log_dir, eval_log_dir, save_dir 49 | 50 | 51 | def main(): 52 | torch.set_num_threads(1) 53 | device = torch.device("cuda:0" if args.cuda else "cpu") 54 | 55 | experiment_name = args.env_name + '-' + args.algo + '-' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f") 56 | log_dir, eval_log_dir, save_dir = setup_dirs(experiment_name, args.log_dir, args.save_dir) 57 | 58 | if args.vis: 59 | from visdom import Visdom 60 | viz = Visdom(port=args.port) 61 | win = None 62 | 63 | envs = make_vec_envs( 64 | args.env_name, args.seed, args.num_processes, 65 | args.gamma, log_dir, args.add_timestep, device, False, frame_skip=args.frame_skip) 66 | 67 | if args.load_path: 68 | actor_critic, _ob_rms = torch.load(args.load_path) 69 | vec_norm = get_vec_normalize(envs) 70 | if vec_norm is not None: 71 | vec_norm.train() 72 | vec_norm.ob_rms = _ob_rms 73 | actor_critic.train() 74 | else: 75 | actor_critic = Policy( 76 | envs.observation_space.shape, 77 | envs.action_space, 78 | beta=args.beta_dist, 79 | base_kwargs={'recurrent': args.recurrent_policy}) 80 | actor_critic.to(device) 81 | 82 | if args.algo.startswith('a2c'): 83 | agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, 84 | args.entropy_coef, lr=args.lr, 85 | lr_schedule=args.lr_schedule, 86 | eps=args.eps, alpha=args.alpha, 87 | max_grad_norm=args.max_grad_norm) 88 | elif args.algo.startswith('ppo'): 89 | agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, 90 | args.value_loss_coef, args.entropy_coef, lr=args.lr, 91 | lr_schedule=args.lr_schedule, 92 | eps=args.eps, 93 | max_grad_norm=args.max_grad_norm) 94 | elif args.algo == 'acktr': 95 | agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, 96 | args.entropy_coef, acktr=True) 97 | 98 | if args.algo.endswith('sil'): 99 | agent = algo.SIL( 100 | agent, 101 | update_ratio=args.sil_update_ratio, 102 | epochs=args.sil_epochs, 103 | batch_size=args.sil_batch_size, 104 | beta=args.sil_beta, 105 | value_loss_coef=args.sil_value_loss_coef, 106 | entropy_coef=args.sil_entropy_coef) 107 | replay = ReplayStorage( 108 | 10000, 109 | num_processes=args.num_processes, 110 | gamma=args.gamma, 111 | prio_alpha=args.sil_alpha, 112 | obs_shape=envs.observation_space.shape, 113 | action_space=envs.action_space, 114 | 
recurrent_hidden_state_size=actor_critic.recurrent_hidden_state_size, 115 | device=device) 116 | else: 117 | replay = None 118 | 119 | action_high = torch.from_numpy(envs.action_space.high).to(device) 120 | action_low = torch.from_numpy(envs.action_space.low).to(device) 121 | action_mid = 0.5 * (action_high + action_low) 122 | 123 | rollouts = RolloutStorage( 124 | args.num_steps, args.num_processes, 125 | envs.observation_space.shape, envs.action_space, 126 | actor_critic.recurrent_hidden_state_size) 127 | 128 | obs = envs.reset() 129 | rollouts.obs[0].copy_(obs) 130 | rollouts.to(device) 131 | 132 | episode_rewards = deque(maxlen=10) 133 | benchmark_rewards = deque(maxlen=10) 134 | 135 | start = time.time() 136 | for j in range(num_updates): 137 | for step in range(args.num_steps): 138 | # Sample actions 139 | with torch.no_grad(): 140 | # sample actions 141 | value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( 142 | rollouts.obs[step], 143 | rollouts.recurrent_hidden_states[step], 144 | rollouts.masks[step]) 145 | 146 | if args.clip_action and isinstance(envs.action_space, gym.spaces.Box): 147 | clipped_action = action.clone() 148 | if args.shift_action: 149 | # FIXME experimenting with this, so far resulting in 150 | # faster learning when clipping guassian continuous 151 | # output (vs leaving centred at 0 and unscaled) 152 | clipped_action = 0.5 * clipped_action + action_mid 153 | clipped_action = torch.max( 154 | torch.min(clipped_action, action_high), action_low) 155 | else: 156 | clipped_action = action 157 | 158 | # act in environment and observe 159 | obs, reward, done, infos = envs.step(clipped_action) 160 | 161 | for info in infos: 162 | if 'episode' in info.keys(): 163 | episode_rewards.append(info['episode']['r']) 164 | if 'rb' in info['episode']: 165 | benchmark_rewards.append(info['episode']['rb']) 166 | 167 | # If done then clean the history of observations. 168 | masks = torch.FloatTensor([[0.0] if done_ else [1.0] 169 | for done_ in done]) 170 | rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) 171 | if replay is not None: 172 | replay.insert( 173 | rollouts.obs[step], 174 | rollouts.recurrent_hidden_states[step], 175 | action, 176 | reward, 177 | done) 178 | 179 | with torch.no_grad(): 180 | next_value = actor_critic.get_value(rollouts.obs[-1], 181 | rollouts.recurrent_hidden_states[-1], 182 | rollouts.masks[-1]).detach() 183 | 184 | rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) 185 | 186 | value_loss, action_loss, dist_entropy = agent.update(rollouts, j, replay) 187 | 188 | rollouts.after_update() 189 | 190 | total_num_steps = (j + 1) * args.num_processes * args.num_steps 191 | 192 | train_eprew = np.mean(episode_rewards) 193 | if j % args.log_interval == 0 and len(episode_rewards) > 1: 194 | end = time.time() 195 | print("Updates {}, num timesteps {}, FPS {} \n Last {} episodes: mean/med {:.1f}/{:.1f}, min/max reward {:.2f}/{:.2f}". 
196 | format(j, total_num_steps, 197 | int(total_num_steps / (end - start)), 198 | len(episode_rewards), 199 | train_eprew, 200 | np.median(episode_rewards), 201 | np.min(episode_rewards), 202 | np.max(episode_rewards), dist_entropy, 203 | value_loss, action_loss), end='') 204 | if len(benchmark_rewards): 205 | print(", benchmark {:.1f}/{:.1f}, {:.1f}/{:.1f}".format( 206 | np.mean(benchmark_rewards), 207 | np.median(benchmark_rewards), 208 | np.min(benchmark_rewards), 209 | np.max(benchmark_rewards) 210 | ), end='') 211 | print() 212 | 213 | if (args.eval_interval is not None 214 | and len(episode_rewards) > 1 215 | and j % args.eval_interval == 0): 216 | eval_envs = make_vec_envs( 217 | args.env_name, args.seed + args.num_processes, args.num_processes, 218 | args.gamma, eval_log_dir, args.add_timestep, device, True) 219 | 220 | vec_norm = get_vec_normalize(eval_envs) 221 | if vec_norm is not None: 222 | vec_norm.eval() 223 | vec_norm.ob_rms = get_vec_normalize(envs).ob_rms 224 | 225 | eval_episode_rewards = [] 226 | 227 | obs = eval_envs.reset() 228 | eval_recurrent_hidden_states = torch.zeros(args.num_processes, 229 | actor_critic.recurrent_hidden_state_size, device=device) 230 | eval_masks = torch.zeros(args.num_processes, 1, device=device) 231 | 232 | while len(eval_episode_rewards) < 10: 233 | with torch.no_grad(): 234 | _, action, _, eval_recurrent_hidden_states = actor_critic.act( 235 | obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) 236 | 237 | clipped_action = action 238 | if args.clip_action and isinstance(envs.action_space, gym.spaces.Box): 239 | if args.shift_action: 240 | clipped_action = 0.5 * clipped_action + action_mid 241 | clipped_action = torch.max( 242 | torch.min(clipped_action, action_high), action_low) 243 | 244 | obs, reward, done, infos = eval_envs.step(clipped_action) 245 | 246 | eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] 247 | for done_ in done]) 248 | for info in infos: 249 | if 'episode' in info.keys(): 250 | eval_episode_rewards.append(info['episode']['r']) 251 | 252 | eval_envs.close() 253 | 254 | eval_eprew = np.mean(eval_episode_rewards) 255 | print(" Evaluation using {} episodes: mean reward {:.5f}\n". 
256 | format(len(eval_episode_rewards), eval_eprew)) 257 | 258 | if len(episode_rewards) and j % args.save_interval == 0 and save_dir != "": 259 | # A really ugly way to save a model to CPU 260 | save_model = actor_critic 261 | if args.cuda: 262 | save_model = copy.deepcopy(actor_critic).cpu() 263 | 264 | save_model = [save_model, getattr(get_vec_normalize(envs), 'ob_rms', None)] 265 | 266 | ep_rewstr = ("%d" % train_eprew).replace("-", "n") 267 | save_filename = os.path.join(save_dir, './checkpoint-%d-%s.pt' % (j, ep_rewstr)) 268 | 269 | torch.save(save_model, save_filename) 270 | 271 | if args.vis and j % args.vis_interval == 0: 272 | try: 273 | # Sometimes monitor doesn't properly flush the outputs 274 | win = visdom_plot(viz, win, log_dir, args.env_name, 275 | args.algo, args.num_frames) 276 | except IOError: 277 | pass 278 | 279 | 280 | if __name__ == "__main__": 281 | main() 282 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from distributions import Categorical, DiagGaussian, Beta 6 | from utils import init, init_normc_ 7 | 8 | 9 | class Flatten(nn.Module): 10 | def forward(self, x): 11 | return x.view(x.size(0), -1) 12 | 13 | 14 | class Policy(nn.Module): 15 | def __init__(self, obs_shape, action_space, beta=False, base_kwargs=None): 16 | super(Policy, self).__init__() 17 | if base_kwargs is None: 18 | base_kwargs = {} 19 | 20 | if len(obs_shape) == 3: 21 | self.base = CNNBase(obs_shape[0], **base_kwargs) 22 | elif len(obs_shape) == 1: 23 | self.base = MLPBase(obs_shape[0], **base_kwargs) 24 | else: 25 | raise NotImplementedError 26 | 27 | if action_space.__class__.__name__ == "Discrete": 28 | num_outputs = action_space.n 29 | self.dist = Categorical(self.base.output_size, num_outputs) 30 | elif action_space.__class__.__name__ == "Box": 31 | num_outputs = action_space.shape[0] 32 | if beta: 33 | self.dist = Beta(self.base.output_size, num_outputs) 34 | else: 35 | self.dist = DiagGaussian(self.base.output_size, num_outputs) 36 | else: 37 | raise NotImplementedError 38 | 39 | @property 40 | def is_recurrent(self): 41 | return self.base.is_recurrent 42 | 43 | @property 44 | def recurrent_hidden_state_size(self): 45 | """Size of rnn_hx.""" 46 | return self.base.recurrent_hidden_state_size 47 | 48 | def forward(self, inputs, rnn_hxs, masks): 49 | raise NotImplementedError 50 | 51 | def act(self, inputs, rnn_hxs, masks, deterministic=False): 52 | value, actor_features, rnn_hxs = self.base(inputs, rnn_hxs, masks) 53 | dist = self.dist(actor_features) 54 | 55 | if deterministic: 56 | action = dist.mode() 57 | else: 58 | action = dist.sample() 59 | 60 | action_log_probs = dist.log_probs(action) 61 | dist_entropy = dist.entropy().mean() 62 | 63 | return value, action, action_log_probs, rnn_hxs 64 | 65 | def get_value(self, inputs, rnn_hxs, masks): 66 | value, _, _ = self.base(inputs, rnn_hxs, masks) 67 | return value 68 | 69 | def evaluate_actions(self, inputs, rnn_hxs, masks, action): 70 | value, actor_features, rnn_hxs = self.base(inputs, rnn_hxs, masks) 71 | dist = self.dist(actor_features) 72 | 73 | action_log_probs = dist.log_probs(action) 74 | dist_entropy = dist.entropy().mean() 75 | 76 | return value, action_log_probs, dist_entropy, rnn_hxs 77 | 78 | 79 | class NNBase(nn.Module): 80 | 81 | def __init__(self, recurrent, recurrent_input_size, hidden_size): 82 | super(NNBase, 
self).__init__() 83 | 84 | self._hidden_size = hidden_size 85 | self._recurrent = recurrent 86 | 87 | if recurrent: 88 | self.gru = nn.GRUCell(recurrent_input_size, hidden_size) 89 | nn.init.orthogonal_(self.gru.weight_ih.data) 90 | nn.init.orthogonal_(self.gru.weight_hh.data) 91 | self.gru.bias_ih.data.fill_(0) 92 | self.gru.bias_hh.data.fill_(0) 93 | 94 | @property 95 | def is_recurrent(self): 96 | return self._recurrent 97 | 98 | @property 99 | def recurrent_hidden_state_size(self): 100 | if self._recurrent: 101 | return self._hidden_size 102 | return 1 103 | 104 | @property 105 | def output_size(self): 106 | return self._hidden_size 107 | 108 | def _forward_gru(self, x, hxs, masks): 109 | if x.size(0) == hxs.size(0): 110 | x = hxs = self.gru(x, hxs * masks) 111 | else: 112 | # x is a (T, N, -1) tensor that has been flatten to (T * N, -1) 113 | N = hxs.size(0) 114 | T = int(x.size(0) / N) 115 | 116 | # unflatten 117 | x = x.view(T, N, x.size(1)) 118 | 119 | # Same deal with masks 120 | masks = masks.view(T, N, 1) 121 | 122 | outputs = [] 123 | for i in range(T): 124 | hx = hxs = self.gru(x[i], hxs * masks[i]) 125 | outputs.append(hx) 126 | 127 | # assert len(outputs) == T 128 | # x is a (T, N, -1) tensor 129 | x = torch.stack(outputs, dim=0) 130 | # flatten 131 | x = x.view(T * N, -1) 132 | 133 | return x, hxs 134 | 135 | 136 | class CNNBase(NNBase): 137 | def __init__(self, num_inputs, recurrent=False, hidden_size=512, output_layer=nn.ReLU): 138 | super(CNNBase, self).__init__(recurrent, hidden_size, hidden_size) 139 | 140 | init_ = lambda m: init(m, 141 | nn.init.orthogonal_, 142 | lambda x: nn.init.constant_(x, 0), 143 | nn.init.calculate_gain('relu')) 144 | 145 | self.main = nn.Sequential( 146 | init_(nn.Conv2d(num_inputs, 32, 8, stride=4)), 147 | output_layer(), 148 | init_(nn.Conv2d(32, 64, 4, stride=2)), 149 | output_layer(), 150 | init_(nn.Conv2d(64, 32, 3, stride=1)), 151 | output_layer(), 152 | Flatten(), 153 | init_(nn.Linear(32 * 7 * 7, hidden_size)), 154 | output_layer() 155 | ) 156 | 157 | init_ = lambda m: init(m, 158 | nn.init.orthogonal_, 159 | lambda x: nn.init.constant_(x, 0)) 160 | 161 | self.critic_linear = init_(nn.Linear(hidden_size, 1)) 162 | 163 | self.train() 164 | 165 | def forward(self, inputs, rnn_hxs, masks): 166 | x = self.main(inputs / 255.0) 167 | 168 | if self.is_recurrent: 169 | x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks) 170 | 171 | return self.critic_linear(x), x, rnn_hxs 172 | 173 | 174 | class MLPBase(NNBase): 175 | def __init__(self, num_inputs, recurrent=False, hidden_size=64, activation_layer=nn.Tanh): 176 | super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size) 177 | 178 | if recurrent: 179 | num_inputs = hidden_size 180 | 181 | init_ = lambda m: init(m, 182 | init_normc_, 183 | lambda x: nn.init.constant_(x, 0)) 184 | 185 | self.actor = nn.Sequential( 186 | init_(nn.Linear(num_inputs, hidden_size*2)), 187 | activation_layer(), 188 | init_(nn.Linear(hidden_size*2, hidden_size)), 189 | activation_layer() 190 | ) 191 | 192 | self.critic = nn.Sequential( 193 | init_(nn.Linear(num_inputs, hidden_size*2)), 194 | activation_layer(), 195 | init_(nn.Linear(hidden_size*2, hidden_size)), 196 | activation_layer() 197 | ) 198 | 199 | self.critic_linear = init_(nn.Linear(hidden_size, 1)) 200 | 201 | self.train() 202 | 203 | def forward(self, inputs, rnn_hxs, masks): 204 | x = inputs 205 | 206 | if self.is_recurrent: 207 | x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks) 208 | 209 | hidden_critic = self.critic(x) 210 | hidden_actor = 
self.actor(x) 211 | 212 | return self.critic_linear(hidden_critic), hidden_actor, rnn_hxs 213 | -------------------------------------------------------------------------------- /monitor.py: -------------------------------------------------------------------------------- 1 | import time 2 | from baselines.bench import Monitor as _Monitor 3 | 4 | 5 | class Monitor(_Monitor): 6 | 7 | def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()): 8 | super(Monitor, self).__init__( 9 | env, filename, 10 | allow_early_resets=allow_early_resets, 11 | reset_keywords=reset_keywords, info_keywords=info_keywords) 12 | self.benchmark_rewards = [] 13 | self.episode_benchmark_rewards = [] 14 | if 'rb' in info_keywords: 15 | self.do_benchmark = True 16 | else: 17 | self.do_benchmark = False 18 | 19 | def reset_state(self): 20 | self.benchmark_rewards = [] 21 | super(Monitor, self).reset_state() 22 | 23 | def update(self, ob, rew, done, info): 24 | if self.do_benchmark and 'rb' in info: 25 | self.benchmark_rewards.append(info['rb']) 26 | self.rewards.append(rew) 27 | if done: 28 | self.needs_reset = True 29 | eprewb = sum(self.benchmark_rewards) 30 | eprew = sum(self.rewards) 31 | eplen = len(self.rewards) 32 | epinfo = { 33 | "r": round(eprew, 6), 34 | "l": eplen, 35 | "t": round(time.time() - self.tstart, 6)} 36 | for k in self.info_keywords: 37 | epinfo[k] = info[k] 38 | if self.do_benchmark: 39 | epinfo["rb"] = eprewb # overwrite with episode benchmark 40 | self.episode_benchmark_rewards.append(eprewb) 41 | self.episode_rewards.append(eprew) 42 | self.episode_lengths.append(eplen) 43 | self.episode_times.append(time.time() - self.tstart) 44 | epinfo.update(self.current_reset_info) 45 | self.results_writer.write_row(epinfo) 46 | 47 | if isinstance(info, dict): 48 | info['episode'] = epinfo 49 | 50 | self.total_steps += 1 51 | -------------------------------------------------------------------------------- /my_prosthetics_env.py: -------------------------------------------------------------------------------- 1 | from osim.env import ProstheticsEnv 2 | import random 3 | import numpy as np 4 | import math 5 | import os 6 | import time 7 | from collections import deque 8 | 9 | 10 | PROJ_FULL = 0 11 | PROJ_NORMAL = 1 12 | PROJ_SIMPLE = 2 13 | 14 | 15 | ## Values in the observation vector 16 | # y, vx, vy, ax, ay, rz, vrz, arz of pelvis (10 values) 17 | # x, y, vx, vy, ax, ay, rz, vrz, arz of head, torso, toes_l, toes_r, talus_l, talus_r (12*6 values) 18 | # rz, vrz, arz of ankle_l, ankle_r, back, hip_l, hip_r, knee_l, knee_r (7*3 values) 19 | # activation, fiber_len, fiber_vel for all muscles (3*18) 20 | # x, y, vx, vy, ax, ay ofg center of mass (6) 21 | # 8 + 9*6 + 8*3 + 3*18 + 6 = 146 22 | def project_obs(state_desc, proj=PROJ_FULL, prosthetic=True): 23 | res = [] 24 | 25 | if proj == PROJ_SIMPLE: 26 | pelvis = state_desc["body_pos"]["pelvis"][0:3] 27 | # pelvis_vel = state_desc["body_vel"]["pelvis"][0:3] 28 | # pelvis_acc = state_desc["body_acc"]["pelvis"][0:3] 29 | res += pelvis[1:2] # + pelvis_vel[:] + pelvis_acc[:] 30 | for bp in ["talus_l", "pros_foot_r"]: 31 | bp_pos = state_desc["body_pos"][bp].copy() 32 | bp_pos[0] = bp_pos[0] - pelvis[0] 33 | bp_pos[2] = bp_pos[2] - pelvis[2] 34 | res += bp_pos 35 | else: 36 | pelvis = None 37 | for body_part in ["pelvis", "head", "torso", "toes_l", "toes_r", "talus_l", "talus_r"]: 38 | if prosthetic and body_part in ["toes_r", "talus_r"]: 39 | if proj == PROJ_FULL: 40 | res += [0] * 12 41 | continue 42 | cur = [] 43 | cur += 
state_desc["body_pos"][body_part][0:3] 44 | cur += state_desc["body_vel"][body_part][0:3] 45 | cur += state_desc["body_acc"][body_part][0:3] 46 | cur += state_desc["body_pos_rot"][body_part][2:] 47 | cur += state_desc["body_vel_rot"][body_part][2:] 48 | cur += state_desc["body_acc_rot"][body_part][2:] 49 | if body_part == "pelvis": 50 | pelvis = cur.copy() 51 | res += pelvis[1:2] + pelvis[3:] 52 | else: 53 | cur_upd = cur.copy() 54 | cur_upd[:3] = [cur[i] - pelvis[i] for i in range(3)] 55 | cur_upd[9:10] = [cur[i] - pelvis[i] for i in range(9, 10)] 56 | res += cur_upd 57 | 58 | for joint in ["ankle_l", "ankle_r", "back", "hip_l", "hip_r", "knee_l", "knee_r"]: 59 | res += state_desc["joint_pos"][joint] 60 | res += state_desc["joint_vel"][joint] 61 | res += state_desc["joint_acc"][joint] 62 | 63 | for muscle in sorted(state_desc["muscles"].keys()): 64 | res += [state_desc["muscles"][muscle]["activation"]] 65 | res += [state_desc["muscles"][muscle]["fiber_length"]] 66 | res += [state_desc["muscles"][muscle]["fiber_velocity"]] 67 | 68 | cm_pos = [state_desc["misc"]["mass_center_pos"][i] - pelvis[i] for i in range(3)] 69 | cm_vel = state_desc["misc"]["mass_center_vel"] 70 | cm_acc = state_desc["misc"]["mass_center_acc"] 71 | res = res + cm_pos + cm_vel + cm_acc 72 | 73 | return np.array(res) 74 | 75 | 76 | class MyProstheticsEnv(ProstheticsEnv): 77 | 78 | def __init__(self, visualize=False, integrator_accuracy=1e-4, difficulty=0, seed=0, frame_skip=0): 79 | self.project_mode = PROJ_FULL 80 | super(MyProstheticsEnv, self).__init__( 81 | visualize=visualize, 82 | integrator_accuracy=integrator_accuracy, 83 | difficulty=difficulty, 84 | seed=seed) 85 | if difficulty == 0: 86 | self.time_limit = 600 # longer time limit to reduce likelihood of diving strategy 87 | self.spec.timestep_limit = self.time_limit 88 | np.random.seed(seed) 89 | self.frame_times = deque(maxlen=100) 90 | self.frame_count = 0 91 | self.frame_skip = frame_skip 92 | self.debug = False 93 | 94 | def get_observation(self): 95 | state_desc = self.get_state_desc() 96 | return project_obs(state_desc, proj=self.project_mode, prosthetic=self.prosthetic) 97 | 98 | def get_observation_space_size(self): 99 | if self.prosthetic: 100 | if self.project_mode == PROJ_SIMPLE: 101 | return 106 102 | elif self.project_mode == PROJ_FULL: 103 | return 181 104 | else: 105 | return 157 106 | return 167 107 | 108 | def is_done(self): 109 | state_desc = self.get_state_desc() 110 | return state_desc["body_pos"]["pelvis"][1] < 0.65 111 | 112 | def my_reward_round1(self): 113 | state_desc = self.get_state_desc() 114 | prev_state_desc = self.get_prev_state_desc() 115 | if not prev_state_desc: 116 | return 0 117 | 118 | penalty = 0. 119 | penalty += (state_desc["body_vel"]["pelvis"][0] - 3.0) ** 2 120 | penalty += (state_desc["body_vel"]["pelvis"][2]) ** 2 121 | penalty += np.sum(np.array(self.osim_model.get_activations()) ** 2) * 0.001 122 | if state_desc["body_pos"]["pelvis"][1] < 0.70: 123 | penalty += 10 # penalize falling more 124 | 125 | # Reward for not falling 126 | reward = 10.0 127 | 128 | return reward - penalty 129 | 130 | def my_reward_round2(self): 131 | state_desc = self.get_state_desc() 132 | prev_state_desc = self.get_prev_state_desc() 133 | penalty = 0 134 | 135 | # Small penalty for too much activation (cost of transport) 136 | penalty += np.sum(np.array(self.osim_model.get_activations()) ** 2) * 0.001 137 | 138 | # Big penalty for not matching the vector on the X,Z projection. 
139 | # No penalty for the vertical axis 140 | penalty += (state_desc["body_vel"]["pelvis"][0] - state_desc["target_vel"][0]) ** 2 141 | penalty += (state_desc["body_vel"]["pelvis"][2] - state_desc["target_vel"][2]) ** 2 142 | if state_desc["body_pos"]["pelvis"][1] < 0.70: 143 | penalty += 10 # penalize falling more 144 | 145 | # Reward for not falling 146 | reward = 10.0 147 | 148 | return reward - penalty 149 | 150 | def reward_round1(self): 151 | state_desc = self.get_state_desc() 152 | prev_state_desc = self.get_prev_state_desc() 153 | if not prev_state_desc: 154 | return 0 155 | return 9.0 - (state_desc["body_vel"]["pelvis"][0] - 3.0)**2 156 | 157 | def reward_round2(self): 158 | state_desc = self.get_state_desc() 159 | prev_state_desc = self.get_prev_state_desc() 160 | penalty = 0 161 | 162 | # Small penalty for too much activation (cost of transport) 163 | penalty += np.sum(np.array(self.osim_model.get_activations()) ** 2) * 0.001 164 | 165 | # Big penalty for not matching the vector on the X,Z projection. 166 | # No penalty for the vertical axis 167 | penalty += (state_desc["body_vel"]["pelvis"][0] - state_desc["target_vel"][0]) ** 2 168 | penalty += (state_desc["body_vel"]["pelvis"][2] - state_desc["target_vel"][2]) ** 2 169 | 170 | # Reward for not falling 171 | reward = 10.0 172 | 173 | return reward - penalty 174 | 175 | def reward(self): 176 | if self.difficulty == 0: 177 | return self.reward_round1() 178 | return self.reward_round2() 179 | 180 | def my_reward(self): 181 | if self.difficulty == 0: 182 | return self.my_reward_round1() 183 | return self.my_reward_round2() 184 | 185 | def step(self, action, project=True): 186 | reward = 0. 187 | rewardb = 0. 188 | done = False 189 | 190 | if self.frame_skip: 191 | num_steps = self.frame_skip 192 | else: 193 | num_steps = 1 194 | 195 | for _ in range(num_steps): 196 | self.prev_state_desc = self.get_state_desc() 197 | 198 | start_time = time.perf_counter() 199 | self.osim_model.actuate(action) 200 | self.osim_model.integrate() 201 | step_time = time.perf_counter() - start_time 202 | 203 | # track some step stats across resets 204 | self.frame_times.append(step_time) 205 | self.frame_count += 1 206 | 207 | if self.debug and self.frame_count % 1000 == 0: 208 | frame_mean = np.mean(self.frame_times) 209 | frame_min = np.min(self.frame_times) 210 | frame_max = np.max(self.frame_times) 211 | print('Steps {}, duration mean, min, max: {:.3f}, {:.3f}, {:.3f}'.format( 212 | self.frame_count, frame_mean, frame_min, frame_max)) 213 | 214 | done = self.is_done() or self.osim_model.istep >= self.spec.timestep_limit 215 | if step_time > 15.: 216 | reward += -10 217 | done = True 218 | else: 219 | reward += self.my_reward() 220 | rewardb += self.reward() 221 | 222 | if done: 223 | break 224 | 225 | if project: 226 | obs = self.get_observation() 227 | else: 228 | obs = self.get_state_desc() 229 | 230 | return [obs, reward, done, {'rb': rewardb}] 231 | 232 | def seed(self, seed=None): 233 | random.seed(seed) 234 | np.random.seed(seed) 235 | -------------------------------------------------------------------------------- /replay_storage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import math 4 | import random 5 | from collections import deque 6 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 7 | 8 | 9 | class ReplayStorage: 10 | def __init__( 11 | self, max_steps, num_processes, gamma, prio_alpha, 12 | obs_shape, action_space, 
recurrent_hidden_state_size, 13 | device): 14 | self.max_steps = max_steps 15 | self.num_processes = num_processes 16 | self.gamma = gamma 17 | self.device = device 18 | 19 | # stored episode data 20 | self.obs = torch.zeros(max_steps, *obs_shape) 21 | self.recurrent_hidden_states = torch.zeros(max_steps, recurrent_hidden_state_size) 22 | self.returns = torch.zeros(max_steps, 1) 23 | if action_space.__class__.__name__ == 'Discrete': 24 | self.actions = torch.zeros(max_steps, 1).long() 25 | else: 26 | self.actions = torch.zeros(max_steps, action_space.shape[0]) 27 | self.masks = torch.ones(max_steps, 1) 28 | self.next_idx = 0 29 | self.num_steps = 0 30 | 31 | # store (full) episode stats 32 | self.episode_step_count = 0 33 | self.episode_rewards = deque() 34 | self.episode_steps = deque() 35 | 36 | # currently running (accumulating) episodes 37 | self.running_episodes = [[] for _ in range(num_processes)] 38 | 39 | if prio_alpha > 0: 40 | """ 41 | Sampling priority is enabled if prio_alpha > 0 42 | Priority algorithm ripped from OpenAI Baselines 43 | https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 44 | """ 45 | self.prio_alpha = prio_alpha 46 | tree_capacity = 1 << math.ceil(math.log2(self.max_steps)) 47 | self.prio_sum_tree = SumSegmentTree(tree_capacity) 48 | self.prio_min_tree = MinSegmentTree(tree_capacity) 49 | self.prio_max = 1.0 50 | else: 51 | self.prio_alpha = 0 52 | 53 | def _process_rewards(self, trajectory): 54 | has_positive = False 55 | reward_sum = 0. 56 | r = 0. 57 | for t in trajectory[::-1]: 58 | reward = t['reward'] 59 | reward_sum += reward 60 | if reward > (0. + 1e-5): 61 | has_positive = True 62 | r = reward + self.gamma*r 63 | t['return'] = r 64 | return has_positive, reward_sum 65 | 66 | def _add_trajectory(self, trajectory): 67 | has_positive, reward_sum = self._process_rewards(trajectory) 68 | if not has_positive: 69 | return 70 | trajectory_len = len(trajectory) 71 | prev_idx = self.next_idx 72 | for transition in trajectory: 73 | self.obs[self.next_idx].copy_(transition['obs']) 74 | self.recurrent_hidden_states[self.next_idx].copy_(transition['rhs']) 75 | self.actions[self.next_idx].copy_(transition['action']) 76 | self.returns[self.next_idx].copy_(transition['return']) 77 | self.masks[self.next_idx] = 1.0 78 | prev_idx = self.next_idx 79 | if self.prio_alpha: 80 | self.prio_sum_tree[self.next_idx] = self.prio_max ** self.prio_alpha 81 | self.prio_min_tree[self.next_idx] = self.prio_max ** self.prio_alpha 82 | self.next_idx = (self.next_idx + 1) % self.max_steps 83 | self.num_steps = min(self.max_steps, self.num_steps + 1) 84 | self.masks[prev_idx] = 0.0 85 | 86 | # update stats of stored full trajectories (episodes) 87 | while self.episode_step_count + trajectory_len > self.max_steps: 88 | steps_popped = self.episode_steps.popleft() 89 | self.episode_rewards.popleft() 90 | self.episode_step_count -= steps_popped 91 | self.episode_step_count += trajectory_len 92 | self.episode_steps.append(trajectory_len) 93 | self.episode_rewards.append(reward_sum) 94 | 95 | def _sample_proportional(self, sample_size): 96 | res = [] 97 | for _ in range(sample_size): 98 | mass = random.random() * self.prio_sum_tree.sum(0, self.num_steps - 1) 99 | idx = self.prio_sum_tree.find_prefixsum_idx(mass) 100 | res.append(idx) 101 | return res 102 | 103 | def insert(self, obs, rhs, actions, rewards, dones): 104 | for n in range(self.num_processes): 105 | self.running_episodes[n].append(dict( 106 | obs=obs[n].clone(), 107 | rhs=rhs[n].clone(), 108 | 
action=actions[n].clone(), 109 | reward=rewards[n].clone() 110 | )) 111 | for n, done in enumerate(dones): 112 | if done: 113 | self._add_trajectory(self.running_episodes[n]) 114 | self.running_episodes[n] = [] 115 | 116 | def update_priorities(self, indices, priorities): 117 | if not self.prio_alpha: 118 | return 119 | 120 | """Update priorities of sampled transitions. 121 | sets priority of transition at index indices[i] in buffer 122 | to priorities[i]. 123 | Parameters 124 | ---------- 125 | indices: [int] 126 | List of indices of sampled transitions 127 | priorities: [float] 128 | List of updated priorities corresponding to 129 | transitions at the sampled indices. 130 | """ 131 | assert len(indices) == len(priorities) 132 | for idx, priority in zip(indices, priorities): 133 | priority = max(priority, 1e-6) 134 | assert priority > 0 135 | assert 0 <= idx < self.num_steps 136 | self.prio_sum_tree[idx] = priority ** self.prio_alpha 137 | self.prio_min_tree[idx] = priority ** self.prio_alpha 138 | 139 | self.prio_max = max(self.prio_max, priority) 140 | 141 | def feed_forward_generator(self, batch_size, num_batches=None, beta=0.): 142 | """Generate batches of sampled experiences. 143 | 144 | Parameters 145 | ---------- 146 | batch_size: int 147 | Size of each sampled batch 148 | num_batches: int 149 | Number of batches to sample 150 | beta: float 151 | To what degree to use importance weights 152 | (0 - no corrections, 1 - full correction) 153 | """ 154 | 155 | batch_count = 0 156 | sample_size = num_batches * batch_size or self.num_steps 157 | 158 | if self.prio_alpha > 0: 159 | indices = self._sample_proportional(sample_size) 160 | if beta > 0: 161 | # compute importance sampling weights to correct for the 162 | # bias introduced by sampling in a non-uniform manner 163 | weights = [] 164 | p_min = self.prio_min_tree.min() / self.prio_sum_tree.sum() 165 | max_weight = (p_min * self.num_steps) ** (-beta) 166 | for i in indices: 167 | p_sample = self.prio_sum_tree[i] / self.prio_sum_tree.sum() 168 | weight = (p_sample * self.num_steps) ** (-beta) 169 | weights.append(weight / max_weight) 170 | weights = torch.tensor(weights, dtype=torch.float32).unsqueeze(1) 171 | else: 172 | weights = torch.ones((len(indices), 1), dtype=torch.float32) 173 | else: 174 | if sample_size * 3 < self.num_steps: 175 | indices = random.sample(range(self.num_steps), sample_size) 176 | else: 177 | indices = np.random.permutation(self.num_steps)[:sample_size] 178 | weights = None 179 | 180 | for si in range(0, len(indices), batch_size): 181 | indices_batch = indices[si:min(len(indices), si + batch_size)] 182 | if len(indices_batch) < batch_size: 183 | return 184 | 185 | weights_batch = None if weights is None else \ 186 | weights[si:min(len(indices), si + batch_size)].to(self.device) 187 | 188 | obs_batch = self.obs[indices_batch].to(self.device) 189 | recurrent_hidden_states_batch = self.recurrent_hidden_states[indices_batch].to(self.device) 190 | actions_batch = self.actions[indices_batch].to(self.device) 191 | returns_batch = self.returns[indices_batch].to(self.device) 192 | masks_batch = self.masks[indices_batch].to(self.device) 193 | 194 | yield obs_batch, recurrent_hidden_states_batch, actions_batch, returns_batch, \ 195 | masks_batch, weights_batch, indices_batch 196 | 197 | batch_count += 1 198 | if num_batches and batch_count >= num_batches: 199 | return 200 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | gym 2 | matplotlib 3 | pybullet 4 | -------------------------------------------------------------------------------- /rollout_storage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 3 | 4 | 5 | def _flatten_helper(T, N, _tensor): 6 | return _tensor.view(T * N, *_tensor.size()[2:]) 7 | 8 | 9 | class RolloutStorage(object): 10 | def __init__(self, num_steps, num_processes, obs_shape, action_space, recurrent_hidden_state_size): 11 | self.obs = torch.zeros(num_steps + 1, num_processes, *obs_shape) 12 | self.recurrent_hidden_states = torch.zeros(num_steps + 1, num_processes, recurrent_hidden_state_size) 13 | self.rewards = torch.zeros(num_steps, num_processes, 1) 14 | self.value_preds = torch.zeros(num_steps + 1, num_processes, 1) 15 | self.returns = torch.zeros(num_steps + 1, num_processes, 1) 16 | self.action_log_probs = torch.zeros(num_steps, num_processes, 1) 17 | if action_space.__class__.__name__ == 'Discrete': 18 | action_shape = 1 19 | else: 20 | action_shape = action_space.shape[0] 21 | self.actions = torch.zeros(num_steps, num_processes, action_shape) 22 | if action_space.__class__.__name__ == 'Discrete': 23 | self.actions = self.actions.long() 24 | self.masks = torch.ones(num_steps + 1, num_processes, 1) 25 | 26 | self.num_steps = num_steps 27 | self.step = 0 28 | 29 | def to(self, device): 30 | self.obs = self.obs.to(device) 31 | self.recurrent_hidden_states = self.recurrent_hidden_states.to(device) 32 | self.rewards = self.rewards.to(device) 33 | self.value_preds = self.value_preds.to(device) 34 | self.returns = self.returns.to(device) 35 | self.action_log_probs = self.action_log_probs.to(device) 36 | self.actions = self.actions.to(device) 37 | self.masks = self.masks.to(device) 38 | 39 | def insert(self, obs, recurrent_hidden_states, actions, action_log_probs, value_preds, rewards, masks): 40 | self.obs[self.step + 1].copy_(obs) 41 | self.recurrent_hidden_states[self.step + 1].copy_(recurrent_hidden_states) 42 | self.actions[self.step].copy_(actions) 43 | self.action_log_probs[self.step].copy_(action_log_probs) 44 | self.value_preds[self.step].copy_(value_preds) 45 | self.rewards[self.step].copy_(rewards) 46 | self.masks[self.step + 1].copy_(masks) 47 | 48 | self.step = (self.step + 1) % self.num_steps 49 | 50 | def after_update(self): 51 | self.obs[0].copy_(self.obs[-1]) 52 | self.recurrent_hidden_states[0].copy_(self.recurrent_hidden_states[-1]) 53 | self.masks[0].copy_(self.masks[-1]) 54 | 55 | def compute_returns(self, next_value, use_gae, gamma, tau): 56 | if use_gae: 57 | self.value_preds[-1] = next_value 58 | gae = 0 59 | for step in reversed(range(self.rewards.size(0))): 60 | delta = self.rewards[step] + gamma * self.value_preds[step + 1] * self.masks[step + 1] - self.value_preds[step] 61 | gae = delta + gamma * tau * self.masks[step + 1] * gae 62 | self.returns[step] = gae + self.value_preds[step] 63 | else: 64 | self.returns[-1] = next_value 65 | for step in reversed(range(self.rewards.size(0))): 66 | self.returns[step] = self.returns[step + 1] * \ 67 | gamma * self.masks[step + 1] + self.rewards[step] 68 | 69 | 70 | def feed_forward_generator(self, advantages, num_mini_batch): 71 | num_steps, num_processes = self.rewards.size()[0:2] 72 | batch_size = num_processes * num_steps 73 | assert batch_size >= num_mini_batch, ( 74 | "PPO requires the number of 
processes ({}) " 75 | "* number of steps ({}) = {} " 76 | "to be greater than or equal to the number of PPO mini batches ({})." 77 | "".format(num_processes, num_steps, num_processes * num_steps, num_mini_batch)) 78 | mini_batch_size = batch_size // num_mini_batch 79 | sampler = BatchSampler(SubsetRandomSampler(range(batch_size)), mini_batch_size, drop_last=False) 80 | for indices in sampler: 81 | obs_batch = self.obs[:-1].view(-1, *self.obs.size()[2:])[indices] 82 | recurrent_hidden_states_batch = self.recurrent_hidden_states[:-1].view(-1, 83 | self.recurrent_hidden_states.size(-1))[indices] 84 | actions_batch = self.actions.view(-1, self.actions.size(-1))[indices] 85 | return_batch = self.returns[:-1].view(-1, 1)[indices] 86 | masks_batch = self.masks[:-1].view(-1, 1)[indices] 87 | old_action_log_probs_batch = self.action_log_probs.view(-1, 1)[indices] 88 | adv_targ = advantages.view(-1, 1)[indices] 89 | 90 | yield obs_batch, recurrent_hidden_states_batch, actions_batch, \ 91 | return_batch, masks_batch, old_action_log_probs_batch, adv_targ 92 | 93 | def recurrent_generator(self, advantages, num_mini_batch): 94 | num_processes = self.rewards.size(1) 95 | assert num_processes >= num_mini_batch, ( 96 | "PPO requires the number of processes ({}) " 97 | "to be greater than or equal to the number of " 98 | "PPO mini batches ({}).".format(num_processes, num_mini_batch)) 99 | num_envs_per_batch = num_processes // num_mini_batch 100 | perm = torch.randperm(num_processes) 101 | for start_ind in range(0, num_processes, num_envs_per_batch): 102 | obs_batch = [] 103 | recurrent_hidden_states_batch = [] 104 | actions_batch = [] 105 | return_batch = [] 106 | masks_batch = [] 107 | old_action_log_probs_batch = [] 108 | adv_targ = [] 109 | 110 | for offset in range(num_envs_per_batch): 111 | ind = perm[start_ind + offset] 112 | obs_batch.append(self.obs[:-1, ind]) 113 | recurrent_hidden_states_batch.append(self.recurrent_hidden_states[0:1, ind]) 114 | actions_batch.append(self.actions[:, ind]) 115 | return_batch.append(self.returns[:-1, ind]) 116 | masks_batch.append(self.masks[:-1, ind]) 117 | old_action_log_probs_batch.append(self.action_log_probs[:, ind]) 118 | adv_targ.append(advantages[:, ind]) 119 | 120 | T, N = self.num_steps, num_envs_per_batch 121 | # These are all tensors of size (T, N, -1) 122 | obs_batch = torch.stack(obs_batch, 1) 123 | actions_batch = torch.stack(actions_batch, 1) 124 | return_batch = torch.stack(return_batch, 1) 125 | masks_batch = torch.stack(masks_batch, 1) 126 | old_action_log_probs_batch = torch.stack(old_action_log_probs_batch, 1) 127 | adv_targ = torch.stack(adv_targ, 1) 128 | 129 | # States is just a (N, -1) tensor 130 | recurrent_hidden_states_batch = torch.stack(recurrent_hidden_states_batch, 1).view(N, -1) 131 | 132 | # Flatten the (T, N, ...) tensors to (T * N, ...) 
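# (Added note, not in the original.) _flatten_helper merges the time and
# process dimensions so every (t, n) sample becomes an independent row:
#   >>> x = torch.zeros(128, 4, 19)        # (T=128, N=4); feature dim 19 is arbitrary
#   >>> _flatten_helper(128, 4, x).shape
#   torch.Size([512, 19])
# The recurrent hidden states above stay (N, -1) because only the initial
# hidden state of each sequence is needed to unroll the GRU over the T steps.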
133 | obs_batch = _flatten_helper(T, N, obs_batch) 134 | actions_batch = _flatten_helper(T, N, actions_batch) 135 | return_batch = _flatten_helper(T, N, return_batch) 136 | masks_batch = _flatten_helper(T, N, masks_batch) 137 | old_action_log_probs_batch = _flatten_helper(T, N, \ 138 | old_action_log_probs_batch) 139 | adv_targ = _flatten_helper(T, N, adv_targ) 140 | 141 | yield obs_batch, recurrent_hidden_states_batch, actions_batch, \ 142 | return_batch, masks_batch, old_action_log_probs_batch, adv_targ 143 | -------------------------------------------------------------------------------- /submit.py: -------------------------------------------------------------------------------- 1 | import opensim as osim 2 | from osim.http.client import Client 3 | from osim.env import ProstheticsEnv 4 | import numpy as np 5 | import argparse 6 | import os 7 | import gym 8 | import torch 9 | 10 | from envs import VecPyTorch, make_vec_envs 11 | from utils import get_render_func, get_vec_normalize 12 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 13 | from envs import VecNormalize, VecPyTorch 14 | from my_prosthetics_env import MyProstheticsEnv, project_obs 15 | 16 | 17 | class StopTheSim(Exception): 18 | pass 19 | 20 | 21 | class ClientWrapper(MyProstheticsEnv): 22 | 23 | def __init__(self, client, token): 24 | super(ClientWrapper, self).__init__( 25 | visualize=False, 26 | integrator_accuracy=1e-4, 27 | difficulty=0, 28 | seed=42) 29 | self.client = client 30 | self._cached_observation = self.client.env_create(token, env_id="ProstheticsEnv") 31 | print(self._cached_observation) 32 | self.step_count = 0 33 | 34 | def step(self, action, project=True): 35 | print('Step: ', self.step_count, end='. ') 36 | obs, reward, done, info = self.client.env_step(action.tolist()) 37 | if obs is not None and 'body_pos' in obs: 38 | print('Pelvis: ', obs['body_pos']['pelvis']) 39 | elif obs is None: 40 | print('Invalid obs.') 41 | return None, None, True, None 42 | self.step_count += 1 43 | proj_obs = project_obs(obs, self.project_mode, self.prosthetic) 44 | return proj_obs, reward, done, info 45 | 46 | def reset(self, project=True): 47 | print('Reset') 48 | if self._cached_observation is not None: 49 | print('Returning cached') 50 | obs = self._cached_observation 51 | self._cached_observation = None 52 | else: 53 | obs = self.client.env_reset() 54 | self.step_count = 0 55 | if obs is None: 56 | raise StopTheSim 57 | return project_obs(obs, self.project_mode, self.prosthetic) 58 | 59 | def close(self): 60 | return self.client.env_close() 61 | 62 | 63 | # Command line parameters 64 | parser = argparse.ArgumentParser(description='Submit the result to crowdAI') 65 | parser.add_argument('--token', dest='token', action='store', required=True) 66 | parser.add_argument('--env-name', default='PongNoFrameskip-v4', 67 | help='environment to train on (default: PongNoFrameskip-v4)') 68 | parser.add_argument('--load-path', default='', 69 | help='directory to save agent logs (default: ') 70 | args = parser.parse_args() 71 | 72 | remote_base = 'http://grader.crowdai.org:1729' # Submission to Round-1 73 | # remote_base = 'http://grader.crowdai.org:1730' # Submission to Round-2 74 | client = Client(remote_base) 75 | 76 | 77 | def create_env(): 78 | env = ClientWrapper(client=client, token=args.token) 79 | return env 80 | 81 | env = DummyVecEnv([create_env]) 82 | env = VecNormalize(env, ret=False) 83 | env = VecPyTorch(env, 'cpu') 84 | 85 | # We need to use the same statistics for normalization as used in training 86 | 
actor_critic, ob_rms = torch.load(args.load_path) 87 | actor_critic.eval() 88 | 89 | vec_norm = get_vec_normalize(env) 90 | if vec_norm is not None: 91 | vec_norm.eval() 92 | vec_norm.ob_rms = ob_rms 93 | 94 | recurrent_hidden_states = torch.zeros(1, actor_critic.recurrent_hidden_state_size) 95 | masks = torch.zeros(1, 1) 96 | 97 | # Create environment 98 | 99 | ref_env = ProstheticsEnv() 100 | 101 | obs = env.reset() 102 | 103 | # Run a single step 104 | # The grader runs 3 simulations of at most 1000 steps each. We stop after the last one 105 | count = 0 106 | num_steps = 0 107 | while True: 108 | with torch.no_grad(): 109 | value, action, _, recurrent_hidden_states = actor_critic.act( 110 | obs, recurrent_hidden_states, masks, deterministic=True) 111 | 112 | clipped_action = action 113 | if isinstance(ref_env.action_space, gym.spaces.Box): 114 | clipped_action = torch.max(torch.min( 115 | clipped_action, torch.from_numpy(ref_env.action_space.high)), 116 | torch.from_numpy(ref_env.action_space.low)) 117 | 118 | try: 119 | obs, reward, done, info = env.step(clipped_action) 120 | num_steps += 1 121 | if done: 122 | print('Done after %d steps.' % num_steps) 123 | num_steps = 0 124 | count += 1 125 | except StopTheSim: 126 | print('Finishing.') 127 | break 128 | 129 | client.submit() 130 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | from envs import VecNormalize 6 | 7 | 8 | # Get a render function 9 | def get_render_func(venv): 10 | if hasattr(venv, 'envs'): 11 | return venv.envs[0].render 12 | elif hasattr(venv, 'venv'): 13 | return get_render_func(venv.venv) 14 | elif hasattr(venv, 'env'): 15 | return get_render_func(venv.env) 16 | 17 | return None 18 | 19 | 20 | def get_vec_normalize(venv): 21 | if isinstance(venv, VecNormalize): 22 | return venv 23 | elif hasattr(venv, 'venv'): 24 | return get_vec_normalize(venv.venv) 25 | 26 | return None 27 | 28 | 29 | # Necessary for my KFAC implementation. 
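# (Added sketch.) AddBias keeps the bias as its own tiny module rather than
# folding it into nn.Linear, so the KFAC optimizer can treat it as a separate
# layer. Shape-wise it simply broadcasts a learned vector over the batch:
#   >>> logstd = AddBias(torch.zeros(19))      # 19 is an arbitrary action dim
#   >>> logstd(torch.zeros(8, 19)).shape
#   torch.Size([8, 19])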
30 | class AddBias(nn.Module): 31 | def __init__(self, bias): 32 | super(AddBias, self).__init__() 33 | self._bias = nn.Parameter(bias.unsqueeze(1)) 34 | 35 | def forward(self, x): 36 | if x.dim() == 2: 37 | bias = self._bias.t().view(1, -1) 38 | else: 39 | bias = self._bias.t().view(1, -1, 1, 1) 40 | 41 | return x + bias 42 | 43 | 44 | def init(module, weight_init, bias_init, gain=1): 45 | weight_init(module.weight.data, gain=gain) 46 | bias_init(module.bias.data) 47 | return module 48 | 49 | 50 | # https://github.com/openai/baselines/blob/master/baselines/common/tf_util.py#L87 51 | def init_normc_(weight, gain=1): 52 | weight.normal_(0, 1) 53 | weight *= gain / torch.sqrt(weight.pow(2).sum(1, keepdim=True)) 54 | 55 | 56 | 57 | class _Schedule: 58 | def __init__(self, initial_value, gamma=0.1, last_epoch=-1): 59 | self.gamma = gamma 60 | self.initial_value = initial_value 61 | self.last_epoch = last_epoch 62 | self.step() 63 | 64 | def get(self): 65 | return self.initial_value 66 | 67 | def step(self, epoch=None): 68 | if epoch is None: 69 | self.last_epoch += 1 70 | else: 71 | self.last_epoch = epoch 72 | return self.get() 73 | 74 | 75 | class StepSchedule(_Schedule): 76 | 77 | def __init__(self, initial_value, step_size, gamma=0.1, last_epoch=-1): 78 | self.step_size = step_size 79 | super(StepSchedule, self).__init__(initial_value, gamma, last_epoch) 80 | 81 | def get(self): 82 | return self.initial_value * self.gamma ** (self.last_epoch // self.step_size) 83 | 84 | 85 | class ExpSchedule(_Schedule): 86 | 87 | def __init__(self, initial_value, step_size, gamma=0.1, last_epoch=-1): 88 | self.step_size = step_size 89 | super(ExpSchedule, self).__init__(initial_value, gamma, last_epoch) 90 | 91 | def get(self): 92 | return self.initial_value * self.gamma ** (self.last_epoch / self.step_size) 93 | 94 | 95 | class NatExpSchedule(_Schedule): 96 | 97 | def __init__(self, initial_value, step_size, gamma=0.1, last_epoch=-1): 98 | self.step_size = step_size 99 | super(NatExpSchedule, self).__init__(initial_value, gamma, last_epoch) 100 | 101 | def get(self): 102 | return self.initial_value * math.exp(-self.gamma * (self.last_epoch / self.step_size)) 103 | -------------------------------------------------------------------------------- /visualize.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/emansim/baselines-mansimov/blob/master/baselines/a2c/visualize_atari.py 2 | # and https://github.com/emansim/baselines-mansimov/blob/master/baselines/a2c/load.py 3 | # Thanks to the author and OpenAI team! 
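# (Added usage note.) Typical standalone use, assuming a visdom server is
# already running and *.monitor.csv logs exist under the log folder:
#   >>> from visdom import Visdom
#   >>> viz = Visdom()
#   >>> win = visdom_plot(viz, None, '/tmp/gym/', 'Prosthetics', 'ppo', 10e6)
# Passing the returned window handle back as `win` updates the same plot on
# subsequent calls instead of opening a new one.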
4 | 5 | import glob 6 | import json 7 | import os 8 | 9 | import matplotlib 10 | matplotlib.use('Agg') 11 | import matplotlib.pyplot as plt 12 | plt.switch_backend('agg') 13 | import numpy as np 14 | from scipy.signal import medfilt 15 | matplotlib.rcParams.update({'font.size': 8}) 16 | 17 | 18 | def smooth_reward_curve(x, y): 19 | # Halfwidth of our smoothing convolution 20 | halfwidth = min(31, int(np.ceil(len(x) / 30))) 21 | k = halfwidth 22 | xsmoo = x[k:-k] 23 | ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='valid') / \ 24 | np.convolve(np.ones_like(y), np.ones(2 * k + 1), mode='valid') 25 | downsample = max(int(np.floor(len(xsmoo) / 1e3)), 1) 26 | return xsmoo[::downsample], ysmoo[::downsample] 27 | 28 | 29 | def fix_point(x, y, interval): 30 | np.insert(x, 0, 0) 31 | np.insert(y, 0, 0) 32 | 33 | fx, fy = [], [] 34 | pointer = 0 35 | 36 | ninterval = int(max(x) / interval + 1) 37 | 38 | for i in range(ninterval): 39 | tmpx = interval * i 40 | 41 | while pointer + 1 < len(x) and tmpx > x[pointer + 1]: 42 | pointer += 1 43 | 44 | if pointer + 1 < len(x): 45 | alpha = (y[pointer + 1] - y[pointer]) / \ 46 | (x[pointer + 1] - x[pointer]) 47 | tmpy = y[pointer] + alpha * (tmpx - x[pointer]) 48 | fx.append(tmpx) 49 | fy.append(tmpy) 50 | 51 | return fx, fy 52 | 53 | 54 | def load_data(indir, smooth, bin_size): 55 | datas = [] 56 | infiles = glob.glob(os.path.join(indir, '*.monitor.csv')) 57 | 58 | for inf in infiles: 59 | with open(inf, 'r') as f: 60 | f.readline() 61 | f.readline() 62 | for line in f: 63 | tmp = line.split(',') 64 | t_time = float(tmp[2]) 65 | tmp = [t_time, int(tmp[1]), float(tmp[0])] 66 | datas.append(tmp) 67 | 68 | datas = sorted(datas, key=lambda d_entry: d_entry[0]) 69 | result = [] 70 | timesteps = 0 71 | for i in range(len(datas)): 72 | result.append([timesteps, datas[i][-1]]) 73 | timesteps += datas[i][1] 74 | 75 | if len(result) < bin_size: 76 | return [None, None] 77 | 78 | x, y = np.array(result)[:, 0], np.array(result)[:, 1] 79 | 80 | if smooth == 1: 81 | x, y = smooth_reward_curve(x, y) 82 | 83 | if smooth == 2: 84 | y = medfilt(y, kernel_size=9) 85 | 86 | x, y = fix_point(x, y, bin_size) 87 | return [x, y] 88 | 89 | 90 | color_defaults = [ 91 | '#1f77b4', # muted blue 92 | '#ff7f0e', # safety orange 93 | '#2ca02c', # cooked asparagus green 94 | '#d62728', # brick red 95 | '#9467bd', # muted purple 96 | '#8c564b', # chestnut brown 97 | '#e377c2', # raspberry yogurt pink 98 | '#7f7f7f', # middle gray 99 | '#bcbd22', # curry yellow-green 100 | '#17becf' # blue-teal 101 | ] 102 | 103 | 104 | def visdom_plot(viz, win, folder, game, name, num_steps, bin_size=100, smooth=1): 105 | tx, ty = load_data(folder, smooth, bin_size) 106 | if tx is None or ty is None: 107 | return win 108 | 109 | fig = plt.figure() 110 | plt.plot(tx, ty, label="{}".format(name)) 111 | 112 | tick_fractions = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0]) 113 | ticks = tick_fractions * num_steps 114 | tick_names = ["{:.0e}".format(tick) for tick in ticks] 115 | plt.xticks(ticks, tick_names) 116 | plt.xlim(0, num_steps * 1.01) 117 | 118 | plt.xlabel('Number of Timesteps') 119 | plt.ylabel('Rewards') 120 | 121 | plt.title(game) 122 | plt.legend(loc=4) 123 | plt.show() 124 | plt.draw() 125 | 126 | image = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 127 | image = image.reshape(fig.canvas.get_width_height()[::-1] + (3, )) 128 | plt.close(fig) 129 | 130 | # Show it in visdom 131 | image = np.transpose(image, (2, 0, 1)) 132 | return viz.image(image, win=win) 133 | 134 | 135 | if 
__name__ == "__main__": 136 | from visdom import Visdom 137 | viz = Visdom() 138 | visdom_plot(viz, None, '/tmp/gym/', 'BreakOut', 'a2c', 10e6, bin_size=100, smooth=1)  # num_steps is a required positional arg; 10e6 is an assumed frame budget 139 | --------------------------------------------------------------------------------