├── .gitignore ├── LICENSE ├── README.md ├── algo ├── __init__.py ├── a2c_acktr.py ├── kfac.py ├── ppo.py └── sil.py ├── arguments.py ├── distributions.py ├── enjoy.py ├── envs.py ├── imgs ├── a2c_beamrider.png ├── a2c_breakout.png ├── a2c_qbert.png ├── a2c_seaquest.png ├── acktr_beamrider.png ├── acktr_breakout.png ├── acktr_qbert.png ├── acktr_seaquest.png ├── ppo_halfcheetah.png ├── ppo_hopper.png ├── ppo_reacher.png └── ppo_walker.png ├── main.py ├── model.py ├── monitor.py ├── my_prosthetics_env.py ├── replay_storage.py ├── requirements.txt ├── rollout_storage.py ├── submit.py ├── utils.py └── visualize.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | trained_models/ 104 | .fuse_hidden* 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Ilya Kostrikov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Reinforcement Learning for OpenSim Environments 2 | 3 | This is my code for experimenting with the CrowdAI Prosthetics Challenge (https://www.crowdai.org/challenges/nips-2018-ai-for-prosthetics-challenge). 4 | 5 | The reinforcement learning codebase is based upon Ilya Kostrikov's awesome work (https://github.com/ikostrikov/pytorch-a2c-ppo-acktr). 6 | 7 | As this is part of my learning process for continuous control with deep reinforcement learning, there are likely to be some issues. 8 | 9 | All experiments were performed with PPO, or PPO with self-imitation learning (SIL), using 16 vectorized environments running in parallel. Keep in mind that the simulator is VERY slow, so expect to wait a long time (days) for decent results -- even if you happen to have a kick-ass machine. 10 | 11 | Added: 12 | * support for the OpenSim Gym-like environments with Ilya's RL codebase 13 | * custom 'MyProstheticsEnv' wrapper to allow easier experimentation with different observation projections, rewards, and other aspects 14 | * frame skipping support in the custom env 15 | * a beta distribution experiment for continuous control in the range [0, 1] (http://ri.cmu.edu/wp-content/uploads/2017/06/thesis-Chou.pdf) 16 | * tweaks to logging/folders/checkpoints and model resume for easier experimentation and tracking of results 17 | * an implementation of SIL (https://arxiv.org/abs/1806.05635), one variant of off-policy replay combined with on-policy methods (see the sketch after this list). It speeds up initial training but starts to falter later; further experiments with the loss weight and the decay of the other SIL parameters are needed.
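For reference, here is a minimal, self-contained sketch of the self-imitation losses as they are computed in `algo/sil.py` (the helper name and standalone form are mine for illustration; the actual update additionally applies prioritized-replay importance weights, an entropy bonus, and an overall loss weight):

```python
import torch


def sil_losses(action_log_probs, values, returns):
    """Sketch of the self-imitation losses from algo/sil.py.

    Only transitions whose stored discounted return R exceeds the current
    value estimate V(s) contribute, via the clipped advantage max(R - V, 0).
    """
    clipped_adv = torch.clamp(returns - values, min=0.0)
    # policy term: increase log-prob only where the replayed return beat V(s)
    action_loss = -action_log_probs * clipped_adv.detach()
    # value term: pull V(s) up toward R, never down (advantage is clipped at 0)
    value_loss = 0.5 * clipped_adv.pow(2)
    # average over the samples that actually contributed (avg_loss_by_valid_samples)
    num_valid = torch.clamp((clipped_adv > 0).float().sum(), min=1.0)
    return action_loss.sum() / num_valid, value_loss.sum() / num_valid
```

In `algo/sil.py` the two terms are then combined as `value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef`, scaled by `loss_weight`, and the clipped advantages are written back to the replay buffer as priorities.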
18 | 19 | 20 | ## Get Started 21 | 22 | Set up your environment as per https://github.com/stanfordnmbl/osim-rl#getting-started 23 | 24 | ## Give It a Go 25 | 26 | Unclipped actions -- trains much faster, but it's not clear what OpenSim does with the out-of-range values: 27 | `main.py --algo ppo --env-name osim.Prosthetics --lr 7e-4 --num-steps 1000 --use-gae --ppo-epoch 10` 28 | 29 | With actions clipped to [0, 1] and shifted so the mean is at 0.5: 30 | 31 | `main.py --algo ppo --env-name osim.Prosthetics --lr 1e-3 --num-steps 1000 --use-gae --ppo-epoch 10 --clip-action --shift-action` 32 | 33 | With a beta distribution over [0, 1]: 34 | 35 | `main.py --algo ppo --env-name osim.Prosthetics --lr 1e-3 --num-steps 1000 --use-gae --ppo-epoch 10 --beta-dist` 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /algo/__init__.py: -------------------------------------------------------------------------------- 1 | from .a2c_acktr import A2C_ACKTR 2 | from .ppo import PPO 3 | from .sil import SIL -------------------------------------------------------------------------------- /algo/a2c_acktr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | 5 | from .kfac import KFACOptimizer 6 | 7 | 8 | class A2C_ACKTR(): 9 | def __init__(self, 10 | actor_critic, 11 | value_loss_coef, 12 | entropy_coef, 13 | lr=None, 14 | lr_schedule=None, 15 | eps=None, 16 | alpha=None, 17 | max_grad_norm=None, 18 | acktr=False): 19 | 20 | self.actor_critic = actor_critic 21 | self.acktr = acktr 22 | 23 | self.value_loss_coef = value_loss_coef 24 | self.entropy_coef = entropy_coef 25 | 26 | self.max_grad_norm = max_grad_norm 27 | 28 | if acktr: 29 | self.optimizer = KFACOptimizer(actor_critic) 30 | self.scheduler = None 31 | else: 32 | self.optimizer = optim.RMSprop( 33 | actor_critic.parameters(), lr, eps=eps, alpha=alpha) 34 | if lr_schedule is not None: 35 | self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, lr_schedule) 36 | else: 37 | self.scheduler = None 38 | 39 | def update(self, rollouts, update_index, _replay=None): 40 | if self.scheduler is not None: 41 | self.scheduler.step(update_index) 42 | 43 | obs_shape = rollouts.obs.size()[2:] 44 | action_shape = rollouts.actions.size()[-1] 45 | num_steps, num_processes, _ = rollouts.rewards.size() 46 | 47 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 48 | rollouts.obs[:-1].view(-1, *obs_shape), 49 | rollouts.recurrent_hidden_states[0].view(-1, self.actor_critic.recurrent_hidden_state_size), 50 | rollouts.masks[:-1].view(-1, 1), 51 | rollouts.actions.view(-1, action_shape)) 52 | 53 | values = values.view(num_steps, num_processes, 1) 54 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 55 | 56 | advantages = rollouts.returns[:-1] - values 57 | value_loss = advantages.pow(2).mean() 58 | 59 | action_loss = -(advantages.detach() * action_log_probs).mean() 60 | 61 | if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0: 62 | # Sampled fisher, see Martens 2014 63 | self.actor_critic.zero_grad() 64 | pg_fisher_loss = -action_log_probs.mean() 65 | 66 | value_noise = torch.randn(values.size()) 67 | if values.is_cuda: 68 | value_noise = value_noise.cuda() 69 | 70 | sample_values = values + value_noise 71 | vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean() 72 | 73 | fisher_loss = pg_fisher_loss + vf_fisher_loss 74 | self.optimizer.acc_stats = True 75 | 
fisher_loss.backward(retain_graph=True) 76 | self.optimizer.acc_stats = False 77 | 78 | self.optimizer.zero_grad() 79 | (value_loss * self.value_loss_coef + action_loss - 80 | dist_entropy * self.entropy_coef).backward() 81 | 82 | if self.acktr == False: 83 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 84 | self.max_grad_norm) 85 | 86 | self.optimizer.step() 87 | 88 | return value_loss.item(), action_loss.item(), dist_entropy.item() 89 | -------------------------------------------------------------------------------- /algo/kfac.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | 8 | from utils import AddBias 9 | 10 | # TODO: In order to make this code faster: 11 | # 1) Implement _extract_patches as a single cuda kernel 12 | # 2) Compute QR decomposition in a separate process 13 | # 3) Actually make a general KFAC optimizer so it fits PyTorch 14 | 15 | 16 | def _extract_patches(x, kernel_size, stride, padding): 17 | if padding[0] + padding[1] > 0: 18 | x = F.pad(x, (padding[1], padding[1], padding[0], 19 | padding[0])).data # Actually check dims 20 | x = x.unfold(2, kernel_size[0], stride[0]) 21 | x = x.unfold(3, kernel_size[1], stride[1]) 22 | x = x.transpose_(1, 2).transpose_(2, 3).contiguous() 23 | x = x.view( 24 | x.size(0), x.size(1), x.size(2), 25 | x.size(3) * x.size(4) * x.size(5)) 26 | return x 27 | 28 | 29 | def compute_cov_a(a, classname, layer_info, fast_cnn): 30 | batch_size = a.size(0) 31 | 32 | if classname == 'Conv2d': 33 | if fast_cnn: 34 | a = _extract_patches(a, *layer_info) 35 | a = a.view(a.size(0), -1, a.size(-1)) 36 | a = a.mean(1) 37 | else: 38 | a = _extract_patches(a, *layer_info) 39 | a = a.view(-1, a.size(-1)).div_(a.size(1)).div_(a.size(2)) 40 | elif classname == 'AddBias': 41 | is_cuda = a.is_cuda 42 | a = torch.ones(a.size(0), 1) 43 | if is_cuda: 44 | a = a.cuda() 45 | 46 | return a.t() @ (a / batch_size) 47 | 48 | 49 | def compute_cov_g(g, classname, layer_info, fast_cnn): 50 | batch_size = g.size(0) 51 | 52 | if classname == 'Conv2d': 53 | if fast_cnn: 54 | g = g.view(g.size(0), g.size(1), -1) 55 | g = g.sum(-1) 56 | else: 57 | g = g.transpose(1, 2).transpose(2, 3).contiguous() 58 | g = g.view(-1, g.size(-1)).mul_(g.size(1)).mul_(g.size(2)) 59 | elif classname == 'AddBias': 60 | g = g.view(g.size(0), g.size(1), -1) 61 | g = g.sum(-1) 62 | 63 | g_ = g * batch_size 64 | return g_.t() @ (g_ / g.size(0)) 65 | 66 | 67 | def update_running_stat(aa, m_aa, momentum): 68 | # Do the trick to keep aa unchanged and not create any additional tensors 69 | m_aa *= momentum / (1 - momentum) 70 | m_aa += aa 71 | m_aa *= (1 - momentum) 72 | 73 | 74 | class SplitBias(nn.Module): 75 | def __init__(self, module): 76 | super(SplitBias, self).__init__() 77 | self.module = module 78 | self.add_bias = AddBias(module.bias.data) 79 | self.module.bias = None 80 | 81 | def forward(self, input): 82 | x = self.module(input) 83 | x = self.add_bias(x) 84 | return x 85 | 86 | 87 | class KFACOptimizer(optim.Optimizer): 88 | def __init__(self, 89 | model, 90 | lr=0.25, 91 | momentum=0.9, 92 | stat_decay=0.99, 93 | kl_clip=0.001, 94 | damping=1e-2, 95 | weight_decay=0, 96 | fast_cnn=False, 97 | Ts=1, 98 | Tf=10): 99 | defaults = dict() 100 | 101 | def split_bias(module): 102 | for mname, child in module.named_children(): 103 | if hasattr(child, 'bias') and child.bias is not None: 104 | module._modules[mname] = 
SplitBias(child) 105 | else: 106 | split_bias(child) 107 | 108 | split_bias(model) 109 | 110 | super(KFACOptimizer, self).__init__(model.parameters(), defaults) 111 | 112 | self.known_modules = {'Linear', 'Conv2d', 'AddBias'} 113 | 114 | self.modules = [] 115 | self.grad_outputs = {} 116 | 117 | self.model = model 118 | self._prepare_model() 119 | 120 | self.steps = 0 121 | 122 | self.m_aa, self.m_gg = {}, {} 123 | self.Q_a, self.Q_g = {}, {} 124 | self.d_a, self.d_g = {}, {} 125 | 126 | self.momentum = momentum 127 | self.stat_decay = stat_decay 128 | 129 | self.lr = lr 130 | self.kl_clip = kl_clip 131 | self.damping = damping 132 | self.weight_decay = weight_decay 133 | 134 | self.fast_cnn = fast_cnn 135 | 136 | self.Ts = Ts 137 | self.Tf = Tf 138 | 139 | self.optim = optim.SGD( 140 | model.parameters(), 141 | lr=self.lr * (1 - self.momentum), 142 | momentum=self.momentum) 143 | 144 | def _save_input(self, module, input): 145 | if torch.is_grad_enabled() and self.steps % self.Ts == 0: 146 | classname = module.__class__.__name__ 147 | layer_info = None 148 | if classname == 'Conv2d': 149 | layer_info = (module.kernel_size, module.stride, 150 | module.padding) 151 | 152 | aa = compute_cov_a(input[0].data, classname, layer_info, 153 | self.fast_cnn) 154 | 155 | # Initialize buffers 156 | if self.steps == 0: 157 | self.m_aa[module] = aa.clone() 158 | 159 | update_running_stat(aa, self.m_aa[module], self.stat_decay) 160 | 161 | def _save_grad_output(self, module, grad_input, grad_output): 162 | if self.acc_stats: 163 | classname = module.__class__.__name__ 164 | layer_info = None 165 | if classname == 'Conv2d': 166 | layer_info = (module.kernel_size, module.stride, 167 | module.padding) 168 | 169 | gg = compute_cov_g(grad_output[0].data, classname, layer_info, 170 | self.fast_cnn) 171 | 172 | # Initialize buffers 173 | if self.steps == 0: 174 | self.m_gg[module] = gg.clone() 175 | 176 | update_running_stat(gg, self.m_gg[module], self.stat_decay) 177 | 178 | def _prepare_model(self): 179 | for module in self.model.modules(): 180 | classname = module.__class__.__name__ 181 | if classname in self.known_modules: 182 | assert not ((classname in ['Linear', 'Conv2d']) and module.bias is not None), \ 183 | "You must have a bias as a separate layer" 184 | 185 | self.modules.append(module) 186 | module.register_forward_pre_hook(self._save_input) 187 | module.register_backward_hook(self._save_grad_output) 188 | 189 | def step(self): 190 | # Add weight decay 191 | if self.weight_decay > 0: 192 | for p in self.model.parameters(): 193 | p.grad.data.add_(self.weight_decay, p.data) 194 | 195 | updates = {} 196 | for i, m in enumerate(self.modules): 197 | assert len(list(m.parameters()) 198 | ) == 1, "Can handle only one parameter at the moment" 199 | classname = m.__class__.__name__ 200 | p = next(m.parameters()) 201 | 202 | la = self.damping + self.weight_decay 203 | 204 | if self.steps % self.Tf == 0: 205 | # My asynchronous implementation exists, I will add it later. 206 | # Experimenting with different ways to this in PyTorch. 
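# Every Tf steps, re-eigendecompose the running Kronecker factors m_aa (input
# covariance) and m_gg (backprop-gradient covariance). The block below then divides
# the gradient by the damped eigenvalue products d_g_i * d_a_j + la in that joint
# eigenbasis, i.e. it applies the inverse of the damped Kronecker-factored
# curvature estimate (small eigenvalues are truncated to zero first).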
207 | self.d_a[m], self.Q_a[m] = torch.symeig( 208 | self.m_aa[m], eigenvectors=True) 209 | self.d_g[m], self.Q_g[m] = torch.symeig( 210 | self.m_gg[m], eigenvectors=True) 211 | 212 | self.d_a[m].mul_((self.d_a[m] > 1e-6).float()) 213 | self.d_g[m].mul_((self.d_g[m] > 1e-6).float()) 214 | 215 | if classname == 'Conv2d': 216 | p_grad_mat = p.grad.data.view(p.grad.data.size(0), -1) 217 | else: 218 | p_grad_mat = p.grad.data 219 | 220 | v1 = self.Q_g[m].t() @ p_grad_mat @ self.Q_a[m] 221 | v2 = v1 / ( 222 | self.d_g[m].unsqueeze(1) * self.d_a[m].unsqueeze(0) + la) 223 | v = self.Q_g[m] @ v2 @ self.Q_a[m].t() 224 | 225 | v = v.view(p.grad.data.size()) 226 | updates[p] = v 227 | 228 | vg_sum = 0 229 | for p in self.model.parameters(): 230 | v = updates[p] 231 | vg_sum += (v * p.grad.data * self.lr * self.lr).sum() 232 | 233 | nu = min(1, math.sqrt(self.kl_clip / vg_sum)) 234 | 235 | for p in self.model.parameters(): 236 | v = updates[p] 237 | p.grad.data.copy_(v) 238 | p.grad.data.mul_(nu) 239 | 240 | self.optim.step() 241 | self.steps += 1 242 | -------------------------------------------------------------------------------- /algo/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | 7 | class PPO(): 8 | def __init__(self, 9 | actor_critic, 10 | clip_param, 11 | ppo_epoch, 12 | num_mini_batch, 13 | value_loss_coef, 14 | entropy_coef, 15 | lr=None, 16 | lr_schedule=None, 17 | eps=None, 18 | max_grad_norm=None): 19 | 20 | self.actor_critic = actor_critic 21 | 22 | self.clip_param = clip_param 23 | self.ppo_epoch = ppo_epoch 24 | self.num_mini_batch = num_mini_batch 25 | 26 | self.value_loss_coef = value_loss_coef 27 | self.entropy_coef = entropy_coef 28 | 29 | self.max_grad_norm = max_grad_norm 30 | 31 | self.optimizer = optim.Adam(actor_critic.parameters(), lr=lr, eps=eps) 32 | 33 | if lr_schedule is not None: 34 | self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, lr_schedule) 35 | else: 36 | self.scheduler = None 37 | 38 | def update(self, rollouts, update_index, _replay=None): 39 | if self.scheduler is not None: 40 | self.scheduler.step(update_index) 41 | 42 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] 43 | advantages = (advantages - advantages.mean()) / ( 44 | advantages.std() + 1e-5) 45 | 46 | value_loss_epoch = 0 47 | action_loss_epoch = 0 48 | dist_entropy_epoch = 0 49 | 50 | for e in range(self.ppo_epoch): 51 | if self.actor_critic.is_recurrent: 52 | data_generator = rollouts.recurrent_generator( 53 | advantages, self.num_mini_batch) 54 | else: 55 | data_generator = rollouts.feed_forward_generator( 56 | advantages, self.num_mini_batch) 57 | 58 | for sample in data_generator: 59 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 60 | return_batch, masks_batch, old_action_log_probs_batch, \ 61 | adv_targ = sample 62 | 63 | # Reshape to do in a single forward pass for all steps 64 | values, action_log_probs, dist_entropy, states = self.actor_critic.evaluate_actions( 65 | obs_batch, recurrent_hidden_states_batch, 66 | masks_batch, actions_batch) 67 | 68 | ratio = torch.exp(action_log_probs - old_action_log_probs_batch) 69 | surr1 = ratio * adv_targ 70 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 71 | 1.0 + self.clip_param) * adv_targ 72 | action_loss = -torch.min(surr1, surr2).mean() 73 | 74 | value_loss = F.mse_loss(return_batch, values) 75 | 76 | self.optimizer.zero_grad() 77 | (value_loss * 
self.value_loss_coef + action_loss - 78 | dist_entropy * self.entropy_coef).backward() 79 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 80 | self.max_grad_norm) 81 | self.optimizer.step() 82 | 83 | value_loss_epoch += value_loss.item() 84 | action_loss_epoch += action_loss.item() 85 | dist_entropy_epoch += dist_entropy.item() 86 | 87 | num_updates = self.ppo_epoch * self.num_mini_batch 88 | 89 | value_loss_epoch /= num_updates 90 | action_loss_epoch /= num_updates 91 | dist_entropy_epoch /= num_updates 92 | 93 | return value_loss_epoch, action_loss_epoch, dist_entropy_epoch 94 | -------------------------------------------------------------------------------- /algo/sil.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class SIL: 6 | def __init__( 7 | self, 8 | algo, 9 | update_ratio=1.0, 10 | epochs=1, 11 | batch_size=64, 12 | beta=0., 13 | value_loss_coef=0.5, 14 | entropy_coef=0.01, 15 | 16 | ): 17 | self.update_ratio = update_ratio 18 | self.epochs = epochs 19 | self.batch_size = batch_size 20 | self.beta = beta # FIXME should be on schedule? 21 | self.value_loss_coef = value_loss_coef 22 | self.entropy_coef = entropy_coef 23 | self.loss_weight = 0.1 24 | self.avg_loss_by_valid_samples = True 25 | 26 | self.algo = algo 27 | 28 | def _calc_num_updates(self, index): 29 | num_updates = 0 30 | if self.update_ratio < 1: 31 | if index % int(round(1 / self.update_ratio)) == 0: 32 | num_updates = 1 33 | else: 34 | num_updates = int(round(self.update_ratio)) 35 | return num_updates 36 | 37 | def update(self, rollouts, update_index, replay=None): 38 | value_loss, action_loss, dist_entropy = self.algo.update(rollouts, update_index) 39 | 40 | num_updates = self._calc_num_updates(update_index) 41 | if replay is not None and replay.num_steps > self.batch_size and num_updates: 42 | sil_value_loss, sil_action_loss = self.update_sil(replay, num_updates, self.epochs) 43 | print("SIL: value_loss = {:.5f}, action_loss = {:.5f}".format(sil_value_loss, sil_action_loss)) 44 | 45 | return value_loss, action_loss, dist_entropy 46 | 47 | def update_sil(self, replay, num_updates_per_epoch=1, num_epochs=1): 48 | value_loss_epoch = 0 49 | action_loss_epoch = 0 50 | num_updates = 0 51 | 52 | for _ in range(num_epochs): 53 | if self.algo.actor_critic.is_recurrent: 54 | assert False, "Not implemented" 55 | else: 56 | data_generator = replay.feed_forward_generator( 57 | self.batch_size, num_updates_per_epoch, beta=self.beta) 58 | 59 | for sample in data_generator: 60 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 61 | return_batch, masks_batch, weights_batch, indices_batch = sample 62 | 63 | values, action_log_probs, dist_entropy, _ = self.algo.actor_critic.evaluate_actions( 64 | obs_batch, recurrent_hidden_states_batch, 65 | masks_batch, actions_batch) 66 | 67 | advantages = (return_batch - values) 68 | clipped_advantages = torch.clamp(advantages, min=0.0) 69 | 70 | # FIXME this loss is what's described in the paper, but the author's TF implementation differs. 71 | # TODO Look into the TF implementation, it appears to be motivated by the author's 72 | # lower-bound-soft-Q-learning equivalence justification. 
73 | # https://github.com/junhyukoh/self-imitation-learning/blob/master/baselines/common/self_imitation.py 74 | 75 | action_loss = -action_log_probs * clipped_advantages.detach() 76 | value_loss = 0.5 * clipped_advantages.pow(2) 77 | 78 | # apply importance sampling (priority sampling bias correction) weights 79 | if weights_batch is not None: 80 | action_loss *= weights_batch 81 | value_loss *= weights_batch 82 | 83 | if self.avg_loss_by_valid_samples: 84 | num_valid_samples = torch.clamp(torch.sum(advantages > 0).float(), min=1.0) 85 | action_loss = action_loss.sum() / num_valid_samples 86 | value_loss = value_loss.sum() / num_valid_samples 87 | else: 88 | action_loss = action_loss.mean() 89 | value_loss = value_loss.mean() 90 | 91 | loss = value_loss * self.value_loss_coef + action_loss 92 | if self.entropy_coef: 93 | loss -= dist_entropy * self.entropy_coef 94 | 95 | loss *= self.loss_weight 96 | 97 | self.algo.optimizer.zero_grad() 98 | 99 | loss.backward() 100 | 101 | nn.utils.clip_grad_norm_( 102 | self.algo.actor_critic.parameters(), self.algo.max_grad_norm) 103 | 104 | self.algo.optimizer.step() 105 | 106 | replay.update_priorities(indices_batch, clipped_advantages) 107 | 108 | value_loss_epoch += value_loss.item() 109 | action_loss_epoch += action_loss.item() 110 | num_updates += 1 111 | 112 | if num_updates: 113 | value_loss_epoch /= num_updates 114 | action_loss_epoch /= num_updates 115 | 116 | return value_loss_epoch, action_loss_epoch 117 | 118 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser(description='RL') 8 | parser.add_argument('--algo', default='ppo', 9 | help='algorithm to use: a2c | ppo | acktr') 10 | parser.add_argument('--lr', type=float, default=7e-4, 11 | help='learning rate (default: 7e-4)') 12 | parser.add_argument('--lr-schedule', type=float, default=None, 13 | help='learning rate schedule (decay steps) (default: None)') 14 | parser.add_argument('--eps', type=float, default=1e-5, 15 | help='RMSprop optimizer epsilon (default: 1e-5)') 16 | parser.add_argument('--alpha', type=float, default=0.99, 17 | help='RMSprop optimizer apha (default: 0.99)') 18 | parser.add_argument('--gamma', type=float, default=0.99, 19 | help='discount factor for rewards (default: 0.99)') 20 | parser.add_argument('--use-gae', action='store_true', default=False, 21 | help='use generalized advantage estimation') 22 | parser.add_argument('--tau', type=float, default=0.95, 23 | help='gae parameter (default: 0.95)') 24 | parser.add_argument('--entropy-coef', type=float, default=0.0, 25 | help='entropy term coefficient (default: 0.00)') 26 | parser.add_argument('--value-loss-coef', type=float, default=0.5, 27 | help='value loss coefficient (default: 0.5)') 28 | parser.add_argument('--max-grad-norm', type=float, default=0.5, 29 | help='max norm of gradients (default: 0.5)') 30 | parser.add_argument('--beta-dist', action='store_true', default=False, 31 | help='use beta dist for continuous control') 32 | parser.add_argument('--seed', type=int, default=1, 33 | help='random seed (default: 1)') 34 | parser.add_argument('--num-processes', type=int, default=16, 35 | help='how many training CPU processes to use (default: 16)') 36 | parser.add_argument('--num-steps', type=int, default=5, 37 | help='number of forward steps in A2C (default: 5)') 38 | 
parser.add_argument('--ppo-epoch', type=int, default=4, 39 | help='number of ppo epochs (default: 4)') 40 | parser.add_argument('--num-mini-batch', type=int, default=32, 41 | help='number of batches for ppo (default: 32)') 42 | parser.add_argument('--clip-param', type=float, default=0.2, 43 | help='ppo clip parameter (default: 0.2)') 44 | parser.add_argument('--clip-action', action='store_true', default=False, 45 | help='clip actions') 46 | parser.add_argument('--shift-action', action='store_true', default=False, 47 | help='shift action to mean and rescale') 48 | parser.add_argument('--frame-skip', type=int, default=0, 49 | help='number of frames to skip (apply same action)') 50 | parser.add_argument('--sil-update-ratio', type=float, default=4.0, 51 | help='sil off-policy updates per on-policy updates (default: 4.0)') 52 | parser.add_argument('--sil-epochs', type=int, default=1, 53 | help='number of sil epochs (default: 1)') 54 | parser.add_argument('--sil-batch-size', type=int, default=512, 55 | help='sil batch size (default: 512)') 56 | parser.add_argument('--sil-alpha', type=float, default=0.6, 57 | help='sil replay priority alpha, priority queue disabled if 0. (default: 0.6)') 58 | parser.add_argument('--sil-beta', type=float, default=0.1, 59 | help='sil replay priority beta, importance sampling weights disabled if 0. (default: 0.1)') 60 | parser.add_argument('--sil-entropy-coef', type=float, default=0.01, 61 | help='entropy term coefficient (default: 0.0)') 62 | parser.add_argument('--sil-value-loss-coef', type=float, default=0.05, 63 | help='value loss coefficient (default: 0.5)') 64 | parser.add_argument('--log-interval', type=int, default=10, 65 | help='log interval, one log per n updates (default: 10)') 66 | parser.add_argument('--save-interval', type=int, default=100, 67 | help='save interval, one save per n updates (default: 100)') 68 | parser.add_argument('--eval-interval', type=int, default=None, 69 | help='eval interval, one eval per n updates (default: None)') 70 | parser.add_argument('--vis-interval', type=int, default=100, 71 | help='vis interval, one log per n updates (default: 100)') 72 | parser.add_argument('--num-frames', type=int, default=10e8, 73 | help='number of frames to train (default: 10e8)') 74 | parser.add_argument('--env-name', default='PongNoFrameskip-v4', 75 | help='environment to train on (default: PongNoFrameskip-v4)') 76 | parser.add_argument('--log-dir', default='/tmp/', 77 | help='directory to save agent logs (default: /tmp/)') 78 | parser.add_argument('--save-dir', default='./trained_models/', 79 | help='directory to save agent logs (default: ./trained_models/)') 80 | parser.add_argument('--no-cuda', action='store_true', default=False, 81 | help='disables CUDA training') 82 | parser.add_argument('--add-timestep', action='store_true', default=False, 83 | help='add timestep to observations') 84 | parser.add_argument('--recurrent-policy', action='store_true', default=False, 85 | help='use a recurrent policy') 86 | parser.add_argument('--vis', action='store_true', default=False, 87 | help='enable visdom visualization') 88 | parser.add_argument('--port', type=int, default=8097, 89 | help='port to run the server on (default: 8097)') 90 | parser.add_argument('--load-path', default='', 91 | help='directory to save agent logs (default: ') 92 | args = parser.parse_args() 93 | 94 | args.cuda = not args.no_cuda and torch.cuda.is_available() 95 | 96 | return args 97 | -------------------------------------------------------------------------------- 
/distributions.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from utils import AddBias, init, init_normc_ 8 | 9 | """ 10 | Modify standard PyTorch distributions so they are compatible with this code. 11 | """ 12 | 13 | FixedCategorical = torch.distributions.Categorical 14 | 15 | old_sample = FixedCategorical.sample 16 | FixedCategorical.sample = lambda self: old_sample(self).unsqueeze(-1) 17 | log_prob_cat = FixedCategorical.log_prob 18 | FixedCategorical.log_probs = lambda self, actions: log_prob_cat(self, actions.squeeze(-1)).unsqueeze(-1) 19 | FixedCategorical.mode = lambda self: self.probs.argmax(dim=1, keepdim=True) 20 | 21 | FixedNormal = torch.distributions.Normal 22 | log_prob_normal = FixedNormal.log_prob 23 | FixedNormal.log_probs = lambda self, actions: log_prob_normal(self, actions).sum(-1, keepdim=True) 24 | entropy_normal = FixedNormal.entropy 25 | FixedNormal.entropy = lambda self: entropy_normal(self).sum(-1) 26 | FixedNormal.mode = lambda self: self.mean 27 | 28 | FixedBeta = torch.distributions.Beta 29 | entropy_beta = FixedBeta.entropy 30 | FixedBeta.entropy = lambda self: entropy_beta(self).sum(-1) 31 | FixedBeta.mode = lambda self: (self.concentration1 - 1) / (self.concentration1 + self.concentration0 - 2) 32 | log_prob_beta = FixedBeta.log_prob 33 | FixedBeta.log_probs = lambda self, actions: log_prob_beta(self, actions).sum(-1, keepdim=True) 34 | 35 | 36 | class Categorical(nn.Module): 37 | def __init__(self, num_inputs, num_outputs): 38 | super(Categorical, self).__init__() 39 | 40 | init_ = lambda m: init( 41 | m, 42 | nn.init.orthogonal_, 43 | lambda x: nn.init.constant_(x, 0), 44 | gain=0.01) 45 | 46 | self.linear = init_(nn.Linear(num_inputs, num_outputs)) 47 | 48 | def forward(self, x): 49 | x = self.linear(x) 50 | return FixedCategorical(logits=x) 51 | 52 | 53 | class DiagGaussian(nn.Module): 54 | def __init__(self, num_inputs, num_outputs): 55 | super(DiagGaussian, self).__init__() 56 | 57 | init_ = lambda m: init( 58 | m, 59 | init_normc_, 60 | lambda x: nn.init.constant_(x, 0)) 61 | 62 | self.fc_mean = init_(nn.Linear(num_inputs, num_outputs)) 63 | self.logstd = AddBias(torch.zeros(num_outputs)) 64 | 65 | def forward(self, x): 66 | action_mean = self.fc_mean(x) 67 | 68 | # An ugly hack for my KFAC implementation. 69 | zeros = torch.zeros(action_mean.size()) 70 | if x.is_cuda: 71 | zeros = zeros.cuda() 72 | 73 | action_logstd = self.logstd(zeros) 74 | return FixedNormal(action_mean, action_logstd.exp()) 75 | 76 | 77 | class Beta(nn.Module): 78 | 79 | def __init__(self, num_inputs, num_outputs): 80 | super(Beta, self).__init__() 81 | 82 | init_ = lambda m: init( 83 | m, 84 | init_normc_, 85 | lambda x: nn.init.constant_(x, 0)) 86 | 87 | self.fc_a = init_(nn.Linear(num_inputs, num_outputs)) 88 | self.fc_b = init_(nn.Linear(num_inputs, num_outputs)) 89 | 90 | def forward(self, x): 91 | action_a = self.fc_a(x) 92 | action_b = self.fc_b(x) 93 | action_a = F.softplus(action_a) + 1. 94 | action_b = F.softplus(action_b) + 1. 
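# softplus(x) + 1 keeps both concentration parameters strictly greater than 1, so
# the Beta density on (0, 1) is unimodal and the deterministic action returned by
# FixedBeta.mode, (concentration1 - 1) / (concentration1 + concentration0 - 2),
# stays inside (0, 1).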
95 | 96 | return FixedBeta(action_a, action_b) 97 | 98 | -------------------------------------------------------------------------------- /enjoy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import gym 4 | import numpy as np 5 | import torch 6 | 7 | from envs import VecPyTorch, make_vec_envs 8 | from utils import get_render_func, get_vec_normalize 9 | 10 | 11 | parser = argparse.ArgumentParser(description='RL') 12 | parser.add_argument('--seed', type=int, default=1, 13 | help='random seed (default: 1)') 14 | parser.add_argument('--log-interval', type=int, default=10, 15 | help='log interval, one log per n updates (default: 10)') 16 | parser.add_argument('--env-name', default='PongNoFrameskip-v4', 17 | help='environment to train on (default: PongNoFrameskip-v4)') 18 | parser.add_argument('--load-path', default='', 19 | help='directory to save agent logs (default: ') 20 | parser.add_argument('--add-timestep', action='store_true', default=False, 21 | help='add timestep to observations') 22 | parser.add_argument('--clip-action', action='store_true', default=False, 23 | help='clip actions') 24 | args = parser.parse_args() 25 | 26 | env = make_vec_envs(args.env_name, args.seed + 1000, 1, 27 | None, None, args.add_timestep, device='cpu', 28 | allow_early_resets=False, 29 | visualize=True) 30 | 31 | # Get a render function 32 | render_func = get_render_func(env) 33 | 34 | # We need to use the same statistics for normalization as used in training 35 | actor_critic, ob_rms = torch.load(args.load_path) 36 | actor_critic.eval() 37 | 38 | vec_norm = get_vec_normalize(env) 39 | if vec_norm is not None: 40 | vec_norm.eval() 41 | vec_norm.ob_rms = ob_rms 42 | 43 | recurrent_hidden_states = torch.zeros(1, actor_critic.recurrent_hidden_state_size) 44 | masks = torch.zeros(1, 1) 45 | 46 | if render_func is not None: 47 | render_func('human') 48 | 49 | obs = env.reset() 50 | 51 | while True: 52 | with torch.no_grad(): 53 | value, action, _, recurrent_hidden_states = actor_critic.act( 54 | obs, recurrent_hidden_states, masks, deterministic=True) 55 | 56 | clipped_action = action 57 | if args.clip_action and isinstance(env.action_space, gym.spaces.Box): 58 | clipped_action = torch.max(torch.min( 59 | clipped_action, torch.from_numpy(env.action_space.high)), 60 | torch.from_numpy(env.action_space.low)) 61 | 62 | # Obser reward and next obs 63 | obs, reward, done, _ = env.step(clipped_action) 64 | 65 | masks.fill_(0.0 if done else 1.0) 66 | 67 | if render_func is not None: 68 | render_func('human') 69 | -------------------------------------------------------------------------------- /envs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import gym 4 | import numpy as np 5 | import torch 6 | from gym.spaces.box import Box 7 | 8 | #from baselines import bench 9 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 10 | from baselines.common.vec_env import VecEnvWrapper 11 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 12 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 13 | from baselines.common.vec_env.vec_normalize import VecNormalize as VecNormalize_ 14 | from monitor import Monitor 15 | 16 | try: 17 | import dm_control2gym 18 | except ImportError: 19 | pass 20 | 21 | try: 22 | import roboschool 23 | except ImportError: 24 | pass 25 | 26 | try: 27 | import pybullet_envs 28 | except ImportError: 29 | pass 30 | 31 | try: 32 | 
from osim.env import ProstheticsEnv, Arm2DEnv, L2RunEnv 33 | from my_prosthetics_env import MyProstheticsEnv 34 | except ImportError: 35 | pass 36 | 37 | 38 | def make_env(env_id, seed, rank, log_dir, add_timestep, allow_early_resets, **kwargs): 39 | def _thunk(): 40 | info_keywords = () 41 | if env_id.startswith("dm"): 42 | _, domain, task = env_id.split('.') 43 | env = dm_control2gym.make(domain_name=domain, task_name=task) 44 | elif env_id.startswith("osim"): 45 | info_keywords = ('rb',) 46 | # https://github.com/stanfordnmbl/osim-rl 47 | _, task = env_id.split('.') 48 | if task == "Prosthetics": 49 | env = MyProstheticsEnv(integrator_accuracy=1e-4, **kwargs) 50 | elif task == "Arm2D": 51 | env = Arm2DEnv(integrator_accuracy=1e-4, **kwargs) 52 | else: # task == "L2Run" 53 | assert task == "L2Run" 54 | env = L2RunEnv(integrator_accuracy=1e-4, **kwargs) 55 | else: 56 | env = gym.make(env_id) 57 | is_atari = hasattr(gym.envs, 'atari') and isinstance( 58 | env.unwrapped, gym.envs.atari.atari_env.AtariEnv) 59 | if is_atari: 60 | env = make_atari(env_id) 61 | env.seed(seed + rank) 62 | 63 | obs_shape = env.observation_space.shape 64 | 65 | if add_timestep and len( 66 | obs_shape) == 1 and str(env).find('TimeLimit') > -1: 67 | env = AddTimestep(env) 68 | 69 | if log_dir is not None: 70 | env = Monitor( 71 | env, os.path.join(log_dir, str(rank)), 72 | info_keywords=info_keywords, 73 | allow_early_resets=allow_early_resets) 74 | 75 | if is_atari: 76 | env = wrap_deepmind(env) 77 | 78 | # If the input has shape (W,H,3), wrap for PyTorch convolutions 79 | obs_shape = env.observation_space.shape 80 | if len(obs_shape) == 3 and obs_shape[2] in [1, 3]: 81 | env = TransposeImage(env) 82 | 83 | return env 84 | 85 | return _thunk 86 | 87 | 88 | def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, add_timestep, 89 | device, allow_early_resets=True, num_frame_stack=None, **kwargs): 90 | 91 | envs = [make_env(env_name, seed, i, log_dir, add_timestep, allow_early_resets, **kwargs) 92 | for i in range(num_processes)] 93 | 94 | if len(envs) > 1: 95 | envs = SubprocVecEnv(envs) 96 | else: 97 | envs = DummyVecEnv(envs) 98 | 99 | if len(envs.observation_space.shape) == 1: 100 | if gamma is None: 101 | envs = VecNormalize(envs, ret=False) 102 | else: 103 | envs = VecNormalize(envs, gamma=gamma) 104 | 105 | envs = VecPyTorch(envs, device) 106 | 107 | if num_frame_stack is not None: 108 | envs = VecPyTorchFrameStack(envs, num_frame_stack, device) 109 | elif len(envs.observation_space.shape) == 3: 110 | envs = VecPyTorchFrameStack(envs, 4, device) 111 | 112 | return envs 113 | 114 | 115 | # Can be used to test recurrent policies for Reacher-v2 116 | class MaskGoal(gym.ObservationWrapper): 117 | def observation(self, observation): 118 | if self.env._elapsed_steps > 0: 119 | observation[-2:0] = 0 120 | return observation 121 | 122 | 123 | class AddTimestep(gym.ObservationWrapper): 124 | def __init__(self, env=None): 125 | super(AddTimestep, self).__init__(env) 126 | self.observation_space = Box( 127 | self.observation_space.low[0], 128 | self.observation_space.high[0], 129 | [self.observation_space.shape[0] + 1], 130 | dtype=self.observation_space.dtype) 131 | 132 | def observation(self, observation): 133 | return np.concatenate((observation, [self.env._elapsed_steps])) 134 | 135 | 136 | class TransposeImage(gym.ObservationWrapper): 137 | def __init__(self, env=None): 138 | super(TransposeImage, self).__init__(env) 139 | obs_shape = self.observation_space.shape 140 | self.observation_space = Box( 141 | 
self.observation_space.low[0, 0, 0], 142 | self.observation_space.high[0, 0, 0], 143 | [obs_shape[2], obs_shape[1], obs_shape[0]], 144 | dtype=self.observation_space.dtype) 145 | 146 | def observation(self, observation): 147 | return observation.transpose(2, 0, 1) 148 | 149 | 150 | class VecPyTorch(VecEnvWrapper): 151 | def __init__(self, venv, device): 152 | """Return only every `skip`-th frame""" 153 | super(VecPyTorch, self).__init__(venv) 154 | self.device = device 155 | # TODO: Fix data types 156 | 157 | def reset(self): 158 | obs = self.venv.reset() 159 | obs = torch.from_numpy(obs).float().to(self.device) 160 | return obs 161 | 162 | def step_async(self, actions): 163 | actions = actions.squeeze(1).cpu().numpy() 164 | self.venv.step_async(actions) 165 | 166 | def step_wait(self): 167 | obs, reward, done, info = self.venv.step_wait() 168 | obs = torch.from_numpy(obs).float().to(self.device) 169 | reward = torch.from_numpy(reward).unsqueeze(dim=1).float() 170 | return obs, reward, done, info 171 | 172 | 173 | class VecNormalize(VecNormalize_): 174 | 175 | def __init__(self, *args, **kwargs): 176 | super(VecNormalize, self).__init__(*args, **kwargs) 177 | self.training = True 178 | 179 | def _obfilt(self, obs): 180 | if self.ob_rms: 181 | if self.training: 182 | self.ob_rms.update(obs) 183 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 184 | return obs 185 | else: 186 | return obs 187 | 188 | def train(self): 189 | self.training = True 190 | 191 | def eval(self): 192 | self.training = False 193 | 194 | 195 | # Derived from 196 | # https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_frame_stack.py 197 | class VecPyTorchFrameStack(VecEnvWrapper): 198 | def __init__(self, venv, nstack, device=None): 199 | self.venv = venv 200 | self.nstack = nstack 201 | 202 | wos = venv.observation_space # wrapped ob space 203 | self.shape_dim0 = wos.shape[0] 204 | 205 | low = np.repeat(wos.low, self.nstack, axis=0) 206 | high = np.repeat(wos.high, self.nstack, axis=0) 207 | 208 | if device is None: 209 | device = torch.device('cpu') 210 | self.stacked_obs = torch.zeros((venv.num_envs,) + low.shape).to(device) 211 | 212 | observation_space = gym.spaces.Box( 213 | low=low, high=high, dtype=venv.observation_space.dtype) 214 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 215 | 216 | def step_wait(self): 217 | obs, rews, news, infos = self.venv.step_wait() 218 | self.stacked_obs[:, :-self.shape_dim0] = \ 219 | self.stacked_obs[:, self.shape_dim0:] 220 | for (i, new) in enumerate(news): 221 | if new: 222 | self.stacked_obs[i] = 0 223 | self.stacked_obs[:, -self.shape_dim0:] = obs 224 | return self.stacked_obs, rews, news, infos 225 | 226 | def reset(self): 227 | obs = self.venv.reset() 228 | self.stacked_obs.zero_() 229 | self.stacked_obs[:, -self.shape_dim0:] = obs 230 | return self.stacked_obs 231 | 232 | def close(self): 233 | self.venv.close() 234 | -------------------------------------------------------------------------------- /imgs/a2c_beamrider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/a2c_beamrider.png -------------------------------------------------------------------------------- /imgs/a2c_breakout.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/a2c_breakout.png -------------------------------------------------------------------------------- /imgs/a2c_qbert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/a2c_qbert.png -------------------------------------------------------------------------------- /imgs/a2c_seaquest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/a2c_seaquest.png -------------------------------------------------------------------------------- /imgs/acktr_beamrider.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/acktr_beamrider.png -------------------------------------------------------------------------------- /imgs/acktr_breakout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/acktr_breakout.png -------------------------------------------------------------------------------- /imgs/acktr_qbert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/acktr_qbert.png -------------------------------------------------------------------------------- /imgs/acktr_seaquest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/acktr_seaquest.png -------------------------------------------------------------------------------- /imgs/ppo_halfcheetah.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/ppo_halfcheetah.png -------------------------------------------------------------------------------- /imgs/ppo_hopper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/ppo_hopper.png -------------------------------------------------------------------------------- /imgs/ppo_reacher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/ppo_reacher.png -------------------------------------------------------------------------------- /imgs/ppo_walker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rwightman/pytorch-opensim-rl/195b7315557e8082f48100bf37859d0c2a0b42c0/imgs/ppo_walker.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import glob 3 | import os 4 | import time 5 | import datetime 6 | from collections import deque 7 | 8 | import gym 
9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | import torch.optim as optim 14 | 15 | import algo 16 | from arguments import get_args 17 | from envs import make_vec_envs 18 | from model import Policy 19 | from rollout_storage import RolloutStorage 20 | from replay_storage import ReplayStorage 21 | from utils import get_vec_normalize 22 | from visualize import visdom_plot 23 | 24 | args = get_args() 25 | 26 | assert args.algo in ['a2c', 'a2c-sil', 'ppo', 'ppo-sil', 'acktr'] 27 | if args.recurrent_policy: 28 | assert args.algo in ['a2c', 'ppo'], \ 29 | 'Recurrent policy is not implemented for ACKTR' 30 | 31 | num_updates = int(args.num_frames) // args.num_steps // args.num_processes 32 | 33 | torch.manual_seed(args.seed) 34 | if args.cuda: 35 | torch.cuda.manual_seed(args.seed) 36 | 37 | 38 | def setup_dirs(experiment_name, log_dir, save_dir): 39 | log_dir = os.path.join(log_dir, experiment_name) 40 | os.makedirs(log_dir, exist_ok=True) 41 | 42 | eval_log_dir = args.log_dir + "_eval" 43 | os.makedirs(eval_log_dir, exist_ok=True) 44 | 45 | save_dir = os.path.join(save_dir, experiment_name) 46 | os.makedirs(save_dir, exist_ok=True) 47 | 48 | return log_dir, eval_log_dir, save_dir 49 | 50 | 51 | def main(): 52 | torch.set_num_threads(1) 53 | device = torch.device("cuda:0" if args.cuda else "cpu") 54 | 55 | experiment_name = args.env_name + '-' + args.algo + '-' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f") 56 | log_dir, eval_log_dir, save_dir = setup_dirs(experiment_name, args.log_dir, args.save_dir) 57 | 58 | if args.vis: 59 | from visdom import Visdom 60 | viz = Visdom(port=args.port) 61 | win = None 62 | 63 | envs = make_vec_envs( 64 | args.env_name, args.seed, args.num_processes, 65 | args.gamma, log_dir, args.add_timestep, device, False, frame_skip=args.frame_skip) 66 | 67 | if args.load_path: 68 | actor_critic, _ob_rms = torch.load(args.load_path) 69 | vec_norm = get_vec_normalize(envs) 70 | if vec_norm is not None: 71 | vec_norm.train() 72 | vec_norm.ob_rms = _ob_rms 73 | actor_critic.train() 74 | else: 75 | actor_critic = Policy( 76 | envs.observation_space.shape, 77 | envs.action_space, 78 | beta=args.beta_dist, 79 | base_kwargs={'recurrent': args.recurrent_policy}) 80 | actor_critic.to(device) 81 | 82 | if args.algo.startswith('a2c'): 83 | agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, 84 | args.entropy_coef, lr=args.lr, 85 | lr_schedule=args.lr_schedule, 86 | eps=args.eps, alpha=args.alpha, 87 | max_grad_norm=args.max_grad_norm) 88 | elif args.algo.startswith('ppo'): 89 | agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, 90 | args.value_loss_coef, args.entropy_coef, lr=args.lr, 91 | lr_schedule=args.lr_schedule, 92 | eps=args.eps, 93 | max_grad_norm=args.max_grad_norm) 94 | elif args.algo == 'acktr': 95 | agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, 96 | args.entropy_coef, acktr=True) 97 | 98 | if args.algo.endswith('sil'): 99 | agent = algo.SIL( 100 | agent, 101 | update_ratio=args.sil_update_ratio, 102 | epochs=args.sil_epochs, 103 | batch_size=args.sil_batch_size, 104 | beta=args.sil_beta, 105 | value_loss_coef=args.sil_value_loss_coef, 106 | entropy_coef=args.sil_entropy_coef) 107 | replay = ReplayStorage( 108 | 10000, 109 | num_processes=args.num_processes, 110 | gamma=args.gamma, 111 | prio_alpha=args.sil_alpha, 112 | obs_shape=envs.observation_space.shape, 113 | action_space=envs.action_space, 114 | 
recurrent_hidden_state_size=actor_critic.recurrent_hidden_state_size, 115 | device=device) 116 | else: 117 | replay = None 118 | 119 | action_high = torch.from_numpy(envs.action_space.high).to(device) 120 | action_low = torch.from_numpy(envs.action_space.low).to(device) 121 | action_mid = 0.5 * (action_high + action_low) 122 | 123 | rollouts = RolloutStorage( 124 | args.num_steps, args.num_processes, 125 | envs.observation_space.shape, envs.action_space, 126 | actor_critic.recurrent_hidden_state_size) 127 | 128 | obs = envs.reset() 129 | rollouts.obs[0].copy_(obs) 130 | rollouts.to(device) 131 | 132 | episode_rewards = deque(maxlen=10) 133 | benchmark_rewards = deque(maxlen=10) 134 | 135 | start = time.time() 136 | for j in range(num_updates): 137 | for step in range(args.num_steps): 138 | # Sample actions 139 | with torch.no_grad(): 140 | # sample actions 141 | value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( 142 | rollouts.obs[step], 143 | rollouts.recurrent_hidden_states[step], 144 | rollouts.masks[step]) 145 | 146 | if args.clip_action and isinstance(envs.action_space, gym.spaces.Box): 147 | clipped_action = action.clone() 148 | if args.shift_action: 149 | # FIXME experimenting with this, so far resulting in 150 | # faster learning when clipping guassian continuous 151 | # output (vs leaving centred at 0 and unscaled) 152 | clipped_action = 0.5 * clipped_action + action_mid 153 | clipped_action = torch.max( 154 | torch.min(clipped_action, action_high), action_low) 155 | else: 156 | clipped_action = action 157 | 158 | # act in environment and observe 159 | obs, reward, done, infos = envs.step(clipped_action) 160 | 161 | for info in infos: 162 | if 'episode' in info.keys(): 163 | episode_rewards.append(info['episode']['r']) 164 | if 'rb' in info['episode']: 165 | benchmark_rewards.append(info['episode']['rb']) 166 | 167 | # If done then clean the history of observations. 168 | masks = torch.FloatTensor([[0.0] if done_ else [1.0] 169 | for done_ in done]) 170 | rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) 171 | if replay is not None: 172 | replay.insert( 173 | rollouts.obs[step], 174 | rollouts.recurrent_hidden_states[step], 175 | action, 176 | reward, 177 | done) 178 | 179 | with torch.no_grad(): 180 | next_value = actor_critic.get_value(rollouts.obs[-1], 181 | rollouts.recurrent_hidden_states[-1], 182 | rollouts.masks[-1]).detach() 183 | 184 | rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) 185 | 186 | value_loss, action_loss, dist_entropy = agent.update(rollouts, j, replay) 187 | 188 | rollouts.after_update() 189 | 190 | total_num_steps = (j + 1) * args.num_processes * args.num_steps 191 | 192 | train_eprew = np.mean(episode_rewards) 193 | if j % args.log_interval == 0 and len(episode_rewards) > 1: 194 | end = time.time() 195 | print("Updates {}, num timesteps {}, FPS {} \n Last {} episodes: mean/med {:.1f}/{:.1f}, min/max reward {:.2f}/{:.2f}". 
196 | format(j, total_num_steps, 197 | int(total_num_steps / (end - start)), 198 | len(episode_rewards), 199 | train_eprew, 200 | np.median(episode_rewards), 201 | np.min(episode_rewards), 202 | np.max(episode_rewards), dist_entropy, 203 | value_loss, action_loss), end='') 204 | if len(benchmark_rewards): 205 | print(", benchmark {:.1f}/{:.1f}, {:.1f}/{:.1f}".format( 206 | np.mean(benchmark_rewards), 207 | np.median(benchmark_rewards), 208 | np.min(benchmark_rewards), 209 | np.max(benchmark_rewards) 210 | ), end='') 211 | print() 212 | 213 | if (args.eval_interval is not None 214 | and len(episode_rewards) > 1 215 | and j % args.eval_interval == 0): 216 | eval_envs = make_vec_envs( 217 | args.env_name, args.seed + args.num_processes, args.num_processes, 218 | args.gamma, eval_log_dir, args.add_timestep, device, True) 219 | 220 | vec_norm = get_vec_normalize(eval_envs) 221 | if vec_norm is not None: 222 | vec_norm.eval() 223 | vec_norm.ob_rms = get_vec_normalize(envs).ob_rms 224 | 225 | eval_episode_rewards = [] 226 | 227 | obs = eval_envs.reset() 228 | eval_recurrent_hidden_states = torch.zeros(args.num_processes, 229 | actor_critic.recurrent_hidden_state_size, device=device) 230 | eval_masks = torch.zeros(args.num_processes, 1, device=device) 231 | 232 | while len(eval_episode_rewards) < 10: 233 | with torch.no_grad(): 234 | _, action, _, eval_recurrent_hidden_states = actor_critic.act( 235 | obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) 236 | 237 | clipped_action = action 238 | if args.clip_action and isinstance(envs.action_space, gym.spaces.Box): 239 | if args.shift_action: 240 | clipped_action = 0.5 * clipped_action + action_mid 241 | clipped_action = torch.max( 242 | torch.min(clipped_action, action_high), action_low) 243 | 244 | obs, reward, done, infos = eval_envs.step(clipped_action) 245 | 246 | eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] 247 | for done_ in done]) 248 | for info in infos: 249 | if 'episode' in info.keys(): 250 | eval_episode_rewards.append(info['episode']['r']) 251 | 252 | eval_envs.close() 253 | 254 | eval_eprew = np.mean(eval_episode_rewards) 255 | print(" Evaluation using {} episodes: mean reward {:.5f}\n". 
256 | format(len(eval_episode_rewards), eval_eprew)) 257 | 258 | if len(episode_rewards) and j % args.save_interval == 0 and save_dir != "": 259 | # A really ugly way to save a model to CPU 260 | save_model = actor_critic 261 | if args.cuda: 262 | save_model = copy.deepcopy(actor_critic).cpu() 263 | 264 | save_model = [save_model, getattr(get_vec_normalize(envs), 'ob_rms', None)] 265 | 266 | ep_rewstr = ("%d" % train_eprew).replace("-", "n") 267 | save_filename = os.path.join(save_dir, './checkpoint-%d-%s.pt' % (j, ep_rewstr)) 268 | 269 | torch.save(save_model, save_filename) 270 | 271 | if args.vis and j % args.vis_interval == 0: 272 | try: 273 | # Sometimes monitor doesn't properly flush the outputs 274 | win = visdom_plot(viz, win, log_dir, args.env_name, 275 | args.algo, args.num_frames) 276 | except IOError: 277 | pass 278 | 279 | 280 | if __name__ == "__main__": 281 | main() 282 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from distributions import Categorical, DiagGaussian, Beta 6 | from utils import init, init_normc_ 7 | 8 | 9 | class Flatten(nn.Module): 10 | def forward(self, x): 11 | return x.view(x.size(0), -1) 12 | 13 | 14 | class Policy(nn.Module): 15 | def __init__(self, obs_shape, action_space, beta=False, base_kwargs=None): 16 | super(Policy, self).__init__() 17 | if base_kwargs is None: 18 | base_kwargs = {} 19 | 20 | if len(obs_shape) == 3: 21 | self.base = CNNBase(obs_shape[0], **base_kwargs) 22 | elif len(obs_shape) == 1: 23 | self.base = MLPBase(obs_shape[0], **base_kwargs) 24 | else: 25 | raise NotImplementedError 26 | 27 | if action_space.__class__.__name__ == "Discrete": 28 | num_outputs = action_space.n 29 | self.dist = Categorical(self.base.output_size, num_outputs) 30 | elif action_space.__class__.__name__ == "Box": 31 | num_outputs = action_space.shape[0] 32 | if beta: 33 | self.dist = Beta(self.base.output_size, num_outputs) 34 | else: 35 | self.dist = DiagGaussian(self.base.output_size, num_outputs) 36 | else: 37 | raise NotImplementedError 38 | 39 | @property 40 | def is_recurrent(self): 41 | return self.base.is_recurrent 42 | 43 | @property 44 | def recurrent_hidden_state_size(self): 45 | """Size of rnn_hx.""" 46 | return self.base.recurrent_hidden_state_size 47 | 48 | def forward(self, inputs, rnn_hxs, masks): 49 | raise NotImplementedError 50 | 51 | def act(self, inputs, rnn_hxs, masks, deterministic=False): 52 | value, actor_features, rnn_hxs = self.base(inputs, rnn_hxs, masks) 53 | dist = self.dist(actor_features) 54 | 55 | if deterministic: 56 | action = dist.mode() 57 | else: 58 | action = dist.sample() 59 | 60 | action_log_probs = dist.log_probs(action) 61 | dist_entropy = dist.entropy().mean() 62 | 63 | return value, action, action_log_probs, rnn_hxs 64 | 65 | def get_value(self, inputs, rnn_hxs, masks): 66 | value, _, _ = self.base(inputs, rnn_hxs, masks) 67 | return value 68 | 69 | def evaluate_actions(self, inputs, rnn_hxs, masks, action): 70 | value, actor_features, rnn_hxs = self.base(inputs, rnn_hxs, masks) 71 | dist = self.dist(actor_features) 72 | 73 | action_log_probs = dist.log_probs(action) 74 | dist_entropy = dist.entropy().mean() 75 | 76 | return value, action_log_probs, dist_entropy, rnn_hxs 77 | 78 | 79 | class NNBase(nn.Module): 80 | 81 | def __init__(self, recurrent, recurrent_input_size, hidden_size): 82 | super(NNBase, 
self).__init__() 83 | 84 | self._hidden_size = hidden_size 85 | self._recurrent = recurrent 86 | 87 | if recurrent: 88 | self.gru = nn.GRUCell(recurrent_input_size, hidden_size) 89 | nn.init.orthogonal_(self.gru.weight_ih.data) 90 | nn.init.orthogonal_(self.gru.weight_hh.data) 91 | self.gru.bias_ih.data.fill_(0) 92 | self.gru.bias_hh.data.fill_(0) 93 | 94 | @property 95 | def is_recurrent(self): 96 | return self._recurrent 97 | 98 | @property 99 | def recurrent_hidden_state_size(self): 100 | if self._recurrent: 101 | return self._hidden_size 102 | return 1 103 | 104 | @property 105 | def output_size(self): 106 | return self._hidden_size 107 | 108 | def _forward_gru(self, x, hxs, masks): 109 | if x.size(0) == hxs.size(0): 110 | x = hxs = self.gru(x, hxs * masks) 111 | else: 112 | # x is a (T, N, -1) tensor that has been flatten to (T * N, -1) 113 | N = hxs.size(0) 114 | T = int(x.size(0) / N) 115 | 116 | # unflatten 117 | x = x.view(T, N, x.size(1)) 118 | 119 | # Same deal with masks 120 | masks = masks.view(T, N, 1) 121 | 122 | outputs = [] 123 | for i in range(T): 124 | hx = hxs = self.gru(x[i], hxs * masks[i]) 125 | outputs.append(hx) 126 | 127 | # assert len(outputs) == T 128 | # x is a (T, N, -1) tensor 129 | x = torch.stack(outputs, dim=0) 130 | # flatten 131 | x = x.view(T * N, -1) 132 | 133 | return x, hxs 134 | 135 | 136 | class CNNBase(NNBase): 137 | def __init__(self, num_inputs, recurrent=False, hidden_size=512, output_layer=nn.ReLU): 138 | super(CNNBase, self).__init__(recurrent, hidden_size, hidden_size) 139 | 140 | init_ = lambda m: init(m, 141 | nn.init.orthogonal_, 142 | lambda x: nn.init.constant_(x, 0), 143 | nn.init.calculate_gain('relu')) 144 | 145 | self.main = nn.Sequential( 146 | init_(nn.Conv2d(num_inputs, 32, 8, stride=4)), 147 | output_layer(), 148 | init_(nn.Conv2d(32, 64, 4, stride=2)), 149 | output_layer(), 150 | init_(nn.Conv2d(64, 32, 3, stride=1)), 151 | output_layer(), 152 | Flatten(), 153 | init_(nn.Linear(32 * 7 * 7, hidden_size)), 154 | output_layer() 155 | ) 156 | 157 | init_ = lambda m: init(m, 158 | nn.init.orthogonal_, 159 | lambda x: nn.init.constant_(x, 0)) 160 | 161 | self.critic_linear = init_(nn.Linear(hidden_size, 1)) 162 | 163 | self.train() 164 | 165 | def forward(self, inputs, rnn_hxs, masks): 166 | x = self.main(inputs / 255.0) 167 | 168 | if self.is_recurrent: 169 | x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks) 170 | 171 | return self.critic_linear(x), x, rnn_hxs 172 | 173 | 174 | class MLPBase(NNBase): 175 | def __init__(self, num_inputs, recurrent=False, hidden_size=64, activation_layer=nn.Tanh): 176 | super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size) 177 | 178 | if recurrent: 179 | num_inputs = hidden_size 180 | 181 | init_ = lambda m: init(m, 182 | init_normc_, 183 | lambda x: nn.init.constant_(x, 0)) 184 | 185 | self.actor = nn.Sequential( 186 | init_(nn.Linear(num_inputs, hidden_size*2)), 187 | activation_layer(), 188 | init_(nn.Linear(hidden_size*2, hidden_size)), 189 | activation_layer() 190 | ) 191 | 192 | self.critic = nn.Sequential( 193 | init_(nn.Linear(num_inputs, hidden_size*2)), 194 | activation_layer(), 195 | init_(nn.Linear(hidden_size*2, hidden_size)), 196 | activation_layer() 197 | ) 198 | 199 | self.critic_linear = init_(nn.Linear(hidden_size, 1)) 200 | 201 | self.train() 202 | 203 | def forward(self, inputs, rnn_hxs, masks): 204 | x = inputs 205 | 206 | if self.is_recurrent: 207 | x, rnn_hxs = self._forward_gru(x, rnn_hxs, masks) 208 | 209 | hidden_critic = self.critic(x) 210 | hidden_actor = 
self.actor(x) 211 | 212 | return self.critic_linear(hidden_critic), hidden_actor, rnn_hxs 213 | -------------------------------------------------------------------------------- /monitor.py: -------------------------------------------------------------------------------- 1 | import time 2 | from baselines.bench import Monitor as _Monitor 3 | 4 | 5 | class Monitor(_Monitor): 6 | 7 | def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()): 8 | super(Monitor, self).__init__( 9 | env, filename, 10 | allow_early_resets=allow_early_resets, 11 | reset_keywords=reset_keywords, info_keywords=info_keywords) 12 | self.benchmark_rewards = [] 13 | self.episode_benchmark_rewards = [] 14 | if 'rb' in info_keywords: 15 | self.do_benchmark = True 16 | else: 17 | self.do_benchmark = False 18 | 19 | def reset_state(self): 20 | self.benchmark_rewards = [] 21 | super(Monitor, self).reset_state() 22 | 23 | def update(self, ob, rew, done, info): 24 | if self.do_benchmark and 'rb' in info: 25 | self.benchmark_rewards.append(info['rb']) 26 | self.rewards.append(rew) 27 | if done: 28 | self.needs_reset = True 29 | eprewb = sum(self.benchmark_rewards) 30 | eprew = sum(self.rewards) 31 | eplen = len(self.rewards) 32 | epinfo = { 33 | "r": round(eprew, 6), 34 | "l": eplen, 35 | "t": round(time.time() - self.tstart, 6)} 36 | for k in self.info_keywords: 37 | epinfo[k] = info[k] 38 | if self.do_benchmark: 39 | epinfo["rb"] = eprewb # overwrite with episode benchmark 40 | self.episode_benchmark_rewards.append(eprewb) 41 | self.episode_rewards.append(eprew) 42 | self.episode_lengths.append(eplen) 43 | self.episode_times.append(time.time() - self.tstart) 44 | epinfo.update(self.current_reset_info) 45 | self.results_writer.write_row(epinfo) 46 | 47 | if isinstance(info, dict): 48 | info['episode'] = epinfo 49 | 50 | self.total_steps += 1 51 | -------------------------------------------------------------------------------- /my_prosthetics_env.py: -------------------------------------------------------------------------------- 1 | from osim.env import ProstheticsEnv 2 | import random 3 | import numpy as np 4 | import math 5 | import os 6 | import time 7 | from collections import deque 8 | 9 | 10 | PROJ_FULL = 0 11 | PROJ_NORMAL = 1 12 | PROJ_SIMPLE = 2 13 | 14 | 15 | ## Values in the observation vector 16 | # y, vx, vy, ax, ay, rz, vrz, arz of pelvis (10 values) 17 | # x, y, vx, vy, ax, ay, rz, vrz, arz of head, torso, toes_l, toes_r, talus_l, talus_r (12*6 values) 18 | # rz, vrz, arz of ankle_l, ankle_r, back, hip_l, hip_r, knee_l, knee_r (7*3 values) 19 | # activation, fiber_len, fiber_vel for all muscles (3*18) 20 | # x, y, vx, vy, ax, ay ofg center of mass (6) 21 | # 8 + 9*6 + 8*3 + 3*18 + 6 = 146 22 | def project_obs(state_desc, proj=PROJ_FULL, prosthetic=True): 23 | res = [] 24 | 25 | if proj == PROJ_SIMPLE: 26 | pelvis = state_desc["body_pos"]["pelvis"][0:3] 27 | # pelvis_vel = state_desc["body_vel"]["pelvis"][0:3] 28 | # pelvis_acc = state_desc["body_acc"]["pelvis"][0:3] 29 | res += pelvis[1:2] # + pelvis_vel[:] + pelvis_acc[:] 30 | for bp in ["talus_l", "pros_foot_r"]: 31 | bp_pos = state_desc["body_pos"][bp].copy() 32 | bp_pos[0] = bp_pos[0] - pelvis[0] 33 | bp_pos[2] = bp_pos[2] - pelvis[2] 34 | res += bp_pos 35 | else: 36 | pelvis = None 37 | for body_part in ["pelvis", "head", "torso", "toes_l", "toes_r", "talus_l", "talus_r"]: 38 | if prosthetic and body_part in ["toes_r", "talus_r"]: 39 | if proj == PROJ_FULL: 40 | res += [0] * 12 41 | continue 42 | cur = [] 43 | cur += 
state_desc["body_pos"][body_part][0:3] 44 | cur += state_desc["body_vel"][body_part][0:3] 45 | cur += state_desc["body_acc"][body_part][0:3] 46 | cur += state_desc["body_pos_rot"][body_part][2:] 47 | cur += state_desc["body_vel_rot"][body_part][2:] 48 | cur += state_desc["body_acc_rot"][body_part][2:] 49 | if body_part == "pelvis": 50 | pelvis = cur.copy() 51 | res += pelvis[1:2] + pelvis[3:] 52 | else: 53 | cur_upd = cur.copy() 54 | cur_upd[:3] = [cur[i] - pelvis[i] for i in range(3)] 55 | cur_upd[9:10] = [cur[i] - pelvis[i] for i in range(9, 10)] 56 | res += cur_upd 57 | 58 | for joint in ["ankle_l", "ankle_r", "back", "hip_l", "hip_r", "knee_l", "knee_r"]: 59 | res += state_desc["joint_pos"][joint] 60 | res += state_desc["joint_vel"][joint] 61 | res += state_desc["joint_acc"][joint] 62 | 63 | for muscle in sorted(state_desc["muscles"].keys()): 64 | res += [state_desc["muscles"][muscle]["activation"]] 65 | res += [state_desc["muscles"][muscle]["fiber_length"]] 66 | res += [state_desc["muscles"][muscle]["fiber_velocity"]] 67 | 68 | cm_pos = [state_desc["misc"]["mass_center_pos"][i] - pelvis[i] for i in range(3)] 69 | cm_vel = state_desc["misc"]["mass_center_vel"] 70 | cm_acc = state_desc["misc"]["mass_center_acc"] 71 | res = res + cm_pos + cm_vel + cm_acc 72 | 73 | return np.array(res) 74 | 75 | 76 | class MyProstheticsEnv(ProstheticsEnv): 77 | 78 | def __init__(self, visualize=False, integrator_accuracy=1e-4, difficulty=0, seed=0, frame_skip=0): 79 | self.project_mode = PROJ_FULL 80 | super(MyProstheticsEnv, self).__init__( 81 | visualize=visualize, 82 | integrator_accuracy=integrator_accuracy, 83 | difficulty=difficulty, 84 | seed=seed) 85 | if difficulty == 0: 86 | self.time_limit = 600 # longer time limit to reduce likelihood of diving strategy 87 | self.spec.timestep_limit = self.time_limit 88 | np.random.seed(seed) 89 | self.frame_times = deque(maxlen=100) 90 | self.frame_count = 0 91 | self.frame_skip = frame_skip 92 | self.debug = False 93 | 94 | def get_observation(self): 95 | state_desc = self.get_state_desc() 96 | return project_obs(state_desc, proj=self.project_mode, prosthetic=self.prosthetic) 97 | 98 | def get_observation_space_size(self): 99 | if self.prosthetic: 100 | if self.project_mode == PROJ_SIMPLE: 101 | return 106 102 | elif self.project_mode == PROJ_FULL: 103 | return 181 104 | else: 105 | return 157 106 | return 167 107 | 108 | def is_done(self): 109 | state_desc = self.get_state_desc() 110 | return state_desc["body_pos"]["pelvis"][1] < 0.65 111 | 112 | def my_reward_round1(self): 113 | state_desc = self.get_state_desc() 114 | prev_state_desc = self.get_prev_state_desc() 115 | if not prev_state_desc: 116 | return 0 117 | 118 | penalty = 0. 119 | penalty += (state_desc["body_vel"]["pelvis"][0] - 3.0) ** 2 120 | penalty += (state_desc["body_vel"]["pelvis"][2]) ** 2 121 | penalty += np.sum(np.array(self.osim_model.get_activations()) ** 2) * 0.001 122 | if state_desc["body_pos"]["pelvis"][1] < 0.70: 123 | penalty += 10 # penalize falling more 124 | 125 | # Reward for not falling 126 | reward = 10.0 127 | 128 | return reward - penalty 129 | 130 | def my_reward_round2(self): 131 | state_desc = self.get_state_desc() 132 | prev_state_desc = self.get_prev_state_desc() 133 | penalty = 0 134 | 135 | # Small penalty for too much activation (cost of transport) 136 | penalty += np.sum(np.array(self.osim_model.get_activations()) ** 2) * 0.001 137 | 138 | # Big penalty for not matching the vector on the X,Z projection. 
139 | # No penalty for the vertical axis 140 | penalty += (state_desc["body_vel"]["pelvis"][0] - state_desc["target_vel"][0]) ** 2 141 | penalty += (state_desc["body_vel"]["pelvis"][2] - state_desc["target_vel"][2]) ** 2 142 | if state_desc["body_pos"]["pelvis"][1] < 0.70: 143 | penalty += 10 # penalize falling more 144 | 145 | # Reward for not falling 146 | reward = 10.0 147 | 148 | return reward - penalty 149 | 150 | def reward_round1(self): 151 | state_desc = self.get_state_desc() 152 | prev_state_desc = self.get_prev_state_desc() 153 | if not prev_state_desc: 154 | return 0 155 | return 9.0 - (state_desc["body_vel"]["pelvis"][0] - 3.0)**2 156 | 157 | def reward_round2(self): 158 | state_desc = self.get_state_desc() 159 | prev_state_desc = self.get_prev_state_desc() 160 | penalty = 0 161 | 162 | # Small penalty for too much activation (cost of transport) 163 | penalty += np.sum(np.array(self.osim_model.get_activations()) ** 2) * 0.001 164 | 165 | # Big penalty for not matching the vector on the X,Z projection. 166 | # No penalty for the vertical axis 167 | penalty += (state_desc["body_vel"]["pelvis"][0] - state_desc["target_vel"][0]) ** 2 168 | penalty += (state_desc["body_vel"]["pelvis"][2] - state_desc["target_vel"][2]) ** 2 169 | 170 | # Reward for not falling 171 | reward = 10.0 172 | 173 | return reward - penalty 174 | 175 | def reward(self): 176 | if self.difficulty == 0: 177 | return self.reward_round1() 178 | return self.reward_round2() 179 | 180 | def my_reward(self): 181 | if self.difficulty == 0: 182 | return self.my_reward_round1() 183 | return self.my_reward_round2() 184 | 185 | def step(self, action, project=True): 186 | reward = 0. 187 | rewardb = 0. 188 | done = False 189 | 190 | if self.frame_skip: 191 | num_steps = self.frame_skip 192 | else: 193 | num_steps = 1 194 | 195 | for _ in range(num_steps): 196 | self.prev_state_desc = self.get_state_desc() 197 | 198 | start_time = time.perf_counter() 199 | self.osim_model.actuate(action) 200 | self.osim_model.integrate() 201 | step_time = time.perf_counter() - start_time 202 | 203 | # track some step stats across resets 204 | self.frame_times.append(step_time) 205 | self.frame_count += 1 206 | 207 | if self.debug and self.frame_count % 1000 == 0: 208 | frame_mean = np.mean(self.frame_times) 209 | frame_min = np.min(self.frame_times) 210 | frame_max = np.max(self.frame_times) 211 | print('Steps {}, duration mean, min, max: {:.3f}, {:.3f}, {:.3f}'.format( 212 | self.frame_count, frame_mean, frame_min, frame_max)) 213 | 214 | done = self.is_done() or self.osim_model.istep >= self.spec.timestep_limit 215 | if step_time > 15.: 216 | reward += -10 217 | done = True 218 | else: 219 | reward += self.my_reward() 220 | rewardb += self.reward() 221 | 222 | if done: 223 | break 224 | 225 | if project: 226 | obs = self.get_observation() 227 | else: 228 | obs = self.get_state_desc() 229 | 230 | return [obs, reward, done, {'rb': rewardb}] 231 | 232 | def seed(self, seed=None): 233 | random.seed(seed) 234 | np.random.seed(seed) 235 | -------------------------------------------------------------------------------- /replay_storage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import math 4 | import random 5 | from collections import deque 6 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 7 | 8 | 9 | class ReplayStorage: 10 | def __init__( 11 | self, max_steps, num_processes, gamma, prio_alpha, 12 | obs_shape, action_space, 
recurrent_hidden_state_size, 13 | device): 14 | self.max_steps = max_steps 15 | self.num_processes = num_processes 16 | self.gamma = gamma 17 | self.device = device 18 | 19 | # stored episode data 20 | self.obs = torch.zeros(max_steps, *obs_shape) 21 | self.recurrent_hidden_states = torch.zeros(max_steps, recurrent_hidden_state_size) 22 | self.returns = torch.zeros(max_steps, 1) 23 | if action_space.__class__.__name__ == 'Discrete': 24 | self.actions = torch.zeros(max_steps, 1).long() 25 | else: 26 | self.actions = torch.zeros(max_steps, action_space.shape[0]) 27 | self.masks = torch.ones(max_steps, 1) 28 | self.next_idx = 0 29 | self.num_steps = 0 30 | 31 | # store (full) episode stats 32 | self.episode_step_count = 0 33 | self.episode_rewards = deque() 34 | self.episode_steps = deque() 35 | 36 | # currently running (accumulating) episodes 37 | self.running_episodes = [[] for _ in range(num_processes)] 38 | 39 | if prio_alpha > 0: 40 | """ 41 | Sampling priority is enabled if prio_alpha > 0 42 | Priority algorithm ripped from OpenAI Baselines 43 | https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py 44 | """ 45 | self.prio_alpha = prio_alpha 46 | tree_capacity = 1 << math.ceil(math.log2(self.max_steps)) 47 | self.prio_sum_tree = SumSegmentTree(tree_capacity) 48 | self.prio_min_tree = MinSegmentTree(tree_capacity) 49 | self.prio_max = 1.0 50 | else: 51 | self.prio_alpha = 0 52 | 53 | def _process_rewards(self, trajectory): 54 | has_positive = False 55 | reward_sum = 0. 56 | r = 0. 57 | for t in trajectory[::-1]: 58 | reward = t['reward'] 59 | reward_sum += reward 60 | if reward > (0. + 1e-5): 61 | has_positive = True 62 | r = reward + self.gamma*r 63 | t['return'] = r 64 | return has_positive, reward_sum 65 | 66 | def _add_trajectory(self, trajectory): 67 | has_positive, reward_sum = self._process_rewards(trajectory) 68 | if not has_positive: 69 | return 70 | trajectory_len = len(trajectory) 71 | prev_idx = self.next_idx 72 | for transition in trajectory: 73 | self.obs[self.next_idx].copy_(transition['obs']) 74 | self.recurrent_hidden_states[self.next_idx].copy_(transition['rhs']) 75 | self.actions[self.next_idx].copy_(transition['action']) 76 | self.returns[self.next_idx].copy_(transition['return']) 77 | self.masks[self.next_idx] = 1.0 78 | prev_idx = self.next_idx 79 | if self.prio_alpha: 80 | self.prio_sum_tree[self.next_idx] = self.prio_max ** self.prio_alpha 81 | self.prio_min_tree[self.next_idx] = self.prio_max ** self.prio_alpha 82 | self.next_idx = (self.next_idx + 1) % self.max_steps 83 | self.num_steps = min(self.max_steps, self.num_steps + 1) 84 | self.masks[prev_idx] = 0.0 85 | 86 | # update stats of stored full trajectories (episodes) 87 | while self.episode_step_count + trajectory_len > self.max_steps: 88 | steps_popped = self.episode_steps.popleft() 89 | self.episode_rewards.popleft() 90 | self.episode_step_count -= steps_popped 91 | self.episode_step_count += trajectory_len 92 | self.episode_steps.append(trajectory_len) 93 | self.episode_rewards.append(reward_sum) 94 | 95 | def _sample_proportional(self, sample_size): 96 | res = [] 97 | for _ in range(sample_size): 98 | mass = random.random() * self.prio_sum_tree.sum(0, self.num_steps - 1) 99 | idx = self.prio_sum_tree.find_prefixsum_idx(mass) 100 | res.append(idx) 101 | return res 102 | 103 | def insert(self, obs, rhs, actions, rewards, dones): 104 | for n in range(self.num_processes): 105 | self.running_episodes[n].append(dict( 106 | obs=obs[n].clone(), 107 | rhs=rhs[n].clone(), 108 | 
action=actions[n].clone(), 109 | reward=rewards[n].clone() 110 | )) 111 | for n, done in enumerate(dones): 112 | if done: 113 | self._add_trajectory(self.running_episodes[n]) 114 | self.running_episodes[n] = [] 115 | 116 | def update_priorities(self, indices, priorities): 117 | if not self.prio_alpha: 118 | return 119 | 120 | """Update priorities of sampled transitions. 121 | sets priority of transition at index indices[i] in buffer 122 | to priorities[i]. 123 | Parameters 124 | ---------- 125 | indices: [int] 126 | List of indices of sampled transitions 127 | priorities: [float] 128 | List of updated priorities corresponding to 129 | transitions at the sampled indices. 130 | """ 131 | assert len(indices) == len(priorities) 132 | for idx, priority in zip(indices, priorities): 133 | priority = max(priority, 1e-6) 134 | assert priority > 0 135 | assert 0 <= idx < self.num_steps 136 | self.prio_sum_tree[idx] = priority ** self.prio_alpha 137 | self.prio_min_tree[idx] = priority ** self.prio_alpha 138 | 139 | self.prio_max = max(self.prio_max, priority) 140 | 141 | def feed_forward_generator(self, batch_size, num_batches=None, beta=0.): 142 | """Generate batches of sampled experiences. 143 | 144 | Parameters 145 | ---------- 146 | batch_size: int 147 | Size of each sampled batch 148 | num_batches: int 149 | Number of batches to sample 150 | beta: float 151 | To what degree to use importance weights 152 | (0 - no corrections, 1 - full correction) 153 | """ 154 | 155 | batch_count = 0 156 | sample_size = num_batches * batch_size or self.num_steps 157 | 158 | if self.prio_alpha > 0: 159 | indices = self._sample_proportional(sample_size) 160 | if beta > 0: 161 | # compute importance sampling weights to correct for the 162 | # bias introduced by sampling in a non-uniform manner 163 | weights = [] 164 | p_min = self.prio_min_tree.min() / self.prio_sum_tree.sum() 165 | max_weight = (p_min * self.num_steps) ** (-beta) 166 | for i in indices: 167 | p_sample = self.prio_sum_tree[i] / self.prio_sum_tree.sum() 168 | weight = (p_sample * self.num_steps) ** (-beta) 169 | weights.append(weight / max_weight) 170 | weights = torch.tensor(weights, dtype=torch.float32).unsqueeze(1) 171 | else: 172 | weights = torch.ones((len(indices), 1), dtype=torch.float32) 173 | else: 174 | if sample_size * 3 < self.num_steps: 175 | indices = random.sample(range(self.num_steps), sample_size) 176 | else: 177 | indices = np.random.permutation(self.num_steps)[:sample_size] 178 | weights = None 179 | 180 | for si in range(0, len(indices), batch_size): 181 | indices_batch = indices[si:min(len(indices), si + batch_size)] 182 | if len(indices_batch) < batch_size: 183 | return 184 | 185 | weights_batch = None if weights is None else \ 186 | weights[si:min(len(indices), si + batch_size)].to(self.device) 187 | 188 | obs_batch = self.obs[indices_batch].to(self.device) 189 | recurrent_hidden_states_batch = self.recurrent_hidden_states[indices_batch].to(self.device) 190 | actions_batch = self.actions[indices_batch].to(self.device) 191 | returns_batch = self.returns[indices_batch].to(self.device) 192 | masks_batch = self.masks[indices_batch].to(self.device) 193 | 194 | yield obs_batch, recurrent_hidden_states_batch, actions_batch, returns_batch, \ 195 | masks_batch, weights_batch, indices_batch 196 | 197 | batch_count += 1 198 | if num_batches and batch_count >= num_batches: 199 | return 200 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | gym 2 | matplotlib 3 | pybullet 4 | -------------------------------------------------------------------------------- /rollout_storage.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 3 | 4 | 5 | def _flatten_helper(T, N, _tensor): 6 | return _tensor.view(T * N, *_tensor.size()[2:]) 7 | 8 | 9 | class RolloutStorage(object): 10 | def __init__(self, num_steps, num_processes, obs_shape, action_space, recurrent_hidden_state_size): 11 | self.obs = torch.zeros(num_steps + 1, num_processes, *obs_shape) 12 | self.recurrent_hidden_states = torch.zeros(num_steps + 1, num_processes, recurrent_hidden_state_size) 13 | self.rewards = torch.zeros(num_steps, num_processes, 1) 14 | self.value_preds = torch.zeros(num_steps + 1, num_processes, 1) 15 | self.returns = torch.zeros(num_steps + 1, num_processes, 1) 16 | self.action_log_probs = torch.zeros(num_steps, num_processes, 1) 17 | if action_space.__class__.__name__ == 'Discrete': 18 | action_shape = 1 19 | else: 20 | action_shape = action_space.shape[0] 21 | self.actions = torch.zeros(num_steps, num_processes, action_shape) 22 | if action_space.__class__.__name__ == 'Discrete': 23 | self.actions = self.actions.long() 24 | self.masks = torch.ones(num_steps + 1, num_processes, 1) 25 | 26 | self.num_steps = num_steps 27 | self.step = 0 28 | 29 | def to(self, device): 30 | self.obs = self.obs.to(device) 31 | self.recurrent_hidden_states = self.recurrent_hidden_states.to(device) 32 | self.rewards = self.rewards.to(device) 33 | self.value_preds = self.value_preds.to(device) 34 | self.returns = self.returns.to(device) 35 | self.action_log_probs = self.action_log_probs.to(device) 36 | self.actions = self.actions.to(device) 37 | self.masks = self.masks.to(device) 38 | 39 | def insert(self, obs, recurrent_hidden_states, actions, action_log_probs, value_preds, rewards, masks): 40 | self.obs[self.step + 1].copy_(obs) 41 | self.recurrent_hidden_states[self.step + 1].copy_(recurrent_hidden_states) 42 | self.actions[self.step].copy_(actions) 43 | self.action_log_probs[self.step].copy_(action_log_probs) 44 | self.value_preds[self.step].copy_(value_preds) 45 | self.rewards[self.step].copy_(rewards) 46 | self.masks[self.step + 1].copy_(masks) 47 | 48 | self.step = (self.step + 1) % self.num_steps 49 | 50 | def after_update(self): 51 | self.obs[0].copy_(self.obs[-1]) 52 | self.recurrent_hidden_states[0].copy_(self.recurrent_hidden_states[-1]) 53 | self.masks[0].copy_(self.masks[-1]) 54 | 55 | def compute_returns(self, next_value, use_gae, gamma, tau): 56 | if use_gae: 57 | self.value_preds[-1] = next_value 58 | gae = 0 59 | for step in reversed(range(self.rewards.size(0))): 60 | delta = self.rewards[step] + gamma * self.value_preds[step + 1] * self.masks[step + 1] - self.value_preds[step] 61 | gae = delta + gamma * tau * self.masks[step + 1] * gae 62 | self.returns[step] = gae + self.value_preds[step] 63 | else: 64 | self.returns[-1] = next_value 65 | for step in reversed(range(self.rewards.size(0))): 66 | self.returns[step] = self.returns[step + 1] * \ 67 | gamma * self.masks[step + 1] + self.rewards[step] 68 | 69 | 70 | def feed_forward_generator(self, advantages, num_mini_batch): 71 | num_steps, num_processes = self.rewards.size()[0:2] 72 | batch_size = num_processes * num_steps 73 | assert batch_size >= num_mini_batch, ( 74 | "PPO requires the number of 
processes ({}) " 75 | "* number of steps ({}) = {} " 76 | "to be greater than or equal to the number of PPO mini batches ({})." 77 | "".format(num_processes, num_steps, num_processes * num_steps, num_mini_batch)) 78 | mini_batch_size = batch_size // num_mini_batch 79 | sampler = BatchSampler(SubsetRandomSampler(range(batch_size)), mini_batch_size, drop_last=False) 80 | for indices in sampler: 81 | obs_batch = self.obs[:-1].view(-1, *self.obs.size()[2:])[indices] 82 | recurrent_hidden_states_batch = self.recurrent_hidden_states[:-1].view(-1, 83 | self.recurrent_hidden_states.size(-1))[indices] 84 | actions_batch = self.actions.view(-1, self.actions.size(-1))[indices] 85 | return_batch = self.returns[:-1].view(-1, 1)[indices] 86 | masks_batch = self.masks[:-1].view(-1, 1)[indices] 87 | old_action_log_probs_batch = self.action_log_probs.view(-1, 1)[indices] 88 | adv_targ = advantages.view(-1, 1)[indices] 89 | 90 | yield obs_batch, recurrent_hidden_states_batch, actions_batch, \ 91 | return_batch, masks_batch, old_action_log_probs_batch, adv_targ 92 | 93 | def recurrent_generator(self, advantages, num_mini_batch): 94 | num_processes = self.rewards.size(1) 95 | assert num_processes >= num_mini_batch, ( 96 | "PPO requires the number of processes ({}) " 97 | "to be greater than or equal to the number of " 98 | "PPO mini batches ({}).".format(num_processes, num_mini_batch)) 99 | num_envs_per_batch = num_processes // num_mini_batch 100 | perm = torch.randperm(num_processes) 101 | for start_ind in range(0, num_processes, num_envs_per_batch): 102 | obs_batch = [] 103 | recurrent_hidden_states_batch = [] 104 | actions_batch = [] 105 | return_batch = [] 106 | masks_batch = [] 107 | old_action_log_probs_batch = [] 108 | adv_targ = [] 109 | 110 | for offset in range(num_envs_per_batch): 111 | ind = perm[start_ind + offset] 112 | obs_batch.append(self.obs[:-1, ind]) 113 | recurrent_hidden_states_batch.append(self.recurrent_hidden_states[0:1, ind]) 114 | actions_batch.append(self.actions[:, ind]) 115 | return_batch.append(self.returns[:-1, ind]) 116 | masks_batch.append(self.masks[:-1, ind]) 117 | old_action_log_probs_batch.append(self.action_log_probs[:, ind]) 118 | adv_targ.append(advantages[:, ind]) 119 | 120 | T, N = self.num_steps, num_envs_per_batch 121 | # These are all tensors of size (T, N, -1) 122 | obs_batch = torch.stack(obs_batch, 1) 123 | actions_batch = torch.stack(actions_batch, 1) 124 | return_batch = torch.stack(return_batch, 1) 125 | masks_batch = torch.stack(masks_batch, 1) 126 | old_action_log_probs_batch = torch.stack(old_action_log_probs_batch, 1) 127 | adv_targ = torch.stack(adv_targ, 1) 128 | 129 | # States is just a (N, -1) tensor 130 | recurrent_hidden_states_batch = torch.stack(recurrent_hidden_states_batch, 1).view(N, -1) 131 | 132 | # Flatten the (T, N, ...) tensors to (T * N, ...) 
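# (Added note, not in the original.) _flatten_helper merges the time and
# process dimensions so every (t, n) sample becomes an independent row:
#   >>> x = torch.zeros(128, 4, 19)        # (T=128, N=4); feature dim 19 is arbitrary
#   >>> _flatten_helper(128, 4, x).shape
#   torch.Size([512, 19])
# The recurrent hidden states above stay (N, -1) because only the initial
# hidden state of each sequence is needed to unroll the GRU over the T steps.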
133 | obs_batch = _flatten_helper(T, N, obs_batch) 134 | actions_batch = _flatten_helper(T, N, actions_batch) 135 | return_batch = _flatten_helper(T, N, return_batch) 136 | masks_batch = _flatten_helper(T, N, masks_batch) 137 | old_action_log_probs_batch = _flatten_helper(T, N, \ 138 | old_action_log_probs_batch) 139 | adv_targ = _flatten_helper(T, N, adv_targ) 140 | 141 | yield obs_batch, recurrent_hidden_states_batch, actions_batch, \ 142 | return_batch, masks_batch, old_action_log_probs_batch, adv_targ 143 | -------------------------------------------------------------------------------- /submit.py: -------------------------------------------------------------------------------- 1 | import opensim as osim 2 | from osim.http.client import Client 3 | from osim.env import ProstheticsEnv 4 | import numpy as np 5 | import argparse 6 | import os 7 | import gym 8 | import torch 9 | 10 | from envs import VecPyTorch, make_vec_envs 11 | from utils import get_render_func, get_vec_normalize 12 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 13 | from envs import VecNormalize, VecPyTorch 14 | from my_prosthetics_env import MyProstheticsEnv, project_obs 15 | 16 | 17 | class StopTheSim(Exception): 18 | pass 19 | 20 | 21 | class ClientWrapper(MyProstheticsEnv): 22 | 23 | def __init__(self, client, token): 24 | super(ClientWrapper, self).__init__( 25 | visualize=False, 26 | integrator_accuracy=1e-4, 27 | difficulty=0, 28 | seed=42) 29 | self.client = client 30 | self._cached_observation = self.client.env_create(token, env_id="ProstheticsEnv") 31 | print(self._cached_observation) 32 | self.step_count = 0 33 | 34 | def step(self, action, project=True): 35 | print('Step: ', self.step_count, end='. ') 36 | obs, reward, done, info = self.client.env_step(action.tolist()) 37 | if obs is not None and 'body_pos' in obs: 38 | print('Pelvis: ', obs['body_pos']['pelvis']) 39 | elif obs is None: 40 | print('Invalid obs.') 41 | return None, None, True, None 42 | self.step_count += 1 43 | proj_obs = project_obs(obs, self.project_mode, self.prosthetic) 44 | return proj_obs, reward, done, info 45 | 46 | def reset(self, project=True): 47 | print('Reset') 48 | if self._cached_observation is not None: 49 | print('Returning cached') 50 | obs = self._cached_observation 51 | self._cached_observation = None 52 | else: 53 | obs = self.client.env_reset() 54 | self.step_count = 0 55 | if obs is None: 56 | raise StopTheSim 57 | return project_obs(obs, self.project_mode, self.prosthetic) 58 | 59 | def close(self): 60 | return self.client.env_close() 61 | 62 | 63 | # Command line parameters 64 | parser = argparse.ArgumentParser(description='Submit the result to crowdAI') 65 | parser.add_argument('--token', dest='token', action='store', required=True) 66 | parser.add_argument('--env-name', default='PongNoFrameskip-v4', 67 | help='environment to train on (default: PongNoFrameskip-v4)') 68 | parser.add_argument('--load-path', default='', 69 | help='directory to save agent logs (default: ') 70 | args = parser.parse_args() 71 | 72 | remote_base = 'http://grader.crowdai.org:1729' # Submission to Round-1 73 | # remote_base = 'http://grader.crowdai.org:1730' # Submission to Round-2 74 | client = Client(remote_base) 75 | 76 | 77 | def create_env(): 78 | env = ClientWrapper(client=client, token=args.token) 79 | return env 80 | 81 | env = DummyVecEnv([create_env]) 82 | env = VecNormalize(env, ret=False) 83 | env = VecPyTorch(env, 'cpu') 84 | 85 | # We need to use the same statistics for normalization as used in training 86 | 
actor_critic, ob_rms = torch.load(args.load_path) 87 | actor_critic.eval() 88 | 89 | vec_norm = get_vec_normalize(env) 90 | if vec_norm is not None: 91 | vec_norm.eval() 92 | vec_norm.ob_rms = ob_rms 93 | 94 | recurrent_hidden_states = torch.zeros(1, actor_critic.recurrent_hidden_state_size) 95 | masks = torch.zeros(1, 1) 96 | 97 | # Create environment 98 | 99 | ref_env = ProstheticsEnv() 100 | 101 | obs = env.reset() 102 | 103 | # Run a single step 104 | # The grader runs 3 simulations of at most 1000 steps each. We stop after the last one 105 | count = 0 106 | num_steps = 0 107 | while True: 108 | with torch.no_grad(): 109 | value, action, _, recurrent_hidden_states = actor_critic.act( 110 | obs, recurrent_hidden_states, masks, deterministic=True) 111 | 112 | clipped_action = action 113 | if isinstance(ref_env.action_space, gym.spaces.Box): 114 | clipped_action = torch.max(torch.min( 115 | clipped_action, torch.from_numpy(ref_env.action_space.high)), 116 | torch.from_numpy(ref_env.action_space.low)) 117 | 118 | try: 119 | obs, reward, done, info = env.step(clipped_action) 120 | num_steps += 1 121 | if done: 122 | print('Done after %d steps.' % num_steps) 123 | num_steps = 0 124 | count += 1 125 | except StopTheSim: 126 | print('Finishing.') 127 | break 128 | 129 | client.submit() 130 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | from envs import VecNormalize 6 | 7 | 8 | # Get a render function 9 | def get_render_func(venv): 10 | if hasattr(venv, 'envs'): 11 | return venv.envs[0].render 12 | elif hasattr(venv, 'venv'): 13 | return get_render_func(venv.venv) 14 | elif hasattr(venv, 'env'): 15 | return get_render_func(venv.env) 16 | 17 | return None 18 | 19 | 20 | def get_vec_normalize(venv): 21 | if isinstance(venv, VecNormalize): 22 | return venv 23 | elif hasattr(venv, 'venv'): 24 | return get_vec_normalize(venv.venv) 25 | 26 | return None 27 | 28 | 29 | # Necessary for my KFAC implementation. 
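# (Added sketch.) AddBias keeps the bias as its own tiny module rather than
# folding it into nn.Linear, so the KFAC optimizer can treat it as a separate
# layer. Shape-wise it simply broadcasts a learned vector over the batch:
#   >>> logstd = AddBias(torch.zeros(19))      # 19 is an arbitrary action dim
#   >>> logstd(torch.zeros(8, 19)).shape
#   torch.Size([8, 19])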
30 | class AddBias(nn.Module): 31 | def __init__(self, bias): 32 | super(AddBias, self).__init__() 33 | self._bias = nn.Parameter(bias.unsqueeze(1)) 34 | 35 | def forward(self, x): 36 | if x.dim() == 2: 37 | bias = self._bias.t().view(1, -1) 38 | else: 39 | bias = self._bias.t().view(1, -1, 1, 1) 40 | 41 | return x + bias 42 | 43 | 44 | def init(module, weight_init, bias_init, gain=1): 45 | weight_init(module.weight.data, gain=gain) 46 | bias_init(module.bias.data) 47 | return module 48 | 49 | 50 | # https://github.com/openai/baselines/blob/master/baselines/common/tf_util.py#L87 51 | def init_normc_(weight, gain=1): 52 | weight.normal_(0, 1) 53 | weight *= gain / torch.sqrt(weight.pow(2).sum(1, keepdim=True)) 54 | 55 | 56 | 57 | class _Schedule: 58 | def __init__(self, initial_value, gamma=0.1, last_epoch=-1): 59 | self.gamma = gamma 60 | self.initial_value = initial_value 61 | self.last_epoch = last_epoch 62 | self.step() 63 | 64 | def get(self): 65 | return self.initial_value 66 | 67 | def step(self, epoch=None): 68 | if epoch is None: 69 | self.last_epoch += 1 70 | else: 71 | self.last_epoch = epoch 72 | return self.get() 73 | 74 | 75 | class StepSchedule(_Schedule): 76 | 77 | def __init__(self, initial_value, step_size, gamma=0.1, last_epoch=-1): 78 | self.step_size = step_size 79 | super(StepSchedule, self).__init__(initial_value, gamma, last_epoch) 80 | 81 | def get(self): 82 | return self.initial_value * self.gamma ** (self.last_epoch // self.step_size) 83 | 84 | 85 | class ExpSchedule(_Schedule): 86 | 87 | def __init__(self, initial_value, step_size, gamma=0.1, last_epoch=-1): 88 | self.step_size = step_size 89 | super(ExpSchedule, self).__init__(initial_value, gamma, last_epoch) 90 | 91 | def get(self): 92 | return self.initial_value * self.gamma ** (self.last_epoch / self.step_size) 93 | 94 | 95 | class NatExpSchedule(_Schedule): 96 | 97 | def __init__(self, initial_value, step_size, gamma=0.1, last_epoch=-1): 98 | self.step_size = step_size 99 | super(NatExpSchedule, self).__init__(initial_value, gamma, last_epoch) 100 | 101 | def get(self): 102 | return self.initial_value * math.exp(-self.gamma * (self.last_epoch / self.step_size)) 103 | -------------------------------------------------------------------------------- /visualize.py: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/emansim/baselines-mansimov/blob/master/baselines/a2c/visualize_atari.py 2 | # and https://github.com/emansim/baselines-mansimov/blob/master/baselines/a2c/load.py 3 | # Thanks to the author and OpenAI team! 
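# (Added usage note.) Typical standalone use, assuming a visdom server is
# already running and *.monitor.csv logs exist under the log folder:
#   >>> from visdom import Visdom
#   >>> viz = Visdom()
#   >>> win = visdom_plot(viz, None, '/tmp/gym/', 'Prosthetics', 'ppo', 10e6)
# Passing the returned window handle back as `win` updates the same plot on
# subsequent calls instead of opening a new one.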
4 | 5 | import glob 6 | import json 7 | import os 8 | 9 | import matplotlib 10 | matplotlib.use('Agg') 11 | import matplotlib.pyplot as plt 12 | plt.switch_backend('agg') 13 | import numpy as np 14 | from scipy.signal import medfilt 15 | matplotlib.rcParams.update({'font.size': 8}) 16 | 17 | 18 | def smooth_reward_curve(x, y): 19 | # Halfwidth of our smoothing convolution 20 | halfwidth = min(31, int(np.ceil(len(x) / 30))) 21 | k = halfwidth 22 | xsmoo = x[k:-k] 23 | ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='valid') / \ 24 | np.convolve(np.ones_like(y), np.ones(2 * k + 1), mode='valid') 25 | downsample = max(int(np.floor(len(xsmoo) / 1e3)), 1) 26 | return xsmoo[::downsample], ysmoo[::downsample] 27 | 28 | 29 | def fix_point(x, y, interval): 30 | np.insert(x, 0, 0) 31 | np.insert(y, 0, 0) 32 | 33 | fx, fy = [], [] 34 | pointer = 0 35 | 36 | ninterval = int(max(x) / interval + 1) 37 | 38 | for i in range(ninterval): 39 | tmpx = interval * i 40 | 41 | while pointer + 1 < len(x) and tmpx > x[pointer + 1]: 42 | pointer += 1 43 | 44 | if pointer + 1 < len(x): 45 | alpha = (y[pointer + 1] - y[pointer]) / \ 46 | (x[pointer + 1] - x[pointer]) 47 | tmpy = y[pointer] + alpha * (tmpx - x[pointer]) 48 | fx.append(tmpx) 49 | fy.append(tmpy) 50 | 51 | return fx, fy 52 | 53 | 54 | def load_data(indir, smooth, bin_size): 55 | datas = [] 56 | infiles = glob.glob(os.path.join(indir, '*.monitor.csv')) 57 | 58 | for inf in infiles: 59 | with open(inf, 'r') as f: 60 | f.readline() 61 | f.readline() 62 | for line in f: 63 | tmp = line.split(',') 64 | t_time = float(tmp[2]) 65 | tmp = [t_time, int(tmp[1]), float(tmp[0])] 66 | datas.append(tmp) 67 | 68 | datas = sorted(datas, key=lambda d_entry: d_entry[0]) 69 | result = [] 70 | timesteps = 0 71 | for i in range(len(datas)): 72 | result.append([timesteps, datas[i][-1]]) 73 | timesteps += datas[i][1] 74 | 75 | if len(result) < bin_size: 76 | return [None, None] 77 | 78 | x, y = np.array(result)[:, 0], np.array(result)[:, 1] 79 | 80 | if smooth == 1: 81 | x, y = smooth_reward_curve(x, y) 82 | 83 | if smooth == 2: 84 | y = medfilt(y, kernel_size=9) 85 | 86 | x, y = fix_point(x, y, bin_size) 87 | return [x, y] 88 | 89 | 90 | color_defaults = [ 91 | '#1f77b4', # muted blue 92 | '#ff7f0e', # safety orange 93 | '#2ca02c', # cooked asparagus green 94 | '#d62728', # brick red 95 | '#9467bd', # muted purple 96 | '#8c564b', # chestnut brown 97 | '#e377c2', # raspberry yogurt pink 98 | '#7f7f7f', # middle gray 99 | '#bcbd22', # curry yellow-green 100 | '#17becf' # blue-teal 101 | ] 102 | 103 | 104 | def visdom_plot(viz, win, folder, game, name, num_steps, bin_size=100, smooth=1): 105 | tx, ty = load_data(folder, smooth, bin_size) 106 | if tx is None or ty is None: 107 | return win 108 | 109 | fig = plt.figure() 110 | plt.plot(tx, ty, label="{}".format(name)) 111 | 112 | tick_fractions = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0]) 113 | ticks = tick_fractions * num_steps 114 | tick_names = ["{:.0e}".format(tick) for tick in ticks] 115 | plt.xticks(ticks, tick_names) 116 | plt.xlim(0, num_steps * 1.01) 117 | 118 | plt.xlabel('Number of Timesteps') 119 | plt.ylabel('Rewards') 120 | 121 | plt.title(game) 122 | plt.legend(loc=4) 123 | plt.show() 124 | plt.draw() 125 | 126 | image = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') 127 | image = image.reshape(fig.canvas.get_width_height()[::-1] + (3, )) 128 | plt.close(fig) 129 | 130 | # Show it in visdom 131 | image = np.transpose(image, (2, 0, 1)) 132 | return viz.image(image, win=win) 133 | 134 | 135 | if 
__name__ == "__main__": 136 | from visdom import Visdom 137 | viz = Visdom() 138 | visdom_plot(viz, None, '/tmp/gym/', 'BreakOut', 'a2c', 10e6, bin_size=100, smooth=1)  # num_steps is a required positional arg; 10e6 is an assumed frame budget 139 | --------------------------------------------------------------------------------