├── .gitignore
├── readme.md
├── requirements.txt
├── result
│   └── slbo.png
├── setup.py
└── slbo
    ├── __init__.py
    ├── algos
    │   ├── __init__.py
    │   ├── mbrl
    │   │   ├── __init__.py
    │   │   └── slbo.py
    │   └── mfrl
    │       ├── __init__.py
    │       ├── ppo.py
    │       └── trpo.py
    ├── configs
    │   ├── __init__.py
    │   ├── config.py
    │   ├── slbo_config.yaml
    │   └── trpo_config.yaml
    ├── envs
    │   ├── __init__.py
    │   ├── mujoco
    │   │   ├── __init__.py
    │   │   ├── gym
    │   │   │   ├── __init__.py
    │   │   │   ├── ant_env.py
    │   │   │   ├── half_cheetah_env.py
    │   │   │   ├── hopper_env.py
    │   │   │   ├── swimmer_env.py
    │   │   │   └── walker2d_env.py
    │   │   ├── mujoco_envs.py
    │   │   └── rllab
    │   │       ├── __init__.py
    │   │       ├── ant_env.py
    │   │       ├── half_cheetah_env.py
    │   │       ├── hopper_env.py
    │   │       ├── humanoid_env.py
    │   │       ├── mujoco_models
    │   │       │   ├── ant.xml
    │   │       │   ├── green_ball.xml
    │   │       │   ├── half_cheetah.xml
    │   │       │   ├── hill_ant_env.xml.mako
    │   │       │   ├── hill_half_cheetah_env.xml.mako
    │   │       │   ├── hill_hopper_env.xml.mako
    │   │       │   ├── hill_swimmer3d_env.xml.mako
    │   │       │   ├── hill_walker2d_env.xml.mako
    │   │       │   ├── hopper.xml
    │   │       │   ├── humanoid.xml
    │   │       │   ├── inverted_double_pendulum.xml
    │   │       │   ├── inverted_double_pendulum.xml.mako
    │   │       │   ├── point.xml
    │   │       │   ├── red_ball.xml
    │   │       │   ├── simple_humanoid.xml
    │   │       │   ├── swimmer.xml
    │   │       │   ├── swimmer3d.xml
    │   │       │   ├── utils.mako
    │   │       │   └── walker2d.xml
    │   │       ├── rllab_ant.xml
    │   │       ├── rllab_half_cheetah.xml
    │   │       ├── rllab_hopper.xml
    │   │       ├── rllab_simple_humanoid.xml
    │   │       ├── rllab_swimmer.xml
    │   │       ├── rllab_walker2d.xml
    │   │       ├── swimmer_env.py
    │   │       └── walker2d_env.py
    │   ├── virtual_env.py
    │   └── wrapped_envs.py
    ├── misc
    │   ├── __init__.py
    │   ├── distributions.py
    │   ├── logger.py
    │   ├── ou_noise.py
    │   ├── param.py
    │   └── utils.py
    ├── models
    │   ├── __init__.py
    │   ├── actor.py
    │   ├── actor_critic.py
    │   ├── actor_layer.py
    │   ├── critic.py
    │   ├── dynamics.py
    │   ├── initializer.py
    │   ├── normalizers.py
    │   └── utils.py
    ├── scripts
    │   ├── run_slbo.py
    │   └── run_trpo.py
    └── storages
        ├── __init__.py
        ├── off_policy_buffer.py
        └── on_policy_buffer.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /envs/mujoco/rllab/mujoco_models/
2 | /.idea
3 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # SLBO_PyTorch
2 | A PyTorch reimplementation of SLBO (Stochastic Lower Bound Optimization), a model-based reinforcement learning algorithm.
3 |
4 | # Dependencies
5 |
6 | Please refer to ./requirements.txt.
7 |
8 | # Usage
9 |
10 | `python ./scripts/run_slbo.py`
11 |
12 | Hyperparameters are specified in `./configs/slbo_config.yaml`.
13 |
14 | # Result
15 |
16 | ![SLBO result](./result/slbo.png)
17 |
18 | # Credits
19 | 1. [ikostrikov/pytorch-trpo](https://github.com/ikostrikov/pytorch-trpo)
20 | 2. [facebookresearch/slbo](https://github.com/facebookresearch/slbo)
21 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | atari-py==0.2.6
2 | attrs==19.3.0
3 | box2d-py==2.3.8
4 | cloudpickle==1.2.1
5 | colorama==0.4.3
6 | filelock==3.0.12
7 | gtimer==1.0.0b5
8 | gym~=0.17.2
9 | matplotlib==3.1.2
10 | more-itertools==8.0.2
11 | mpi4py==3.0.3
12 | mujoco-py==2.0.2.5
13 | multidict==4.7.5
14 | munch==2.5.0
15 | numpy~=1.18.5
16 | pybullet==2.6.1
17 | PyYAML~=5.3.1
18 | stable-baselines==2.10.0
19 | tensorboard==1.14.0
20 | tensorflow==1.14.0
21 | tensorflow-estimator==1.14.0
22 | termcolor==1.1.0
23 | torch~=1.6.0
24 | scipy~=1.5.1
25 | tqdm~=4.46.1
26 |
--------------------------------------------------------------------------------
/result/slbo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/result/slbo.png
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages
2 | from setuptools import setup
3 |
4 | setup(
5 | name='slbo_pytorch',
6 | author='Shengyi Jiang',
7 | author_email='shengyi.jiang@outlook.com',
8 | packages=find_packages(),
9 | package_data={},
10 | install_requires=[
11 | 'torch>=1.4.0',
12 | 'gym>=0.17.0',
13 | 'numpy',
14 | 'stable_baselines',
15 | 'pyglib',
16 | 'scipy',
17 | ])
18 |
--------------------------------------------------------------------------------
/slbo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/__init__.py
--------------------------------------------------------------------------------
/slbo/algos/__init__.py:
--------------------------------------------------------------------------------
1 | from slbo.algos.mbrl.slbo import SLBO
2 | from slbo.algos.mfrl import PPO, TRPO
3 |
--------------------------------------------------------------------------------
/slbo/algos/mbrl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/algos/mbrl/__init__.py
--------------------------------------------------------------------------------
/slbo/algos/mbrl/slbo.py:
--------------------------------------------------------------------------------
1 | from operator import itemgetter
2 | import torch
3 |
4 | from slbo.models.dynamics import Dynamics
5 | from slbo.models.normalizers import Normalizers
6 |
7 |
8 | class SLBO:
9 | def __init__(self, dynamics: Dynamics, normalizers: Normalizers, batch_size: int, num_updates: int,
10 | num_rollout_steps, l2_reg_coef, lr, max_grad_norm=2):
11 | self.dynamics = dynamics
12 | self.normalizers = normalizers
13 |
14 | self.num_updates = num_updates
15 | self.num_rollout_steps = num_rollout_steps
16 | self.batch_size = batch_size
17 | self.l2_reg_coef = l2_reg_coef
18 | self.max_grad_norm = max_grad_norm
19 |
20 | self.dynamics_optimizer = torch.optim.Adam(self.dynamics.parameters(), lr)
21 |
22 | def update(self, model_buffer) -> dict:
23 |
24 | gen = model_buffer.get_sequential_batch_generator(self.batch_size, self.num_rollout_steps)
25 |
26 | model_loss_epoch = 0.
27 | l2_loss_epoch = 0.
28 | for _ in range(self.num_updates):
29 | try:
30 | state_sequences, action_sequences, next_state_sequences, mask_sequences = \
31 | itemgetter(*['states', 'actions', 'next_states', 'masks'])(next(gen))
32 | except StopIteration:
33 | gen = model_buffer.get_sequential_batch_generator(self.batch_size, self.num_rollout_steps)
34 | state_sequences, action_sequences, next_state_sequences, mask_sequences = \
35 | itemgetter(*['states', 'actions', 'next_states', 'masks'])(next(gen))
36 |
37 | cur_states = state_sequences[:, 0]
38 | model_loss = 0.
39 |
40 | for i in range(self.num_rollout_steps):
41 | next_states = self.dynamics(cur_states, action_sequences[:, i])
42 | diffs = next_states - cur_states - next_state_sequences[:, i] + state_sequences[:, i]
43 | weighted_diffs = diffs / torch.clamp(self.normalizers.diff_normalizer.std, min=1e-6)
44 | model_loss += weighted_diffs.pow(2).mean(-1).sqrt()
45 |
46 | if i < self.num_rollout_steps - 1:
47 | cur_states = state_sequences[:, i + 1] + \
48 | mask_sequences[:, i] * (next_states - state_sequences[:, i + 1])
49 |
50 | model_loss = model_loss.mean() / self.num_rollout_steps
51 | params = self.dynamics.parameters()
52 | l2_loss = self.l2_reg_coef * torch.stack([torch.norm(t, p=2) for t in params]).sum()
53 |
54 | model_loss_epoch += model_loss.item()
55 | l2_loss_epoch += l2_loss.item()
56 |
57 | self.dynamics_optimizer.zero_grad()
58 | (model_loss + l2_loss).backward()
59 | torch.nn.utils.clip_grad_norm_(self.dynamics.parameters(), self.max_grad_norm)
60 | self.dynamics_optimizer.step()
61 |
62 | model_loss_epoch /= self.num_updates
63 | l2_loss_epoch /= self.num_updates
64 | return {'model_loss': model_loss_epoch, 'l2_loss': l2_loss_epoch}
64 |
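
For reference, the multi-step model loss that update minimizes can be written compactly. The following is a sketch in notation chosen here (H = num_rollout_steps, d = state dimension, sigma_Delta = normalizers.diff_normalizer.std, beta = l2_reg_coef), not a formula quoted from the SLBO paper:

$$
\mathcal{L}(\phi) = \frac{1}{H}\,\mathbb{E}\left[\sum_{i=0}^{H-1}
\sqrt{\frac{1}{d}\sum_{k=1}^{d}
\left(\frac{(\hat{s}_{i+1}^{(k)} - \hat{s}_i^{(k)}) - (s_{i+1}^{(k)} - s_i^{(k)})}{\sigma_\Delta^{(k)}}\right)^{2}}\,\right]
+ \beta \sum_{W \in \phi} \lVert W \rVert_2 ,
\qquad \hat{s}_0 = s_0,\ \ \hat{s}_{i+1} = f_\phi(\hat{s}_i, a_i).
$$

As in the masked update above, \(\hat{s}\) is reset to the real next state whenever a sampled transition crosses an episode boundary.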
--------------------------------------------------------------------------------
/slbo/algos/mfrl/__init__.py:
--------------------------------------------------------------------------------
1 | from slbo.algos.mfrl.ppo import PPO
2 | from slbo.algos.mfrl.trpo import TRPO
--------------------------------------------------------------------------------
/slbo/algos/mfrl/ppo.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.optim as optim
4 |
5 | from slbo.models.actor_critic import ActorCritic
6 |
7 |
8 | class PPO:
9 | def __init__(self, actor_critic: ActorCritic, clip_param: float, num_grad_updates: int, batch_size: int,
10 | value_loss_coef: float, entropy_coef: float, lr: float = None, max_grad_norm: float = None,
11 | use_clipped_value_loss=True, verbose=0):
12 | self.actor_critic = actor_critic
13 |
14 | self.clip_param = clip_param
15 | self.num_grad_updates = num_grad_updates
16 | self.batch_size = batch_size
17 | self.value_loss_coef = value_loss_coef
18 | self.entropy_coef = entropy_coef
19 |
20 | self.max_grad_norm = max_grad_norm
21 | self.use_clipped_value_loss = use_clipped_value_loss
22 |
23 | self.optimizer = optim.Adam(actor_critic.parameters(), lr=lr)
24 |
25 | self.verbose = verbose
26 |
27 | def update(self, policy_buffer) -> dict:
28 | advantage = policy_buffer.returns[:-1] - policy_buffer.values[:-1]
29 | advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5)
30 |
31 | value_loss_epoch = 0
32 | action_loss_epoch = 0
33 | dist_entropy_epoch = 0
34 |
35 | for _ in range(self.num_grad_updates):
36 |
37 | data_generator = policy_buffer.get_batch_generator(self.batch_size, advantage)
38 |
39 | for sample in data_generator:
40 | states, actions, value_preds, returns, old_action_log_probs, adv_targets = \
41 | sample['states'], sample['actions'], sample['values'], \
42 | sample['returns'], sample['action_log_probs'], sample['adv_targets']
43 |
44 | values, action_log_probs, dist_entropy = self.actor_critic.evaluate_action(states, actions)
45 |
46 | ratio = torch.exp(action_log_probs - old_action_log_probs)
47 | surr1 = ratio * adv_targets
48 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv_targets
49 |
50 | action_loss = -torch.min(surr1, surr2).mean()
51 |
52 | if self.use_clipped_value_loss:
53 | value_pred_clipped = value_preds + \
54 | (values - value_preds).clamp(-self.clip_param, self.clip_param)
55 | value_losses = (values - returns).pow(2)
56 | value_losses_clipped = (
57 | value_pred_clipped - returns).pow(2)
58 | value_loss = 0.5 * torch.max(value_losses,
59 | value_losses_clipped).mean()
60 | else:
61 | value_loss = 0.5 * (returns - values).pow(2).mean()
62 |
63 | self.optimizer.zero_grad()
64 | (value_loss * self.value_loss_coef + action_loss -
65 | dist_entropy * self.entropy_coef).backward()
66 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
67 | self.max_grad_norm)
68 | self.optimizer.step()
69 |
70 | value_loss_epoch += value_loss.item()
71 | action_loss_epoch += action_loss.item()
72 | dist_entropy_epoch += dist_entropy.item()
73 |
74 | num_updates = self.num_grad_updates * self.batch_size
75 |
76 | value_loss_epoch /= num_updates
77 | action_loss_epoch /= num_updates
78 | dist_entropy_epoch /= num_updates
79 |
80 | return {'value_loss': value_loss_epoch, 'action_loss': action_loss_epoch,
81 | 'dist_entropy': dist_entropy_epoch}
82 |
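
For reference, action_loss above is the standard PPO clipped surrogate (notation chosen here):

$$
L^{\mathrm{CLIP}}(\theta) = \hat{\mathbb{E}}_t\left[\min\big(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}(r_t(\theta),\,1-\epsilon,\,1+\epsilon)\,\hat{A}_t\big)\right],
\qquad r_t(\theta) = \exp\big(\log\pi_\theta(a_t \mid s_t) - \log\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)\big),
$$

and the quantity passed to backward() is \(c_v L^{VF} - L^{\mathrm{CLIP}} - c_e \mathcal{H}[\pi_\theta]\), where \(c_v\) is value_loss_coef, \(c_e\) is entropy_coef, and \(L^{VF}\) is the (optionally clipped) value regression loss.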
--------------------------------------------------------------------------------
/slbo/algos/mfrl/trpo.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.optimize
3 | import torch
4 |
5 | from slbo.models import Actor, VCritic
6 | from slbo.models.utils import get_flat_params, set_flat_params, get_flat_grad
7 | try:
8 | from slbo.misc import logger
9 | except ImportError:
10 | from stable_baselines import logger
11 |
12 |
13 | # noinspection DuplicatedCode
14 | class TRPO:
15 | def __init__(self, actor: Actor, critic: VCritic, max_kld=1e-2, l2_reg_coef=1e-3, damping=0.1,
16 | entropy_coef=0., line_search_accepted_ratio=0.1, verbose=0):
17 |
18 | self.actor = actor
19 | self.critic = critic
20 |
21 | self.max_kld = max_kld
22 | self.l2_reg = l2_reg_coef
23 | self.damping = damping
24 | self.linesearch_accepted_ratio = line_search_accepted_ratio
25 | self.entropy_coef = entropy_coef
26 |
27 | self.verbose = verbose
28 |
29 | @staticmethod
30 | def get_conjugate_gradient(Avp, b, nsteps, residual_tol=1e-10):
31 | x = torch.zeros_like(b)
32 | r = b.clone()
33 | p = b.clone()
34 | rdotr = torch.dot(r, r)
35 | for i in range(nsteps):
36 | _Avp = Avp(p)
37 | alpha = rdotr / torch.dot(p, _Avp)
38 | x += alpha * p
39 | r -= alpha * _Avp
40 | new_rdotr = torch.dot(r, r)
41 | beta = new_rdotr / rdotr
42 | p = r + beta * p
43 | rdotr = new_rdotr
44 | if rdotr < residual_tol:
45 | break
46 | return x
47 |
48 | def linesearch(self, f, init_params, fullstep, expected_improve_rate, max_backtracks=10):
49 | with torch.no_grad():
50 | fval = f()
51 | for (_n_backtracks, stepfrac) in enumerate(.5 ** np.arange(max_backtracks)):
52 | new_params = init_params + stepfrac * fullstep
53 | set_flat_params(self.actor, new_params)
54 | newfval = f()
55 | actual_improve = fval - newfval
56 | expected_improve = expected_improve_rate * stepfrac
57 | ratio = actual_improve / expected_improve
58 | if self.verbose > 0:
59 | logger.log("a/e/r ", actual_improve.item(), expected_improve.item(), ratio.item())
60 | if ratio.item() > self.linesearch_accepted_ratio and actual_improve.item() > 0:
61 | return True, new_params
62 | return False, init_params
63 |
64 | # noinspection DuplicatedCode
65 | def update_critic(self, states, targets):
66 | def get_value_loss(params):
67 | set_flat_params(self.critic, torch.tensor(params))
68 | for param in self.critic.parameters():
69 | if param.grad is not None:
70 | param.grad.data.fill_(0)
71 |
72 | values = self.critic(states)
73 | value_loss_ = (values - targets).pow(2).mean()
74 |
75 | loss = value_loss_
76 | for param in self.critic.parameters():
77 | loss += param.pow(2).sum() * self.l2_reg
78 | loss.backward()
79 | return loss.data.cpu().double().numpy(), get_flat_grad(self.critic).data.cpu().double().numpy()
80 |
81 | flat_params, value_loss, _ = scipy.optimize.fmin_l_bfgs_b(get_value_loss,
82 | get_flat_params(self.critic).cpu().double().numpy(),
83 | maxiter=25)
84 | set_flat_params(self.critic, torch.tensor(flat_params))
85 | return value_loss
86 |
87 | def update(self, policy_buffer) -> dict:
88 | advantages = policy_buffer.returns[:-1] - policy_buffer.values[:-1]
89 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
90 |
91 | data_generator = policy_buffer.get_batch_generator(advantages=advantages)
92 |
93 | value_loss_epoch = 0.
94 | action_loss_epoch = 0.
95 |
96 | for sample in data_generator:
97 | states, actions, returns, adv_targets = \
98 | sample['states'], sample['actions'], sample['returns'], sample['adv_targets']
99 |
100 | value_loss = self.update_critic(states, returns)
101 | fixed_log_prob = self.actor.evaluate_action(states, actions)[0].detach()
102 |
103 | def get_action_loss():
104 | log_prob, entropy = self.actor.evaluate_action(states, actions)
105 | action_loss_ = - adv_targets * torch.exp(log_prob - fixed_log_prob) - self.entropy_coef * entropy
106 | return action_loss_.mean()
107 |
108 | def get_kl():
109 | *_, action_means, action_logstds, action_stds = self.actor.act(states)
110 |
111 | fixed_action_means = action_means.detach()
112 | fixed_action_logstds = action_logstds.detach()
113 | fixed_action_stds = action_stds.detach()
114 | kl = action_logstds - fixed_action_logstds + \
115 | (fixed_action_stds.pow(2) + (fixed_action_means - action_means).pow(2)) / \
116 | (2.0 * action_stds.pow(2)) - 0.5
117 | return kl.sum(1, keepdim=True)
118 |
119 | action_loss = get_action_loss()
120 | action_loss_grad = torch.autograd.grad(action_loss, self.actor.parameters())
121 | flat_action_loss_grad = torch.cat([grad.view(-1) for grad in action_loss_grad]).data
122 |
123 | def Fvp(v):
124 | kl = get_kl()
125 | kl = kl.mean()
126 |
127 | kld_grad = torch.autograd.grad(kl, self.actor.parameters(), create_graph=True)
128 | flat_kld_grad = torch.cat([grad.view(-1) for grad in kld_grad])
129 |
130 | kl_v = (flat_kld_grad * v).sum()
131 | kld_grad_grad = torch.autograd.grad(kl_v, self.actor.parameters())
132 | flat_kld_grad_grad = torch.cat([grad.contiguous().view(-1) for grad in kld_grad_grad]).data
133 |
134 | return flat_kld_grad_grad + v * self.damping
135 |
136 | stepdir = self.get_conjugate_gradient(Fvp, -flat_action_loss_grad, 10)
137 |
138 | shs = 0.5 * (stepdir * Fvp(stepdir)).sum(0)
139 |
140 | lm = torch.sqrt(shs / self.max_kld)
141 | fullstep = stepdir / lm
142 |
143 | neggdotstepdir = (-flat_action_loss_grad * stepdir).sum(0, keepdim=True)
144 | if self.verbose > 0:
145 | logger.info(("lagrange multiplier:", lm, "grad_norm:", flat_action_loss_grad.norm()))
146 |
147 | prev_params = get_flat_params(self.actor)
148 | success, new_params = self.linesearch(get_action_loss, prev_params, fullstep, neggdotstepdir / lm)
149 | set_flat_params(self.actor, new_params)
150 |
151 | value_loss_epoch += value_loss
152 | action_loss_epoch += action_loss.item()
153 |
154 | return {'action_loss': action_loss_epoch, 'value_loss': value_loss_epoch}
155 |
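
A minimal sanity check of the conjugate-gradient helper above, assuming the package and its requirements are installed; the 2x2 system is made up for illustration:

import torch

from slbo.algos.mfrl.trpo import TRPO

# Solve A x = b for a small symmetric positive-definite A with the static
# helper (it assumes an SPD linear operator and starts from x0 = 0).
A = torch.tensor([[4., 1.],
                  [1., 3.]])
b = torch.tensor([1., 2.])
x = TRPO.get_conjugate_gradient(lambda v: A @ v, b, nsteps=10)
print(torch.allclose(A @ x, b, atol=1e-5))  # expected: True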
--------------------------------------------------------------------------------
/slbo/configs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/configs/__init__.py
--------------------------------------------------------------------------------
/slbo/configs/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import yaml
4 | from munch import DefaultMunch
5 | from yaml import Loader
6 | import collections.abc
7 |
8 | try:
9 | from slbo.misc import logger
10 | except ImportError:
11 | from stable_baselines import logger
12 |
13 |
14 | def flatten(d, parent_key='', sep='.'):
15 | items = []
16 | for k, v in d.items():
17 | new_key = parent_key + sep + k if parent_key else k
18 | if isinstance(v, collections.abc.MutableMapping):
19 | items.extend(flatten(v, new_key, sep=sep).items())
20 | else:
21 | items.append((new_key, str(v)))
22 | return dict(items)
23 |
24 |
25 | class Config:
26 | def __new__(cls, config_path='config.yaml'):
27 | if not config_path.startswith('/'):
28 | config_path = os.path.join(os.path.dirname(__file__), config_path)
29 | logger.info('Loading configs from {}.'.format(config_path))
30 | with open(config_path, 'r', encoding='utf-8') as f:
31 | config_dict = yaml.load(f, Loader=Loader)
32 | config = DefaultMunch.fromDict(config_dict, object())
33 | config_dict = flatten(config_dict)
34 | logged_config_dict = {}
35 | for key, value in config_dict.items():
36 | if key.find('.') >= 0:
37 | logged_config_dict[key] = value
38 | return config, logged_config_dict
39 |
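
A small usage sketch of the loader above, assuming the repo and its requirements are installed. Note that Config.__new__ returns a (config, logged_dict) pair rather than a Config instance:

from slbo.configs.config import Config

# Relative paths are resolved against slbo/configs, so this loads the
# slbo_config.yaml shipped with the repo.
config, logged_config = Config('slbo_config.yaml')

print(config.env.env_name)         # 'Hopper-v2' with the default config
print(logged_config['env.gamma'])  # '0.99' -- flatten() stringifies values and
                                   # keeps only nested (dotted) keys for logging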
--------------------------------------------------------------------------------
/slbo/configs/slbo_config.yaml:
--------------------------------------------------------------------------------
1 | mf_algo: 'trpo'
2 | proj_dir: '/home/liuxh/Documents/slbo'
3 | result_dir: './result'
4 | use_cuda: True
5 | seed: 0
6 | verbose: 0
7 | model_load_path: ~
8 | buffer_load_path: ~
9 | save_freq: 2
10 | eval_freq: 1
11 |
12 | env:
13 | env_name: 'Hopper-v2'
14 | num_real_envs: 1
15 | num_virtual_envs: 8
16 | gamma: 0.99
17 | max_episode_steps: 500
18 |
19 | ou_noise:
20 | theta: 0.15
21 | sigma: 0.3
22 |
23 | trpo:
24 | entropy_coef: 0.005
25 | max_kld: 0.01
26 | num_env_steps: 500 # 500 x 8 = 4000
27 | critic_hidden_dims: [64, 64]
28 | actor_hidden_dims: [32, 32]
29 | use_limited_ent_actor: True
30 | use_gae: True
31 | gae_lambda: 0.95
32 | use_proper_time_limits: True
33 | log_interval: 5
34 | l2_reg_coef: 0.
35 | norm_reward: False
36 |
37 | slbo:
38 | num_env_steps: 4000
39 | num_epochs: 100 # collect num_env_steps per epoch
40 | num_iters: 20 # number of iteration per epoch
41 | num_model_updates: 100 # number of model updates per iteration
42 | num_policy_updates: 40 # number of policy updates per iteration
43 | use_prev_data: True
44 | dynamics_hidden_dims: [500, 500]
45 | num_rollout_steps: 2
46 | batch_size: 128
47 | buffer_size: 200000
48 | lr: 0.001
49 | l2_reg_coef: 0.00001
50 | log_interval: 1
51 | start_strategy: 'reset' # choose from 'reset' and 'buffer'
52 |
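
A bookkeeping sketch of how the commented numbers above appear to fit together; the variable names below are illustrative only:

# trpo.num_env_steps * env.num_virtual_envs = 500 * 8 = 4000 virtual steps per policy update
virtual_steps_per_policy_update = 500 * 8
# slbo.num_env_steps real environment steps are collected once per epoch
real_steps_per_epoch = 4000
# over slbo.num_epochs epochs this totals 100 * 4000 = 400000 real environment steps
total_real_steps = 100 * real_steps_per_epoch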
--------------------------------------------------------------------------------
/slbo/configs/trpo_config.yaml:
--------------------------------------------------------------------------------
1 | mf_algo: 'trpo'
2 | proj_dir: '/home/polixir/jiangsy/slbo'
3 | result_dir: './result'
4 | use_cuda: False
5 | seed: 0
6 | verbose: 0
7 | model_load_path: ~
8 | buffer_load_path: ~
9 | save_interval: 10
10 | log_interval: 1
11 | eval_interval: 10
12 |
13 | env:
14 | env_name: 'Hopper-v2'
15 | num_envs: 1
16 | gamma: 0.99
17 |
18 | ou_noise:
19 | theta: 0.15
20 | sigma: 0.3
21 |
22 | trpo:
23 | total_env_steps: 2000000
24 | entropy_coef: 0.
25 | max_kld: 0.01
26 | num_env_steps: 2048
27 | critic_hidden_dims: [64, 64]
28 | actor_hidden_dims: [64, 64]
29 | use_gae: True
30 | gae_lambda: 0.95
31 | use_proper_time_limits: True
32 |
--------------------------------------------------------------------------------
/slbo/envs/__init__.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import gym
3 | import numpy as np
4 | import torch
5 | from stable_baselines import logger
6 |
7 | from slbo.storages.off_policy_buffer import OffPolicyBuffer
8 |
9 |
10 | class BaseBatchedEnv(gym.Env, abc.ABC):
11 | n_envs: int
12 |
13 | @abc.abstractmethod
14 | def step(self, actions):
15 | pass
16 |
17 | def reset(self):
18 | return self.partial_reset(range(self.n_envs))
19 |
20 | @abc.abstractmethod
21 | def partial_reset(self, indices):
22 | pass
23 |
24 | def set_state(self, state):
25 | logger.warn('`set_state` is not implemented')
26 |
27 |
28 | class BaseModelBasedEnv(gym.Env, abc.ABC):
29 | @abc.abstractmethod
30 | def mb_step(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray):
31 | raise NotImplementedError
32 |
33 | def verify(self, n=2000, eps=1e-4):
34 | buffer = OffPolicyBuffer(n, self.observation_space.shape, 1, self.action_space)
35 | state = self.reset()
36 | for _ in range(n):
37 | action = self.action_space.sample()
38 | next_state, reward, done, _ = self.step(action)
39 |
40 | mask = torch.tensor([0.0] if done else [1.0], dtype=torch.float32)
41 |
42 | buffer.insert(torch.tensor(state), torch.tensor(action), torch.tensor(reward),
43 | torch.tensor(next_state), torch.tensor(mask))
44 |
45 | state = next_state
46 | if done:
47 | state = self.reset()
48 |
49 | rewards_, dones_ = self.mb_step(buffer.states.numpy(), buffer.actions.numpy(), buffer.next_states.numpy())
50 | diff = (buffer.rewards.numpy() - rewards_[:, np.newaxis]) * buffer.masks.numpy()
51 | l_inf = np.abs(diff).max()
52 | logger.info('reward difference: {:.6f}'.format(l_inf))
53 |
54 | assert np.allclose(dones_, 1. - buffer.masks.numpy()), 'done model is inaccurate'
55 | assert l_inf < eps, 'reward model is inaccurate'
56 |
57 | def seed(self, seed: int = None):
58 | pass
59 |
60 |
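
To illustrate the mb_step contract above, here is a hypothetical toy environment (not part of the repo): rewards and done flags must be recoverable from batched (states, actions, next_states) arrays alone, so that they can be reused on model-generated rollouts.

import numpy as np

from slbo.envs import BaseModelBasedEnv


class ToyModelBasedEnv(BaseModelBasedEnv):
    """Hypothetical example: reward is forward progress minus a control penalty."""

    def mb_step(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray):
        rewards = (next_states[:, 0] - states[:, 0]) - 0.1 * np.square(actions).sum(-1)
        dones = np.zeros_like(rewards, dtype=bool)  # this toy task never terminates
        return rewards, dones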
--------------------------------------------------------------------------------
/slbo/envs/mujoco/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/envs/mujoco/__init__.py
--------------------------------------------------------------------------------
/slbo/envs/mujoco/gym/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/envs/mujoco/gym/__init__.py
--------------------------------------------------------------------------------
/slbo/envs/mujoco/gym/ant_env.py:
--------------------------------------------------------------------------------
1 | from gym.envs.mujoco import ant
2 | import numpy as np
3 | from stable_baselines import logger
4 |
5 | from slbo.envs import BaseModelBasedEnv
6 |
7 |
8 | # noinspection DuplicatedCode
9 | class AntEnv(ant.AntEnv, BaseModelBasedEnv):
10 | def __init__(self, use_approximated_vel=True):
11 | logger.warn('Modified Gym Envs!')
12 | self.rescale_action = False
13 | self.use_approximated_vel = use_approximated_vel
14 |
15 | ant.AntEnv.__init__(self)
16 | BaseModelBasedEnv.__init__(self)
17 |
18 | def get_body_xmat(self, body_name):
19 | return self.sim.data.get_body_xmat(body_name)
20 |
21 | def get_body_comvel(self, body_name):
22 | return self.sim.data.get_body_xvelp(body_name)
23 |
24 | def _get_obs(self):
25 | return np.concatenate([
26 | self.sim.data.qpos.flat, # 15
27 | self.sim.data.qvel.flat, # 14
28 | self.get_body_xmat("torso").flat, # 9
29 | self.get_body_com("torso"), # 9
30 | self.get_body_comvel("torso"), # 3
31 | ]).reshape(-1)
32 |
33 | def step(self, action):
34 | pre_pos = self.sim.data.qpos[0]
35 | self.do_simulation(action, self.frame_skip)
36 | post_pos = self.sim.data.qpos[0]
37 | if self.use_approximated_vel:
38 | fwd_reward = (post_pos - pre_pos) / self.dt
39 | else:
40 | fwd_reward = self.get_body_comvel('torso')[0]
41 | ctrl_reward = - .5 * np.square(action).sum()
42 | # make sure the reward can be recovered from state and action completely
43 | contact_reward = - 0.
44 | survive_reward = 1.0
45 | reward = fwd_reward + ctrl_reward + contact_reward + survive_reward
46 | state = self.state_vector()
47 | done = not(np.isfinite(state).all() and 0.2 <= state[2] <= 1.0)
48 | ob = self._get_obs()
49 | return ob, reward, done, {}
50 |
51 | def mb_step(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray):
52 | if self.use_approximated_vel:
53 | reward_forward = (next_states[:, 0] - states[:, 0]) / self.dt
54 | else:
55 | reward_forward = next_states[..., -3]
56 |
57 | ctrl_cost = .5 * np.square(actions).sum(-1)
58 | contact_cost = 0.
59 | survive_reward = 1.0
60 | reward = reward_forward - ctrl_cost - contact_cost + survive_reward
61 | notdone = np.all([next_states[..., 2] >= 0.2, next_states[..., 2] <= 1.0], axis=0)
62 | return reward, 1. - notdone
63 |
64 |
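
A quick consistency check, assuming MuJoCo and the pinned mujoco-py are installed: verify() from BaseModelBasedEnv rolls out random actions and asserts that mb_step reproduces the simulator's rewards and done flags.

from slbo.envs.mujoco.gym.ant_env import AntEnv

# Raises AssertionError if the analytic reward/done model in mb_step
# drifts from the real transitions beyond the default tolerance.
env = AntEnv()
env.verify(n=500)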
--------------------------------------------------------------------------------
/slbo/envs/mujoco/gym/half_cheetah_env.py:
--------------------------------------------------------------------------------
1 | from gym.envs.mujoco import half_cheetah
2 | import numpy as np
3 | from stable_baselines import logger
4 |
5 | from slbo.envs import BaseModelBasedEnv
6 |
7 |
8 | # noinspection DuplicatedCode
9 | class HalfCheetahEnv(half_cheetah.HalfCheetahEnv, BaseModelBasedEnv):
10 | def __init__(self, use_approximated_vel=True):
11 | self.use_approximated_vel = use_approximated_vel
12 | self.rescale_action = False
13 | if not self.use_approximated_vel:
14 | logger.warn('Modified Gym Env!')
15 |
16 | half_cheetah.HalfCheetahEnv.__init__(self)
17 | BaseModelBasedEnv.__init__(self)
18 |
19 | def get_body_comvel(self, body_name):
20 | return self.sim.data.get_body_xvelp(body_name)
21 |
22 | def _get_obs(self):
23 | return np.concatenate([
24 | self.sim.data.qpos.flat, # 9
25 | self.sim.data.qvel.flat, # 9
26 | self.get_body_com("torso").flat, # 3
27 | self.get_body_comvel("torso").flat, # 3
28 | ])
29 |
30 | def step(self, action: np.ndarray):
31 | pre_pos = self.sim.data.qpos[0]
32 | self.do_simulation(action, self.frame_skip)
33 | post_pos = self.sim.data.qpos[0]
34 | if self.use_approximated_vel:
35 | fwd_reward = (post_pos - pre_pos) / self.dt
36 | else:
37 | fwd_reward = self.get_body_comvel('torso')[0]
38 | ctrl_reward = - 0.1 * np.square(action).sum()
39 | reward = ctrl_reward + fwd_reward
40 | obs = self._get_obs()
41 | return obs, reward, False, {}
42 |
43 | def mb_step(self, states, actions, next_states):
44 | ctrl_rewards = - 0.1 * np.square(actions).sum(-1)
45 | if self.use_approximated_vel:
46 | fwd_rewards = (next_states[:, 0] - states[:, 0]) / self.dt
47 | else:
48 | fwd_rewards = next_states[:, 21]
49 | rewards = fwd_rewards + ctrl_rewards
50 | return rewards, np.zeros_like(rewards, dtype=np.bool)
51 |
--------------------------------------------------------------------------------
/slbo/envs/mujoco/gym/hopper_env.py:
--------------------------------------------------------------------------------
1 | from gym.envs.mujoco import hopper
2 | import numpy as np
3 | from stable_baselines import logger
4 |
5 | from slbo.envs import BaseModelBasedEnv
6 |
7 |
8 | # noinspection DuplicatedCode
9 | class HopperEnv(hopper.HopperEnv, BaseModelBasedEnv):
10 | def __init__(self, use_approximated_vel=True):
11 | self.use_approximated_vel = use_approximated_vel
12 | self.rescale_action = False
13 |
14 | if not self.use_approximated_vel:
15 | logger.warn('Modified Gym Env!')
16 | hopper.HopperEnv.__init__(self)
17 | BaseModelBasedEnv.__init__(self)
18 |
19 | def get_body_comvel(self, body_name):
20 | return self.sim.data.get_body_xvelp(body_name)
21 |
22 | def _get_obs(self):
23 | return np.concatenate([
24 | self.sim.data.qpos.flat, # 6
25 | self.sim.data.qvel.flat, # 6
26 | self.get_body_com("torso").flat, # 3
27 | self.get_body_comvel("torso").flat, # 3
28 | ])
29 |
30 | def step(self, action):
31 | pre_pos = self.sim.data.qpos[0]
32 | self.do_simulation(action, self.frame_skip)
33 | post_pos, height, ang = self.sim.data.qpos[0:3]
34 | if self.use_approximated_vel:
35 | fwd_reward = (post_pos - pre_pos) / self.dt
36 | else:
37 | fwd_reward = self.get_body_comvel('torso')[0]
38 | survive_reward = 1.0
39 | ctrl_reward = -1e-3 * np.square(action).sum()
40 | reward = fwd_reward + survive_reward + ctrl_reward
41 | s = self.state_vector()
42 | done = not (np.isfinite(s).all() and (np.abs(s[2:]) < 100).all() and
43 | (height > .7) and (abs(ang) < .2))
44 | ob = self._get_obs()
45 | return ob, reward, done, {}
46 |
47 | def mb_step(self, states, actions, next_states):
48 | if self.use_approximated_vel:
49 | fwd_reward = (next_states[:, 0] - states[:, 0]) / self.dt
50 | else:
51 | fwd_reward = next_states[:, -3]
52 |
53 | survive_reward = 1.0
54 | ctrl_reward = -1e-3 * np.square(actions).sum(-1)
55 |
56 | reward = fwd_reward + survive_reward + ctrl_reward
57 |
58 | done = ~((next_states[:, 2:12] < 100).all(axis=-1) &
59 | (next_states[:, 1] > 0.7) &
60 | (np.abs(next_states[:, 2]) < 0.2))
61 | return reward, done
62 |
63 |
64 |
--------------------------------------------------------------------------------
/slbo/envs/mujoco/gym/swimmer_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco import swimmer
3 | from stable_baselines import logger
4 |
5 | from slbo.envs import BaseModelBasedEnv
6 |
7 |
8 | # noinspection DuplicatedCode
9 | class SwimmerEnv(swimmer.SwimmerEnv, BaseModelBasedEnv):
10 | def __init__(self, use_approximated_vel=True):
11 | self.use_approximated_vel = use_approximated_vel
12 | self.rescale_action = False
13 |
14 | if not self.use_approximated_vel:
15 | logger.warn('Modified Gym Env!')
16 |
17 | swimmer.SwimmerEnv.__init__(self)
18 | BaseModelBasedEnv.__init__(self)
19 |
20 | def get_body_comvel(self, body_name):
21 | return self.sim.data.get_body_xvelp(body_name)
22 |
23 | def _get_obs(self):
24 | return np.concatenate([
25 | self.sim.data.qpos.flat, # 5
26 | self.sim.data.qvel.flat, # 5
27 | self.get_body_com("torso").flat, # 3
28 | self.get_body_comvel("torso").flat, # 3
29 | ]).reshape(-1)
30 |
31 | def step(self, action):
32 | pre_pos = self.sim.data.qpos[0]
33 | self.do_simulation(action, self.frame_skip)
34 | post_pos, height, ang = self.sim.data.qpos[0:3]
35 | if self.use_approximated_vel:
36 | fwd_reward = (post_pos - pre_pos) / self.dt
37 | else:
38 | fwd_reward = self.get_body_comvel('torso')[0]
39 | ctrl_reward = - 0.0001 * np.square(action).sum()
40 | reward = fwd_reward + ctrl_reward
41 | obs = self._get_obs()
42 | return obs, reward, False, {}
43 |
44 | def mb_step(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray):
45 | ctrl_reward = - 0.0001 * np.square(actions).sum(-1)
46 | fwd_reward = next_states[:, -3]
47 | reward = fwd_reward + ctrl_reward
48 | return reward, np.zeros_like(reward, dtype=bool)
49 |
--------------------------------------------------------------------------------
/slbo/envs/mujoco/gym/walker2d_env.py:
--------------------------------------------------------------------------------
1 | from gym.envs.mujoco import walker2d
2 | import numpy as np
3 | from stable_baselines import logger
4 |
5 | from slbo.envs import BaseModelBasedEnv
6 |
7 |
8 | # noinspection DuplicatedCode
9 | class Walker2DEnv(walker2d.Walker2dEnv, BaseModelBasedEnv):
10 | def __init__(self, use_approximated_vel=True):
11 | self.use_approximated_vel = use_approximated_vel
12 | self.rescale_action = False
13 |
14 | if not self.use_approximated_vel:
15 | logger.warn('Modified Gym Env!')
16 |
17 | walker2d.Walker2dEnv.__init__(self)
18 | BaseModelBasedEnv.__init__(self)
19 |
20 | def get_body_comvel(self, body_name):
21 | return self.sim.data.get_body_xvelp(body_name)
22 |
23 | def _get_obs(self):
24 | return np.concatenate([
25 | self.sim.data.qpos.flat,
26 | self.sim.data.qvel.flat,
27 | self.get_body_com("torso").flat,
28 | self.get_body_comvel("torso").flat
29 | ])
30 |
31 | def step(self, action):
32 | pre_pos = self.sim.data.qpos[0]
33 | self.do_simulation(action, self.frame_skip)
34 | post_pos, height, ang = self.sim.data.qpos[0:3]
35 | if self.use_approximated_vel:
36 | fwd_reward = (post_pos - pre_pos) / self.dt
37 | else:
38 | fwd_reward = self.get_body_comvel('torso')[0]
39 | survive_reward = 1.0
40 | ctrl_reward = - 1e-3 * np.square(action).sum()
41 | reward = fwd_reward + survive_reward + ctrl_reward
42 | done = not (0.8 < height < 2.0 and -1.0 < ang < 1.0)
43 | ob = self._get_obs()
44 | return ob, reward, done, {}
45 |
46 | def mb_step(self, states, actions, next_states):
47 | if self.use_approximated_vel:
48 | fwd_rewards = (next_states[:, 0] - states[:, 0]) / self.dt
49 | else:
50 | fwd_rewards = next_states[:, 21]
51 | survive_rewards = 1.0
52 | ctrl_rewards = - 1e-3 * np.square(actions).sum(-1)
53 | rewards = fwd_rewards + survive_rewards + ctrl_rewards
54 | dones = ~((next_states[:, 1] > 0.8) & (next_states[:, 1] < 2.0) &
55 | (next_states[:, 2] > -1.0) & (next_states[:, 2] < 1.0))
56 | return rewards, dones
57 |
--------------------------------------------------------------------------------
/slbo/envs/mujoco/mujoco_envs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from slbo.envs.mujoco.gym.ant_env import AntEnv
4 | from slbo.envs.mujoco.gym.half_cheetah_env import HalfCheetahEnv
5 | from slbo.envs.mujoco.gym.hopper_env import HopperEnv
6 | from slbo.envs.mujoco.gym.swimmer_env import SwimmerEnv
7 | from slbo.envs.mujoco.gym.walker2d_env import Walker2DEnv
8 | from slbo.envs.mujoco.rllab.ant_env import RLLabAntEnv
9 | from slbo.envs.mujoco.rllab.half_cheetah_env import RLLabHalfCheetahEnv
10 | from slbo.envs.mujoco.rllab.hopper_env import RLLabHopperEnv
11 | from slbo.envs.mujoco.rllab.humanoid_env import RLLabSimpleHumanoidEnv
12 | from slbo.envs.mujoco.rllab.swimmer_env import RLLabSwimmerEnv
13 | from slbo.envs.mujoco.rllab.walker2d_env import RLLabWalker2dEnv
14 | try:
15 | from slbo.misc import logger
16 | except ImportError:
17 | from stable_baselines import logger
18 |
19 |
20 | def make_mujoco_env(env_name: str):
21 | envs = {
22 | 'HalfCheetah-v2': HalfCheetahEnv,
23 | 'Walker2D-v2': Walker2DEnv,
24 | 'Ant-v2': AntEnv,
25 | 'Hopper-v2': HopperEnv,
26 | 'Swimmer-v2': SwimmerEnv,
27 | 'RLLabHalfCheetah-v2': RLLabHalfCheetahEnv,
28 | 'RLLabWalker2D-v2': RLLabWalker2dEnv,
29 | 'RLLabAnt-v2': RLLabAntEnv,
30 | 'RLLabHopper-v2': RLLabHopperEnv,
31 | 'RLLabSwimmer-v2': RLLabSwimmerEnv,
32 | 'RLLabHumanoid-v2': RLLabSimpleHumanoidEnv
33 | }
34 | env = envs[env_name]()
35 | if not hasattr(env, 'reward_range'):
36 | env.reward_range = (-np.inf, np.inf)
37 | if not hasattr(env, 'metadata'):
38 | env.metadata = {}
39 | env.seed(np.random.randint(2 ** 60))
40 | return env
41 |
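
A minimal rollout against one of the registered names above, assuming MuJoCo and the pinned mujoco-py are installed:

from slbo.envs.mujoco.mujoco_envs import make_mujoco_env

env = make_mujoco_env('Hopper-v2')
obs = env.reset()
for _ in range(10):
    obs, reward, done, _ = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
env.close()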
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/envs/mujoco/rllab/__init__.py
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/ant_env.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from slbo.envs import BaseModelBasedEnv
8 |
9 |
10 | class RLLabAntEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 | def __init__(self):
12 | self.rescale_action = True
13 |
14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_ant.xml'), 1)
15 | utils.EzPickle.__init__(self)
16 |
17 | def get_body_xmat(self, body_name):
18 | return self.sim.data.get_body_xmat(body_name)
19 |
20 | def get_body_comvel(self, body_name):
21 | return self.sim.data.get_body_xvelp(body_name)
22 |
23 | def _get_obs(self):
24 | return np.concatenate([
25 | self.sim.data.qpos.flat, # 15
26 | self.sim.data.qvel.flat, # 14
27 | self.get_body_xmat("torso").flat, # 9
28 | self.get_body_com("torso").flat, # 9 (should be 3?)
29 | self.get_body_comvel("torso").flat, # 3
30 | ]).reshape(-1)
31 |
32 | def step(self, action: np.ndarray):
33 | self.do_simulation(action, self.frame_skip)
34 | comvel = self.get_body_comvel("torso")
35 | fwd_reward = comvel[0]
36 | scaling = (self.action_space.high - self.action_space.low) * 0.5
37 | ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling))
38 | contact_cost = 0.
39 | survive_reward = 0.05
40 | reward = fwd_reward - ctrl_cost - contact_cost + survive_reward
41 | state = self.state_vector()
42 | done = not (np.isfinite(state).all() and 0.2 <= state[2] <= 1.0)
43 | obs = self._get_obs()
44 | return obs, float(reward), done, {}
45 |
46 | def mb_step(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray):
47 | comvel = next_states[..., -3:]
48 | fwd_reward = comvel[..., 0]
49 | scaling = (self.action_space.high - self.action_space.low) * 0.5
50 | ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(actions / scaling), axis=-1)
51 | contact_cost = 0.
52 | survive_reward = 0.05
53 | reward = fwd_reward - ctrl_cost - contact_cost + survive_reward
54 | notdone = np.all([next_states[..., 2] >= 0.2, next_states[..., 2] <= 1.0], axis=0)
55 | return reward, 1. - notdone
56 |
57 | def reset_model(self):
58 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01
59 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1
60 | self.set_state(qpos, qvel)
61 | return self._get_obs()
62 |
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/half_cheetah_env.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from slbo.envs import BaseModelBasedEnv
8 |
9 |
10 | class RLLabHalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 | def __init__(self):
12 | self.rescale_action = True
13 |
14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_half_cheetah.xml'), 1)
15 | utils.EzPickle.__init__(self)
16 |
17 | def get_body_xmat(self, body_name):
18 | return self.sim.data.get_body_xmat(body_name)
19 |
20 | def get_body_comvel(self, body_name):
21 | return self.sim.data.get_body_xvelp(body_name)
22 |
23 | def _get_obs(self):
24 | return np.concatenate([
25 | self.sim.data.qpos.flat, # 9
26 | self.sim.data.qvel.flat, # 9
27 | self.get_body_com("torso").flat, # 3
28 | self.get_body_comvel("torso").flat, # 3
29 | ])
30 |
31 | def step(self, action: np.ndarray):
32 | self.do_simulation(action, self.frame_skip)
33 | action = np.clip(action, self.action_space.low, self.action_space.high)
34 | fwd_reward = self.get_body_comvel("torso")[0]
35 | ctrl_reward = - 0.05 * np.sum(np.square(action))
36 | reward = ctrl_reward + fwd_reward
37 | obs = self._get_obs()
38 | return obs, reward, False, {}
39 |
40 | def mb_step(self, states, actions, next_states):
41 | actions = np.clip(actions, self.action_space.low, self.action_space.high)
42 | ctrl_rewards = - 0.05 * np.sum(np.square(actions), axis=-1)
43 | fwd_rewards = next_states[..., 21]
44 | rewards = fwd_rewards + ctrl_rewards
45 | return rewards, np.zeros_like(fwd_rewards, dtype=np.bool)
46 |
47 | def reset_model(self):
48 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01
49 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1
50 | self.set_state(qpos, qvel)
51 | return self._get_obs()
52 |
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/hopper_env.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from slbo.envs import BaseModelBasedEnv
8 |
9 |
10 | class RLLabHopperEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 | def __init__(self):
12 | self.rescale_action = True
13 |
14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_hopper.xml'), 1)
15 | utils.EzPickle.__init__(self)
16 |
17 | def get_body_comvel(self, body_name):
18 | return self.sim.data.get_body_xvelp(body_name)
19 |
20 | def _get_obs(self):
21 | return np.concatenate([
22 | self.sim.data.qpos.flat, # 6
23 | self.sim.data.qvel.flat, # 6
24 | self.get_body_com("torso").flat, # 3
25 | self.get_body_comvel("torso"), # 3
26 | ])
27 |
28 | def step(self, action: np.ndarray):
29 | self.do_simulation(action, self.frame_skip)
30 | scaling = 0.5 * (self.action_space.high - self.action_space.low)
31 | vel = self.get_body_comvel("torso")[0]
32 | alive_bonus = 1.0
33 | reward = vel + alive_bonus - 0.005 * np.sum(np.square(action / scaling))
34 | # FIXME
35 | state = self.state_vector()
36 | done = not (np.isfinite(state).all() and
37 | (np.abs(state[3:]) < 100).all() and (state[0] > .7) and
38 | (abs(state[2]) < .2))
39 | obs = self._get_obs()
40 | return obs, reward, done, {}
41 |
42 | def mb_step(self, states, actions, next_states):
43 | scaling = (self.action_space.high - self.action_space.low) * 0.5
44 | vel = next_states[:, -3]
45 | alive_bonus = 1.0
46 | reward = vel + alive_bonus - 0.005 * np.sum(np.square(actions / scaling), axis=-1)
47 |
48 | done = ~((next_states[:, 3:12] < 100).all(axis=-1) &
49 | (next_states[:, 0] > 0.7) &
50 | (np.abs(next_states[:, 2]) < 0.2))
51 | return reward, done
52 |
53 | def reset_model(self):
54 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01
55 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1
56 | self.set_state(qpos, qvel)
57 | return self._get_obs()
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/humanoid_env.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import gym.utils as utils
4 | import numpy as np
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from slbo.envs import BaseModelBasedEnv
8 |
9 |
10 | class RLLabSimpleHumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 | def __init__(self):
12 | self.rescale_action = True
13 |
14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_simple_humanoid.xml'), 1)
15 | utils.EzPickle.__init__(self)
16 |
17 | def get_body_xmat(self, body_name):
18 | return self.sim.data.get_body_xmat(body_name)
19 |
20 | def get_body_comvel(self, body_name):
21 | return self.sim.data.get_body_xvelp(body_name)
22 |
23 | def _get_obs(self):
24 | data = self.sim.data
25 | return np.concatenate([
26 | data.qpos.flat, # 17
27 | data.qvel.flat, # 16
28 | self.get_body_com("torso").flat, # 3
29 | self.get_body_comvel("torso").flat, # 3
30 | ])
31 |
32 | def step(self, actions: np.ndarray):
33 | alive_bonus = 0.2
34 | comvel = self.get_body_comvel("torso")
35 | lin_vel_reward = comvel[0]
36 | scaling = 0.5 * (self.action_space.high - self.action_space.low)
37 | ctrl_cost = 5e-4 * np.sum(np.square(actions / scaling))
38 | impact_cost = 0.
39 | vel_deviation_cost = 5e-3 * np.sum(np.square(comvel[1:]))
40 | reward = lin_vel_reward + alive_bonus - ctrl_cost - impact_cost - vel_deviation_cost
41 | done = not (0.8 <= self.sim.data.qpos.flat[2] <= 2.0)
42 | next_obs = self._get_obs()
43 | return next_obs, reward, done, {}
44 |
45 | def mb_step(self, states, actions, next_states):
46 | scaling = 0.5 * (self.action_space.high - self.action_space.low)
47 |
48 | alive_bonus = 0.2
49 | lin_vel_reward = next_states[:, 36]
50 | ctrl_cost = 5.e-4 * np.square(actions / scaling).sum(axis=1)
51 | impact_cost = 0.
52 | vel_deviation_cost = 5.e-3 * np.square(next_states[:, 37:39]).sum(axis=1)
53 | reward = lin_vel_reward + alive_bonus - ctrl_cost - impact_cost - vel_deviation_cost
54 |
55 | dones = ~((next_states[:, 2] >= 0.8) & (next_states[:, 2] <= 2.0))
56 | return reward, dones
57 |
58 | def reset_model(self):
59 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01
60 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1
61 | self.set_state(qpos, qvel)
62 | return self._get_obs()
63 |
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/ant.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/green_ball.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/half_cheetah.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/hill_ant_env.xml.mako:
--------------------------------------------------------------------------------
1 | <%
2 | difficulty = opts.get("difficulty", 1.0)
3 | texturedir = opts.get("texturedir", "/tmp/mujoco_textures")
4 | hfield_file = opts.get("hfield_file", "/tmp/mujoco_terrains/hills.png")
5 | %>
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/hill_half_cheetah_env.xml.mako:
--------------------------------------------------------------------------------
35 | <%
36 | difficulty = opts.get("difficulty", 1.0)
37 | texturedir = opts.get("texturedir", "/tmp/mujoco_textures")
38 | hfield_file = opts.get("hfield_file", "/tmp/mujoco_terrains/hills.png")
39 | %>
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/hill_hopper_env.xml.mako:
--------------------------------------------------------------------------------
1 | <%
2 | difficulty = opts.get("difficulty", 1.0)
3 | texturedir = opts.get("texturedir", "/tmp/mujoco_textures")
4 | hfield_file = opts.get("hfield_file", "/tmp/mujoco_terrains/hills.png")
5 | %>
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/hill_swimmer3d_env.xml.mako:
--------------------------------------------------------------------------------
1 | <%
2 | difficulty = opts.get("difficulty", 1.0)
3 | texturedir = opts.get("texturedir", "/tmp/mujoco_textures")
4 | hfield_file = opts.get("hfield_file", "/tmp/mujoco_terrains/hills.png")
5 | %>
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/hill_walker2d_env.xml.mako:
--------------------------------------------------------------------------------
1 | <%
2 | difficulty = opts.get("difficulty", 1.0)
3 | texturedir = opts.get("texturedir", "/tmp/mujoco_textures")
4 | hfield_file = opts.get("hfield_file", "/tmp/mujoco_terrains/hills.png")
5 | %>
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/hopper.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/humanoid.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/inverted_double_pendulum.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/inverted_double_pendulum.xml.mako:
--------------------------------------------------------------------------------
16 | <%
17 | noise = opts.get("noise", False)
18 | pole1_height = 0.6
19 | pole2_height = 0.6
20 | if noise:
21 | import numpy as np
22 | pole1_height = pole1_height + np.random.uniform(-0.1, 0.4)
23 | pole2_height = pole2_height + np.random.uniform(-0.1, 0.4)
24 | %>
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/point.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/red_ball.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/simple_humanoid.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/swimmer.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/swimmer3d.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/utils.mako:
--------------------------------------------------------------------------------
1 | <%def name="make_maze(structure, height, size_scaling)">
2 | % for i in xrange(len(structure)):
3 | % for j in xrange(len(structure[0])):
4 | % if str(structure[i][j]) == '1':
5 |
15 | % endif
16 | % endfor
17 | % endfor
18 | %def>
19 |
20 | <%def name="make_contacts(geom_name, structure)">
21 | % for i in xrange(len(structure)):
22 | % for j in xrange(len(structure[0])):
23 | % if str(structure[i][j]) == '1':
24 |
28 | % endif
29 | % endfor
30 | % endfor
31 | %def>
32 |
33 | <%def name="find_goal_range(structure, size_scaling)">
34 | <%
35 | found = False
36 | goal_range = []
37 | for i in xrange(len(structure)):
38 | for j in xrange(len(structure[0])):
39 | if structure[i][j] == 'g':
40 | goal_range.append(j*size_scaling-size_scaling*0.5),
41 | goal_range.append(j*size_scaling+size_scaling*0.5),
42 | goal_range.append(i*size_scaling-size_scaling*0.5),
43 | goal_range.append(i*size_scaling+size_scaling*0.5),
44 | found = True
45 | break
46 | if found:
47 | break
48 | %>
49 |
50 | %def>
51 |
52 | <%def name="find_robot(structure, size_scaling, z_offset=0)">
53 | <%
54 | robot_pos = [0, 0, z_offset]
55 | found = False
56 | for i in xrange(len(structure)):
57 | for j in xrange(len(structure[0])):
58 | if structure[i][j] == 'r':
59 | robot_pos[0] = j*size_scaling
60 | robot_pos[1] = i*size_scaling
61 | found = True
62 | break
63 | if found:
64 | break
65 | %>
66 | ${' '.join(map(str, robot_pos))}
67 | %def>
68 |
69 | <%def name="encode_map(structure, size_scaling)">
70 | <%
71 | data = []
72 | data.append(len(structure))
73 | data.append(len(structure[0]))
74 | data.append(size_scaling)
75 | for i in xrange(len(structure)):
76 | for j in xrange(len(structure[0])):
77 | if structure[i][j] == 1:
78 | data.append(1)
79 | elif structure[i][j] == 'g':
80 | data.append(2)
81 | else:
82 | data.append(0)
83 | %>
84 | ${' '.join(map(str, data))}
85 | %def>
86 |
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/mujoco_models/walker2d.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/rllab_ant.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/rllab_half_cheetah.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/rllab_hopper.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/rllab_simple_humanoid.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/rllab_swimmer.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/rllab_walker2d.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/swimmer_env.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import gym.utils as utils
4 | import numpy as np
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from slbo.envs import BaseModelBasedEnv
8 |
9 |
10 | class RLLabSwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 | def __init__(self):
12 | self.rescale_action = True
13 |
14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_swimmer.xml'), 50)
15 | utils.EzPickle.__init__(self)
16 |
17 | def get_body_xmat(self, body_name):
18 | return self.sim.data.get_body_xmat(body_name)
19 |
20 | def get_body_comvel(self, body_name):
21 | return self.sim.data.get_body_xvelp(body_name)
22 |
23 | def _get_obs(self):
24 | return np.concatenate([
25 | self.sim.data.qpos.flat, # 5
26 | self.sim.data.qvel.flat, # 5
27 | self.get_body_com("torso").flat, # 3
28 | self.get_body_comvel("torso").flat, # 3
29 | ]).reshape(-1)
30 |
31 | def step(self, action: np.ndarray):
32 | self.do_simulation(action, self.frame_skip)
33 | scaling = 0.5 * (self.action_space.high - self.action_space.low)
34 | ctrl_cost = 0.005 * np.sum(np.square(action / scaling))
35 | fwd_reward = self.get_body_comvel("torso")[0]
36 | reward = fwd_reward - ctrl_cost
37 | obs = self._get_obs()
38 | return obs, reward, False, {}
39 |
40 | def mb_step(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray):
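        # The model-based reward uses the predicted next state directly: the torso's
        # forward (x) center-of-mass velocity is the third-to-last observation entry
        # (see _get_obs above).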
41 | scaling = 0.5 * (self.action_space.high - self.action_space.low)
42 | ctrl_cost = 0.005 * np.sum(np.square(actions / scaling), axis=-1)
43 | fwd_reward = next_states[:, -3]
44 | reward = fwd_reward - ctrl_cost
45 | return reward, np.zeros_like(reward, dtype=np.bool)
46 |
47 | def reset_model(self):
48 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01
49 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1
50 | self.set_state(qpos, qvel)
51 | return self._get_obs()
52 |
--------------------------------------------------------------------------------
/slbo/envs/mujoco/rllab/walker2d_env.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import gym.utils as utils
4 | import numpy as np
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from slbo.envs import BaseModelBasedEnv
8 |
9 |
10 | class RLLabWalker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 | def __init__(self):
12 | self.rescale_action = True
13 |
14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_walker2d.xml'), 1)
15 | utils.EzPickle.__init__(self)
16 |
17 | def get_body_xmat(self, body_name):
18 | return self.sim.data.get_body_xmat(body_name)
19 |
20 | def get_body_comvel(self, body_name):
21 | return self.sim.data.get_body_xvelp(body_name)
22 |
23 | def _get_obs(self):
24 | return np.concatenate([
25 | self.sim.data.qpos.flat,
26 | self.sim.data.qvel.flat,
27 | self.get_body_com("torso").flat,
28 | self.get_body_comvel("torso").flat
29 | ])
30 |
31 | def step(self, action: np.ndarray):
32 | self.do_simulation(action, self.frame_skip)
33 | fwd_reward = self.get_body_comvel("torso")[0]
34 | scaling = 0.5 * (self.action_space.high - self.action_space.low)
35 | ctrl_cost = 1e-3 * np.sum(np.square(action / scaling))
36 | alive_bonus = 1.
37 | reward = fwd_reward - ctrl_cost + alive_bonus
38 | qpos = self.sim.data.qpos
39 | done = not (0.8 < qpos[0] < 2.0 and -1.0 < qpos[2] < 1.0)
40 | obs = self._get_obs()
41 | return obs, reward, done, {}
42 |
43 | def mb_step(self, states, actions, next_states):
44 | scaling = 0.5 * (self.action_space.high - self.action_space.low)
45 | reward_ctrl = -0.001 * np.sum(np.square(actions / scaling), axis=-1)
46 | reward_fwd = next_states[:, 21]
47 | alive_bonus = 1.
48 | rewards = reward_ctrl + reward_fwd + alive_bonus
49 |         dones = ~((0.8 < next_states[:, 0]) & (next_states[:, 0] < 2.0) & (-1.0 < next_states[:, 2]) & (next_states[:, 2] < 1.0))  # vectorized over the batch
50 | return rewards, dones
51 |
52 | def reset_model(self):
53 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01
54 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1
55 | self.set_state(qpos, qvel)
56 | return self._get_obs()
57 |
--------------------------------------------------------------------------------
/slbo/envs/virtual_env.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from stable_baselines.common.vec_env.base_vec_env import VecEnv
4 | import torch
5 |
6 | from slbo.envs import BaseModelBasedEnv
7 | from slbo.models.dynamics import Dynamics
8 |
9 |
10 | class VirtualEnv(gym.Env):
11 | def __init__(self, dynamics: Dynamics, env: BaseModelBasedEnv, seed):
12 | super().__init__()
13 | self.observation_space = env.observation_space
14 | self.action_space = env.action_space
15 |
16 | self.state_dim = self.observation_space.shape[0]
17 | self.action_dim = self.action_space.shape[0]
18 |
19 | self.dynamics = dynamics
20 | self.device = next(self.dynamics.parameters()).device
21 | self.env = env
22 | self.env.seed(seed)
23 |
24 | self.state = np.zeros([self.observation_space.shape[0]], dtype=np.float32)
25 |
26 | def _rescale_action(self, action):
27 | lo, hi = self.action_space.low, self.action_space.high
28 | return lo + (action + 1.) * 0.5 * (hi - lo)
29 |
30 |     def step(self, action: np.ndarray):
31 | states = self.state.reshape([1, self.state_dim])
32 | actions = action.reshape([1, self.action_dim])
33 | rescaled_actions = self._rescale_action(action).reshape([1, self.action_dim])
34 | with torch.no_grad():
35 | next_states = self.dynamics(torch.tensor(states, device=self.device, dtype=torch.float32),
36 | torch.tensor(actions, device=self.device, dtype=torch.float32)).cpu().numpy()
37 | rewards, dones = self.env.mb_step(states, rescaled_actions, next_states)
38 | reward, done = rewards[0], dones[0]
39 | self.state = next_states[0]
40 | return self.state.copy(), reward.copy(), done.copy(), {}
41 |
42 | def reset(self) -> np.ndarray:
43 | self.state = self.env.reset()
44 | return self.state.copy()
45 |
46 | def set_state(self, state: np.ndarray):
47 | self.state = state.copy()
48 |
49 | def render(self, mode='human'):
50 |         raise NotImplementedError
51 |
52 |
53 | class VecVirtualEnv(VecEnv):
54 | def __init__(self, dynamics: Dynamics, env: BaseModelBasedEnv, num_envs, seed, max_episode_steps=1000,
55 | auto_reset=True):
56 | super(VecEnv, self).__init__()
57 | self.observation_space = env.observation_space
58 | self.action_space = env.action_space
59 |
60 | self.state_dim = self.observation_space.shape[0]
61 | self.action_dim = self.action_space.shape[0]
62 | self.num_envs = num_envs
63 | self.max_episode_steps = max_episode_steps
64 | self.auto_reset = auto_reset
65 |
66 | self.dynamics = dynamics
67 | self.device = next(self.dynamics.parameters()).device
68 | self.env = env
69 | self.env.seed(seed)
70 |
71 | self.elapsed_steps = np.zeros([self.num_envs], dtype=np.int32)
72 | self.episode_rewards = np.zeros([self.num_envs])
73 |
74 | self.states = np.zeros([self.num_envs, self.observation_space.shape[0]], dtype=np.float32)
75 |
76 | def _rescale_action(self, actions: np.array):
77 | lo, hi = self.action_space.low, self.action_space.high
78 | return lo + (actions + 1.) * 0.5 * (hi - lo)
79 |
80 | def step_async(self, actions):
81 | self.actions = actions
82 |
83 | def step_wait(self):
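        # The learned dynamics model predicts next states, while the real environment's
        # mb_step supplies rewards and termination flags; episodes are additionally
        # truncated once max_episode_steps is reached.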
84 | rescaled_actions = self._rescale_action(self.actions)
85 | self.elapsed_steps += 1
86 | with torch.no_grad():
87 | next_states = self.dynamics(torch.tensor(self.states, device=self.device, dtype=torch.float32),
88 | torch.tensor(self.actions, device=self.device, dtype=torch.float32)).cpu().numpy()
89 | rewards, dones = self.env.mb_step(self.states, rescaled_actions, next_states)
90 | self.episode_rewards += rewards
91 | self.states = next_states.copy()
92 | timeouts = self.elapsed_steps == self.max_episode_steps
93 | dones |= timeouts
94 | info_dicts = [{} for _ in range(self.num_envs)]
95 | for i, (done, timeout) in enumerate(zip(dones, timeouts)):
96 | if done:
97 | info = {'episode': {'r': self.episode_rewards[i], 'l': self.elapsed_steps[i]}}
98 | if timeout:
99 | info.update({'TimeLimit.truncated': True})
100 | info_dicts[i] = info
101 | else:
102 | info_dicts[i] = {}
103 | if self.auto_reset:
104 | self.reset(np.argwhere(dones).squeeze(axis=-1))
105 | return self.states.copy(), rewards.copy(), dones.copy(), info_dicts
106 |
107 | # if indices = None, every env will be reset
108 | def reset(self, indices=None) -> np.ndarray:
109 | # have to distinguish [] and None
110 | indices = np.arange(self.num_envs) if indices is None else indices
111 | if np.size(indices) == 0:
112 | return np.array([])
113 | states = np.array([self.env.reset() for _ in indices])
114 | self.states[indices] = states
115 | self.elapsed_steps[indices] = 0
116 | self.episode_rewards[indices] = 0.
117 | return states.copy()
118 |
119 | # if indices = None, every env will be set
120 | def set_state(self, states: np.ndarray, indices=None):
121 |         indices = np.arange(self.num_envs) if indices is None else indices
122 | assert states.ndim == 2 and states.shape[0] == indices.shape[0]
123 | self.states[indices] = states.copy()
124 | # set_state should reset reward and length
125 | self.elapsed_steps[indices] = 0
126 | self.episode_rewards[indices] = 0.
127 |
128 | def close(self):
129 | pass
130 |
131 | def seed(self, seed):
132 | return self.env.seed(seed)
133 |
134 |     def render(self, mode='human'):
135 |         raise NotImplementedError
136 |
137 |     def set_attr(self, attr_name, value, indices=None):
138 |         raise NotImplementedError
139 |
140 |     def get_attr(self, attr_name, indices=None):
141 |         raise NotImplementedError
142 |
143 |     def env_method(self, method_name, *method_args, indices=None, **method_kwargs):
144 |         raise NotImplementedError
145 |
146 |
--------------------------------------------------------------------------------
/slbo/envs/wrapped_envs.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Optional
3 |
4 | import gym
5 | from gym.wrappers import TimeLimit
6 | import torch
7 | from stable_baselines import bench
8 | from stable_baselines.common.vec_env import VecEnvWrapper
9 | from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
10 | from stable_baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
11 | from stable_baselines.common.vec_env.vec_normalize import VecNormalize
12 |
13 | from slbo.envs.mujoco.mujoco_envs import make_mujoco_env
14 | from slbo.envs.virtual_env import VirtualEnv, VecVirtualEnv
15 | from slbo.models.dynamics import Dynamics
16 |
17 |
18 | def make_env(env_id, seed, rank, log_dir, allow_early_resets, max_episode_steps, test=True):
19 | def _thunk():
20 | if test:
21 | env = gym.make(env_id)
22 | else:
23 | env = make_mujoco_env(env_id)
24 | env = TimeLimit(env, max_episode_steps)
25 |
26 | env.seed(seed + rank)
27 | log_dir_ = os.path.join(log_dir, str(rank)) if log_dir is not None else log_dir
28 | env = bench.Monitor(env, log_dir_, allow_early_resets=allow_early_resets)
29 |
30 | return env
31 |
32 | return _thunk
33 |
34 |
35 | def make_vec_envs(env_name: str,
36 | seed: int,
37 | num_envs: int,
38 | gamma: float,
39 | log_dir: Optional[str],
40 | device: torch.device,
41 | allow_early_resets: bool,
42 | max_episode_steps: int = 1000,
43 | norm_reward=True,
44 | norm_obs=True,
45 | test=False,
46 | ):
47 | envs = [
48 | make_env(env_name, seed, i, log_dir, allow_early_resets, max_episode_steps, test)
49 | for i in range(num_envs)
50 | ]
51 |
52 | if len(envs) > 1:
53 | envs = SubprocVecEnv(envs)
54 | else:
55 | envs = DummyVecEnv(envs)
56 |
57 | if len(envs.observation_space.shape) == 1:
58 | if gamma is None:
59 | envs = VecNormalize(envs, norm_reward=False, norm_obs=norm_obs)
60 | else:
61 | envs = VecNormalize(envs, gamma=gamma, norm_reward=norm_reward, norm_obs=norm_obs)
62 |
63 | envs = VecPyTorch(envs, device)
64 |
65 | return envs
66 |
67 |
68 | def make_vec_virtual_envs(env_name: str,
69 | dynamics: Dynamics,
70 | seed: int,
71 | num_envs: int,
72 | gamma: Optional[float],
73 | device: torch.device,
74 | allow_early_resets: bool,
75 | max_episode_steps: int = 1000,
76 | norm_reward=False,
77 | norm_obs=False,
78 | ):
79 | envs = VecVirtualEnv(dynamics, make_mujoco_env(env_name), num_envs, seed, max_episode_steps)
80 |
81 | if len(envs.observation_space.shape) == 1 and (norm_reward or norm_obs):
82 | if gamma is None:
83 | envs = VecNormalize(envs, norm_reward=False, norm_obs=norm_obs)
84 | else:
85 | envs = VecNormalize(envs, gamma=gamma, norm_reward=norm_reward, norm_obs=norm_obs)
86 |
87 | envs = VecPyTorch(envs, device)
88 |
89 | return envs
90 |
91 |
92 | class VecPyTorch(VecEnvWrapper):
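    # Thin wrapper that converts numpy observations/rewards from the wrapped VecEnv
    # into torch tensors on the configured device (and actions back to numpy).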
93 | def __init__(self, venv, device):
94 | super(VecPyTorch, self).__init__(venv)
95 | self.device = device
96 |
97 | def reset(self):
98 | obs = self.venv.reset()
99 | obs = torch.from_numpy(obs).float().to(self.device)
100 | return obs
101 |
102 | def step_async(self, actions):
103 | if isinstance(actions, torch.LongTensor):
104 | actions = actions.squeeze(1)
105 | actions = actions.cpu().numpy()
106 | self.venv.step_async(actions)
107 |
108 | def step_wait(self):
109 | obs, reward, done, info = self.venv.step_wait()
110 | obs = torch.from_numpy(obs).float().to(self.device)
111 | reward = torch.from_numpy(reward).unsqueeze(dim=1).float()
112 | return obs, reward, done, info
113 |
114 |
115 | def get_vec_normalize(venv):
116 | if isinstance(venv, VecNormalize):
117 | return venv
118 | elif hasattr(venv, 'venv'):
119 | return get_vec_normalize(venv.venv)
120 |
121 | return None
122 |
123 |
124 |
--------------------------------------------------------------------------------
/slbo/misc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/misc/__init__.py
--------------------------------------------------------------------------------
/slbo/misc/distributions.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.distributions import Distribution, Normal
3 | import math
4 |
5 | class TanhNormal(Distribution):
6 | """
7 | Represent distribution of X where
8 | Z ~ N(mean, std)
9 | X ~ tanh(Z)
10 | Note: this is not very numerically stable.
11 | """
12 | def __init__(self, mean, std, epsilon=1e-6):
13 | """
14 | :param mean: Mean of the normal distribution
15 | :param std: Std of the normal distribution
16 | :param epsilon: Numerical stability epsilon when computing log-prob.
17 | """
18 | super().__init__()
19 | self.normal_mean = mean
20 | self.normal_std = std
21 | self.normal = Normal(mean, std)
22 | self.epsilon = epsilon
23 |
24 | def log_prob(self, value, pre_tanh_value=None):
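        # If the pre-tanh value is not provided it is recovered via atanh; the second
        # term is the tanh change-of-variables correction log(1 - tanh(z)^2), with
        # epsilon added for numerical stability.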
25 | if pre_tanh_value is None:
26 | pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2
27 | return self.normal.log_prob(pre_tanh_value) - torch.log(1 - value * value + self.epsilon)
28 |
29 | def log_probs(self, value, pre_tanh_value):
30 | return self.log_prob(value, pre_tanh_value).sum(-1, keepdim=True)
31 |
32 | def sample(self, sample_shape=torch.Size([])):
33 | z = self.normal.sample(sample_shape)
34 | return torch.tanh(z), z
35 |
36 | def rsample(self, sample_shape=torch.Size([]), return_pretanh_value=False):
37 | z = (
38 | self.normal_mean +
39 | self.normal_std *
40 | Normal(
41 | torch.zeros_like(self.normal_mean),
42 | torch.ones_like(self.normal_std)
43 | ).sample()
44 | )
45 | z.requires_grad_()
46 | return torch.tanh(z), z
47 |
48 | def entropy(self):
49 | return self.normal.entropy().sum(-1)
50 |
51 | def mode(self):
52 |         return torch.tanh(self.normal_mean), self.normal_mean
53 |
54 |
55 | class FixedLimitedEntNormal(torch.distributions.Normal):
56 | def log_probs(self, actions):
57 | return super().log_prob(actions).sum(-1, keepdim=True)
58 |
59 | def entropy(self):
60 | limit = 2.
61 | lo, hi = (-limit - self.loc) / self.scale / math.sqrt(2), (limit - self.loc) / self.scale / math.sqrt(2)
62 | return (0.5 * (self.scale.log() + math.log(2 * math.pi) / 2) * (hi.erf() - lo.erf()) + 0.5 *
63 | (torch.exp(-hi * hi) * hi - torch.exp(-lo * lo) * lo)).sum(-1)
64 |
65 | def mode(self):
66 | return self.mean
67 |
68 |
69 | class FixedCategorical(torch.distributions.Categorical):
70 | def sample(self, **kwargs):
71 | return super().sample(**kwargs).unsqueeze(-1)
72 |
73 | def log_probs(self, actions):
74 | return (
75 | super()
76 | .log_prob(actions.squeeze(-1))
77 | .view(actions.size(0), -1)
78 | .sum(-1)
79 | .unsqueeze(-1)
80 | )
81 |
82 | def mode(self):
83 | return self.probs.argmax(dim=-1, keepdim=True)
84 |
85 |
86 | class FixedNormal(torch.distributions.Normal):
87 |
88 | def log_probs(self, actions):
89 | return super().log_prob(actions).sum(-1, keepdim=True)
90 |
91 | def entropy(self):
92 | return super().entropy().sum(-1)
93 |
94 | def mode(self):
95 | return self.mean
96 |
97 |
98 | class FixedBernoulli(torch.distributions.Bernoulli):
99 |
100 | def log_probs(self, actions):
101 | return super().log_prob(actions).view(actions.size(0), -1).sum(-1, keepdim=True)
102 |
103 | def entropy(self):
104 | return super().entropy().sum(-1)
105 |
106 | def mode(self):
107 | return torch.gt(self.probs, 0.5).float()
108 |
109 |
--------------------------------------------------------------------------------
/slbo/misc/ou_noise.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from slbo.models.actor import Actor
4 |
5 |
6 | class OUNoise(object):
7 |
8 | def __init__(self, action_space, mu=0.0, theta=0.15, sigma=0.3):
9 | self.mu = mu
10 | self.theta = theta
11 | self.sigma = sigma
12 | self.action_space = action_space
13 | self.state = None
14 | self.actor = None
15 |
16 | self.shape = action_space.shape
17 |
18 | self.reset()
19 |
20 | def reset(self):
21 | self.state = torch.ones(self.shape) * self.mu
22 |
23 | def next(self):
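        # Discrete Ornstein-Uhlenbeck update: x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1).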
24 | delta = self.theta * (self.mu - self.state) + self.sigma * torch.randn_like(self.state)
25 | self.state = self.state + delta
26 | return self.state
27 |
28 | def act(self, states):
29 | result = self.actor.act(states)
30 | return (result[0] + self.next(), *result[1:])
31 |
32 | def wrap(self, actor: Actor):
33 | self.actor = actor
34 | self.state = self.state.to(next(actor.parameters()).device)
35 | return self
36 |
37 |
--------------------------------------------------------------------------------
/slbo/misc/param.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | def get_flat_params_from(model: nn.Module):
7 | params = []
8 | for param in model.parameters():
9 | params.append(param.view(-1))
10 |
11 | flat_params = torch.cat(params)
12 | return flat_params
13 |
14 |
15 | def set_flat_params_to(model: nn.Module, flat_params):
16 | prev_ind = 0
17 | for param in model.parameters():
18 | flat_size = int(np.prod(list(param.size())))
19 | param.data.copy_(flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
20 | prev_ind += flat_size
21 |
22 |
23 | def get_flat_grad_from(inputs, grad_grad=False):
24 | grads = []
25 | for param in inputs:
26 | if grad_grad:
27 | grads.append(param.grad.grad.view(-1))
28 | else:
29 | if param.grad is None:
30 | grads.append(torch.zeros(param.view(-1).shape))
31 | else:
32 | grads.append(param.grad.view(-1))
33 |
34 | flat_grad = torch.cat(grads)
35 | return flat_grad
36 |
37 |
38 | def compute_flat_grad(output, inputs, filter_input_ids: set, retain_graph=False, create_graph=False):
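    # Returns the gradient of `output` w.r.t. `inputs` as a single flat vector;
    # parameters whose index is in `filter_input_ids` contribute zero entries,
    # and the .grad fields of the used parameters are cleared afterwards.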
39 | filter_input_ids = filter_input_ids.copy()
40 | if create_graph:
41 | retain_graph = True
42 |
43 | inputs = list(inputs)
44 | params = []
45 | for i, param in enumerate(inputs):
46 | if i not in filter_input_ids:
47 | params.append(param)
48 |
49 | grads = torch.autograd.grad(output, params, retain_graph=retain_graph, create_graph=create_graph)
50 |
51 | j = 0
52 | out_grads = []
53 | for i, param in enumerate(inputs):
54 | if i in filter_input_ids:
55 | out_grads.append(torch.zeros(param.view(-1).shape, device=param.device, dtype=param.dtype))
56 | else:
57 | out_grads.append(grads[j].view(-1))
58 | j += 1
59 | grads = torch.cat(out_grads)
60 |
61 | for param in params:
62 | param.grad = None
63 | return grads
--------------------------------------------------------------------------------
/slbo/misc/utils.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import torch
4 | from torch.utils.tensorboard import SummaryWriter
5 |
6 | from slbo.envs.wrapped_envs import make_vec_envs, get_vec_normalize
7 |
8 |
9 | def log_and_write(logger, writer: SummaryWriter, log_infos: List, global_step: int):
10 | for idx, (name, value) in enumerate(log_infos):
11 | if logger is not None:
12 | logger.logkv('{}.'.format(idx) + name.split('/')[-1], value)
13 | if writer is not None and name.find('/') > -1:
14 | writer.add_scalar(name, value, global_step=global_step)
15 | if logger is not None:
16 | logger.dumpkvs()
17 |
18 |
19 | def collect_traj(actor, envs, buffer, total_step):
20 | episode_rewards = []
21 | episode_lengths = []
22 |
23 | step = 0
24 | while step < total_step:
25 | states = envs.reset()
26 | dones = False
27 | traj = {'states': [], 'actions': [], 'rewards': [], 'next_states': [], 'masks': []}
28 | while not dones:
29 | with torch.no_grad():
30 |                 actions, *_ = actor.act(states, deterministic=False)
31 |
32 |             new_states, rewards, dones, infos = envs.step(actions)
33 | mask = torch.tensor([[0.0] if done_ else [1.0] for done_ in dones], dtype=torch.float32)
34 |
35 | traj['states'].append(states)
36 | traj['actions'].append(actions)
37 | traj['next_states'].append(new_states)
38 | traj['rewards'].append(rewards)
39 | traj['masks'].append(mask)
40 |
41 | states = new_states
42 |
43 | for info_ in infos:
44 | if 'episode' in info_.keys():
45 | episode_rewards.append(info_['episode']['r'])
46 | episode_lengths.append(info_['episode']['l'])
47 |
48 | traj_len = len(traj['actions'])
49 | step += traj_len
50 | buffer.add_traj(traj)
51 |
52 | return episode_rewards, episode_lengths
53 |
54 |
55 | def evaluate(actor, env_name, seed, num_episode, eval_log_dir,
56 | device, max_episode_steps=1000, norm_reward=False, norm_obs=True, obs_rms=None, test=True):
57 | eval_envs = make_vec_envs(env_name, seed + 1, 1, None, eval_log_dir, device, True,
58 | max_episode_steps, norm_reward, norm_obs, test)
59 |
60 | vec_norm = get_vec_normalize(eval_envs)
61 | if vec_norm is not None and norm_obs:
62 | assert obs_rms is not None
63 | vec_norm.training = False
64 | vec_norm.obs_rms = obs_rms
65 |
66 | eval_episode_rewards = []
67 | eval_episode_lengths = []
68 |
69 | obs = eval_envs.reset()
70 |
71 | while len(eval_episode_rewards) < num_episode:
72 | with torch.no_grad():
73 | action, *_ = actor.act(obs, deterministic=True)
74 |
75 | obs, _, done, infos = eval_envs.step(action)
76 |
77 | for info in infos:
78 | if 'episode' in info.keys():
79 | eval_episode_rewards.append(info['episode']['r'])
80 | eval_episode_lengths.append(info['episode']['l'])
81 |
82 | eval_envs.close()
83 |
84 | return eval_episode_rewards, eval_episode_lengths
85 |
--------------------------------------------------------------------------------
/slbo/models/__init__.py:
--------------------------------------------------------------------------------
1 | from slbo.models.actor import Actor
2 | from slbo.models.actor_critic import ActorCritic
3 | from slbo.models.critic import VCritic, QCritic
4 | from slbo.models.dynamics import Dynamics
5 | from slbo.models.normalizers import Normalizers
6 |
--------------------------------------------------------------------------------
/slbo/models/actor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from typing import List, Callable, Optional
4 |
5 | from slbo.models.initializer import normc_init
6 | from slbo.models.utils import MLP, init
7 | from slbo.models.actor_layer import *
8 |
9 |
10 | class Actor(nn.Module):
11 | def __init__(self, state_dim: int, action_space, hidden_dims: List[int],
12 | state_normalizer: Optional[nn.Module], use_limited_entropy=False):
13 | super(Actor, self).__init__()
14 | self.state_dim = state_dim
15 | self.action_dim = action_space
16 | self.hidden_dims = hidden_dims
17 |
18 | self.actor_feature = MLP(state_dim, hidden_dims[-1], hidden_dims[:-1],
19 | activation='Tanh', last_activation='Tanh')
20 | self.state_normalizer = state_normalizer or nn.Identity()
21 |
22 | if action_space.__class__.__name__ == "Discrete":
23 | action_dim = action_space.n
24 | self.actor = CategoricalActorLayer(hidden_dims[-1], action_dim)
25 | elif action_space.__class__.__name__ == "Box":
26 | action_dim = action_space.shape[0]
27 | if use_limited_entropy:
28 | self.actor = LimitedEntGaussianActorLayer(hidden_dims[-1], action_dim, use_state_dependent_std=False)
29 | else:
30 | self.actor = GaussianActorLayer(hidden_dims[-1], action_dim, use_state_dependent_std=False)
31 | elif action_space.__class__.__name__ == "MultiBinary":
32 | action_dim = action_space.shape[0]
33 | self.actor = BernoulliActorLayer(hidden_dims[-1], action_dim)
34 | else:
35 |             raise NotImplementedError
36 |
37 | init_ = lambda m: init(m, normc_init, lambda x: nn.init.constant_(x, 0))
38 | self.actor_feature.init(init_, init_)
39 |
40 | def act(self, states, deterministic=False, reparamterize=False):
41 | states = self.state_normalizer(states)
42 | action_features = self.actor_feature(states)
43 | action_dists, action_means, log_stds = self.actor(action_features)
44 |
45 | if deterministic:
46 | actions = action_dists.mode()
47 | else:
48 | if reparamterize:
49 | actions = action_dists.rsample()
50 | else:
51 | actions = action_dists.sample()
52 |
53 | log_probs = action_dists.log_probs(actions)
54 | entropy = action_dists.entropy().mean()
55 |
56 | return actions, log_probs, entropy, action_means, log_stds, log_stds.exp()
57 |
58 | def evaluate_action(self, states, actions):
59 | states = self.state_normalizer(states)
60 | action_feature = self.actor_feature(states)
61 | action_dist, *_ = self.actor(action_feature)
62 |
63 | log_probs = action_dist.log_probs(actions)
64 | entropy = action_dist.entropy().mean()
65 |
66 | return log_probs, entropy
67 |
--------------------------------------------------------------------------------
/slbo/models/actor_critic.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import numpy as np
4 |
5 | from slbo.models.actor_layer import *
6 | from slbo.models.utils import MLP, init
7 |
8 |
9 | class ActorCritic(nn.Module):
10 |
11 | def __init__(self, dim_state, action_space, actor_hidden_dims: List[int], critic_hidden_dims: List[int],
12 | normalizer: nn.Module = None):
13 | super(ActorCritic, self).__init__()
14 |
15 | self.actor_feature = MLP(dim_state, actor_hidden_dims[-1], actor_hidden_dims[:-1],
16 | activation='Tanh', last_activation='Tanh')
17 | self.critic = MLP(dim_state, 1, critic_hidden_dims, activation='Tanh', last_activation='Identity')
18 | self.normalizer = normalizer or nn.Identity()
19 |
20 | init_ = lambda m: init(m, lambda x: nn.init.orthogonal_(x, np.sqrt(2)), lambda x: nn.init.constant_(x, 0))
21 | self.actor_feature.init(init_, init_)
22 | self.critic.init(init_, init_)
23 |
24 | self.train()
25 |
26 | if action_space.__class__.__name__ == "Discrete":
27 | dim_action = action_space.n
28 | self.actor = CategoricalActorLayer(actor_hidden_dims[-1], dim_action)
29 | elif action_space.__class__.__name__ == "Box":
30 | dim_action = action_space.shape[0]
31 | self.actor = GaussianActorLayer(actor_hidden_dims[-1], dim_action, use_state_dependent_std=False)
32 | elif action_space.__class__.__name__ == "MultiBinary":
33 | dim_action = action_space.shape[0]
34 | self.actor = BernoulliActorLayer(actor_hidden_dims[-1], dim_action)
35 |
36 | def act(self, states, deterministic=False, reparamterize=False):
37 | action_feature, value = self.actor_feature(states), self.critic(states)
38 | action_dist, *_ = self.actor(action_feature)
39 |
40 | if deterministic:
41 | action = action_dist.mode()
42 | else:
43 | if reparamterize:
44 | action = action_dist.rsample()
45 | else:
46 | action = action_dist.sample()
47 |
48 | action_log_prob = action_dist.log_probs(action)
49 | dist_entropy = action_dist.entropy().mean()
50 |
51 | return value, action, action_log_prob, dist_entropy
52 |
53 | def criticize(self, states):
54 | values = self.critic(states)
55 | return values
56 |
57 | def evaluate_action(self, state, action):
58 | action_feature, value = self.actor_feature(state), self.critic(state)
59 |         action_dist, *_ = self.actor(action_feature)
60 |
61 | action_log_probs = action_dist.log_prob(action).sum(-1, keepdim=True)
62 | dist_entropy = action_dist.entropy().mean()
63 |
64 | return value, action_log_probs, dist_entropy
65 |
66 |
--------------------------------------------------------------------------------
/slbo/models/actor_layer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | from slbo.misc.distributions import FixedNormal, FixedCategorical, FixedBernoulli, TanhNormal, FixedLimitedEntNormal
5 | from slbo.models.utils import init
6 |
7 |
8 | class CategoricalActorLayer(nn.Module):
9 | def __init__(self, num_inputs, num_outputs):
10 | super(CategoricalActorLayer, self).__init__()
11 |
12 | self.actor = nn.Linear(num_inputs, num_outputs)
13 | init(self.actor, lambda x: nn.init.orthogonal_(x, 0.01), lambda x: nn.init.constant_(x, 0))
14 |
15 | def forward(self, x):
16 | x = self.actor(x)
17 | return FixedCategorical(logits=x)
18 |
19 |
20 | class GaussianActorLayer(nn.Module):
21 | def __init__(self, num_inputs, num_outputs, use_state_dependent_std):
22 | super(GaussianActorLayer, self).__init__()
23 |
24 | self.actor_mean = nn.Linear(num_inputs, num_outputs)
25 | init(self.actor_mean, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))
26 | self.use_state_dependent_std = use_state_dependent_std
27 | if self.use_state_dependent_std:
28 | self.actor_logstd = nn.Linear(num_inputs, num_outputs)
29 | init(self.actor_logstd, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))
30 |
31 | else:
32 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True)
33 |
34 | def forward(self, x):
35 | action_mean = self.actor_mean(x)
36 |
37 | if self.use_state_dependent_std:
38 | logstd = self.actor_logstd(x)
39 | else:
40 | logstd = self.logstd
41 |
42 | return FixedNormal(action_mean, logstd.exp()), action_mean, logstd
43 |
44 |
45 | class LimitedEntGaussianActorLayer(nn.Module):
46 | def __init__(self, num_inputs, num_outputs, use_state_dependent_std):
47 | super(LimitedEntGaussianActorLayer, self).__init__()
48 |
49 | self.actor_mean = nn.Linear(num_inputs, num_outputs)
50 | init(self.actor_mean, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))
51 | self.use_state_dependent_std = use_state_dependent_std
52 | if self.use_state_dependent_std:
53 | self.actor_logstd = nn.Linear(num_inputs, num_outputs)
54 | init(self.actor_logstd, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))
55 |
56 | else:
57 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True)
58 |
59 | def forward(self, x):
60 | action_mean = self.actor_mean(x)
61 |
62 | if self.use_state_dependent_std:
63 | logstd = self.actor_logstd(x)
64 | else:
65 | logstd = self.logstd
66 |
67 | return FixedLimitedEntNormal(action_mean, logstd.exp()), action_mean, logstd
68 |
69 |
70 | class BernoulliActorLayer(nn.Module):
71 | def __init__(self, num_inputs, num_outputs):
72 | super(BernoulliActorLayer, self).__init__()
73 |
74 | self.actor = nn.Linear(num_inputs, num_outputs)
75 |         init(self.actor, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))
76 |
77 | def forward(self, x):
78 | x = self.actor(x)
79 | return FixedBernoulli(logits=x)
80 |
81 |
82 | class TanhGaussainActorLayer(nn.Module):
83 | def __init__(self, num_inputs, num_outputs, state_dependent_std, init_w=1e-3):
84 | super(TanhGaussainActorLayer, self).__init__()
85 |
86 | self.actor_mean = nn.Linear(num_inputs, num_outputs)
87 | init(self.actor_mean, lambda x: nn.init.uniform_(x, -init_w, init_w),
88 | lambda x: nn.init.uniform_(x, -init_w, init_w))
89 | self.state_dependent_std = state_dependent_std
90 | if self.state_dependent_std:
91 | self.actor_logstd = nn.Linear(num_inputs, num_outputs)
92 |             init(self.actor_logstd, lambda x: nn.init.uniform_(x, -init_w, init_w),
93 |                  lambda x: nn.init.uniform_(x, -init_w, init_w))
94 | else:
95 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True)
96 |
97 | def forward(self, x):
98 | action_mean = self.actor_mean(x)
99 |
100 | if self.state_dependent_std:
101 | action_logstd = self.actor_logstd(x)
102 | else:
103 | action_logstd = self.logstd
104 |
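        # Clamp the log-std to a range commonly used for tanh-Gaussian policies
        # (e.g. in SAC implementations) so the std stays numerically well-behaved.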
105 | action_logstd = torch.clamp(action_logstd, -20, 2)
106 |
107 | return TanhNormal(action_mean, action_logstd.exp()), torch.tanh(action_mean), action_logstd
108 |
--------------------------------------------------------------------------------
/slbo/models/critic.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 | from slbo.models.initializer import normc_init
7 | from slbo.models.utils import MLP, init
8 |
9 |
10 | class QCritic(nn.Module):
11 | def __init__(self, dim_state: int, dim_action: int, hidden_states: List[int]):
12 | super(QCritic, self).__init__()
13 |         self.critic = MLP(dim_state + dim_action, 1, hidden_states)
14 |
15 | def forward(self, state, action):
16 | x = torch.cat([state, action], dim=-1)
17 | return self.critic(x)
18 |
19 |
20 | class VCritic(nn.Module):
21 | def __init__(self, dim_state: int, hidden_dims: List[int], state_normalizer=None, activation='Tanh'):
22 | super(VCritic, self).__init__()
23 | self.critic = MLP(dim_state, 1, hidden_dims, activation=activation)
24 | self.normalizer = state_normalizer or nn.Identity()
25 |
26 | init_ = lambda m: init(m, normc_init, lambda x: nn.init.constant_(x, 0))
27 | self.critic.init(init_, init_)
28 |
29 | def forward(self, state):
30 | state = self.normalizer(state)
31 | return self.critic(state)
32 |
--------------------------------------------------------------------------------
/slbo/models/dynamics.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 | from slbo.models.initializer import truncated_norm_init
7 | from slbo.models.normalizers import Normalizers
8 | from slbo.models.utils import MLP, init
9 |
10 |
11 | class Dynamics(nn.Module):
12 | def __init__(self, state_dim: int, action_dim: int, hidden_dims: List[int], normalizer: Normalizers):
13 | super(Dynamics, self).__init__()
14 | self.dim_state = state_dim
15 | self.dim_action = action_dim
16 | self.normalizer = normalizer
17 | self.diff_dynamics = MLP(state_dim + action_dim, state_dim, hidden_dims, activation='ReLU')
18 |
19 | init_ = lambda m: init(m, truncated_norm_init, lambda x: nn.init.constant_(x, 0))
20 | self.diff_dynamics.init(init_, init_)
21 |
22 | def forward(self, state, action):
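        # The network predicts a *normalized* state difference; it is de-normalized,
        # added to the current state, and the result is clamped in normalized space
        # for numerical stability.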
23 | # action clip is the best normalization according to the authors
24 | x = torch.cat([self.normalizer.state_normalizer(state), action.clamp(-1., 1.)], dim=-1)
25 | normalized_diff = self.diff_dynamics(x)
26 | next_states = state + self.normalizer.diff_normalizer(normalized_diff, inverse=True)
27 | next_states = self.normalizer.state_normalizer(self.normalizer.state_normalizer(next_states).clamp(-100, 100),
28 | inverse=True)
29 | return next_states
30 |
31 |
32 |
--------------------------------------------------------------------------------
/slbo/models/initializer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def normc_init(tensor, std=1.0, **kwargs):
5 | tensor.data.normal_(0, 1)
6 | tensor.data *= std / np.sqrt(tensor.data.pow(2).sum(1, keepdim=True))
7 |
8 |
9 | def fanin_init(tensor, **kwargs):
10 | size = tensor.size()
11 | if len(size) == 2:
12 | fan_in = size[0]
13 | elif len(size) > 2:
14 | fan_in = np.prod(size[1:])
15 | else:
16 | raise Exception("Shape must be have dimension at least 2.")
17 | bound = 1. / np.sqrt(fan_in)
18 | return tensor.data.uniform_(-bound, bound)
19 |
20 |
21 | def truncated_norm_init(tensor, mean=0, std=1e-5, **kwargs):
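    # Approximates a truncated-normal initializer: four candidates are drawn per
    # element and one falling inside (-2, 2) is kept, then scaled by std and shifted by mean.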
22 | size = tensor.shape
23 | tmp = tensor.new_empty(size + (4,)).normal_()
24 | valid = (tmp < 2) & (tmp > -2)
25 | ind = valid.max(-1, keepdim=True)[1]
26 | tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1))
27 | tensor.data.mul_(std).add_(mean)
28 | return tensor
29 |
30 |
--------------------------------------------------------------------------------
/slbo/models/normalizers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributions.kl as kl
3 | import torch.nn as nn
4 | from typing import List
5 | try:
6 | from slbo.misc import logger
7 | except ImportError:
8 | from stable_baselines import logger
9 |
10 |
11 | class GaussianNormalizer(nn.Module):
12 | def __init__(self, shape: List[int], eps=1e-8, verbose=0):
13 | super().__init__()
14 |
15 | self.shape = shape
16 | self.verbose = verbose
17 |
18 | self.mean = torch.zeros(shape, dtype=torch.float32)
19 | self.std = torch.ones(shape, dtype=torch.float32)
20 | self.eps = eps
21 | self.n = 0
22 |
23 | def forward(self, x: torch.Tensor, inverse=False):
24 | if inverse:
25 | return x * self.std + self.mean
26 | return (x - self.mean) / (torch.clamp(self.std, min=self.eps))
27 |
28 | def to(self, *args, **kwargs):
29 | self.mean = self.mean.to(*args, **kwargs)
30 | self.std = self.std.to(*args, **kwargs)
31 |
32 | # noinspection DuplicatedCode
33 | # samples in [batch_size, ...]
34 | def update(self, samples: torch.Tensor):
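        # Streaming mean/std update: the batch statistics are merged with the running
        # estimates (a parallel mean/variance combination); the KL divergence between
        # the old and new Gaussians is computed for optional logging.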
35 | old_mean, old_std, old_n = self.mean, self.std, self.n
36 | samples = samples - old_mean
37 | n = samples.shape[0]
38 | delta = samples.mean(dim=0)
39 | new_n = old_n + n
40 | new_mean = old_mean + delta * n / new_n
41 | new_std = torch.sqrt((old_std**2 * old_n + samples.var(dim=0) * n + delta**2 * old_n * n / new_n) / new_n)
42 | kl_old_new = kl.kl_divergence(torch.distributions.Normal(new_mean, torch.clamp(new_std, 1e-20)),
43 | torch.distributions.Normal(old_mean, torch.clamp(old_std, 1e-20))).sum()
44 | self.mean, self.std, self.n = new_mean, new_std, new_n
45 |
46 | if self.verbose > 0:
47 |             logger.debug("updating GaussianNormalizer, KL divergence = %.6f", kl_old_new)
48 |
49 | # noinspection PyMethodOverriding
50 | def state_dict(self, *args, **kwargs):
51 | return {'mean': self.mean, 'std': self.std, 'n': self.n}
52 |
53 | # noinspection PyMethodOverriding
54 | def load_state_dict(self, state_dict):
55 | self.mean = state_dict['mean']
56 | self.std = state_dict['std']
57 | self.n = state_dict['n']
58 |
59 |
60 | class Normalizers(nn.Module):
61 | def __init__(self, dim_action: int, dim_state: int, verbose=0):
62 | super().__init__()
63 | # action_normalizer is not used
64 | self.action_normalizer = GaussianNormalizer([dim_action], verbose=verbose)
65 | self.state_normalizer = GaussianNormalizer([dim_state], verbose=verbose)
66 | self.diff_normalizer = GaussianNormalizer([dim_state], verbose=verbose)
67 |
68 | def forward(self):
69 |         raise NotImplementedError
70 |
71 | def to(self, *args, **kwargs):
72 | self.action_normalizer.to(*args, **kwargs)
73 | self.state_normalizer.to(*args, **kwargs)
74 | self.diff_normalizer.to(*args, **kwargs)
75 |
76 | # noinspection PyMethodOverriding
77 | def state_dict(self, *args, **kwargs):
78 | return {'action_normalizer': self.action_normalizer.state_dict(),
79 | 'state_normalizer': self.state_normalizer.state_dict(),
80 | 'diff_normalizer': self.diff_normalizer.state_dict()}
81 |
82 | # noinspection PyMethodOverriding, PyTypeChecker
83 | def load_state_dict(self, state_dict):
84 | self.action_normalizer.load_state_dict(state_dict['action_normalizer'])
85 | self.state_normalizer.load_state_dict(state_dict['state_normalizer'])
86 | self.diff_normalizer.load_state_dict(state_dict['diff_normalizer'])
87 |
88 |
89 |
90 |
--------------------------------------------------------------------------------
/slbo/models/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class MLP(nn.Module):
7 | def __init__(self, input_dim, output_dim, hidden_dims, activation='Tanh', last_activation='Identity', biases=None):
8 | super(MLP, self).__init__()
9 | sizes_list = hidden_dims.copy()
10 | self.activation = getattr(nn, activation)()
11 | self.last_activation = getattr(nn, last_activation)()
12 | sizes_list.insert(0, input_dim)
13 | biases = [True] * len(sizes_list) if biases is None else biases.copy()
14 |
15 | layers = []
16 | if 1 < len(sizes_list):
17 | for i in range(len(sizes_list) - 1):
18 | layers.append(nn.Linear(sizes_list[i], sizes_list[i + 1], bias=biases[i]))
19 | self.last_layer = nn.Linear(sizes_list[-1], output_dim)
20 | self.layers = nn.ModuleList(layers)
21 |
22 | def forward(self, x):
23 | for layer in self.layers:
24 | x = layer(x)
25 | x = self.activation(x)
26 | x = self.last_layer(x)
27 | x = self.last_activation(x)
28 | return x
29 |
30 | def init(self, init_fn, last_init_fn):
31 | for layer in self.layers:
32 | init_fn(layer)
33 | last_init_fn(self.last_layer)
34 |
35 |
36 | def soft_update(source_model: nn.Module, target_model: nn.Module, tau):
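    # Polyak averaging: target <- (1 - tau) * target + tau * source.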
37 | for target_param, param in zip(target_model.parameters(), source_model.parameters()):
38 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
39 |
40 |
41 | def copy_model_params_from_to(source, target):
42 | for target_param, param in zip(target.parameters(), source.parameters()):
43 | target_param.data.copy_(param.data)
44 |
45 |
46 | def init(module, weight_init=None, bias_init=None):
47 | if weight_init:
48 | weight_init(module.weight.data)
49 | if bias_init:
50 | bias_init(module.bias.data)
51 |
52 |
53 | def get_flat_params(model):
54 | params = []
55 | for param in model.parameters():
56 | params.append(param.data.view(-1))
57 |
58 | flat_params = torch.cat(params)
59 | return flat_params
60 |
61 |
62 | def set_flat_params(model, flat_params):
63 | prev_ind = 0
64 | for param in model.parameters():
65 | flat_size = int(np.prod(list(param.size())))
66 | param.data.copy_(
67 | flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
68 | prev_ind += flat_size
69 |
70 |
71 | def get_flat_grad(net, grad_grad=False):
72 | grads = []
73 | for param in net.parameters():
74 | if grad_grad:
75 | grads.append(param.grad.grad.view(-1))
76 | else:
77 | grads.append(param.grad.view(-1))
78 |
79 | flat_grad = torch.cat(grads)
80 | return flat_grad
81 |
82 |
--------------------------------------------------------------------------------
/slbo/scripts/run_trpo.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import time
3 | from collections import deque
4 |
5 | import numpy as np
6 | import torch
7 | import torch.backends.cudnn
8 |
9 | from torch.utils.tensorboard import SummaryWriter
10 | import os
11 |
12 | from slbo.algos.mfrl.trpo import TRPO
13 | from slbo.configs.config import Config
14 | from slbo.envs.wrapped_envs import make_vec_envs, get_vec_normalize
15 | from slbo.models import Actor, VCritic
16 | from slbo.misc.utils import evaluate, log_and_write
17 | from slbo.storages.on_policy_buffer import OnPolicyBuffer
18 | try:
19 | from slbo.misc import logger
20 | except ImportError:
21 | from stable_baselines import logger
22 |
23 |
24 | # noinspection DuplicatedCode
25 | def main():
26 | logger.info('Test script for TRPO')
27 | config, hparam_dict = Config('trpo_config.yaml')
28 |
29 | torch.manual_seed(config.seed)
30 | # noinspection PyUnresolvedReferences
31 | torch.cuda.manual_seed_all(config.seed)
32 |
33 | if config.use_cuda and torch.cuda.is_available() and config.cuda_deterministic:
34 | torch.backends.cudnn.benchmark = False
35 | torch.backends.cudnn.deterministic = True
36 |
37 | import datetime
38 | current_time = datetime.datetime.now().strftime('%b%d_%H%M%S')
39 | log_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'log')
40 | eval_log_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'log_eval')
41 | save_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'save')
42 | os.makedirs(log_dir, exist_ok=True)
43 | os.makedirs(eval_log_dir, exist_ok=True)
44 | os.makedirs(save_dir, exist_ok=True)
45 | writer = SummaryWriter(log_dir=log_dir)
46 | writer.add_hparams(hparam_dict, {})
47 |
48 | # save current version of code
49 | shutil.copytree(config.proj_dir, save_dir + '/code', ignore=shutil.ignore_patterns('result', 'data', 'ref'))
50 |
51 | torch.set_num_threads(1)
52 | device = torch.device('cuda' if config.use_cuda else 'cpu')
53 |
54 | envs = make_vec_envs(config.env.env_name, config.seed, config.env.num_envs, config.env.gamma, log_dir, device,
55 | allow_early_resets=False, norm_reward=True, norm_obs=True, test=True)
56 |
57 | state_dim = envs.observation_space.shape[0]
58 | action_space = envs.action_space
59 | action_dim = action_space.shape[0]
60 |
61 | actor = Actor(state_dim, action_space, hidden_dims=config.trpo.actor_hidden_dims,
62 | state_normalizer=None)
63 | critic = VCritic(state_dim, hidden_dims=config.trpo.critic_hidden_dims, state_normalizer=None)
64 | actor.to(device)
65 | critic.to(device)
66 |
67 | agent = TRPO(actor, critic,)
68 |
69 | on_policy_buffer = \
70 | OnPolicyBuffer(config.trpo.num_env_steps, config.env.num_envs, envs.observation_space.shape, envs.action_space,
71 | use_gae=config.trpo.use_gae, gamma=config.env.gamma, gae_lambda=config.trpo.gae_lambda,
72 | use_proper_time_limits=config.trpo.use_proper_time_limits, )
73 |
74 | state = envs.reset()
75 | # noinspection PyUnresolvedReferences
76 | on_policy_buffer.states[0].copy_(state)
77 | on_policy_buffer.to(device)
78 |
79 | episode_rewards = deque(maxlen=10)
80 | episode_lengths = deque(maxlen=10)
81 |
82 | start = time.time()
83 | num_updates = config.trpo.total_env_steps // config.trpo.num_env_steps // config.env.num_envs
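    # Each update consumes num_env_steps transitions from each of the num_envs
    # parallel environments, hence the double division above.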
84 |
85 | for j in range(num_updates):
86 |
87 | for step in range(config.trpo.num_env_steps):
88 | with torch.no_grad():
89 | action, action_log_prob, dist_entropy, *_ = actor.act(on_policy_buffer.states[step])
90 | value = critic(on_policy_buffer.states[step])
91 |
92 | state, reward, done, info = envs.step(action)
93 |
94 | for info_ in info:
95 | if 'episode' in info_.keys():
96 | episode_rewards.append(info_['episode']['r'])
97 | episode_lengths.append(info_['episode']['l'])
98 |
99 | mask = torch.tensor([[0.0] if done_ else [1.0] for done_ in done], dtype=torch.float32)
100 | bad_mask = torch.tensor([[0.0] if 'bad_transition' in info_.keys() else [1.0] for info_ in info],
101 | dtype=torch.float32)
102 | on_policy_buffer.insert(states=state, actions=action, action_log_probs=action_log_prob,
103 | values=value, rewards=reward, masks=mask, bad_masks=bad_mask)
104 |
105 | with torch.no_grad():
106 | next_value = critic(on_policy_buffer.states[-1])
107 |
108 | on_policy_buffer.compute_returns(next_value)
109 | losses = agent.update(on_policy_buffer)
110 | on_policy_buffer.after_update()
111 |
112 | if j % config.save_interval == 0 or j == num_updates - 1:
113 | save_path = os.path.join(save_dir, config.mf_algo)
114 | try:
115 | os.makedirs(save_path)
116 | except OSError:
117 | pass
118 |
119 | logger.info('Model saved.')
120 | torch.save([actor.state_dict(), critic.state_dict(),
121 | getattr(get_vec_normalize(envs), 'obs_rms', None)],
122 | os.path.join(save_path, config.env.env_name + ".pt"))
123 |
124 |         serial_timesteps = (j + 1) * config.trpo.num_env_steps
125 |         total_num_steps = config.env.num_envs * serial_timesteps
126 | end = time.time()
127 |
128 | fps = int(total_num_steps / (end - start))
129 |
130 | if j % config.log_interval == 0 and len(episode_rewards) > 0:
131 |             log_info = [('serial_timesteps', serial_timesteps), ('total_timesteps', total_num_steps),
132 | ('ep_rew_mean', np.mean(episode_rewards)), ('ep_len_mean', np.mean(episode_lengths)),
133 | ('fps', fps), ('time_elapsed', end - start)]
134 |
135 | for loss_name, loss_value in losses.items():
136 | log_info.append((loss_name, loss_value))
137 | log_and_write(logger, writer, log_info, global_step=j)
138 |
139 | if (config.eval_interval is not None and len(episode_rewards) > 0
140 | and j % config.eval_interval == 0):
141 | obs_rms = get_vec_normalize(envs).obs_rms
142 | eval_episode_rewards, eval_episode_lengths = \
143 | evaluate(actor, config.env.env_name, config.seed,
144 | num_episode=10, eval_log_dir=None, device=device, norm_reward=True, norm_obs=True,
145 | obs_rms=obs_rms, test=True)
146 |
147 | logger.info('Evaluation:')
148 | log_and_write(logger, writer, [('eval_ep_rew_mean', np.mean(eval_episode_rewards)),
149 | ('eval_ep_rew_min', np.min(eval_episode_rewards)),
150 | ('eval_ep_rew_max', np.max(eval_episode_rewards))], global_step=j)
151 |
152 | envs.close()
153 |
154 |
155 | if __name__ == "__main__":
156 | main()
157 |
--------------------------------------------------------------------------------
/slbo/storages/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/storages/__init__.py
--------------------------------------------------------------------------------
/slbo/storages/off_policy_buffer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
3 | import numpy as np
4 |
5 |
6 | class OffPolicyBuffer(object):
7 | def __init__(self, buffer_size, num_envs, state_dim, action_dim):
8 | self.buffer_size = buffer_size
9 | self.num_envs = num_envs
10 | self.states = torch.zeros(buffer_size, num_envs, state_dim)
11 | self.next_states = torch.zeros(buffer_size, num_envs, state_dim)
12 | self.rewards = torch.zeros(buffer_size, num_envs, 1)
13 | self.actions = torch.zeros(buffer_size, num_envs, action_dim)
14 | self.masks = torch.ones(buffer_size, num_envs, 1)
15 | self.bad_masks = torch.ones(buffer_size, num_envs, 1)
16 |
17 | self.buffer_size = buffer_size
18 | self.index = 0
19 | self.size = 0
20 | self.device = torch.device('cpu')
21 |
22 | def to(self, device):
23 | self.states = self.states.to(device)
24 | self.next_states = self.next_states.to(device)
25 | self.rewards = self.rewards.to(device)
26 | self.actions = self.actions.to(device)
27 | self.masks = self.masks.to(device)
28 | self.bad_masks = self.bad_masks.to(device)
29 |
30 | self.device = device
31 |
32 | def add_buffer(self, buffer):
33 | for idx in range(buffer.size):
34 | self.insert(buffer.states[idx], buffer.actions[idx], buffer.rewards[idx], buffer.next_states[idx],
35 | buffer.masks[idx], buffer.bad_masks[idx])
36 |
37 | def insert(self, states, actions, rewards, next_states, masks, bad_masks):
38 | self.states[self.index, :, :].copy_(states)
39 | self.actions[self.index, :, :].copy_(actions)
40 | self.rewards[self.index, :, :].copy_(rewards)
41 | self.next_states[self.index, :, :].copy_(next_states)
42 | self.masks[self.index, :, :].copy_(masks)
43 | self.bad_masks[self.index, :, :].copy_(bad_masks)
44 |
45 | self.index = (self.index + 1) % self.buffer_size
46 | self.size = min(self.size + 1, self.buffer_size)
47 |
48 | def clear(self):
49 | self.index = 0
50 | self.size = 0
51 |
52 | def get_batch_generator(self, batch_size):
53 | sampler = BatchSampler(SubsetRandomSampler(range(self.size * self.num_envs)), batch_size, drop_last=True)
54 |
55 | for indices in sampler:
56 | states = self.states.view(-1, *self.states.shape[2:])[indices]
57 | actions = self.actions.view(-1, self.actions.shape[-1])[indices]
58 | rewards = self.rewards.view(-1, 1)[indices]
59 | next_states = self.next_states.view(-1, *self.states.shape[2:])[indices]
60 | masks = self.masks.view(-1, 1)[indices]
61 | bad_masks = self.bad_masks.view(-1, 1)[indices]
62 |
63 | yield {'states': states, 'actions': actions, 'rewards': rewards, 'next_states': next_states,
64 | 'masks': masks, 'bad_masks': bad_masks}
65 |
66 | def get_sequential_batch_generator(self, batch_size, num_steps):
67 | sampler = BatchSampler(SubsetRandomSampler(range(self.size - num_steps)),
68 |                                batch_size // self.num_envs, drop_last=True)
69 |
70 | for indices in sampler:
71 | indices = np.array(indices)
72 | states = torch.zeros(batch_size, num_steps, *self.states.shape[2:], device=self.device)
73 | next_states = torch.zeros(batch_size, num_steps, *self.next_states.shape[2:], device=self.device)
74 | actions = torch.zeros([batch_size, num_steps, self.actions.shape[-1]], device=self.device)
75 | rewards = torch.zeros([batch_size, num_steps, 1], device=self.device)
76 | masks = torch.zeros([batch_size, num_steps, 1], device=self.device)
77 | bad_masks = torch.zeros([batch_size, num_steps, 1], device=self.device)
78 | for step in range(num_steps):
79 | states[:, step, :].copy_(self.states[indices + step].view(-1, *self.states.shape[2:]))
80 | next_states[:, step, :].copy_(self.next_states[indices + step].view(-1, *self.next_states.shape[2:]))
81 | actions[:, step, :].copy_(self.actions[indices + step].view(-1, self.actions.shape[-1]))
82 | rewards[:, step, :].copy_(self.rewards[indices + step].view(-1, 1))
83 | masks[:, step, :].copy_(self.masks[indices + step].view(-1, 1))
84 | bad_masks[:, step, :].copy_(self.bad_masks[indices + step].view(-1, 1))
85 |
86 |             yield {'states': states, 'actions': actions, 'masks': masks, 'next_states': next_states,
87 | 'rewards': rewards, 'bad_masks': bad_masks}
88 |
89 | def load(self, file_name):
90 |         raise NotImplementedError
--------------------------------------------------------------------------------
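Note: the replay buffer above ships without comments, so here is a minimal, self-contained usage sketch. All sizes and the random transitions are hypothetical and only stand in for data that the scripts collect from the real and virtual environments. `get_batch_generator` yields i.i.d. single transitions, while `get_sequential_batch_generator` yields windows of `num_steps` consecutive transitions (useful for multi-step dynamics-model training).

    import torch

    from slbo.storages.off_policy_buffer import OffPolicyBuffer

    # Hypothetical dimensions, for illustration only.
    buffer_size, num_envs, state_dim, action_dim = 1000, 4, 17, 6
    buffer = OffPolicyBuffer(buffer_size, num_envs, state_dim, action_dim)

    # Insert fake transitions; each argument has shape [num_envs, dim].
    for _ in range(200):
        buffer.insert(states=torch.randn(num_envs, state_dim),
                      actions=torch.randn(num_envs, action_dim),
                      rewards=torch.randn(num_envs, 1),
                      next_states=torch.randn(num_envs, state_dim),
                      masks=torch.ones(num_envs, 1),      # 0 where an episode terminated
                      bad_masks=torch.ones(num_envs, 1))  # 0 where a time limit cut the episode

    # I.i.d. transitions, e.g. for single-step model or critic updates.
    for batch in buffer.get_batch_generator(batch_size=256):
        print(batch['states'].shape)   # torch.Size([256, 17])
        break

    # Consecutive windows, e.g. for a multi-step model loss.
    for batch in buffer.get_sequential_batch_generator(batch_size=64, num_steps=2):
        print(batch['states'].shape)   # torch.Size([64, 2, 17])
        break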
/slbo/storages/on_policy_buffer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
3 |
4 |
5 | class OnPolicyBuffer(object):
6 | def __init__(self, num_steps, num_envs, obs_shape, action_space,
7 | use_gae=True, gamma=0.99, gae_lambda=0.95, use_proper_time_limits=True):
8 | self.num_steps = num_steps
9 | self.num_envs = num_envs
10 | self.states = torch.zeros(num_steps + 1, num_envs, *obs_shape)
11 | self.rewards = torch.zeros(num_steps, num_envs, 1)
12 | self.values = torch.zeros(num_steps + 1, num_envs, 1)
13 | self.returns = torch.zeros(num_steps + 1, num_envs, 1)
14 | self.action_log_probs = torch.zeros(num_steps, num_envs, 1)
15 | if action_space.__class__.__name__ == 'Discrete':
16 | action_shape = 1
17 | else:
18 | action_shape = action_space.shape[0]
19 | self.actions = torch.zeros(num_steps, num_envs, action_shape)
20 | if action_space.__class__.__name__ == 'Discrete':
21 | self.actions = self.actions.long()
22 | self.masks = torch.ones(num_steps + 1, num_envs, 1)
23 |         # bad_masks: 0 where the episode was cut off by a time limit, 1 otherwise
24 | self.bad_masks = torch.ones(num_steps + 1, num_envs, 1)
25 |
26 |         # write pointer into the rollout; wraps modulo num_steps
27 | self.step = 0
28 |
29 | self.use_gae = use_gae
30 | self.gamma = gamma
31 | self.gae_lambda = gae_lambda
32 | self.use_proper_time_limits = use_proper_time_limits
33 |
34 | def to(self, device):
35 | self.states = self.states.to(device)
36 | self.rewards = self.rewards.to(device)
37 | self.values = self.values.to(device)
38 | self.returns = self.returns.to(device)
39 | self.action_log_probs = self.action_log_probs.to(device)
40 | self.actions = self.actions.to(device)
41 | self.masks = self.masks.to(device)
42 | self.bad_masks = self.bad_masks.to(device)
43 |
44 | def insert(self, states, actions, action_log_probs,
45 | values, rewards, masks, bad_masks):
46 | self.states[self.step + 1].copy_(states)
47 | self.actions[self.step].copy_(actions)
48 | self.action_log_probs[self.step].copy_(action_log_probs)
49 | self.values[self.step].copy_(values)
50 | self.rewards[self.step].copy_(rewards)
51 | self.masks[self.step + 1].copy_(masks)
52 | self.bad_masks[self.step + 1].copy_(bad_masks)
53 |
54 | self.step = (self.step + 1) % self.num_steps
55 |
56 | def after_update(self):
57 | self.states[0].copy_(self.states[-1])
58 | self.masks[0].copy_(self.masks[-1])
59 | self.bad_masks[0].copy_(self.bad_masks[-1])
60 |
61 | def compute_returns(self, next_value):
62 | if self.use_proper_time_limits:
63 | if self.use_gae:
64 | self.values[-1] = next_value
65 | gae = 0
66 | for step in reversed(range(self.num_steps)):
67 | delta = self.rewards[step] + self.gamma * self.values[step + 1] * self.masks[step + 1] - \
68 | self.values[step]
69 | gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae
70 | gae = gae * self.bad_masks[step + 1]
71 | self.returns[step] = gae + self.values[step]
72 | else:
73 | self.returns[-1] = next_value
74 | for step in reversed(range(self.num_steps)):
75 | self.returns[step] = (self.returns[step + 1] *
76 | self.gamma * self.masks[step + 1] + self.rewards[step]) * self.bad_masks[step + 1] \
77 | + (1 - self.bad_masks[step + 1]) * self.values[step]
78 | else:
79 | if self.use_gae:
80 | self.values[-1] = next_value
81 | gae = 0
82 | for step in reversed(range(self.num_steps)):
83 | delta = self.rewards[step] + self.gamma * self.values[step + 1] * self.masks[step + 1] - self.values[step]
84 | gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae
85 | self.returns[step] = gae + self.values[step]
86 | else:
87 | self.returns[-1] = next_value
88 | for step in reversed(range(self.num_steps)):
89 | self.returns[step] = self.returns[step + 1] * self.gamma * self.masks[step + 1] + self.rewards[step]
90 |
91 | def get_batch_generator(self, batch_size=None, advantages=None):
92 | batch_size = self.num_steps * self.num_envs if batch_size is None else batch_size
93 | sampler = BatchSampler(SubsetRandomSampler(range(self.num_steps * self.num_envs)), batch_size, drop_last=True)
94 |
95 | for indices in sampler:
96 | states = self.states[:-1].view(-1, *self.states.size()[2:])[indices]
97 | actions = self.actions.view(-1, self.actions.size(-1))[indices]
98 | values = self.values[:-1].view(-1, 1)[indices]
99 | returns = self.returns[:-1].view(-1, 1)[indices]
100 | masks = self.masks[:-1].view(-1, 1)[indices]
101 | action_log_probs = self.action_log_probs.view(-1, 1)[indices]
102 | if advantages is None:
103 | adv_targets = None
104 | else:
105 | adv_targets = advantages.view(-1, 1)[indices]
106 |
107 | yield {'states': states, 'actions': actions, 'values': values, 'returns': returns,
108 | 'masks': masks, 'action_log_probs': action_log_probs, 'adv_targets': adv_targets}
109 |
--------------------------------------------------------------------------------
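For completeness, a similarly hedged sketch of how the on-policy rollout buffer is driven. `compute_returns` implements the standard GAE backup, delta_t = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t) and gae = delta_t + gamma * lambda * mask_{t+1} * gae, with bad_masks resetting the accumulator (and, in the non-GAE branch, falling back to the value estimate) where an episode was truncated by a time limit. The dimensions, the Box action space, and the random data below are hypothetical stand-ins for real actor/environment outputs.

    import gym.spaces
    import numpy as np
    import torch

    from slbo.storages.on_policy_buffer import OnPolicyBuffer

    # Hypothetical rollout dimensions, for illustration only.
    num_steps, num_envs, obs_dim, action_dim = 128, 4, 17, 6
    action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(action_dim,), dtype=np.float32)

    buffer = OnPolicyBuffer(num_steps, num_envs, (obs_dim,), action_space,
                            use_gae=True, gamma=0.99, gae_lambda=0.95,
                            use_proper_time_limits=True)
    buffer.states[0].copy_(torch.randn(num_envs, obs_dim))  # initial observations

    # One rollout of fake transitions (normally produced by the actor and the envs).
    for _ in range(num_steps):
        buffer.insert(states=torch.randn(num_envs, obs_dim),
                      actions=torch.randn(num_envs, action_dim),
                      action_log_probs=torch.randn(num_envs, 1),
                      values=torch.randn(num_envs, 1),
                      rewards=torch.randn(num_envs, 1),
                      masks=torch.ones(num_envs, 1),
                      bad_masks=torch.ones(num_envs, 1))

    # Bootstrap from the value of the final state, then back up GAE returns.
    buffer.compute_returns(next_value=torch.zeros(num_envs, 1))
    advantages = buffer.returns[:-1] - buffer.values[:-1]

    for batch in buffer.get_batch_generator(batch_size=256, advantages=advantages):
        print(batch['states'].shape, batch['adv_targets'].shape)  # [256, 17], [256, 1]
        break

    buffer.after_update()  # carry the last state/masks over as the start of the next rollout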