├── .gitignore ├── readme.md ├── requirements.txt ├── result └── slbo.png ├── setup.py └── slbo ├── __init__.py ├── algos ├── __init__.py ├── mbrl │ ├── __init__.py │ └── slbo.py └── mfrl │ ├── __init__.py │ ├── ppo.py │ └── trpo.py ├── configs ├── __init__.py ├── config.py ├── slbo_config.yaml └── trpo_config.yaml ├── envs ├── __init__.py ├── mujoco │ ├── __init__.py │ ├── gym │ │ ├── __init__.py │ │ ├── ant_env.py │ │ ├── half_cheetah_env.py │ │ ├── hopper_env.py │ │ ├── swimmer_env.py │ │ └── walker2d_env.py │ ├── mujoco_envs.py │ └── rllab │ │ ├── __init__.py │ │ ├── ant_env.py │ │ ├── half_cheetah_env.py │ │ ├── hopper_env.py │ │ ├── humanoid_env.py │ │ ├── mujoco_models │ │ ├── ant.xml │ │ ├── green_ball.xml │ │ ├── half_cheetah.xml │ │ ├── hill_ant_env.xml.mako │ │ ├── hill_half_cheetah_env.xml.mako │ │ ├── hill_hopper_env.xml.mako │ │ ├── hill_swimmer3d_env.xml.mako │ │ ├── hill_walker2d_env.xml.mako │ │ ├── hopper.xml │ │ ├── humanoid.xml │ │ ├── inverted_double_pendulum.xml │ │ ├── inverted_double_pendulum.xml.mako │ │ ├── point.xml │ │ ├── red_ball.xml │ │ ├── simple_humanoid.xml │ │ ├── swimmer.xml │ │ ├── swimmer3d.xml │ │ ├── utils.mako │ │ └── walker2d.xml │ │ ├── rllab_ant.xml │ │ ├── rllab_half_cheetah.xml │ │ ├── rllab_hopper.xml │ │ ├── rllab_simple_humanoid.xml │ │ ├── rllab_swimmer.xml │ │ ├── rllab_walker2d.xml │ │ ├── swimmer_env.py │ │ └── walker2d_env.py ├── virtual_env.py └── wrapped_envs.py ├── misc ├── __init__.py ├── distributions.py ├── logger.py ├── ou_noise.py ├── param.py └── utils.py ├── models ├── __init__.py ├── actor.py ├── actor_critic.py ├── actor_layer.py ├── critic.py ├── dynamics.py ├── initializer.py ├── normalizers.py └── utils.py ├── scripts ├── run_slbo.py └── run_trpo.py └── storages ├── __init__.py ├── off_policy_buffer.py └── on_policy_buffer.py /.gitignore: -------------------------------------------------------------------------------- 1 | /envs/mujoco/rllab/mujoco_models/ 2 | /.idea 3 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # SLBO_PyTorch 2 | A PyTorch reimplementation of SLBO 3 | 4 | # Dependency 5 | 6 | Please refer to ./requirements.txt. 7 | 8 | # Usage 9 | 10 | python ./scripts/run_slbo.py 11 | 12 | hyperparams in ./configs/slbo_config.yaml 13 | 14 | # Result 15 | 16 | ![results on modified hopper](./result/slbo.png) 17 | 18 | # Credits 19 | 1. [kostrikov/pytorch-trpo](https://github.com/ikostrikov/pytorch-trpo) 20 | 2. 
[facebookresearch/slbo](https://github.com/facebookresearch/slbo) 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | atari-py==0.2.6 2 | attrs==19.3.0 3 | box2d-py==2.3.8 4 | cloudpickle==1.2.1 5 | colorama==0.4.3 6 | filelock==3.0.12 7 | gtimer==1.0.0b5 8 | gym~=0.17.2 9 | matplotlib==3.1.2 10 | more-itertools==8.0.2 11 | mpi4py==3.0.3 12 | mujoco-py==2.0.2.5 13 | multidict==4.7.5 14 | munch==2.5.0 15 | numpy~=1.18.5 16 | pybullet==2.6.1 17 | PyYAML~=5.3.1 18 | stable-baselines==2.10.0 19 | tensorboard==1.14.0 20 | tensorflow==1.14.0 21 | tensorflow-estimator==1.14.0 22 | termcolor==1.1.0 23 | torch~=1.6.0 24 | scipy~=1.5.1 25 | tqdm~=4.46.1 26 | -------------------------------------------------------------------------------- /result/slbo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/result/slbo.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | setup( 5 | name='slbo_pytorch', 6 | auther='Shengyi Jiang', 7 | author_email='shengyi.jiang@outlook.com', 8 | packages=find_packages(), 9 | package_data={}, 10 | install_requires=[ 11 | 'torch>=1.4.0', 12 | 'gym>=0.17.0', 13 | 'numpy', 14 | 'stable_baselines', 15 | 'pyglib', 16 | 'scipy', 17 | ]) 18 | -------------------------------------------------------------------------------- /slbo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/__init__.py -------------------------------------------------------------------------------- /slbo/algos/__init__.py: -------------------------------------------------------------------------------- 1 | from slbo.algos.mbrl.slbo import SLBO 2 | from slbo.algos.mfrl import PPO, TRPO 3 | -------------------------------------------------------------------------------- /slbo/algos/mbrl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/algos/mbrl/__init__.py -------------------------------------------------------------------------------- /slbo/algos/mbrl/slbo.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | import torch 3 | 4 | from slbo.models.dynamics import Dynamics 5 | from slbo.models.normalizers import Normalizers 6 | 7 | 8 | class SLBO: 9 | def __init__(self, dynamics: Dynamics, normalizers: Normalizers, batch_size: int, num_updates: int, 10 | num_rollout_steps, l2_reg_coef, lr, max_grad_norm=2): 11 | self.dynamics = dynamics 12 | self.normalizers = normalizers 13 | 14 | self.num_updates = num_updates 15 | self.num_rollout_steps = num_rollout_steps 16 | self.batch_size = batch_size 17 | self.l2_reg_coef = l2_reg_coef 18 | self.max_grad_norm = max_grad_norm 19 | 20 | self.dynamics_optimizer = torch.optim.Adam(self.dynamics.parameters(), lr) 21 | 22 | def update(self, model_buffer) -> dict: 23 | 24 | gen = model_buffer.get_sequential_batch_generator(self.batch_size, 
self.num_rollout_steps) 25 | 26 | model_loss_epoch = 0. 27 | l2_loss_epoch = 0. 28 | for _ in range(self.num_updates): 29 | try: 30 | state_sequences, action_sequences, next_state_sequences, mask_sequences = \ 31 | itemgetter(*['states', 'actions', 'next_states', 'masks'])(next(gen)) 32 | except StopIteration: 33 | gen = model_buffer.get_sequential_batch_generator(self.batch_size, self.num_rollout_steps) 34 | state_sequences, action_sequences, next_state_sequences, mask_sequences = \ 35 | itemgetter(*['states', 'actions', 'next_states', 'masks'])(next(gen)) 36 | 37 | cur_states = state_sequences[:, 0] 38 | model_loss = 0. 39 | 40 | for i in range(self.num_rollout_steps): 41 | next_states = self.dynamics(cur_states, action_sequences[:, i]) 42 | diffs = next_states - cur_states - next_state_sequences[:, i] + state_sequences[:, i] 43 | weighted_diffs = diffs / torch.clamp(self.normalizers.diff_normalizer.std, min=1e-6) 44 | model_loss += weighted_diffs.pow(2).mean(-1).sqrt() 45 | 46 | if i < self.num_rollout_steps - 1: 47 | cur_states = state_sequences[:, i + 1] + \ 48 | mask_sequences[:, i] * (next_states - state_sequences[:, i + 1]) 49 | 50 | model_loss = model_loss.mean() / self.num_rollout_steps 51 | params = self.dynamics.parameters() 52 | l2_loss = self.l2_reg_coef * torch.stack([torch.norm(t, p=2) for t in params]).sum() 53 | 54 | model_loss_epoch += model_loss.item() 55 | l2_loss_epoch += l2_loss.item() 56 | 57 | self.dynamics_optimizer.zero_grad() 58 | (model_loss + l2_loss).backward() 59 | torch.nn.utils.clip_grad_norm_(self.dynamics.parameters(), self.max_grad_norm) 60 | self.dynamics_optimizer.step() 61 | 62 | model_loss_epoch /= self.num_updates 63 | return {'model_loss': model_loss_epoch, 'l2_loss': l2_loss_epoch} 64 | -------------------------------------------------------------------------------- /slbo/algos/mfrl/__init__.py: -------------------------------------------------------------------------------- 1 | from slbo.algos.mfrl.ppo import PPO 2 | from slbo.algos.mfrl.trpo import TRPO -------------------------------------------------------------------------------- /slbo/algos/mfrl/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | 5 | from slbo.models.actor_critic import ActorCritic 6 | 7 | 8 | class PPO: 9 | def __init__(self, actor_critic: ActorCritic, clip_param: float, num_grad_updates: int, batch_size: int, 10 | value_loss_coef: float, entropy_coef: float, lr: float = None, max_grad_norm: float = None, 11 | use_clipped_value_loss=True, verbose=0): 12 | self.actor_critic = actor_critic 13 | 14 | self.clip_param = clip_param 15 | self.num_grad_updates = num_grad_updates 16 | self.batch_size = batch_size 17 | self.value_loss_coef = value_loss_coef 18 | self.entropy_coef = entropy_coef 19 | 20 | self.max_grad_norm = max_grad_norm 21 | self.use_clipped_value_loss = use_clipped_value_loss 22 | 23 | self.optimizer = optim.Adam(actor_critic.parameters(), lr=lr) 24 | 25 | self.verbose = verbose 26 | 27 | def update(self, policy_buffer) -> dict: 28 | advantage = policy_buffer.returns[:-1] - policy_buffer.values[:-1] 29 | advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-5) 30 | 31 | value_loss_epoch = 0 32 | action_loss_epoch = 0 33 | dist_entropy_epoch = 0 34 | 35 | for _ in range(self.num_grad_updates): 36 | 37 | data_generator = policy_buffer.get_batch_generator(self.batch_size, advantage) 38 | 39 | for sample in data_generator: 40 | states, 
actions, value_preds, returns, old_action_log_probs, adv_targets = \ 41 | sample['states'], sample['actions'], sample['values'], \ 42 | sample['returns'], sample['action_log_probs'], sample['adv_targets'] 43 | 44 | values, action_log_probs, dist_entropy = self.actor_critic.evaluate_action(states, actions) 45 | 46 | ratio = torch.exp(action_log_probs - old_action_log_probs) 47 | surr1 = ratio * adv_targets 48 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv_targets 49 | 50 | action_loss = -torch.min(surr1, surr2).mean() 51 | 52 | if self.use_clipped_value_loss: 53 | value_pred_clipped = value_preds + \ 54 | (values - value_preds).clamp(-self.clip_param, self.clip_param) 55 | value_losses = (values - returns).pow(2) 56 | value_losses_clipped = ( 57 | value_pred_clipped - returns).pow(2) 58 | value_loss = 0.5 * torch.max(value_losses, 59 | value_losses_clipped).mean() 60 | else: 61 | value_loss = 0.5 * (returns - values).pow(2).mean() 62 | 63 | self.optimizer.zero_grad() 64 | (value_loss * self.value_loss_coef + action_loss - 65 | dist_entropy * self.entropy_coef).backward() 66 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 67 | self.max_grad_norm) 68 | self.optimizer.step() 69 | 70 | value_loss_epoch += value_loss.item() 71 | action_loss_epoch += action_loss.item() 72 | dist_entropy_epoch += dist_entropy.item() 73 | 74 | num_updates = self.num_grad_updates * self.batch_size 75 | 76 | value_loss_epoch /= num_updates 77 | action_loss_epoch /= num_updates 78 | dist_entropy_epoch /= num_updates 79 | 80 | return {'value_loss': value_loss_epoch, 'action_loss': action_loss_epoch, 81 | 'dist_entropy': dist_entropy_epoch} 82 | -------------------------------------------------------------------------------- /slbo/algos/mfrl/trpo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.optimize 3 | import torch 4 | 5 | from slbo.models import Actor, VCritic 6 | from slbo.models.utils import get_flat_params, set_flat_params, get_flat_grad 7 | try: 8 | from slbo.misc import logger 9 | except ImportError: 10 | from stable_baselines import logger 11 | 12 | 13 | # noinspection DuplicatedCode 14 | class TRPO: 15 | def __init__(self, actor: Actor, critic: VCritic, max_kld=1e-2, l2_reg_coef=1e-3, damping=0.1, 16 | entropy_coef=0., line_search_accepted_ratio=0.1, verbose=0): 17 | 18 | self.actor = actor 19 | self.critic = critic 20 | 21 | self.max_kld = max_kld 22 | self.l2_reg = l2_reg_coef 23 | self.damping = damping 24 | self.linesearch_accepted_ratio = line_search_accepted_ratio 25 | self.entropy_coef = entropy_coef 26 | 27 | self.verbose = verbose 28 | 29 | @staticmethod 30 | def get_conjugate_gradient(Avp, b, nsteps, residual_tol=1e-10): 31 | x = torch.zeros_like(b) 32 | r = b.clone() 33 | p = b.clone() 34 | rdotr = torch.dot(r, r) 35 | for i in range(nsteps): 36 | _Avp = Avp(p) 37 | alpha = rdotr / torch.dot(p, _Avp) 38 | x += alpha * p 39 | r -= alpha * _Avp 40 | new_rdotr = torch.dot(r, r) 41 | beta = new_rdotr / rdotr 42 | p = r + beta * p 43 | rdotr = new_rdotr 44 | if rdotr < residual_tol: 45 | break 46 | return x 47 | 48 | def linesearch(self, f, init_params, fullstep, expected_improve_rate, max_backtracks=10): 49 | with torch.no_grad(): 50 | fval = f() 51 | for (_n_backtracks, stepfrac) in enumerate(.5 ** np.arange(max_backtracks)): 52 | new_params = init_params + stepfrac * fullstep 53 | set_flat_params(self.actor, new_params) 54 | newfval = f() 55 | actual_improve = fval - newfval 56 | 
expected_improve = expected_improve_rate * stepfrac 57 | ratio = actual_improve / expected_improve 58 | if self.verbose > 0: 59 | logger.log("a/e/r ", actual_improve.item(), expected_improve.item(), ratio.item()) 60 | if ratio.item() > self.linesearch_accepted_ratio and actual_improve.item() > 0: 61 | return True, new_params 62 | return False, init_params 63 | 64 | # noinspection DuplicatedCode 65 | def update_critic(self, states, targets): 66 | def get_value_loss(params): 67 | set_flat_params(self.critic, torch.tensor(params)) 68 | for param in self.critic.parameters(): 69 | if param.grad is not None: 70 | param.grad.data.fill_(0) 71 | 72 | values = self.critic(states) 73 | value_loss_ = (values - targets).pow(2).mean() 74 | 75 | loss = value_loss_ 76 | for param in self.critic.parameters(): 77 | loss += param.pow(2).sum() * self.l2_reg 78 | loss.backward() 79 | return loss.data.cpu().double().numpy(), get_flat_grad(self.critic).data.cpu().double().numpy() 80 | 81 | flat_params, value_loss, _ = scipy.optimize.fmin_l_bfgs_b(get_value_loss, 82 | get_flat_params(self.critic).cpu().double().numpy(), 83 | maxiter=25) 84 | set_flat_params(self.critic, torch.tensor(flat_params)) 85 | return value_loss 86 | 87 | def update(self, policy_buffer) -> dict: 88 | advantages = policy_buffer.returns[:-1] - policy_buffer.values[:-1] 89 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) 90 | 91 | data_generator = policy_buffer.get_batch_generator(advantages=advantages) 92 | 93 | value_loss_epoch = 0. 94 | action_loss_epoch = 0. 95 | 96 | for sample in data_generator: 97 | states, actions, returns, adv_targets = \ 98 | sample['states'], sample['actions'], sample['returns'], sample['adv_targets'] 99 | 100 | value_loss = self.update_critic(states, returns) 101 | fixed_log_prob = self.actor.evaluate_action(states, actions)[0].detach() 102 | 103 | def get_action_loss(): 104 | log_prob, entropy = self.actor.evaluate_action(states, actions) 105 | action_loss_ = - adv_targets * torch.exp(log_prob - fixed_log_prob) - self.entropy_coef * entropy 106 | return action_loss_.mean() 107 | 108 | def get_kl(): 109 | *_, action_means, action_logstds, action_stds = self.actor.act(states) 110 | 111 | fixed_action_means = action_means.detach() 112 | fixed_action_logstds = action_logstds.detach() 113 | fixed_action_stds = action_stds.detach() 114 | kl = action_logstds - fixed_action_logstds + \ 115 | (fixed_action_stds.pow(2) + (fixed_action_means - action_means).pow(2)) / \ 116 | (2.0 * action_stds.pow(2)) - 0.5 117 | return kl.sum(1, keepdim=True) 118 | 119 | action_loss = get_action_loss() 120 | action_loss_grad = torch.autograd.grad(action_loss, self.actor.parameters()) 121 | flat_action_loss_grad = torch.cat([grad.view(-1) for grad in action_loss_grad]).data 122 | 123 | def Fvp(v): 124 | kl = get_kl() 125 | kl = kl.mean() 126 | 127 | kld_grad = torch.autograd.grad(kl, self.actor.parameters(), create_graph=True) 128 | flat_kld_grad = torch.cat([grad.view(-1) for grad in kld_grad]) 129 | 130 | kl_v = (flat_kld_grad * v).sum() 131 | kld_grad_grad = torch.autograd.grad(kl_v, self.actor.parameters()) 132 | flat_kld_grad_grad = torch.cat([grad.contiguous().view(-1) for grad in kld_grad_grad]).data 133 | 134 | return flat_kld_grad_grad + v * self.damping 135 | 136 | stepdir = self.get_conjugate_gradient(Fvp, -flat_action_loss_grad, 10) 137 | 138 | shs = 0.5 * (stepdir * Fvp(stepdir)).sum(0) 139 | 140 | lm = torch.sqrt(shs / self.max_kld) 141 | fullstep = stepdir / lm 142 | 143 | neggdotstepdir = 
(-flat_action_loss_grad * stepdir).sum(0, keepdim=True) 144 | if self.verbose > 0: 145 | logger.info(("lagrange multiplier:", lm, "grad_norm:", flat_action_loss_grad.norm())) 146 | 147 | prev_params = get_flat_params(self.actor) 148 | success, new_params = self.linesearch(get_action_loss, prev_params, fullstep, neggdotstepdir / lm) 149 | set_flat_params(self.actor, new_params) 150 | 151 | value_loss_epoch += value_loss 152 | action_loss_epoch += action_loss.item() 153 | 154 | return {'action_loss': action_loss_epoch, 'value_loss': value_loss_epoch} 155 | -------------------------------------------------------------------------------- /slbo/configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/configs/__init__.py -------------------------------------------------------------------------------- /slbo/configs/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import yaml 4 | from munch import DefaultMunch 5 | from yaml import Loader 6 | import collections 7 | 8 | try: 9 | from slbo.misc import logger 10 | except ImportError: 11 | from stable_baselines import logger 12 | 13 | 14 | def flatten(d, parent_key='', sep='.'): 15 | items = [] 16 | for k, v in d.items(): 17 | new_key = parent_key + sep + k if parent_key else k 18 | if isinstance(v, collections.MutableMapping): 19 | items.extend(flatten(v, new_key, sep=sep).items()) 20 | else: 21 | items.append((new_key, str(v))) 22 | return dict(items) 23 | 24 | 25 | class Config: 26 | def __new__(cls, config_path='config.yaml'): 27 | if not config_path.startswith('/'): 28 | config_path = os.path.join(os.path.dirname(__file__), config_path) 29 | logger.info('Loading configs from {}.'.format(config_path)) 30 | with open(config_path, 'r', encoding='utf-8') as f: 31 | config_dict = yaml.load(f, Loader=Loader) 32 | config = DefaultMunch.fromDict(config_dict, object()) 33 | config_dict = flatten(config_dict) 34 | logged_config_dict = {} 35 | for key, value in config_dict.items(): 36 | if key.find('.') >= 0: 37 | logged_config_dict[key] = value 38 | return config, logged_config_dict 39 | -------------------------------------------------------------------------------- /slbo/configs/slbo_config.yaml: -------------------------------------------------------------------------------- 1 | mf_algo: 'trpo' 2 | proj_dir: '/home/liuxh/Documents/slbo' 3 | result_dir: './result' 4 | use_cuda: True 5 | seed: 0 6 | verbose: 0 7 | model_load_path: ~ 8 | buffer_load_path: ~ 9 | save_freq: 2 10 | eval_freq: 1 11 | 12 | env: 13 | env_name: 'Hopper-v2' 14 | num_real_envs: 1 15 | num_virtual_envs: 8 16 | gamma: 0.99 17 | max_episode_steps: 500 18 | 19 | ou_noise: 20 | theta: 0.15 21 | sigma: 0.3 22 | 23 | trpo: 24 | entropy_coef: 0.005 25 | max_kld: 0.01 26 | num_env_steps: 500 # 500 x 8 = 4000 27 | critic_hidden_dims: [64, 64] 28 | actor_hidden_dims: [32, 32] 29 | use_limited_ent_actor: True 30 | use_gae: True 31 | gae_lambda: 0.95 32 | use_proper_time_limits: True 33 | log_interval: 5 34 | l2_reg_coef : 0. 
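# Note: trpo.num_env_steps above appears to be counted per virtual env, so one
# policy update consumes 500 x env.num_virtual_envs (= 4000) transitions,
# matching slbo.num_env_steps below (this reading follows the inline
# "# 500 x 8 = 4000" comment; it is an inference, not stated elsewhere).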
35 | norm_reward: False 36 | 37 | slbo: 38 | num_env_steps: 4000 39 | num_epochs: 100 # collect num_env_steps per epoch 40 | num_iters: 20 # number of iteration per epoch 41 | num_model_updates: 100 # number of model updates per iteration 42 | num_policy_updates: 40 # number of policy updates per iteration 43 | use_prev_data: True 44 | dynamics_hidden_dims: [500, 500] 45 | num_rollout_steps: 2 46 | batch_size: 128 47 | buffer_size: 200000 48 | lr: 0.001 49 | l2_reg_coef: 0.00001 50 | log_interval: 1 51 | start_strategy: 'reset' # choose from 'reset' and 'buffer' 52 | -------------------------------------------------------------------------------- /slbo/configs/trpo_config.yaml: -------------------------------------------------------------------------------- 1 | mf_algo: 'trpo' 2 | proj_dir: '/home/polixir/jiangsy/slbo' 3 | result_dir: './result' 4 | use_cuda: False 5 | seed: 0 6 | verbose: 0 7 | model_load_path: ~ 8 | buffer_load_path: ~ 9 | save_interval: 10 10 | log_interval: 1 11 | eval_interval: 10 12 | 13 | env: 14 | env_name: 'Hopper-v2' 15 | num_envs: 1 16 | gamma: 0.99 17 | 18 | ou_noise: 19 | theta: 0.15 20 | sigma: 0.3 21 | 22 | trpo: 23 | total_env_steps: 2000000 24 | entropy_coef: 0. 25 | max_kld: 0.01 26 | num_env_steps: 2048 27 | critic_hidden_dims: [64, 64] 28 | actor_hidden_dims: [64, 64] 29 | use_gae: True 30 | gae_lambda: 0.95 31 | use_proper_time_limits: True 32 | -------------------------------------------------------------------------------- /slbo/envs/__init__.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import gym 3 | import numpy as np 4 | import torch 5 | from stable_baselines import logger 6 | 7 | from slbo.storages.off_policy_buffer import OffPolicyBuffer 8 | 9 | 10 | class BaseBatchedEnv(gym.Env, abc.ABC): 11 | n_envs: int 12 | 13 | @abc.abstractmethod 14 | def step(self, actions): 15 | pass 16 | 17 | def reset(self): 18 | return self.partial_reset(range(self.n_envs)) 19 | 20 | @abc.abstractmethod 21 | def partial_reset(self, indices): 22 | pass 23 | 24 | def set_state(self, state): 25 | logger.warn('`set_state` is not implemented') 26 | 27 | 28 | class BaseModelBasedEnv(gym.Env, abc.ABC): 29 | @abc.abstractmethod 30 | def mb_step(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray): 31 | raise NotImplementedError 32 | 33 | def verify(self, n=2000, eps=1e-4): 34 | buffer = OffPolicyBuffer(n, self.observation_space.shape, 1, self.action_space) 35 | state = self.reset() 36 | for _ in range(n): 37 | action = self.action_space.sample() 38 | next_state, reward, done, _ = self.step(action) 39 | 40 | mask = torch.tensor([0.0] if done else [1.0], dtype=torch.float32) 41 | 42 | buffer.insert(torch.tensor(state), torch.tensor(action), torch.tensor(reward), 43 | torch.tensor(next_state), torch.tensor(mask)) 44 | 45 | state = next_state 46 | if done: 47 | state = self.reset() 48 | 49 | rewards_, dones_ = self.mb_step(buffer.states.numpy(), buffer.actions.numpy(), buffer.next_states.numpy()) 50 | diff = (buffer.rewards.numpy() - rewards_[:, np.newaxis]) * buffer.masks.numpy() 51 | l_inf = np.abs(diff).max() 52 | logger.info('reward difference: %.6f', l_inf) 53 | 54 | assert np.allclose(dones_, buffer.masks), 'reward model is inaccurate' 55 | assert l_inf < eps, 'done model is inaccurate' 56 | 57 | def seed(self, seed: int = None): 58 | pass 59 | 60 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/envs/mujoco/__init__.py -------------------------------------------------------------------------------- /slbo/envs/mujoco/gym/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/envs/mujoco/gym/__init__.py -------------------------------------------------------------------------------- /slbo/envs/mujoco/gym/ant_env.py: -------------------------------------------------------------------------------- 1 | from gym.envs.mujoco import ant 2 | import numpy as np 3 | from stable_baselines import logger 4 | 5 | from slbo.envs import BaseModelBasedEnv 6 | 7 | 8 | # noinspection DuplicatedCode 9 | class AntEnv(ant.AntEnv, BaseModelBasedEnv): 10 | def __init__(self, use_approximated_vel=True): 11 | logger.warn('Modified Gym Envs!') 12 | self.rescale_action = False 13 | self.use_approximated_vel = use_approximated_vel 14 | 15 | ant.AntEnv.__init__(self) 16 | BaseModelBasedEnv.__init__(self) 17 | 18 | def get_body_xmat(self, body_name): 19 | return self.sim.data.get_body_xmat(body_name) 20 | 21 | def get_body_comvel(self, body_name): 22 | return self.sim.data.get_body_xvelp(body_name) 23 | 24 | def _get_obs(self): 25 | return np.concatenate([ 26 | self.sim.data.qpos.flat, # 15 27 | self.sim.data.qvel.flat, # 14 28 | self.get_body_xmat("torso").flat, # 9 29 | self.get_body_com("torso"), # 9 30 | self.get_body_comvel("torso"), # 3 31 | ]).reshape(-1) 32 | 33 | def step(self, action): 34 | pre_pos = self.sim.data.qpos[0] 35 | self.do_simulation(action, self.frame_skip) 36 | post_pos = self.sim.data.qpos[0] 37 | if self.use_approximated_vel: 38 | fwd_reward = (post_pos - pre_pos) / self.dt 39 | else: 40 | fwd_reward = self.get_body_comvel('torso')[0] 41 | ctrl_reward = - .5 * np.square(action).sum() 42 | # make sure the reward can be recovered from state and action completely 43 | contact_reward = - 0. 44 | survive_reward = 1.0 45 | reward = fwd_reward + ctrl_reward + contact_reward + survive_reward 46 | state = self.state_vector() 47 | done = not(np.isfinite(state).all() and 0.2 <= state[2] <= 1.0) 48 | ob = self._get_obs() 49 | return ob, reward, done, {} 50 | 51 | def mb_step(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray): 52 | if self.use_approximated_vel: 53 | reward_forward = (next_states[:, 0] - states[:, 0]) / self.dt 54 | else: 55 | reward_forward = next_states[..., -3] 56 | 57 | ctrl_cost = .5 * np.square(actions).sum(-1) 58 | contact_cost = 0. 59 | survive_reward = 1.0 60 | reward = reward_forward - ctrl_cost - contact_cost + survive_reward 61 | notdone = np.all(0.2 <= next_states[..., 2] <= 1.0, axis=0) 62 | return reward, 1. 
- notdone 63 | 64 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/gym/half_cheetah_env.py: -------------------------------------------------------------------------------- 1 | from gym.envs.mujoco import half_cheetah 2 | import numpy as np 3 | from stable_baselines import logger 4 | 5 | from slbo.envs import BaseModelBasedEnv 6 | 7 | 8 | # noinspection DuplicatedCode 9 | class HalfCheetahEnv(half_cheetah.HalfCheetahEnv, BaseModelBasedEnv): 10 | def __init__(self, use_approximated_vel=True): 11 | self.use_approximated_vel = use_approximated_vel 12 | self.rescale_action = False 13 | if not self.use_approximated_vel: 14 | logger.warn('Modified Gym Env!') 15 | 16 | half_cheetah.HalfCheetahEnv.__init__(self) 17 | BaseModelBasedEnv.__init__(self) 18 | 19 | def get_body_comvel(self, body_name): 20 | return self.sim.data.get_body_xvelp(body_name) 21 | 22 | def _get_obs(self): 23 | return np.concatenate([ 24 | self.model.data.qpos.flat, # 9 25 | self.model.data.qvel.flat, # 9 26 | self.get_body_com("torso").flat, # 3 27 | self.get_body_comvel("torso").flat, # 3 28 | ]) 29 | 30 | def step(self, action: np.ndarray): 31 | pre_pos = self.sim.data.qpos[0] 32 | self.do_simulation(action, self.frame_skip) 33 | post_pos = self.sim.data.qpos[0] 34 | if self.use_approximated_vel: 35 | fwd_reward = (post_pos - pre_pos) / self.dt 36 | else: 37 | fwd_reward = self.get_body_comvel('torso')[0] 38 | ctrl_reward = - 0.1 * np.square(action).sum() 39 | reward = ctrl_reward + fwd_reward 40 | obs = self._get_obs() 41 | return obs, reward, False, {} 42 | 43 | def mb_step(self, states, actions, next_states): 44 | ctrl_rewards = - 0.1 * np.square(actions).sum(-1) 45 | if self.use_approximated_vel: 46 | fwd_rewards = (next_states[:, 0] - states[:, 0]) / self.dt 47 | else: 48 | fwd_rewards = next_states[:, 21] 49 | rewards = fwd_rewards + ctrl_rewards 50 | return rewards, np.zeros_like(rewards, dtype=np.bool) 51 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/gym/hopper_env.py: -------------------------------------------------------------------------------- 1 | from gym.envs.mujoco import hopper 2 | import numpy as np 3 | from stable_baselines import logger 4 | 5 | from slbo.envs import BaseModelBasedEnv 6 | 7 | 8 | # noinspection DuplicatedCode 9 | class HopperEnv(hopper.HopperEnv, BaseModelBasedEnv): 10 | def __init__(self, use_approximated_vel=True): 11 | self.use_approximated_vel = use_approximated_vel 12 | self.rescale_action = False 13 | 14 | if not self.use_approximated_vel: 15 | logger.warn('Modified Gym Env!') 16 | hopper.HopperEnv.__init__(self) 17 | BaseModelBasedEnv.__init__(self) 18 | 19 | def get_body_comvel(self, body_name): 20 | return self.sim.data.get_body_xvelp(body_name) 21 | 22 | def _get_obs(self): 23 | return np.concatenate([ 24 | self.sim.data.qpos.flat, # 6 25 | self.sim.data.qvel.flat, # 6 26 | self.get_body_com("torso").flat, # 3 27 | self.get_body_comvel("torso").flat, # 3 28 | ]) 29 | 30 | def step(self, action): 31 | pre_pos = self.sim.data.qpos[0] 32 | self.do_simulation(action, self.frame_skip) 33 | post_pos, height, ang = self.sim.data.qpos[0:3] 34 | if self.use_approximated_vel: 35 | fwd_reward = (post_pos - pre_pos) / self.dt 36 | else: 37 | fwd_reward = self.get_body_comvel('torso')[0] 38 | survive_reward = 1.0 39 | ctrl_reward = -1e-3 * np.square(action).sum() 40 | reward = fwd_reward + survive_reward + ctrl_reward 41 | s = self.state_vector() 42 | done = not (np.isfinite(s).all() 
and (np.abs(s[2:]) < 100).all() and 43 | (height > .7) and (abs(ang) < .2)) 44 | ob = self._get_obs() 45 | return ob, reward, done, {} 46 | 47 | def mb_step(self, states, actions, next_states): 48 | if self.use_approximated_vel: 49 | fwd_reward = (next_states[:, 0] - states[:, 0]) / self.dt 50 | else: 51 | fwd_reward = next_states[:, -3] 52 | 53 | survive_reward = 1.0 54 | ctrl_reward = -1e-3 * np.square(actions).sum(-1) 55 | 56 | reward = fwd_reward + survive_reward + ctrl_reward 57 | 58 | done = ~((next_states[:, 2:12] < 100).all(axis=-1) & 59 | (next_states[:, 1] > 0.7) & 60 | (np.abs(next_states[:, 2]) < 0.2)) 61 | return reward, done 62 | 63 | 64 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/gym/swimmer_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import swimmer 3 | from stable_baselines import logger 4 | 5 | from slbo.envs import BaseModelBasedEnv 6 | 7 | 8 | # noinspection DuplicatedCode 9 | class SwimmerEnv(swimmer.SwimmerEnv, BaseModelBasedEnv): 10 | def __init__(self, use_approximated_vel=True): 11 | self.use_approximated_vel = use_approximated_vel 12 | self.rescale_action = False 13 | 14 | if not self.use_approximated_vel: 15 | logger.warn('Modified Gym Env!') 16 | 17 | swimmer.SwimmerEnv.__init__(self) 18 | BaseModelBasedEnv.__init__(self) 19 | 20 | def get_body_comvel(self, body_name): 21 | return self.sim.data.get_body_xvelp(body_name) 22 | 23 | def _get_obs(self): 24 | return np.concatenate([ 25 | self.model.data.qpos.flat, # 5 26 | self.model.data.qvel.flat, # 5 27 | self.get_body_com("torso").flat, # 3 28 | self.get_body_comvel("torso").flat, # 3 29 | ]).reshape(-1) 30 | 31 | def step(self, action): 32 | pre_pos = self.sim.data.qpos[0] 33 | self.do_simulation(action, self.frame_skip) 34 | post_pos, height, ang = self.sim.data.qpos[0:3] 35 | if self.use_approximated_vel: 36 | fwd_reward = (post_pos - pre_pos) / self.dt 37 | else: 38 | fwd_reward = self.get_body_comvel('torso')[0] 39 | ctrl_reward = - 0.0001 * np.square(action).sum() 40 | reward = fwd_reward + ctrl_reward 41 | obs = self._get_obs() 42 | return obs, reward, False, {} 43 | 44 | def mb_step(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray): 45 | ctrl_reward = - 0.0001 * np.square(actions).sum(-1) 46 | fwd_reward = next_states[:, -3] 47 | reward = fwd_reward + ctrl_reward 48 | return reward, False 49 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/gym/walker2d_env.py: -------------------------------------------------------------------------------- 1 | from gym.envs.mujoco import walker2d 2 | import numpy as np 3 | from stable_baselines import logger 4 | 5 | from slbo.envs import BaseModelBasedEnv 6 | 7 | 8 | # noinspection DuplicatedCode 9 | class Walker2DEnv(walker2d.Walker2dEnv, BaseModelBasedEnv): 10 | def __init__(self, use_approximated_vel=True): 11 | self.use_approximated_vel = use_approximated_vel 12 | self.rescale_action = False 13 | 14 | if not self.use_approximated_vel: 15 | logger.warn('Modified Gym Env!') 16 | 17 | walker2d.Walker2dEnv.__init__(self) 18 | BaseModelBasedEnv.__init__(self) 19 | 20 | def get_body_comvel(self, body_name): 21 | return self.sim.data.get_body_xvelp(body_name) 22 | 23 | def _get_obs(self): 24 | return np.concatenate([ 25 | self.model.data.qpos.flat, 26 | self.model.data.qvel.flat, 27 | self.get_body_com("torso").flat, 28 | self.get_body_comvel("torso").flat 
29 | ]) 30 | 31 | def step(self, action): 32 | pre_pos = self.sim.data.qpos[0] 33 | self.do_simulation(action, self.frame_skip) 34 | post_pos, height, ang = self.sim.data.qpos[0:3] 35 | if self.use_approximated_vel: 36 | fwd_reward = (post_pos - pre_pos) / self.dt 37 | else: 38 | fwd_reward = self.get_body_comvel('torso')[0] 39 | survive_reward = 1.0 40 | ctrl_reward = - 1e-3 * np.square(action).sum() 41 | reward = fwd_reward + survive_reward + ctrl_reward 42 | done = not (0.8 < height < 2.0 and -1.0 < ang < 1.0) 43 | ob = self._get_obs() 44 | return ob, reward, done, {} 45 | 46 | def mb_step(self, states, actions, next_states): 47 | if self.use_approximated_vel: 48 | fwd_rewards = (states[:, 0] - next_states[:, 0]) / self.dt 49 | else: 50 | fwd_rewards = next_states[:, 21] 51 | survive_rewards = 1.0 52 | ctrl_rewards = - 1e-3 * np.square(actions).sum(-1) 53 | rewards = fwd_rewards + survive_rewards + ctrl_rewards 54 | dones = ~((0.8 < next_states[:, 1] < 2.0) & 55 | (-1.0 < next_states[:, 2] < 1.0)) 56 | return rewards, dones 57 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/mujoco_envs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from slbo.envs.mujoco.gym.ant_env import AntEnv 4 | from slbo.envs.mujoco.gym.half_cheetah_env import HalfCheetahEnv 5 | from slbo.envs.mujoco.gym.hopper_env import HopperEnv 6 | from slbo.envs.mujoco.gym.swimmer_env import SwimmerEnv 7 | from slbo.envs.mujoco.gym.walker2d_env import Walker2DEnv 8 | from slbo.envs.mujoco.rllab.ant_env import RLLabAntEnv 9 | from slbo.envs.mujoco.rllab.half_cheetah_env import RLLabHalfCheetahEnv 10 | from slbo.envs.mujoco.rllab.hopper_env import RLLabHopperEnv 11 | from slbo.envs.mujoco.rllab.humanoid_env import RLLabSimpleHumanoidEnv 12 | from slbo.envs.mujoco.rllab.swimmer_env import RLLabSwimmerEnv 13 | from slbo.envs.mujoco.rllab.walker2d_env import RLLabWalker2dEnv 14 | try: 15 | from slbo.misc import logger 16 | except ImportError: 17 | from stable_baselines import logger 18 | 19 | 20 | def make_mujoco_env(env_name: str): 21 | envs = { 22 | 'HalfCheetah-v2': HalfCheetahEnv, 23 | 'Walker2D-v2': Walker2DEnv, 24 | 'Ant-v2': AntEnv, 25 | 'Hopper-v2': HopperEnv, 26 | 'Swimmer-v2': SwimmerEnv, 27 | 'RLLabHalfCheetah-v2': RLLabHalfCheetahEnv, 28 | 'RLLabWalker2D-v2': RLLabWalker2dEnv, 29 | 'RLLabAnt-v2': RLLabAntEnv, 30 | 'RLLabHopper-v2': RLLabHopperEnv, 31 | 'RLLabSwimmer-v2': RLLabSwimmerEnv, 32 | 'RLLabHumanoid-v2': RLLabSimpleHumanoidEnv 33 | } 34 | env = envs[env_name]() 35 | if not hasattr(env, 'reward_range'): 36 | env.reward_range = (-np.inf, np.inf) 37 | if not hasattr(env, 'metadata'): 38 | env.metadata = {} 39 | env.seed(np.random.randint(2 ** 60)) 40 | return env 41 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/envs/mujoco/rllab/__init__.py -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/ant_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from slbo.envs import BaseModelBasedEnv 8 | 9 | 10 | class RLLabAntEnv(mujoco_env.MujocoEnv, 
utils.EzPickle, BaseModelBasedEnv): 11 | def __init__(self): 12 | self.rescale_action = True 13 | 14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_ant.xml'), 1) 15 | utils.EzPickle.__init__(self) 16 | 17 | def get_body_xmat(self, body_name): 18 | return self.sim.data.get_body_xmat(body_name) 19 | 20 | def get_body_comvel(self, body_name): 21 | return self.sim.data.get_body_xvelp(body_name) 22 | 23 | def _get_obs(self): 24 | return np.concatenate([ 25 | self.sim.data.qpos.flat, # 15 26 | self.sim.data.qvel.flat, # 14 27 | self.get_body_xmat("torso").flat, # 9 28 | self.get_body_com("torso").flat, # 9 (should be 3?) 29 | self.get_body_comvel("torso").flat, # 3 30 | ]).reshape(-1) 31 | 32 | def step(self, action: np.ndarray): 33 | self.do_simulation(action, self.frame_skip) 34 | comvel = self.get_body_comvel("torso") 35 | fwd_reward = comvel[0] 36 | scaling = (self.action_space.high - self.action_space.low) * 0.5 37 | ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) 38 | contact_cost = 0. 39 | survive_reward = 0.05 40 | reward = fwd_reward - ctrl_cost - contact_cost + survive_reward 41 | state = self.state_vector() 42 | done = not (np.isfinite(state).all() and 0.2 <= state[2] <= 1.0) 43 | obs = self._get_obs() 44 | return obs, float(reward), done, {} 45 | 46 | def mb_step(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray): 47 | comvel = next_states[..., -3:] 48 | fwd_reward = comvel[..., 0] 49 | scaling = (self.action_space.high - self.action_space.low) * 0.5 50 | ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(actions / scaling), axis=-1) 51 | contact_cost = 0. 52 | survive_reward = 0.05 53 | reward = fwd_reward - ctrl_cost - contact_cost + survive_reward 54 | notdone = np.all([next_states[..., 2] >= 0.2, next_states[..., 2] <= 1.0], axis=0) 55 | return reward, 1. 
- notdone 56 | 57 | def reset_model(self): 58 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01 59 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1 60 | self.set_state(qpos, qvel) 61 | return self._get_obs() 62 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/half_cheetah_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from slbo.envs import BaseModelBasedEnv 8 | 9 | 10 | class RLLabHalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | def __init__(self): 12 | self.rescale_action = True 13 | 14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_half_cheetah.xml'), 1) 15 | utils.EzPickle.__init__(self) 16 | 17 | def get_body_xmat(self, body_name): 18 | return self.sim.data.get_body_xmat(body_name) 19 | 20 | def get_body_comvel(self, body_name): 21 | return self.sim.data.get_body_xvelp(body_name) 22 | 23 | def _get_obs(self): 24 | return np.concatenate([ 25 | self.sim.data.qpos.flat, # 9 26 | self.sim.data.qvel.flat, # 9 27 | self.get_body_com("torso").flat, # 3 28 | self.get_body_comvel("torso").flat, # 3 29 | ]) 30 | 31 | def step(self, action: np.ndarray): 32 | self.do_simulation(action, self.frame_skip) 33 | action = np.clip(action, self.action_space.low, self.action_space.high) 34 | fwd_reward = self.get_body_comvel("torso")[0] 35 | ctrl_reward = - 0.05 * np.sum(np.square(action)) 36 | reward = ctrl_reward + fwd_reward 37 | obs = self._get_obs() 38 | return obs, reward, False, {} 39 | 40 | def mb_step(self, states, actions, next_states): 41 | actions = np.clip(actions, self.action_space.low, self.action_space.high) 42 | ctrl_rewards = - 0.05 * np.sum(np.square(actions), axis=-1) 43 | fwd_rewards = next_states[..., 21] 44 | rewards = fwd_rewards + ctrl_rewards 45 | return rewards, np.zeros_like(fwd_rewards, dtype=np.bool) 46 | 47 | def reset_model(self): 48 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01 49 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1 50 | self.set_state(qpos, qvel) 51 | return self._get_obs() 52 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/hopper_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from slbo.envs import BaseModelBasedEnv 8 | 9 | 10 | class RLLabHopperEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | def __init__(self): 12 | self.rescale_action = True 13 | 14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_hopper.xml'), 1) 15 | utils.EzPickle.__init__(self) 16 | 17 | def get_body_comvel(self, body_name): 18 | return self.sim.data.get_body_xvelp(body_name) 19 | 20 | def _get_obs(self): 21 | return np.concatenate([ 22 | self.sim.data.qpos.flat, # 6 23 | self.sim.data.qvel.flat, # 6 24 | self.get_body_com("torso").flat, # 3 25 | self.get_body_comvel("torso"), # 3 26 | ]) 27 | 28 | def step(self, action: np.ndarray): 29 | self.do_simulation(action, self.frame_skip) 30 | scaling = 0.5 * (self.action_space.high - self.action_space.low) 31 | vel = self.get_body_comvel("torso")[0] 32 | alive_bonus 
= 1.0 33 | reward = vel + alive_bonus - 0.005 * np.sum(np.square(action / scaling)) 34 | # FIXME 35 | state = self.state_vector() 36 | done = not (np.isfinite(state).all() and 37 | (np.abs(state[3:]) < 100).all() and (state[0] > .7) and 38 | (abs(state[2]) < .2)) 39 | obs = self._get_obs() 40 | return obs, reward, done, {} 41 | 42 | def mb_step(self, states, actions, next_states): 43 | scaling = (self.action_space.high - self.action_space.low) * 0.5 44 | vel = next_states[:, -3] 45 | alive_bonus = 1.0 46 | reward = vel + alive_bonus - 0.005 * np.sum(np.square(actions / scaling), axis=-1) 47 | 48 | done = ~((next_states[:, 3:12] < 100).all(axis=-1) & 49 | (next_states[:, 0] > 0.7) & 50 | (np.abs(next_states[:, 2]) < 0.2)) 51 | return reward, done 52 | 53 | def reset_model(self): 54 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01 55 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1 56 | self.set_state(qpos, qvel) 57 | return self._get_obs() -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/humanoid_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import gym.utils as utils 4 | import numpy as np 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from slbo.envs import BaseModelBasedEnv 8 | 9 | 10 | class RLLabSimpleHumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | def __init__(self): 12 | self.rescale_action = True 13 | 14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_simple_humanoid.xml'), 1) 15 | utils.EzPickle.__init__(self) 16 | 17 | def get_body_xmat(self, body_name): 18 | return self.sim.data.get_body_xmat(body_name) 19 | 20 | def get_body_comvel(self, body_name): 21 | return self.sim.data.get_body_xvelp(body_name) 22 | 23 | def _get_obs(self): 24 | data = self.sim.data 25 | return np.concatenate([ 26 | data.qpos.flat, # 17 27 | data.qvel.flat, # 16 28 | self.get_body_com("torso").flat, # 3 29 | self.get_body_comvel("torso").flat, # 3 30 | ]) 31 | 32 | def step(self, actions: np.ndarray): 33 | alive_bonus = 0.2 34 | comvel = self.get_body_comvel("torso") 35 | lin_vel_reward = comvel[0] 36 | scaling = 0.5 * (self.action_space.high - self.action_space.low) 37 | ctrl_cost = 5e-4 * np.sum(np.square(actions / scaling)) 38 | impact_cost = 0. 39 | vel_deviation_cost = 5e-3 * np.sum(np.square(comvel[1:])) 40 | reward = lin_vel_reward + alive_bonus - ctrl_cost - impact_cost - vel_deviation_cost 41 | done = not (0.8 <= self.sim.data.qpos.flat[2] <= 2.0) 42 | next_obs = self._get_obs() 43 | return next_obs, reward, done, {} 44 | 45 | def mb_step(self, states, actions, next_states): 46 | scaling = 0.5 * (self.action_space.high - self.action_space.low) 47 | 48 | alive_bonus = 0.2 49 | lin_vel_reward = next_states[:, 36] 50 | ctrl_cost = 5.e-4 * np.square(actions / scaling).sum(axis=1) 51 | impact_cost = 0. 
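# Index bookkeeping (from _get_obs above): the observation is laid out as
# [qpos (17), qvel (16), torso com (3), torso comvel (3)], so the comvel
# occupies indices 36-38. next_states[:, 36] above is therefore the forward
# center-of-mass velocity used for lin_vel_reward, and next_states[:, 37:39]
# below are its lateral/vertical components penalized by vel_deviation_cost.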
52 | vel_deviation_cost = 5.e-3 * np.square(next_states[:, 37:39]).sum(axis=1) 53 | reward = lin_vel_reward + alive_bonus - ctrl_cost - impact_cost - vel_deviation_cost 54 | 55 | dones = not (0.8 <= next_states[:, 2] <= 2.0) 56 | return reward, dones 57 | 58 | def reset_model(self): 59 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01 60 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1 61 | self.set_state(qpos, qvel) 62 | return self._get_obs() 63 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 81 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/green_ball.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/half_cheetah.xml: -------------------------------------------------------------------------------- 1 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 96 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/hill_ant_env.xml.mako: -------------------------------------------------------------------------------- 1 | <% 2 | difficulty = opts.get("difficulty", 1.0) 3 | texturedir = opts.get("texturedir", "/tmp/mujoco_textures") 4 | hfield_file = opts.get("hfield_file", "/tmp/mujoco_terrains/hills.png") 5 | %> 6 | 7 | 8 | 88 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/hill_half_cheetah_env.xml.mako: -------------------------------------------------------------------------------- 1 | 35 | <% 36 | difficulty = opts.get("difficulty", 1.0) 37 | texturedir = opts.get("texturedir", "/tmp/mujoco_textures") 38 | hfield_file = opts.get("hfield_file", "/tmp/mujoco_terrains/hills.png") 39 | %> 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 103 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/hill_hopper_env.xml.mako: -------------------------------------------------------------------------------- 1 | <% 2 | difficulty = opts.get("difficulty", 1.0) 3 | texturedir = opts.get("texturedir", "/tmp/mujoco_textures") 4 | hfield_file = opts.get("hfield_file", "/tmp/mujoco_terrains/hills.png") 5 | %> 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 53 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/hill_swimmer3d_env.xml.mako: -------------------------------------------------------------------------------- 1 | <% 2 | difficulty = opts.get("difficulty", 1.0) 3 | texturedir = opts.get("texturedir", "/tmp/mujoco_textures") 4 | hfield_file = opts.get("hfield_file", "/tmp/mujoco_terrains/hills.png") 5 | %> 6 | 7 | 8 | 9 | 10 | 11 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/hill_walker2d_env.xml.mako: -------------------------------------------------------------------------------- 1 | <% 2 | 
difficulty = opts.get("difficulty", 1.0) 3 | texturedir = opts.get("texturedir", "/tmp/mujoco_textures") 4 | hfield_file = opts.get("hfield_file", "/tmp/mujoco_terrains/hills.png") 5 | %> 6 | 7 | 8 | 68 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/hopper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 46 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/humanoid.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/inverted_double_pendulum.xml: -------------------------------------------------------------------------------- 1 | 16 | 17 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 29 | 30 | 31 | 99 | 100 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/inverted_double_pendulum.xml.mako: -------------------------------------------------------------------------------- 1 | 16 | <% 17 | noise = opts.get("noise", False) 18 | pole1_height = 0.6 19 | pole2_height = 0.6 20 | if noise: 21 | import numpy as np 22 | pole1_height = pole1_height + np.random.uniform(-0.1, 0.4) 23 | pole2_height = pole2_height + np.random.uniform(-0.1, 0.4) 24 | %> 25 | 26 | 27 | 28 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 40 | 41 | 42 | 110 | 111 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/point.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 32 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/red_ball.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/simple_humanoid.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 125 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/swimmer.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- 
/slbo/envs/mujoco/rllab/mujoco_models/swimmer3d.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/utils.mako: -------------------------------------------------------------------------------- 1 | <%def name="make_maze(structure, height, size_scaling)"> 2 | % for i in xrange(len(structure)): 3 | % for j in xrange(len(structure[0])): 4 | % if str(structure[i][j]) == '1': 5 | 15 | % endif 16 | % endfor 17 | % endfor 18 | 19 | 20 | <%def name="make_contacts(geom_name, structure)"> 21 | % for i in xrange(len(structure)): 22 | % for j in xrange(len(structure[0])): 23 | % if str(structure[i][j]) == '1': 24 | 28 | % endif 29 | % endfor 30 | % endfor 31 | 32 | 33 | <%def name="find_goal_range(structure, size_scaling)"> 34 | <% 35 | found = False 36 | goal_range = [] 37 | for i in xrange(len(structure)): 38 | for j in xrange(len(structure[0])): 39 | if structure[i][j] == 'g': 40 | goal_range.append(j*size_scaling-size_scaling*0.5), 41 | goal_range.append(j*size_scaling+size_scaling*0.5), 42 | goal_range.append(i*size_scaling-size_scaling*0.5), 43 | goal_range.append(i*size_scaling+size_scaling*0.5), 44 | found = True 45 | break 46 | if found: 47 | break 48 | %> 49 | 50 | 51 | 52 | <%def name="find_robot(structure, size_scaling, z_offset=0)"> 53 | <% 54 | robot_pos = [0, 0, z_offset] 55 | found = False 56 | for i in xrange(len(structure)): 57 | for j in xrange(len(structure[0])): 58 | if structure[i][j] == 'r': 59 | robot_pos[0] = j*size_scaling 60 | robot_pos[1] = i*size_scaling 61 | found = True 62 | break 63 | if found: 64 | break 65 | %> 66 | ${' '.join(map(str, robot_pos))} 67 | 68 | 69 | <%def name="encode_map(structure, size_scaling)"> 70 | <% 71 | data = [] 72 | data.append(len(structure)) 73 | data.append(len(structure[0])) 74 | data.append(size_scaling) 75 | for i in xrange(len(structure)): 76 | for j in xrange(len(structure[0])): 77 | if structure[i][j] == 1: 78 | data.append(1) 79 | elif structure[i][j] == 'g': 80 | data.append(2) 81 | else: 82 | data.append(0) 83 | %> 84 | ${' '.join(map(str, data))} 85 | 86 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/mujoco_models/walker2d.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 61 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/rllab_ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 81 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/rllab_half_cheetah.xml: -------------------------------------------------------------------------------- 1 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 96 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/rllab_hopper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 46 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/rllab_simple_humanoid.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 125 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/rllab_swimmer.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/rllab_walker2d.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 61 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/swimmer_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import gym.utils as utils 4 | import numpy as np 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from slbo.envs import BaseModelBasedEnv 8 | 9 | 10 | class RLLabSwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | def __init__(self): 12 | self.rescale_action = True 13 | 14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_swimmer.xml'), 50) 15 | utils.EzPickle.__init__(self) 16 | 17 | def get_body_xmat(self, body_name): 18 | return self.sim.data.get_body_xmat(body_name) 19 | 20 | def get_body_comvel(self, body_name): 21 | return self.sim.data.get_body_xvelp(body_name) 22 | 23 | def _get_obs(self): 24 | return np.concatenate([ 25 | self.sim.data.qpos.flat, # 5 26 | self.sim.data.qvel.flat, # 5 27 | self.get_body_com("torso").flat, # 3 28 | self.get_body_comvel("torso").flat, # 3 29 | ]).reshape(-1) 30 | 31 | def step(self, action: np.ndarray): 32 | self.do_simulation(action, self.frame_skip) 33 | scaling = 0.5 * (self.action_space.high - self.action_space.low) 34 | ctrl_cost = 0.005 * np.sum(np.square(action / scaling)) 35 | fwd_reward = self.get_body_comvel("torso")[0] 36 | reward = fwd_reward - ctrl_cost 37 | obs = self._get_obs() 38 | return obs, reward, False, {} 39 | 40 | def mb_step(self, states: np.ndarray, actions: np.ndarray, next_states: np.ndarray): 41 | scaling = 0.5 * (self.action_space.high - self.action_space.low) 42 | ctrl_cost = 0.005 * np.sum(np.square(actions / scaling), axis=-1) 43 | fwd_reward = next_states[:, -3] 44 | reward = fwd_reward - ctrl_cost 45 | return reward, np.zeros_like(reward, dtype=np.bool) 46 | 47 | def reset_model(self): 48 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01 49 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1 50 | self.set_state(qpos, qvel) 51 | return self._get_obs() 52 | -------------------------------------------------------------------------------- /slbo/envs/mujoco/rllab/walker2d_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import gym.utils as utils 4 | import numpy as np 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from slbo.envs import BaseModelBasedEnv 8 | 9 | 10 | class RLLabWalker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | def __init__(self): 12 | self.rescale_action = True 13 | 14 | mujoco_env.MujocoEnv.__init__(self, os.path.join(os.path.dirname(__file__), 'rllab_walker2d.xml'), 1) 15 | utils.EzPickle.__init__(self) 16 | 17 | def get_body_xmat(self, 
body_name): 18 | return self.sim.data.get_body_xmat(body_name) 19 | 20 | def get_body_comvel(self, body_name): 21 | return self.sim.data.get_body_xvelp(body_name) 22 | 23 | def _get_obs(self): 24 | return np.concatenate([ 25 | self.sim.data.qpos.flat, 26 | self.sim.data.qvel.flat, 27 | self.get_body_com("torso").flat, 28 | self.get_body_comvel("torso").flat 29 | ]) 30 | 31 | def step(self, action: np.ndarray): 32 | self.do_simulation(action, self.frame_skip) 33 | fwd_reward = self.get_body_comvel("torso")[0] 34 | scaling = 0.5 * (self.action_space.high - self.action_space.low) 35 | ctrl_cost = 1e-3 * np.sum(np.square(action / scaling)) 36 | alive_bonus = 1. 37 | reward = fwd_reward - ctrl_cost + alive_bonus 38 | qpos = self.sim.data.qpos 39 | done = not (0.8 < qpos[0] < 2.0 and -1.0 < qpos[2] < 1.0) 40 | obs = self._get_obs() 41 | return obs, reward, done, {} 42 | 43 | def mb_step(self, states, actions, next_states): 44 | scaling = 0.5 * (self.action_space.high - self.action_space.low) 45 | reward_ctrl = -0.001 * np.sum(np.square(actions / scaling), axis=-1) 46 | reward_fwd = next_states[:, 21] 47 | alive_bonus = 1. 48 | rewards = reward_ctrl + reward_fwd + alive_bonus 49 | dones = ~((0.8 < next_states[:, 0]) & (next_states[:, 0] < 2.0) & (-1.0 < next_states[:, 2]) & (next_states[:, 2] < 1.0))  # vectorized per-sample termination check 50 | return rewards, dones 51 | 52 | def reset_model(self): 53 | qpos = self.init_qpos + self.np_random.normal(size=self.init_qpos.shape) * 0.01 54 | qvel = self.init_qvel + self.np_random.normal(size=self.init_qvel.shape) * 0.1 55 | self.set_state(qpos, qvel) 56 | return self._get_obs() 57 | -------------------------------------------------------------------------------- /slbo/envs/virtual_env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from stable_baselines.common.vec_env.base_vec_env import VecEnv 4 | import torch 5 | 6 | from slbo.envs import BaseModelBasedEnv 7 | from slbo.models.dynamics import Dynamics 8 | 9 | 10 | class VirtualEnv(gym.Env): 11 | def __init__(self, dynamics: Dynamics, env: BaseModelBasedEnv, seed): 12 | super().__init__() 13 | self.observation_space = env.observation_space 14 | self.action_space = env.action_space 15 | 16 | self.state_dim = self.observation_space.shape[0] 17 | self.action_dim = self.action_space.shape[0] 18 | 19 | self.dynamics = dynamics 20 | self.device = next(self.dynamics.parameters()).device 21 | self.env = env 22 | self.env.seed(seed) 23 | 24 | self.state = np.zeros([self.observation_space.shape[0]], dtype=np.float32) 25 | 26 | def _rescale_action(self, action): 27 | lo, hi = self.action_space.low, self.action_space.high 28 | return lo + (action + 1.)
* 0.5 * (hi - lo) 29 | 30 | def step_await(self, action: np.ndarray): 31 | states = self.state.reshape([1, self.state_dim]) 32 | actions = action.reshape([1, self.action_dim]) 33 | rescaled_actions = self._rescale_action(action).reshape([1, self.action_dim]) 34 | with torch.no_grad(): 35 | next_states = self.dynamics(torch.tensor(states, device=self.device, dtype=torch.float32), 36 | torch.tensor(actions, device=self.device, dtype=torch.float32)).cpu().numpy() 37 | rewards, dones = self.env.mb_step(states, rescaled_actions, next_states) 38 | reward, done = rewards[0], dones[0] 39 | self.state = next_states[0] 40 | return self.state.copy(), reward.copy(), done.copy(), {} 41 | 42 | def reset(self) -> np.ndarray: 43 | self.state = self.env.reset() 44 | return self.state.copy() 45 | 46 | def set_state(self, state: np.ndarray): 47 | self.state = state.copy() 48 | 49 | def render(self, mode='human'): 50 | raise NotImplemented 51 | 52 | 53 | class VecVirtualEnv(VecEnv): 54 | def __init__(self, dynamics: Dynamics, env: BaseModelBasedEnv, num_envs, seed, max_episode_steps=1000, 55 | auto_reset=True): 56 | super(VecEnv, self).__init__() 57 | self.observation_space = env.observation_space 58 | self.action_space = env.action_space 59 | 60 | self.state_dim = self.observation_space.shape[0] 61 | self.action_dim = self.action_space.shape[0] 62 | self.num_envs = num_envs 63 | self.max_episode_steps = max_episode_steps 64 | self.auto_reset = auto_reset 65 | 66 | self.dynamics = dynamics 67 | self.device = next(self.dynamics.parameters()).device 68 | self.env = env 69 | self.env.seed(seed) 70 | 71 | self.elapsed_steps = np.zeros([self.num_envs], dtype=np.int32) 72 | self.episode_rewards = np.zeros([self.num_envs]) 73 | 74 | self.states = np.zeros([self.num_envs, self.observation_space.shape[0]], dtype=np.float32) 75 | 76 | def _rescale_action(self, actions: np.array): 77 | lo, hi = self.action_space.low, self.action_space.high 78 | return lo + (actions + 1.) 
* 0.5 * (hi - lo) 79 | 80 | def step_async(self, actions): 81 | self.actions = actions 82 | 83 | def step_wait(self): 84 | rescaled_actions = self._rescale_action(self.actions) 85 | self.elapsed_steps += 1 86 | with torch.no_grad(): 87 | next_states = self.dynamics(torch.tensor(self.states, device=self.device, dtype=torch.float32), 88 | torch.tensor(self.actions, device=self.device, dtype=torch.float32)).cpu().numpy() 89 | rewards, dones = self.env.mb_step(self.states, rescaled_actions, next_states) 90 | self.episode_rewards += rewards 91 | self.states = next_states.copy() 92 | timeouts = self.elapsed_steps == self.max_episode_steps 93 | dones |= timeouts 94 | info_dicts = [{} for _ in range(self.num_envs)] 95 | for i, (done, timeout) in enumerate(zip(dones, timeouts)): 96 | if done: 97 | info = {'episode': {'r': self.episode_rewards[i], 'l': self.elapsed_steps[i]}} 98 | if timeout: 99 | info.update({'TimeLimit.truncated': True}) 100 | info_dicts[i] = info 101 | else: 102 | info_dicts[i] = {} 103 | if self.auto_reset: 104 | self.reset(np.argwhere(dones).squeeze(axis=-1)) 105 | return self.states.copy(), rewards.copy(), dones.copy(), info_dicts 106 | 107 | # if indices = None, every env will be reset 108 | def reset(self, indices=None) -> np.ndarray: 109 | # have to distinguish [] and None 110 | indices = np.arange(self.num_envs) if indices is None else indices 111 | if np.size(indices) == 0: 112 | return np.array([]) 113 | states = np.array([self.env.reset() for _ in indices]) 114 | self.states[indices] = states 115 | self.elapsed_steps[indices] = 0 116 | self.episode_rewards[indices] = 0. 117 | return states.copy() 118 | 119 | # if indices = None, every env will be set 120 | def set_state(self, states: np.ndarray, indices=None): 121 | indices = indices or np.arange(self.num_envs) 122 | assert states.ndim == 2 and states.shape[0] == indices.shape[0] 123 | self.states[indices] = states.copy() 124 | # set_state should reset reward and length 125 | self.elapsed_steps[indices] = 0 126 | self.episode_rewards[indices] = 0. 
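    # Usage sketch (illustrative; the actual construction lives in make_vec_virtual_envs()
    # in slbo/envs/wrapped_envs.py):
    #     venv = VecVirtualEnv(dynamics, make_mujoco_env(env_name), num_envs, seed)
    #     states = venv.reset()
    #     venv.step_async(actions)        # actions are expected in [-1, 1]
    #     states, rewards, dones, infos = venv.step_wait()
    # Next states are predicted by the learned dynamics model; rewards and terminations
    # are computed analytically by the wrapped real env's mb_step(), which receives the
    # actions rescaled to the env's true action bounds.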
127 | 128 | def close(self): 129 | pass 130 | 131 | def seed(self, seed): 132 | return self.env.seed(seed) 133 | 134 | def render(self, mode='human'): 135 | raise NotImplemented 136 | 137 | def set_attr(self, attr_name, value, indices=None): 138 | raise NotImplemented 139 | 140 | def get_attr(self, attr_name, indices=None): 141 | raise NotImplemented 142 | 143 | def env_method(self, method_name, *method_args, indices=None, **method_kwargs): 144 | raise NotImplemented 145 | 146 | -------------------------------------------------------------------------------- /slbo/envs/wrapped_envs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | import gym 5 | from gym.wrappers import TimeLimit 6 | import torch 7 | from stable_baselines import bench 8 | from stable_baselines.common.vec_env import VecEnvWrapper 9 | from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv 10 | from stable_baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 11 | from stable_baselines.common.vec_env.vec_normalize import VecNormalize 12 | 13 | from slbo.envs.mujoco.mujoco_envs import make_mujoco_env 14 | from slbo.envs.virtual_env import VirtualEnv, VecVirtualEnv 15 | from slbo.models.dynamics import Dynamics 16 | 17 | 18 | def make_env(env_id, seed, rank, log_dir, allow_early_resets, max_episode_steps, test=True): 19 | def _thunk(): 20 | if test: 21 | env = gym.make(env_id) 22 | else: 23 | env = make_mujoco_env(env_id) 24 | env = TimeLimit(env, max_episode_steps) 25 | 26 | env.seed(seed + rank) 27 | log_dir_ = os.path.join(log_dir, str(rank)) if log_dir is not None else log_dir 28 | env = bench.Monitor(env, log_dir_, allow_early_resets=allow_early_resets) 29 | 30 | return env 31 | 32 | return _thunk 33 | 34 | 35 | def make_vec_envs(env_name: str, 36 | seed: int, 37 | num_envs: int, 38 | gamma: float, 39 | log_dir: Optional[str], 40 | device: torch.device, 41 | allow_early_resets: bool, 42 | max_episode_steps: int = 1000, 43 | norm_reward=True, 44 | norm_obs=True, 45 | test=False, 46 | ): 47 | envs = [ 48 | make_env(env_name, seed, i, log_dir, allow_early_resets, max_episode_steps, test) 49 | for i in range(num_envs) 50 | ] 51 | 52 | if len(envs) > 1: 53 | envs = SubprocVecEnv(envs) 54 | else: 55 | envs = DummyVecEnv(envs) 56 | 57 | if len(envs.observation_space.shape) == 1: 58 | if gamma is None: 59 | envs = VecNormalize(envs, norm_reward=False, norm_obs=norm_obs) 60 | else: 61 | envs = VecNormalize(envs, gamma=gamma, norm_reward=norm_reward, norm_obs=norm_obs) 62 | 63 | envs = VecPyTorch(envs, device) 64 | 65 | return envs 66 | 67 | 68 | def make_vec_virtual_envs(env_name: str, 69 | dynamics: Dynamics, 70 | seed: int, 71 | num_envs: int, 72 | gamma: Optional[float], 73 | device: torch.device, 74 | allow_early_resets: bool, 75 | max_episode_steps: int = 1000, 76 | norm_reward=False, 77 | norm_obs=False, 78 | ): 79 | envs = VecVirtualEnv(dynamics, make_mujoco_env(env_name), num_envs, seed, max_episode_steps) 80 | 81 | if len(envs.observation_space.shape) == 1 and (norm_reward or norm_obs): 82 | if gamma is None: 83 | envs = VecNormalize(envs, norm_reward=False, norm_obs=norm_obs) 84 | else: 85 | envs = VecNormalize(envs, gamma=gamma, norm_reward=norm_reward, norm_obs=norm_obs) 86 | 87 | envs = VecPyTorch(envs, device) 88 | 89 | return envs 90 | 91 | 92 | class VecPyTorch(VecEnvWrapper): 93 | def __init__(self, venv, device): 94 | super(VecPyTorch, self).__init__(venv) 95 | self.device = device 96 | 97 | def 
reset(self): 98 | obs = self.venv.reset() 99 | obs = torch.from_numpy(obs).float().to(self.device) 100 | return obs 101 | 102 | def step_async(self, actions): 103 | if isinstance(actions, torch.LongTensor): 104 | actions = actions.squeeze(1) 105 | actions = actions.cpu().numpy() 106 | self.venv.step_async(actions) 107 | 108 | def step_wait(self): 109 | obs, reward, done, info = self.venv.step_wait() 110 | obs = torch.from_numpy(obs).float().to(self.device) 111 | reward = torch.from_numpy(reward).unsqueeze(dim=1).float() 112 | return obs, reward, done, info 113 | 114 | 115 | def get_vec_normalize(venv): 116 | if isinstance(venv, VecNormalize): 117 | return venv 118 | elif hasattr(venv, 'venv'): 119 | return get_vec_normalize(venv.venv) 120 | 121 | return None 122 | 123 | 124 | -------------------------------------------------------------------------------- /slbo/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/misc/__init__.py -------------------------------------------------------------------------------- /slbo/misc/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.distributions import Distribution, Normal 3 | import math 4 | 5 | class TanhNormal(Distribution): 6 | """ 7 | Represent distribution of X where 8 | Z ~ N(mean, std) 9 | X ~ tanh(Z) 10 | Note: this is not very numerically stable. 11 | """ 12 | def __init__(self, mean, std, epsilon=1e-6): 13 | """ 14 | :param mean: Mean of the normal distribution 15 | :param std: Std of the normal distribution 16 | :param epsilon: Numerical stability epsilon when computing log-prob. 17 | """ 18 | super().__init__() 19 | self.normal_mean = mean 20 | self.normal_std = std 21 | self.normal = Normal(mean, std) 22 | self.epsilon = epsilon 23 | 24 | def log_prob(self, value, pre_tanh_value=None): 25 | if pre_tanh_value is None: 26 | pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2 27 | return self.normal.log_prob(pre_tanh_value) - torch.log(1 - value * value + self.epsilon) 28 | 29 | def log_probs(self, value, pre_tanh_value): 30 | return self.log_prob(value, pre_tanh_value).sum(-1, keepdim=True) 31 | 32 | def sample(self, sample_shape=torch.Size([])): 33 | z = self.normal.sample(sample_shape) 34 | return torch.tanh(z), z 35 | 36 | def rsample(self, sample_shape=torch.Size([]), return_pretanh_value=False): 37 | z = ( 38 | self.normal_mean + 39 | self.normal_std * 40 | Normal( 41 | torch.zeros_like(self.normal_mean), 42 | torch.ones_like(self.normal_std) 43 | ).sample() 44 | ) 45 | z.requires_grad_() 46 | return torch.tanh(z), z 47 | 48 | def entropy(self): 49 | return self.normal.entropy().sum(-1) 50 | 51 | def mode(self): 52 | return torch.tanh(self.normal_mean), self.normal_mean 53 | 54 | 55 | class FixedLimitedEntNormal(torch.distributions.Normal): 56 | def log_probs(self, actions): 57 | return super().log_prob(actions).sum(-1, keepdim=True) 58 | 59 | def entropy(self): 60 | limit = 2.
61 | lo, hi = (-limit - self.loc) / self.scale / math.sqrt(2), (limit - self.loc) / self.scale / math.sqrt(2) 62 | return (0.5 * (self.scale.log() + math.log(2 * math.pi) / 2) * (hi.erf() - lo.erf()) + 0.5 * 63 | (torch.exp(-hi * hi) * hi - torch.exp(-lo * lo) * lo)).sum(-1) 64 | 65 | def mode(self): 66 | return self.mean 67 | 68 | 69 | class FixedCategorical(torch.distributions.Categorical): 70 | def sample(self, **kwargs): 71 | return super().sample(**kwargs).unsqueeze(-1) 72 | 73 | def log_probs(self, actions): 74 | return ( 75 | super() 76 | .log_prob(actions.squeeze(-1)) 77 | .view(actions.size(0), -1) 78 | .sum(-1) 79 | .unsqueeze(-1) 80 | ) 81 | 82 | def mode(self): 83 | return self.probs.argmax(dim=-1, keepdim=True) 84 | 85 | 86 | class FixedNormal(torch.distributions.Normal): 87 | 88 | def log_probs(self, actions): 89 | return super().log_prob(actions).sum(-1, keepdim=True) 90 | 91 | def entropy(self): 92 | return super().entropy().sum(-1) 93 | 94 | def mode(self): 95 | return self.mean 96 | 97 | 98 | class FixedBernoulli(torch.distributions.Bernoulli): 99 | 100 | def log_probs(self, actions): 101 | return super().log_prob(actions).view(actions.size(0), -1).sum(-1, keepdim=True) 102 | 103 | def entropy(self): 104 | return super().entropy().sum(-1) 105 | 106 | def mode(self): 107 | return torch.gt(self.probs, 0.5).float() 108 | 109 | -------------------------------------------------------------------------------- /slbo/misc/ou_noise.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from slbo.models.actor import Actor 4 | 5 | 6 | class OUNoise(object): 7 | 8 | def __init__(self, action_space, mu=0.0, theta=0.15, sigma=0.3): 9 | self.mu = mu 10 | self.theta = theta 11 | self.sigma = sigma 12 | self.action_space = action_space 13 | self.state = None 14 | self.actor = None 15 | 16 | self.shape = action_space.shape 17 | 18 | self.reset() 19 | 20 | def reset(self): 21 | self.state = torch.ones(self.shape) * self.mu 22 | 23 | def next(self): 24 | delta = self.theta * (self.mu - self.state) + self.sigma * torch.randn_like(self.state) 25 | self.state = self.state + delta 26 | return self.state 27 | 28 | def act(self, states): 29 | result = self.actor.act(states) 30 | return (result[0] + self.next(), *result[1:]) 31 | 32 | def wrap(self, actor: Actor): 33 | self.actor = actor 34 | self.state = self.state.to(next(actor.parameters()).device) 35 | return self 36 | 37 | -------------------------------------------------------------------------------- /slbo/misc/param.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | def get_flat_params_from(model: nn.Module): 7 | params = [] 8 | for param in model.parameters(): 9 | params.append(param.view(-1)) 10 | 11 | flat_params = torch.cat(params) 12 | return flat_params 13 | 14 | 15 | def set_flat_params_to(model: nn.Module, flat_params): 16 | prev_ind = 0 17 | for param in model.parameters(): 18 | flat_size = int(np.prod(list(param.size()))) 19 | param.data.copy_(flat_params[prev_ind:prev_ind + flat_size].view(param.size())) 20 | prev_ind += flat_size 21 | 22 | 23 | def get_flat_grad_from(inputs, grad_grad=False): 24 | grads = [] 25 | for param in inputs: 26 | if grad_grad: 27 | grads.append(param.grad.grad.view(-1)) 28 | else: 29 | if param.grad is None: 30 | grads.append(torch.zeros(param.view(-1).shape)) 31 | else: 32 | grads.append(param.grad.view(-1)) 33 | 34 | flat_grad = 
torch.cat(grads) 35 | return flat_grad 36 | 37 | 38 | def compute_flat_grad(output, inputs, filter_input_ids: set, retain_graph=False, create_graph=False): 39 | filter_input_ids = filter_input_ids.copy() 40 | if create_graph: 41 | retain_graph = True 42 | 43 | inputs = list(inputs) 44 | params = [] 45 | for i, param in enumerate(inputs): 46 | if i not in filter_input_ids: 47 | params.append(param) 48 | 49 | grads = torch.autograd.grad(output, params, retain_graph=retain_graph, create_graph=create_graph) 50 | 51 | j = 0 52 | out_grads = [] 53 | for i, param in enumerate(inputs): 54 | if i in filter_input_ids: 55 | out_grads.append(torch.zeros(param.view(-1).shape, device=param.device, dtype=param.dtype)) 56 | else: 57 | out_grads.append(grads[j].view(-1)) 58 | j += 1 59 | grads = torch.cat(out_grads) 60 | 61 | for param in params: 62 | param.grad = None 63 | return grads -------------------------------------------------------------------------------- /slbo/misc/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | from torch.utils.tensorboard import SummaryWriter 5 | 6 | from slbo.envs.wrapped_envs import make_vec_envs, get_vec_normalize 7 | 8 | 9 | def log_and_write(logger, writer: SummaryWriter, log_infos: List, global_step: int): 10 | for idx, (name, value) in enumerate(log_infos): 11 | if logger is not None: 12 | logger.logkv('{}.'.format(idx) + name.split('/')[-1], value) 13 | if writer is not None and name.find('/') > -1: 14 | writer.add_scalar(name, value, global_step=global_step) 15 | if logger is not None: 16 | logger.dumpkvs() 17 | 18 | 19 | def collect_traj(actor, envs, buffer, total_step): 20 | episode_rewards = [] 21 | episode_lengths = [] 22 | 23 | step = 0 24 | while step < total_step: 25 | states = envs.reset() 26 | dones = False 27 | traj = {'states': [], 'actions': [], 'rewards': [], 'next_states': [], 'masks': []} 28 | while not dones: 29 | with torch.no_grad(): 30 | actions, *_ = actor.act(states, deterministic=False) 31 | 32 | new_states, rewards, dones, infos = envs.step(actions) 33 | mask = torch.tensor([[0.0] if done_ else [1.0] for done_ in dones], dtype=torch.float32) 34 | 35 | traj['states'].append(states) 36 | traj['actions'].append(actions) 37 | traj['next_states'].append(new_states) 38 | traj['rewards'].append(rewards) 39 | traj['masks'].append(mask) 40 | 41 | states = new_states 42 | 43 | for info_ in infos: 44 | if 'episode' in info_.keys(): 45 | episode_rewards.append(info_['episode']['r']) 46 | episode_lengths.append(info_['episode']['l']) 47 | 48 | traj_len = len(traj['actions']) 49 | step += traj_len 50 | buffer.add_traj(traj) 51 | 52 | return episode_rewards, episode_lengths 53 | 54 | 55 | def evaluate(actor, env_name, seed, num_episode, eval_log_dir, 56 | device, max_episode_steps=1000, norm_reward=False, norm_obs=True, obs_rms=None, test=True): 57 | eval_envs = make_vec_envs(env_name, seed + 1, 1, None, eval_log_dir, device, True, 58 | max_episode_steps, norm_reward, norm_obs, test) 59 | 60 | vec_norm = get_vec_normalize(eval_envs) 61 | if vec_norm is not None and norm_obs: 62 | assert obs_rms is not None 63 | vec_norm.training = False 64 | vec_norm.obs_rms = obs_rms 65 | 66 | eval_episode_rewards = [] 67 | eval_episode_lengths = [] 68 | 69 | obs = eval_envs.reset() 70 | 71 | while len(eval_episode_rewards) < num_episode: 72 | with torch.no_grad(): 73 | action, *_ = actor.act(obs, deterministic=True) 74 | 75 | obs, _, done, infos =
eval_envs.step(action) 76 | 77 | for info in infos: 78 | if 'episode' in info.keys(): 79 | eval_episode_rewards.append(info['episode']['r']) 80 | eval_episode_lengths.append(info['episode']['l']) 81 | 82 | eval_envs.close() 83 | 84 | return eval_episode_rewards, eval_episode_lengths 85 | -------------------------------------------------------------------------------- /slbo/models/__init__.py: -------------------------------------------------------------------------------- 1 | from slbo.models.actor import Actor 2 | from slbo.models.actor_critic import ActorCritic 3 | from slbo.models.critic import VCritic, QCritic 4 | from slbo.models.dynamics import Dynamics 5 | from slbo.models.normalizers import Normalizers 6 | -------------------------------------------------------------------------------- /slbo/models/actor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import List, Callable, Optional 4 | 5 | from slbo.models.initializer import normc_init 6 | from slbo.models.utils import MLP, init 7 | from slbo.models.actor_layer import * 8 | 9 | 10 | class Actor(nn.Module): 11 | def __init__(self, state_dim: int, action_space, hidden_dims: List[int], 12 | state_normalizer: Optional[nn.Module], use_limited_entropy=False): 13 | super(Actor, self).__init__() 14 | self.state_dim = state_dim 15 | self.action_dim = action_space 16 | self.hidden_dims = hidden_dims 17 | 18 | self.actor_feature = MLP(state_dim, hidden_dims[-1], hidden_dims[:-1], 19 | activation='Tanh', last_activation='Tanh') 20 | self.state_normalizer = state_normalizer or nn.Identity() 21 | 22 | if action_space.__class__.__name__ == "Discrete": 23 | action_dim = action_space.n 24 | self.actor = CategoricalActorLayer(hidden_dims[-1], action_dim) 25 | elif action_space.__class__.__name__ == "Box": 26 | action_dim = action_space.shape[0] 27 | if use_limited_entropy: 28 | self.actor = LimitedEntGaussianActorLayer(hidden_dims[-1], action_dim, use_state_dependent_std=False) 29 | else: 30 | self.actor = GaussianActorLayer(hidden_dims[-1], action_dim, use_state_dependent_std=False) 31 | elif action_space.__class__.__name__ == "MultiBinary": 32 | action_dim = action_space.shape[0] 33 | self.actor = BernoulliActorLayer(hidden_dims[-1], action_dim) 34 | else: 35 | raise NotImplemented 36 | 37 | init_ = lambda m: init(m, normc_init, lambda x: nn.init.constant_(x, 0)) 38 | self.actor_feature.init(init_, init_) 39 | 40 | def act(self, states, deterministic=False, reparamterize=False): 41 | states = self.state_normalizer(states) 42 | action_features = self.actor_feature(states) 43 | action_dists, action_means, log_stds = self.actor(action_features) 44 | 45 | if deterministic: 46 | actions = action_dists.mode() 47 | else: 48 | if reparamterize: 49 | actions = action_dists.rsample() 50 | else: 51 | actions = action_dists.sample() 52 | 53 | log_probs = action_dists.log_probs(actions) 54 | entropy = action_dists.entropy().mean() 55 | 56 | return actions, log_probs, entropy, action_means, log_stds, log_stds.exp() 57 | 58 | def evaluate_action(self, states, actions): 59 | states = self.state_normalizer(states) 60 | action_feature = self.actor_feature(states) 61 | action_dist, *_ = self.actor(action_feature) 62 | 63 | log_probs = action_dist.log_probs(actions) 64 | entropy = action_dist.entropy().mean() 65 | 66 | return log_probs, entropy 67 | -------------------------------------------------------------------------------- /slbo/models/actor_critic.py: 
-------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | 5 | from slbo.models.actor_layer import * 6 | from slbo.models.utils import MLP, init 7 | 8 | 9 | class ActorCritic(nn.Module): 10 | 11 | def __init__(self, dim_state, action_space, actor_hidden_dims: List[int], critic_hidden_dims: List[int], 12 | normalizer: nn.Module = None): 13 | super(ActorCritic, self).__init__() 14 | 15 | self.actor_feature = MLP(dim_state, actor_hidden_dims[-1], actor_hidden_dims[:-1], 16 | activation='Tanh', last_activation='Tanh') 17 | self.critic = MLP(dim_state, 1, critic_hidden_dims, activation='Tanh', last_activation='Identity') 18 | self.normalizer = normalizer or nn.Identity() 19 | 20 | init_ = lambda m: init(m, lambda x: nn.init.orthogonal_(x, np.sqrt(2)), lambda x: nn.init.constant_(x, 0)) 21 | self.actor_feature.init(init_, init_) 22 | self.critic.init(init_, init_) 23 | 24 | self.train() 25 | 26 | if action_space.__class__.__name__ == "Discrete": 27 | dim_action = action_space.n 28 | self.actor = CategoricalActorLayer(actor_hidden_dims[-1], dim_action) 29 | elif action_space.__class__.__name__ == "Box": 30 | dim_action = action_space.shape[0] 31 | self.actor = GaussianActorLayer(actor_hidden_dims[-1], dim_action, use_state_dependent_std=False) 32 | elif action_space.__class__.__name__ == "MultiBinary": 33 | dim_action = action_space.shape[0] 34 | self.actor = BernoulliActorLayer(actor_hidden_dims[-1], dim_action) 35 | 36 | def act(self, states, deterministic=False, reparamterize=False): 37 | action_feature, value = self.actor_feature(states), self.critic(states) 38 | action_dist, *_ = self.actor(action_feature) 39 | 40 | if deterministic: 41 | action = action_dist.mode() 42 | else: 43 | if reparamterize: 44 | action = action_dist.rsample() 45 | else: 46 | action = action_dist.sample() 47 | 48 | action_log_prob = action_dist.log_probs(action) 49 | dist_entropy = action_dist.entropy().mean() 50 | 51 | return value, action, action_log_prob, dist_entropy 52 | 53 | def criticize(self, states): 54 | values = self.critic(states) 55 | return values 56 | 57 | def evaluate_action(self, state, action): 58 | action_feature, value = self.actor_feature(state), self.critic(state) 59 | action_dist = self.actor(action_feature) 60 | 61 | action_log_probs = action_dist.log_prob(action).sum(-1, keepdim=True) 62 | dist_entropy = action_dist.entropy().mean() 63 | 64 | return value, action_log_probs, dist_entropy 65 | 66 | -------------------------------------------------------------------------------- /slbo/models/actor_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from slbo.misc.distributions import FixedNormal, FixedCategorical, FixedBernoulli, TanhNormal, FixedLimitedEntNormal 5 | from slbo.models.utils import init 6 | 7 | 8 | class CategoricalActorLayer(nn.Module): 9 | def __init__(self, num_inputs, num_outputs): 10 | super(CategoricalActorLayer, self).__init__() 11 | 12 | self.actor = nn.Linear(num_inputs, num_outputs) 13 | init(self.actor, lambda x: nn.init.orthogonal_(x, 0.01), lambda x: nn.init.constant_(x, 0)) 14 | 15 | def forward(self, x): 16 | x = self.actor(x) 17 | return FixedCategorical(logits=x) 18 | 19 | 20 | class GaussianActorLayer(nn.Module): 21 | def __init__(self, num_inputs, num_outputs, use_state_dependent_std): 22 | super(GaussianActorLayer, self).__init__() 23 | 24 | self.actor_mean = nn.Linear(num_inputs, num_outputs) 
25 | init(self.actor_mean, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0)) 26 | self.use_state_dependent_std = use_state_dependent_std 27 | if self.use_state_dependent_std: 28 | self.actor_logstd = nn.Linear(num_inputs, num_outputs) 29 | init(self.actor_logstd, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0)) 30 | 31 | else: 32 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True) 33 | 34 | def forward(self, x): 35 | action_mean = self.actor_mean(x) 36 | 37 | if self.use_state_dependent_std: 38 | logstd = self.actor_logstd(x) 39 | else: 40 | logstd = self.logstd 41 | 42 | return FixedNormal(action_mean, logstd.exp()), action_mean, logstd 43 | 44 | 45 | class LimitedEntGaussianActorLayer(nn.Module): 46 | def __init__(self, num_inputs, num_outputs, use_state_dependent_std): 47 | super(LimitedEntGaussianActorLayer, self).__init__() 48 | 49 | self.actor_mean = nn.Linear(num_inputs, num_outputs) 50 | init(self.actor_mean, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0)) 51 | self.use_state_dependent_std = use_state_dependent_std 52 | if self.use_state_dependent_std: 53 | self.actor_logstd = nn.Linear(num_inputs, num_outputs) 54 | init(self.actor_logstd, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0)) 55 | 56 | else: 57 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True) 58 | 59 | def forward(self, x): 60 | action_mean = self.actor_mean(x) 61 | 62 | if self.use_state_dependent_std: 63 | logstd = self.actor_logstd(x) 64 | else: 65 | logstd = self.logstd 66 | 67 | return FixedLimitedEntNormal(action_mean, logstd.exp()), action_mean, logstd 68 | 69 | 70 | class BernoulliActorLayer(nn.Module): 71 | def __init__(self, num_inputs, num_outputs): 72 | super(BernoulliActorLayer, self).__init__() 73 | 74 | self.actor = nn.Linear(num_inputs, num_outputs) 75 | init(self.actor, nn.init.orthogonal_, lambda x: nn.init. 
constant_(x, 0)) 76 | 77 | def forward(self, x): 78 | x = self.actor(x) 79 | return FixedBernoulli(logits=x) 80 | 81 | 82 | class TanhGaussainActorLayer(nn.Module): 83 | def __init__(self, num_inputs, num_outputs, state_dependent_std, init_w=1e-3): 84 | super(TanhGaussainActorLayer, self).__init__() 85 | 86 | self.actor_mean = nn.Linear(num_inputs, num_outputs) 87 | init(self.actor_mean, lambda x: nn.init.uniform_(x, -init_w, init_w), 88 | lambda x: nn.init.uniform_(x, -init_w, init_w)) 89 | self.state_dependent_std = state_dependent_std 90 | if self.state_dependent_std: 91 | self.actor_logstd = nn.Linear(num_inputs, num_outputs) 92 | init(self.actor_mean, lambda x: nn.init.uniform_(x, -init_w, init_w), 93 | lambda x: nn.init.uniform_(x, -init_w, init_w)) 94 | else: 95 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True) 96 | 97 | def forward(self, x): 98 | action_mean = self.actor_mean(x) 99 | 100 | if self.state_dependent_std: 101 | action_logstd = self.actor_logstd(x) 102 | else: 103 | action_logstd = self.logstd 104 | 105 | action_logstd = torch.clamp(action_logstd, -20, 2) 106 | 107 | return TanhNormal(action_mean, action_logstd.exp()), torch.tanh(action_mean), action_logstd 108 | -------------------------------------------------------------------------------- /slbo/models/critic.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from slbo.models.initializer import normc_init 7 | from slbo.models.utils import MLP, init 8 | 9 | 10 | class QCritic(nn.Module): 11 | def __init__(self, dim_state: int, dim_action: int, hidden_states: List[int]): 12 | super(QCritic, self).__init__() 13 | self.critic = MLP(dim_state + dim_action, hidden_states, 1) 14 | 15 | def forward(self, state, action): 16 | x = torch.cat([state, action], dim=-1) 17 | return self.critic(x) 18 | 19 | 20 | class VCritic(nn.Module): 21 | def __init__(self, dim_state: int, hidden_dims: List[int], state_normalizer=None, activation='Tanh'): 22 | super(VCritic, self).__init__() 23 | self.critic = MLP(dim_state, 1, hidden_dims, activation=activation) 24 | self.normalizer = state_normalizer or nn.Identity() 25 | 26 | init_ = lambda m: init(m, normc_init, lambda x: nn.init.constant_(x, 0)) 27 | self.critic.init(init_, init_) 28 | 29 | def forward(self, state): 30 | state = self.normalizer(state) 31 | return self.critic(state) 32 | -------------------------------------------------------------------------------- /slbo/models/dynamics.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from slbo.models.initializer import truncated_norm_init 7 | from slbo.models.normalizers import Normalizers 8 | from slbo.models.utils import MLP, init 9 | 10 | 11 | class Dynamics(nn.Module): 12 | def __init__(self, state_dim: int, action_dim: int, hidden_dims: List[int], normalizer: Normalizers): 13 | super(Dynamics, self).__init__() 14 | self.dim_state = state_dim 15 | self.dim_action = action_dim 16 | self.normalizer = normalizer 17 | self.diff_dynamics = MLP(state_dim + action_dim, state_dim, hidden_dims, activation='ReLU') 18 | 19 | init_ = lambda m: init(m, truncated_norm_init, lambda x: nn.init.constant_(x, 0)) 20 | self.diff_dynamics.init(init_, init_) 21 | 22 | def forward(self, state, action): 23 | # action clip is the best normalization according to the authors 24 | x = 
torch.cat([self.normalizer.state_normalizer(state), action.clamp(-1., 1.)], dim=-1) 25 | normalized_diff = self.diff_dynamics(x) 26 | next_states = state + self.normalizer.diff_normalizer(normalized_diff, inverse=True) 27 | next_states = self.normalizer.state_normalizer(self.normalizer.state_normalizer(next_states).clamp(-100, 100), 28 | inverse=True) 29 | return next_states 30 | 31 | 32 | -------------------------------------------------------------------------------- /slbo/models/initializer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def normc_init(tensor, std=1.0, **kwargs): 5 | tensor.data.normal_(0, 1) 6 | tensor.data *= std / np.sqrt(tensor.data.pow(2).sum(1, keepdim=True)) 7 | 8 | 9 | def fanin_init(tensor, **kwargs): 10 | size = tensor.size() 11 | if len(size) == 2: 12 | fan_in = size[0] 13 | elif len(size) > 2: 14 | fan_in = np.prod(size[1:]) 15 | else: 16 | raise Exception("Shape must be have dimension at least 2.") 17 | bound = 1. / np.sqrt(fan_in) 18 | return tensor.data.uniform_(-bound, bound) 19 | 20 | 21 | def truncated_norm_init(tensor, mean=0, std=1e-5, **kwargs): 22 | size = tensor.shape 23 | tmp = tensor.new_empty(size + (4,)).normal_() 24 | valid = (tmp < 2) & (tmp > -2) 25 | ind = valid.max(-1, keepdim=True)[1] 26 | tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1)) 27 | tensor.data.mul_(std).add_(mean) 28 | return tensor 29 | 30 | -------------------------------------------------------------------------------- /slbo/models/normalizers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributions.kl as kl 3 | import torch.nn as nn 4 | from typing import List 5 | try: 6 | from slbo.misc import logger 7 | except ImportError: 8 | from stable_baselines import logger 9 | 10 | 11 | class GaussianNormalizer(nn.Module): 12 | def __init__(self, shape: List[int], eps=1e-8, verbose=0): 13 | super().__init__() 14 | 15 | self.shape = shape 16 | self.verbose = verbose 17 | 18 | self.mean = torch.zeros(shape, dtype=torch.float32) 19 | self.std = torch.ones(shape, dtype=torch.float32) 20 | self.eps = eps 21 | self.n = 0 22 | 23 | def forward(self, x: torch.Tensor, inverse=False): 24 | if inverse: 25 | return x * self.std + self.mean 26 | return (x - self.mean) / (torch.clamp(self.std, min=self.eps)) 27 | 28 | def to(self, *args, **kwargs): 29 | self.mean = self.mean.to(*args, **kwargs) 30 | self.std = self.std.to(*args, **kwargs) 31 | 32 | # noinspection DuplicatedCode 33 | # samples in [batch_size, ...] 
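    # The update below folds a batch of samples into the running (mean, std, n) using the
    # standard parallel mean/variance combination (Chan et al.):
    #   new_mean = old_mean + delta * n / (old_n + n)
    #   new_var  = (old_n * old_std**2 + n * batch_var + old_n * n * delta**2 / (old_n + n)) / (old_n + n)
    # where n is the batch size and delta = batch_mean - old_mean. The KL divergence between
    # the new and old Gaussians is computed only for optional debug logging.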
34 | def update(self, samples: torch.Tensor): 35 | old_mean, old_std, old_n = self.mean, self.std, self.n 36 | samples = samples - old_mean 37 | n = samples.shape[0] 38 | delta = samples.mean(dim=0) 39 | new_n = old_n + n 40 | new_mean = old_mean + delta * n / new_n 41 | new_std = torch.sqrt((old_std**2 * old_n + samples.var(dim=0) * n + delta**2 * old_n * n / new_n) / new_n) 42 | kl_old_new = kl.kl_divergence(torch.distributions.Normal(new_mean, torch.clamp(new_std, 1e-20)), 43 | torch.distributions.Normal(old_mean, torch.clamp(old_std, 1e-20))).sum() 44 | self.mean, self.std, self.n = new_mean, new_std, new_n 45 | 46 | if self.verbose > 0: 47 | logger.debug("updating Normalizer<%s>, KL divergence = %.6f", self.name, kl_old_new) 48 | 49 | # noinspection PyMethodOverriding 50 | def state_dict(self, *args, **kwargs): 51 | return {'mean': self.mean, 'std': self.std, 'n': self.n} 52 | 53 | # noinspection PyMethodOverriding 54 | def load_state_dict(self, state_dict): 55 | self.mean = state_dict['mean'] 56 | self.std = state_dict['std'] 57 | self.n = state_dict['n'] 58 | 59 | 60 | class Normalizers(nn.Module): 61 | def __init__(self, dim_action: int, dim_state: int, verbose=0): 62 | super().__init__() 63 | # action_normalizer is not used 64 | self.action_normalizer = GaussianNormalizer([dim_action], verbose=verbose) 65 | self.state_normalizer = GaussianNormalizer([dim_state], verbose=verbose) 66 | self.diff_normalizer = GaussianNormalizer([dim_state], verbose=verbose) 67 | 68 | def forward(self): 69 | raise NotImplemented 70 | 71 | def to(self, *args, **kwargs): 72 | self.action_normalizer.to(*args, **kwargs) 73 | self.state_normalizer.to(*args, **kwargs) 74 | self.diff_normalizer.to(*args, **kwargs) 75 | 76 | # noinspection PyMethodOverriding 77 | def state_dict(self, *args, **kwargs): 78 | return {'action_normalizer': self.action_normalizer.state_dict(), 79 | 'state_normalizer': self.state_normalizer.state_dict(), 80 | 'diff_normalizer': self.diff_normalizer.state_dict()} 81 | 82 | # noinspection PyMethodOverriding, PyTypeChecker 83 | def load_state_dict(self, state_dict): 84 | self.action_normalizer.load_state_dict(state_dict['action_normalizer']) 85 | self.state_normalizer.load_state_dict(state_dict['state_normalizer']) 86 | self.diff_normalizer.load_state_dict(state_dict['diff_normalizer']) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /slbo/models/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class MLP(nn.Module): 7 | def __init__(self, input_dim, output_dim, hidden_dims, activation='Tanh', last_activation='Identity', biases=None): 8 | super(MLP, self).__init__() 9 | sizes_list = hidden_dims.copy() 10 | self.activation = getattr(nn, activation)() 11 | self.last_activation = getattr(nn, last_activation)() 12 | sizes_list.insert(0, input_dim) 13 | biases = [True] * len(sizes_list) if biases is None else biases.copy() 14 | 15 | layers = [] 16 | if 1 < len(sizes_list): 17 | for i in range(len(sizes_list) - 1): 18 | layers.append(nn.Linear(sizes_list[i], sizes_list[i + 1], bias=biases[i])) 19 | self.last_layer = nn.Linear(sizes_list[-1], output_dim) 20 | self.layers = nn.ModuleList(layers) 21 | 22 | def forward(self, x): 23 | for layer in self.layers: 24 | x = layer(x) 25 | x = self.activation(x) 26 | x = self.last_layer(x) 27 | x = self.last_activation(x) 28 | return x 29 | 30 | def init(self, init_fn, last_init_fn): 
31 | for layer in self.layers: 32 | init_fn(layer) 33 | last_init_fn(self.last_layer) 34 | 35 | 36 | def soft_update(source_model: nn.Module, target_model: nn.Module, tau): 37 | for target_param, param in zip(target_model.parameters(), source_model.parameters()): 38 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) 39 | 40 | 41 | def copy_model_params_from_to(source, target): 42 | for target_param, param in zip(target.parameters(), source.parameters()): 43 | target_param.data.copy_(param.data) 44 | 45 | 46 | def init(module, weight_init=None, bias_init=None): 47 | if weight_init: 48 | weight_init(module.weight.data) 49 | if bias_init: 50 | bias_init(module.bias.data) 51 | 52 | 53 | def get_flat_params(model): 54 | params = [] 55 | for param in model.parameters(): 56 | params.append(param.data.view(-1)) 57 | 58 | flat_params = torch.cat(params) 59 | return flat_params 60 | 61 | 62 | def set_flat_params(model, flat_params): 63 | prev_ind = 0 64 | for param in model.parameters(): 65 | flat_size = int(np.prod(list(param.size()))) 66 | param.data.copy_( 67 | flat_params[prev_ind:prev_ind + flat_size].view(param.size())) 68 | prev_ind += flat_size 69 | 70 | 71 | def get_flat_grad(net, grad_grad=False): 72 | grads = [] 73 | for param in net.parameters(): 74 | if grad_grad: 75 | grads.append(param.grad.grad.view(-1)) 76 | else: 77 | grads.append(param.grad.view(-1)) 78 | 79 | flat_grad = torch.cat(grads) 80 | return flat_grad 81 | 82 | -------------------------------------------------------------------------------- /slbo/scripts/run_trpo.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import time 3 | from collections import deque 4 | 5 | import numpy as np 6 | import torch 7 | import torch.backends.cudnn 8 | 9 | from torch.utils.tensorboard import SummaryWriter 10 | import os 11 | 12 | from slbo.algos.mfrl.trpo import TRPO 13 | from slbo.configs.config import Config 14 | from slbo.envs.wrapped_envs import make_vec_envs, get_vec_normalize 15 | from slbo.models import Actor, VCritic 16 | from slbo.misc.utils import evaluate, log_and_write 17 | from slbo.storages.on_policy_buffer import OnPolicyBuffer 18 | try: 19 | from slbo.misc import logger 20 | except ImportError: 21 | from stable_baselines import logger 22 | 23 | 24 | # noinspection DuplicatedCode 25 | def main(): 26 | logger.info('Test script for TRPO') 27 | config, hparam_dict = Config('trpo_config.yaml') 28 | 29 | torch.manual_seed(config.seed) 30 | # noinspection PyUnresolvedReferences 31 | torch.cuda.manual_seed_all(config.seed) 32 | 33 | if config.use_cuda and torch.cuda.is_available() and config.cuda_deterministic: 34 | torch.backends.cudnn.benchmark = False 35 | torch.backends.cudnn.deterministic = True 36 | 37 | import datetime 38 | current_time = datetime.datetime.now().strftime('%b%d_%H%M%S') 39 | log_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'log') 40 | eval_log_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'log_eval') 41 | save_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'save') 42 | os.makedirs(log_dir, exist_ok=True) 43 | os.makedirs(eval_log_dir, exist_ok=True) 44 | os.makedirs(save_dir, exist_ok=True) 45 | writer = SummaryWriter(log_dir=log_dir) 46 | writer.add_hparams(hparam_dict, {}) 47 | 48 | # save current version of code 49 | shutil.copytree(config.proj_dir, save_dir + '/code', ignore=shutil.ignore_patterns('result', 'data', 'ref')) 50 | 51 | 
torch.set_num_threads(1) 52 | device = torch.device('cuda' if config.use_cuda else 'cpu') 53 | 54 | envs = make_vec_envs(config.env.env_name, config.seed, config.env.num_envs, config.env.gamma, log_dir, device, 55 | allow_early_resets=False, norm_reward=True, norm_obs=True, test=True) 56 | 57 | state_dim = envs.observation_space.shape[0] 58 | action_space = envs.action_space 59 | action_dim = action_space.shape[0] 60 | 61 | actor = Actor(state_dim, action_space, hidden_dims=config.trpo.actor_hidden_dims, 62 | state_normalizer=None) 63 | critic = VCritic(state_dim, hidden_dims=config.trpo.critic_hidden_dims, state_normalizer=None) 64 | actor.to(device) 65 | critic.to(device) 66 | 67 | agent = TRPO(actor, critic,) 68 | 69 | on_policy_buffer = \ 70 | OnPolicyBuffer(config.trpo.num_env_steps, config.env.num_envs, envs.observation_space.shape, envs.action_space, 71 | use_gae=config.trpo.use_gae, gamma=config.env.gamma, gae_lambda=config.trpo.gae_lambda, 72 | use_proper_time_limits=config.trpo.use_proper_time_limits, ) 73 | 74 | state = envs.reset() 75 | # noinspection PyUnresolvedReferences 76 | on_policy_buffer.states[0].copy_(state) 77 | on_policy_buffer.to(device) 78 | 79 | episode_rewards = deque(maxlen=10) 80 | episode_lengths = deque(maxlen=10) 81 | 82 | start = time.time() 83 | num_updates = config.trpo.total_env_steps // config.trpo.num_env_steps // config.env.num_envs 84 | 85 | for j in range(num_updates): 86 | 87 | for step in range(config.trpo.num_env_steps): 88 | with torch.no_grad(): 89 | action, action_log_prob, dist_entropy, *_ = actor.act(on_policy_buffer.states[step]) 90 | value = critic(on_policy_buffer.states[step]) 91 | 92 | state, reward, done, info = envs.step(action) 93 | 94 | for info_ in info: 95 | if 'episode' in info_.keys(): 96 | episode_rewards.append(info_['episode']['r']) 97 | episode_lengths.append(info_['episode']['l']) 98 | 99 | mask = torch.tensor([[0.0] if done_ else [1.0] for done_ in done], dtype=torch.float32) 100 | bad_mask = torch.tensor([[0.0] if 'bad_transition' in info_.keys() else [1.0] for info_ in info], 101 | dtype=torch.float32) 102 | on_policy_buffer.insert(states=state, actions=action, action_log_probs=action_log_prob, 103 | values=value, rewards=reward, masks=mask, bad_masks=bad_mask) 104 | 105 | with torch.no_grad(): 106 | next_value = critic(on_policy_buffer.states[-1]) 107 | 108 | on_policy_buffer.compute_returns(next_value) 109 | losses = agent.update(on_policy_buffer) 110 | on_policy_buffer.after_update() 111 | 112 | if j % config.save_interval == 0 or j == num_updates - 1: 113 | save_path = os.path.join(save_dir, config.mf_algo) 114 | try: 115 | os.makedirs(save_path) 116 | except OSError: 117 | pass 118 | 119 | logger.info('Model saved.') 120 | torch.save([actor.state_dict(), critic.state_dict(), 121 | getattr(get_vec_normalize(envs), 'obs_rms', None)], 122 | os.path.join(save_path, config.env.env_name + ".pt")) 123 | 124 | serial_timsteps = (j + 1) * config.trpo.num_env_steps 125 | total_num_steps = config.env.num_envs * serial_timsteps 126 | end = time.time() 127 | 128 | fps = int(total_num_steps / (end - start)) 129 | 130 | if j % config.log_interval == 0 and len(episode_rewards) > 0: 131 | log_info = [('serial_timesteps', serial_timsteps), ('total_timesteps', total_num_steps), 132 | ('ep_rew_mean', np.mean(episode_rewards)), ('ep_len_mean', np.mean(episode_lengths)), 133 | ('fps', fps), ('time_elapsed', end - start)] 134 | 135 | for loss_name, loss_value in losses.items(): 136 | log_info.append((loss_name, loss_value)) 137 | 
log_and_write(logger, writer, log_info, global_step=j) 138 | 139 | if (config.eval_interval is not None and len(episode_rewards) > 0 140 | and j % config.eval_interval == 0): 141 | obs_rms = get_vec_normalize(envs).obs_rms 142 | eval_episode_rewards, eval_episode_lengths = \ 143 | evaluate(actor, config.env.env_name, config.seed, 144 | num_episode=10, eval_log_dir=None, device=device, norm_reward=True, norm_obs=True, 145 | obs_rms=obs_rms, test=True) 146 | 147 | logger.info('Evaluation:') 148 | log_and_write(logger, writer, [('eval_ep_rew_mean', np.mean(eval_episode_rewards)), 149 | ('eval_ep_rew_min', np.min(eval_episode_rewards)), 150 | ('eval_ep_rew_max', np.max(eval_episode_rewards))], global_step=j) 151 | 152 | envs.close() 153 | 154 | 155 | if __name__ == "__main__": 156 | main() 157 | -------------------------------------------------------------------------------- /slbo/storages/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiangsy/slbo_pytorch/7b1283968a82c939725c2705e7315d1f3759ee29/slbo/storages/__init__.py -------------------------------------------------------------------------------- /slbo/storages/off_policy_buffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 3 | import numpy as np 4 | 5 | 6 | class OffPolicyBuffer(object): 7 | def __init__(self, buffer_size, num_envs, state_dim, action_dim): 8 | self.buffer_size = buffer_size 9 | self.num_envs = num_envs 10 | self.states = torch.zeros(buffer_size, num_envs, state_dim) 11 | self.next_states = torch.zeros(buffer_size, num_envs, state_dim) 12 | self.rewards = torch.zeros(buffer_size, num_envs, 1) 13 | self.actions = torch.zeros(buffer_size, num_envs, action_dim) 14 | self.masks = torch.ones(buffer_size, num_envs, 1) 15 | self.bad_masks = torch.ones(buffer_size, num_envs, 1) 16 | 17 | self.buffer_size = buffer_size 18 | self.index = 0 19 | self.size = 0 20 | self.device = torch.device('cpu') 21 | 22 | def to(self, device): 23 | self.states = self.states.to(device) 24 | self.next_states = self.next_states.to(device) 25 | self.rewards = self.rewards.to(device) 26 | self.actions = self.actions.to(device) 27 | self.masks = self.masks.to(device) 28 | self.bad_masks = self.bad_masks.to(device) 29 | 30 | self.device = device 31 | 32 | def add_buffer(self, buffer): 33 | for idx in range(buffer.size): 34 | self.insert(buffer.states[idx], buffer.actions[idx], buffer.rewards[idx], buffer.next_states[idx], 35 | buffer.masks[idx], buffer.bad_masks[idx]) 36 | 37 | def insert(self, states, actions, rewards, next_states, masks, bad_masks): 38 | self.states[self.index, :, :].copy_(states) 39 | self.actions[self.index, :, :].copy_(actions) 40 | self.rewards[self.index, :, :].copy_(rewards) 41 | self.next_states[self.index, :, :].copy_(next_states) 42 | self.masks[self.index, :, :].copy_(masks) 43 | self.bad_masks[self.index, :, :].copy_(bad_masks) 44 | 45 | self.index = (self.index + 1) % self.buffer_size 46 | self.size = min(self.size + 1, self.buffer_size) 47 | 48 | def clear(self): 49 | self.index = 0 50 | self.size = 0 51 | 52 | def get_batch_generator(self, batch_size): 53 | sampler = BatchSampler(SubsetRandomSampler(range(self.size * self.num_envs)), batch_size, drop_last=True) 54 | 55 | for indices in sampler: 56 | states = self.states.view(-1, *self.states.shape[2:])[indices] 57 | actions = self.actions.view(-1, 
self.actions.shape[-1])[indices] 58 | rewards = self.rewards.view(-1, 1)[indices] 59 | next_states = self.next_states.view(-1, *self.states.shape[2:])[indices] 60 | masks = self.masks.view(-1, 1)[indices] 61 | bad_masks = self.bad_masks.view(-1, 1)[indices] 62 | 63 | yield {'states': states, 'actions': actions, 'rewards': rewards, 'next_states': next_states, 64 | 'masks': masks, 'bad_masks': bad_masks} 65 | 66 | def get_sequential_batch_generator(self, batch_size, num_steps): 67 | sampler = BatchSampler(SubsetRandomSampler(range(self.size - num_steps)), 68 | int(batch_size / self.num_envs), drop_last=True) 69 | 70 | for indices in sampler: 71 | indices = np.array(indices) 72 | states = torch.zeros(batch_size, num_steps, *self.states.shape[2:], device=self.device) 73 | next_states = torch.zeros(batch_size, num_steps, *self.next_states.shape[2:], device=self.device) 74 | actions = torch.zeros([batch_size, num_steps, self.actions.shape[-1]], device=self.device) 75 | rewards = torch.zeros([batch_size, num_steps, 1], device=self.device) 76 | masks = torch.zeros([batch_size, num_steps, 1], device=self.device) 77 | bad_masks = torch.zeros([batch_size, num_steps, 1], device=self.device) 78 | for step in range(num_steps): 79 | states[:, step, :].copy_(self.states[indices + step].view(-1, *self.states.shape[2:])) 80 | next_states[:, step, :].copy_(self.next_states[indices + step].view(-1, *self.next_states.shape[2:])) 81 | actions[:, step, :].copy_(self.actions[indices + step].view(-1, self.actions.shape[-1])) 82 | rewards[:, step, :].copy_(self.rewards[indices + step].view(-1, 1)) 83 | masks[:, step, :].copy_(self.masks[indices + step].view(-1, 1)) 84 | bad_masks[:, step, :].copy_(self.bad_masks[indices + step].view(-1, 1)) 85 | 86 | yield {'states': states, 'actions': actions, 'masks': masks, 'next_states':next_states, 87 | 'rewards': rewards, 'bad_masks': bad_masks} 88 | 89 | def load(self, file_name): 90 | raise NotImplemented -------------------------------------------------------------------------------- /slbo/storages/on_policy_buffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler 3 | 4 | 5 | class OnPolicyBuffer(object): 6 | def __init__(self, num_steps, num_envs, obs_shape, action_space, 7 | use_gae=True, gamma=0.99, gae_lambda=0.95, use_proper_time_limits=True): 8 | self.num_steps = num_steps 9 | self.num_envs = num_envs 10 | self.states = torch.zeros(num_steps + 1, num_envs, *obs_shape) 11 | self.rewards = torch.zeros(num_steps, num_envs, 1) 12 | self.values = torch.zeros(num_steps + 1, num_envs, 1) 13 | self.returns = torch.zeros(num_steps + 1, num_envs, 1) 14 | self.action_log_probs = torch.zeros(num_steps, num_envs, 1) 15 | if action_space.__class__.__name__ == 'Discrete': 16 | action_shape = 1 17 | else: 18 | action_shape = action_space.shape[0] 19 | self.actions = torch.zeros(num_steps, num_envs, action_shape) 20 | if action_space.__class__.__name__ == 'Discrete': 21 | self.actions = self.actions.long() 22 | self.masks = torch.ones(num_steps + 1, num_envs, 1) 23 | 24 | self.bad_masks = torch.ones(num_steps + 1, num_envs, 1) 25 | 26 | self.num_steps = num_steps 27 | self.step = 0 28 | 29 | self.use_gae = use_gae 30 | self.gamma = gamma 31 | self.gae_lambda = gae_lambda 32 | self.use_proper_time_limits = use_proper_time_limits 33 | 34 | def to(self, device): 35 | self.states = self.states.to(device) 36 | self.rewards = self.rewards.to(device) 37 | 
self.values = self.values.to(device) 38 | self.returns = self.returns.to(device) 39 | self.action_log_probs = self.action_log_probs.to(device) 40 | self.actions = self.actions.to(device) 41 | self.masks = self.masks.to(device) 42 | self.bad_masks = self.bad_masks.to(device) 43 | 44 | def insert(self, states, actions, action_log_probs, 45 | values, rewards, masks, bad_masks): 46 | self.states[self.step + 1].copy_(states) 47 | self.actions[self.step].copy_(actions) 48 | self.action_log_probs[self.step].copy_(action_log_probs) 49 | self.values[self.step].copy_(values) 50 | self.rewards[self.step].copy_(rewards) 51 | self.masks[self.step + 1].copy_(masks) 52 | self.bad_masks[self.step + 1].copy_(bad_masks) 53 | 54 | self.step = (self.step + 1) % self.num_steps 55 | 56 | def after_update(self): 57 | self.states[0].copy_(self.states[-1]) 58 | self.masks[0].copy_(self.masks[-1]) 59 | self.bad_masks[0].copy_(self.bad_masks[-1]) 60 | 61 | def compute_returns(self, next_value): 62 | if self.use_proper_time_limits: 63 | if self.use_gae: 64 | self.values[-1] = next_value 65 | gae = 0 66 | for step in reversed(range(self.num_steps)): 67 | delta = self.rewards[step] + self.gamma * self.values[step + 1] * self.masks[step + 1] - \ 68 | self.values[step] 69 | gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae 70 | gae = gae * self.bad_masks[step + 1] 71 | self.returns[step] = gae + self.values[step] 72 | else: 73 | self.returns[-1] = next_value 74 | for step in reversed(range(self.num_steps)): 75 | self.returns[step] = (self.returns[step + 1] * 76 | self.gamma * self.masks[step + 1] + self.rewards[step]) * self.bad_masks[step + 1] \ 77 | + (1 - self.bad_masks[step + 1]) * self.values[step] 78 | else: 79 | if self.use_gae: 80 | self.values[-1] = next_value 81 | gae = 0 82 | for step in reversed(range(self.num_steps)): 83 | delta = self.rewards[step] + self.gamma * self.values[step + 1] * self.masks[step + 1] - self.values[step] 84 | gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae 85 | self.returns[step] = gae + self.values[step] 86 | else: 87 | self.returns[-1] = next_value 88 | for step in reversed(range(self.num_steps)): 89 | self.returns[step] = self.returns[step + 1] * self.gamma * self.masks[step + 1] + self.rewards[step] 90 | 91 | def get_batch_generator(self, batch_size=None, advantages=None): 92 | batch_size = self.num_steps * self.num_envs if batch_size is None else batch_size 93 | sampler = BatchSampler(SubsetRandomSampler(range(self.num_steps * self.num_envs)), batch_size, drop_last=True) 94 | 95 | for indices in sampler: 96 | states = self.states[:-1].view(-1, *self.states.size()[2:])[indices] 97 | actions = self.actions.view(-1, self.actions.size(-1))[indices] 98 | values = self.values[:-1].view(-1, 1)[indices] 99 | returns = self.returns[:-1].view(-1, 1)[indices] 100 | masks = self.masks[:-1].view(-1, 1)[indices] 101 | action_log_probs = self.action_log_probs.view(-1, 1)[indices] 102 | if advantages is None: 103 | adv_targets = None 104 | else: 105 | adv_targets = advantages.view(-1, 1)[indices] 106 | 107 | yield {'states': states, 'actions': actions, 'values': values, 'returns': returns, 108 | 'masks': masks, 'action_log_probs': action_log_probs, 'adv_targets': adv_targets} 109 | --------------------------------------------------------------------------------