├── .gitignore ├── mbpo_pytorch ├── __init__.py ├── configs │ ├── __init__.py │ ├── ant.yaml │ ├── hopper.yaml │ ├── walker2d.yaml │ ├── halfcheetah.yaml │ ├── inverted_pendulum.yaml │ ├── inverted_double_pendulum.yaml │ ├── humanoid.yaml │ ├── sac.yaml │ ├── mbpo.yaml │ └── config.py ├── envs │ ├── __init__.py │ ├── benchmarking_envs │ │ ├── __init__.py │ │ ├── gym │ │ │ ├── __init__.py │ │ │ ├── gym_oswimmer.py │ │ │ ├── gym_ohalfcheetah.py │ │ │ ├── gym_oant.py │ │ │ ├── gym_ohumanoid.py │ │ │ ├── gym_ohopper.py │ │ │ ├── gym_owalker.py │ │ │ ├── pets_cartpole.py │ │ │ ├── assets │ │ │ │ ├── cartpole.xml │ │ │ │ ├── fixed_swimmer.xml │ │ │ │ ├── half_cheetah.xml │ │ │ │ └── pusher.xml │ │ │ ├── pets_cheetah.py │ │ │ ├── inverted_pendulum.py │ │ │ ├── gym_cheetahO01.py │ │ │ ├── half_cheetah.py │ │ │ ├── gym_fswimmer.py │ │ │ ├── reacher.py │ │ │ ├── ant.py │ │ │ ├── gym_cheetahO001.py │ │ │ ├── swimmer.py │ │ │ ├── gym_fant.py │ │ │ ├── gym_cheetahA003.py │ │ │ ├── gym_cheetahA01.py │ │ │ ├── pets_pusher.py │ │ │ ├── walker2d.py │ │ │ ├── gym_nostopslimhumanoid.py │ │ │ ├── hopper.py │ │ │ ├── gym_slimhumanoid.py │ │ │ ├── gym_fhopper.py │ │ │ ├── gym_fwalker2d.py │ │ │ ├── gym_humanoid.py │ │ │ ├── pets_reacher.py │ │ │ ├── pendulum.py │ │ │ ├── gym_pendulumO01.py │ │ │ ├── gym_pendulumO001.py │ │ │ ├── gym_cartpoleO001.py │ │ │ ├── cartpole.py │ │ │ ├── gym_cartpoleO01.py │ │ │ └── mountain_car.py │ │ ├── assets │ │ │ ├── point.xml │ │ │ ├── swimmer.xml │ │ │ ├── reacher.xml │ │ │ ├── hopper.xml │ │ │ ├── walker2d.xml │ │ │ ├── ant.xml │ │ │ ├── pusher.xml │ │ │ └── half_cheetah.xml │ │ └── benchmarking_envs.py │ └── wrapped_envs.py ├── misc │ ├── __init__.py │ ├── distributions.py │ └── utils.py ├── thirdparty │ ├── __init__.py │ ├── summary_writer.py │ ├── tile_images.py │ ├── running_mean_std.py │ ├── util.py │ └── dummy_vec_env.py ├── algos │ ├── mbrl │ │ └── __init__.py │ ├── mfrl │ │ ├── __init__.py │ │ └── sac.py │ └── __init__.py ├── storages │ ├── __init__.py │ └── mixture_buffer.py ├── scripts │ ├── run_mbpo.sh │ └── remove_tb_logs.py └── models │ ├── __init__.py │ ├── initializer.py │ ├── critic.py │ ├── q_critic.py │ ├── utils.py │ ├── actor.py │ ├── normalizers.py │ └── actor_layer.py ├── setup.py └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea -------------------------------------------------------------------------------- /mbpo_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mbpo_pytorch/configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mbpo_pytorch/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mbpo_pytorch/thirdparty/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /mbpo_pytorch/algos/mbrl/__init__.py: -------------------------------------------------------------------------------- 1 | from .mbpo import MBPO 2 | -------------------------------------------------------------------------------- /mbpo_pytorch/algos/mfrl/__init__.py: -------------------------------------------------------------------------------- 1 | from .sac import SAC 2 | -------------------------------------------------------------------------------- /mbpo_pytorch/algos/__init__.py: -------------------------------------------------------------------------------- 1 | from .mbrl import MBPO 2 | from .mfrl import SAC 3 | -------------------------------------------------------------------------------- /mbpo_pytorch/storages/__init__.py: -------------------------------------------------------------------------------- 1 | from .universal_offpolicy_buffer import SimpleUniversalBuffer 2 | from .mixture_buffer import MixtureBuffer -------------------------------------------------------------------------------- /mbpo_pytorch/scripts/run_mbpo.sh: -------------------------------------------------------------------------------- 1 | for env in "halfcheetah" "walker2d" "hopper" "ant" 2 | do 3 | python run_mbpo.py --configs "mbpo.yaml" "${env}.yaml" "priv.yaml" 4 | done -------------------------------------------------------------------------------- /mbpo_pytorch/configs/ant.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | env_name: 'OriginalAnt' 3 | max_episode_steps: 1000 4 | 5 | sac: 6 | target_entropy: -4 7 | num_grad_steps: 20 8 | 9 | mbpo: 10 | rollout_schedule: [ 20, 100, 1, 25 ] 11 | -------------------------------------------------------------------------------- /mbpo_pytorch/configs/hopper.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | env_name: 'OriginalHopper' 3 | max_episode_steps: 1000 4 | 5 | sac: 6 | target_entropy: ~ 7 | num_grad_steps: 20 8 | 9 | mbpo: 10 | rollout_schedule: [ 20, 150, 1, 15 ] 11 | -------------------------------------------------------------------------------- /mbpo_pytorch/configs/walker2d.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | env_name: 'OriginalWalker' 3 | max_episode_steps: 1000 4 | 5 | sac: 6 | target_entropy: -3 7 | num_grad_steps: 20 8 | 9 | mbpo: 10 | rollout_schedule: [ 20, 150, 1, 1 ] 11 | -------------------------------------------------------------------------------- /mbpo_pytorch/configs/halfcheetah.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | env_name: 'OriginalHalfCheetah' 3 | max_episode_steps: 1000 4 | 5 | sac: 6 | target_entropy: -3 7 | num_grad_steps: 40 8 | 9 | mbpo: 10 | rollout_schedule: [ 20, 150, 1, 1 ] 11 | -------------------------------------------------------------------------------- /mbpo_pytorch/models/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .actor import Actor 3 | from .critic import QCritic 4 | from .dynamics import RDynamics, EnsembleRDynamics 5 | from 
.normalizers import RunningNormalizer, BatchNormalizer 6 | 7 | setattr(torch, 'identity', lambda x: x) 8 | setattr(torch, 'swish', lambda x: x * torch.sigmoid(x)) 9 | -------------------------------------------------------------------------------- /mbpo_pytorch/configs/inverted_pendulum.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | env_name: 'OriginalInvertedPendulum' 3 | max_episode_steps: 250 4 | 5 | ac: 6 | target_entropy: -0.05 7 | num_grad_steps: 10 8 | 9 | mbpo: 10 | rollout_schedule: [ 1, 15, 1, 1 ] 11 | num_total_epochs: 80 12 | num_warmup_samples: 500 13 | 14 | 15 | -------------------------------------------------------------------------------- /mbpo_pytorch/configs/inverted_double_pendulum.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | env_name: 'OriginalDoubleInvertedPendulum' 3 | max_episode_steps: 250 4 | 5 | ac: 6 | target_entropy: -0.5 7 | num_grad_steps: 20 8 | 9 | mbpo: 10 | rollout_schedule: [ 1, 15, 1, 1 ] 11 | num_total_epochs: 80 12 | num_warmup_samples: 500 13 | 14 | 15 | -------------------------------------------------------------------------------- /mbpo_pytorch/configs/humanoid.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | env_name: 'OriginalHumanoid' 3 | max_episode_steps: 1000 4 | 5 | sac: 6 | target_entropy: -2 7 | num_grad_steps: 40 8 | 9 | mbpo: 10 | rollout_schedule: [ 20, 300, 1, 15 ] 11 | dynamics_hidden_dims: [400, 400, 400, 400] 12 | num_model_retain_epochs: 5 13 | model_update_interval: 1000 14 | 15 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_oswimmer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco.swimmer import SwimmerEnv 3 | 4 | 5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 6 | 7 | 8 | class OriginalSwimmerEnv(SwimmerEnv, BaseModelBasedEnv): 9 | 10 | def mb_step(self, states, actions, next_states): 11 | return None, np.zeros([states.shape[0], 1], dtype=np.bool) 12 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_ohalfcheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco.half_cheetah import HalfCheetahEnv 3 | 4 | 5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 6 | 7 | 8 | class OriginalHalfCheetahEnv(HalfCheetahEnv, BaseModelBasedEnv): 9 | 10 | def mb_step(self, states, actions, next_states): 11 | return None, np.zeros([states.shape[0], 1], dtype=np.bool) 12 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_oant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco.ant import AntEnv 3 | 4 | 5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 6 | 7 | 8 | class OriginalAntEnv(AntEnv, BaseModelBasedEnv): 9 | 10 | def mb_step(self, states, actions, next_states): 11 | heights = next_states[:, 0] 12 | dones = np.logical_or((heights > 1.0), (heights < 0.2)) 13 | return None, dones 14 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_ohumanoid.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco.humanoid import HumanoidEnv 3 | 4 | 5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 6 | 7 | 8 | class OriginalHumanoidEnv(HumanoidEnv, BaseModelBasedEnv): 9 | 10 | def mb_step(self, states, actions, next_states): 11 | heights = next_states[:, 0] 12 | dones = np.logical_or((heights > 2.0), (heights < 1.0)) 13 | return None, dones 14 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_ohopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco.hopper import HopperEnv 3 | 4 | 5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 6 | 7 | 8 | class OriginalHopperEnv(HopperEnv, BaseModelBasedEnv): 9 | 10 | def mb_step(self, states, actions, next_states): 11 | heights, angs = next_states[:, 0], next_states[:, 1] 12 | dones = np.logical_or(heights <= 0.7, abs(angs) >= 0.2) 13 | return None, dones 14 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_owalker.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco.walker2d import Walker2dEnv 3 | 4 | 5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 6 | 7 | 8 | class OriginalWalkerEnv(Walker2dEnv, BaseModelBasedEnv): 9 | 10 | def mb_step(self, states, actions, next_states): 11 | heights, angs = next_states[:, 0], next_states[:, 1] 12 | dones = np.logical_or( 13 | np.logical_or(heights >= 2.0, heights <= 0.8), 14 | np.abs(angs) >= 1.0 15 | ) 16 | return None, dones 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | setup( 5 | name='mbpo_pytorch', 6 | author='Shengyi Jiang', 7 | author_email='shengyi.jiang@outlook.com', 8 | packages=find_packages(), 9 | python_requires='>=3.7', 10 | install_requires=[ 11 | 'torch>=1.4.0', 12 | 'mujoco-py', 13 | 'scipy', 14 | 'numpy', 15 | 'gym>=0.17.0', 16 | 'pyglib', 17 | 'munch', 18 | 'pyyaml', 19 | 'colorama', 20 | 'tensorboard>=1.15.0', 21 | 'pandas' 22 | ], 23 | package_data={ 24 | # include default config files and env data files 25 | "": ["*.yaml", "*.xml"], 26 | } 27 | ) 28 | -------------------------------------------------------------------------------- /mbpo_pytorch/thirdparty/summary_writer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.tensorboard import SummaryWriter 3 | from torch.utils.tensorboard.summary import hparams 4 | 5 | 6 | class FixedSummaryWriter(SummaryWriter): 7 | def add_hparams(self, hparam_dict, metric_dict): 8 | torch._C._log_api_usage_once("tensorboard.logging.add_hparams") 9 | if type(hparam_dict) is not dict or type(metric_dict) is not dict: 10 | raise TypeError('hparam_dict and metric_dict should be dictionary.') 11 | exp, ssi, sei = hparams(hparam_dict, metric_dict) 12 | 13 | self._get_file_writer().add_summary(exp) 14 | self._get_file_writer().add_summary(ssi) 15 | self._get_file_writer().add_summary(sei) 16 | for k, v in metric_dict.items(): 17 | self.add_scalar(k, v)
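A minimal usage sketch for the patched writer above; the log directory, hyperparameter names, and metric tags are illustrative rather than taken from this repo. The point of the override is that the stock SummaryWriter.add_hparams writes the hparams summary into a separate sub-run, whereas FixedSummaryWriter sends the experiment, session-start, and session-end summaries plus the metrics into the current run's event file, so hyperparameters and scalars appear under a single run in TensorBoard.

from mbpo_pytorch.thirdparty.summary_writer import FixedSummaryWriter

# log_dir and tag names below are made up for illustration
writer = FixedSummaryWriter(log_dir='./result/example_run')
# hparams and their associated metrics land in the same event file as ordinary scalars
writer.add_hparams({'actor_lr': 3.0e-4, 'batch_size': 256}, {'eval/average_return': 0.0})
writer.add_scalar('eval/average_return', 123.4, global_step=1)
writer.close()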
-------------------------------------------------------------------------------- /mbpo_pytorch/configs/sac.yaml: -------------------------------------------------------------------------------- 1 | proj_name: 'SAC' 2 | proj_dir: '/home/liuxh/Documents/mbpo' 3 | result_dir: './result' 4 | save_dir: './save' 5 | use_cuda: True 6 | device: 'cpu' # e.g. 'cpu', 'cuda', 'cuda:0' 7 | seed: 3 8 | verbose: 0 9 | model_load_path: ~ 10 | buffer_load_path: ~ 11 | log_interval: 1 12 | save_interval: 10 13 | eval_interval: 1 14 | log_email: False 15 | debug: False 16 | 17 | env: 18 | env_name: 'Walker2d-v2' 19 | num_envs: 1 20 | gamma: 0.99 21 | max_episode_steps: 1000 22 | 23 | sac: 24 | num_total_steps: 1000000 25 | num_warmup_steps: 1000 26 | num_epoch_steps: 1000 27 | buffer_size: 200000 28 | actor_hidden_dims: [256, 256] 29 | critic_hidden_dims: [256, 256] 30 | num_grad_steps: 1000 31 | batch_size: 256 32 | target_entropy: ~ 33 | actor_lr: 3.0e-4 34 | critic_lr: 3.0e-4 35 | soft_target_tau: 5.0e-3 36 | 37 | -------------------------------------------------------------------------------- /mbpo_pytorch/models/initializer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def normc_init(tensor, std=1.0, **kwargs): 5 | tensor.data.normal_(0, 1) 6 | tensor.data *= std / np.sqrt(tensor.data.pow(2).sum(1, keepdim=True)) 7 | 8 | 9 | def fanin_init(tensor, **kwargs): 10 | size = tensor.size() 11 | if len(size) == 2: 12 | fan_in = size[0] 13 | elif len(size) > 2: 14 | fan_in = np.prod(size[1:]) 15 | else: 16 | raise Exception("Shape must have at least 2 dimensions.") 17 | bound = 1. / np.sqrt(fan_in) 18 | return tensor.data.uniform_(-bound, bound) 19 | 20 | 21 | def truncated_norm_init(tensor, mean=0, std=None, **kwargs): 22 | size = tensor.shape 23 | std = std or 1.0/(2*np.sqrt(size[0])) 24 | tmp = tensor.new_empty(size + (4,)).normal_() 25 | valid = (tmp < 2) & (tmp > -2) 26 | ind = valid.max(-1, keepdim=True)[1] 27 | tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1)) 28 | tensor.data.mul_(std).add_(mean) 29 | return tensor 30 | 31 | -------------------------------------------------------------------------------- /mbpo_pytorch/storages/mixture_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from mbpo_pytorch.misc.utils import merge_dicts 5 | from mbpo_pytorch.misc import logger 6 | 7 | 8 | class MixtureBuffer: 9 | def __init__(self, buffers, weights, verbose=0): 10 | self.buffers = buffers 11 | self.weights = np.array(weights) 12 | self.verbose = verbose 13 | 14 | def get_batch_generator_inf(self, batch_size, **kwargs): 15 | batch_sizes = (batch_size * self.weights).astype(np.int) 16 | if self.verbose: 17 | logger.log('[Buffer Mixing] Max error {}'.format(np.max(np.abs(batch_sizes / batch_size - self.weights)))) 18 | rand_index = np.random.randint(len(batch_sizes)) 19 | batch_sizes[rand_index] = batch_size - np.delete(batch_sizes, rand_index).sum() 20 | inf_gens = [buffer.get_batch_generator_inf(int(batch_size_), **kwargs) 21 | for buffer, batch_size_ in zip(self.buffers, batch_sizes)] 22 | while True: 23 | buffer_samples = list(map(lambda gen: next(gen), inf_gens)) 24 | yield merge_dicts(buffer_samples, lambda x: torch.cat(x, dim=0)) 25 | -------------------------------------------------------------------------------- /mbpo_pytorch/thirdparty/tile_images.py: -------------------------------------------------------------------------------- 1 |
import numpy as np 2 | 3 | 4 | def tile_images(img_nhwc): 5 | """ 6 | Tile N images into one big PxQ image 7 | (P,Q) are chosen to be as close as possible, and if N 8 | is square, then P=Q. 9 | 10 | :param img_nhwc: (list) list or array of images, ndim=4 once turned into array. img nhwc 11 | n = batch index, h = height, w = width, c = channel 12 | :return: (numpy float) img_HWc, ndim=3 13 | """ 14 | img_nhwc = np.asarray(img_nhwc) 15 | n_images, height, width, n_channels = img_nhwc.shape 16 | # new_height was named H before 17 | new_height = int(np.ceil(np.sqrt(n_images))) 18 | # new_width was named W before 19 | new_width = int(np.ceil(float(n_images) / new_height)) 20 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0] * 0 for _ in range(n_images, new_height * new_width)]) 21 | # img_HWhwc 22 | out_image = img_nhwc.reshape(new_height, new_width, height, width, n_channels) 23 | # img_HhWwc 24 | out_image = out_image.transpose(0, 2, 1, 3, 4) 25 | # img_Hh_Ww_c 26 | out_image = out_image.reshape(new_height * height, new_width * width, n_channels) 27 | return out_image 28 | 29 | -------------------------------------------------------------------------------- /mbpo_pytorch/configs/mbpo.yaml: -------------------------------------------------------------------------------- 1 | proj_name: 'MBPO' 2 | mf_algo: 'sac' 3 | proj_dir: '~' 4 | result_dir: './result' 5 | save_dir: './save' 6 | device: 'cuda:0' # e.g. 'cpu', 'cuda', 'cuda:0' 7 | seed: 3 8 | verbose: 0 9 | model_load_path: ~ 10 | buffer_load_path: ~ 11 | save_interval: 2 12 | eval_interval: 1 13 | log_interval: 250 14 | log_email: False 15 | debug: False 16 | 17 | env: 18 | num_real_envs: 1 19 | gamma: 0.99 20 | 21 | sac: 22 | actor_hidden_dims: [256, 256] 23 | critic_hidden_dims: [256, 256] 24 | num_grad_steps: 20 25 | batch_size: 256 26 | target_entropy: ~ 27 | actor_lr: 3.0e-4 28 | critic_lr: 3.0e-4 29 | soft_target_tau: 5.0e-3 30 | 31 | mbpo: 32 | num_total_epochs: 1000 33 | dynamics_hidden_dims: [200, 200, 200, 200] 34 | l2_loss_coefs: [0.000025, 0.00005, 0.000075, 0.000075, 0.0001] 35 | lr: 1.0e-3 36 | dynamics_batch_size: 256 37 | num_dynamics_networks: 7 38 | num_elite_dynamics_networks: 5 39 | real_buffer_size: 1000000 40 | rollout_batch_size: 100000 41 | num_model_retain_epochs: 1 42 | model_update_interval: 250 43 | rollout_schedule: [20, 150, 1, 15] 44 | max_num_epochs: ~ 45 | real_sample_ratio: 0. 46 | num_warmup_samples: 5000 47 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # MBPO PyTorch 2 | A PyTorch reimplementation of MBPO (When to Trust Your Model: Model-Based Policy Optimization) 3 | 4 | # Deprecation Warning 5 | The owner of this repo has graduated and this repo is no longer maintained. Please refer to this new [MBPO](https://github.com/x35f/model_based_rl) PyTorch re-implementation, which is a submodule of the [Unstable Baselines](https://github.com/x35f/unstable_Baselines) project maintained by researchers from the same [lab](http://www.lamda.nju.edu.cn/MainPage.ashx). This new MBPO re-implementation strictly follows the original TF implementation and has been tested on several MuJoCo tasks. 6 | 7 | # Dependencies 8 | 9 | Dependencies are listed in `setup.py` (see `install_requires`) and are installed automatically by `pip install -e .`. 10 | 11 | # Usage 12 | 13 | pip install -e .
14 | 15 | # default hyperparams in ./configs/mbpo.yaml 16 | # remember to CHANGE proj_dir to your actual directory 17 | python ./mbpo_pytorch/scripts/run_mbpo.py 18 | 19 | # you can also override hyperparams by passing args, e.g. 20 | python ./mbpo_pytorch/scripts/run_mbpo.py --set seed=0 verbose=1 device="'cuda:0'" env.env_name='FixedHopper' 21 | 22 | 23 | # Credits 24 | 1. [vitchyr/rlkit](https://github.com/vitchyr/rlkit) 25 | 2. [JannerM/mbpo](https://github.com/JannerM/mbpo) 26 | 3. [WilsonWangTHU/mbbl](https://github.com/WilsonWangTHU/mbbl) -------------------------------------------------------------------------------- /mbpo_pytorch/scripts/remove_tb_logs.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import shutil 3 | from tensorboard.backend.event_processing.event_file_inspector import get_inspection_units, get_dict_to_print 4 | 5 | 6 | parser = ArgumentParser('delete small runs') 7 | parser.add_argument('--logdir', type=str, default='/home/liuxh/Documents/mbpo_torch/result') 8 | parser.add_argument('--min_run_len', type=int, default=100) 9 | parser.add_argument('--list', action='store_true') 10 | args = parser.parse_args() 11 | 12 | run_len = {} 13 | inspect_units = get_inspection_units(logdir=args.logdir) 14 | 15 | 16 | for run in inspect_units: 17 | path = run[0] 18 | max_length = 0 19 | for key, value in get_dict_to_print(run.field_to_obs).items(): 20 | if value is not None: 21 | length = value['max_step'] 22 | if max_length < length: 23 | max_length = length 24 | run_len[path] = max_length 25 | 26 | for run, length in run_len.items(): 27 | if length < args.min_run_len: 28 | if args.list: 29 | print(f'{run} is {length} steps long and so will be deleted') 30 | else: 31 | try: 32 | print(f'{run} is {length} and was deleted') 33 | shutil.rmtree(run) 34 | except OSError: 35 | print(f"OS didn't let us delete {run}") 36 | else: 37 | print(f'{run} is {length} and is good') 38 | -------------------------------------------------------------------------------- /mbpo_pytorch/thirdparty/running_mean_std.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningMeanStd(object): 5 | def __init__(self, epsilon=1e-4, shape=()): 6 | """ 7 | calculates the running mean and std of a data stream 8 | https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 9 | 10 | :param epsilon: (float) helps with arithmetic issues 11 | :param shape: (tuple) the shape of the data stream's output 12 | """ 13 | self.mean = np.zeros(shape, 'float64') 14 | self.var = np.ones(shape, 'float64') 15 | self.count = epsilon 16 | 17 | def update(self, arr): 18 | batch_mean = np.mean(arr, axis=0) 19 | batch_var = np.var(arr, axis=0) 20 | batch_count = arr.shape[0] 21 | self.update_from_moments(batch_mean, batch_var, batch_count) 22 | 23 | def update_from_moments(self, batch_mean, batch_var, batch_count): 24 | delta = batch_mean - self.mean 25 | tot_count = self.count + batch_count 26 | 27 | new_mean = self.mean + delta * batch_count / tot_count 28 | m_a = self.var * self.count 29 | m_b = batch_var * batch_count 30 | m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) 31 | new_var = m_2 / (self.count + batch_count) 32 | 33 | new_count = batch_count + self.count 34 | 35 | self.mean = new_mean 36 | self.var = new_var 37 | self.count = new_count 38 |
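A short sketch of how RunningMeanStd might be exercised, for example to normalize observations; the dimensionality and the synthetic data are illustrative, and the import assumes the package has been installed via pip install -e . :

import numpy as np

from mbpo_pytorch.thirdparty.running_mean_std import RunningMeanStd

obs_rms = RunningMeanStd(shape=(17,))  # e.g. a 17-dimensional observation space
for _ in range(100):
    batch = 2.0 * np.random.randn(256, 17) + 1.0  # synthetic batch of observations
    obs_rms.update(batch)  # fold the batch moments into the running estimate

# normalize fresh observations with the tracked statistics
normalized = (batch - obs_rms.mean) / np.sqrt(obs_rms.var + 1e-8)
print(normalized.mean(axis=0), normalized.std(axis=0))  # roughly 0 and 1 per dimension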
-------------------------------------------------------------------------------- /mbpo_pytorch/models/critic.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import List 3 | 4 | from gym.spaces import Box, MultiBinary, Discrete 5 | import torch 6 | import torch.nn as nn 7 | 8 | from .initializer import fanin_init 9 | from .utils import MLP, init 10 | 11 | 12 | class QCritic(nn.Module, ABC): 13 | def __init__(self, state_dim, action_space, hidden_dims: List[int], init_w=3e-3, init_b=0.1, 14 | use_multihead_output=False, **kwargs): 15 | super(QCritic, self).__init__() 16 | 17 | assert not use_multihead_output or action_space.__class__.__name__ == 'Discrete' 18 | 19 | if isinstance(action_space, Box) or isinstance(action_space, MultiBinary): 20 | action_dim = action_space.shape[0] 21 | else: 22 | assert isinstance(action_space, Discrete) 23 | action_dim = action_space.n 24 | 25 | mlp_kwargs = kwargs.copy() 26 | mlp_kwargs['activation'] = kwargs.get('activation', 'ReLU') 27 | mlp_kwargs['last_activation'] = kwargs.get('last_activation', 'Identity') 28 | 29 | self.critic = MLP(state_dim + action_dim, 1, hidden_dims, **kwargs) 30 | 31 | def init_(m): init(m, fanin_init, lambda x: nn.init.constant_(x, init_b)) 32 | def init_last_(m): init(m, lambda x: nn.init.uniform_(x, -init_w, init_w), 33 | lambda x: nn.init.uniform_(x, -init_w, init_w)) 34 | self.critic.init(init_, init_last_) 35 | 36 | def forward(self, states, actions): 37 | return self.critic(torch.cat([states, actions], dim=-1)) 38 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/pets_cartpole.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class CartpoleEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | PENDULUM_LENGTH = 0.6 12 | 13 | def __init__(self): 14 | utils.EzPickle.__init__(self) 15 | dir_path = os.path.dirname(os.path.realpath(__file__)) 16 | mujoco_env.MujocoEnv.__init__(self, '%s/assets/cartpole.xml' % dir_path, 2) 17 | 18 | def step(self, a): 19 | self.do_simulation(a, self.frame_skip) 20 | ob = self._get_obs() 21 | 22 | cost_lscale = CartpoleEnv.PENDULUM_LENGTH 23 | reward = np.exp( 24 | -np.sum(np.square(self._get_ee_pos(ob) - np.array([0.0, CartpoleEnv.PENDULUM_LENGTH]))) / (cost_lscale ** 2) 25 | ) 26 | reward -= 0.01 * np.sum(np.square(a)) 27 | 28 | done = False 29 | return ob, reward, done, {} 30 | 31 | def reset_model(self): 32 | qpos = self.init_qpos + np.random.normal(0, 0.1, np.shape(self.init_qpos)) 33 | qvel = self.init_qvel + np.random.normal(0, 0.1, np.shape(self.init_qvel)) 34 | self.set_state(qpos, qvel) 35 | return self._get_obs() 36 | 37 | def _get_obs(self): 38 | return np.concatenate([self.sim.data.qpos, self.sim.data.qvel]).ravel() 39 | 40 | @staticmethod 41 | def _get_ee_pos(x): 42 | x0, theta = x[0], x[1] 43 | return np.array([ 44 | x0 - CartpoleEnv.PENDULUM_LENGTH * np.sin(theta), 45 | -CartpoleEnv.PENDULUM_LENGTH * np.cos(theta) 46 | ]) 47 | 48 | def viewer_setup(self): 49 | v = self.viewer 50 | v.cam.trackbodyid = 0 51 | v.cam.distance = v.model.stat.extent 52 | -------------------------------------------------------------------------------- /mbpo_pytorch/models/q_critic.py: 
-------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from .utils import MLP, init 7 | from .initializer import fanin_init 8 | 9 | 10 | class QCritic(nn.Module, ABC): 11 | def __init__(self, state_dim, action_space, hidden_dims, activation='relu', last_activation='Identity', 12 | init_w=3e-3, init_b=0.1, use_multihead_output=False): 13 | super(QCritic, self).__init__() 14 | 15 | assert not use_multihead_output or action_space.__class__.__name__ == 'Discrete' 16 | 17 | if action_space.__class__.__name__ == 'Discrete': 18 | action_dim = action_space.n 19 | else: 20 | assert action_space.__class__.__name__ == 'Box' 21 | action_dim = action_space.shape[0] 22 | 23 | if use_multihead_output: 24 | action_dim = action_space.n 25 | self.critic = MLP(state_dim, action_dim, hidden_dims, 26 | activation=activation, last_activation=last_activation) 27 | self.forward = self._get_q_value_discrete 28 | else: 29 | self.critic = MLP(state_dim + action_dim, 1, hidden_dims, 30 | activation=activation, last_activation=last_activation) 31 | self.forward = self._get_q_value_continuous 32 | 33 | def init_(m): init(m, fanin_init, lambda x: nn.init.constant_(x, init_b)) 34 | def init_last_(m): init(m, lambda x: nn.init.uniform_(x, -init_w, init_w), 35 | lambda x: nn.init.uniform_(x, -init_w, init_w)) 36 | self.critic.init(init_, init_last_) 37 | 38 | def _get_q_value_continuous(self, state, action): 39 | return self.critic(torch.cat([state, action], dim=-1)) 40 | 41 | def _get_q_value_discrete(self, state, action): 42 | return self.critic_feature(state)[action] 43 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/assets/cartpole.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 35 | 36 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/assets/point.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 32 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/pets_cheetah.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self): 13 | self.prev_qpos = None 14 | dir_path = os.path.dirname(os.path.realpath(__file__)) 15 | mujoco_env.MujocoEnv.__init__(self, '%s/assets/half_cheetah.xml' % dir_path, 5) 16 | utils.EzPickle.__init__(self) 17 | 18 | def step(self, action): 19 | self.prev_qpos = np.copy(self.sim.data.qpos.flat) 20 | self.do_simulation(action, self.frame_skip) 21 | ob = self._get_obs() 22 | 23 | reward_ctrl = -0.1 * np.square(action).sum() 24 | reward_run = ob[0] - 0.0 * np.square(ob[2]) 25 | reward = reward_run + reward_ctrl 26 | 27 | done = False 28 | return ob, reward, done, {} 29 | 30 | def _get_obs(self): 31 | return np.concatenate([ 32 | (self.sim.data.qpos.flat[:1] - self.prev_qpos[:1]) / self.dt, 33 | self.sim.data.qpos.flat[1:], 34 | self.sim.data.qvel.flat, 35 | ]) 36 | 37 | def reset_model(self): 38 | qpos = 
self.init_qpos + np.random.normal(loc=0, scale=0.001, size=self.model.nq) 39 | qvel = self.init_qvel + np.random.normal(loc=0, scale=0.001, size=self.model.nv) 40 | self.set_state(qpos, qvel) 41 | self.prev_qpos = np.copy(self.sim.data.qpos.flat) 42 | return self._get_obs() 43 | 44 | def cost_np_vec(self, obs, acts, next_obs): 45 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 46 | reward_run = obs[:, 0] 47 | reward = reward_run + reward_ctrl 48 | return -reward 49 | 50 | def viewer_setup(self): 51 | self.viewer.cam.distance = self.model.stat.extent * 0.25 52 | self.viewer.cam.elevation = -55 53 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/inverted_pendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | 6 | class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | 8 | def __init__(self): 9 | utils.EzPickle.__init__(self) 10 | mujoco_env.MujocoEnv.__init__(self, 'inverted_pendulum.xml', 2) 11 | 12 | def step(self, a): 13 | # reward = 1.0 14 | reward = self._get_reward() 15 | self.do_simulation(a, self.frame_skip) 16 | ob = self._get_obs() 17 | # notdone = np.isfinite(ob).all() and (np.abs(ob[1]) <= .2) 18 | # done = not notdone 19 | done = False 20 | return ob, reward, done, {} 21 | 22 | def reset_model(self): 23 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-0.01, high=0.01) 24 | qvel = self.init_qvel + self.np_random.uniform(size=self.model.nv, low=-0.01, high=0.01) 25 | self.set_state(qpos, qvel) 26 | return self._get_obs() 27 | 28 | def _get_reward(self): 29 | old_ob = self._get_obs() 30 | reward = -((old_ob[1]) ** 2) 31 | return reward 32 | 33 | def _get_obs(self): 34 | return np.concatenate([self.sim.data.qpos, self.sim.data.qvel]).ravel() 35 | 36 | def viewer_setup(self): 37 | v = self.viewer 38 | v.cam.trackbodyid = 0 39 | v.cam.distance = v.model.stat.extent 40 | 41 | def mb_step(self, states, actions, next_states): 42 | # returns rewards and dones 43 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 44 | if getattr(self, 'action_space', None): 45 | actions = np.clip(actions, self.action_space.low, 46 | self.action_space.high) 47 | rewards = - self.cost_np_vec(states, actions, next_states) 48 | return rewards, np.zeros_like(rewards, dtype=np.bool) 49 | 50 | def cost_np_vec(self, obs, acts, next_obs): 51 | return ((obs[:, 1]) ** 2) 52 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/assets/swimmer.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 39 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/assets/reacher.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/assets/fixed_swimmer.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 44 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_cheetahO01.py: -------------------------------------------------------------------------------- 1 | 
import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self, frame_skip=5): 13 | self.prev_qpos = None 14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | mujoco_env.MujocoEnv.__init__( 16 | self, '%s/assets/half_cheetah.xml' % dir_path, frame_skip=frame_skip 17 | ) 18 | utils.EzPickle.__init__(self) 19 | 20 | def step(self, action): 21 | start_ob = self._get_obs() 22 | reward_run = start_ob[8] 23 | 24 | self.do_simulation(action, self.frame_skip) 25 | ob = self._get_obs() 26 | if getattr(self, 'action_space', None): 27 | action = np.clip(action, self.action_space.low, 28 | self.action_space.high) 29 | reward_ctrl = -0.1 * np.square(action).sum() 30 | 31 | reward = reward_run + reward_ctrl 32 | done = False 33 | ob += np.random.uniform(low=-0.1, high=0.1, size=ob.shape) 34 | return ob, reward, done, {} 35 | 36 | def _get_obs(self): 37 | return np.concatenate([ 38 | self.sim.data.qpos.flat[1:], 39 | self.sim.data.qvel.flat, 40 | ]) 41 | 42 | def reset_model(self): 43 | qpos = self.init_qpos + \ 44 | self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 45 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 46 | self.set_state(qpos, qvel) 47 | return self._get_obs() 48 | 49 | def viewer_setup(self): 50 | self.viewer.cam.distance = self.model.stat.extent * 0.5 51 | 52 | def cost_np_vec(self, obs, acts, next_obs): 53 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 54 | reward_run = obs[:, 8] 55 | reward = reward_run + reward_ctrl 56 | return -reward 57 | 58 | def mb_step(self, states, actions, next_states): 59 | # returns rewards and dones 60 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 61 | if getattr(self, 'action_space', None): 62 | actions = np.clip(actions, self.action_space.low, 63 | self.action_space.high) 64 | rewards = - self.cost_np_vec(states, actions, next_states) 65 | return rewards, np.zeros_like(rewards, dtype=np.bool) 66 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/half_cheetah.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self, frame_skip=5): 13 | self.prev_qpos = None 14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | mujoco_env.MujocoEnv.__init__( 16 | self, '%s/assets/half_cheetah.xml' % dir_path, frame_skip=frame_skip 17 | ) 18 | utils.EzPickle.__init__(self) 19 | 20 | def step(self, action: np.ndarray): 21 | start_ob = self._get_obs() 22 | reward_run = start_ob[8] 23 | 24 | self.do_simulation(action, self.frame_skip) 25 | ob = self._get_obs() 26 | if getattr(self, 'action_space', None): 27 | action = np.clip(action, self.action_space.low, 28 | self.action_space.high) 29 | reward_ctrl = -0.1 * np.square(action).sum() 30 | 31 | reward = reward_run + reward_ctrl 32 | done = False 33 | return ob, reward, done, {} 34 | 35 | def _get_obs(self): 36 | return np.concatenate([ 37 | 
self.sim.data.qpos.flat[1:], 38 | self.sim.data.qvel.flat, 39 | ]) 40 | 41 | def mb_step(self, states, actions, next_states): 42 | # returns rewards and dones 43 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 44 | if getattr(self, 'action_space', None): 45 | actions = np.clip(actions, self.action_space.low, 46 | self.action_space.high) 47 | rewards = - self.cost_np_vec(states, actions, next_states) 48 | return rewards, np.zeros_like(rewards, dtype=np.bool) 49 | 50 | def reset_model(self): 51 | qpos = self.init_qpos + \ 52 | self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 53 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 54 | self.set_state(qpos, qvel) 55 | return self._get_obs() 56 | 57 | def viewer_setup(self): 58 | self.viewer.cam.distance = self.model.stat.extent * 0.5 59 | 60 | def cost_np_vec(self, obs, acts, next_obs): 61 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 62 | reward_run = obs[:, 8] 63 | reward = reward_run + reward_ctrl 64 | return -reward 65 | 66 | def cost_tf_vec(self, obs, acts, next_obs): 67 | raise NotImplementedError 68 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_fswimmer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class fixedSwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self): 13 | dir_path = os.path.dirname(os.path.realpath(__file__)) 14 | mujoco_env.MujocoEnv.__init__(self, '%s/assets/fixed_swimmer.xml' % dir_path, 4) 15 | utils.EzPickle.__init__(self) 16 | 17 | def step(self, a): 18 | ctrl_cost_coeff = 0.0001 19 | 20 | """ 21 | xposbefore = self.sim.data.qpos[0, 0] 22 | self.do_simulation(a, self.frame_skip) 23 | xposafter = self.sim.data.qpos[0, 0] 24 | """ 25 | 26 | self.xposbefore = self.sim.data.site_xpos[0][0] / self.dt 27 | self.do_simulation(a, self.frame_skip) 28 | self.xposafter = self.sim.data.site_xpos[0][0] / self.dt 29 | self.pos_diff = self.xposafter - self.xposbefore 30 | 31 | reward_fwd = self.xposafter - self.xposbefore 32 | reward_ctrl = - ctrl_cost_coeff * np.square(a).sum() 33 | reward = reward_fwd + reward_ctrl 34 | ob = self._get_obs() 35 | return ob, reward, False, dict(reward_fwd=reward_fwd, reward_ctrl=reward_ctrl) 36 | 37 | def _get_obs(self): 38 | qpos = self.sim.data.qpos 39 | qvel = self.sim.data.qvel 40 | return np.concatenate([qpos.flat[2:], qvel.flat, self.pos_diff.flat]) 41 | 42 | def mb_step(self, states, actions, next_states): 43 | # returns rewards and dones 44 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 45 | if getattr(self, 'action_space', None): 46 | actions = np.clip(actions, self.action_space.low, 47 | self.action_space.high) 48 | rewards = - self.cost_np_vec(states, actions, next_states) 49 | return rewards, np.zeros_like(rewards, dtype=np.bool) 50 | 51 | def reset_model(self): 52 | self.set_state( 53 | self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq), 54 | self.init_qvel + self.np_random.uniform(low=-.1, high=.1, size=self.model.nv) 55 | ) 56 | return self._get_obs() 57 | 58 | def cost_np_vec(self, obs, acts, next_obs): 59 | reward_ctrl = -0.0001 * np.sum(np.square(acts), axis=1) 60 | 
reward_run = obs[:, -1] 61 | reward = reward_run + reward_ctrl 62 | return -reward 63 | 64 | def cost_tf_vec(self, obs, acts, next_obs): 65 | raise NotImplementedError 66 | 67 | 68 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/reacher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 6 | 7 | 8 | class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 9 | 10 | def __init__(self): 11 | utils.EzPickle.__init__(self) 12 | mujoco_env.MujocoEnv.__init__(self, 'reacher.xml', 2) 13 | 14 | def step(self, a): 15 | vec = self.get_body_com("fingertip") - self.get_body_com("target") 16 | 17 | if getattr(self, 'action_space', None): 18 | a = np.clip(a, self.action_space.low, 19 | self.action_space.high) 20 | reward_dist = - np.linalg.norm(vec) 21 | reward_ctrl = - np.square(a).sum() 22 | reward = reward_dist + reward_ctrl 23 | self.do_simulation(a, self.frame_skip) 24 | ob = self._get_obs() 25 | done = False 26 | return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl) 27 | 28 | def viewer_setup(self): 29 | self.viewer.cam.trackbodyid = 0 30 | 31 | def mb_step(self, states, actions, next_states): 32 | # returns rewards and dones 33 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 34 | if getattr(self, 'action_space', None): 35 | actions = np.clip(actions, self.action_space.low, 36 | self.action_space.high) 37 | rewards = - self.cost_np_vec(states, actions, next_states) 38 | return rewards, np.zeros_like(rewards, dtype=np.bool) 39 | 40 | def reset_model(self): 41 | qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos 42 | while True: 43 | self.goal = self.np_random.uniform(low=-.2, high=.2, size=2) 44 | if np.linalg.norm(self.goal) < 2: 45 | break 46 | qpos[-2:] = self.goal 47 | qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 48 | qvel[-2:] = 0 49 | self.set_state(qpos, qvel) 50 | return self._get_obs() 51 | 52 | def _get_obs(self): 53 | theta = self.sim.data.qpos.flat[:2] 54 | return np.concatenate([ 55 | np.cos(theta), 56 | np.sin(theta), 57 | self.sim.data.qpos.flat[2:], 58 | self.sim.data.qvel.flat[:2], 59 | self.get_body_com("fingertip") - self.get_body_com("target") 60 | ]) 61 | 62 | def cost_np_vec(self, obs, acts, next_obs): 63 | dist_vec = obs[:, -3:] 64 | reward_dist = - np.linalg.norm(dist_vec, axis=1) 65 | reward_ctrl = - np.sum(np.square(acts), axis=1) 66 | reward = reward_dist + reward_ctrl 67 | return -reward 68 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/ant.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self, frame_skip=5): 13 | self.prev_qpos = None 14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | mujoco_env.MujocoEnv.__init__( 16 | self, '%s/assets/ant.xml' % dir_path, frame_skip=frame_skip 17 | ) 18 | 
utils.EzPickle.__init__(self) 19 | 20 | def step(self, action: np.ndarray): 21 | old_ob = self._get_obs() 22 | self.do_simulation(action, self.frame_skip) 23 | 24 | if getattr(self, 'action_space', None): 25 | action = np.clip(action, self.action_space.low, self.action_space.high) 26 | ob = self._get_obs() 27 | 28 | reward_ctrl = -0.1 * np.square(action).sum() 29 | reward_run = old_ob[13] 30 | reward_height = -3.0 * np.square(old_ob[0] - 0.57) 31 | reward = reward_run + reward_ctrl + reward_height + 1.0 32 | done = False 33 | return ob, reward, done, {} 34 | 35 | def _get_obs(self): 36 | return np.concatenate([ 37 | # (self.sim.data.qpos.flat[:1] - self.prev_qpos[:1]) / self.dt, 38 | # self.get_body_comvel("torso")[:1], 39 | self.sim.data.qpos.flat[2:], 40 | self.sim.data.qvel.flat, 41 | ]) 42 | 43 | def mb_step(self, states, actions, next_states): 44 | # returns rewards and dones 45 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 46 | if getattr(self, 'action_space', None): 47 | actions = np.clip(actions, self.action_space.low, 48 | self.action_space.high) 49 | rewards = - self.cost_np_vec(states, actions, next_states) 50 | return rewards, np.zeros_like(rewards, dtype=np.bool) 51 | 52 | def reset_model(self): 53 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 54 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 55 | self.set_state(qpos, qvel) 56 | # self.prev_qpos = np.copy(self.sim.data.qpos.flat) 57 | return self._get_obs() 58 | 59 | def viewer_setup(self): 60 | self.viewer.cam.distance = self.model.stat.extent * 0.5 61 | 62 | def cost_np_vec(self, obs, acts, next_obs): 63 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 64 | reward_run = obs[:, 13] 65 | reward_height = -3.0 * np.square(obs[:, 0] - 0.57) 66 | reward = reward_run + reward_ctrl + reward_height + 1.0 67 | return -reward 68 | 69 | -------------------------------------------------------------------------------- /mbpo_pytorch/models/utils.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class MLP(nn.Module, ABC): 9 | def __init__(self, input_dim, output_dim, hidden_dims, activation='tanh', last_activation='identity', biases=None): 10 | super(MLP, self).__init__() 11 | sizes_list = hidden_dims.copy() 12 | self.activation = getattr(torch, activation) 13 | self.last_activation = getattr(torch, last_activation) 14 | sizes_list.insert(0, input_dim) 15 | biases = [True] * len(sizes_list) if biases is None else biases.copy() 16 | 17 | layers = [] 18 | if 1 < len(sizes_list): 19 | for i in range(len(sizes_list) - 1): 20 | layers.append(nn.Linear(sizes_list[i], sizes_list[i + 1], bias=biases[i])) 21 | self.last_layer = nn.Linear(sizes_list[-1], output_dim) 22 | self.layers = nn.ModuleList(layers) 23 | 24 | def forward(self, x): 25 | for layer in self.layers: 26 | x = layer(x) 27 | x = self.activation(x) 28 | x = self.last_layer(x) 29 | x = self.last_activation(x) 30 | return x 31 | 32 | def init(self, init_fn, last_init_fn): 33 | for layer in self.layers: 34 | init_fn(layer) 35 | last_init_fn(self.last_layer) 36 | 37 | 38 | def soft_update(source_model: nn.Module, target_model: nn.Module, tau): 39 | for target_param, param in zip(target_model.parameters(), source_model.parameters()): 40 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) 41 | 42 | 43 | def 
copy_model_params_from_to(source, target): 44 | for target_param, param in zip(target.parameters(), source.parameters()): 45 | target_param.data.copy_(param.data) 46 | 47 | 48 | def init(module, weight_init=None, bias_init=None): 49 | if weight_init: 50 | weight_init(module.weight.data) 51 | if bias_init: 52 | bias_init(module.bias.data) 53 | 54 | 55 | def get_flat_params(model): 56 | params = [] 57 | for param in model.parameters(): 58 | params.append(param.data.view(-1)) 59 | 60 | flat_params = torch.cat(params) 61 | return flat_params 62 | 63 | 64 | def set_flat_params(model, flat_params): 65 | prev_ind = 0 66 | for param in model.parameters(): 67 | flat_size = int(np.prod(list(param.size()))) 68 | param.data.copy_( 69 | flat_params[prev_ind:prev_ind + flat_size].view(param.size())) 70 | prev_ind += flat_size 71 | 72 | 73 | def get_flat_grad(net, grad_grad=False): 74 | grads = [] 75 | for param in net.parameters(): 76 | if grad_grad: 77 | grads.append(param.grad.grad.view(-1)) 78 | else: 79 | grads.append(param.grad.view(-1)) 80 | 81 | flat_grad = torch.cat(grads) 82 | return flat_grad 83 | 84 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_cheetahO001.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self, frame_skip=5): 13 | self.prev_qpos = None 14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | mujoco_env.MujocoEnv.__init__( 16 | self, '%s/assets/half_cheetah.xml' % dir_path, frame_skip=frame_skip 17 | ) 18 | utils.EzPickle.__init__(self) 19 | 20 | def step(self, action): 21 | start_ob = self._get_obs() 22 | reward_run = start_ob[8] 23 | 24 | self.do_simulation(action, self.frame_skip) 25 | ob = self._get_obs() 26 | if getattr(self, 'action_space', None): 27 | action = np.clip(action, self.action_space.low, 28 | self.action_space.high) 29 | reward_ctrl = -0.1 * np.square(action).sum() 30 | 31 | reward = reward_run + reward_ctrl 32 | done = False 33 | ob += np.random.uniform(low=-0.01, high=0.01, size=ob.shape) 34 | return ob, reward, done, {} 35 | 36 | def _get_obs(self): 37 | return np.concatenate([ 38 | self.sim.data.qpos.flat[1:], 39 | self.sim.data.qvel.flat, 40 | ]) 41 | 42 | def reset_model(self): 43 | qpos = self.init_qpos + \ 44 | self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 45 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 46 | self.set_state(qpos, qvel) 47 | return self._get_obs() 48 | 49 | def viewer_setup(self): 50 | self.viewer.cam.distance = self.model.stat.extent * 0.5 51 | 52 | def cost_np_vec(self, obs, acts, next_obs): 53 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 54 | reward_run = obs[:, 8] 55 | reward = reward_run + reward_ctrl 56 | return -reward 57 | 58 | def cost_tf_vec(self, obs, acts, next_obs): 59 | raise NotImplementedError 60 | """ 61 | reward_ctrl = -0.1 * tf.reduce_sum(tf.square(acts), axis=1) 62 | reward_run = next_obs[:, 0] 63 | reward = reward_run + reward_ctrl 64 | return -reward 65 | """ 66 | 67 | def mb_step(self, states, actions, next_states): 68 | # returns rewards and dones 69 | # forward rewards are calculated based on states, instead of next_states as in original 
SLBO envs 70 | if getattr(self, 'action_space', None): 71 | actions = np.clip(actions, self.action_space.low, 72 | self.action_space.high) 73 | rewards = - self.cost_np_vec(states, actions, next_states) 74 | return rewards, np.zeros_like(rewards, dtype=np.bool) 75 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/swimmer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self, frame_skip=4): 13 | self.prev_qpos = None 14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | mujoco_env.MujocoEnv.__init__( 16 | self, '%s/assets/swimmer.xml' % dir_path, frame_skip=frame_skip 17 | ) 18 | utils.EzPickle.__init__(self) 19 | 20 | def step(self, action): 21 | old_ob = self._get_obs() 22 | self.do_simulation(action, self.frame_skip) 23 | 24 | if getattr(self, 'action_space', None): 25 | action = np.clip(action, self.action_space.low, 26 | self.action_space.high) 27 | ob = self._get_obs() 28 | 29 | reward_ctrl = -0.0001 * np.square(action).sum() 30 | reward_run = old_ob[3] 31 | reward = reward_run + reward_ctrl 32 | 33 | done = False 34 | return ob, reward, done, {} 35 | 36 | def _get_obs(self): 37 | return np.concatenate([ 38 | # (self.sim.data.qpos.flat[:1] - self.prev_qpos[:1]) / self.dt, 39 | # self.get_body_comvel("torso")[:1], 40 | self.sim.data.qpos.flat[2:], 41 | self.sim.data.qvel.flat, 42 | ]) 43 | 44 | def mb_step(self, states, actions, next_states): 45 | # returns rewards and dones 46 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 47 | if getattr(self, 'action_space', None): 48 | actions = np.clip(actions, self.action_space.low, 49 | self.action_space.high) 50 | rewards = - self.cost_np_vec(states, actions, next_states) 51 | return rewards, np.zeros_like(rewards, dtype=np.bool) 52 | 53 | def reset_model(self): 54 | self.set_state( 55 | self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq), 56 | self.init_qvel + self.np_random.uniform(low=-.1, high=.1, size=self.model.nv) 57 | ) 58 | self.prev_qpos = np.copy(self.sim.data.qpos.flat) 59 | return self._get_obs() 60 | 61 | def cost_np_vec(self, obs, acts, next_obs): 62 | reward_ctrl = -0.0001 * np.sum(np.square(acts), axis=1) 63 | reward_run = obs[:, 3] 64 | reward = reward_run + reward_ctrl 65 | return -reward 66 | 67 | def cost_tf_vec(self, obs, acts, next_obs): 68 | """ 69 | reward_ctrl = -0.0001 * tf.reduce_sum(tf.square(acts), axis=1) 70 | reward_run = next_obs[:, 0] 71 | reward = reward_run + reward_ctrl 72 | return -reward 73 | """ 74 | raise NotImplementedError 75 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_fant.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self, frame_skip=5): 13 | self.prev_qpos = None 14 | dir_path = 
os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | mujoco_env.MujocoEnv.__init__( 16 | self, '%s/assets/ant.xml' % dir_path, frame_skip=frame_skip 17 | ) 18 | utils.EzPickle.__init__(self) 19 | 20 | def step(self, action): 21 | old_ob = self._get_obs() 22 | self.do_simulation(action, self.frame_skip) 23 | 24 | if getattr(self, 'action_space', None): 25 | action = np.clip(action, self.action_space.low, self.action_space.high) 26 | ob = self._get_obs() 27 | 28 | reward_ctrl = -0.1 * np.square(action).sum() 29 | reward_run = old_ob[13] 30 | reward_height = -3.0 * np.square(old_ob[0] - 0.57) 31 | 32 | # the alive bonus 33 | height = ob[0] 34 | done = (height > 1.0) or (height < 0.2) 35 | alive_reward = float(not done) 36 | 37 | reward = reward_run + reward_ctrl + reward_height + alive_reward 38 | return ob, reward, done, {} 39 | 40 | def _get_obs(self): 41 | return np.concatenate([ 42 | self.sim.data.qpos.flat[2:], 43 | self.sim.data.qvel.flat, 44 | ]) 45 | 46 | def reset_model(self): 47 | qpos = self.init_qpos + \ 48 | self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 49 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 50 | self.set_state(qpos, qvel) 51 | # self.prev_qpos = np.copy(self.sim.data.qpos.flat) 52 | return self._get_obs() 53 | 54 | def viewer_setup(self): 55 | self.viewer.cam.distance = self.model.stat.extent * 0.5 56 | 57 | def cost_np_vec(self, obs, acts, next_obs): 58 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 59 | reward_run = obs[:, 13] 60 | reward_height = -3.0 * np.square(obs[:, 0] - 0.57) 61 | 62 | height = next_obs[:, 0] 63 | done = np.logical_or((height > 1.0), (height < 0.2)) 64 | alive_reward = 1.0 - np.array(done, dtype=np.float) 65 | 66 | reward = reward_run + reward_ctrl + reward_height + alive_reward 67 | return -reward 68 | 69 | def mb_step(self, states, actions, next_states): 70 | if getattr(self, 'action_space', None): 71 | actions = np.clip(actions, self.action_space.low, 72 | self.action_space.high) 73 | rewards = - self.cost_np_vec(states, actions, next_states) 74 | height = next_states[:, 0] 75 | done = np.logical_or((height > 1.0), (height < 0.2)) 76 | return rewards, done 77 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_cheetahA003.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self, frame_skip=5): 13 | self.prev_qpos = None 14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | mujoco_env.MujocoEnv.__init__( 16 | self, '%s/assets/half_cheetah.xml' % dir_path, frame_skip=frame_skip 17 | ) 18 | utils.EzPickle.__init__(self) 19 | 20 | def step(self, action): 21 | action = np.array(action) 22 | action += np.random.uniform(low=-0.03, high=0.03, size=action.shape) 23 | start_ob = self._get_obs() 24 | reward_run = start_ob[8] 25 | 26 | self.do_simulation(action, self.frame_skip) 27 | ob = self._get_obs() 28 | if getattr(self, 'action_space', None): 29 | action = np.clip(action, self.action_space.low, 30 | self.action_space.high) 31 | reward_ctrl = -0.1 * np.square(action).sum() 32 | 33 | reward = reward_run + reward_ctrl 34 | done = False 35 | return ob, reward, done, {} 36 | 
37 | def _get_obs(self): 38 | return np.concatenate([ 39 | self.sim.data.qpos.flat[1:], 40 | self.sim.data.qvel.flat, 41 | ]) 42 | 43 | def reset_model(self): 44 | qpos = self.init_qpos + \ 45 | self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 46 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 47 | self.set_state(qpos, qvel) 48 | return self._get_obs() 49 | 50 | def viewer_setup(self): 51 | self.viewer.cam.distance = self.model.stat.extent * 0.5 52 | 53 | def cost_np_vec(self, obs, acts, next_obs): 54 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 55 | reward_run = obs[:, 8] 56 | reward = reward_run + reward_ctrl 57 | return -reward 58 | 59 | def cost_tf_vec(self, obs, acts, next_obs): 60 | raise NotImplementedError 61 | """ 62 | reward_ctrl = -0.1 * tf.reduce_sum(tf.square(acts), axis=1) 63 | reward_run = next_obs[:, 0] 64 | reward = reward_run + reward_ctrl 65 | return -reward 66 | """ 67 | 68 | def mb_step(self, states, actions, next_states): 69 | # returns rewards and dones 70 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 71 | if getattr(self, 'action_space', None): 72 | actions = np.clip(actions, self.action_space.low, 73 | self.action_space.high) 74 | rewards = - self.cost_np_vec(states, actions, next_states) 75 | return rewards, np.zeros_like(rewards, dtype=np.bool) 76 | 77 | 78 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_cheetahA01.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self, frame_skip=5): 13 | self.prev_qpos = None 14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | mujoco_env.MujocoEnv.__init__( 16 | self, '%s/assets/half_cheetah.xml' % dir_path, frame_skip=frame_skip 17 | ) 18 | utils.EzPickle.__init__(self) 19 | 20 | def step(self, action): 21 | action = np.array(action) 22 | action += np.random.uniform(low=-0.1, high=0.1, size=action.shape) 23 | start_ob = self._get_obs() 24 | reward_run = start_ob[8] 25 | 26 | self.do_simulation(action, self.frame_skip) 27 | ob = self._get_obs() 28 | if getattr(self, 'action_space', None): 29 | action = np.clip(action, self.action_space.low, 30 | self.action_space.high) 31 | reward_ctrl = -0.1 * np.square(action).sum() 32 | 33 | reward = reward_run + reward_ctrl 34 | done = False 35 | return ob, reward, done, {} 36 | 37 | def _get_obs(self): 38 | return np.concatenate([ 39 | self.sim.data.qpos.flat[1:], 40 | self.sim.data.qvel.flat, 41 | ]) 42 | 43 | def reset_model(self): 44 | qpos = self.init_qpos + \ 45 | self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 46 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 47 | self.set_state(qpos, qvel) 48 | return self._get_obs() 49 | 50 | def viewer_setup(self): 51 | self.viewer.cam.distance = self.model.stat.extent * 0.5 52 | 53 | def cost_np_vec(self, obs, acts, next_obs): 54 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 55 | reward_run = obs[:, 8] 56 | reward = reward_run + reward_ctrl 57 | return -reward 58 | 59 | def cost_tf_vec(self, obs, acts, next_obs): 60 | raise NotImplementedError 61 | """ 62 | reward_ctrl = -0.1 * 
tf.reduce_sum(tf.square(acts), axis=1) 63 | reward_run = next_obs[:, 0] 64 | reward = reward_run + reward_ctrl 65 | return -reward 66 | """ 67 | 68 | def mb_step(self, states, actions, next_states): 69 | # returns rewards and dones 70 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 71 | if getattr(self, 'action_space', None): 72 | actions = np.clip(actions, self.action_space.low, 73 | self.action_space.high) 74 | rewards = - self.cost_np_vec(states, actions, next_states) 75 | return rewards, np.zeros_like(rewards, dtype=np.bool) 76 | 77 | 78 | -------------------------------------------------------------------------------- /mbpo_pytorch/models/actor.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import List, Optional 4 | 5 | from .initializer import fanin_init 6 | from .utils import MLP 7 | from .actor_layer import * 8 | 9 | 10 | # noinspection DuplicatedCode 11 | class Actor(nn.Module, ABC): 12 | def __init__(self, state_dim: int, action_space, hidden_dims: List[int], 13 | state_normalizer: Optional[nn.Module], use_limited_entropy=False, use_tanh_squash=False, 14 | use_state_dependent_std=False, **kwargs): 15 | super(Actor, self).__init__() 16 | self.state_dim = state_dim 17 | self.action_dim = action_space 18 | self.hidden_dims = hidden_dims 19 | self.use_limited_entropy = use_limited_entropy 20 | self.use_tanh_squash = use_tanh_squash 21 | 22 | mlp_kwargs = kwargs.copy() 23 | mlp_kwargs['activation'] = kwargs.get('activation', 'relu') 24 | mlp_kwargs['last_activation'] = kwargs.get('activation', 'relu') 25 | 26 | self.actor_feature = MLP(state_dim, hidden_dims[-1], hidden_dims[:-1], **mlp_kwargs) 27 | 28 | self.state_normalizer = state_normalizer or nn.Identity() 29 | 30 | self.actor_layer = TanhGaussainActorLayer(hidden_dims[-1], action_space.shape[0], 31 | use_state_dependent_std) 32 | 33 | def init_(m): init(m, fanin_init, lambda x: nn.init.constant_(x, 0)) 34 | self.actor_feature.init(init_, init_) 35 | 36 | def act(self, state, deterministic=False, reparameterize=False): 37 | action_feature = self.actor_feature(state) 38 | action_dist, action_means, action_logstds = self.actor_layer(action_feature) 39 | 40 | log_probs = None 41 | pretanh_actions = None 42 | 43 | if deterministic: 44 | actions = action_means 45 | else: 46 | if reparameterize: 47 | result = action_dist.rsample() 48 | else: 49 | result = action_dist.sample() 50 | actions, pretanh_actions = result 51 | log_probs = action_dist.log_probs(actions, pretanh_actions) 52 | 53 | entropy = action_dist.entropy().mean() 54 | 55 | return {'actions': actions, 'log_probs': log_probs, 'entropy': entropy, 56 | 'action_means': action_means, 'action_logstds': action_logstds, 'pretanh_actions': pretanh_actions} 57 | 58 | def evaluate_actions(self, states, actions, pretanh_actions=None): 59 | states = self.state_normalizer(states) 60 | 61 | action_feature = self.actor_feature(states) 62 | action_dist, *_ = self.actor_layer(action_feature) 63 | 64 | if pretanh_actions: 65 | log_probs = action_dist.log_probs(actions, pretanh_actions) 66 | else: 67 | log_probs = action_dist.log_probs(actions) 68 | 69 | entropy = action_dist.entropy().mean() 70 | 71 | return {'log_probs': log_probs, 'entropy': entropy} 72 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/assets/hopper.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/pets_pusher.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self): 13 | dir_path = os.path.dirname(os.path.realpath(__file__)) 14 | mujoco_env.MujocoEnv.__init__(self, '%s/assets/pusher.xml' % dir_path, 4) 15 | utils.EzPickle.__init__(self) 16 | self.reset_model() 17 | 18 | def step(self, a): 19 | obj_pos = self.get_body_com("object"), 20 | vec_1 = obj_pos - self.get_body_com("tips_arm") 21 | vec_2 = obj_pos - self.get_body_com("goal") 22 | 23 | reward_near = -np.sum(np.abs(vec_1)) 24 | reward_dist = -np.sum(np.abs(vec_2)) 25 | reward_ctrl = -np.square(a).sum() 26 | reward = 1.25 * reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near 27 | 28 | self.do_simulation(a, self.frame_skip) 29 | ob = self._get_obs() 30 | done = False 31 | return ob, reward, done, {} 32 | 33 | def viewer_setup(self): 34 | self.viewer.cam.trackbodyid = -1 35 | self.viewer.cam.distance = 4.0 36 | 37 | def reset_model(self): 38 | qpos = self.init_qpos 39 | 40 | self.goal_pos = np.asarray([0, 0]) 41 | self.cylinder_pos = np.array([-0.25, 0.15]) + np.random.normal(0, 0.025, [2]) 42 | 43 | qpos[-4:-2] = self.cylinder_pos 44 | qpos[-2:] = self.goal_pos 45 | qvel = self.init_qvel + \ 46 | self.np_random.uniform(low=-0.005, high=0.005, size=self.model.nv) 47 | qvel[-4:] = 0 48 | self.set_state(qpos, qvel) 49 | self.ac_goal_pos = self.get_body_com("goal") 50 | 51 | return self._get_obs() 52 | 53 | def _get_obs(self): 54 | return np.concatenate([ 55 | self.sim.data.qpos.flat[:7], 56 | self.sim.data.qvel.flat[:7], 57 | self.get_body_com("tips_arm"), 58 | self.get_body_com("object"), 59 | self.get_body_com("goal"), 60 | ]) 61 | 62 | def cost_np_vec(self, obs, acts, next_obs): 63 | """ 64 | to_w, og_w = 0.5, 1.25 65 | tip_pos, obj_pos, goal_pos = obs[:, 14:17], obs[:, 17:20], obs[:, -3:] 66 | 67 | tip_obj_dist = np.sum(np.abs(tip_pos - obj_pos), axis=1) 68 | obj_goal_dist = np.sum(np.abs(goal_pos - obj_pos), axis=1) 69 | return to_w * tip_obj_dist + og_w * obj_goal_dist 70 | 71 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 72 | reward_run = obs[:, 8] 73 | reward = reward_run + reward_ctrl 74 | """ 75 | to_w, og_w = 0.5, 1.25 76 | tip_pos, obj_pos, goal_pos = obs[:, 14:17], obs[:, 17:20], obs[:, -3:] 77 | 78 | tip_obj_dist = -np.sum(np.abs(tip_pos - obj_pos), axis=1) 79 | obj_goal_dist = -np.sum(np.abs(goal_pos - obj_pos), axis=1) 80 | ctrl_reward = -0.1 * np.sum(np.square(acts), axis=1) 81 | 82 | reward = to_w * tip_obj_dist + og_w * obj_goal_dist + ctrl_reward 83 | return -reward 84 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/walker2d.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def 
__init__(self, frame_skip=4): 13 | self.prev_qpos = None 14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | mujoco_env.MujocoEnv.__init__( 16 | self, '%s/assets/walker2d.xml' % dir_path, frame_skip=frame_skip 17 | ) 18 | utils.EzPickle.__init__(self) 19 | 20 | def step(self, action): 21 | old_ob = self._get_obs() 22 | self.do_simulation(action, self.frame_skip) 23 | ob = self._get_obs() 24 | 25 | if getattr(self, 'action_space', None): 26 | action = np.clip(action, self.action_space.low, 27 | self.action_space.high) 28 | 29 | reward_ctrl = -0.1 * np.square(action).sum() 30 | reward_run = old_ob[8] 31 | reward_height = -3.0 * np.square(old_ob[0] - 1.3) 32 | reward = reward_run + reward_ctrl + reward_height + 1.0 33 | 34 | done = False 35 | return ob, reward, done, {} 36 | 37 | def _get_obs(self): 38 | return np.concatenate([ 39 | self.sim.data.qpos.flat[1:], 40 | self.sim.data.qvel.flat 41 | ]) 42 | 43 | def mb_step(self, states, actions, next_states): 44 | # returns rewards and dones 45 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 46 | if getattr(self, 'action_space', None): 47 | actions = np.clip(actions, self.action_space.low, 48 | self.action_space.high) 49 | rewards = - self.cost_np_vec(states, actions, next_states) 50 | return rewards, np.zeros_like(rewards, dtype=np.bool) 51 | 52 | def reset_model(self): 53 | self.set_state( 54 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq), 55 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 56 | ) 57 | self.prev_qpos = np.copy(self.sim.data.qpos.flat) 58 | return self._get_obs() 59 | 60 | def viewer_setup(self): 61 | self.viewer.cam.trackbodyid = 2 62 | self.viewer.cam.distance = self.model.stat.extent * 0.5 63 | self.viewer.cam.lookat[2] += .8 64 | self.viewer.cam.elevation = -20 65 | 66 | def cost_np_vec(self, obs, acts, next_obs): 67 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 68 | reward_run = obs[:, 8] 69 | reward_height = -3.0 * np.square(obs[:, 0] - 1.3) 70 | reward = reward_run + reward_ctrl + reward_height + 1.0 71 | return -reward 72 | 73 | def cost_tf_vec(self, obs, acts, next_obs): 74 | """ 75 | reward_ctrl = -0.1 * tf.reduce_sum(tf.square(acts), axis=1) 76 | reward_run = next_obs[:, 0] 77 | # reward_height = -3.0 * tf.square(next_obs[:, 1] - 1.3) 78 | reward = reward_run + reward_ctrl 79 | return -reward 80 | """ 81 | raise NotImplementedError 82 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_nostopslimhumanoid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import mujoco_env 3 | from gym import utils 4 | 5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 6 | 7 | 8 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 9 | 10 | def __init__(self): 11 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5) 12 | utils.EzPickle.__init__(self) 13 | 14 | def _get_obs(self): 15 | data = self.sim.data 16 | return np.concatenate([data.qpos.flat[2:], 17 | data.qvel.flat]) 18 | 19 | def step(self, a): 20 | data = self.sim.data 21 | action = a 22 | if getattr(self, 'action_space', None): 23 | action = np.clip(a, self.action_space.low, 24 | self.action_space.high) 25 | qpos = self.sim.data.qpos 26 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 27 | 28 | # reward 29 | alive_bonus = 
5 * (1 - float(done)) 30 | lin_vel_cost = 0.25 / 0.015 * data.qvel.flat[0] 31 | quad_ctrl_cost = 0.1 * np.square(action).sum() 32 | quad_impact_cost = 0.0 33 | 34 | self.do_simulation(action, self.frame_skip) 35 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 36 | done = False 37 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, 38 | reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 39 | 40 | def reset_model(self): 41 | c = 0.01 42 | self.set_state( 43 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 44 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 45 | ) 46 | return self._get_obs() 47 | 48 | def viewer_setup(self): 49 | self.viewer.cam.trackbodyid = 1 50 | self.viewer.cam.distance = self.model.stat.extent * 1.0 51 | self.viewer.cam.lookat[2] += .8 52 | self.viewer.cam.elevation = -20 53 | 54 | def cost_np_vec(self, obs, acts, next_obs): 55 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 56 | reward_run = 0.25 / 0.015 * obs[:, 22] 57 | 58 | quad_impact_cost = 0.0 59 | 60 | height = next_obs[:, 0] 61 | done = np.logical_or((height > 2.0), (height < 1.0)) 62 | alive_reward = 5 * (1.0 - np.array(done, dtype=np.float)) 63 | 64 | reward = reward_run + reward_ctrl + (-quad_impact_cost) + alive_reward 65 | return -reward 66 | 67 | def cost_tf_vec(self, obs, acts, next_obs): 68 | raise NotImplementedError 69 | 70 | def mb_step(self, states, actions, next_states): 71 | # returns rewards and dones 72 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 73 | if getattr(self, 'action_space', None): 74 | actions = np.clip(actions, self.action_space.low, 75 | self.action_space.high) 76 | rewards = - self.cost_np_vec(states, actions, next_states) 77 | return rewards, np.zeros_like(rewards, dtype=np.bool) 78 | 79 | 80 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/hopper.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self, frame_skip=4): 13 | self.prev_qpos = None 14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | mujoco_env.MujocoEnv.__init__( 16 | self, '%s/assets/hopper.xml' % dir_path, frame_skip=frame_skip 17 | ) 18 | utils.EzPickle.__init__(self) 19 | 20 | def step(self, action: np.ndarray): 21 | old_ob = self._get_obs() 22 | self.do_simulation(action, self.frame_skip) 23 | ob = self._get_obs() 24 | 25 | if getattr(self, 'action_space', None): 26 | action = np.clip(action, self.action_space.low, 27 | self.action_space.high) 28 | 29 | reward_ctrl = -0.1 * np.square(action).sum() 30 | reward_run = old_ob[5] 31 | reward_height = -3.0 * np.square(old_ob[0] - 1.3) 32 | reward = reward_run + reward_ctrl + reward_height + 1.0 33 | 34 | done = False 35 | return ob, reward, done, {} 36 | 37 | def _get_obs(self): 38 | return np.concatenate([ 39 | self.sim.data.qpos.flat[1:], 40 | self.sim.data.qvel.flat, 41 | ]) 42 | 43 | def mb_step(self, states, actions, next_states): 44 | # returns rewards and dones 45 | # forward rewards are calculated based on states, instead of next_states 
as in original SLBO envs 46 | if getattr(self, 'action_space', None): 47 | actions = np.clip(actions, self.action_space.low, 48 | self.action_space.high) 49 | rewards = - self.cost_np_vec(states, actions, next_states) 50 | return rewards, np.zeros_like(rewards, dtype=np.bool) 51 | 52 | def reset_model(self): 53 | self.set_state( 54 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq), 55 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 56 | ) 57 | self.prev_qpos = np.copy(self.sim.data.qpos.flat) 58 | return self._get_obs() 59 | 60 | def viewer_setup(self): 61 | self.viewer.cam.trackbodyid = 2 62 | self.viewer.cam.distance = self.model.stat.extent * 0.75 63 | self.viewer.cam.lookat[2] += .8 64 | self.viewer.cam.elevation = -20 65 | 66 | def cost_np_vec(self, obs, acts, next_obs): 67 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 68 | reward_run = obs[:, 5] 69 | reward_height = -3.0 * np.square(obs[:, 0] - 1.3) 70 | reward = reward_run + reward_ctrl + reward_height + 1.0 71 | return -reward 72 | 73 | def cost_tf_vec(self, obs, acts, next_obs): 74 | """ 75 | reward_ctrl = -0.1 * tf.reduce_sum(tf.square(acts), axis=1) 76 | reward_run = next_obs[:, 0] 77 | # reward_height = -3.0 * tf.square(next_obs[:, 1] - 1.3) 78 | reward = reward_run + reward_ctrl 79 | return -reward 80 | """ 81 | raise NotImplementedError 82 | -------------------------------------------------------------------------------- /mbpo_pytorch/thirdparty/util.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import gym 4 | import numpy as np 5 | 6 | 7 | def copy_obs_dict(obs): 8 | """ 9 | Deep-copy a dict of numpy arrays. 10 | 11 | :param obs: (OrderedDict): a dict of numpy arrays. 12 | :return (OrderedDict) a dict of copied numpy arrays. 13 | """ 14 | assert isinstance(obs, OrderedDict), "unexpected type for observations '{}'".format(type(obs)) 15 | return OrderedDict([(k, np.copy(v)) for k, v in obs.items()]) 16 | 17 | 18 | def dict_to_obs(space, obs_dict): 19 | """ 20 | Convert an internal representation raw_obs into the appropriate type 21 | specified by space. 22 | 23 | :param space: (gym.spaces.Space) an observation space. 24 | :param obs_dict: (OrderedDict) a dict of numpy arrays. 25 | :return (ndarray, tuple or dict): returns an observation 26 | of the same type as space. If space is Dict, function is identity; 27 | if space is Tuple, converts dict to Tuple; otherwise, space is 28 | unstructured and returns the value raw_obs[None]. 29 | """ 30 | if isinstance(space, gym.spaces.Dict): 31 | return obs_dict 32 | elif isinstance(space, gym.spaces.Tuple): 33 | assert len(obs_dict) == len(space.spaces), "size of observation does not match size of observation space" 34 | return tuple((obs_dict[i] for i in range(len(space.spaces)))) 35 | else: 36 | assert set(obs_dict.keys()) == {None}, "multiple observation keys for unstructured observation space" 37 | return obs_dict[None] 38 | 39 | 40 | def obs_space_info(obs_space): 41 | """ 42 | Get dict-structured information about a gym.Space. 43 | 44 | Dict spaces are represented directly by their dict of subspaces. 45 | Tuple spaces are converted into a dict with keys indexing into the tuple. 46 | Unstructured spaces are represented by {None: obs_space}. 47 | 48 | :param obs_space: (gym.spaces.Space) an observation space 49 | :return (tuple) A tuple (keys, shapes, dtypes): 50 | keys: a list of dict keys. 
51 | shapes: a dict mapping keys to shapes. 52 | dtypes: a dict mapping keys to dtypes. 53 | """ 54 | if isinstance(obs_space, gym.spaces.Dict): 55 | assert isinstance(obs_space.spaces, OrderedDict), "Dict space must have ordered subspaces" 56 | subspaces = obs_space.spaces 57 | elif isinstance(obs_space, gym.spaces.Tuple): 58 | subspaces = {i: space for i, space in enumerate(obs_space.spaces)} 59 | else: 60 | assert not hasattr(obs_space, 'spaces'), "Unsupported structured space '{}'".format(type(obs_space)) 61 | subspaces = {None: obs_space} 62 | keys = [] 63 | shapes = {} 64 | dtypes = {} 65 | for key, box in subspaces.items(): 66 | keys.append(key) 67 | shapes[key] = box.shape 68 | dtypes[key] = box.dtype 69 | return keys, shapes, dtypes 70 | 71 | 72 | def mpi_rank_or_zero(): 73 | """ 74 | Return the MPI rank if mpi is installed. Otherwise, return 0. 75 | :return: (int) 76 | """ 77 | try: 78 | import mpi4py 79 | return mpi4py.MPI.COMM_WORLD.Get_rank() 80 | except (ImportError, AttributeError) as _: 81 | return 0 -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_slimhumanoid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import mujoco_env 3 | from gym import utils 4 | 5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 6 | 7 | 8 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 9 | 10 | def __init__(self): 11 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5) 12 | utils.EzPickle.__init__(self) 13 | 14 | def _get_obs(self): 15 | data = self.sim.data 16 | return np.concatenate([data.qpos.flat[2:], 17 | data.qvel.flat]) 18 | 19 | def step(self, a): 20 | data = self.sim.data 21 | action = a 22 | if getattr(self, 'action_space', None): 23 | action = np.clip(a, self.action_space.low, 24 | self.action_space.high) 25 | 26 | # reward 27 | alive_bonus = 5.0 28 | lin_vel_cost = 0.25 / 0.015 * data.qvel.flat[0] 29 | quad_ctrl_cost = 0.1 * np.square(action).sum() 30 | quad_impact_cost = 0.0 31 | 32 | self.do_simulation(action, self.frame_skip) 33 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 34 | qpos = self.sim.data.qpos 35 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 36 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, 37 | reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 38 | 39 | def reset_model(self): 40 | c = 0.01 41 | self.set_state( 42 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 43 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 44 | ) 45 | return self._get_obs() 46 | 47 | def viewer_setup(self): 48 | self.viewer.cam.trackbodyid = 1 49 | self.viewer.cam.distance = self.model.stat.extent * 1.0 50 | self.viewer.cam.lookat[2] += .8 51 | self.viewer.cam.elevation = -20 52 | 53 | def cost_np_vec(self, obs, acts, next_obs): 54 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 55 | reward_run = 0.25 / 0.015 * obs[:, 22] 56 | 57 | quad_impact_cost = 0.0 58 | 59 | height = next_obs[:, 0] 60 | done = np.logical_or((height > 2.0), (height < 1.0)) 61 | alive_reward = 5 * (1.0 - np.array(done, dtype=np.float)) 62 | 63 | reward = reward_run + reward_ctrl + (-quad_impact_cost) + alive_reward 64 | return -reward 65 | 66 | def cost_tf_vec(self, obs, acts, next_obs): 67 | raise NotImplementedError 68 | 69 | def mb_step(self, states, 
actions, next_states): 70 | # returns rewards and dones 71 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 72 | if getattr(self, 'action_space', None): 73 | actions = np.clip(actions, self.action_space.low, 74 | self.action_space.high) 75 | rewards = - self.cost_np_vec(states, actions, next_states) 76 | height = next_states[:, 0] 77 | done = np.logical_or((height > 2.0), (height < 1.0)) 78 | return rewards, done 79 | 80 | 81 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/benchmarking_envs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .gym.half_cheetah import HalfCheetahEnv 4 | from .gym.walker2d import Walker2dEnv 5 | from .gym.ant import AntEnv 6 | from .gym.hopper import HopperEnv 7 | from .gym.swimmer import SwimmerEnv 8 | from .gym.reacher import ReacherEnv 9 | from .gym.pendulum import PendulumEnv 10 | from .gym.inverted_pendulum import InvertedPendulumEnv 11 | from .gym.acrobot import AcrobotEnv 12 | from .gym.cartpole import CartPoleEnv 13 | from .gym.mountain_car import Continuous_MountainCarEnv 14 | from .gym.gym_ohalfcheetah import OriginalHalfCheetahEnv 15 | from .gym.gym_oant import OriginalAntEnv 16 | from .gym.gym_owalker import OriginalWalkerEnv 17 | from .gym.gym_oswimmer import OriginalSwimmerEnv 18 | from .gym.gym_ohopper import OriginalHopperEnv 19 | from .gym.gym_ohumanoid import OriginalHumanoidEnv 20 | from .gym import gym_fswimmer 21 | from .gym import gym_fwalker2d 22 | from .gym import gym_fhopper 23 | from .gym import gym_fant 24 | from .gym import gym_cheetahA01 25 | from .gym import gym_cheetahA003 26 | from .gym import gym_cheetahO01 27 | from .gym import gym_cheetahO001 28 | from .gym import gym_pendulumO01 29 | from .gym import gym_pendulumO001 30 | from .gym import gym_cartpoleO01 31 | from .gym import gym_cartpoleO001 32 | from .gym import gym_humanoid 33 | from .gym import gym_nostopslimhumanoid 34 | from .gym import gym_slimhumanoid 35 | 36 | 37 | def make_benchmarking_env(env_id: str): 38 | envs = { 39 | 'OriginalHalfCheetah': OriginalHalfCheetahEnv, 40 | 'OriginalAnt': OriginalAntEnv, 41 | 'OriginalWalker': OriginalWalkerEnv, 42 | 'OriginalSwimmer': OriginalSwimmerEnv, 43 | 'OriginalHumanoid': OriginalHumanoidEnv, 44 | 'OriginalHopper': OriginalHopperEnv, 45 | 46 | 'HalfCheetah': HalfCheetahEnv, 47 | 'Walker2D': Walker2dEnv, 48 | 'Ant': AntEnv, 49 | 'Hopper': HopperEnv, 50 | 'Swimmer': SwimmerEnv, 51 | 'FixedSwimmer': gym_fswimmer.fixedSwimmerEnv, 52 | 'FixedWalker': gym_fwalker2d.Walker2dEnv, 53 | 'FixedHopper': gym_fhopper.HopperEnv, 54 | 'FixedAnt': gym_fant.AntEnv, 55 | 'Reacher': ReacherEnv, 56 | 'Pendulum': PendulumEnv, 57 | 'InvertedPendulum': InvertedPendulumEnv, 58 | 'Acrobot': AcrobotEnv, 59 | 'CartPole': CartPoleEnv, 60 | 'MountainCar': Continuous_MountainCarEnv, 61 | 62 | 'HalfCheetahO01': gym_cheetahO01.HalfCheetahEnv, 63 | 'HalfCheetahO001': gym_cheetahO001.HalfCheetahEnv, 64 | 'HalfCheetahA01': gym_cheetahA01.HalfCheetahEnv, 65 | 'HalfCheetahA003': gym_cheetahA003.HalfCheetahEnv, 66 | 67 | 'PendulumO01': gym_pendulumO01.PendulumEnv, 68 | 'PendulumO001': gym_pendulumO001.PendulumEnv, 69 | 70 | 'CartPoleO01': gym_cartpoleO01.CartPoleEnv, 71 | 'CartPoleO001': gym_cartpoleO001.CartPoleEnv, 72 | 73 | 'gym_humanoid': gym_humanoid.HumanoidEnv, 74 | 'gym_slimhumanoid': gym_slimhumanoid.HumanoidEnv, 75 | 'gym_nostopslimhumanoid': 
gym_nostopslimhumanoid.HumanoidEnv, 76 | } 77 | env = envs[env_id]() 78 | if not hasattr(env, 'reward_range'): 79 | env.reward_range = (-np.inf, np.inf) 80 | if not hasattr(env, 'metadata'): 81 | env.metadata = {} 82 | return env 83 | 84 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_fhopper.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | # noinspection DuplicatedCode 11 | class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 12 | 13 | def __init__(self, frame_skip=4): 14 | self.prev_qpos = None 15 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 16 | mujoco_env.MujocoEnv.__init__( 17 | self, '%s/assets/hopper.xml' % dir_path, frame_skip=frame_skip 18 | ) 19 | utils.EzPickle.__init__(self) 20 | 21 | def step(self, action): 22 | old_ob = self._get_obs() 23 | self.do_simulation(action, self.frame_skip) 24 | ob = self._get_obs() 25 | 26 | if getattr(self, 'action_space', None): 27 | action = np.clip(action, self.action_space.low, 28 | self.action_space.high) 29 | 30 | reward_ctrl = -0.1 * np.square(action).sum() 31 | reward_run = old_ob[5] 32 | reward_height = -3.0 * np.square(old_ob[0] - 1.3) 33 | height, ang = ob[0], ob[1] 34 | done = (height <= 0.7) or (abs(ang) >= 0.2) 35 | alive_reward = float(not done) 36 | reward = reward_run + reward_ctrl + reward_height + alive_reward 37 | 38 | return ob, reward, done, {} 39 | 40 | def _get_obs(self): 41 | return np.concatenate([ 42 | self.sim.data.qpos.flat[1:], 43 | self.sim.data.qvel.flat, 44 | ]) 45 | 46 | def reset_model(self): 47 | self.set_state( 48 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq), 49 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 50 | ) 51 | self.prev_qpos = np.copy(self.sim.data.qpos.flat) 52 | return self._get_obs() 53 | 54 | def viewer_setup(self): 55 | self.viewer.cam.trackbodyid = 2 56 | self.viewer.cam.distance = self.model.stat.extent * 0.75 57 | self.viewer.cam.lookat[2] += .8 58 | self.viewer.cam.elevation = -20 59 | 60 | def cost_np_vec(self, obs, acts, next_obs): 61 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 62 | reward_run = obs[:, 5] 63 | reward_height = -3.0 * np.square(obs[:, 0] - 1.3) 64 | height, ang = next_obs[:, 0], next_obs[:, 1] 65 | done = np.logical_or(height <= 0.7, abs(ang) >= 0.2) 66 | alive_reward = 1.0 - np.array(done, dtype=np.float) 67 | reward = reward_run + reward_ctrl + reward_height + alive_reward 68 | return -reward 69 | 70 | def mb_step(self, states, actions, next_states): 71 | # returns rewards and dones 72 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 73 | if getattr(self, 'action_space', None): 74 | actions = np.clip(actions, self.action_space.low, 75 | self.action_space.high) 76 | rewards = - self.cost_np_vec(states, actions, next_states) 77 | heights, angs = next_states[:, 0], next_states[:, 1] 78 | dones = np.logical_or(heights <= 0.7, abs(angs) >= 0.2) 79 | return rewards, dones 80 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_fwalker2d.py: -------------------------------------------------------------------------------- 1 
| import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self, frame_skip=4): 13 | self.prev_qpos = None 14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | mujoco_env.MujocoEnv.__init__( 16 | self, '%s/assets/walker2d.xml' % dir_path, frame_skip=frame_skip 17 | ) 18 | utils.EzPickle.__init__(self) 19 | 20 | def step(self, action): 21 | old_ob = self._get_obs() 22 | self.do_simulation(action, self.frame_skip) 23 | ob = self._get_obs() 24 | 25 | if getattr(self, 'action_space', None): 26 | action = np.clip(action, self.action_space.low, 27 | self.action_space.high) 28 | 29 | reward_ctrl = -0.1 * np.square(action).sum() 30 | reward_run = old_ob[8] 31 | reward_height = -3.0 * np.square(old_ob[0] - 1.3) 32 | 33 | height, ang = ob[0], ob[1] 34 | done = (height >= 2.0) or (height <= 0.8) or (abs(ang) >= 1.0) 35 | alive_reward = float(not done) 36 | 37 | reward = reward_run + reward_ctrl + reward_height + alive_reward 38 | return ob, reward, done, {} 39 | 40 | def _get_obs(self): 41 | return np.concatenate([ 42 | self.sim.data.qpos.flat[1:], 43 | self.sim.data.qvel.flat 44 | ]) 45 | 46 | def reset_model(self): 47 | self.set_state( 48 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq), 49 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 50 | ) 51 | self.prev_qpos = np.copy(self.sim.data.qpos.flat) 52 | return self._get_obs() 53 | 54 | def viewer_setup(self): 55 | self.viewer.cam.trackbodyid = 2 56 | self.viewer.cam.distance = self.model.stat.extent * 0.5 57 | self.viewer.cam.lookat[2] += .8 58 | self.viewer.cam.elevation = -20 59 | 60 | def cost_np_vec(self, obs, acts, next_obs): 61 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 62 | reward_run = obs[:, 8] 63 | reward_height = -3.0 * np.square(next_obs[:, 0] - 1.3) 64 | height, ang = next_obs[:, 0], next_obs[:, 1] 65 | done = np.logical_or( 66 | np.logical_or(height >= 2.0, height <= 0.8), 67 | np.abs(ang) >= 1.0 68 | ) 69 | alive_reward = 1.0 - np.array(done, dtype=np.float) 70 | reward = reward_run + reward_ctrl + reward_height + alive_reward 71 | return -reward 72 | 73 | def mb_step(self, states, actions, next_states): 74 | if getattr(self, 'action_space', None): 75 | actions = np.clip(actions, self.action_space.low, 76 | self.action_space.high) 77 | rewards = - self.cost_np_vec(states, actions, next_states) 78 | height, ang = next_states[:, 0], next_states[:, 1] 79 | done = np.logical_or( 80 | np.logical_or(height >= 2.0, height <= 0.8), 81 | np.abs(ang) >= 1.0 82 | ) 83 | return rewards, done 84 | -------------------------------------------------------------------------------- /mbpo_pytorch/configs/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import argparse 4 | import munch 5 | import yaml 6 | from yaml import Loader 7 | import collections 8 | 9 | from mbpo_pytorch.misc import logger 10 | 11 | 12 | def flatten(d, parent_key='', sep='.'): 13 | items = [] 14 | for k, v in d.items(): 15 | new_key = parent_key + sep + k if parent_key else k 16 | if isinstance(v, collections.MutableMapping): 17 | items.extend(flatten(v, new_key, sep=sep).items()) 18 | else: 19 | items.append((new_key, str(v))) 20 | return dict(items) 21 | 22 | 23 | def 
safe_eval(exp: str): 24 | try: 25 | return eval(exp) 26 | except (NameError, SyntaxError): 27 | return exp 28 | 29 | 30 | def deflatten_with_eval(d, sep='.'): 31 | deflattend_d = {} 32 | for k, v in d.items(): 33 | d = deflattend_d 34 | key_seq = k.split(sep) 35 | for key in key_seq[:-1]: 36 | try: 37 | d = d[key] 38 | except (TypeError, KeyError): 39 | d[key] = {} 40 | d = d[key] 41 | d[key_seq[-1]] = safe_eval(v) 42 | return deflattend_d 43 | 44 | 45 | class Config: 46 | def __new__(cls, config_paths='config.yaml'): 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument('--configs', nargs='+', default=[]) 49 | parser.add_argument('--set', type=str, nargs='*', action='append') 50 | 51 | args, unknown = parser.parse_known_args() 52 | flattened_config_dict = {} 53 | overwritten_config_dict = {} 54 | 55 | if args.configs: 56 | config_paths = args.configs 57 | 58 | if isinstance(config_paths, str): 59 | config_paths = [config_paths] 60 | 61 | for config_path in config_paths: 62 | if not config_path.startswith('/'): 63 | config_path = os.path.join(os.path.dirname(__file__), config_path) 64 | logger.info('Loading configs from {}.'.format(config_path)) 65 | 66 | with open(config_path, 'r', encoding='utf-8') as f: 67 | new_config_dict = yaml.load(f, Loader=Loader) 68 | flattened_new_config_dict = flatten(new_config_dict) 69 | overwritten_config_dict.update( 70 | {k: v for k, v in flattened_new_config_dict.items() 71 | if (k in flattened_config_dict.keys() and v != flattened_config_dict[k])}) 72 | flattened_config_dict.update(flattened_new_config_dict) 73 | 74 | if args.set: 75 | for instruction in sum(args.set, []): 76 | key, value = instruction.split('=') 77 | flattened_config_dict.update({key: safe_eval(value)}) 78 | # values set by args should be recorded all 79 | overwritten_config_dict.update({key: safe_eval(value)}) 80 | 81 | config_dict = deflatten_with_eval(flattened_config_dict) 82 | 83 | for key, value in overwritten_config_dict.items(): 84 | logger.notice('Hyperparams {} has been overwritten to {}.'.format(key, value)) 85 | 86 | config = munch.munchify(config_dict) 87 | config_dict = flatten(config_dict) 88 | logged_config_dict = {} 89 | 90 | for key, value in config_dict.items(): 91 | if key.find('.') >= 0: 92 | logged_config_dict[key] = value 93 | return config, logged_config_dict 94 | -------------------------------------------------------------------------------- /mbpo_pytorch/models/normalizers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import torch 4 | import torch.nn as nn 5 | from typing import List 6 | 7 | from mbpo_pytorch.thirdparty.running_mean_std import RunningMeanStd 8 | 9 | 10 | class RunningNormalizer(nn.Module, ABC): 11 | def __init__(self, shape: List[int], eps=1e-8, verbose=0): 12 | super().__init__() 13 | 14 | self.shape = shape 15 | self.verbose = verbose 16 | 17 | self.mean = torch.zeros(shape, dtype=torch.float32) 18 | self.var = torch.ones(shape, dtype=torch.float32) 19 | self.eps = eps 20 | self.count = 1e-4 21 | 22 | def forward(self, x: torch.Tensor, inverse=False): 23 | if inverse: 24 | return x * torch.sqrt(self.var) + self.mean 25 | return (x - self.mean) / torch.sqrt(self.var + self.eps) 26 | 27 | def to(self, *args, **kwargs): 28 | self.mean = self.mean.to(*args, **kwargs) 29 | self.var = self.var.to(*args, **kwargs) 30 | 31 | def update(self, samples: torch.Tensor): 32 | sample_count = samples.shape[0] 33 | sample_mean = samples.mean(dim=0) 34 | sample_var = 
samples.var(dim=0, unbiased=False) 35 | delta = sample_mean - self.mean 36 | total_count = self.count + sample_count 37 | 38 | new_mean = self.mean + delta * sample_count / total_count 39 | m_a = self.var * self.count 40 | m_b = sample_var * sample_count 41 | m_2 = m_a + m_b + delta * delta * self.count * sample_count / (self.count + sample_count) 42 | new_var = m_2 / (self.count + sample_count) 43 | 44 | new_count = sample_count + self.count 45 | 46 | self.mean = new_mean 47 | self.var = new_var 48 | self.count = new_count 49 | 50 | def state_dict(self, *args, **kwargs): 51 | return {'mean': self.mean, 'var': self.var, 'count': self.count} 52 | 53 | def load_state_dict(self, state_dict, strict=True): 54 | self.mean = state_dict['mean'] 55 | self.var = state_dict['var'] 56 | self.count = state_dict['count'] 57 | 58 | def get_rms(self): 59 | rms = RunningMeanStd(self.shape) 60 | rms.count = self.count 61 | rms.mean = self.mean.cpu().numpy() 62 | rms.var = self.var.cpu().numpy() 63 | return rms 64 | 65 | 66 | class BatchNormalizer(nn.Module, ABC): 67 | def __init__(self, shape: List[int], eps=1e-8, verbose=0): 68 | super().__init__() 69 | 70 | self.shape = shape 71 | self.verbose = verbose 72 | 73 | self.mean = torch.zeros(shape, dtype=torch.float32) 74 | self.std = torch.ones(shape, dtype=torch.float32) 75 | self.eps = eps 76 | 77 | def forward(self, x: torch.Tensor, inverse=False): 78 | if inverse: 79 | return x * self.std + self.mean 80 | return (x - self.mean) / (torch.clamp(self.std, min=self.eps)) 81 | 82 | def to(self, *args, **kwargs): 83 | self.mean = self.mean.to(*args, **kwargs) 84 | self.std = self.std.to(*args, **kwargs) 85 | 86 | # noinspection DuplicatedCode 87 | # samples in [batch_size, ...] 88 | def update(self, samples: torch.Tensor): 89 | self.mean = torch.mean(samples, dim=0) 90 | self.std = torch.std(samples, dim=0) 91 | 92 | def state_dict(self, *args, **kwargs): 93 | return {'mean': self.mean, 'std': self.std} 94 | 95 | def load_state_dict(self, state_dict, strict=True): 96 | self.mean = state_dict['mean'] 97 | self.std = state_dict['std'] 98 | 99 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_humanoid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import mujoco_env 3 | from gym import utils 4 | 5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 6 | 7 | 8 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 9 | 10 | def __init__(self): 11 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5) 12 | utils.EzPickle.__init__(self) 13 | 14 | def _get_obs(self): 15 | data = self.sim.data 16 | return np.concatenate([data.qpos.flat[2:], 17 | data.qvel.flat, 18 | data.cinert.flat, 19 | data.cvel.flat, 20 | data.qfrc_actuator.flat, 21 | data.cfrc_ext.flat]) 22 | 23 | def step(self, a): 24 | data = self.sim.data 25 | action = a 26 | if getattr(self, 'action_space', None): 27 | action = np.clip(a, self.action_space.low, 28 | self.action_space.high) 29 | 30 | # reward 31 | alive_bonus = 5.0 32 | lin_vel_cost = 0.25 / 0.015 * data.qvel.flat[0] 33 | quad_ctrl_cost = 0.1 * np.square(action).sum() 34 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 35 | quad_impact_cost = min(quad_impact_cost, 10) 36 | 37 | self.do_simulation(action, self.frame_skip) 38 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 39 | qpos = self.sim.data.qpos 40 | done = 
bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 41 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, 42 | reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 43 | 44 | def reset_model(self): 45 | c = 0.01 46 | self.set_state( 47 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 48 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 49 | ) 50 | return self._get_obs() 51 | 52 | def viewer_setup(self): 53 | self.viewer.cam.trackbodyid = 1 54 | self.viewer.cam.distance = self.model.stat.extent * 1.0 55 | self.viewer.cam.lookat[2] += .8 56 | self.viewer.cam.elevation = -20 57 | 58 | def cost_np_vec(self, obs, acts, next_obs): 59 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1) 60 | reward_run = 0.25 / 0.015 * obs[:, 22] 61 | 62 | quad_impact_cost = .5e-6 * np.square(obs[:, -84:]).sum() 63 | quad_impact_cost = min(quad_impact_cost, 10) 64 | 65 | height = next_obs[:, 0] 66 | done = np.logical_or((height > 2.0), (height < 1.0)) 67 | alive_reward = 5 * (1.0 - np.array(done, dtype=np.float)) 68 | 69 | reward = reward_run + reward_ctrl + (-quad_impact_cost) + alive_reward 70 | return -reward 71 | 72 | def cost_tf_vec(self, obs, acts, next_obs): 73 | raise NotImplementedError 74 | 75 | def mb_step(self, states, actions, next_states): 76 | # returns rewards and dones 77 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 78 | if getattr(self, 'action_space', None): 79 | actions = np.clip(actions, self.action_space.low, 80 | self.action_space.high) 81 | rewards = - self.cost_np_vec(states, actions, next_states) 82 | 83 | height = next_states[:, 0] 84 | done = np.logical_or((height > 2.0), (height < 1.0)) 85 | return rewards, done 86 | 87 | 88 | -------------------------------------------------------------------------------- /mbpo_pytorch/misc/distributions.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import torch 4 | from torch.distributions import Distribution, Normal 5 | import math 6 | 7 | 8 | class TanhNormal(Distribution, ABC): 9 | """ 10 | Represent distribution of X where 11 | Z ~ N(mean, std) 12 | X ~ tanh(Z) 13 | Note: this is not very numerically stable. 14 | """ 15 | def __init__(self, mean, std, epsilon=1e-6): 16 | """ 17 | :param mean: Mean of the normal distribution 18 | :param std: Std of the normal distribution 19 | :param epsilon: Numerical stability epsilon when computing log-prob. 
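        Given x = tanh(z) with z ~ N(mean, std), log_prob below applies the change of
        variables log p(x) = log N(z; mean, std) - log(1 - x**2 + epsilon), recovering
        z as atanh(x) = 0.5 * log((1 + x) / (1 - x)) when no pre-tanh value is passed.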
20 | """ 21 | super().__init__() 22 | self.normal_mean = mean 23 | self.normal_std = std 24 | self.normal = Normal(mean, std) 25 | self.epsilon = epsilon 26 | 27 | def log_prob(self, value, pre_tanh_value=None): 28 | if pre_tanh_value is None: 29 | pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2 30 | return self.normal.log_prob(pre_tanh_value) - torch.log(1 - value * value + self.epsilon) 31 | 32 | def log_probs(self, value, pre_tanh_value): 33 | return self.log_prob(value, pre_tanh_value).sum(-1, keepdim=True) 34 | 35 | def sample(self, sample_shape=torch.Size([])): 36 | z = self.normal.sample(sample_shape) 37 | return torch.tanh(z), z 38 | 39 | def rsample(self, sample_shape=torch.Size([]), return_pretanh_value=False): 40 | z = ( 41 | self.normal_mean + 42 | self.normal_std * 43 | Normal( 44 | torch.zeros_like(self.normal_mean), 45 | torch.ones_like(self.normal_std) 46 | ).sample() 47 | ) 48 | z.requires_grad_() 49 | return torch.tanh(z), z 50 | 51 | def entropy(self): 52 | return self.normal.entropy().sum(-1) 53 | 54 | def mode(self): 55 | return torch.tan(self.normal_mean), self.normal_mean 56 | 57 | 58 | class FixedLimitedEntNormal(torch.distributions.Normal, ABC): 59 | def log_probs(self, actions): 60 | return super().log_prob(actions).sum(-1, keepdim=True) 61 | 62 | def entropy(self): 63 | limit = 2. 64 | lo, hi = (-limit - self.loc) / self.scale / math.sqrt(2), (limit - self.loc) / self.scale / math.sqrt(2) 65 | return (0.5 * (self.scale.log() + math.log(2 * math.pi) / 2) * (hi.erf() - lo.erf()) + 0.5 * 66 | (torch.exp(-hi * hi) * hi - torch.exp(-lo * lo) * lo)).sum(-1) 67 | 68 | def mode(self): 69 | return self.mean 70 | 71 | 72 | class FixedCategorical(torch.distributions.Categorical, ABC): 73 | def sample(self, **kwargs): 74 | return super().sample(**kwargs).unsqueeze(-1) 75 | 76 | def log_probs(self, actions): 77 | return ( 78 | super() 79 | .log_prob(actions.squeeze(-1)) 80 | .view(actions.size(0), -1) 81 | .sum(-1) 82 | .unsqueeze(-1) 83 | ) 84 | 85 | def mode(self): 86 | return self.probs.argmax(dim=-1, keepdim=True) 87 | 88 | 89 | class FixedNormal(torch.distributions.Normal, ABC): 90 | 91 | def log_probs(self, actions): 92 | return super().log_prob(actions).sum(-1, keepdim=True) 93 | 94 | def entropy(self): 95 | return super().entropy().sum(-1) 96 | 97 | def mode(self): 98 | return self.mean 99 | 100 | 101 | class FixedBernoulli(torch.distributions.Bernoulli, ABC): 102 | 103 | def log_probs(self, actions): 104 | return super().log_prob(actions).view(actions.size(0), -1).sum(-1, keepdim=True) 105 | 106 | def entropy(self): 107 | return super().entropy().sum(-1) 108 | 109 | def mode(self): 110 | return torch.gt(self.probs, 0.5).float() 111 | 112 | -------------------------------------------------------------------------------- /mbpo_pytorch/misc/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import random 5 | from typing import List, Optional 6 | 7 | import numpy as np 8 | import torch 9 | 10 | from mbpo_pytorch.envs.wrapped_envs import make_vec_envs, get_vec_normalize 11 | from mbpo_pytorch.misc import logger 12 | from mbpo_pytorch.thirdparty.summary_writer import FixedSummaryWriter as SummaryWriter 13 | 14 | 15 | def log_and_write(writer: Optional[SummaryWriter], log_infos: List, global_step: int): 16 | for idx, (name, value) in enumerate(log_infos): 17 | logger.logkv('{}.'.format(idx) + name.split('/')[-1], value) 18 | if writer and name.find('/') > -1: 19 | 
writer.add_scalar(name, value, global_step=global_step) 20 | logger.dumpkvs() 21 | 22 | 23 | def evaluate(actor, env_name, seed, num_episode, eval_log_dir, 24 | device, norm_reward=False, norm_obs=True, obs_rms=None): 25 | eval_envs = make_vec_envs(env_name, seed, 1, None, eval_log_dir, device, allow_early_resets=True, 26 | norm_obs=norm_obs, norm_reward=norm_reward) 27 | vec_norm = get_vec_normalize(eval_envs) 28 | if vec_norm is not None and norm_obs: 29 | assert obs_rms is not None 30 | vec_norm.training = False 31 | vec_norm.obs_rms = obs_rms 32 | 33 | eval_episode_rewards = [] 34 | eval_episode_lengths = [] 35 | 36 | states = eval_envs.reset() 37 | while len(eval_episode_rewards) < num_episode: 38 | with torch.no_grad(): 39 | actions = actor.act(states, deterministic=True)['actions'] 40 | 41 | states, _, _, infos = eval_envs.step(actions) 42 | 43 | eval_episode_rewards.extend([info['episode']['r'] for info in infos if 'episode' in info]) 44 | eval_episode_lengths.extend([info['episode']['l'] for info in infos if 'episode' in info]) 45 | 46 | eval_envs.close() 47 | 48 | return eval_episode_rewards, eval_episode_lengths 49 | 50 | 51 | # noinspection PyUnresolvedReferences 52 | def set_seed(seed: int, strict=False): 53 | np.random.seed(seed) 54 | torch.manual_seed(np.random.randint(2 ** 30)) 55 | random.seed(np.random.randint(2 ** 30)) 56 | try: 57 | torch.cuda.manual_seed_all(np.random.randint(2 ** 30)) 58 | if strict: 59 | torch.backends.cudnn.deterministic = True 60 | torch.backends.cudnn.benchmark = False 61 | except AttributeError: 62 | pass 63 | 64 | 65 | def get_seed(): 66 | return random.randint(0, 2 ** 32 - 1) 67 | 68 | 69 | def commit_and_save(proj_dir: str, save_dir: Optional[str] = None, auto_save: bool = False): 70 | import shutil 71 | if save_dir and auto_save: 72 | shutil.copytree(proj_dir, save_dir + '/code', ignore=shutil.ignore_patterns('result', 'data', 'ref')) 73 | 74 | 75 | def merge_dicts(dicts, merge_fn): 76 | new_dict = {k: [dic[k] for dic in dicts] for k in dicts[0]} 77 | new_dict = {k: merge_fn(v) for k, v in new_dict.items()} 78 | return new_dict 79 | 80 | 81 | def init_logging(config, hparam_dict): 82 | import datetime 83 | current_time = datetime.datetime.now().strftime('%b%d_%H%M%S') 84 | log_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'log') 85 | eval_log_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'log_eval') 86 | save_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'save') 87 | os.makedirs(log_dir, exist_ok=True) 88 | os.makedirs(eval_log_dir, exist_ok=True) 89 | os.makedirs(save_dir, exist_ok=True) 90 | writer = SummaryWriter(log_dir=log_dir) 91 | writer.add_hparams(hparam_dict, metric_dict={}) 92 | 93 | logger.configure(log_dir, None, config.log_email, config.proj_name) 94 | logger.info('Hyperparms:') 95 | for key, value in hparam_dict.items(): 96 | logger.log('{:35s}: {}'.format(key, value)) 97 | 98 | return writer, log_dir, eval_log_dir, save_dir 99 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/pets_reacher.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from gym import utils 5 | from gym.envs.mujoco import mujoco_env 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class Reacher3DEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv): 11 | 12 | def __init__(self): 13 | self.viewer = 
None 14 | utils.EzPickle.__init__(self) 15 | dir_path = os.path.dirname(os.path.realpath(__file__)) 16 | self.goal = np.zeros(3) 17 | mujoco_env.MujocoEnv.__init__(self, os.path.join(dir_path, 'assets/reacher3d.xml'), 2) 18 | 19 | def step(self, a): 20 | self.do_simulation(a, self.frame_skip) 21 | ob = self._get_obs() 22 | reward = -np.sum(np.square(self.get_EE_pos(ob[None]) - self.goal)) 23 | reward -= 0.01 * np.square(a).sum() 24 | done = False 25 | return ob, reward, done, dict(reward_dist=0, reward_ctrl=0) 26 | 27 | def viewer_setup(self): 28 | self.viewer.cam.trackbodyid = 1 29 | self.viewer.cam.distance = 2.5 30 | self.viewer.cam.elevation = -30 31 | self.viewer.cam.azimuth = 270 32 | 33 | def reset_model(self): 34 | qpos, qvel = np.copy(self.init_qpos), np.copy(self.init_qvel) 35 | qpos[-3:] += np.random.normal(loc=0, scale=0.1, size=[3]) 36 | qvel[-3:] = 0 37 | self.goal = qpos[-3:] 38 | self.set_state(qpos, qvel) 39 | return self._get_obs() 40 | 41 | def _get_obs(self): 42 | raw_obs = np.concatenate([ 43 | self.sim.data.qpos.flat, self.sim.data.qvel.flat[:-3], 44 | ]) 45 | 46 | EE_pos = np.reshape(self.get_EE_pos(raw_obs[None]), [-1]) 47 | 48 | return np.concatenate([raw_obs, EE_pos]) 49 | 50 | def get_EE_pos(self, states): 51 | theta1, theta2, theta3, theta4, theta5, theta6, theta7 = \ 52 | states[:, :1], states[:, 1:2], states[:, 2:3], states[:, 3:4], states[:, 4:5], states[:, 5:6], states[:, 6:] 53 | 54 | rot_axis = np.concatenate([np.cos(theta2) * np.cos(theta1), np.cos(theta2) * np.sin(theta1), -np.sin(theta2)], 55 | axis=1) 56 | rot_perp_axis = np.concatenate([-np.sin(theta1), np.cos(theta1), np.zeros(theta1.shape)], axis=1) 57 | cur_end = np.concatenate([ 58 | 0.1 * np.cos(theta1) + 0.4 * np.cos(theta1) * np.cos(theta2), 59 | 0.1 * np.sin(theta1) + 0.4 * np.sin(theta1) * np.cos(theta2) - 0.188, 60 | -0.4 * np.sin(theta2) 61 | ], axis=1) 62 | 63 | for length, hinge, roll in [(0.321, theta4, theta3), (0.16828, theta6, theta5)]: 64 | perp_all_axis = np.cross(rot_axis, rot_perp_axis) 65 | x = np.cos(hinge) * rot_axis 66 | y = np.sin(hinge) * np.sin(roll) * rot_perp_axis 67 | z = -np.sin(hinge) * np.cos(roll) * perp_all_axis 68 | new_rot_axis = x + y + z 69 | new_rot_perp_axis = np.cross(new_rot_axis, rot_axis) 70 | new_rot_perp_axis[np.linalg.norm(new_rot_perp_axis, axis=1) < 1e-30] = \ 71 | rot_perp_axis[np.linalg.norm(new_rot_perp_axis, axis=1) < 1e-30] 72 | new_rot_perp_axis /= np.linalg.norm(new_rot_perp_axis, axis=1, keepdims=True) 73 | rot_axis, rot_perp_axis, cur_end = new_rot_axis, new_rot_perp_axis, cur_end + length * new_rot_axis 74 | 75 | return cur_end 76 | 77 | def cost_np_vec(self, obs, acts, next_obs): 78 | """ 79 | def obs_cost_fn(self, obs): 80 | self.ENV.goal = obs[:, 7: 10] 81 | ee_pos = obs[:, -3:] 82 | return np.sum(np.square(ee_pos - self.ENV.goal), axis=1) 83 | 84 | @staticmethod 85 | def ac_cost_fn(acs): 86 | return 0.01 * np.sum(np.square(acs), axis=1) 87 | """ 88 | reward_ctrl = -0.01 * np.sum(np.square(acts), axis=1) 89 | goal = obs[:, 7: 10] 90 | ee_pos = obs[:, -3:] 91 | 92 | reward = -np.sum(np.square(ee_pos - goal), axis=1) + reward_ctrl 93 | return -reward 94 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/assets/walker2d.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /mbpo_pytorch/models/actor_layer.py: 
-------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from mbpo_pytorch.misc.distributions import FixedNormal, FixedCategorical, FixedBernoulli, \ 7 | TanhNormal, FixedLimitedEntNormal 8 | from .utils import init 9 | 10 | 11 | class CategoricalActorLayer(nn.Module, ABC): 12 | def __init__(self, num_inputs, num_outputs): 13 | super(CategoricalActorLayer, self).__init__() 14 | 15 | self.logit_actor = nn.Linear(num_inputs, num_outputs) 16 | init(self.logit_actor, lambda x: nn.init.orthogonal_(x, 0.01), lambda x: nn.init.constant_(x, 0)) 17 | 18 | def forward(self, states): 19 | logits = self.logit_actor(states) 20 | return FixedCategorical(logits=logits) 21 | 22 | 23 | class GaussianActorLayer(nn.Module, ABC): 24 | def __init__(self, num_inputs, num_outputs, use_state_dependent_std): 25 | super(GaussianActorLayer, self).__init__() 26 | 27 | self.actor_mean = nn.Linear(num_inputs, num_outputs) 28 | init(self.actor_mean, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0)) 29 | self.use_state_dependent_std = use_state_dependent_std 30 | if self.use_state_dependent_std: 31 | self.actor_logstd = nn.Linear(num_inputs, num_outputs) 32 | init(self.actor_logstd, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0)) 33 | 34 | else: 35 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True) 36 | 37 | def forward(self, x): 38 | action_mean = self.actor_mean(x) 39 | 40 | if self.use_state_dependent_std: 41 | logstd = self.actor_logstd(x) 42 | else: 43 | logstd = self.logstd 44 | 45 | return FixedNormal(action_mean, logstd.exp()), action_mean, logstd 46 | 47 | 48 | class LimitedEntGaussianActorLayer(nn.Module, ABC): 49 | def __init__(self, num_inputs, num_outputs, use_state_dependent_std): 50 | super(LimitedEntGaussianActorLayer, self).__init__() 51 | 52 | self.mean_actor = nn.Linear(num_inputs, num_outputs) 53 | init(self.mean_actor, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0)) 54 | self.use_state_dependent_std = use_state_dependent_std 55 | if self.use_state_dependent_std: 56 | self.logstd_actor = nn.Linear(num_inputs, num_outputs) 57 | init(self.logstd_actor, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0)) 58 | 59 | else: 60 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True) 61 | self.logstd_actor = lambda _: self.logstd 62 | 63 | def forward(self, x): 64 | action_mean = self.mean_actor(x) 65 | logstd = self.logstd_actor(x) 66 | return FixedLimitedEntNormal(action_mean, logstd.exp()), action_mean, logstd 67 | 68 | 69 | class BernoulliActorLayer(nn.Module, ABC): 70 | def __init__(self, num_inputs, num_outputs): 71 | super(BernoulliActorLayer, self).__init__() 72 | 73 | self.logit_actor = nn.Linear(num_inputs, num_outputs) 74 | init(self.logit_actor, nn.init.orthogonal_, lambda x: nn.init. 
constant_(x, 0)) 75 | 76 | def forward(self, states): 77 | logits = self.logit_actor(states) 78 | return FixedBernoulli(logits=logits) 79 | 80 | 81 | class TanhGaussainActorLayer(nn.Module, ABC): 82 | def __init__(self, num_inputs, num_outputs, use_state_dependent_std, init_w=1e-3): 83 | super(TanhGaussainActorLayer, self).__init__() 84 | 85 | self.mean_actor = nn.Linear(num_inputs, num_outputs) 86 | init(self.mean_actor, lambda x: nn.init.uniform_(x, -init_w, init_w), 87 | lambda x: nn.init.uniform_(x, -init_w, init_w)) 88 | 89 | self.state_dependent_std = use_state_dependent_std 90 | if self.state_dependent_std: 91 | self.logstd_actor = nn.Linear(num_inputs, num_outputs) 92 | init(self.logstd_actor, lambda x: nn.init.uniform_(x, -init_w, init_w), 93 | lambda x: nn.init.uniform_(x, -init_w, init_w)) 94 | else: 95 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True) 96 | self.logstd_actor = lambda _: self.logstd 97 | 98 | def forward(self, states): 99 | action_means = self.mean_actor(states) 100 | action_logstds = self.logstd_actor(states) 101 | action_logstds = torch.clamp(action_logstds, -20, 2) 102 | 103 | return TanhNormal(action_means, action_logstds.exp()), torch.tanh(action_means), action_logstds 104 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/assets/ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 81 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/wrapped_envs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import os 3 | from typing import Optional, TYPE_CHECKING 4 | 5 | from gym.wrappers import TimeLimit 6 | import torch 7 | 8 | from .virtual_env import VecVirtualEnv 9 | from .benchmarking_envs.benchmarking_envs import make_benchmarking_env 10 | from mbpo_pytorch.thirdparty.base_vec_env import VecEnvWrapper 11 | from mbpo_pytorch.thirdparty.dummy_vec_env import DummyVecEnv 12 | from mbpo_pytorch.thirdparty.subproc_vec_env import SubprocVecEnv 13 | from mbpo_pytorch.thirdparty.vec_normalize import VecNormalize 14 | from mbpo_pytorch.thirdparty.monitor import Monitor 15 | 16 | if TYPE_CHECKING: 17 | from mbpo_pytorch.models.dynamics import BaseDynamics 18 | 19 | 20 | def make_env(env_id, seed, rank, log_dir, allow_early_resets, max_episode_steps): 21 | def _thunk(): 22 | env = make_benchmarking_env(env_id) 23 | env = TimeLimit(env, max_episode_steps) 24 | 25 | env.seed(seed + rank) 26 | log_dir_ = os.path.join(log_dir, str(rank)) if log_dir is not None else log_dir 27 | env = Monitor(env, log_dir_, allow_early_resets=allow_early_resets) 28 | 29 | return env 30 | 31 | return _thunk 32 | 33 | 34 | def make_vec_envs(env_name: str, 35 | seed: int, 36 | num_envs: int, 37 | gamma: Optional[float], 38 | log_dir: Optional[str], 39 | device: torch.device, 40 | allow_early_resets: bool, 41 | max_episode_steps: int = 1000, 42 | norm_reward=True, 43 | norm_obs=True, 44 | ): 45 | envs = [ 46 | make_env(env_name, seed, i, log_dir, allow_early_resets, max_episode_steps) 47 | for i in range(num_envs) 48 | ] 49 | 50 | if len(envs) > 1: 51 | envs = SubprocVecEnv(envs) 52 | else: 53 | envs = DummyVecEnv(envs) 54 | 55 | if len(envs.observation_space.shape) == 1: 56 | if gamma is None: 57 | envs = VecNormalize(envs, norm_reward=False, norm_obs=norm_obs) 58 | else: 59 | envs = VecNormalize(envs, gamma=gamma, 
norm_reward=norm_reward, norm_obs=norm_obs) 60 | 61 | envs = VecPyTorch(envs, device) 62 | 63 | return envs 64 | 65 | 66 | def make_vec_virtual_envs(env_name: str, 67 | dynamics: BaseDynamics, 68 | seed: int, 69 | num_envs: int, 70 | gamma: Optional[float], 71 | device: torch.device, 72 | norm_reward=False, 73 | norm_obs=False, 74 | **kwargs 75 | ): 76 | envs = VecVirtualEnv(dynamics, make_benchmarking_env(env_name), num_envs, seed, **kwargs) 77 | 78 | if (len(envs.observation_space.shape) == 1 and norm_obs) or norm_reward: 79 | if gamma is None: 80 | envs = VecNormalize(envs, norm_reward=False, norm_obs=norm_obs) 81 | else: 82 | envs = VecNormalize(envs, gamma=gamma, norm_reward=norm_reward, norm_obs=norm_obs) 83 | 84 | envs = VecPyTorch(envs, device) 85 | 86 | return envs 87 | 88 | 89 | class VecPyTorch(VecEnvWrapper): 90 | def __init__(self, venv, device): 91 | super(VecPyTorch, self).__init__(venv) 92 | self.device = device 93 | 94 | def reset(self): 95 | obs = self.venv.reset() 96 | obs = torch.from_numpy(obs).float().to(self.device) 97 | return obs 98 | 99 | def step_with_states(self, states: torch.Tensor, actions: torch.Tensor): 100 | if isinstance(actions, torch.LongTensor): 101 | actions = actions.squeeze(1) 102 | return self.venv.step_with_states(states, actions) 103 | 104 | def step_async(self, actions: torch.Tensor): 105 | if isinstance(actions, torch.LongTensor): 106 | actions = actions.squeeze(1) 107 | actions = actions.cpu().numpy() 108 | self.venv.step_async(actions) 109 | 110 | def step_wait(self): 111 | obs, reward, done, info = self.venv.step_wait() 112 | obs = torch.from_numpy(obs).float().to(self.device) 113 | reward = torch.from_numpy(reward).unsqueeze(dim=1).float() 114 | return obs, reward, done, info 115 | 116 | def env_method(self, method_name, *method_args, indices=None, **method_kwargs): 117 | new_method_args = [] 118 | new_method_kwargs = {} 119 | for method_arg in method_args: 120 | if type(method_arg) == torch.Tensor: 121 | new_method_args.append(method_arg.cpu().numpy()) 122 | for method_arg_k, method_arg_v in method_kwargs.items(): 123 | if type(method_arg_v) == torch.Tensor: 124 | new_method_kwargs[method_arg_k] = method_arg_v.cpu().numpy() 125 | self.venv.env_method(method_name, *new_method_args, indices, **new_method_kwargs) 126 | 127 | 128 | def get_vec_normalize(venv): 129 | if isinstance(venv, VecNormalize): 130 | return venv 131 | elif hasattr(venv, 'venv'): 132 | return get_vec_normalize(venv.venv) 133 | 134 | return None 135 | 136 | 137 | -------------------------------------------------------------------------------- /mbpo_pytorch/thirdparty/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import numpy as np 3 | from typing import Sequence 4 | 5 | from mbpo_pytorch.thirdparty.base_vec_env import VecEnv 6 | from mbpo_pytorch.thirdparty.util import copy_obs_dict, dict_to_obs, obs_space_info 7 | 8 | 9 | class DummyVecEnv(VecEnv): 10 | """ 11 | Creates a simple vectorized wrapper for multiple environments, calling each environment in sequence on the current 12 | Python process. This is useful for computationally simple environment such as ``cartpole-v1``, as the overhead of 13 | multiprocess or multithread outweighs the environment computation time. This can also be used for RL methods that 14 | require a vectorized environment, but that you want a single environments to train with. 
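    A minimal usage sketch (illustrative only; ``Pendulum-v0`` is just an example id, and the
    actions passed to ``step_async`` must be batched, one row per environment):

        import gym
        import numpy as np

        venv = DummyVecEnv([lambda: gym.make('Pendulum-v0') for _ in range(2)])
        obs = venv.reset()                                    # stacked observations, shape (2, 3)
        actions = np.stack([venv.action_space.sample() for _ in range(2)])
        venv.step_async(actions)
        obs, rewards, dones, infos = venv.step_wait()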
15 | 16 | :param env_fns: ([callable]) A list of functions that will create the environments 17 | (each callable returns a `Gym.Env` instance when called). 18 | """ 19 | 20 | def __init__(self, env_fns): 21 | self.envs = [fn() for fn in env_fns] 22 | env = self.envs[0] 23 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 24 | obs_space = env.observation_space 25 | self.keys, shapes, dtypes = obs_space_info(obs_space) 26 | 27 | self.buf_obs = OrderedDict([ 28 | (k, np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k])) 29 | for k in self.keys]) 30 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 31 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 32 | self.buf_infos = [{} for _ in range(self.num_envs)] 33 | self.actions = None 34 | self.metadata = env.metadata 35 | 36 | def step_async(self, actions): 37 | self.actions = actions 38 | 39 | def step_wait(self): 40 | for env_idx in range(self.num_envs): 41 | obs, self.buf_rews[env_idx], self.buf_dones[env_idx], self.buf_infos[env_idx] =\ 42 | self.envs[env_idx].step(self.actions[env_idx]) 43 | if self.buf_dones[env_idx]: 44 | # save final observation where user can get it, then reset 45 | self.buf_infos[env_idx]['terminal_observation'] = obs 46 | obs = self.envs[env_idx].reset() 47 | self._save_obs(env_idx, obs) 48 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), 49 | self.buf_infos.copy()) 50 | 51 | def seed(self, seed=None): 52 | seeds = list() 53 | for idx, env in enumerate(self.envs): 54 | seeds.append(env.seed(seed + idx)) 55 | return seeds 56 | 57 | def reset(self): 58 | for env_idx in range(self.num_envs): 59 | obs = self.envs[env_idx].reset() 60 | self._save_obs(env_idx, obs) 61 | return self._obs_from_buf() 62 | 63 | def close(self): 64 | for env in self.envs: 65 | env.close() 66 | 67 | def get_images(self, *args, **kwargs) -> Sequence[np.ndarray]: 68 | return [env.render(*args, mode='rgb_array', **kwargs) for env in self.envs] 69 | 70 | def render(self, *args, **kwargs): 71 | """ 72 | Gym environment rendering. If there are multiple environments then 73 | they are tiled together in one image via `BaseVecEnv.render()`. 74 | Otherwise (if `self.num_envs == 1`), we pass the render call directly to the 75 | underlying environment. 76 | 77 | Therefore, some arguments such as `mode` will have values that are valid 78 | only when `num_envs == 1`. 79 | 80 | :param mode: The rendering type. 
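        :return: when ``num_envs == 1``, whatever the single wrapped environment's ``render``
            call returns; otherwise the result of the tiled ``BaseVecEnv.render`` implementation.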
81 | """ 82 | if self.num_envs == 1: 83 | return self.envs[0].render(*args, **kwargs) 84 | else: 85 | return super().render(*args, **kwargs) 86 | 87 | def _save_obs(self, env_idx, obs): 88 | for key in self.keys: 89 | if key is None: 90 | self.buf_obs[key][env_idx] = obs 91 | else: 92 | self.buf_obs[key][env_idx] = obs[key] 93 | 94 | def _obs_from_buf(self): 95 | return dict_to_obs(self.observation_space, copy_obs_dict(self.buf_obs)) 96 | 97 | def get_attr(self, attr_name, indices=None): 98 | """Return attribute from vectorized environment (see base class).""" 99 | target_envs = self._get_target_envs(indices) 100 | return [getattr(env_i, attr_name) for env_i in target_envs] 101 | 102 | def set_attr(self, attr_name, value, indices=None): 103 | """Set attribute inside vectorized environments (see base class).""" 104 | target_envs = self._get_target_envs(indices) 105 | for env_i in target_envs: 106 | setattr(env_i, attr_name, value) 107 | 108 | def env_method(self, method_name, *method_args, indices=None, **method_kwargs): 109 | """Call instance methods of vectorized environments.""" 110 | target_envs = self._get_target_envs(indices) 111 | return [getattr(env_i, method_name)(*method_args, **method_kwargs) for env_i in target_envs] 112 | 113 | def _get_target_envs(self, indices): 114 | indices = self._get_indices(indices) 115 | return [self.envs[i] for i in indices] 116 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/pendulum.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from gym import spaces 4 | from gym.utils import seeding 5 | import numpy as np 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class PendulumEnv(BaseModelBasedEnv): 11 | metadata = { 12 | 'render.modes': ['human', 'rgb_array'], 13 | 'video.frames_per_second': 30 14 | } 15 | 16 | def __init__(self): 17 | self.max_speed = 8 18 | self.max_torque = 2. 19 | self.dt = .05 20 | self.viewer = None 21 | 22 | high = np.array([1., 1., self.max_speed]) 23 | self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,)) 24 | self.observation_space = spaces.Box(low=-high, high=high) 25 | 26 | self._seed() 27 | 28 | def _seed(self, seed=None): 29 | self.np_random, seed = seeding.np_random(seed) 30 | return [seed] 31 | 32 | def step(self, u): 33 | th, thdot = self.state # th := theta 34 | ''' 35 | theta, thetadot = self.state 36 | return np.array([np.cos(theta), np.sin(theta), thetadot]) 37 | ''' 38 | 39 | # for the reward 40 | y, x, thetadot = np.cos(th), np.sin(th), thdot 41 | u = np.clip(u, -self.max_torque, self.max_torque)[0] 42 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2) 43 | reward = -costs 44 | 45 | g = 10. 46 | m = 1. 47 | l = 1. 48 | dt = self.dt 49 | 50 | self.last_u = u # for rendering 51 | # costs = angle_normalize(th) ** 2 + .1 * thdot ** 2 + .001 * (u ** 2) 52 | 53 | newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. 
/ (m * l ** 2) * u) * dt 54 | newth = th + newthdot * dt 55 | newthdot = np.clip(newthdot, -self.max_speed, self.max_speed) # pylint: disable=E1111 56 | 57 | self.state = np.array([newth, newthdot]) 58 | return self._get_obs(), reward, False, {} 59 | 60 | def _reset(self): 61 | high = np.array([np.pi, 1]) 62 | self.state = self.np_random.uniform(low=-high, high=high) 63 | self.last_u = None 64 | return self._get_obs() 65 | 66 | def _get_obs(self): 67 | theta, thetadot = self.state 68 | return np.array([np.cos(theta), np.sin(theta), thetadot]) 69 | 70 | def render(self, mode='human', close=False): 71 | if close: 72 | if self.viewer is not None: 73 | self.viewer.close() 74 | self.viewer = None 75 | return 76 | 77 | if self.viewer is None: 78 | from gym.envs.classic_control import rendering 79 | self.viewer = rendering.Viewer(500, 500) 80 | self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2) 81 | rod = rendering.make_capsule(1, .2) 82 | rod.set_color(.8, .3, .3) 83 | self.pole_transform = rendering.Transform() 84 | rod.add_attr(self.pole_transform) 85 | self.viewer.add_geom(rod) 86 | axle = rendering.make_circle(.05) 87 | axle.set_color(0, 0, 0) 88 | self.viewer.add_geom(axle) 89 | fname = path.join(path.dirname(__file__), "assets/clockwise.png") 90 | self.img = rendering.Image(fname, 1., 1.) 91 | self.imgtrans = rendering.Transform() 92 | self.img.add_attr(self.imgtrans) 93 | 94 | self.viewer.add_onetime(self.img) 95 | self.pole_transform.set_rotation(self.state[0] + np.pi / 2) 96 | if self.last_u: 97 | self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2) 98 | 99 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array')) 100 | 101 | def mb_step(self, states, actions, next_states): 102 | # returns rewards and dones 103 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 104 | if getattr(self, 'action_space', None): 105 | actions = np.clip(actions, self.action_space.low, 106 | self.action_space.high) 107 | rewards = - self.cost_np_vec(states, actions, next_states) 108 | return rewards, np.zeros_like(rewards, dtype=np.bool) 109 | 110 | def cost_np_vec(self, obs, acts, next_obs): 111 | """ 112 | dist_vec = obs[:, -3:] 113 | reward_dist = - np.linalg.norm(dist_vec, axis=1) 114 | reward_ctrl = - np.sum(np.square(acts), axis=1) 115 | reward = reward_dist + reward_ctrl 116 | 117 | # for the reward 118 | y, x, thetadot = np.cos(th), np.sin(th), thdot 119 | u = np.clip(u, -self.max_torque, self.max_torque)[0] 120 | costs = y + .1 * x + .1 * (thetadot ** 2) + .001 * (u ** 2) 121 | reward = -costs 122 | 123 | def _get_obs(self): 124 | theta, thetadot = self.state 125 | return np.array([np.cos(theta), np.sin(theta), thetadot]) 126 | 127 | """ 128 | y, x, thetadot = obs[:, 0], obs[:, 1], obs[:, 2] 129 | u = np.clip(acts[:, 0], -self.max_torque, self.max_torque) 130 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2) 131 | return costs 132 | 133 | def angle_normalize(x): 134 | return (((x + np.pi) % (2 * np.pi)) - np.pi) 135 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_pendulumO01.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | import gym 4 | from gym import spaces 5 | from gym.utils import seeding 6 | import numpy as np 7 | 8 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 9 | 10 | 11 | class PendulumEnv(BaseModelBasedEnv): 12 | metadata = { 13 | 'render.modes': 
['human', 'rgb_array'], 14 | 'video.frames_per_second': 30 15 | } 16 | 17 | def __init__(self): 18 | self.max_speed = 8 19 | self.max_torque = 2. 20 | self.dt = .05 21 | self.viewer = None 22 | 23 | high = np.array([1., 1., self.max_speed]) 24 | self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,)) 25 | self.observation_space = spaces.Box(low=-high, high=high) 26 | 27 | self._seed() 28 | 29 | def _seed(self, seed=None): 30 | self.np_random, seed = seeding.np_random(seed) 31 | return [seed] 32 | 33 | def step(self, u): 34 | th, thdot = self.state # th := theta 35 | ''' 36 | theta, thetadot = self.state 37 | return np.array([np.cos(theta), np.sin(theta), thetadot]) 38 | ''' 39 | 40 | # for the reward 41 | y, x, thetadot = np.cos(th), np.sin(th), thdot 42 | u = np.clip(u, -self.max_torque, self.max_torque)[0] 43 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2) 44 | reward = -costs 45 | 46 | g = 10. 47 | m = 1. 48 | l = 1. 49 | dt = self.dt 50 | 51 | self.last_u = u # for rendering 52 | # costs = angle_normalize(th) ** 2 + .1 * thdot ** 2 + .001 * (u ** 2) 53 | 54 | newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l ** 2) * u) * dt 55 | newth = th + newthdot * dt 56 | newthdot = np.clip(newthdot, -self.max_speed, self.max_speed) # pylint: disable=E1111 57 | 58 | self.state = np.array([newth, newthdot]) 59 | ob = self._get_obs() 60 | ob += np.random.uniform(low=-0.1, high=0.1, size=ob.shape) 61 | return ob, reward, False, {} 62 | 63 | def _reset(self): 64 | high = np.array([np.pi, 1]) 65 | self.state = self.np_random.uniform(low=-high, high=high) 66 | self.last_u = None 67 | return self._get_obs() 68 | 69 | def _get_obs(self): 70 | theta, thetadot = self.state 71 | return np.array([np.cos(theta), np.sin(theta), thetadot]) 72 | 73 | def render(self, mode='human', close=False): 74 | if close: 75 | if self.viewer is not None: 76 | self.viewer.close() 77 | self.viewer = None 78 | return 79 | 80 | if self.viewer is None: 81 | from gym.envs.classic_control import rendering 82 | self.viewer = rendering.Viewer(500, 500) 83 | self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2) 84 | rod = rendering.make_capsule(1, .2) 85 | rod.set_color(.8, .3, .3) 86 | self.pole_transform = rendering.Transform() 87 | rod.add_attr(self.pole_transform) 88 | self.viewer.add_geom(rod) 89 | axle = rendering.make_circle(.05) 90 | axle.set_color(0, 0, 0) 91 | self.viewer.add_geom(axle) 92 | fname = path.join(path.dirname(__file__), "assets/clockwise.png") 93 | self.img = rendering.Image(fname, 1., 1.) 
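            # the clockwise-arrow image is given its own Transform; its scale is set from
            # self.last_u further down, so the arrow's length and direction reflect the most
            # recently applied torque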
94 | self.imgtrans = rendering.Transform() 95 | self.img.add_attr(self.imgtrans) 96 | 97 | self.viewer.add_onetime(self.img) 98 | self.pole_transform.set_rotation(self.state[0] + np.pi / 2) 99 | if self.last_u: 100 | self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2) 101 | 102 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array')) 103 | 104 | def cost_np_vec(self, obs, acts, next_obs): 105 | """ 106 | dist_vec = obs[:, -3:] 107 | reward_dist = - np.linalg.norm(dist_vec, axis=1) 108 | reward_ctrl = - np.sum(np.square(acts), axis=1) 109 | reward = reward_dist + reward_ctrl 110 | 111 | # for the reward 112 | y, x, thetadot = np.cos(th), np.sin(th), thdot 113 | u = np.clip(u, -self.max_torque, self.max_torque)[0] 114 | costs = y + .1 * x + .1 * (thetadot ** 2) + .001 * (u ** 2) 115 | reward = -costs 116 | 117 | def _get_obs(self): 118 | theta, thetadot = self.state 119 | return np.array([np.cos(theta), np.sin(theta), thetadot]) 120 | 121 | """ 122 | y, x, thetadot = obs[:, 0], obs[:, 1], obs[:, 2] 123 | u = np.clip(acts[:, 0], -self.max_torque, self.max_torque) 124 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2) 125 | return costs 126 | 127 | def mb_step(self, states, actions, next_states): 128 | # returns rewards and dones 129 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 130 | if getattr(self, 'action_space', None): 131 | actions = np.clip(actions, self.action_space.low, 132 | self.action_space.high) 133 | rewards = - self.cost_np_vec(states, actions, next_states) 134 | return rewards, np.zeros_like(rewards, dtype=np.bool) 135 | 136 | def angle_normalize(x): 137 | return (((x + np.pi) % (2 * np.pi)) - np.pi) 138 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_pendulumO001.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | import gym 4 | from gym import spaces 5 | from gym.utils import seeding 6 | import numpy as np 7 | 8 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 9 | 10 | 11 | class PendulumEnv(BaseModelBasedEnv): 12 | metadata = { 13 | 'render.modes': ['human', 'rgb_array'], 14 | 'video.frames_per_second': 30 15 | } 16 | 17 | def __init__(self): 18 | self.max_speed = 8 19 | self.max_torque = 2. 20 | self.dt = .05 21 | self.viewer = None 22 | 23 | high = np.array([1., 1., self.max_speed]) 24 | self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,)) 25 | self.observation_space = spaces.Box(low=-high, high=high) 26 | 27 | self._seed() 28 | 29 | def _seed(self, seed=None): 30 | self.np_random, seed = seeding.np_random(seed) 31 | return [seed] 32 | 33 | def step(self, u): 34 | th, thdot = self.state # th := theta 35 | ''' 36 | theta, thetadot = self.state 37 | return np.array([np.cos(theta), np.sin(theta), thetadot]) 38 | ''' 39 | 40 | # for the reward 41 | y, x, thetadot = np.cos(th), np.sin(th), thdot 42 | u = np.clip(u, -self.max_torque, self.max_torque)[0] 43 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2) 44 | reward = -costs 45 | 46 | g = 10. 47 | m = 1. 48 | l = 1. 49 | dt = self.dt 50 | 51 | self.last_u = u # for rendering 52 | # costs = angle_normalize(th) ** 2 + .1 * thdot ** 2 + .001 * (u ** 2) 53 | 54 | newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. 
/ (m * l ** 2) * u) * dt 55 | newth = th + newthdot * dt 56 | newthdot = np.clip(newthdot, -self.max_speed, self.max_speed) # pylint: disable=E1111 57 | 58 | self.state = np.array([newth, newthdot]) 59 | ob = self._get_obs() 60 | ob += np.random.uniform(low=-0.01, high=0.01, size=ob.shape) 61 | return ob, reward, False, {} 62 | 63 | def _reset(self): 64 | high = np.array([np.pi, 1]) 65 | self.state = self.np_random.uniform(low=-high, high=high) 66 | self.last_u = None 67 | return self._get_obs() 68 | 69 | def _get_obs(self): 70 | theta, thetadot = self.state 71 | return np.array([np.cos(theta), np.sin(theta), thetadot]) 72 | 73 | def render(self, mode='human', close=False): 74 | if close: 75 | if self.viewer is not None: 76 | self.viewer.close() 77 | self.viewer = None 78 | return 79 | 80 | if self.viewer is None: 81 | from gym.envs.classic_control import rendering 82 | self.viewer = rendering.Viewer(500, 500) 83 | self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2) 84 | rod = rendering.make_capsule(1, .2) 85 | rod.set_color(.8, .3, .3) 86 | self.pole_transform = rendering.Transform() 87 | rod.add_attr(self.pole_transform) 88 | self.viewer.add_geom(rod) 89 | axle = rendering.make_circle(.05) 90 | axle.set_color(0, 0, 0) 91 | self.viewer.add_geom(axle) 92 | fname = path.join(path.dirname(__file__), "assets/clockwise.png") 93 | self.img = rendering.Image(fname, 1., 1.) 94 | self.imgtrans = rendering.Transform() 95 | self.img.add_attr(self.imgtrans) 96 | 97 | self.viewer.add_onetime(self.img) 98 | self.pole_transform.set_rotation(self.state[0] + np.pi / 2) 99 | if self.last_u: 100 | self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2) 101 | 102 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array')) 103 | 104 | def cost_np_vec(self, obs, acts, next_obs): 105 | """ 106 | dist_vec = obs[:, -3:] 107 | reward_dist = - np.linalg.norm(dist_vec, axis=1) 108 | reward_ctrl = - np.sum(np.square(acts), axis=1) 109 | reward = reward_dist + reward_ctrl 110 | 111 | # for the reward 112 | y, x, thetadot = np.cos(th), np.sin(th), thdot 113 | u = np.clip(u, -self.max_torque, self.max_torque)[0] 114 | costs = y + .1 * x + .1 * (thetadot ** 2) + .001 * (u ** 2) 115 | reward = -costs 116 | 117 | def _get_obs(self): 118 | theta, thetadot = self.state 119 | return np.array([np.cos(theta), np.sin(theta), thetadot]) 120 | 121 | """ 122 | y, x, thetadot = obs[:, 0], obs[:, 1], obs[:, 2] 123 | u = np.clip(acts[:, 0], -self.max_torque, self.max_torque) 124 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2) 125 | return costs 126 | 127 | def mb_step(self, states, actions, next_states): 128 | # returns rewards and dones 129 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 130 | if getattr(self, 'action_space', None): 131 | actions = np.clip(actions, self.action_space.low, 132 | self.action_space.high) 133 | rewards = - self.cost_np_vec(states, actions, next_states) 134 | return rewards, np.zeros_like(rewards, dtype=np.bool) 135 | 136 | def angle_normalize(x): 137 | return (((x + np.pi) % (2 * np.pi)) - np.pi) 138 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/assets/pusher.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 92 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/assets/half_cheetah.xml: 
-------------------------------------------------------------------------------- 1 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 96 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/assets/half_cheetah.xml: -------------------------------------------------------------------------------- 1 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 96 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_cartpoleO001.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from gym import spaces 4 | from gym.utils import seeding 5 | import numpy as np 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class CartPoleEnv(BaseModelBasedEnv): 11 | metadata = { 12 | 'render.modes': ['human', 'rgb_array'], 13 | 'video.frames_per_second': 50 14 | } 15 | 16 | def __init__(self): 17 | self.gravity = 9.8 18 | self.masscart = 1.0 19 | self.masspole = 0.1 20 | self.total_mass = (self.masspole + self.masscart) 21 | self.length = 0.5 # actually half the pole's length 22 | self.polemass_length = (self.masspole * self.length) 23 | self.force_mag = 10.0 24 | self.tau = 0.02 # seconds between state updates 25 | 26 | # Angle at which to fail the episode 27 | self.theta_threshold_radians = 12 * 2 * math.pi / 360 28 | self.x_threshold = 2.4 29 | 30 | # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds 31 | high = np.array([ 32 | self.x_threshold * 2, 33 | np.finfo(np.float32).max, 34 | self.theta_threshold_radians * 2, 35 | np.finfo(np.float32).max]) 36 | 37 | # self.action_space = spaces.Discrete(2) 38 | self.action_space = \ 39 | spaces.Box(low=np.array([-1.0]), high=np.array([1.0])) 40 | self.observation_space = spaces.Box(-high, high) 41 | 42 | self._seed() 43 | self.viewer = None 44 | self.state = None 45 | 46 | self.steps_beyond_done = None 47 | 48 | def _seed(self, seed=None): 49 | self.np_random, seed = seeding.np_random(seed) 50 | return [seed] 51 | 52 | def step(self, action): 53 | action = 1 if action[0] > .0 else 0 54 | state = self.state 55 | obs = self.state 56 | reward = np.cos(obs[2]) - 0.01 * (obs[0] ** 2) 57 | 58 | x, x_dot, theta, theta_dot = state 59 | force = self.force_mag if action == 1 else -self.force_mag 60 | costheta = math.cos(theta) 61 | sintheta = math.sin(theta) 62 | temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass 63 | thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass)) 64 | xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass 65 | x = x + self.tau * x_dot 66 | x_dot = x_dot + self.tau * xacc 67 | theta = theta + self.tau * theta_dot 68 | theta_dot = theta_dot + self.tau * thetaacc 69 | self.state = (x, x_dot, theta, theta_dot) 70 | 71 | done = False 72 | self.steps_beyond_done = None 73 | 74 | ob = np.array(self.state) 75 | ob += np.random.uniform(low=-0.01, high=0.01, size=ob.shape) 76 | 77 | return ob, reward, done, {} 78 | 79 | def _reset(self): 80 | self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,)) 81 | self.steps_beyond_done = None 82 | return np.array(self.state) 83 | 84 | def render(self, mode='human', close=False): 85 | if close: 86 | if self.viewer is not None: 87 | self.viewer.close() 88 | self.viewer = None 89 | return 90 | 91 | screen_width = 
600 92 | screen_height = 400 93 | 94 | world_width = self.x_threshold * 2 95 | scale = screen_width / world_width 96 | carty = 100 # TOP OF CART 97 | polewidth = 10.0 98 | polelen = scale * 1.0 99 | cartwidth = 50.0 100 | cartheight = 30.0 101 | 102 | if self.viewer is None: 103 | from gym.envs.classic_control import rendering 104 | self.viewer = rendering.Viewer(screen_width, screen_height) 105 | l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2 106 | axleoffset = cartheight / 4.0 107 | cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) 108 | self.carttrans = rendering.Transform() 109 | cart.add_attr(self.carttrans) 110 | self.viewer.add_geom(cart) 111 | l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2 112 | pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) 113 | pole.set_color(.8, .6, .4) 114 | self.poletrans = rendering.Transform(translation=(0, axleoffset)) 115 | pole.add_attr(self.poletrans) 116 | pole.add_attr(self.carttrans) 117 | self.viewer.add_geom(pole) 118 | self.axle = rendering.make_circle(polewidth / 2) 119 | self.axle.add_attr(self.poletrans) 120 | self.axle.add_attr(self.carttrans) 121 | self.axle.set_color(.5, .5, .8) 122 | self.viewer.add_geom(self.axle) 123 | self.track = rendering.Line((0, carty), (screen_width, carty)) 124 | self.track.set_color(0, 0, 0) 125 | self.viewer.add_geom(self.track) 126 | 127 | if self.state is None: 128 | return None 129 | 130 | x = self.state 131 | cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART 132 | self.carttrans.set_translation(cartx, carty) 133 | self.poletrans.set_rotation(-x[2]) 134 | 135 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array')) 136 | 137 | def cost_np_vec(self, obs, acts, next_obs): 138 | x = obs[:, 0] 139 | theta = obs[:, 2] 140 | return -(np.cos(theta) - 0.01 * (x ** 2)) 141 | 142 | def mb_step(self, states, actions, next_states): 143 | # returns rewards and dones 144 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 145 | if getattr(self, 'action_space', None): 146 | actions = np.clip(actions, self.action_space.low, 147 | self.action_space.high) 148 | rewards = - self.cost_np_vec(states, actions, next_states) 149 | return rewards, np.zeros_like(rewards, dtype=np.bool) 150 | 151 | 152 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/cartpole.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from gym import spaces 4 | from gym.utils import seeding 5 | import numpy as np 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class CartPoleEnv(BaseModelBasedEnv): 11 | metadata = { 12 | 'render.modes': ['human', 'rgb_array'], 13 | 'video.frames_per_second': 50 14 | } 15 | 16 | def __init__(self): 17 | self.gravity = 9.8 18 | self.masscart = 1.0 19 | self.masspole = 0.1 20 | self.total_mass = (self.masspole + self.masscart) 21 | self.length = 0.5 # actually half the pole's length 22 | self.polemass_length = (self.masspole * self.length) 23 | self.force_mag = 10.0 24 | self.tau = 0.02 # seconds between state updates 25 | 26 | # Angle at which to fail the episode 27 | self.theta_threshold_radians = 12 * 2 * math.pi / 360 28 | self.x_threshold = 2.4 29 | 30 | # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds 31 | high = np.array([ 32 | self.x_threshold * 2, 33 | 
np.finfo(np.float32).max, 34 | self.theta_threshold_radians * 2, 35 | np.finfo(np.float32).max]) 36 | 37 | # self.action_space = spaces.Discrete(2) 38 | self.action_space = \ 39 | spaces.Box(low=np.array([-1.0]), high=np.array([1.0])) 40 | self.observation_space = spaces.Box(-high, high) 41 | 42 | self._seed() 43 | self.viewer = None 44 | self.state = None 45 | 46 | self.steps_beyond_done = None 47 | 48 | def _seed(self, seed=None): 49 | self.np_random, seed = seeding.np_random(seed) 50 | return [seed] 51 | 52 | def step(self, action): 53 | action = 1 if action[0] > .0 else 0 54 | # assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action)) 55 | state = self.state 56 | obs = self.state 57 | reward = np.cos(obs[2]) - 0.01 * (obs[0] ** 2) 58 | 59 | x, x_dot, theta, theta_dot = state 60 | force = self.force_mag if action == 1 else -self.force_mag 61 | costheta = math.cos(theta) 62 | sintheta = math.sin(theta) 63 | temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass 64 | thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass)) 65 | xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass 66 | x = x + self.tau * x_dot 67 | x_dot = x_dot + self.tau * xacc 68 | theta = theta + self.tau * theta_dot 69 | theta_dot = theta_dot + self.tau * thetaacc 70 | self.state = (x, x_dot, theta, theta_dot) 71 | done = False 72 | self.steps_beyond_done = None 73 | 74 | return np.array(self.state), reward, done, {} 75 | 76 | def _reset(self): 77 | self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,)) 78 | self.steps_beyond_done = None 79 | return np.array(self.state) 80 | 81 | def render(self, mode='human', close=False): 82 | if close: 83 | if self.viewer is not None: 84 | self.viewer.close() 85 | self.viewer = None 86 | return 87 | 88 | screen_width = 600 89 | screen_height = 400 90 | 91 | world_width = self.x_threshold * 2 92 | scale = screen_width / world_width 93 | carty = 100 # TOP OF CART 94 | polewidth = 10.0 95 | polelen = scale * 1.0 96 | cartwidth = 50.0 97 | cartheight = 30.0 98 | 99 | if self.viewer is None: 100 | from gym.envs.classic_control import rendering 101 | self.viewer = rendering.Viewer(screen_width, screen_height) 102 | l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2 103 | axleoffset = cartheight / 4.0 104 | cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) 105 | self.carttrans = rendering.Transform() 106 | cart.add_attr(self.carttrans) 107 | self.viewer.add_geom(cart) 108 | l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2 109 | pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) 110 | pole.set_color(.8, .6, .4) 111 | self.poletrans = rendering.Transform(translation=(0, axleoffset)) 112 | pole.add_attr(self.poletrans) 113 | pole.add_attr(self.carttrans) 114 | self.viewer.add_geom(pole) 115 | self.axle = rendering.make_circle(polewidth / 2) 116 | self.axle.add_attr(self.poletrans) 117 | self.axle.add_attr(self.carttrans) 118 | self.axle.set_color(.5, .5, .8) 119 | self.viewer.add_geom(self.axle) 120 | self.track = rendering.Line((0, carty), (screen_width, carty)) 121 | self.track.set_color(0, 0, 0) 122 | self.viewer.add_geom(self.track) 123 | 124 | if self.state is None: 125 | return None 126 | 127 | x = self.state 128 | cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART 129 | self.carttrans.set_translation(cartx, 
carty) 130 | self.poletrans.set_rotation(-x[2]) 131 | 132 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array')) 133 | 134 | def mb_step(self, states, actions, next_states): 135 | # returns rewards and dones 136 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 137 | if getattr(self, 'action_space', None): 138 | actions = np.clip(actions, self.action_space.low, 139 | self.action_space.high) 140 | rewards = - self.cost_np_vec(states, actions, next_states) 141 | return rewards, np.zeros_like(rewards, dtype=np.bool) 142 | 143 | def cost_np_vec(self, obs, acts, next_obs): 144 | x = obs[:, 0] 145 | theta = obs[:, 2] 146 | return -(np.cos(theta) - 0.01 * (x ** 2)) 147 | 148 | def verify(self, n=2000, eps=1e-4): 149 | pass -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/gym_cartpoleO01.py: -------------------------------------------------------------------------------- 1 | import math 2 | from gym import spaces 3 | from gym.utils import seeding 4 | import numpy as np 5 | 6 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 7 | 8 | 9 | class CartPoleEnv(BaseModelBasedEnv): 10 | metadata = { 11 | 'render.modes': ['human', 'rgb_array'], 12 | 'video.frames_per_second': 50 13 | } 14 | 15 | def __init__(self): 16 | self.gravity = 9.8 17 | self.masscart = 1.0 18 | self.masspole = 0.1 19 | self.total_mass = (self.masspole + self.masscart) 20 | self.length = 0.5 # actually half the pole's length 21 | self.polemass_length = (self.masspole * self.length) 22 | self.force_mag = 10.0 23 | self.tau = 0.02 # seconds between state updates 24 | 25 | # Angle at which to fail the episode 26 | self.theta_threshold_radians = 12 * 2 * math.pi / 360 27 | self.x_threshold = 2.4 28 | 29 | # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds 30 | high = np.array([ 31 | self.x_threshold * 2, 32 | np.finfo(np.float32).max, 33 | self.theta_threshold_radians * 2, 34 | np.finfo(np.float32).max]) 35 | 36 | # self.action_space = spaces.Discrete(2) 37 | self.action_space = \ 38 | spaces.Box(low=np.array([-1.0]), high=np.array([1.0])) 39 | self.observation_space = spaces.Box(-high, high) 40 | 41 | self._seed() 42 | self.viewer = None 43 | self.state = None 44 | 45 | self.steps_beyond_done = None 46 | 47 | def _seed(self, seed=None): 48 | self.np_random, seed = seeding.np_random(seed) 49 | return [seed] 50 | 51 | def step(self, action): 52 | action = 1 if action[0] > .0 else 0 53 | # assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action)) 54 | state = self.state 55 | obs = self.state 56 | reward = np.cos(obs[2]) - 0.01 * (obs[0] ** 2) 57 | 58 | x, x_dot, theta, theta_dot = state 59 | force = self.force_mag if action == 1 else -self.force_mag 60 | costheta = math.cos(theta) 61 | sintheta = math.sin(theta) 62 | temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass 63 | thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass)) 64 | xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass 65 | x = x + self.tau * x_dot 66 | x_dot = x_dot + self.tau * xacc 67 | theta = theta + self.tau * theta_dot 68 | theta_dot = theta_dot + self.tau * thetaacc 69 | self.state = (x, x_dot, theta, theta_dot) 70 | 71 | done = False 72 | self.steps_beyond_done = None 73 | 74 | ob = np.array(self.state) 75 
| ob += np.random.uniform(low=-0.1, high=0.1, size=ob.shape) 76 | 77 | return ob, reward, done, {} 78 | 79 | def _reset(self): 80 | self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,)) 81 | self.steps_beyond_done = None 82 | return np.array(self.state) 83 | 84 | def render(self, mode='human', close=False): 85 | if close: 86 | if self.viewer is not None: 87 | self.viewer.close() 88 | self.viewer = None 89 | return 90 | 91 | screen_width = 600 92 | screen_height = 400 93 | 94 | world_width = self.x_threshold * 2 95 | scale = screen_width / world_width 96 | carty = 100 # TOP OF CART 97 | polewidth = 10.0 98 | polelen = scale * 1.0 99 | cartwidth = 50.0 100 | cartheight = 30.0 101 | 102 | if self.viewer is None: 103 | from gym.envs.classic_control import rendering 104 | self.viewer = rendering.Viewer(screen_width, screen_height) 105 | l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2 106 | axleoffset = cartheight / 4.0 107 | cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) 108 | self.carttrans = rendering.Transform() 109 | cart.add_attr(self.carttrans) 110 | self.viewer.add_geom(cart) 111 | l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2 112 | pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) 113 | pole.set_color(.8, .6, .4) 114 | self.poletrans = rendering.Transform(translation=(0, axleoffset)) 115 | pole.add_attr(self.poletrans) 116 | pole.add_attr(self.carttrans) 117 | self.viewer.add_geom(pole) 118 | self.axle = rendering.make_circle(polewidth / 2) 119 | self.axle.add_attr(self.poletrans) 120 | self.axle.add_attr(self.carttrans) 121 | self.axle.set_color(.5, .5, .8) 122 | self.viewer.add_geom(self.axle) 123 | self.track = rendering.Line((0, carty), (screen_width, carty)) 124 | self.track.set_color(0, 0, 0) 125 | self.viewer.add_geom(self.track) 126 | 127 | if self.state is None: 128 | return None 129 | 130 | x = self.state 131 | cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART 132 | self.carttrans.set_translation(cartx, carty) 133 | self.poletrans.set_rotation(-x[2]) 134 | 135 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array')) 136 | 137 | def cost_np_vec(self, obs, acts, next_obs): 138 | x = obs[:, 0] 139 | theta = obs[:, 2] 140 | return -(np.cos(theta) - 0.01 * (x ** 2)) 141 | 142 | def mb_step(self, states, actions, next_states): 143 | # returns rewards and dones 144 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 145 | if getattr(self, 'action_space', None): 146 | actions = np.clip(actions, self.action_space.low, self.action_space.high) 147 | rewards = - self.cost_np_vec(states, actions, next_states) 148 | return rewards, np.zeros_like(rewards, dtype=np.bool) 149 | 150 | 151 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/mountain_car.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from gym import spaces 4 | from gym.utils import seeding 5 | import numpy as np 6 | 7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv 8 | 9 | 10 | class Continuous_MountainCarEnv(BaseModelBasedEnv): 11 | metadata = { 12 | 'render.modes': ['human', 'rgb_array'], 13 | 'video.frames_per_second': 30 14 | } 15 | 16 | def __init__(self): 17 | self.min_action = -1.0 18 | self.max_action = 1.0 19 | self.min_position = -1.2 20 | self.max_position = 0.6 21 | self.max_speed = 0.07 22 | 
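        # In this benchmarking variant the per-step reward (see step() and cost_np_vec() below)
        # is simply the cart's position and the episode never terminates; goal_position is kept
        # only so that render() can draw the goal flag.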
self.goal_position = 0.45 # was 0.5 in gym, 0.45 in Arnaud de Broissia's version 23 | self.power = 0.0015 24 | 25 | self.low_state = np.array([self.min_position, -self.max_speed]) 26 | self.high_state = np.array([self.max_position, self.max_speed]) 27 | 28 | self.viewer = None 29 | 30 | self.action_space = spaces.Box(self.min_action, self.max_action, shape=(1,)) 31 | self.observation_space = spaces.Box(self.low_state, self.high_state) 32 | 33 | self._seed() 34 | self.reset() 35 | 36 | def _seed(self, seed=None): 37 | self.np_random, seed = seeding.np_random(seed) 38 | return [seed] 39 | 40 | def step(self, action): 41 | 42 | position = self.state[0] 43 | velocity = self.state[1] 44 | force = min(max(action[0], -1.0), 1.0) 45 | reward = position 46 | 47 | velocity += force * self.power - 0.0025 * math.cos(3 * position) 48 | if (velocity > self.max_speed): 49 | velocity = self.max_speed 50 | if (velocity < -self.max_speed): 51 | velocity = -self.max_speed 52 | position += velocity 53 | if (position > self.max_position): 54 | position = self.max_position 55 | if (position < self.min_position): 56 | position = self.min_position 57 | if (position == self.min_position and velocity < 0): 58 | velocity = 0 59 | 60 | """ 61 | done = bool(position >= self.goal_position) 62 | 63 | reward = 0 64 | if done: 65 | reward = 100.0 66 | reward -= math.pow(action[0], 2) * 0.1 67 | 68 | """ 69 | done = False 70 | self.state = np.array([position, velocity]) 71 | return self.state, reward, done, {} 72 | 73 | def _reset(self): 74 | self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0]) 75 | return np.array(self.state) 76 | 77 | # def get_state(self): 78 | # return self.state 79 | 80 | def _height(self, xs): 81 | return np.sin(3 * xs) * .45 + .55 82 | 83 | def render(self, mode='human', close=False): 84 | if close: 85 | if self.viewer is not None: 86 | self.viewer.close() 87 | self.viewer = None 88 | return 89 | 90 | screen_width = 600 91 | screen_height = 400 92 | 93 | world_width = self.max_position - self.min_position 94 | scale = screen_width / world_width 95 | carwidth = 40 96 | carheight = 20 97 | 98 | if self.viewer is None: 99 | from gym.envs.classic_control import rendering 100 | self.viewer = rendering.Viewer(screen_width, screen_height) 101 | xs = np.linspace(self.min_position, self.max_position, 100) 102 | ys = self._height(xs) 103 | xys = list(zip((xs - self.min_position) * scale, ys * scale)) 104 | 105 | self.track = rendering.make_polyline(xys) 106 | self.track.set_linewidth(4) 107 | self.viewer.add_geom(self.track) 108 | 109 | clearance = 10 110 | 111 | l, r, t, b = -carwidth / 2, carwidth / 2, carheight, 0 112 | car = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) 113 | car.add_attr(rendering.Transform(translation=(0, clearance))) 114 | self.cartrans = rendering.Transform() 115 | car.add_attr(self.cartrans) 116 | self.viewer.add_geom(car) 117 | frontwheel = rendering.make_circle(carheight / 2.5) 118 | frontwheel.set_color(.5, .5, .5) 119 | frontwheel.add_attr(rendering.Transform(translation=(carwidth / 4, clearance))) 120 | frontwheel.add_attr(self.cartrans) 121 | self.viewer.add_geom(frontwheel) 122 | backwheel = rendering.make_circle(carheight / 2.5) 123 | backwheel.add_attr(rendering.Transform(translation=(-carwidth / 4, clearance))) 124 | backwheel.add_attr(self.cartrans) 125 | backwheel.set_color(.5, .5, .5) 126 | self.viewer.add_geom(backwheel) 127 | flagx = (self.goal_position - self.min_position) * scale 128 | flagy1 = self._height(self.goal_position) * scale 
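            # goal flag: a visual marker only in this variant; reaching goal_position neither
            # terminates the episode nor adds a bonus reward (that branch is commented out in
            # step() above)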
129 | flagy2 = flagy1 + 50 130 | flagpole = rendering.Line((flagx, flagy1), (flagx, flagy2)) 131 | self.viewer.add_geom(flagpole) 132 | flag = rendering.FilledPolygon([(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)]) 133 | flag.set_color(.8, .8, 0) 134 | self.viewer.add_geom(flag) 135 | 136 | pos = self.state[0] 137 | self.cartrans.set_translation((pos - self.min_position) * scale, self._height(pos) * scale) 138 | self.cartrans.set_rotation(math.cos(3 * pos)) 139 | 140 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array')) 141 | 142 | def mb_step(self, states, actions, next_states): 143 | # returns rewards and dones 144 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs 145 | if getattr(self, 'action_space', None): 146 | actions = np.clip(actions, self.action_space.low, 147 | self.action_space.high) 148 | rewards = - self.cost_np_vec(states, actions, next_states) 149 | return rewards, np.zeros_like(rewards, dtype=np.bool) 150 | 151 | def cost_np_vec(self, obs, acts, next_obs): 152 | """ 153 | position = self.state[0] 154 | velocity = self.state[1] 155 | force = min(max(action[0], -1.0), 1.0) 156 | reward = position 157 | """ 158 | position = obs[:, 0] 159 | return -position 160 | -------------------------------------------------------------------------------- /mbpo_pytorch/envs/benchmarking_envs/gym/assets/pusher.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 102 | -------------------------------------------------------------------------------- /mbpo_pytorch/algos/mfrl/sac.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from operator import itemgetter 4 | from typing import TYPE_CHECKING, Dict 5 | 6 | import torch 7 | from torch import nn as nn 8 | 9 | from mbpo_pytorch.models.utils import soft_update 10 | 11 | if TYPE_CHECKING: 12 | from mbpo_pytorch.models import Actor, QCritic 13 | from mbpo_pytorch.storages import SimpleUniversalBuffer as Buffer 14 | 15 | 16 | class SAC: 17 | def __init__( 18 | self, 19 | actor: Actor, 20 | q_critic1: QCritic, 21 | q_critic2: QCritic, 22 | target_q_critic1: QCritic, 23 | target_q_critic2: QCritic, 24 | batch_size: int, 25 | num_grad_steps: int, 26 | gamma=0.99, 27 | reward_scale=1.0, 28 | actor_lr=1e-3, 29 | critic_lr=1e-3, 30 | soft_target_tau=1e-2, 31 | target_update_interval=1, 32 | use_automatic_entropy_tuning=True, 33 | target_entropy=None, 34 | alpha=1.0, 35 | ): 36 | super(SAC).__init__() 37 | self.actor = actor 38 | self.q_critic1 = q_critic1 39 | self.q_critic2 = q_critic2 40 | self.target_q_critic1 = target_q_critic1 41 | self.target_q_critic2 = target_q_critic2 42 | self.soft_target_tau = soft_target_tau 43 | self.target_update_interval = target_update_interval 44 | 45 | self.batch_size = batch_size 46 | self.num_grad_steps = num_grad_steps 47 | 48 | self.use_automatic_entropy_tuning = use_automatic_entropy_tuning 49 | if self.use_automatic_entropy_tuning: 50 | self.target_entropy = torch.tensor(target_entropy) 51 | self.log_alpha = torch.zeros(1, requires_grad=True) 52 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=actor_lr) 53 | 54 | self._alpha = torch.tensor(alpha) 55 | 56 | self.qf_criterion = nn.MSELoss() 57 | 58 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr) 59 | self.qf1_optimizer = torch.optim.Adam(self.q_critic1.parameters(), lr=critic_lr) 60 | self.qf2_optimizer = 
torch.optim.Adam(self.q_critic2.parameters(), lr=critic_lr) 61 | 62 | self.gamma = gamma 63 | self.reward_scale = reward_scale 64 | self.total_num_updates = 0 65 | self._need_to_update_eval_statistics = True 66 | 67 | self._sync_target_network() 68 | 69 | @staticmethod 70 | def check_buffer(buffer): 71 | assert {'states', 'actions', 'rewards', 'masks', 'next_states'}.issubset(buffer.entry_infos.keys()) 72 | 73 | def _sync_target_network(self): 74 | soft_update(self.q_critic1, self.target_q_critic1, 1.0) 75 | soft_update(self.q_critic2, self.target_q_critic2, 1.0) 76 | 77 | def update(self, policy_buffer: Buffer) -> Dict[str, float]: 78 | 79 | data_generator = policy_buffer.get_batch_generator_inf(self.batch_size) 80 | 81 | self.actor.train() 82 | self.q_critic1.train() 83 | self.q_critic2.train() 84 | self.target_q_critic1.train() 85 | self.target_q_critic2.train() 86 | 87 | policy_loss_epoch = 0. 88 | qf1_loss_epoch = 0. 89 | qf2_loss_epoch = 0. 90 | alpha_loss_epoch = 0. 91 | 92 | for _ in range(self.num_grad_steps): 93 | 94 | samples = next(data_generator) 95 | 96 | states, actions, rewards, masks, next_states = \ 97 | itemgetter('states', 'actions', 'rewards', 'masks', 'next_states')(samples) 98 | 99 | new_actions, log_probs = itemgetter('actions', 'log_probs')(self.actor.act(states, reparameterize=True)) 100 | 101 | if self.use_automatic_entropy_tuning: 102 | alpha_loss = -(self.log_alpha.to(log_probs.device) * (log_probs + self.target_entropy).detach()).mean() 103 | self.alpha_optimizer.zero_grad() 104 | alpha_loss.backward() 105 | self.alpha_optimizer.step() 106 | alpha = self.log_alpha.exp() 107 | else: 108 | alpha_loss = torch.tensor([0.]) 109 | alpha = self._alpha 110 | 111 | alpha = alpha.to(log_probs.device) 112 | 113 | q_new_actions = torch.min(self.q_critic1(states, new_actions), 114 | self.q_critic2(states, new_actions)) 115 | policy_loss = (alpha * log_probs - q_new_actions).mean() 116 | 117 | q1_pred = self.q_critic1(states, actions) 118 | q2_pred = self.q_critic2(states, actions) 119 | 120 | new_next_actions, new_next_log_probs = \ 121 | itemgetter('actions', 'log_probs')(self.actor.act(next_states, reparameterize=True)) 122 | 123 | target_q_values = torch.min(self.target_q_critic1(next_states, new_next_actions), 124 | self.target_q_critic2(next_states, new_next_actions)) \ 125 | - alpha * new_next_log_probs 126 | 127 | q_target = self.reward_scale * rewards + masks * self.gamma * target_q_values 128 | qf1_loss = self.qf_criterion(q1_pred, q_target.detach()) 129 | qf2_loss = self.qf_criterion(q2_pred, q_target.detach()) 130 | 131 | self.actor_optimizer.zero_grad() 132 | policy_loss.backward() 133 | self.actor_optimizer.step() 134 | 135 | self.qf1_optimizer.zero_grad() 136 | qf1_loss.backward() 137 | self.qf1_optimizer.step() 138 | 139 | self.qf2_optimizer.zero_grad() 140 | qf2_loss.backward() 141 | self.qf2_optimizer.step() 142 | 143 | if self.total_num_updates % self.target_update_interval == 0: 144 | soft_update(self.q_critic1, self.target_q_critic1, self.soft_target_tau) 145 | soft_update(self.q_critic2, self.target_q_critic2, self.soft_target_tau) 146 | 147 | self.total_num_updates += 1 148 | 149 | policy_loss_epoch += policy_loss.item() 150 | qf1_loss_epoch += qf1_loss.item() 151 | qf2_loss_epoch += qf2_loss.item() 152 | alpha_loss_epoch += alpha_loss.item() 153 | 154 | policy_loss_epoch /= self.num_grad_steps 155 | qf1_loss_epoch /= self.num_grad_steps 156 | qf2_loss_epoch /= self.num_grad_steps 157 | alpha_loss_epoch /= self.num_grad_steps 158 | 159 | return 
{'policy_loss': policy_loss_epoch, 'qf1_loss': qf1_loss_epoch, 160 | 'qf2_loss': qf2_loss_epoch, 'alpha_loss': alpha_loss_epoch} 161 | 162 | --------------------------------------------------------------------------------
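The SAC implementation above touches only a small surface of its collaborators: the actor must expose act(states, reparameterize=True) returning a dict with 'actions' and 'log_probs', the critics are nn.Module callables Q(states, actions), and the buffer must provide entry_infos plus get_batch_generator_inf(batch_size) yielding dicts with 'states', 'actions', 'rewards', 'masks' and 'next_states'. The smoke-test sketch below exercises exactly that interface; the Toy* classes are illustrative stand-ins, not the repository's real Actor, QCritic or SimpleUniversalBuffer (those live in mbpo_pytorch.models and mbpo_pytorch.storages and have richer constructors), so treat this as a sketch of the expected shapes rather than a training script.

import torch
import torch.nn as nn

from mbpo_pytorch.algos.mfrl.sac import SAC


class ToyActor(nn.Module):
    """Stand-in actor: returns the dict keys ('actions', 'log_probs') that SAC.update() reads.

    Note: this is a plain Normal policy squashed by tanh without the log-prob correction;
    the real repo uses TanhNormal from mbpo_pytorch.misc.distributions.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.mean = nn.Linear(state_dim, action_dim)
        self.logstd = nn.Parameter(torch.zeros(action_dim))

    def act(self, states, reparameterize=False, **kwargs):
        dist = torch.distributions.Normal(self.mean(states), self.logstd.exp())
        pre_tanh = dist.rsample() if reparameterize else dist.sample()
        log_probs = dist.log_prob(pre_tanh).sum(dim=-1, keepdim=True)
        return {'actions': torch.tanh(pre_tanh), 'log_probs': log_probs}


class ToyQCritic(nn.Module):
    """Stand-in Q-network: Q(s, a) as a single linear layer over [s, a]."""

    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.q = nn.Linear(state_dim + action_dim, 1)

    def forward(self, states, actions):
        return self.q(torch.cat([states, actions], dim=-1))


class ToyBuffer:
    """Stand-in buffer: yields random transition batches with the entries SAC expects."""

    def __init__(self, state_dim, action_dim):
        self.state_dim, self.action_dim = state_dim, action_dim
        self.entry_infos = {k: None for k in
                            ('states', 'actions', 'rewards', 'masks', 'next_states')}

    def get_batch_generator_inf(self, batch_size):
        while True:
            yield {
                'states': torch.randn(batch_size, self.state_dim),
                'actions': 2 * torch.rand(batch_size, self.action_dim) - 1,
                'rewards': torch.randn(batch_size, 1),
                'masks': torch.ones(batch_size, 1),
                'next_states': torch.randn(batch_size, self.state_dim),
            }


if __name__ == '__main__':
    state_dim, action_dim = 3, 1
    sac = SAC(
        actor=ToyActor(state_dim, action_dim),
        q_critic1=ToyQCritic(state_dim, action_dim),
        q_critic2=ToyQCritic(state_dim, action_dim),
        target_q_critic1=ToyQCritic(state_dim, action_dim),
        target_q_critic2=ToyQCritic(state_dim, action_dim),
        batch_size=32,
        num_grad_steps=5,
        target_entropy=-float(action_dim),
    )
    buffer = ToyBuffer(state_dim, action_dim)
    SAC.check_buffer(buffer)      # verifies the required buffer entries are present
    losses = sac.update(buffer)   # runs num_grad_steps SAC gradient steps
    print(losses)                 # {'policy_loss': ..., 'qf1_loss': ..., 'qf2_loss': ..., 'alpha_loss': ...}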