├── .gitignore
├── mbpo_pytorch
│   ├── __init__.py
│   ├── configs
│   │   ├── __init__.py
│   │   ├── ant.yaml
│   │   ├── hopper.yaml
│   │   ├── walker2d.yaml
│   │   ├── halfcheetah.yaml
│   │   ├── inverted_pendulum.yaml
│   │   ├── inverted_double_pendulum.yaml
│   │   ├── humanoid.yaml
│   │   ├── sac.yaml
│   │   ├── mbpo.yaml
│   │   └── config.py
│   ├── envs
│   │   ├── __init__.py
│   │   ├── benchmarking_envs
│   │   │   ├── __init__.py
│   │   │   ├── gym
│   │   │   │   ├── __init__.py
│   │   │   │   ├── gym_oswimmer.py
│   │   │   │   ├── gym_ohalfcheetah.py
│   │   │   │   ├── gym_oant.py
│   │   │   │   ├── gym_ohumanoid.py
│   │   │   │   ├── gym_ohopper.py
│   │   │   │   ├── gym_owalker.py
│   │   │   │   ├── pets_cartpole.py
│   │   │   │   ├── assets
│   │   │   │   │   ├── cartpole.xml
│   │   │   │   │   ├── fixed_swimmer.xml
│   │   │   │   │   ├── half_cheetah.xml
│   │   │   │   │   └── pusher.xml
│   │   │   │   ├── pets_cheetah.py
│   │   │   │   ├── inverted_pendulum.py
│   │   │   │   ├── gym_cheetahO01.py
│   │   │   │   ├── half_cheetah.py
│   │   │   │   ├── gym_fswimmer.py
│   │   │   │   ├── reacher.py
│   │   │   │   ├── ant.py
│   │   │   │   ├── gym_cheetahO001.py
│   │   │   │   ├── swimmer.py
│   │   │   │   ├── gym_fant.py
│   │   │   │   ├── gym_cheetahA003.py
│   │   │   │   ├── gym_cheetahA01.py
│   │   │   │   ├── pets_pusher.py
│   │   │   │   ├── walker2d.py
│   │   │   │   ├── gym_nostopslimhumanoid.py
│   │   │   │   ├── hopper.py
│   │   │   │   ├── gym_slimhumanoid.py
│   │   │   │   ├── gym_fhopper.py
│   │   │   │   ├── gym_fwalker2d.py
│   │   │   │   ├── gym_humanoid.py
│   │   │   │   ├── pets_reacher.py
│   │   │   │   ├── pendulum.py
│   │   │   │   ├── gym_pendulumO01.py
│   │   │   │   ├── gym_pendulumO001.py
│   │   │   │   ├── gym_cartpoleO001.py
│   │   │   │   ├── cartpole.py
│   │   │   │   ├── gym_cartpoleO01.py
│   │   │   │   └── mountain_car.py
│   │   │   ├── assets
│   │   │   │   ├── point.xml
│   │   │   │   ├── swimmer.xml
│   │   │   │   ├── reacher.xml
│   │   │   │   ├── hopper.xml
│   │   │   │   ├── walker2d.xml
│   │   │   │   ├── ant.xml
│   │   │   │   ├── pusher.xml
│   │   │   │   └── half_cheetah.xml
│   │   │   └── benchmarking_envs.py
│   │   └── wrapped_envs.py
│   ├── misc
│   │   ├── __init__.py
│   │   ├── distributions.py
│   │   └── utils.py
│   ├── thirdparty
│   │   ├── __init__.py
│   │   ├── summary_writer.py
│   │   ├── tile_images.py
│   │   ├── running_mean_std.py
│   │   ├── util.py
│   │   └── dummy_vec_env.py
│   ├── algos
│   │   ├── mbrl
│   │   │   └── __init__.py
│   │   ├── mfrl
│   │   │   ├── __init__.py
│   │   │   └── sac.py
│   │   └── __init__.py
│   ├── storages
│   │   ├── __init__.py
│   │   └── mixture_buffer.py
│   ├── scripts
│   │   ├── run_mbpo.sh
│   │   └── remove_tb_logs.py
│   └── models
│       ├── __init__.py
│       ├── initializer.py
│       ├── critic.py
│       ├── q_critic.py
│       ├── utils.py
│       ├── actor.py
│       ├── normalizers.py
│       └── actor_layer.py
├── setup.py
└── readme.md
/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea
--------------------------------------------------------------------------------
/mbpo_pytorch/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/mbpo_pytorch/configs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/mbpo_pytorch/misc/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/mbpo_pytorch/thirdparty/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/mbpo_pytorch/algos/mbrl/__init__.py:
--------------------------------------------------------------------------------
1 | from .mbpo import MBPO
2 |
--------------------------------------------------------------------------------
/mbpo_pytorch/algos/mfrl/__init__.py:
--------------------------------------------------------------------------------
1 | from .sac import SAC
2 |
--------------------------------------------------------------------------------
/mbpo_pytorch/algos/__init__.py:
--------------------------------------------------------------------------------
1 | from .mbrl import MBPO
2 | from .mfrl import SAC
3 |
--------------------------------------------------------------------------------
/mbpo_pytorch/storages/__init__.py:
--------------------------------------------------------------------------------
1 | from .universal_offpolicy_buffer import SimpleUniversalBuffer
2 | from .mixture_buffer import MixtureBuffer
--------------------------------------------------------------------------------
/mbpo_pytorch/scripts/run_mbpo.sh:
--------------------------------------------------------------------------------
1 | for env in "halfcheetah" "walker2d" "hopper" "ant"
2 | do
3 | python run_mbpo.py --configs "mbpo.yaml" "${env}.yaml" "priv.yaml"
4 | done
--------------------------------------------------------------------------------
/mbpo_pytorch/configs/ant.yaml:
--------------------------------------------------------------------------------
1 | env:
2 | env_name: 'OriginalAnt'
3 | max_episode_steps: 1000
4 |
5 | sac:
6 | target_entropy: -4
7 | num_grad_steps: 20
8 |
9 | mbpo:
10 | rollout_schedule: [ 20, 100, 1, 25 ]
11 |
--------------------------------------------------------------------------------
/mbpo_pytorch/configs/hopper.yaml:
--------------------------------------------------------------------------------
1 | env:
2 | env_name: 'OriginalHopper'
3 | max_episode_steps: 1000
4 |
5 | sac:
6 | target_entropy: ~
7 | num_grad_steps: 20
8 |
9 | mbpo:
10 | rollout_schedule: [ 20, 150, 1, 15 ]
11 |
--------------------------------------------------------------------------------
/mbpo_pytorch/configs/walker2d.yaml:
--------------------------------------------------------------------------------
1 | env:
2 | env_name: 'OriginalWalker'
3 | max_episode_steps: 1000
4 |
5 | sac:
6 | target_entropy: -3
7 | num_grad_steps: 20
8 |
9 | mbpo:
10 | rollout_schedule: [ 20, 150, 1, 1 ]
11 |
--------------------------------------------------------------------------------
/mbpo_pytorch/configs/halfcheetah.yaml:
--------------------------------------------------------------------------------
1 | env:
2 | env_name: 'OriginalHalfCheetah'
3 | max_episode_steps: 1000
4 |
5 | sac:
6 | target_entropy: -3
7 | num_grad_steps: 40
8 |
9 | mbpo:
10 | rollout_schedule: [ 20, 150, 1, 1 ]
11 |
--------------------------------------------------------------------------------
/mbpo_pytorch/models/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from .actor import Actor
3 | from .critic import QCritic
4 | from .dynamics import RDynamics, EnsembleRDynamics
5 | from .normalizers import RunningNormalizer, BatchNormalizer
6 |
7 | setattr(torch, 'identity', lambda x: x)
8 | setattr(torch, 'swish', lambda x: x * torch.sigmoid(x))
9 |
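The two `setattr` calls above register `identity` and `swish` on the `torch` module so that `MLP` in `models/utils.py` can resolve activations by name via `getattr(torch, activation)`. A minimal, illustrative check of that lookup (not part of the repo):

import torch
import mbpo_pytorch.models  # importing the package applies the setattr patches above

swish = getattr(torch, 'swish')
print(swish(torch.zeros(3)))  # tensor([0., 0., 0.]) since swish(0) = 0 * sigmoid(0) = 0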
--------------------------------------------------------------------------------
/mbpo_pytorch/configs/inverted_pendulum.yaml:
--------------------------------------------------------------------------------
1 | env:
2 | env_name: 'OriginalInvertedPendulum'
3 | max_episode_steps: 250
4 |
5 | sac:
6 | target_entropy: -0.05
7 | num_grad_steps: 10
8 |
9 | mbpo:
10 | rollout_schedule: [ 1, 15, 1, 1 ]
11 | num_total_epochs: 80
12 | num_warmup_samples: 500
13 |
14 |
15 |
--------------------------------------------------------------------------------
/mbpo_pytorch/configs/inverted_double_pendulum.yaml:
--------------------------------------------------------------------------------
1 | env:
2 | env_name: 'OriginalDoubleInvertedPendulum'
3 | max_episode_steps: 250
4 |
5 | sac:
6 | target_entropy: -0.5
7 | num_grad_steps: 20
8 |
9 | mbpo:
10 | rollout_schedule: [ 1, 15, 1, 1 ]
11 | num_total_epochs: 80
12 | num_warmup_samples: 500
13 |
14 |
15 |
--------------------------------------------------------------------------------
/mbpo_pytorch/configs/humanoid.yaml:
--------------------------------------------------------------------------------
1 | env:
2 | env_name: 'OriginalHumanoid'
3 | max_episode_steps: 1000
4 |
5 | sac:
6 | target_entropy: -2
7 | num_grad_steps: 40
8 |
9 | mbpo:
10 | rollout_schedule: [ 20, 300, 1, 15 ]
11 | dynamics_hidden_dims: [400, 400, 400, 400]
12 | num_model_retain_epochs: 5
13 | model_update_interval: 1000
14 |
15 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_oswimmer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco.swimmer import SwimmerEnv
3 |
4 |
5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
6 |
7 |
8 | class OriginalSwimmerEnv(SwimmerEnv, BaseModelBasedEnv):
9 |
10 | def mb_step(self, states, actions, next_states):
11 | return None, np.zeros([states.shape[0], 1], dtype=np.bool)
12 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_ohalfcheetah.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco.half_cheetah import HalfCheetahEnv
3 |
4 |
5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
6 |
7 |
8 | class OriginalHalfCheetahEnv(HalfCheetahEnv, BaseModelBasedEnv):
9 |
10 | def mb_step(self, states, actions, next_states):
11 | return None, np.zeros([states.shape[0], 1], dtype=np.bool)
12 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_oant.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco.ant import AntEnv
3 |
4 |
5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
6 |
7 |
8 | class OriginalAntEnv(AntEnv, BaseModelBasedEnv):
9 |
10 | def mb_step(self, states, actions, next_states):
11 | heights = next_states[:, 0]
12 | dones = np.logical_or((heights > 1.0), (heights < 0.2))
13 | return None, dones
14 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_ohumanoid.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco.humanoid import HumanoidEnv
3 |
4 |
5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
6 |
7 |
8 | class OriginalHumanoidEnv(HumanoidEnv, BaseModelBasedEnv):
9 |
10 | def mb_step(self, states, actions, next_states):
11 | heights = next_states[:, 0]
12 | dones = np.logical_or((heights > 2.0), (heights < 1.0))
13 | return None, dones
14 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_ohopper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco.hopper import HopperEnv
3 |
4 |
5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
6 |
7 |
8 | class OriginalHopperEnv(HopperEnv, BaseModelBasedEnv):
9 |
10 | def mb_step(self, states, actions, next_states):
11 | heights, angs = next_states[:, 0], next_states[:, 1]
12 | dones = np.logical_or(heights <= 0.7, abs(angs) >= 0.2)
13 | return None, dones
14 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_owalker.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco.walker2d import Walker2dEnv
3 |
4 |
5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
6 |
7 |
8 | class OriginalWalkerEnv(Walker2dEnv, BaseModelBasedEnv):
9 |
10 | def mb_step(self, states, actions, next_states):
11 | heights, angs = next_states[:, 0], next_states[:, 1]
12 | dones = np.logical_or(
13 | np.logical_or(heights >= 2.0, heights <= 0.8),
14 | np.abs(angs) >= 1.0
15 | )
16 | return None, dones
17 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages
2 | from setuptools import setup
3 |
4 | setup(
5 | name='mbpo_pytorch',
6 | author='Shengyi Jiang',
7 | author_email='shengyi.jiang@outlook.com',
8 | packages=find_packages(),
9 | python_requires='>=3.7',
10 | install_requires=[
11 | 'torch>=1.4.0',
12 | 'mujoco-py',
13 | 'scipy',
14 | 'numpy',
15 | 'gym>=0.17.0',
16 | 'pyglib',
17 | 'munch',
18 | 'pyyaml',
19 | 'colorama',
20 | 'tensorboard>=1.15.0',
21 | 'pandas'
22 | ],
23 | package_data={
24 | # include default config files and env data files
25 | "": ["*.yaml", "*.xml"],
26 | }
27 | )
28 |
--------------------------------------------------------------------------------
/mbpo_pytorch/thirdparty/summary_writer.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.tensorboard import SummaryWriter
3 | from torch.utils.tensorboard.summary import hparams
4 |
5 |
6 | class FixedSummaryWriter(SummaryWriter):
7 | def add_hparams(self, hparam_dict, metric_dict):
8 | torch._C._log_api_usage_once("tensorboard.logging.add_hparams")
9 | if type(hparam_dict) is not dict or type(metric_dict) is not dict:
10 | raise TypeError('hparam_dict and metric_dict should be dictionary.')
11 | exp, ssi, sei = hparams(hparam_dict, metric_dict)
12 |
13 | self._get_file_writer().add_summary(exp)
14 | self._get_file_writer().add_summary(ssi)
15 | self._get_file_writer().add_summary(sei)
16 | for k, v in metric_dict.items():
17 | self.add_scalar(k, v)
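Unlike the stock `SummaryWriter.add_hparams`, which logs hparams into a separate timestamped sub-run, this override writes the hparams summaries through the writer's own file writer, so hparams and metrics land in the same event file. A short usage sketch (path and values are illustrative):

from mbpo_pytorch.thirdparty.summary_writer import FixedSummaryWriter

writer = FixedSummaryWriter(log_dir='./result/demo')
writer.add_hparams({'lr': 3.0e-4, 'batch_size': 256}, {'eval/average_return': 0.0})
writer.close()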
--------------------------------------------------------------------------------
/mbpo_pytorch/configs/sac.yaml:
--------------------------------------------------------------------------------
1 | proj_name: 'SAC'
2 | proj_dir: '/home/liuxh/Documents/mbpo'
3 | result_dir: './result'
4 | save_dir: './save'
5 | use_cuda: True
6 | device: 'cpu' # e.g. 'cpu', 'cuda', 'cuda:0'
7 | seed: 3
8 | verbose: 0
9 | model_load_path: ~
10 | buffer_load_path: ~
11 | log_interval: 1
12 | save_interval: 10
13 | eval_interval: 1
14 | log_email: False
15 | debug: False
16 |
17 | env:
18 | env_name: 'Walker2d-v2'
19 | num_envs: 1
20 | gamma: 0.99
21 | max_episode_steps: 1000
22 |
23 | sac:
24 | num_total_steps: 1000000
25 | num_warmup_steps: 1000
26 | num_epoch_steps: 1000
27 | buffer_size: 200000
28 | actor_hidden_dims: [256, 256]
29 | critic_hidden_dims: [256, 256]
30 | num_grad_steps: 1000
31 | batch_size: 256
32 | target_entropy: ~
33 | actor_lr: 3.0e-4
34 | critic_lr: 3.0e-4
35 | soft_target_tau: 5.0e-3
36 |
37 |
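`target_entropy: ~` (YAML null) leaves the value unset; the usual SAC convention, which this config presumably relies on, is to fall back to the negative action dimensionality. A hypothetical resolution helper, not taken from this repo:

def resolve_target_entropy(target_entropy, action_space):
    # assumed fallback: -|A|, the standard SAC heuristic
    if target_entropy is None:
        return -float(action_space.shape[0])
    return float(target_entropy)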
--------------------------------------------------------------------------------
/mbpo_pytorch/models/initializer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def normc_init(tensor, std=1.0, **kwargs):
5 | tensor.data.normal_(0, 1)
6 | tensor.data *= std / np.sqrt(tensor.data.pow(2).sum(1, keepdim=True))
7 |
8 |
9 | def fanin_init(tensor, **kwargs):
10 | size = tensor.size()
11 | if len(size) == 2:
12 | fan_in = size[0]
13 | elif len(size) > 2:
14 | fan_in = np.prod(size[1:])
15 | else:
16 | raise Exception("Shape must have at least 2 dimensions.")
17 | bound = 1. / np.sqrt(fan_in)
18 | return tensor.data.uniform_(-bound, bound)
19 |
20 |
21 | def truncated_norm_init(tensor, mean=0, std=None, **kwargs):
22 | size = tensor.shape
23 | std = std or 1.0/(2*np.sqrt(size[0]))
24 | tmp = tensor.new_empty(size + (4,)).normal_()
25 | valid = (tmp < 2) & (tmp > -2)
26 | ind = valid.max(-1, keepdim=True)[1]
27 | tensor.data.copy_(tmp.gather(-1, ind).squeeze(-1))
28 | tensor.data.mul_(std).add_(mean)
29 | return tensor
30 |
31 |
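A minimal usage sketch of these initializers on `nn.Linear` layers (layer sizes are arbitrary):

import torch.nn as nn
from mbpo_pytorch.models.initializer import fanin_init, truncated_norm_init

layer = nn.Linear(64, 32)
fanin_init(layer.weight)                    # in-place uniform init with bound 1 / sqrt(weight.size(0))
nn.init.constant_(layer.bias, 0.1)

head = nn.Linear(32, 1)
truncated_norm_init(head.weight, std=1e-3)  # approximate truncated normal: picks a draw in (-2, 2) out of 4 samples, then scales by std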
--------------------------------------------------------------------------------
/mbpo_pytorch/storages/mixture_buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from mbpo_pytorch.misc.utils import merge_dicts
5 | from mbpo_pytorch.misc import logger
6 |
7 |
8 | class MixtureBuffer:
9 | def __init__(self, buffers, weights, verbose=0):
10 | self.buffers = buffers
11 | self.weights = np.array(weights)
12 | self.verbose = verbose
13 |
14 | def get_batch_generator_inf(self, batch_size, **kwargs):
15 | batch_sizes = (batch_size * self.weights).astype(np.int)
16 | if self.verbose:
17 | logger.log('[Buffer Mixing] Max error {}'.format(np.max(np.abs(batch_sizes / batch_size - self.weights))))
18 | rand_index = np.random.randint(len(batch_sizes))
19 | batch_sizes[rand_index] = batch_size - np.delete(batch_sizes, rand_index).sum()
20 | inf_gens = [buffer.get_batch_generator_inf(int(batch_size_), **kwargs)
21 | for buffer, batch_size_ in zip(self.buffers, batch_sizes)]
22 | while True:
23 | buffer_samples = list(map(lambda gen: next(gen), inf_gens))
24 | yield merge_dicts(buffer_samples, lambda x: torch.cat(x, dim=0))
25 |
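A minimal usage sketch, assuming the wrapped buffers (e.g. the `SimpleUniversalBuffer` instances exported from `storages/__init__.py`) expose the same `get_batch_generator_inf` interface and yield dicts of identically keyed tensors; `real_buffer` and `virtual_buffer` are placeholder names:

mixture = MixtureBuffer([real_buffer, virtual_buffer], weights=[0.05, 0.95])
gen = mixture.get_batch_generator_inf(batch_size=256)
batch = next(gen)  # dict of tensors; roughly 5% of the 256 rows come from real_buffer, 95% from virtual_buffer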
--------------------------------------------------------------------------------
/mbpo_pytorch/thirdparty/tile_images.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def tile_images(img_nhwc):
5 | """
6 | Tile N images into one big PxQ image.
7 | (P, Q) are chosen to be as close as possible, and if N
8 | is a perfect square, then P = Q.
9 |
10 | :param img_nhwc: (list) list or array of images, ndim=4 once turned into array. img nhwc
11 | n = batch index, h = height, w = width, c = channel
12 | :return: (numpy float) img_HWc, ndim=3
13 | """
14 | img_nhwc = np.asarray(img_nhwc)
15 | n_images, height, width, n_channels = img_nhwc.shape
16 | # new_height was named H before
17 | new_height = int(np.ceil(np.sqrt(n_images)))
18 | # new_width was named W before
19 | new_width = int(np.ceil(float(n_images) / new_height))
20 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0] * 0 for _ in range(n_images, new_height * new_width)])
21 | # img_HWhwc
22 | out_image = img_nhwc.reshape(new_height, new_width, height, width, n_channels)
23 | # img_HhWwc
24 | out_image = out_image.transpose(0, 2, 1, 3, 4)
25 | # img_Hh_Ww_c
26 | out_image = out_image.reshape(new_height * height, new_width * width, n_channels)
27 | return out_image
28 |
29 |
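A quick usage example: five 32x32 RGB frames are laid out on a 3x2 grid, with the unused tile left black.

import numpy as np
from mbpo_pytorch.thirdparty.tile_images import tile_images

frames = np.random.randint(0, 255, size=(5, 32, 32, 3), dtype=np.uint8)
grid = tile_images(frames)
print(grid.shape)  # (96, 64, 3): 3 rows x 2 columns of 32x32 tiles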
--------------------------------------------------------------------------------
/mbpo_pytorch/configs/mbpo.yaml:
--------------------------------------------------------------------------------
1 | proj_name: 'MBPO'
2 | mf_algo: 'sac'
3 | proj_dir: '~'
4 | result_dir: './result'
5 | save_dir: './save'
6 | device: 'cuda:0' # e.g. 'cpu', 'cuda', 'cuda:0'
7 | seed: 3
8 | verbose: 0
9 | model_load_path: ~
10 | buffer_load_path: ~
11 | save_interval: 2
12 | eval_interval: 1
13 | log_interval: 250
14 | log_email: False
15 | debug: False
16 |
17 | env:
18 | num_real_envs: 1
19 | gamma: 0.99
20 |
21 | sac:
22 | actor_hidden_dims: [256, 256]
23 | critic_hidden_dims: [256, 256]
24 | num_grad_steps: 20
25 | batch_size: 256
26 | target_entropy: ~
27 | actor_lr: 3.0e-4
28 | critic_lr: 3.0e-4
29 | soft_target_tau: 5.0e-3
30 |
31 | mbpo:
32 | num_total_epochs: 1000
33 | dynamics_hidden_dims: [200, 200, 200, 200]
34 | l2_loss_coefs: [0.000025, 0.00005, 0.000075, 0.000075, 0.0001]
35 | lr: 1.0e-3
36 | dynamics_batch_size: 256
37 | num_dynamics_networks: 7
38 | num_elite_dynamics_networks: 5
39 | real_buffer_size: 1000000
40 | rollout_batch_size: 100000
41 | num_model_retain_epochs: 1
42 | model_update_interval: 250
43 | rollout_schedule: [20, 150, 1, 15]
44 | max_num_epochs: ~
45 | real_sample_ratio: 0.
46 | num_warmup_samples: 5000
47 |
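`rollout_schedule: [20, 150, 1, 15]` follows the MBPO convention `[min_epoch, max_epoch, min_rollout_len, max_rollout_len]`: the model rollout length is presumably interpolated linearly in the epoch index between the two endpoints. A small illustrative helper (names are not from this repo):

def rollout_length(schedule, epoch):
    min_epoch, max_epoch, min_len, max_len = schedule
    frac = (epoch - min_epoch) / (max_epoch - min_epoch)
    frac = min(max(frac, 0.0), 1.0)  # clamp to [0, 1]
    return int(min_len + frac * (max_len - min_len))

# rollout_length([20, 150, 1, 15], 20)  -> 1
# rollout_length([20, 150, 1, 15], 85)  -> 8
# rollout_length([20, 150, 1, 15], 150) -> 15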
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # MBPO PyTorch
2 | A PyTorch reimplementation of MBPO ("When to Trust Your Model: Model-Based Policy Optimization").
3 |
4 | # Deprecation Warning
5 | The owner of this repo has graduated, and the repo is no longer maintained. Please refer to this new [MBPO](https://github.com/x35f/model_based_rl) PyTorch re-implementation, which is a submodule of the [Unstable Baselines](https://github.com/x35f/unstable_Baselines) project maintained by researchers from the same [lab](http://www.lamda.nju.edu.cn/MainPage.ashx). The new re-implementation strictly follows the original TF implementation and has been tested on several MuJoCo tasks.
6 |
7 | # Dependency
8 |
9 | Please refer to ./requirements.txt.
10 |
11 | # Usage
12 |
13 | pip install -e .
14 |
15 | # default hyperparams in ./configs/mbpo.yaml
16 | # remember to CHANGE proj_dir to your actual directory
17 | python ./mbpo_pytorch/scripts/run_mbpo.py
18 |
19 | # you can also overwrite hyperparams by passing args, e.g.
20 | python ./mbpo_pytorch/scripts/run_mbpo.py --set seed=0 verbose=1 device="'cuda:0'" env.env_name='FixedHopper'
21 |
22 |
23 | # Credits
24 | 1. [vitchyr/rlkit](https://github.com/vitchyr/rlkit)
25 | 2. [JannerM/mbpo](https://github.com/JannerM/mbpo)
26 | 3. [WilsonWangTHU/mbbl](https://github.com/WilsonWangTHU/mbbl)
--------------------------------------------------------------------------------
/mbpo_pytorch/scripts/remove_tb_logs.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | import shutil
3 | from tensorboard.backend.event_processing.event_file_inspector import get_inspection_units, get_dict_to_print
4 |
5 |
6 | parser = ArgumentParser('delete small runs')
7 | parser.add_argument('--logdir', type=str, default='/home/liuxh/Documents/mbpo_torch/result')
8 | parser.add_argument('--min_run_len', type=int, default=100)
9 | parser.add_argument('--list', action='store_true')
10 | args = parser.parse_args()
11 |
12 | run_len = {}
13 | inspect_units = get_inspection_units(logdir=args.logdir)
14 |
15 |
16 | for run in inspect_units:
17 | path = run[0]
18 | max_length = 0
19 | for key, value in get_dict_to_print(run.field_to_obs).items():
20 | if value is not None:
21 | length = value['max_step']
22 | if max_length < length:
23 | max_length = length
24 | run_len[path] = max_length
25 |
26 | for run, length in run_len.items():
27 | if length < args.min_run_len:
28 | if args.list:
29 | print(f'{run} is {length} steps long and so will be deleted')
30 | else:
31 | try:
32 | print(f'{run} is {length} and was deleted')
33 | shutil.rmtree(run)
34 | except OSError:
35 | print(f"OS didn't let us delete {run}")
36 | else:
37 | print(f'{run} is {length} and is good')
38 |
--------------------------------------------------------------------------------
/mbpo_pytorch/thirdparty/running_mean_std.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class RunningMeanStd(object):
5 | def __init__(self, epsilon=1e-4, shape=()):
6 | """
7 | Calculates the running mean and std of a data stream
8 | https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
9 |
10 | :param epsilon: (float) helps with arithmetic issues
11 | :param shape: (tuple) the shape of the data stream's output
12 | """
13 | self.mean = np.zeros(shape, 'float64')
14 | self.var = np.ones(shape, 'float64')
15 | self.count = epsilon
16 |
17 | def update(self, arr):
18 | batch_mean = np.mean(arr, axis=0)
19 | batch_var = np.var(arr, axis=0)
20 | batch_count = arr.shape[0]
21 | self.update_from_moments(batch_mean, batch_var, batch_count)
22 |
23 | def update_from_moments(self, batch_mean, batch_var, batch_count):
24 | delta = batch_mean - self.mean
25 | tot_count = self.count + batch_count
26 |
27 | new_mean = self.mean + delta * batch_count / tot_count
28 | m_a = self.var * self.count
29 | m_b = batch_var * batch_count
30 | m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count)
31 | new_var = m_2 / (self.count + batch_count)
32 |
33 | new_count = batch_count + self.count
34 |
35 | self.mean = new_mean
36 | self.var = new_var
37 | self.count = new_count
38 |
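A short usage sketch: feed batches of observations and use the accumulated statistics to normalize new data (the epsilon inside the square root is only there to avoid division by zero).

import numpy as np
from mbpo_pytorch.thirdparty.running_mean_std import RunningMeanStd

rms = RunningMeanStd(shape=(3,))
for _ in range(10):
    rms.update(np.random.randn(64, 3))            # update from each batch of observations
obs = np.random.randn(5, 3)
normalized = (obs - rms.mean) / np.sqrt(rms.var + 1e-8)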
--------------------------------------------------------------------------------
/mbpo_pytorch/models/critic.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 | from typing import List
3 |
4 | from gym.spaces import Box, MultiBinary, Discrete
5 | import torch
6 | import torch.nn as nn
7 |
8 | from .initializer import fanin_init
9 | from .utils import MLP, init
10 |
11 |
12 | class QCritic(nn.Module, ABC):
13 | def __init__(self, state_dim, action_space, hidden_dims: List[int], init_w=3e-3, init_b=0.1,
14 | use_multihead_output=False, **kwargs):
15 | super(QCritic, self).__init__()
16 |
17 | assert not use_multihead_output or action_space.__class__.__name__ == 'Discrete'
18 |
19 | if isinstance(action_space, Box) or isinstance(action_space, MultiBinary):
20 | action_dim = action_space.shape[0]
21 | else:
22 | assert isinstance(action_space, Discrete)
23 | action_dim = action_space.n
24 |
25 | mlp_kwargs = kwargs.copy()
26 | mlp_kwargs['activation'] = kwargs.get('activation', 'relu')
27 | mlp_kwargs['last_activation'] = kwargs.get('last_activation', 'identity')
28 |
29 | self.critic = MLP(state_dim + action_dim, 1, hidden_dims, **mlp_kwargs)  # pass the defaulted kwargs; names must resolve via getattr(torch, ...) in MLP
30 |
31 | def init_(m): init(m, fanin_init, lambda x: nn.init.constant_(x, init_b))
32 | def init_last_(m): init(m, lambda x: nn.init.uniform_(x, -init_w, init_w),
33 | lambda x: nn.init.uniform_(x, -init_w, init_w))
34 | self.critic.init(init_, init_last_)
35 |
36 | def forward(self, states, actions):
37 | return self.critic(torch.cat([states, actions], dim=-1))
38 |
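A minimal usage sketch for a continuous-control task (dimensions are arbitrary; assumes the package-level activation patches in `models/__init__.py` have been applied):

import numpy as np
import torch
from gym.spaces import Box
from mbpo_pytorch.models.critic import QCritic

action_space = Box(low=-1.0, high=1.0, shape=(6,), dtype=np.float32)
q_fn = QCritic(state_dim=17, action_space=action_space, hidden_dims=[256, 256])
q_values = q_fn(torch.randn(32, 17), torch.randn(32, 6))  # -> shape [32, 1]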
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/pets_cartpole.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class CartpoleEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 | PENDULUM_LENGTH = 0.6
12 |
13 | def __init__(self):
14 | utils.EzPickle.__init__(self)
15 | dir_path = os.path.dirname(os.path.realpath(__file__))
16 | mujoco_env.MujocoEnv.__init__(self, '%s/assets/cartpole.xml' % dir_path, 2)
17 |
18 | def step(self, a):
19 | self.do_simulation(a, self.frame_skip)
20 | ob = self._get_obs()
21 |
22 | cost_lscale = CartpoleEnv.PENDULUM_LENGTH
23 | reward = np.exp(
24 | -np.sum(np.square(self._get_ee_pos(ob) - np.array([0.0, CartpoleEnv.PENDULUM_LENGTH]))) / (cost_lscale ** 2)
25 | )
26 | reward -= 0.01 * np.sum(np.square(a))
27 |
28 | done = False
29 | return ob, reward, done, {}
30 |
31 | def reset_model(self):
32 | qpos = self.init_qpos + np.random.normal(0, 0.1, np.shape(self.init_qpos))
33 | qvel = self.init_qvel + np.random.normal(0, 0.1, np.shape(self.init_qvel))
34 | self.set_state(qpos, qvel)
35 | return self._get_obs()
36 |
37 | def _get_obs(self):
38 | return np.concatenate([self.sim.data.qpos, self.sim.data.qvel]).ravel()
39 |
40 | @staticmethod
41 | def _get_ee_pos(x):
42 | x0, theta = x[0], x[1]
43 | return np.array([
44 | x0 - CartpoleEnv.PENDULUM_LENGTH * np.sin(theta),
45 | -CartpoleEnv.PENDULUM_LENGTH * np.cos(theta)
46 | ])
47 |
48 | def viewer_setup(self):
49 | v = self.viewer
50 | v.cam.trackbodyid = 0
51 | v.cam.distance = v.model.stat.extent
52 |
--------------------------------------------------------------------------------
/mbpo_pytorch/models/q_critic.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 | from .utils import MLP, init
7 | from .initializer import fanin_init
8 |
9 |
10 | class QCritic(nn.Module, ABC):
11 | def __init__(self, state_dim, action_space, hidden_dims, activation='relu', last_activation='identity',
12 | init_w=3e-3, init_b=0.1, use_multihead_output=False):
13 | super(QCritic, self).__init__()
14 |
15 | assert not use_multihead_output or action_space.__class__.__name__ == 'Discrete'
16 |
17 | if action_space.__class__.__name__ == 'Discrete':
18 | action_dim = action_space.n
19 | else:
20 | assert action_space.__class__.__name__ == 'Box'
21 | action_dim = action_space.shape[0]
22 |
23 | if use_multihead_output:
24 | action_dim = action_space.n
25 | self.critic = MLP(state_dim, action_dim, hidden_dims,
26 | activation=activation, last_activation=last_activation)
27 | self.forward = self._get_q_value_discrete
28 | else:
29 | self.critic = MLP(state_dim + action_dim, 1, hidden_dims,
30 | activation=activation, last_activation=last_activation)
31 | self.forward = self._get_q_value_continuous
32 |
33 | def init_(m): init(m, fanin_init, lambda x: nn.init.constant_(x, init_b))
34 | def init_last_(m): init(m, lambda x: nn.init.uniform_(x, -init_w, init_w),
35 | lambda x: nn.init.uniform_(x, -init_w, init_w))
36 | self.critic.init(init_, init_last_)
37 |
38 | def _get_q_value_continuous(self, state, action):
39 | return self.critic(torch.cat([state, action], dim=-1))
40 |
41 | def _get_q_value_discrete(self, state, action):
42 | return self.critic(state).gather(-1, action.long())  # assumes action holds integer indices of shape [batch, 1]
43 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/assets/cartpole.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/assets/point.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/pets_cheetah.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.realpath(__file__))
15 | mujoco_env.MujocoEnv.__init__(self, '%s/assets/half_cheetah.xml' % dir_path, 5)
16 | utils.EzPickle.__init__(self)
17 |
18 | def step(self, action):
19 | self.prev_qpos = np.copy(self.sim.data.qpos.flat)
20 | self.do_simulation(action, self.frame_skip)
21 | ob = self._get_obs()
22 |
23 | reward_ctrl = -0.1 * np.square(action).sum()
24 | reward_run = ob[0] - 0.0 * np.square(ob[2])
25 | reward = reward_run + reward_ctrl
26 |
27 | done = False
28 | return ob, reward, done, {}
29 |
30 | def _get_obs(self):
31 | return np.concatenate([
32 | (self.sim.data.qpos.flat[:1] - self.prev_qpos[:1]) / self.dt,
33 | self.sim.data.qpos.flat[1:],
34 | self.sim.data.qvel.flat,
35 | ])
36 |
37 | def reset_model(self):
38 | qpos = self.init_qpos + np.random.normal(loc=0, scale=0.001, size=self.model.nq)
39 | qvel = self.init_qvel + np.random.normal(loc=0, scale=0.001, size=self.model.nv)
40 | self.set_state(qpos, qvel)
41 | self.prev_qpos = np.copy(self.sim.data.qpos.flat)
42 | return self._get_obs()
43 |
44 | def cost_np_vec(self, obs, acts, next_obs):
45 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
46 | reward_run = obs[:, 0]
47 | reward = reward_run + reward_ctrl
48 | return -reward
49 |
50 | def viewer_setup(self):
51 | self.viewer.cam.distance = self.model.stat.extent * 0.25
52 | self.viewer.cam.elevation = -55
53 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/inverted_pendulum.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from gym.envs.mujoco import mujoco_env
4 |
5 |
6 | class InvertedPendulumEnv(mujoco_env.MujocoEnv, utils.EzPickle):
7 |
8 | def __init__(self):
9 | utils.EzPickle.__init__(self)
10 | mujoco_env.MujocoEnv.__init__(self, 'inverted_pendulum.xml', 2)
11 |
12 | def step(self, a):
13 | # reward = 1.0
14 | reward = self._get_reward()
15 | self.do_simulation(a, self.frame_skip)
16 | ob = self._get_obs()
17 | # notdone = np.isfinite(ob).all() and (np.abs(ob[1]) <= .2)
18 | # done = not notdone
19 | done = False
20 | return ob, reward, done, {}
21 |
22 | def reset_model(self):
23 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-0.01, high=0.01)
24 | qvel = self.init_qvel + self.np_random.uniform(size=self.model.nv, low=-0.01, high=0.01)
25 | self.set_state(qpos, qvel)
26 | return self._get_obs()
27 |
28 | def _get_reward(self):
29 | old_ob = self._get_obs()
30 | reward = -((old_ob[1]) ** 2)
31 | return reward
32 |
33 | def _get_obs(self):
34 | return np.concatenate([self.sim.data.qpos, self.sim.data.qvel]).ravel()
35 |
36 | def viewer_setup(self):
37 | v = self.viewer
38 | v.cam.trackbodyid = 0
39 | v.cam.distance = v.model.stat.extent
40 |
41 | def mb_step(self, states, actions, next_states):
42 | # returns rewards and dones
43 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
44 | if getattr(self, 'action_space', None):
45 | actions = np.clip(actions, self.action_space.low,
46 | self.action_space.high)
47 | rewards = - self.cost_np_vec(states, actions, next_states)
48 | return rewards, np.zeros_like(rewards, dtype=np.bool)
49 |
50 | def cost_np_vec(self, obs, acts, next_obs):
51 | return ((obs[:, 1]) ** 2)
52 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/assets/swimmer.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/assets/reacher.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/assets/fixed_swimmer.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_cheetahO01.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self, frame_skip=5):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | mujoco_env.MujocoEnv.__init__(
16 | self, '%s/assets/half_cheetah.xml' % dir_path, frame_skip=frame_skip
17 | )
18 | utils.EzPickle.__init__(self)
19 |
20 | def step(self, action):
21 | start_ob = self._get_obs()
22 | reward_run = start_ob[8]
23 |
24 | self.do_simulation(action, self.frame_skip)
25 | ob = self._get_obs()
26 | if getattr(self, 'action_space', None):
27 | action = np.clip(action, self.action_space.low,
28 | self.action_space.high)
29 | reward_ctrl = -0.1 * np.square(action).sum()
30 |
31 | reward = reward_run + reward_ctrl
32 | done = False
33 | ob += np.random.uniform(low=-0.1, high=0.1, size=ob.shape)
34 | return ob, reward, done, {}
35 |
36 | def _get_obs(self):
37 | return np.concatenate([
38 | self.sim.data.qpos.flat[1:],
39 | self.sim.data.qvel.flat,
40 | ])
41 |
42 | def reset_model(self):
43 | qpos = self.init_qpos + \
44 | self.np_random.uniform(low=-.1, high=.1, size=self.model.nq)
45 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
46 | self.set_state(qpos, qvel)
47 | return self._get_obs()
48 |
49 | def viewer_setup(self):
50 | self.viewer.cam.distance = self.model.stat.extent * 0.5
51 |
52 | def cost_np_vec(self, obs, acts, next_obs):
53 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
54 | reward_run = obs[:, 8]
55 | reward = reward_run + reward_ctrl
56 | return -reward
57 |
58 | def mb_step(self, states, actions, next_states):
59 | # returns rewards and dones
60 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
61 | if getattr(self, 'action_space', None):
62 | actions = np.clip(actions, self.action_space.low,
63 | self.action_space.high)
64 | rewards = - self.cost_np_vec(states, actions, next_states)
65 | return rewards, np.zeros_like(rewards, dtype=np.bool)
66 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/half_cheetah.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self, frame_skip=5):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | mujoco_env.MujocoEnv.__init__(
16 | self, '%s/assets/half_cheetah.xml' % dir_path, frame_skip=frame_skip
17 | )
18 | utils.EzPickle.__init__(self)
19 |
20 | def step(self, action: np.ndarray):
21 | start_ob = self._get_obs()
22 | reward_run = start_ob[8]
23 |
24 | self.do_simulation(action, self.frame_skip)
25 | ob = self._get_obs()
26 | if getattr(self, 'action_space', None):
27 | action = np.clip(action, self.action_space.low,
28 | self.action_space.high)
29 | reward_ctrl = -0.1 * np.square(action).sum()
30 |
31 | reward = reward_run + reward_ctrl
32 | done = False
33 | return ob, reward, done, {}
34 |
35 | def _get_obs(self):
36 | return np.concatenate([
37 | self.sim.data.qpos.flat[1:],
38 | self.sim.data.qvel.flat,
39 | ])
40 |
41 | def mb_step(self, states, actions, next_states):
42 | # returns rewards and dones
43 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
44 | if getattr(self, 'action_space', None):
45 | actions = np.clip(actions, self.action_space.low,
46 | self.action_space.high)
47 | rewards = - self.cost_np_vec(states, actions, next_states)
48 | return rewards, np.zeros_like(rewards, dtype=np.bool)
49 |
50 | def reset_model(self):
51 | qpos = self.init_qpos + \
52 | self.np_random.uniform(low=-.1, high=.1, size=self.model.nq)
53 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
54 | self.set_state(qpos, qvel)
55 | return self._get_obs()
56 |
57 | def viewer_setup(self):
58 | self.viewer.cam.distance = self.model.stat.extent * 0.5
59 |
60 | def cost_np_vec(self, obs, acts, next_obs):
61 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
62 | reward_run = obs[:, 8]
63 | reward = reward_run + reward_ctrl
64 | return -reward
65 |
66 | def cost_tf_vec(self, obs, acts, next_obs):
67 | raise NotImplementedError
68 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_fswimmer.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class fixedSwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self):
13 | dir_path = os.path.dirname(os.path.realpath(__file__))
14 | mujoco_env.MujocoEnv.__init__(self, '%s/assets/fixed_swimmer.xml' % dir_path, 4)
15 | utils.EzPickle.__init__(self)
16 |
17 | def step(self, a):
18 | ctrl_cost_coeff = 0.0001
19 |
20 | """
21 | xposbefore = self.sim.data.qpos[0, 0]
22 | self.do_simulation(a, self.frame_skip)
23 | xposafter = self.sim.data.qpos[0, 0]
24 | """
25 |
26 | self.xposbefore = self.sim.data.site_xpos[0][0] / self.dt
27 | self.do_simulation(a, self.frame_skip)
28 | self.xposafter = self.sim.data.site_xpos[0][0] / self.dt
29 | self.pos_diff = self.xposafter - self.xposbefore
30 |
31 | reward_fwd = self.xposafter - self.xposbefore
32 | reward_ctrl = - ctrl_cost_coeff * np.square(a).sum()
33 | reward = reward_fwd + reward_ctrl
34 | ob = self._get_obs()
35 | return ob, reward, False, dict(reward_fwd=reward_fwd, reward_ctrl=reward_ctrl)
36 |
37 | def _get_obs(self):
38 | qpos = self.sim.data.qpos
39 | qvel = self.sim.data.qvel
40 | return np.concatenate([qpos.flat[2:], qvel.flat, self.pos_diff.flat])
41 |
42 | def mb_step(self, states, actions, next_states):
43 | # returns rewards and dones
44 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
45 | if getattr(self, 'action_space', None):
46 | actions = np.clip(actions, self.action_space.low,
47 | self.action_space.high)
48 | rewards = - self.cost_np_vec(states, actions, next_states)
49 | return rewards, np.zeros_like(rewards, dtype=np.bool)
50 |
51 | def reset_model(self):
52 | self.set_state(
53 | self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq),
54 | self.init_qvel + self.np_random.uniform(low=-.1, high=.1, size=self.model.nv)
55 | )
56 | return self._get_obs()
57 |
58 | def cost_np_vec(self, obs, acts, next_obs):
59 | reward_ctrl = -0.0001 * np.sum(np.square(acts), axis=1)
60 | reward_run = obs[:, -1]
61 | reward = reward_run + reward_ctrl
62 | return -reward
63 |
64 | def cost_tf_vec(self, obs, acts, next_obs):
65 | raise NotImplementedError
66 |
67 |
68 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/reacher.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from gym.envs.mujoco import mujoco_env
4 |
5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
6 |
7 |
8 | class ReacherEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
9 |
10 | def __init__(self):
11 | utils.EzPickle.__init__(self)
12 | mujoco_env.MujocoEnv.__init__(self, 'reacher.xml', 2)
13 |
14 | def step(self, a):
15 | vec = self.get_body_com("fingertip") - self.get_body_com("target")
16 |
17 | if getattr(self, 'action_space', None):
18 | a = np.clip(a, self.action_space.low,
19 | self.action_space.high)
20 | reward_dist = - np.linalg.norm(vec)
21 | reward_ctrl = - np.square(a).sum()
22 | reward = reward_dist + reward_ctrl
23 | self.do_simulation(a, self.frame_skip)
24 | ob = self._get_obs()
25 | done = False
26 | return ob, reward, done, dict(reward_dist=reward_dist, reward_ctrl=reward_ctrl)
27 |
28 | def viewer_setup(self):
29 | self.viewer.cam.trackbodyid = 0
30 |
31 | def mb_step(self, states, actions, next_states):
32 | # returns rewards and dones
33 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
34 | if getattr(self, 'action_space', None):
35 | actions = np.clip(actions, self.action_space.low,
36 | self.action_space.high)
37 | rewards = - self.cost_np_vec(states, actions, next_states)
38 | return rewards, np.zeros_like(rewards, dtype=np.bool)
39 |
40 | def reset_model(self):
41 | qpos = self.np_random.uniform(low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos
42 | while True:
43 | self.goal = self.np_random.uniform(low=-.2, high=.2, size=2)
44 | if np.linalg.norm(self.goal) < 2:
45 | break
46 | qpos[-2:] = self.goal
47 | qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv)
48 | qvel[-2:] = 0
49 | self.set_state(qpos, qvel)
50 | return self._get_obs()
51 |
52 | def _get_obs(self):
53 | theta = self.sim.data.qpos.flat[:2]
54 | return np.concatenate([
55 | np.cos(theta),
56 | np.sin(theta),
57 | self.sim.data.qpos.flat[2:],
58 | self.sim.data.qvel.flat[:2],
59 | self.get_body_com("fingertip") - self.get_body_com("target")
60 | ])
61 |
62 | def cost_np_vec(self, obs, acts, next_obs):
63 | dist_vec = obs[:, -3:]
64 | reward_dist = - np.linalg.norm(dist_vec, axis=1)
65 | reward_ctrl = - np.sum(np.square(acts), axis=1)
66 | reward = reward_dist + reward_ctrl
67 | return -reward
68 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/ant.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self, frame_skip=5):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | mujoco_env.MujocoEnv.__init__(
16 | self, '%s/assets/ant.xml' % dir_path, frame_skip=frame_skip
17 | )
18 | utils.EzPickle.__init__(self)
19 |
20 | def step(self, action: np.ndarray):
21 | old_ob = self._get_obs()
22 | self.do_simulation(action, self.frame_skip)
23 |
24 | if getattr(self, 'action_space', None):
25 | action = np.clip(action, self.action_space.low, self.action_space.high)
26 | ob = self._get_obs()
27 |
28 | reward_ctrl = -0.1 * np.square(action).sum()
29 | reward_run = old_ob[13]
30 | reward_height = -3.0 * np.square(old_ob[0] - 0.57)
31 | reward = reward_run + reward_ctrl + reward_height + 1.0
32 | done = False
33 | return ob, reward, done, {}
34 |
35 | def _get_obs(self):
36 | return np.concatenate([
37 | # (self.sim.data.qpos.flat[:1] - self.prev_qpos[:1]) / self.dt,
38 | # self.get_body_comvel("torso")[:1],
39 | self.sim.data.qpos.flat[2:],
40 | self.sim.data.qvel.flat,
41 | ])
42 |
43 | def mb_step(self, states, actions, next_states):
44 | # returns rewards and dones
45 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
46 | if getattr(self, 'action_space', None):
47 | actions = np.clip(actions, self.action_space.low,
48 | self.action_space.high)
49 | rewards = - self.cost_np_vec(states, actions, next_states)
50 | return rewards, np.zeros_like(rewards, dtype=np.bool)
51 |
52 | def reset_model(self):
53 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1)
54 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
55 | self.set_state(qpos, qvel)
56 | # self.prev_qpos = np.copy(self.sim.data.qpos.flat)
57 | return self._get_obs()
58 |
59 | def viewer_setup(self):
60 | self.viewer.cam.distance = self.model.stat.extent * 0.5
61 |
62 | def cost_np_vec(self, obs, acts, next_obs):
63 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
64 | reward_run = obs[:, 13]
65 | reward_height = -3.0 * np.square(obs[:, 0] - 0.57)
66 | reward = reward_run + reward_ctrl + reward_height + 1.0
67 | return -reward
68 |
69 |
--------------------------------------------------------------------------------
/mbpo_pytorch/models/utils.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 |
7 |
8 | class MLP(nn.Module, ABC):
9 | def __init__(self, input_dim, output_dim, hidden_dims, activation='tanh', last_activation='identity', biases=None):
10 | super(MLP, self).__init__()
11 | sizes_list = hidden_dims.copy()
12 | self.activation = getattr(torch, activation)
13 | self.last_activation = getattr(torch, last_activation)
14 | sizes_list.insert(0, input_dim)
15 | biases = [True] * len(sizes_list) if biases is None else biases.copy()
16 |
17 | layers = []
18 | if 1 < len(sizes_list):
19 | for i in range(len(sizes_list) - 1):
20 | layers.append(nn.Linear(sizes_list[i], sizes_list[i + 1], bias=biases[i]))
21 | self.last_layer = nn.Linear(sizes_list[-1], output_dim)
22 | self.layers = nn.ModuleList(layers)
23 |
24 | def forward(self, x):
25 | for layer in self.layers:
26 | x = layer(x)
27 | x = self.activation(x)
28 | x = self.last_layer(x)
29 | x = self.last_activation(x)
30 | return x
31 |
32 | def init(self, init_fn, last_init_fn):
33 | for layer in self.layers:
34 | init_fn(layer)
35 | last_init_fn(self.last_layer)
36 |
37 |
38 | def soft_update(source_model: nn.Module, target_model: nn.Module, tau):
39 | for target_param, param in zip(target_model.parameters(), source_model.parameters()):
40 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)
41 |
42 |
43 | def copy_model_params_from_to(source, target):
44 | for target_param, param in zip(target.parameters(), source.parameters()):
45 | target_param.data.copy_(param.data)
46 |
47 |
48 | def init(module, weight_init=None, bias_init=None):
49 | if weight_init:
50 | weight_init(module.weight.data)
51 | if bias_init:
52 | bias_init(module.bias.data)
53 |
54 |
55 | def get_flat_params(model):
56 | params = []
57 | for param in model.parameters():
58 | params.append(param.data.view(-1))
59 |
60 | flat_params = torch.cat(params)
61 | return flat_params
62 |
63 |
64 | def set_flat_params(model, flat_params):
65 | prev_ind = 0
66 | for param in model.parameters():
67 | flat_size = int(np.prod(list(param.size())))
68 | param.data.copy_(
69 | flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
70 | prev_ind += flat_size
71 |
72 |
73 | def get_flat_grad(net, grad_grad=False):
74 | grads = []
75 | for param in net.parameters():
76 | if grad_grad:
77 | grads.append(param.grad.grad.view(-1))
78 | else:
79 | grads.append(param.grad.view(-1))
80 |
81 | flat_grad = torch.cat(grads)
82 | return flat_grad
83 |
84 |
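A small sketch of the pattern these helpers support: an online network and a target network kept in sync by Polyak averaging. This is illustrative only, not lifted from the training code, and it assumes the `torch.identity`/`torch.swish` patches from `models/__init__.py` are in place so the default `last_activation='identity'` resolves.

import torch
from mbpo_pytorch.models.utils import MLP, soft_update, copy_model_params_from_to

online = MLP(input_dim=4, output_dim=1, hidden_dims=[256, 256], activation='relu')
target = MLP(input_dim=4, output_dim=1, hidden_dims=[256, 256], activation='relu')
copy_model_params_from_to(online, target)   # hard copy once at initialization
q = online(torch.randn(8, 4))               # -> shape [8, 1]
soft_update(online, target, tau=5e-3)       # target <- (1 - tau) * target + tau * online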
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_cheetahO001.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self, frame_skip=5):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | mujoco_env.MujocoEnv.__init__(
16 | self, '%s/assets/half_cheetah.xml' % dir_path, frame_skip=frame_skip
17 | )
18 | utils.EzPickle.__init__(self)
19 |
20 | def step(self, action):
21 | start_ob = self._get_obs()
22 | reward_run = start_ob[8]
23 |
24 | self.do_simulation(action, self.frame_skip)
25 | ob = self._get_obs()
26 | if getattr(self, 'action_space', None):
27 | action = np.clip(action, self.action_space.low,
28 | self.action_space.high)
29 | reward_ctrl = -0.1 * np.square(action).sum()
30 |
31 | reward = reward_run + reward_ctrl
32 | done = False
33 | ob += np.random.uniform(low=-0.01, high=0.01, size=ob.shape)
34 | return ob, reward, done, {}
35 |
36 | def _get_obs(self):
37 | return np.concatenate([
38 | self.sim.data.qpos.flat[1:],
39 | self.sim.data.qvel.flat,
40 | ])
41 |
42 | def reset_model(self):
43 | qpos = self.init_qpos + \
44 | self.np_random.uniform(low=-.1, high=.1, size=self.model.nq)
45 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
46 | self.set_state(qpos, qvel)
47 | return self._get_obs()
48 |
49 | def viewer_setup(self):
50 | self.viewer.cam.distance = self.model.stat.extent * 0.5
51 |
52 | def cost_np_vec(self, obs, acts, next_obs):
53 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
54 | reward_run = obs[:, 8]
55 | reward = reward_run + reward_ctrl
56 | return -reward
57 |
58 | def cost_tf_vec(self, obs, acts, next_obs):
59 | raise NotImplementedError
60 | """
61 | reward_ctrl = -0.1 * tf.reduce_sum(tf.square(acts), axis=1)
62 | reward_run = next_obs[:, 0]
63 | reward = reward_run + reward_ctrl
64 | return -reward
65 | """
66 |
67 | def mb_step(self, states, actions, next_states):
68 | # returns rewards and dones
69 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
70 | if getattr(self, 'action_space', None):
71 | actions = np.clip(actions, self.action_space.low,
72 | self.action_space.high)
73 | rewards = - self.cost_np_vec(states, actions, next_states)
74 | return rewards, np.zeros_like(rewards, dtype=np.bool)
75 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/swimmer.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self, frame_skip=4):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | mujoco_env.MujocoEnv.__init__(
16 | self, '%s/assets/swimmer.xml' % dir_path, frame_skip=frame_skip
17 | )
18 | utils.EzPickle.__init__(self)
19 |
20 | def step(self, action):
21 | old_ob = self._get_obs()
22 | self.do_simulation(action, self.frame_skip)
23 |
24 | if getattr(self, 'action_space', None):
25 | action = np.clip(action, self.action_space.low,
26 | self.action_space.high)
27 | ob = self._get_obs()
28 |
29 | reward_ctrl = -0.0001 * np.square(action).sum()
30 | reward_run = old_ob[3]
31 | reward = reward_run + reward_ctrl
32 |
33 | done = False
34 | return ob, reward, done, {}
35 |
36 | def _get_obs(self):
37 | return np.concatenate([
38 | # (self.sim.data.qpos.flat[:1] - self.prev_qpos[:1]) / self.dt,
39 | # self.get_body_comvel("torso")[:1],
40 | self.sim.data.qpos.flat[2:],
41 | self.sim.data.qvel.flat,
42 | ])
43 |
44 | def mb_step(self, states, actions, next_states):
45 | # returns rewards and dones
46 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
47 | if getattr(self, 'action_space', None):
48 | actions = np.clip(actions, self.action_space.low,
49 | self.action_space.high)
50 | rewards = - self.cost_np_vec(states, actions, next_states)
51 | return rewards, np.zeros_like(rewards, dtype=np.bool)
52 |
53 | def reset_model(self):
54 | self.set_state(
55 | self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq),
56 | self.init_qvel + self.np_random.uniform(low=-.1, high=.1, size=self.model.nv)
57 | )
58 | self.prev_qpos = np.copy(self.sim.data.qpos.flat)
59 | return self._get_obs()
60 |
61 | def cost_np_vec(self, obs, acts, next_obs):
62 | reward_ctrl = -0.0001 * np.sum(np.square(acts), axis=1)
63 | reward_run = obs[:, 3]
64 | reward = reward_run + reward_ctrl
65 | return -reward
66 |
67 | def cost_tf_vec(self, obs, acts, next_obs):
68 | """
69 | reward_ctrl = -0.0001 * tf.reduce_sum(tf.square(acts), axis=1)
70 | reward_run = next_obs[:, 0]
71 | reward = reward_run + reward_ctrl
72 | return -reward
73 | """
74 | raise NotImplementedError
75 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_fant.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self, frame_skip=5):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | mujoco_env.MujocoEnv.__init__(
16 | self, '%s/assets/ant.xml' % dir_path, frame_skip=frame_skip
17 | )
18 | utils.EzPickle.__init__(self)
19 |
20 | def step(self, action):
21 | old_ob = self._get_obs()
22 | self.do_simulation(action, self.frame_skip)
23 |
24 | if getattr(self, 'action_space', None):
25 | action = np.clip(action, self.action_space.low, self.action_space.high)
26 | ob = self._get_obs()
27 |
28 | reward_ctrl = -0.1 * np.square(action).sum()
29 | reward_run = old_ob[13]
30 | reward_height = -3.0 * np.square(old_ob[0] - 0.57)
31 |
32 | # the alive bonus
33 | height = ob[0]
34 | done = (height > 1.0) or (height < 0.2)
35 | alive_reward = float(not done)
36 |
37 | reward = reward_run + reward_ctrl + reward_height + alive_reward
38 | return ob, reward, done, {}
39 |
40 | def _get_obs(self):
41 | return np.concatenate([
42 | self.sim.data.qpos.flat[2:],
43 | self.sim.data.qvel.flat,
44 | ])
45 |
46 | def reset_model(self):
47 | qpos = self.init_qpos + \
48 | self.np_random.uniform(size=self.model.nq, low=-.1, high=.1)
49 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
50 | self.set_state(qpos, qvel)
51 | # self.prev_qpos = np.copy(self.sim.data.qpos.flat)
52 | return self._get_obs()
53 |
54 | def viewer_setup(self):
55 | self.viewer.cam.distance = self.model.stat.extent * 0.5
56 |
57 | def cost_np_vec(self, obs, acts, next_obs):
58 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
59 | reward_run = obs[:, 13]
60 | reward_height = -3.0 * np.square(obs[:, 0] - 0.57)
61 |
62 | height = next_obs[:, 0]
63 | done = np.logical_or((height > 1.0), (height < 0.2))
64 | alive_reward = 1.0 - np.array(done, dtype=np.float)
65 |
66 | reward = reward_run + reward_ctrl + reward_height + alive_reward
67 | return -reward
68 |
69 | def mb_step(self, states, actions, next_states):
70 | if getattr(self, 'action_space', None):
71 | actions = np.clip(actions, self.action_space.low,
72 | self.action_space.high)
73 | rewards = - self.cost_np_vec(states, actions, next_states)
74 | height = next_states[:, 0]
75 | done = np.logical_or((height > 1.0), (height < 0.2))
76 | return rewards, done
77 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_cheetahA003.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self, frame_skip=5):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | mujoco_env.MujocoEnv.__init__(
16 | self, '%s/assets/half_cheetah.xml' % dir_path, frame_skip=frame_skip
17 | )
18 | utils.EzPickle.__init__(self)
19 |
20 | def step(self, action):
21 | action = np.array(action)
22 | action += np.random.uniform(low=-0.03, high=0.03, size=action.shape)
23 | start_ob = self._get_obs()
24 | reward_run = start_ob[8]
25 |
26 | self.do_simulation(action, self.frame_skip)
27 | ob = self._get_obs()
28 | if getattr(self, 'action_space', None):
29 | action = np.clip(action, self.action_space.low,
30 | self.action_space.high)
31 | reward_ctrl = -0.1 * np.square(action).sum()
32 |
33 | reward = reward_run + reward_ctrl
34 | done = False
35 | return ob, reward, done, {}
36 |
37 | def _get_obs(self):
38 | return np.concatenate([
39 | self.sim.data.qpos.flat[1:],
40 | self.sim.data.qvel.flat,
41 | ])
42 |
43 | def reset_model(self):
44 | qpos = self.init_qpos + \
45 | self.np_random.uniform(low=-.1, high=.1, size=self.model.nq)
46 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
47 | self.set_state(qpos, qvel)
48 | return self._get_obs()
49 |
50 | def viewer_setup(self):
51 | self.viewer.cam.distance = self.model.stat.extent * 0.5
52 |
53 | def cost_np_vec(self, obs, acts, next_obs):
54 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
55 | reward_run = obs[:, 8]
56 | reward = reward_run + reward_ctrl
57 | return -reward
58 |
59 | def cost_tf_vec(self, obs, acts, next_obs):
60 | raise NotImplementedError
61 | """
62 | reward_ctrl = -0.1 * tf.reduce_sum(tf.square(acts), axis=1)
63 | reward_run = next_obs[:, 0]
64 | reward = reward_run + reward_ctrl
65 | return -reward
66 | """
67 |
68 | def mb_step(self, states, actions, next_states):
69 | # returns rewards and dones
70 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
71 | if getattr(self, 'action_space', None):
72 | actions = np.clip(actions, self.action_space.low,
73 | self.action_space.high)
74 | rewards = - self.cost_np_vec(states, actions, next_states)
75 | return rewards, np.zeros_like(rewards, dtype=np.bool)
76 |
77 |
78 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_cheetahA01.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self, frame_skip=5):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | mujoco_env.MujocoEnv.__init__(
16 | self, '%s/assets/half_cheetah.xml' % dir_path, frame_skip=frame_skip
17 | )
18 | utils.EzPickle.__init__(self)
19 |
20 | def step(self, action):
21 | action = np.array(action)
22 | action += np.random.uniform(low=-0.1, high=0.1, size=action.shape)
23 | start_ob = self._get_obs()
24 | reward_run = start_ob[8]
25 |
26 | self.do_simulation(action, self.frame_skip)
27 | ob = self._get_obs()
28 | if getattr(self, 'action_space', None):
29 | action = np.clip(action, self.action_space.low,
30 | self.action_space.high)
31 | reward_ctrl = -0.1 * np.square(action).sum()
32 |
33 | reward = reward_run + reward_ctrl
34 | done = False
35 | return ob, reward, done, {}
36 |
37 | def _get_obs(self):
38 | return np.concatenate([
39 | self.sim.data.qpos.flat[1:],
40 | self.sim.data.qvel.flat,
41 | ])
42 |
43 | def reset_model(self):
44 | qpos = self.init_qpos + \
45 | self.np_random.uniform(low=-.1, high=.1, size=self.model.nq)
46 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
47 | self.set_state(qpos, qvel)
48 | return self._get_obs()
49 |
50 | def viewer_setup(self):
51 | self.viewer.cam.distance = self.model.stat.extent * 0.5
52 |
53 | def cost_np_vec(self, obs, acts, next_obs):
54 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
55 | reward_run = obs[:, 8]
56 | reward = reward_run + reward_ctrl
57 | return -reward
58 |
59 | def cost_tf_vec(self, obs, acts, next_obs):
60 | raise NotImplementedError
61 | """
62 | reward_ctrl = -0.1 * tf.reduce_sum(tf.square(acts), axis=1)
63 | reward_run = next_obs[:, 0]
64 | reward = reward_run + reward_ctrl
65 | return -reward
66 | """
67 |
68 | def mb_step(self, states, actions, next_states):
69 | # returns rewards and dones
70 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
71 | if getattr(self, 'action_space', None):
72 | actions = np.clip(actions, self.action_space.low,
73 | self.action_space.high)
74 | rewards = - self.cost_np_vec(states, actions, next_states)
75 | return rewards, np.zeros_like(rewards, dtype=np.bool)
76 |
77 |
78 |
--------------------------------------------------------------------------------
/mbpo_pytorch/models/actor.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import List, Optional
4 |
5 | from .initializer import fanin_init
6 | from .utils import MLP
7 | from .actor_layer import *
8 |
9 |
10 | # noinspection DuplicatedCode
11 | class Actor(nn.Module, ABC):
12 | def __init__(self, state_dim: int, action_space, hidden_dims: List[int],
13 | state_normalizer: Optional[nn.Module], use_limited_entropy=False, use_tanh_squash=False,
14 | use_state_dependent_std=False, **kwargs):
15 | super(Actor, self).__init__()
16 | self.state_dim = state_dim
17 |         self.action_dim = action_space.shape[0]
18 | self.hidden_dims = hidden_dims
19 | self.use_limited_entropy = use_limited_entropy
20 | self.use_tanh_squash = use_tanh_squash
21 |
22 | mlp_kwargs = kwargs.copy()
23 | mlp_kwargs['activation'] = kwargs.get('activation', 'relu')
24 | mlp_kwargs['last_activation'] = kwargs.get('activation', 'relu')
25 |
26 | self.actor_feature = MLP(state_dim, hidden_dims[-1], hidden_dims[:-1], **mlp_kwargs)
27 |
28 | self.state_normalizer = state_normalizer or nn.Identity()
29 |
30 | self.actor_layer = TanhGaussainActorLayer(hidden_dims[-1], action_space.shape[0],
31 | use_state_dependent_std)
32 |
33 | def init_(m): init(m, fanin_init, lambda x: nn.init.constant_(x, 0))
34 | self.actor_feature.init(init_, init_)
35 |
36 | def act(self, state, deterministic=False, reparameterize=False):
37 | action_feature = self.actor_feature(state)
38 | action_dist, action_means, action_logstds = self.actor_layer(action_feature)
39 |
40 | log_probs = None
41 | pretanh_actions = None
42 |
43 | if deterministic:
44 | actions = action_means
45 | else:
46 | if reparameterize:
47 | result = action_dist.rsample()
48 | else:
49 | result = action_dist.sample()
50 | actions, pretanh_actions = result
51 | log_probs = action_dist.log_probs(actions, pretanh_actions)
52 |
53 | entropy = action_dist.entropy().mean()
54 |
55 | return {'actions': actions, 'log_probs': log_probs, 'entropy': entropy,
56 | 'action_means': action_means, 'action_logstds': action_logstds, 'pretanh_actions': pretanh_actions}
57 |
58 | def evaluate_actions(self, states, actions, pretanh_actions=None):
59 | states = self.state_normalizer(states)
60 |
61 | action_feature = self.actor_feature(states)
62 | action_dist, *_ = self.actor_layer(action_feature)
63 |
64 |         if pretanh_actions is not None:
65 | log_probs = action_dist.log_probs(actions, pretanh_actions)
66 | else:
67 | log_probs = action_dist.log_probs(actions)
68 |
69 | entropy = action_dist.entropy().mean()
70 |
71 | return {'log_probs': log_probs, 'entropy': entropy}
72 |
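A hedged sketch (illustrative, not part of the repository) of driving the actor: it is built from a gym-style Box action space and queried with a batch of states. The hidden sizes and dimensions are arbitrary, and the repository's MLP helper is assumed to accept its default keyword arguments.

# Hedged sketch: querying the Actor for actions and log-probabilities (illustrative dims).
import numpy as np
import torch
from gym.spaces import Box

from mbpo_pytorch.models.actor import Actor

action_space = Box(low=-1.0, high=1.0, shape=(6,), dtype=np.float32)
actor = Actor(state_dim=17, action_space=action_space, hidden_dims=[256, 256],
              state_normalizer=None)

states = torch.randn(32, 17)
out = actor.act(states, reparameterize=True)
actions = out['actions']      # [32, 6] actions (tanh-squashed by the actor layer)
log_probs = out['log_probs']  # [32, 1], includes the tanh change-of-variables correction

With deterministic=True, act returns action_means as the action and leaves log_probs as None.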
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/assets/hopper.xml:
--------------------------------------------------------------------------------
(MuJoCo hopper model XML; tag content not preserved in this listing)
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/pets_pusher.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class PusherEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self):
13 | dir_path = os.path.dirname(os.path.realpath(__file__))
14 | mujoco_env.MujocoEnv.__init__(self, '%s/assets/pusher.xml' % dir_path, 4)
15 | utils.EzPickle.__init__(self)
16 | self.reset_model()
17 |
18 | def step(self, a):
19 |         obj_pos = self.get_body_com("object")
20 | vec_1 = obj_pos - self.get_body_com("tips_arm")
21 | vec_2 = obj_pos - self.get_body_com("goal")
22 |
23 | reward_near = -np.sum(np.abs(vec_1))
24 | reward_dist = -np.sum(np.abs(vec_2))
25 | reward_ctrl = -np.square(a).sum()
26 | reward = 1.25 * reward_dist + 0.1 * reward_ctrl + 0.5 * reward_near
27 |
28 | self.do_simulation(a, self.frame_skip)
29 | ob = self._get_obs()
30 | done = False
31 | return ob, reward, done, {}
32 |
33 | def viewer_setup(self):
34 | self.viewer.cam.trackbodyid = -1
35 | self.viewer.cam.distance = 4.0
36 |
37 | def reset_model(self):
38 | qpos = self.init_qpos
39 |
40 | self.goal_pos = np.asarray([0, 0])
41 | self.cylinder_pos = np.array([-0.25, 0.15]) + np.random.normal(0, 0.025, [2])
42 |
43 | qpos[-4:-2] = self.cylinder_pos
44 | qpos[-2:] = self.goal_pos
45 | qvel = self.init_qvel + \
46 | self.np_random.uniform(low=-0.005, high=0.005, size=self.model.nv)
47 | qvel[-4:] = 0
48 | self.set_state(qpos, qvel)
49 | self.ac_goal_pos = self.get_body_com("goal")
50 |
51 | return self._get_obs()
52 |
53 | def _get_obs(self):
54 | return np.concatenate([
55 | self.sim.data.qpos.flat[:7],
56 | self.sim.data.qvel.flat[:7],
57 | self.get_body_com("tips_arm"),
58 | self.get_body_com("object"),
59 | self.get_body_com("goal"),
60 | ])
61 |
62 | def cost_np_vec(self, obs, acts, next_obs):
63 | """
64 | to_w, og_w = 0.5, 1.25
65 | tip_pos, obj_pos, goal_pos = obs[:, 14:17], obs[:, 17:20], obs[:, -3:]
66 |
67 | tip_obj_dist = np.sum(np.abs(tip_pos - obj_pos), axis=1)
68 | obj_goal_dist = np.sum(np.abs(goal_pos - obj_pos), axis=1)
69 | return to_w * tip_obj_dist + og_w * obj_goal_dist
70 |
71 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
72 | reward_run = obs[:, 8]
73 | reward = reward_run + reward_ctrl
74 | """
75 | to_w, og_w = 0.5, 1.25
76 | tip_pos, obj_pos, goal_pos = obs[:, 14:17], obs[:, 17:20], obs[:, -3:]
77 |
78 | tip_obj_dist = -np.sum(np.abs(tip_pos - obj_pos), axis=1)
79 | obj_goal_dist = -np.sum(np.abs(goal_pos - obj_pos), axis=1)
80 | ctrl_reward = -0.1 * np.sum(np.square(acts), axis=1)
81 |
82 | reward = to_w * tip_obj_dist + og_w * obj_goal_dist + ctrl_reward
83 | return -reward
84 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/walker2d.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self, frame_skip=4):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | mujoco_env.MujocoEnv.__init__(
16 | self, '%s/assets/walker2d.xml' % dir_path, frame_skip=frame_skip
17 | )
18 | utils.EzPickle.__init__(self)
19 |
20 | def step(self, action):
21 | old_ob = self._get_obs()
22 | self.do_simulation(action, self.frame_skip)
23 | ob = self._get_obs()
24 |
25 | if getattr(self, 'action_space', None):
26 | action = np.clip(action, self.action_space.low,
27 | self.action_space.high)
28 |
29 | reward_ctrl = -0.1 * np.square(action).sum()
30 | reward_run = old_ob[8]
31 | reward_height = -3.0 * np.square(old_ob[0] - 1.3)
32 | reward = reward_run + reward_ctrl + reward_height + 1.0
33 |
34 | done = False
35 | return ob, reward, done, {}
36 |
37 | def _get_obs(self):
38 | return np.concatenate([
39 | self.sim.data.qpos.flat[1:],
40 | self.sim.data.qvel.flat
41 | ])
42 |
43 | def mb_step(self, states, actions, next_states):
44 | # returns rewards and dones
45 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
46 | if getattr(self, 'action_space', None):
47 | actions = np.clip(actions, self.action_space.low,
48 | self.action_space.high)
49 | rewards = - self.cost_np_vec(states, actions, next_states)
50 | return rewards, np.zeros_like(rewards, dtype=np.bool)
51 |
52 | def reset_model(self):
53 | self.set_state(
54 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq),
55 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv)
56 | )
57 | self.prev_qpos = np.copy(self.sim.data.qpos.flat)
58 | return self._get_obs()
59 |
60 | def viewer_setup(self):
61 | self.viewer.cam.trackbodyid = 2
62 | self.viewer.cam.distance = self.model.stat.extent * 0.5
63 | self.viewer.cam.lookat[2] += .8
64 | self.viewer.cam.elevation = -20
65 |
66 | def cost_np_vec(self, obs, acts, next_obs):
67 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
68 | reward_run = obs[:, 8]
69 | reward_height = -3.0 * np.square(obs[:, 0] - 1.3)
70 | reward = reward_run + reward_ctrl + reward_height + 1.0
71 | return -reward
72 |
73 | def cost_tf_vec(self, obs, acts, next_obs):
74 | """
75 | reward_ctrl = -0.1 * tf.reduce_sum(tf.square(acts), axis=1)
76 | reward_run = next_obs[:, 0]
77 | # reward_height = -3.0 * tf.square(next_obs[:, 1] - 1.3)
78 | reward = reward_run + reward_ctrl
79 | return -reward
80 | """
81 | raise NotImplementedError
82 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_nostopslimhumanoid.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco import mujoco_env
3 | from gym import utils
4 |
5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
6 |
7 |
8 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
9 |
10 | def __init__(self):
11 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5)
12 | utils.EzPickle.__init__(self)
13 |
14 | def _get_obs(self):
15 | data = self.sim.data
16 | return np.concatenate([data.qpos.flat[2:],
17 | data.qvel.flat])
18 |
19 | def step(self, a):
20 | data = self.sim.data
21 | action = a
22 | if getattr(self, 'action_space', None):
23 | action = np.clip(a, self.action_space.low,
24 | self.action_space.high)
25 | qpos = self.sim.data.qpos
26 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0))
27 |
28 | # reward
29 | alive_bonus = 5 * (1 - float(done))
30 | lin_vel_cost = 0.25 / 0.015 * data.qvel.flat[0]
31 | quad_ctrl_cost = 0.1 * np.square(action).sum()
32 | quad_impact_cost = 0.0
33 |
34 | self.do_simulation(action, self.frame_skip)
35 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus
36 | done = False
37 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost,
38 | reward_alive=alive_bonus, reward_impact=-quad_impact_cost)
39 |
40 | def reset_model(self):
41 | c = 0.01
42 | self.set_state(
43 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
44 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,)
45 | )
46 | return self._get_obs()
47 |
48 | def viewer_setup(self):
49 | self.viewer.cam.trackbodyid = 1
50 | self.viewer.cam.distance = self.model.stat.extent * 1.0
51 | self.viewer.cam.lookat[2] += .8
52 | self.viewer.cam.elevation = -20
53 |
54 | def cost_np_vec(self, obs, acts, next_obs):
55 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
56 | reward_run = 0.25 / 0.015 * obs[:, 22]
57 |
58 | quad_impact_cost = 0.0
59 |
60 | height = next_obs[:, 0]
61 | done = np.logical_or((height > 2.0), (height < 1.0))
62 | alive_reward = 5 * (1.0 - np.array(done, dtype=np.float))
63 |
64 | reward = reward_run + reward_ctrl + (-quad_impact_cost) + alive_reward
65 | return -reward
66 |
67 | def cost_tf_vec(self, obs, acts, next_obs):
68 | raise NotImplementedError
69 |
70 | def mb_step(self, states, actions, next_states):
71 | # returns rewards and dones
72 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
73 | if getattr(self, 'action_space', None):
74 | actions = np.clip(actions, self.action_space.low,
75 | self.action_space.high)
76 | rewards = - self.cost_np_vec(states, actions, next_states)
77 | return rewards, np.zeros_like(rewards, dtype=np.bool)
78 |
79 |
80 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/hopper.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self, frame_skip=4):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | mujoco_env.MujocoEnv.__init__(
16 | self, '%s/assets/hopper.xml' % dir_path, frame_skip=frame_skip
17 | )
18 | utils.EzPickle.__init__(self)
19 |
20 | def step(self, action: np.ndarray):
21 | old_ob = self._get_obs()
22 | self.do_simulation(action, self.frame_skip)
23 | ob = self._get_obs()
24 |
25 | if getattr(self, 'action_space', None):
26 | action = np.clip(action, self.action_space.low,
27 | self.action_space.high)
28 |
29 | reward_ctrl = -0.1 * np.square(action).sum()
30 | reward_run = old_ob[5]
31 | reward_height = -3.0 * np.square(old_ob[0] - 1.3)
32 | reward = reward_run + reward_ctrl + reward_height + 1.0
33 |
34 | done = False
35 | return ob, reward, done, {}
36 |
37 | def _get_obs(self):
38 | return np.concatenate([
39 | self.sim.data.qpos.flat[1:],
40 | self.sim.data.qvel.flat,
41 | ])
42 |
43 | def mb_step(self, states, actions, next_states):
44 | # returns rewards and dones
45 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
46 | if getattr(self, 'action_space', None):
47 | actions = np.clip(actions, self.action_space.low,
48 | self.action_space.high)
49 | rewards = - self.cost_np_vec(states, actions, next_states)
50 | return rewards, np.zeros_like(rewards, dtype=np.bool)
51 |
52 | def reset_model(self):
53 | self.set_state(
54 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq),
55 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv)
56 | )
57 | self.prev_qpos = np.copy(self.sim.data.qpos.flat)
58 | return self._get_obs()
59 |
60 | def viewer_setup(self):
61 | self.viewer.cam.trackbodyid = 2
62 | self.viewer.cam.distance = self.model.stat.extent * 0.75
63 | self.viewer.cam.lookat[2] += .8
64 | self.viewer.cam.elevation = -20
65 |
66 | def cost_np_vec(self, obs, acts, next_obs):
67 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
68 | reward_run = obs[:, 5]
69 | reward_height = -3.0 * np.square(obs[:, 0] - 1.3)
70 | reward = reward_run + reward_ctrl + reward_height + 1.0
71 | return -reward
72 |
73 | def cost_tf_vec(self, obs, acts, next_obs):
74 | """
75 | reward_ctrl = -0.1 * tf.reduce_sum(tf.square(acts), axis=1)
76 | reward_run = next_obs[:, 0]
77 | # reward_height = -3.0 * tf.square(next_obs[:, 1] - 1.3)
78 | reward = reward_run + reward_ctrl
79 | return -reward
80 | """
81 | raise NotImplementedError
82 |
--------------------------------------------------------------------------------
/mbpo_pytorch/thirdparty/util.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | import gym
4 | import numpy as np
5 |
6 |
7 | def copy_obs_dict(obs):
8 | """
9 | Deep-copy a dict of numpy arrays.
10 |
11 | :param obs: (OrderedDict): a dict of numpy arrays.
12 | :return (OrderedDict) a dict of copied numpy arrays.
13 | """
14 | assert isinstance(obs, OrderedDict), "unexpected type for observations '{}'".format(type(obs))
15 | return OrderedDict([(k, np.copy(v)) for k, v in obs.items()])
16 |
17 |
18 | def dict_to_obs(space, obs_dict):
19 | """
20 |     Convert an internal observation representation (obs_dict) into the appropriate type
21 | specified by space.
22 |
23 | :param space: (gym.spaces.Space) an observation space.
24 | :param obs_dict: (OrderedDict) a dict of numpy arrays.
25 | :return (ndarray, tuple or dict): returns an observation
26 | of the same type as space. If space is Dict, function is identity;
27 | if space is Tuple, converts dict to Tuple; otherwise, space is
28 |     unstructured and returns the value obs_dict[None].
29 | """
30 | if isinstance(space, gym.spaces.Dict):
31 | return obs_dict
32 | elif isinstance(space, gym.spaces.Tuple):
33 | assert len(obs_dict) == len(space.spaces), "size of observation does not match size of observation space"
34 | return tuple((obs_dict[i] for i in range(len(space.spaces))))
35 | else:
36 | assert set(obs_dict.keys()) == {None}, "multiple observation keys for unstructured observation space"
37 | return obs_dict[None]
38 |
39 |
40 | def obs_space_info(obs_space):
41 | """
42 | Get dict-structured information about a gym.Space.
43 |
44 | Dict spaces are represented directly by their dict of subspaces.
45 | Tuple spaces are converted into a dict with keys indexing into the tuple.
46 | Unstructured spaces are represented by {None: obs_space}.
47 |
48 | :param obs_space: (gym.spaces.Space) an observation space
49 | :return (tuple) A tuple (keys, shapes, dtypes):
50 | keys: a list of dict keys.
51 | shapes: a dict mapping keys to shapes.
52 | dtypes: a dict mapping keys to dtypes.
53 | """
54 | if isinstance(obs_space, gym.spaces.Dict):
55 | assert isinstance(obs_space.spaces, OrderedDict), "Dict space must have ordered subspaces"
56 | subspaces = obs_space.spaces
57 | elif isinstance(obs_space, gym.spaces.Tuple):
58 | subspaces = {i: space for i, space in enumerate(obs_space.spaces)}
59 | else:
60 | assert not hasattr(obs_space, 'spaces'), "Unsupported structured space '{}'".format(type(obs_space))
61 | subspaces = {None: obs_space}
62 | keys = []
63 | shapes = {}
64 | dtypes = {}
65 | for key, box in subspaces.items():
66 | keys.append(key)
67 | shapes[key] = box.shape
68 | dtypes[key] = box.dtype
69 | return keys, shapes, dtypes
70 |
71 |
72 | def mpi_rank_or_zero():
73 | """
74 | Return the MPI rank if mpi is installed. Otherwise, return 0.
75 | :return: (int)
76 | """
77 | try:
78 | import mpi4py
79 | return mpi4py.MPI.COMM_WORLD.Get_rank()
80 | except (ImportError, AttributeError) as _:
81 | return 0
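A hedged sketch (illustrative, not part of the repository): obs_space_info normalizes the three supported kinds of observation space into (keys, shapes, dtypes). The spaces below are made up for the example.

# Hedged sketch: obs_space_info on an unstructured Box and on a Dict space.
from collections import OrderedDict

import gym
import numpy as np

from mbpo_pytorch.thirdparty.util import obs_space_info

box = gym.spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
keys, shapes, dtypes = obs_space_info(box)
# keys == [None]; shapes[None] == (4,); dtypes[None] is the float32 dtype

dict_space = gym.spaces.Dict(OrderedDict(pos=box, vel=box))
keys, shapes, dtypes = obs_space_info(dict_space)
# keys == ['pos', 'vel'], each mapping to shape (4,) and the float32 dtype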
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_slimhumanoid.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco import mujoco_env
3 | from gym import utils
4 |
5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
6 |
7 |
8 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
9 |
10 | def __init__(self):
11 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5)
12 | utils.EzPickle.__init__(self)
13 |
14 | def _get_obs(self):
15 | data = self.sim.data
16 | return np.concatenate([data.qpos.flat[2:],
17 | data.qvel.flat])
18 |
19 | def step(self, a):
20 | data = self.sim.data
21 | action = a
22 | if getattr(self, 'action_space', None):
23 | action = np.clip(a, self.action_space.low,
24 | self.action_space.high)
25 |
26 | # reward
27 | alive_bonus = 5.0
28 | lin_vel_cost = 0.25 / 0.015 * data.qvel.flat[0]
29 | quad_ctrl_cost = 0.1 * np.square(action).sum()
30 | quad_impact_cost = 0.0
31 |
32 | self.do_simulation(action, self.frame_skip)
33 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus
34 | qpos = self.sim.data.qpos
35 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0))
36 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost,
37 | reward_alive=alive_bonus, reward_impact=-quad_impact_cost)
38 |
39 | def reset_model(self):
40 | c = 0.01
41 | self.set_state(
42 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
43 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,)
44 | )
45 | return self._get_obs()
46 |
47 | def viewer_setup(self):
48 | self.viewer.cam.trackbodyid = 1
49 | self.viewer.cam.distance = self.model.stat.extent * 1.0
50 | self.viewer.cam.lookat[2] += .8
51 | self.viewer.cam.elevation = -20
52 |
53 | def cost_np_vec(self, obs, acts, next_obs):
54 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
55 | reward_run = 0.25 / 0.015 * obs[:, 22]
56 |
57 | quad_impact_cost = 0.0
58 |
59 | height = next_obs[:, 0]
60 | done = np.logical_or((height > 2.0), (height < 1.0))
61 | alive_reward = 5 * (1.0 - np.array(done, dtype=np.float))
62 |
63 | reward = reward_run + reward_ctrl + (-quad_impact_cost) + alive_reward
64 | return -reward
65 |
66 | def cost_tf_vec(self, obs, acts, next_obs):
67 | raise NotImplementedError
68 |
69 | def mb_step(self, states, actions, next_states):
70 | # returns rewards and dones
71 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
72 | if getattr(self, 'action_space', None):
73 | actions = np.clip(actions, self.action_space.low,
74 | self.action_space.high)
75 | rewards = - self.cost_np_vec(states, actions, next_states)
76 | height = next_states[:, 0]
77 | done = np.logical_or((height > 2.0), (height < 1.0))
78 | return rewards, done
79 |
80 |
81 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/benchmarking_envs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from .gym.half_cheetah import HalfCheetahEnv
4 | from .gym.walker2d import Walker2dEnv
5 | from .gym.ant import AntEnv
6 | from .gym.hopper import HopperEnv
7 | from .gym.swimmer import SwimmerEnv
8 | from .gym.reacher import ReacherEnv
9 | from .gym.pendulum import PendulumEnv
10 | from .gym.inverted_pendulum import InvertedPendulumEnv
11 | from .gym.acrobot import AcrobotEnv
12 | from .gym.cartpole import CartPoleEnv
13 | from .gym.mountain_car import Continuous_MountainCarEnv
14 | from .gym.gym_ohalfcheetah import OriginalHalfCheetahEnv
15 | from .gym.gym_oant import OriginalAntEnv
16 | from .gym.gym_owalker import OriginalWalkerEnv
17 | from .gym.gym_oswimmer import OriginalSwimmerEnv
18 | from .gym.gym_ohopper import OriginalHopperEnv
19 | from .gym.gym_ohumanoid import OriginalHumanoidEnv
20 | from .gym import gym_fswimmer
21 | from .gym import gym_fwalker2d
22 | from .gym import gym_fhopper
23 | from .gym import gym_fant
24 | from .gym import gym_cheetahA01
25 | from .gym import gym_cheetahA003
26 | from .gym import gym_cheetahO01
27 | from .gym import gym_cheetahO001
28 | from .gym import gym_pendulumO01
29 | from .gym import gym_pendulumO001
30 | from .gym import gym_cartpoleO01
31 | from .gym import gym_cartpoleO001
32 | from .gym import gym_humanoid
33 | from .gym import gym_nostopslimhumanoid
34 | from .gym import gym_slimhumanoid
35 |
36 |
37 | def make_benchmarking_env(env_id: str):
38 | envs = {
39 | 'OriginalHalfCheetah': OriginalHalfCheetahEnv,
40 | 'OriginalAnt': OriginalAntEnv,
41 | 'OriginalWalker': OriginalWalkerEnv,
42 | 'OriginalSwimmer': OriginalSwimmerEnv,
43 | 'OriginalHumanoid': OriginalHumanoidEnv,
44 | 'OriginalHopper': OriginalHopperEnv,
45 |
46 | 'HalfCheetah': HalfCheetahEnv,
47 | 'Walker2D': Walker2dEnv,
48 | 'Ant': AntEnv,
49 | 'Hopper': HopperEnv,
50 | 'Swimmer': SwimmerEnv,
51 | 'FixedSwimmer': gym_fswimmer.fixedSwimmerEnv,
52 | 'FixedWalker': gym_fwalker2d.Walker2dEnv,
53 | 'FixedHopper': gym_fhopper.HopperEnv,
54 | 'FixedAnt': gym_fant.AntEnv,
55 | 'Reacher': ReacherEnv,
56 | 'Pendulum': PendulumEnv,
57 | 'InvertedPendulum': InvertedPendulumEnv,
58 | 'Acrobot': AcrobotEnv,
59 | 'CartPole': CartPoleEnv,
60 | 'MountainCar': Continuous_MountainCarEnv,
61 |
62 | 'HalfCheetahO01': gym_cheetahO01.HalfCheetahEnv,
63 | 'HalfCheetahO001': gym_cheetahO001.HalfCheetahEnv,
64 | 'HalfCheetahA01': gym_cheetahA01.HalfCheetahEnv,
65 | 'HalfCheetahA003': gym_cheetahA003.HalfCheetahEnv,
66 |
67 | 'PendulumO01': gym_pendulumO01.PendulumEnv,
68 | 'PendulumO001': gym_pendulumO001.PendulumEnv,
69 |
70 | 'CartPoleO01': gym_cartpoleO01.CartPoleEnv,
71 | 'CartPoleO001': gym_cartpoleO001.CartPoleEnv,
72 |
73 | 'gym_humanoid': gym_humanoid.HumanoidEnv,
74 | 'gym_slimhumanoid': gym_slimhumanoid.HumanoidEnv,
75 | 'gym_nostopslimhumanoid': gym_nostopslimhumanoid.HumanoidEnv,
76 | }
77 | env = envs[env_id]()
78 | if not hasattr(env, 'reward_range'):
79 | env.reward_range = (-np.inf, np.inf)
80 | if not hasattr(env, 'metadata'):
81 | env.metadata = {}
82 | return env
83 |
84 |
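A hedged usage sketch (illustrative, not part of the repository): environments are looked up by plain string keys, so construction is a dictionary access followed by the usual gym interface. The locomotion tasks require a working MuJoCo installation.

# Hedged sketch: constructing a benchmarking environment by name.
from mbpo_pytorch.envs.benchmarking_envs.benchmarking_envs import make_benchmarking_env

env = make_benchmarking_env('HalfCheetah')
ob = env.reset()
next_ob, reward, done, info = env.step(env.action_space.sample())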
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_fhopper.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | # noinspection DuplicatedCode
11 | class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
12 |
13 | def __init__(self, frame_skip=4):
14 | self.prev_qpos = None
15 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
16 | mujoco_env.MujocoEnv.__init__(
17 | self, '%s/assets/hopper.xml' % dir_path, frame_skip=frame_skip
18 | )
19 | utils.EzPickle.__init__(self)
20 |
21 | def step(self, action):
22 | old_ob = self._get_obs()
23 | self.do_simulation(action, self.frame_skip)
24 | ob = self._get_obs()
25 |
26 | if getattr(self, 'action_space', None):
27 | action = np.clip(action, self.action_space.low,
28 | self.action_space.high)
29 |
30 | reward_ctrl = -0.1 * np.square(action).sum()
31 | reward_run = old_ob[5]
32 | reward_height = -3.0 * np.square(old_ob[0] - 1.3)
33 | height, ang = ob[0], ob[1]
34 | done = (height <= 0.7) or (abs(ang) >= 0.2)
35 | alive_reward = float(not done)
36 | reward = reward_run + reward_ctrl + reward_height + alive_reward
37 |
38 | return ob, reward, done, {}
39 |
40 | def _get_obs(self):
41 | return np.concatenate([
42 | self.sim.data.qpos.flat[1:],
43 | self.sim.data.qvel.flat,
44 | ])
45 |
46 | def reset_model(self):
47 | self.set_state(
48 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq),
49 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv)
50 | )
51 | self.prev_qpos = np.copy(self.sim.data.qpos.flat)
52 | return self._get_obs()
53 |
54 | def viewer_setup(self):
55 | self.viewer.cam.trackbodyid = 2
56 | self.viewer.cam.distance = self.model.stat.extent * 0.75
57 | self.viewer.cam.lookat[2] += .8
58 | self.viewer.cam.elevation = -20
59 |
60 | def cost_np_vec(self, obs, acts, next_obs):
61 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
62 | reward_run = obs[:, 5]
63 | reward_height = -3.0 * np.square(obs[:, 0] - 1.3)
64 | height, ang = next_obs[:, 0], next_obs[:, 1]
65 | done = np.logical_or(height <= 0.7, abs(ang) >= 0.2)
66 | alive_reward = 1.0 - np.array(done, dtype=np.float)
67 | reward = reward_run + reward_ctrl + reward_height + alive_reward
68 | return -reward
69 |
70 | def mb_step(self, states, actions, next_states):
71 | # returns rewards and dones
72 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
73 | if getattr(self, 'action_space', None):
74 | actions = np.clip(actions, self.action_space.low,
75 | self.action_space.high)
76 | rewards = - self.cost_np_vec(states, actions, next_states)
77 | heights, angs = next_states[:, 0], next_states[:, 1]
78 | dones = np.logical_or(heights <= 0.7, abs(angs) >= 0.2)
79 | return rewards, dones
80 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_fwalker2d.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self, frame_skip=4):
13 | self.prev_qpos = None
14 | dir_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 | mujoco_env.MujocoEnv.__init__(
16 | self, '%s/assets/walker2d.xml' % dir_path, frame_skip=frame_skip
17 | )
18 | utils.EzPickle.__init__(self)
19 |
20 | def step(self, action):
21 | old_ob = self._get_obs()
22 | self.do_simulation(action, self.frame_skip)
23 | ob = self._get_obs()
24 |
25 | if getattr(self, 'action_space', None):
26 | action = np.clip(action, self.action_space.low,
27 | self.action_space.high)
28 |
29 | reward_ctrl = -0.1 * np.square(action).sum()
30 | reward_run = old_ob[8]
31 | reward_height = -3.0 * np.square(old_ob[0] - 1.3)
32 |
33 | height, ang = ob[0], ob[1]
34 | done = (height >= 2.0) or (height <= 0.8) or (abs(ang) >= 1.0)
35 | alive_reward = float(not done)
36 |
37 | reward = reward_run + reward_ctrl + reward_height + alive_reward
38 | return ob, reward, done, {}
39 |
40 | def _get_obs(self):
41 | return np.concatenate([
42 | self.sim.data.qpos.flat[1:],
43 | self.sim.data.qvel.flat
44 | ])
45 |
46 | def reset_model(self):
47 | self.set_state(
48 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq),
49 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv)
50 | )
51 | self.prev_qpos = np.copy(self.sim.data.qpos.flat)
52 | return self._get_obs()
53 |
54 | def viewer_setup(self):
55 | self.viewer.cam.trackbodyid = 2
56 | self.viewer.cam.distance = self.model.stat.extent * 0.5
57 | self.viewer.cam.lookat[2] += .8
58 | self.viewer.cam.elevation = -20
59 |
60 | def cost_np_vec(self, obs, acts, next_obs):
61 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
62 | reward_run = obs[:, 8]
63 | reward_height = -3.0 * np.square(next_obs[:, 0] - 1.3)
64 | height, ang = next_obs[:, 0], next_obs[:, 1]
65 | done = np.logical_or(
66 | np.logical_or(height >= 2.0, height <= 0.8),
67 | np.abs(ang) >= 1.0
68 | )
69 | alive_reward = 1.0 - np.array(done, dtype=np.float)
70 | reward = reward_run + reward_ctrl + reward_height + alive_reward
71 | return -reward
72 |
73 | def mb_step(self, states, actions, next_states):
74 | if getattr(self, 'action_space', None):
75 | actions = np.clip(actions, self.action_space.low,
76 | self.action_space.high)
77 | rewards = - self.cost_np_vec(states, actions, next_states)
78 | height, ang = next_states[:, 0], next_states[:, 1]
79 | done = np.logical_or(
80 | np.logical_or(height >= 2.0, height <= 0.8),
81 | np.abs(ang) >= 1.0
82 | )
83 | return rewards, done
84 |
--------------------------------------------------------------------------------
/mbpo_pytorch/configs/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import argparse
4 | import munch
5 | import yaml
6 | from yaml import Loader
7 | import collections
8 |
9 | from mbpo_pytorch.misc import logger
10 |
11 |
12 | def flatten(d, parent_key='', sep='.'):
13 | items = []
14 | for k, v in d.items():
15 | new_key = parent_key + sep + k if parent_key else k
16 | if isinstance(v, collections.MutableMapping):
17 | items.extend(flatten(v, new_key, sep=sep).items())
18 | else:
19 | items.append((new_key, str(v)))
20 | return dict(items)
21 |
22 |
23 | def safe_eval(exp: str):
24 | try:
25 | return eval(exp)
26 | except (NameError, SyntaxError):
27 | return exp
28 |
29 |
30 | def deflatten_with_eval(d, sep='.'):
31 |     deflattened_d = {}
32 |     for k, v in d.items():
33 |         node = deflattened_d
34 |         key_seq = k.split(sep)
35 |         for key in key_seq[:-1]:
36 |             try:
37 |                 node = node[key]
38 |             except (TypeError, KeyError):
39 |                 node[key] = {}
40 |                 node = node[key]
41 |         node[key_seq[-1]] = safe_eval(v)
42 |     return deflattened_d
43 |
44 |
45 | class Config:
46 | def __new__(cls, config_paths='config.yaml'):
47 | parser = argparse.ArgumentParser()
48 | parser.add_argument('--configs', nargs='+', default=[])
49 | parser.add_argument('--set', type=str, nargs='*', action='append')
50 |
51 | args, unknown = parser.parse_known_args()
52 | flattened_config_dict = {}
53 | overwritten_config_dict = {}
54 |
55 | if args.configs:
56 | config_paths = args.configs
57 |
58 | if isinstance(config_paths, str):
59 | config_paths = [config_paths]
60 |
61 | for config_path in config_paths:
62 | if not config_path.startswith('/'):
63 | config_path = os.path.join(os.path.dirname(__file__), config_path)
64 | logger.info('Loading configs from {}.'.format(config_path))
65 |
66 | with open(config_path, 'r', encoding='utf-8') as f:
67 | new_config_dict = yaml.load(f, Loader=Loader)
68 | flattened_new_config_dict = flatten(new_config_dict)
69 | overwritten_config_dict.update(
70 | {k: v for k, v in flattened_new_config_dict.items()
71 | if (k in flattened_config_dict.keys() and v != flattened_config_dict[k])})
72 | flattened_config_dict.update(flattened_new_config_dict)
73 |
74 | if args.set:
75 | for instruction in sum(args.set, []):
76 | key, value = instruction.split('=')
77 | flattened_config_dict.update({key: safe_eval(value)})
78 |                 # values set via --set should always be recorded as overwritten
79 | overwritten_config_dict.update({key: safe_eval(value)})
80 |
81 | config_dict = deflatten_with_eval(flattened_config_dict)
82 |
83 | for key, value in overwritten_config_dict.items():
84 |             logger.notice('Hyperparameter {} has been overwritten to {}.'.format(key, value))
85 |
86 | config = munch.munchify(config_dict)
87 | config_dict = flatten(config_dict)
88 | logged_config_dict = {}
89 |
90 | for key, value in config_dict.items():
91 | if key.find('.') >= 0:
92 | logged_config_dict[key] = value
93 | return config, logged_config_dict
94 |
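A hedged sketch (illustrative, not part of the repository) of the flatten/deflatten round trip that Config relies on; the nested dictionary is made up for the example.

# Hedged sketch: dotted-key flattening and its inverse with literal evaluation.
from mbpo_pytorch.configs.config import flatten, deflatten_with_eval

nested = {'sac': {'lr': 3e-4, 'hidden_dims': [256, 256]}, 'seed': 0}

flat = flatten(nested)
# {'sac.lr': '0.0003', 'sac.hidden_dims': '[256, 256]', 'seed': '0'}  (values are stringified)

restored = deflatten_with_eval(flat)
# safe_eval turns the strings back into Python literals:
# {'sac': {'lr': 0.0003, 'hidden_dims': [256, 256]}, 'seed': 0}

The same dotted keys are what --set expects on the command line, e.g. --set sac.lr=0.0001.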
--------------------------------------------------------------------------------
/mbpo_pytorch/models/normalizers.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 |
3 | import torch
4 | import torch.nn as nn
5 | from typing import List
6 |
7 | from mbpo_pytorch.thirdparty.running_mean_std import RunningMeanStd
8 |
9 |
10 | class RunningNormalizer(nn.Module, ABC):
11 | def __init__(self, shape: List[int], eps=1e-8, verbose=0):
12 | super().__init__()
13 |
14 | self.shape = shape
15 | self.verbose = verbose
16 |
17 | self.mean = torch.zeros(shape, dtype=torch.float32)
18 | self.var = torch.ones(shape, dtype=torch.float32)
19 | self.eps = eps
20 | self.count = 1e-4
21 |
22 | def forward(self, x: torch.Tensor, inverse=False):
23 | if inverse:
24 | return x * torch.sqrt(self.var) + self.mean
25 | return (x - self.mean) / torch.sqrt(self.var + self.eps)
26 |
27 | def to(self, *args, **kwargs):
28 | self.mean = self.mean.to(*args, **kwargs)
29 | self.var = self.var.to(*args, **kwargs)
30 |
31 | def update(self, samples: torch.Tensor):
32 | sample_count = samples.shape[0]
33 | sample_mean = samples.mean(dim=0)
34 | sample_var = samples.var(dim=0, unbiased=False)
35 | delta = sample_mean - self.mean
36 | total_count = self.count + sample_count
37 |
38 | new_mean = self.mean + delta * sample_count / total_count
39 | m_a = self.var * self.count
40 | m_b = sample_var * sample_count
41 | m_2 = m_a + m_b + delta * delta * self.count * sample_count / (self.count + sample_count)
42 | new_var = m_2 / (self.count + sample_count)
43 |
44 | new_count = sample_count + self.count
45 |
46 | self.mean = new_mean
47 | self.var = new_var
48 | self.count = new_count
49 |
50 | def state_dict(self, *args, **kwargs):
51 | return {'mean': self.mean, 'var': self.var, 'count': self.count}
52 |
53 | def load_state_dict(self, state_dict, strict=True):
54 | self.mean = state_dict['mean']
55 | self.var = state_dict['var']
56 | self.count = state_dict['count']
57 |
58 | def get_rms(self):
59 | rms = RunningMeanStd(self.shape)
60 | rms.count = self.count
61 | rms.mean = self.mean.cpu().numpy()
62 | rms.var = self.var.cpu().numpy()
63 | return rms
64 |
65 |
66 | class BatchNormalizer(nn.Module, ABC):
67 | def __init__(self, shape: List[int], eps=1e-8, verbose=0):
68 | super().__init__()
69 |
70 | self.shape = shape
71 | self.verbose = verbose
72 |
73 | self.mean = torch.zeros(shape, dtype=torch.float32)
74 | self.std = torch.ones(shape, dtype=torch.float32)
75 | self.eps = eps
76 |
77 | def forward(self, x: torch.Tensor, inverse=False):
78 | if inverse:
79 | return x * self.std + self.mean
80 | return (x - self.mean) / (torch.clamp(self.std, min=self.eps))
81 |
82 | def to(self, *args, **kwargs):
83 | self.mean = self.mean.to(*args, **kwargs)
84 | self.std = self.std.to(*args, **kwargs)
85 |
86 | # noinspection DuplicatedCode
87 | # samples in [batch_size, ...]
88 | def update(self, samples: torch.Tensor):
89 | self.mean = torch.mean(samples, dim=0)
90 | self.std = torch.std(samples, dim=0)
91 |
92 | def state_dict(self, *args, **kwargs):
93 | return {'mean': self.mean, 'std': self.std}
94 |
95 | def load_state_dict(self, state_dict, strict=True):
96 | self.mean = state_dict['mean']
97 | self.std = state_dict['std']
98 |
99 |
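A hedged sketch (illustrative, not part of the repository): RunningNormalizer accumulates mean and variance across batches and then whitens new tensors, or un-whitens them with inverse=True. The synthetic stream below is made up.

# Hedged sketch: streaming normalization with RunningNormalizer.
import torch

from mbpo_pytorch.models.normalizers import RunningNormalizer

normalizer = RunningNormalizer(shape=[3])
for _ in range(10):
    batch = 5.0 + 2.0 * torch.randn(128, 3)  # stream with mean ~5 and std ~2
    normalizer.update(batch)

x = 5.0 + 2.0 * torch.randn(4, 3)
z = normalizer(x)                     # approximately zero-mean, unit-variance
x_back = normalizer(z, inverse=True)  # approximately recovers x (up to the eps term)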
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_humanoid.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco import mujoco_env
3 | from gym import utils
4 |
5 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
6 |
7 |
8 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
9 |
10 | def __init__(self):
11 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5)
12 | utils.EzPickle.__init__(self)
13 |
14 | def _get_obs(self):
15 | data = self.sim.data
16 | return np.concatenate([data.qpos.flat[2:],
17 | data.qvel.flat,
18 | data.cinert.flat,
19 | data.cvel.flat,
20 | data.qfrc_actuator.flat,
21 | data.cfrc_ext.flat])
22 |
23 | def step(self, a):
24 | data = self.sim.data
25 | action = a
26 | if getattr(self, 'action_space', None):
27 | action = np.clip(a, self.action_space.low,
28 | self.action_space.high)
29 |
30 | # reward
31 | alive_bonus = 5.0
32 | lin_vel_cost = 0.25 / 0.015 * data.qvel.flat[0]
33 | quad_ctrl_cost = 0.1 * np.square(action).sum()
34 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum()
35 | quad_impact_cost = min(quad_impact_cost, 10)
36 |
37 | self.do_simulation(action, self.frame_skip)
38 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus
39 | qpos = self.sim.data.qpos
40 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0))
41 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost,
42 | reward_alive=alive_bonus, reward_impact=-quad_impact_cost)
43 |
44 | def reset_model(self):
45 | c = 0.01
46 | self.set_state(
47 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
48 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,)
49 | )
50 | return self._get_obs()
51 |
52 | def viewer_setup(self):
53 | self.viewer.cam.trackbodyid = 1
54 | self.viewer.cam.distance = self.model.stat.extent * 1.0
55 | self.viewer.cam.lookat[2] += .8
56 | self.viewer.cam.elevation = -20
57 |
58 | def cost_np_vec(self, obs, acts, next_obs):
59 | reward_ctrl = -0.1 * np.sum(np.square(acts), axis=1)
60 | reward_run = 0.25 / 0.015 * obs[:, 22]
61 |
62 | quad_impact_cost = .5e-6 * np.square(obs[:, -84:]).sum()
63 | quad_impact_cost = min(quad_impact_cost, 10)
64 |
65 | height = next_obs[:, 0]
66 | done = np.logical_or((height > 2.0), (height < 1.0))
67 | alive_reward = 5 * (1.0 - np.array(done, dtype=np.float))
68 |
69 | reward = reward_run + reward_ctrl + (-quad_impact_cost) + alive_reward
70 | return -reward
71 |
72 | def cost_tf_vec(self, obs, acts, next_obs):
73 | raise NotImplementedError
74 |
75 | def mb_step(self, states, actions, next_states):
76 | # returns rewards and dones
77 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
78 | if getattr(self, 'action_space', None):
79 | actions = np.clip(actions, self.action_space.low,
80 | self.action_space.high)
81 | rewards = - self.cost_np_vec(states, actions, next_states)
82 |
83 | height = next_states[:, 0]
84 | done = np.logical_or((height > 2.0), (height < 1.0))
85 | return rewards, done
86 |
87 |
88 |
--------------------------------------------------------------------------------
/mbpo_pytorch/misc/distributions.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 |
3 | import torch
4 | from torch.distributions import Distribution, Normal
5 | import math
6 |
7 |
8 | class TanhNormal(Distribution, ABC):
9 | """
10 | Represent distribution of X where
11 | Z ~ N(mean, std)
12 | X ~ tanh(Z)
13 | Note: this is not very numerically stable.
14 | """
15 | def __init__(self, mean, std, epsilon=1e-6):
16 | """
17 | :param mean: Mean of the normal distribution
18 | :param std: Std of the normal distribution
19 | :param epsilon: Numerical stability epsilon when computing log-prob.
20 | """
21 | super().__init__()
22 | self.normal_mean = mean
23 | self.normal_std = std
24 | self.normal = Normal(mean, std)
25 | self.epsilon = epsilon
26 |
27 | def log_prob(self, value, pre_tanh_value=None):
28 | if pre_tanh_value is None:
29 | pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2
30 | return self.normal.log_prob(pre_tanh_value) - torch.log(1 - value * value + self.epsilon)
31 |
32 | def log_probs(self, value, pre_tanh_value):
33 | return self.log_prob(value, pre_tanh_value).sum(-1, keepdim=True)
34 |
35 | def sample(self, sample_shape=torch.Size([])):
36 | z = self.normal.sample(sample_shape)
37 | return torch.tanh(z), z
38 |
39 | def rsample(self, sample_shape=torch.Size([]), return_pretanh_value=False):
40 | z = (
41 | self.normal_mean +
42 | self.normal_std *
43 | Normal(
44 | torch.zeros_like(self.normal_mean),
45 | torch.ones_like(self.normal_std)
46 | ).sample()
47 | )
48 | z.requires_grad_()
49 | return torch.tanh(z), z
50 |
51 | def entropy(self):
52 | return self.normal.entropy().sum(-1)
53 |
54 | def mode(self):
55 |         return torch.tanh(self.normal_mean), self.normal_mean
56 |
57 |
58 | class FixedLimitedEntNormal(torch.distributions.Normal, ABC):
59 | def log_probs(self, actions):
60 | return super().log_prob(actions).sum(-1, keepdim=True)
61 |
62 | def entropy(self):
63 | limit = 2.
64 | lo, hi = (-limit - self.loc) / self.scale / math.sqrt(2), (limit - self.loc) / self.scale / math.sqrt(2)
65 | return (0.5 * (self.scale.log() + math.log(2 * math.pi) / 2) * (hi.erf() - lo.erf()) + 0.5 *
66 | (torch.exp(-hi * hi) * hi - torch.exp(-lo * lo) * lo)).sum(-1)
67 |
68 | def mode(self):
69 | return self.mean
70 |
71 |
72 | class FixedCategorical(torch.distributions.Categorical, ABC):
73 | def sample(self, **kwargs):
74 | return super().sample(**kwargs).unsqueeze(-1)
75 |
76 | def log_probs(self, actions):
77 | return (
78 | super()
79 | .log_prob(actions.squeeze(-1))
80 | .view(actions.size(0), -1)
81 | .sum(-1)
82 | .unsqueeze(-1)
83 | )
84 |
85 | def mode(self):
86 | return self.probs.argmax(dim=-1, keepdim=True)
87 |
88 |
89 | class FixedNormal(torch.distributions.Normal, ABC):
90 |
91 | def log_probs(self, actions):
92 | return super().log_prob(actions).sum(-1, keepdim=True)
93 |
94 | def entropy(self):
95 | return super().entropy().sum(-1)
96 |
97 | def mode(self):
98 | return self.mean
99 |
100 |
101 | class FixedBernoulli(torch.distributions.Bernoulli, ABC):
102 |
103 | def log_probs(self, actions):
104 | return super().log_prob(actions).view(actions.size(0), -1).sum(-1, keepdim=True)
105 |
106 | def entropy(self):
107 | return super().entropy().sum(-1)
108 |
109 | def mode(self):
110 | return torch.gt(self.probs, 0.5).float()
111 |
112 |
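A hedged sketch (illustrative, not part of the repository): TanhNormal implements a tanh-squashed Gaussian, and log_probs applies the change-of-variables correction log p(x) = log N(z; mean, std) - log(1 - tanh(z)^2 + eps), summed over the last dimension.

# Hedged sketch: sampling from TanhNormal and evaluating the corrected log-probability.
import torch

from mbpo_pytorch.misc.distributions import TanhNormal

dist = TanhNormal(mean=torch.zeros(2), std=torch.ones(2))
action, pre_tanh = dist.rsample()            # action = tanh(pre_tanh), both of shape [2]
log_prob = dist.log_probs(action, pre_tanh)  # shape [1], summed over the action dimension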
--------------------------------------------------------------------------------
/mbpo_pytorch/misc/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | import random
5 | from typing import List, Optional
6 |
7 | import numpy as np
8 | import torch
9 |
10 | from mbpo_pytorch.envs.wrapped_envs import make_vec_envs, get_vec_normalize
11 | from mbpo_pytorch.misc import logger
12 | from mbpo_pytorch.thirdparty.summary_writer import FixedSummaryWriter as SummaryWriter
13 |
14 |
15 | def log_and_write(writer: Optional[SummaryWriter], log_infos: List, global_step: int):
16 | for idx, (name, value) in enumerate(log_infos):
17 | logger.logkv('{}.'.format(idx) + name.split('/')[-1], value)
18 | if writer and name.find('/') > -1:
19 | writer.add_scalar(name, value, global_step=global_step)
20 | logger.dumpkvs()
21 |
22 |
23 | def evaluate(actor, env_name, seed, num_episode, eval_log_dir,
24 | device, norm_reward=False, norm_obs=True, obs_rms=None):
25 | eval_envs = make_vec_envs(env_name, seed, 1, None, eval_log_dir, device, allow_early_resets=True,
26 | norm_obs=norm_obs, norm_reward=norm_reward)
27 | vec_norm = get_vec_normalize(eval_envs)
28 | if vec_norm is not None and norm_obs:
29 | assert obs_rms is not None
30 | vec_norm.training = False
31 | vec_norm.obs_rms = obs_rms
32 |
33 | eval_episode_rewards = []
34 | eval_episode_lengths = []
35 |
36 | states = eval_envs.reset()
37 | while len(eval_episode_rewards) < num_episode:
38 | with torch.no_grad():
39 | actions = actor.act(states, deterministic=True)['actions']
40 |
41 | states, _, _, infos = eval_envs.step(actions)
42 |
43 | eval_episode_rewards.extend([info['episode']['r'] for info in infos if 'episode' in info])
44 | eval_episode_lengths.extend([info['episode']['l'] for info in infos if 'episode' in info])
45 |
46 | eval_envs.close()
47 |
48 | return eval_episode_rewards, eval_episode_lengths
49 |
50 |
51 | # noinspection PyUnresolvedReferences
52 | def set_seed(seed: int, strict=False):
53 | np.random.seed(seed)
54 | torch.manual_seed(np.random.randint(2 ** 30))
55 | random.seed(np.random.randint(2 ** 30))
56 | try:
57 | torch.cuda.manual_seed_all(np.random.randint(2 ** 30))
58 | if strict:
59 | torch.backends.cudnn.deterministic = True
60 | torch.backends.cudnn.benchmark = False
61 | except AttributeError:
62 | pass
63 |
64 |
65 | def get_seed():
66 | return random.randint(0, 2 ** 32 - 1)
67 |
68 |
69 | def commit_and_save(proj_dir: str, save_dir: Optional[str] = None, auto_save: bool = False):
70 | import shutil
71 | if save_dir and auto_save:
72 | shutil.copytree(proj_dir, save_dir + '/code', ignore=shutil.ignore_patterns('result', 'data', 'ref'))
73 |
74 |
75 | def merge_dicts(dicts, merge_fn):
76 | new_dict = {k: [dic[k] for dic in dicts] for k in dicts[0]}
77 | new_dict = {k: merge_fn(v) for k, v in new_dict.items()}
78 | return new_dict
79 |
80 |
81 | def init_logging(config, hparam_dict):
82 | import datetime
83 | current_time = datetime.datetime.now().strftime('%b%d_%H%M%S')
84 | log_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'log')
85 | eval_log_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'log_eval')
86 | save_dir = os.path.join(config.proj_dir, config.result_dir, current_time, 'save')
87 | os.makedirs(log_dir, exist_ok=True)
88 | os.makedirs(eval_log_dir, exist_ok=True)
89 | os.makedirs(save_dir, exist_ok=True)
90 | writer = SummaryWriter(log_dir=log_dir)
91 | writer.add_hparams(hparam_dict, metric_dict={})
92 |
93 | logger.configure(log_dir, None, config.log_email, config.proj_name)
94 |     logger.info('Hyperparams:')
95 | for key, value in hparam_dict.items():
96 | logger.log('{:35s}: {}'.format(key, value))
97 |
98 | return writer, log_dir, eval_log_dir, save_dir
99 |
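A hedged sketch (illustrative, not part of the repository): merge_dicts lines up values key by key across a list of dictionaries and reduces each list with the supplied function. The statistics below are made up.

# Hedged sketch: aggregating per-episode statistics with merge_dicts.
import numpy as np

from mbpo_pytorch.misc.utils import merge_dicts

episode_stats = [{'reward': 1.0, 'length': 200}, {'reward': 3.0, 'length': 400}]
merged = merge_dicts(episode_stats, np.mean)
# {'reward': 2.0, 'length': 300.0}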
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/pets_reacher.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | from gym import utils
5 | from gym.envs.mujoco import mujoco_env
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class Reacher3DEnv(mujoco_env.MujocoEnv, utils.EzPickle, BaseModelBasedEnv):
11 |
12 | def __init__(self):
13 | self.viewer = None
14 | utils.EzPickle.__init__(self)
15 | dir_path = os.path.dirname(os.path.realpath(__file__))
16 | self.goal = np.zeros(3)
17 | mujoco_env.MujocoEnv.__init__(self, os.path.join(dir_path, 'assets/reacher3d.xml'), 2)
18 |
19 | def step(self, a):
20 | self.do_simulation(a, self.frame_skip)
21 | ob = self._get_obs()
22 | reward = -np.sum(np.square(self.get_EE_pos(ob[None]) - self.goal))
23 | reward -= 0.01 * np.square(a).sum()
24 | done = False
25 | return ob, reward, done, dict(reward_dist=0, reward_ctrl=0)
26 |
27 | def viewer_setup(self):
28 | self.viewer.cam.trackbodyid = 1
29 | self.viewer.cam.distance = 2.5
30 | self.viewer.cam.elevation = -30
31 | self.viewer.cam.azimuth = 270
32 |
33 | def reset_model(self):
34 | qpos, qvel = np.copy(self.init_qpos), np.copy(self.init_qvel)
35 | qpos[-3:] += np.random.normal(loc=0, scale=0.1, size=[3])
36 | qvel[-3:] = 0
37 | self.goal = qpos[-3:]
38 | self.set_state(qpos, qvel)
39 | return self._get_obs()
40 |
41 | def _get_obs(self):
42 | raw_obs = np.concatenate([
43 | self.sim.data.qpos.flat, self.sim.data.qvel.flat[:-3],
44 | ])
45 |
46 | EE_pos = np.reshape(self.get_EE_pos(raw_obs[None]), [-1])
47 |
48 | return np.concatenate([raw_obs, EE_pos])
49 |
50 | def get_EE_pos(self, states):
51 | theta1, theta2, theta3, theta4, theta5, theta6, theta7 = \
52 | states[:, :1], states[:, 1:2], states[:, 2:3], states[:, 3:4], states[:, 4:5], states[:, 5:6], states[:, 6:]
53 |
54 | rot_axis = np.concatenate([np.cos(theta2) * np.cos(theta1), np.cos(theta2) * np.sin(theta1), -np.sin(theta2)],
55 | axis=1)
56 | rot_perp_axis = np.concatenate([-np.sin(theta1), np.cos(theta1), np.zeros(theta1.shape)], axis=1)
57 | cur_end = np.concatenate([
58 | 0.1 * np.cos(theta1) + 0.4 * np.cos(theta1) * np.cos(theta2),
59 | 0.1 * np.sin(theta1) + 0.4 * np.sin(theta1) * np.cos(theta2) - 0.188,
60 | -0.4 * np.sin(theta2)
61 | ], axis=1)
62 |
63 | for length, hinge, roll in [(0.321, theta4, theta3), (0.16828, theta6, theta5)]:
64 | perp_all_axis = np.cross(rot_axis, rot_perp_axis)
65 | x = np.cos(hinge) * rot_axis
66 | y = np.sin(hinge) * np.sin(roll) * rot_perp_axis
67 | z = -np.sin(hinge) * np.cos(roll) * perp_all_axis
68 | new_rot_axis = x + y + z
69 | new_rot_perp_axis = np.cross(new_rot_axis, rot_axis)
70 | new_rot_perp_axis[np.linalg.norm(new_rot_perp_axis, axis=1) < 1e-30] = \
71 | rot_perp_axis[np.linalg.norm(new_rot_perp_axis, axis=1) < 1e-30]
72 | new_rot_perp_axis /= np.linalg.norm(new_rot_perp_axis, axis=1, keepdims=True)
73 | rot_axis, rot_perp_axis, cur_end = new_rot_axis, new_rot_perp_axis, cur_end + length * new_rot_axis
74 |
75 | return cur_end
76 |
77 | def cost_np_vec(self, obs, acts, next_obs):
78 | """
79 | def obs_cost_fn(self, obs):
80 | self.ENV.goal = obs[:, 7: 10]
81 | ee_pos = obs[:, -3:]
82 | return np.sum(np.square(ee_pos - self.ENV.goal), axis=1)
83 |
84 | @staticmethod
85 | def ac_cost_fn(acs):
86 | return 0.01 * np.sum(np.square(acs), axis=1)
87 | """
88 | reward_ctrl = -0.01 * np.sum(np.square(acts), axis=1)
89 | goal = obs[:, 7: 10]
90 | ee_pos = obs[:, -3:]
91 |
92 | reward = -np.sum(np.square(ee_pos - goal), axis=1) + reward_ctrl
93 | return -reward
94 |
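For reference, the cost implemented in cost_np_vec above (and negated to obtain the reward in step) is, writing EE(s) for the end-effector position appended to the observation, g for the goal stored at obs[:, 7:10], and a for the action:

$$c(s, a) = \lVert \mathrm{EE}(s) - g \rVert_2^2 + 0.01\,\lVert a \rVert_2^2, \qquad r(s, a) = -c(s, a).$$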
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/assets/walker2d.xml:
--------------------------------------------------------------------------------
(MuJoCo XML model markup not preserved in this listing.)
--------------------------------------------------------------------------------
/mbpo_pytorch/models/actor_layer.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 | from mbpo_pytorch.misc.distributions import FixedNormal, FixedCategorical, FixedBernoulli, \
7 | TanhNormal, FixedLimitedEntNormal
8 | from .utils import init
9 |
10 |
11 | class CategoricalActorLayer(nn.Module, ABC):
12 | def __init__(self, num_inputs, num_outputs):
13 | super(CategoricalActorLayer, self).__init__()
14 |
15 | self.logit_actor = nn.Linear(num_inputs, num_outputs)
16 | init(self.logit_actor, lambda x: nn.init.orthogonal_(x, 0.01), lambda x: nn.init.constant_(x, 0))
17 |
18 | def forward(self, states):
19 | logits = self.logit_actor(states)
20 | return FixedCategorical(logits=logits)
21 |
22 |
23 | class GaussianActorLayer(nn.Module, ABC):
24 | def __init__(self, num_inputs, num_outputs, use_state_dependent_std):
25 | super(GaussianActorLayer, self).__init__()
26 |
27 | self.actor_mean = nn.Linear(num_inputs, num_outputs)
28 | init(self.actor_mean, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))
29 | self.use_state_dependent_std = use_state_dependent_std
30 | if self.use_state_dependent_std:
31 | self.actor_logstd = nn.Linear(num_inputs, num_outputs)
32 | init(self.actor_logstd, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))
33 |
34 | else:
35 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True)
36 |
37 | def forward(self, x):
38 | action_mean = self.actor_mean(x)
39 |
40 | if self.use_state_dependent_std:
41 | logstd = self.actor_logstd(x)
42 | else:
43 | logstd = self.logstd
44 |
45 | return FixedNormal(action_mean, logstd.exp()), action_mean, logstd
46 |
47 |
48 | class LimitedEntGaussianActorLayer(nn.Module, ABC):
49 | def __init__(self, num_inputs, num_outputs, use_state_dependent_std):
50 | super(LimitedEntGaussianActorLayer, self).__init__()
51 |
52 | self.mean_actor = nn.Linear(num_inputs, num_outputs)
53 | init(self.mean_actor, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))
54 | self.use_state_dependent_std = use_state_dependent_std
55 | if self.use_state_dependent_std:
56 | self.logstd_actor = nn.Linear(num_inputs, num_outputs)
57 | init(self.logstd_actor, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))
58 |
59 | else:
60 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True)
61 | self.logstd_actor = lambda _: self.logstd
62 |
63 | def forward(self, x):
64 | action_mean = self.mean_actor(x)
65 | logstd = self.logstd_actor(x)
66 | return FixedLimitedEntNormal(action_mean, logstd.exp()), action_mean, logstd
67 |
68 |
69 | class BernoulliActorLayer(nn.Module, ABC):
70 | def __init__(self, num_inputs, num_outputs):
71 | super(BernoulliActorLayer, self).__init__()
72 |
73 | self.logit_actor = nn.Linear(num_inputs, num_outputs)
74 |         init(self.logit_actor, nn.init.orthogonal_, lambda x: nn.init.constant_(x, 0))
75 |
76 | def forward(self, states):
77 | logits = self.logit_actor(states)
78 | return FixedBernoulli(logits=logits)
79 |
80 |
81 | class TanhGaussainActorLayer(nn.Module, ABC):
82 | def __init__(self, num_inputs, num_outputs, use_state_dependent_std, init_w=1e-3):
83 | super(TanhGaussainActorLayer, self).__init__()
84 |
85 | self.mean_actor = nn.Linear(num_inputs, num_outputs)
86 | init(self.mean_actor, lambda x: nn.init.uniform_(x, -init_w, init_w),
87 | lambda x: nn.init.uniform_(x, -init_w, init_w))
88 |
89 | self.state_dependent_std = use_state_dependent_std
90 | if self.state_dependent_std:
91 | self.logstd_actor = nn.Linear(num_inputs, num_outputs)
92 | init(self.logstd_actor, lambda x: nn.init.uniform_(x, -init_w, init_w),
93 | lambda x: nn.init.uniform_(x, -init_w, init_w))
94 | else:
95 | self.logstd = nn.Parameter(torch.zeros(num_outputs), requires_grad=True)
96 | self.logstd_actor = lambda _: self.logstd
97 |
98 | def forward(self, states):
99 | action_means = self.mean_actor(states)
100 | action_logstds = self.logstd_actor(states)
101 | action_logstds = torch.clamp(action_logstds, -20, 2)
102 |
103 | return TanhNormal(action_means, action_logstds.exp()), torch.tanh(action_means), action_logstds
104 |
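A minimal sketch of how the Tanh-Gaussian head above is typically queried (editorial illustration with placeholder shapes; it assumes the TanhNormal distribution exposes a reparameterized rsample(), which is an assumption not shown in this file):

layer = TanhGaussainActorLayer(num_inputs=64, num_outputs=6, use_state_dependent_std=True)
dist, deterministic_action, logstd = layer(torch.randn(32, 64))
action = dist.rsample()  # assumed API: squashed sample in (-1, 1), shape [32, 6]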
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/assets/ant.xml:
--------------------------------------------------------------------------------
(MuJoCo XML model markup not preserved in this listing.)
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/wrapped_envs.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import os
3 | from typing import Optional, TYPE_CHECKING
4 |
5 | from gym.wrappers import TimeLimit
6 | import torch
7 |
8 | from .virtual_env import VecVirtualEnv
9 | from .benchmarking_envs.benchmarking_envs import make_benchmarking_env
10 | from mbpo_pytorch.thirdparty.base_vec_env import VecEnvWrapper
11 | from mbpo_pytorch.thirdparty.dummy_vec_env import DummyVecEnv
12 | from mbpo_pytorch.thirdparty.subproc_vec_env import SubprocVecEnv
13 | from mbpo_pytorch.thirdparty.vec_normalize import VecNormalize
14 | from mbpo_pytorch.thirdparty.monitor import Monitor
15 |
16 | if TYPE_CHECKING:
17 | from mbpo_pytorch.models.dynamics import BaseDynamics
18 |
19 |
20 | def make_env(env_id, seed, rank, log_dir, allow_early_resets, max_episode_steps):
21 | def _thunk():
22 | env = make_benchmarking_env(env_id)
23 | env = TimeLimit(env, max_episode_steps)
24 |
25 | env.seed(seed + rank)
26 | log_dir_ = os.path.join(log_dir, str(rank)) if log_dir is not None else log_dir
27 | env = Monitor(env, log_dir_, allow_early_resets=allow_early_resets)
28 |
29 | return env
30 |
31 | return _thunk
32 |
33 |
34 | def make_vec_envs(env_name: str,
35 | seed: int,
36 | num_envs: int,
37 | gamma: Optional[float],
38 | log_dir: Optional[str],
39 | device: torch.device,
40 | allow_early_resets: bool,
41 | max_episode_steps: int = 1000,
42 | norm_reward=True,
43 | norm_obs=True,
44 | ):
45 | envs = [
46 | make_env(env_name, seed, i, log_dir, allow_early_resets, max_episode_steps)
47 | for i in range(num_envs)
48 | ]
49 |
50 | if len(envs) > 1:
51 | envs = SubprocVecEnv(envs)
52 | else:
53 | envs = DummyVecEnv(envs)
54 |
55 | if len(envs.observation_space.shape) == 1:
56 | if gamma is None:
57 | envs = VecNormalize(envs, norm_reward=False, norm_obs=norm_obs)
58 | else:
59 | envs = VecNormalize(envs, gamma=gamma, norm_reward=norm_reward, norm_obs=norm_obs)
60 |
61 | envs = VecPyTorch(envs, device)
62 |
63 | return envs
64 |
65 |
66 | def make_vec_virtual_envs(env_name: str,
67 | dynamics: BaseDynamics,
68 | seed: int,
69 | num_envs: int,
70 | gamma: Optional[float],
71 | device: torch.device,
72 | norm_reward=False,
73 | norm_obs=False,
74 | **kwargs
75 | ):
76 | envs = VecVirtualEnv(dynamics, make_benchmarking_env(env_name), num_envs, seed, **kwargs)
77 |
78 | if (len(envs.observation_space.shape) == 1 and norm_obs) or norm_reward:
79 | if gamma is None:
80 | envs = VecNormalize(envs, norm_reward=False, norm_obs=norm_obs)
81 | else:
82 | envs = VecNormalize(envs, gamma=gamma, norm_reward=norm_reward, norm_obs=norm_obs)
83 |
84 | envs = VecPyTorch(envs, device)
85 |
86 | return envs
87 |
88 |
89 | class VecPyTorch(VecEnvWrapper):
90 | def __init__(self, venv, device):
91 | super(VecPyTorch, self).__init__(venv)
92 | self.device = device
93 |
94 | def reset(self):
95 | obs = self.venv.reset()
96 | obs = torch.from_numpy(obs).float().to(self.device)
97 | return obs
98 |
99 | def step_with_states(self, states: torch.Tensor, actions: torch.Tensor):
100 | if isinstance(actions, torch.LongTensor):
101 | actions = actions.squeeze(1)
102 | return self.venv.step_with_states(states, actions)
103 |
104 | def step_async(self, actions: torch.Tensor):
105 | if isinstance(actions, torch.LongTensor):
106 | actions = actions.squeeze(1)
107 | actions = actions.cpu().numpy()
108 | self.venv.step_async(actions)
109 |
110 | def step_wait(self):
111 | obs, reward, done, info = self.venv.step_wait()
112 | obs = torch.from_numpy(obs).float().to(self.device)
113 | reward = torch.from_numpy(reward).unsqueeze(dim=1).float()
114 | return obs, reward, done, info
115 |
116 |     def env_method(self, method_name, *method_args, indices=None, **method_kwargs):
117 |         # Convert tensor arguments to CPU numpy arrays, keep all other arguments unchanged,
118 |         # and forward `indices` as a keyword so it is not swallowed by *method_args.
119 |         new_method_args = [arg.cpu().numpy() if isinstance(arg, torch.Tensor) else arg
120 |                            for arg in method_args]
121 |         new_method_kwargs = {k: v.cpu().numpy() if isinstance(v, torch.Tensor) else v
122 |                              for k, v in method_kwargs.items()}
123 |         return self.venv.env_method(method_name, *new_method_args, indices=indices,
124 |                                     **new_method_kwargs)
125 | 
126 |
127 |
128 | def get_vec_normalize(venv):
129 | if isinstance(venv, VecNormalize):
130 | return venv
131 | elif hasattr(venv, 'venv'):
132 | return get_vec_normalize(venv.venv)
133 |
134 | return None
135 |
136 |
137 |
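A minimal sketch of building normalized, tensor-returning training environments with make_vec_envs (editorial illustration; the environment id, seed, and device below are placeholders, not repo defaults):

envs = make_vec_envs('HalfCheetah-v2', seed=0, num_envs=4, gamma=0.99,
                     log_dir=None, device=torch.device('cpu'),
                     allow_early_resets=True)
obs = envs.reset()  # FloatTensor of shape [4, obs_dim] on the chosen device
envs.close()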
--------------------------------------------------------------------------------
/mbpo_pytorch/thirdparty/dummy_vec_env.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | import numpy as np
3 | from typing import Sequence
4 |
5 | from mbpo_pytorch.thirdparty.base_vec_env import VecEnv
6 | from mbpo_pytorch.thirdparty.util import copy_obs_dict, dict_to_obs, obs_space_info
7 |
8 |
9 | class DummyVecEnv(VecEnv):
10 | """
11 |     Creates a simple vectorized wrapper for multiple environments, calling each environment in sequence on the
12 |     current Python process. This is useful for computationally simple environments such as ``CartPole-v1``, where
13 |     the overhead of multiprocessing or multithreading outweighs the environment computation time. It can also be
14 |     used for RL methods that require a vectorized environment but should train with a single environment.
15 |
16 | :param env_fns: ([callable]) A list of functions that will create the environments
17 | (each callable returns a `Gym.Env` instance when called).
18 | """
19 |
20 | def __init__(self, env_fns):
21 | self.envs = [fn() for fn in env_fns]
22 | env = self.envs[0]
23 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
24 | obs_space = env.observation_space
25 | self.keys, shapes, dtypes = obs_space_info(obs_space)
26 |
27 | self.buf_obs = OrderedDict([
28 | (k, np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]))
29 | for k in self.keys])
30 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
31 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
32 | self.buf_infos = [{} for _ in range(self.num_envs)]
33 | self.actions = None
34 | self.metadata = env.metadata
35 |
36 | def step_async(self, actions):
37 | self.actions = actions
38 |
39 | def step_wait(self):
40 | for env_idx in range(self.num_envs):
41 | obs, self.buf_rews[env_idx], self.buf_dones[env_idx], self.buf_infos[env_idx] =\
42 | self.envs[env_idx].step(self.actions[env_idx])
43 | if self.buf_dones[env_idx]:
44 | # save final observation where user can get it, then reset
45 | self.buf_infos[env_idx]['terminal_observation'] = obs
46 | obs = self.envs[env_idx].reset()
47 | self._save_obs(env_idx, obs)
48 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones),
49 | self.buf_infos.copy())
50 |
51 | def seed(self, seed=None):
52 | seeds = list()
53 | for idx, env in enumerate(self.envs):
54 | seeds.append(env.seed(seed + idx))
55 | return seeds
56 |
57 | def reset(self):
58 | for env_idx in range(self.num_envs):
59 | obs = self.envs[env_idx].reset()
60 | self._save_obs(env_idx, obs)
61 | return self._obs_from_buf()
62 |
63 | def close(self):
64 | for env in self.envs:
65 | env.close()
66 |
67 | def get_images(self, *args, **kwargs) -> Sequence[np.ndarray]:
68 | return [env.render(*args, mode='rgb_array', **kwargs) for env in self.envs]
69 |
70 | def render(self, *args, **kwargs):
71 | """
72 | Gym environment rendering. If there are multiple environments then
73 | they are tiled together in one image via `BaseVecEnv.render()`.
74 | Otherwise (if `self.num_envs == 1`), we pass the render call directly to the
75 | underlying environment.
76 |
77 | Therefore, some arguments such as `mode` will have values that are valid
78 | only when `num_envs == 1`.
79 |
80 | :param mode: The rendering type.
81 | """
82 | if self.num_envs == 1:
83 | return self.envs[0].render(*args, **kwargs)
84 | else:
85 | return super().render(*args, **kwargs)
86 |
87 | def _save_obs(self, env_idx, obs):
88 | for key in self.keys:
89 | if key is None:
90 | self.buf_obs[key][env_idx] = obs
91 | else:
92 | self.buf_obs[key][env_idx] = obs[key]
93 |
94 | def _obs_from_buf(self):
95 | return dict_to_obs(self.observation_space, copy_obs_dict(self.buf_obs))
96 |
97 | def get_attr(self, attr_name, indices=None):
98 | """Return attribute from vectorized environment (see base class)."""
99 | target_envs = self._get_target_envs(indices)
100 | return [getattr(env_i, attr_name) for env_i in target_envs]
101 |
102 | def set_attr(self, attr_name, value, indices=None):
103 | """Set attribute inside vectorized environments (see base class)."""
104 | target_envs = self._get_target_envs(indices)
105 | for env_i in target_envs:
106 | setattr(env_i, attr_name, value)
107 |
108 | def env_method(self, method_name, *method_args, indices=None, **method_kwargs):
109 | """Call instance methods of vectorized environments."""
110 | target_envs = self._get_target_envs(indices)
111 | return [getattr(env_i, method_name)(*method_args, **method_kwargs) for env_i in target_envs]
112 |
113 | def _get_target_envs(self, indices):
114 | indices = self._get_indices(indices)
115 | return [self.envs[i] for i in indices]
116 |
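A minimal usage sketch (editorial illustration; gym.make('CartPole-v1') stands in for the factories this repo builds via make_env):

import gym
import numpy as np

venv = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(2)])
obs = venv.reset()  # array of shape (2, 4)
actions = np.stack([venv.action_space.sample() for _ in range(venv.num_envs)])
obs, rewards, dones, infos = venv.step(actions)  # each env is stepped in sequence
venv.close()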
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/pendulum.py:
--------------------------------------------------------------------------------
1 | from os import path
2 |
3 | from gym import spaces
4 | from gym.utils import seeding
5 | import numpy as np
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class PendulumEnv(BaseModelBasedEnv):
11 | metadata = {
12 | 'render.modes': ['human', 'rgb_array'],
13 | 'video.frames_per_second': 30
14 | }
15 |
16 | def __init__(self):
17 | self.max_speed = 8
18 | self.max_torque = 2.
19 | self.dt = .05
20 | self.viewer = None
21 |
22 | high = np.array([1., 1., self.max_speed])
23 | self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,))
24 | self.observation_space = spaces.Box(low=-high, high=high)
25 |
26 | self._seed()
27 |
28 | def _seed(self, seed=None):
29 | self.np_random, seed = seeding.np_random(seed)
30 | return [seed]
31 |
32 | def step(self, u):
33 | th, thdot = self.state # th := theta
34 | '''
35 | theta, thetadot = self.state
36 | return np.array([np.cos(theta), np.sin(theta), thetadot])
37 | '''
38 |
39 | # for the reward
40 | y, x, thetadot = np.cos(th), np.sin(th), thdot
41 | u = np.clip(u, -self.max_torque, self.max_torque)[0]
42 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2)
43 | reward = -costs
44 |
45 | g = 10.
46 | m = 1.
47 | l = 1.
48 | dt = self.dt
49 |
50 | self.last_u = u # for rendering
51 | # costs = angle_normalize(th) ** 2 + .1 * thdot ** 2 + .001 * (u ** 2)
52 |
53 | newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l ** 2) * u) * dt
54 | newth = th + newthdot * dt
55 | newthdot = np.clip(newthdot, -self.max_speed, self.max_speed) # pylint: disable=E1111
56 |
57 | self.state = np.array([newth, newthdot])
58 | return self._get_obs(), reward, False, {}
59 |
60 | def _reset(self):
61 | high = np.array([np.pi, 1])
62 | self.state = self.np_random.uniform(low=-high, high=high)
63 | self.last_u = None
64 | return self._get_obs()
65 |
66 | def _get_obs(self):
67 | theta, thetadot = self.state
68 | return np.array([np.cos(theta), np.sin(theta), thetadot])
69 |
70 | def render(self, mode='human', close=False):
71 | if close:
72 | if self.viewer is not None:
73 | self.viewer.close()
74 | self.viewer = None
75 | return
76 |
77 | if self.viewer is None:
78 | from gym.envs.classic_control import rendering
79 | self.viewer = rendering.Viewer(500, 500)
80 | self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)
81 | rod = rendering.make_capsule(1, .2)
82 | rod.set_color(.8, .3, .3)
83 | self.pole_transform = rendering.Transform()
84 | rod.add_attr(self.pole_transform)
85 | self.viewer.add_geom(rod)
86 | axle = rendering.make_circle(.05)
87 | axle.set_color(0, 0, 0)
88 | self.viewer.add_geom(axle)
89 | fname = path.join(path.dirname(__file__), "assets/clockwise.png")
90 | self.img = rendering.Image(fname, 1., 1.)
91 | self.imgtrans = rendering.Transform()
92 | self.img.add_attr(self.imgtrans)
93 |
94 | self.viewer.add_onetime(self.img)
95 | self.pole_transform.set_rotation(self.state[0] + np.pi / 2)
96 | if self.last_u:
97 | self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2)
98 |
99 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array'))
100 |
101 | def mb_step(self, states, actions, next_states):
102 | # returns rewards and dones
103 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
104 | if getattr(self, 'action_space', None):
105 | actions = np.clip(actions, self.action_space.low,
106 | self.action_space.high)
107 | rewards = - self.cost_np_vec(states, actions, next_states)
108 | return rewards, np.zeros_like(rewards, dtype=np.bool)
109 |
110 | def cost_np_vec(self, obs, acts, next_obs):
111 | """
112 | dist_vec = obs[:, -3:]
113 | reward_dist = - np.linalg.norm(dist_vec, axis=1)
114 | reward_ctrl = - np.sum(np.square(acts), axis=1)
115 | reward = reward_dist + reward_ctrl
116 |
117 | # for the reward
118 | y, x, thetadot = np.cos(th), np.sin(th), thdot
119 | u = np.clip(u, -self.max_torque, self.max_torque)[0]
120 | costs = y + .1 * x + .1 * (thetadot ** 2) + .001 * (u ** 2)
121 | reward = -costs
122 |
123 | def _get_obs(self):
124 | theta, thetadot = self.state
125 | return np.array([np.cos(theta), np.sin(theta), thetadot])
126 |
127 | """
128 | y, x, thetadot = obs[:, 0], obs[:, 1], obs[:, 2]
129 | u = np.clip(acts[:, 0], -self.max_torque, self.max_torque)
130 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2)
131 | return costs
132 |
133 | def angle_normalize(x):
134 | return (((x + np.pi) % (2 * np.pi)) - np.pi)
135 |
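For reference, step above applies one explicit Euler update with g = 10, m = l = 1, dt = 0.05, and the torque u clipped to [-2, 2]:

$$\dot\theta' = \dot\theta + \Big(-\tfrac{3g}{2l}\,\sin(\theta + \pi) + \tfrac{3}{m l^{2}}\,u\Big)\,dt, \qquad \theta' = \theta + \dot\theta'\,dt, \qquad \dot\theta' \leftarrow \mathrm{clip}(\dot\theta', -8, 8),$$

and the per-step cost returned by cost_np_vec (the reward is its negation) is

$$c = \cos\theta + 0.1\,\lvert\sin\theta\rvert + 0.1\,\dot\theta^{2} + 0.001\,u^{2}.$$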
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_pendulumO01.py:
--------------------------------------------------------------------------------
1 | from os import path
2 |
3 | import gym
4 | from gym import spaces
5 | from gym.utils import seeding
6 | import numpy as np
7 |
8 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
9 |
10 |
11 | class PendulumEnv(BaseModelBasedEnv):
12 | metadata = {
13 | 'render.modes': ['human', 'rgb_array'],
14 | 'video.frames_per_second': 30
15 | }
16 |
17 | def __init__(self):
18 | self.max_speed = 8
19 | self.max_torque = 2.
20 | self.dt = .05
21 | self.viewer = None
22 |
23 | high = np.array([1., 1., self.max_speed])
24 | self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,))
25 | self.observation_space = spaces.Box(low=-high, high=high)
26 |
27 | self._seed()
28 |
29 | def _seed(self, seed=None):
30 | self.np_random, seed = seeding.np_random(seed)
31 | return [seed]
32 |
33 | def step(self, u):
34 | th, thdot = self.state # th := theta
35 | '''
36 | theta, thetadot = self.state
37 | return np.array([np.cos(theta), np.sin(theta), thetadot])
38 | '''
39 |
40 | # for the reward
41 | y, x, thetadot = np.cos(th), np.sin(th), thdot
42 | u = np.clip(u, -self.max_torque, self.max_torque)[0]
43 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2)
44 | reward = -costs
45 |
46 | g = 10.
47 | m = 1.
48 | l = 1.
49 | dt = self.dt
50 |
51 | self.last_u = u # for rendering
52 | # costs = angle_normalize(th) ** 2 + .1 * thdot ** 2 + .001 * (u ** 2)
53 |
54 | newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l ** 2) * u) * dt
55 | newth = th + newthdot * dt
56 | newthdot = np.clip(newthdot, -self.max_speed, self.max_speed) # pylint: disable=E1111
57 |
58 | self.state = np.array([newth, newthdot])
59 | ob = self._get_obs()
60 | ob += np.random.uniform(low=-0.1, high=0.1, size=ob.shape)
61 | return ob, reward, False, {}
62 |
63 | def _reset(self):
64 | high = np.array([np.pi, 1])
65 | self.state = self.np_random.uniform(low=-high, high=high)
66 | self.last_u = None
67 | return self._get_obs()
68 |
69 | def _get_obs(self):
70 | theta, thetadot = self.state
71 | return np.array([np.cos(theta), np.sin(theta), thetadot])
72 |
73 | def render(self, mode='human', close=False):
74 | if close:
75 | if self.viewer is not None:
76 | self.viewer.close()
77 | self.viewer = None
78 | return
79 |
80 | if self.viewer is None:
81 | from gym.envs.classic_control import rendering
82 | self.viewer = rendering.Viewer(500, 500)
83 | self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)
84 | rod = rendering.make_capsule(1, .2)
85 | rod.set_color(.8, .3, .3)
86 | self.pole_transform = rendering.Transform()
87 | rod.add_attr(self.pole_transform)
88 | self.viewer.add_geom(rod)
89 | axle = rendering.make_circle(.05)
90 | axle.set_color(0, 0, 0)
91 | self.viewer.add_geom(axle)
92 | fname = path.join(path.dirname(__file__), "assets/clockwise.png")
93 | self.img = rendering.Image(fname, 1., 1.)
94 | self.imgtrans = rendering.Transform()
95 | self.img.add_attr(self.imgtrans)
96 |
97 | self.viewer.add_onetime(self.img)
98 | self.pole_transform.set_rotation(self.state[0] + np.pi / 2)
99 | if self.last_u:
100 | self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2)
101 |
102 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array'))
103 |
104 | def cost_np_vec(self, obs, acts, next_obs):
105 | """
106 | dist_vec = obs[:, -3:]
107 | reward_dist = - np.linalg.norm(dist_vec, axis=1)
108 | reward_ctrl = - np.sum(np.square(acts), axis=1)
109 | reward = reward_dist + reward_ctrl
110 |
111 | # for the reward
112 | y, x, thetadot = np.cos(th), np.sin(th), thdot
113 | u = np.clip(u, -self.max_torque, self.max_torque)[0]
114 | costs = y + .1 * x + .1 * (thetadot ** 2) + .001 * (u ** 2)
115 | reward = -costs
116 |
117 | def _get_obs(self):
118 | theta, thetadot = self.state
119 | return np.array([np.cos(theta), np.sin(theta), thetadot])
120 |
121 | """
122 | y, x, thetadot = obs[:, 0], obs[:, 1], obs[:, 2]
123 | u = np.clip(acts[:, 0], -self.max_torque, self.max_torque)
124 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2)
125 | return costs
126 |
127 | def mb_step(self, states, actions, next_states):
128 | # returns rewards and dones
129 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
130 | if getattr(self, 'action_space', None):
131 | actions = np.clip(actions, self.action_space.low,
132 | self.action_space.high)
133 | rewards = - self.cost_np_vec(states, actions, next_states)
134 | return rewards, np.zeros_like(rewards, dtype=np.bool)
135 |
136 | def angle_normalize(x):
137 | return (((x + np.pi) % (2 * np.pi)) - np.pi)
138 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_pendulumO001.py:
--------------------------------------------------------------------------------
1 | from os import path
2 |
3 | import gym
4 | from gym import spaces
5 | from gym.utils import seeding
6 | import numpy as np
7 |
8 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
9 |
10 |
11 | class PendulumEnv(BaseModelBasedEnv):
12 | metadata = {
13 | 'render.modes': ['human', 'rgb_array'],
14 | 'video.frames_per_second': 30
15 | }
16 |
17 | def __init__(self):
18 | self.max_speed = 8
19 | self.max_torque = 2.
20 | self.dt = .05
21 | self.viewer = None
22 |
23 | high = np.array([1., 1., self.max_speed])
24 | self.action_space = spaces.Box(low=-self.max_torque, high=self.max_torque, shape=(1,))
25 | self.observation_space = spaces.Box(low=-high, high=high)
26 |
27 | self._seed()
28 |
29 | def _seed(self, seed=None):
30 | self.np_random, seed = seeding.np_random(seed)
31 | return [seed]
32 |
33 | def step(self, u):
34 | th, thdot = self.state # th := theta
35 | '''
36 | theta, thetadot = self.state
37 | return np.array([np.cos(theta), np.sin(theta), thetadot])
38 | '''
39 |
40 | # for the reward
41 | y, x, thetadot = np.cos(th), np.sin(th), thdot
42 | u = np.clip(u, -self.max_torque, self.max_torque)[0]
43 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2)
44 | reward = -costs
45 |
46 | g = 10.
47 | m = 1.
48 | l = 1.
49 | dt = self.dt
50 |
51 | self.last_u = u # for rendering
52 | # costs = angle_normalize(th) ** 2 + .1 * thdot ** 2 + .001 * (u ** 2)
53 |
54 | newthdot = thdot + (-3 * g / (2 * l) * np.sin(th + np.pi) + 3. / (m * l ** 2) * u) * dt
55 | newth = th + newthdot * dt
56 | newthdot = np.clip(newthdot, -self.max_speed, self.max_speed) # pylint: disable=E1111
57 |
58 | self.state = np.array([newth, newthdot])
59 | ob = self._get_obs()
60 | ob += np.random.uniform(low=-0.01, high=0.01, size=ob.shape)
61 | return ob, reward, False, {}
62 |
63 | def _reset(self):
64 | high = np.array([np.pi, 1])
65 | self.state = self.np_random.uniform(low=-high, high=high)
66 | self.last_u = None
67 | return self._get_obs()
68 |
69 | def _get_obs(self):
70 | theta, thetadot = self.state
71 | return np.array([np.cos(theta), np.sin(theta), thetadot])
72 |
73 | def render(self, mode='human', close=False):
74 | if close:
75 | if self.viewer is not None:
76 | self.viewer.close()
77 | self.viewer = None
78 | return
79 |
80 | if self.viewer is None:
81 | from gym.envs.classic_control import rendering
82 | self.viewer = rendering.Viewer(500, 500)
83 | self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)
84 | rod = rendering.make_capsule(1, .2)
85 | rod.set_color(.8, .3, .3)
86 | self.pole_transform = rendering.Transform()
87 | rod.add_attr(self.pole_transform)
88 | self.viewer.add_geom(rod)
89 | axle = rendering.make_circle(.05)
90 | axle.set_color(0, 0, 0)
91 | self.viewer.add_geom(axle)
92 | fname = path.join(path.dirname(__file__), "assets/clockwise.png")
93 | self.img = rendering.Image(fname, 1., 1.)
94 | self.imgtrans = rendering.Transform()
95 | self.img.add_attr(self.imgtrans)
96 |
97 | self.viewer.add_onetime(self.img)
98 | self.pole_transform.set_rotation(self.state[0] + np.pi / 2)
99 | if self.last_u:
100 | self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2)
101 |
102 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array'))
103 |
104 | def cost_np_vec(self, obs, acts, next_obs):
105 | """
106 | dist_vec = obs[:, -3:]
107 | reward_dist = - np.linalg.norm(dist_vec, axis=1)
108 | reward_ctrl = - np.sum(np.square(acts), axis=1)
109 | reward = reward_dist + reward_ctrl
110 |
111 | # for the reward
112 | y, x, thetadot = np.cos(th), np.sin(th), thdot
113 | u = np.clip(u, -self.max_torque, self.max_torque)[0]
114 | costs = y + .1 * x + .1 * (thetadot ** 2) + .001 * (u ** 2)
115 | reward = -costs
116 |
117 | def _get_obs(self):
118 | theta, thetadot = self.state
119 | return np.array([np.cos(theta), np.sin(theta), thetadot])
120 |
121 | """
122 | y, x, thetadot = obs[:, 0], obs[:, 1], obs[:, 2]
123 | u = np.clip(acts[:, 0], -self.max_torque, self.max_torque)
124 | costs = y + .1 * np.abs(x) + .1 * (thetadot ** 2) + .001 * (u ** 2)
125 | return costs
126 |
127 | def mb_step(self, states, actions, next_states):
128 | # returns rewards and dones
129 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
130 | if getattr(self, 'action_space', None):
131 | actions = np.clip(actions, self.action_space.low,
132 | self.action_space.high)
133 | rewards = - self.cost_np_vec(states, actions, next_states)
134 | return rewards, np.zeros_like(rewards, dtype=np.bool)
135 |
136 | def angle_normalize(x):
137 | return (((x + np.pi) % (2 * np.pi)) - np.pi)
138 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/assets/pusher.xml:
--------------------------------------------------------------------------------
(MuJoCo XML model markup not preserved in this listing.)
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/assets/half_cheetah.xml:
--------------------------------------------------------------------------------
(MuJoCo XML model markup not preserved in this listing.)
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/assets/half_cheetah.xml:
--------------------------------------------------------------------------------
(MuJoCo XML model markup not preserved in this listing.)
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_cartpoleO001.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from gym import spaces
4 | from gym.utils import seeding
5 | import numpy as np
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class CartPoleEnv(BaseModelBasedEnv):
11 | metadata = {
12 | 'render.modes': ['human', 'rgb_array'],
13 | 'video.frames_per_second': 50
14 | }
15 |
16 | def __init__(self):
17 | self.gravity = 9.8
18 | self.masscart = 1.0
19 | self.masspole = 0.1
20 | self.total_mass = (self.masspole + self.masscart)
21 | self.length = 0.5 # actually half the pole's length
22 | self.polemass_length = (self.masspole * self.length)
23 | self.force_mag = 10.0
24 | self.tau = 0.02 # seconds between state updates
25 |
26 | # Angle at which to fail the episode
27 | self.theta_threshold_radians = 12 * 2 * math.pi / 360
28 | self.x_threshold = 2.4
29 |
30 | # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds
31 | high = np.array([
32 | self.x_threshold * 2,
33 | np.finfo(np.float32).max,
34 | self.theta_threshold_radians * 2,
35 | np.finfo(np.float32).max])
36 |
37 | # self.action_space = spaces.Discrete(2)
38 | self.action_space = \
39 | spaces.Box(low=np.array([-1.0]), high=np.array([1.0]))
40 | self.observation_space = spaces.Box(-high, high)
41 |
42 | self._seed()
43 | self.viewer = None
44 | self.state = None
45 |
46 | self.steps_beyond_done = None
47 |
48 | def _seed(self, seed=None):
49 | self.np_random, seed = seeding.np_random(seed)
50 | return [seed]
51 |
52 | def step(self, action):
53 | action = 1 if action[0] > .0 else 0
54 | state = self.state
55 | obs = self.state
56 | reward = np.cos(obs[2]) - 0.01 * (obs[0] ** 2)
57 |
58 | x, x_dot, theta, theta_dot = state
59 | force = self.force_mag if action == 1 else -self.force_mag
60 | costheta = math.cos(theta)
61 | sintheta = math.sin(theta)
62 | temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
63 | thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass))
64 | xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
65 | x = x + self.tau * x_dot
66 | x_dot = x_dot + self.tau * xacc
67 | theta = theta + self.tau * theta_dot
68 | theta_dot = theta_dot + self.tau * thetaacc
69 | self.state = (x, x_dot, theta, theta_dot)
70 |
71 | done = False
72 | self.steps_beyond_done = None
73 |
74 | ob = np.array(self.state)
75 | ob += np.random.uniform(low=-0.01, high=0.01, size=ob.shape)
76 |
77 | return ob, reward, done, {}
78 |
79 | def _reset(self):
80 | self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
81 | self.steps_beyond_done = None
82 | return np.array(self.state)
83 |
84 | def render(self, mode='human', close=False):
85 | if close:
86 | if self.viewer is not None:
87 | self.viewer.close()
88 | self.viewer = None
89 | return
90 |
91 | screen_width = 600
92 | screen_height = 400
93 |
94 | world_width = self.x_threshold * 2
95 | scale = screen_width / world_width
96 | carty = 100 # TOP OF CART
97 | polewidth = 10.0
98 | polelen = scale * 1.0
99 | cartwidth = 50.0
100 | cartheight = 30.0
101 |
102 | if self.viewer is None:
103 | from gym.envs.classic_control import rendering
104 | self.viewer = rendering.Viewer(screen_width, screen_height)
105 | l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2
106 | axleoffset = cartheight / 4.0
107 | cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
108 | self.carttrans = rendering.Transform()
109 | cart.add_attr(self.carttrans)
110 | self.viewer.add_geom(cart)
111 | l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2
112 | pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
113 | pole.set_color(.8, .6, .4)
114 | self.poletrans = rendering.Transform(translation=(0, axleoffset))
115 | pole.add_attr(self.poletrans)
116 | pole.add_attr(self.carttrans)
117 | self.viewer.add_geom(pole)
118 | self.axle = rendering.make_circle(polewidth / 2)
119 | self.axle.add_attr(self.poletrans)
120 | self.axle.add_attr(self.carttrans)
121 | self.axle.set_color(.5, .5, .8)
122 | self.viewer.add_geom(self.axle)
123 | self.track = rendering.Line((0, carty), (screen_width, carty))
124 | self.track.set_color(0, 0, 0)
125 | self.viewer.add_geom(self.track)
126 |
127 | if self.state is None:
128 | return None
129 |
130 | x = self.state
131 | cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART
132 | self.carttrans.set_translation(cartx, carty)
133 | self.poletrans.set_rotation(-x[2])
134 |
135 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array'))
136 |
137 | def cost_np_vec(self, obs, acts, next_obs):
138 | x = obs[:, 0]
139 | theta = obs[:, 2]
140 | return -(np.cos(theta) - 0.01 * (x ** 2))
141 |
142 | def mb_step(self, states, actions, next_states):
143 | # returns rewards and dones
144 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
145 | if getattr(self, 'action_space', None):
146 | actions = np.clip(actions, self.action_space.low,
147 | self.action_space.high)
148 | rewards = - self.cost_np_vec(states, actions, next_states)
149 | return rewards, np.zeros_like(rewards, dtype=np.bool)
150 |
151 |
152 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/cartpole.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from gym import spaces
4 | from gym.utils import seeding
5 | import numpy as np
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class CartPoleEnv(BaseModelBasedEnv):
11 | metadata = {
12 | 'render.modes': ['human', 'rgb_array'],
13 | 'video.frames_per_second': 50
14 | }
15 |
16 | def __init__(self):
17 | self.gravity = 9.8
18 | self.masscart = 1.0
19 | self.masspole = 0.1
20 | self.total_mass = (self.masspole + self.masscart)
21 | self.length = 0.5 # actually half the pole's length
22 | self.polemass_length = (self.masspole * self.length)
23 | self.force_mag = 10.0
24 | self.tau = 0.02 # seconds between state updates
25 |
26 | # Angle at which to fail the episode
27 | self.theta_threshold_radians = 12 * 2 * math.pi / 360
28 | self.x_threshold = 2.4
29 |
30 | # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds
31 | high = np.array([
32 | self.x_threshold * 2,
33 | np.finfo(np.float32).max,
34 | self.theta_threshold_radians * 2,
35 | np.finfo(np.float32).max])
36 |
37 | # self.action_space = spaces.Discrete(2)
38 | self.action_space = \
39 | spaces.Box(low=np.array([-1.0]), high=np.array([1.0]))
40 | self.observation_space = spaces.Box(-high, high)
41 |
42 | self._seed()
43 | self.viewer = None
44 | self.state = None
45 |
46 | self.steps_beyond_done = None
47 |
48 | def _seed(self, seed=None):
49 | self.np_random, seed = seeding.np_random(seed)
50 | return [seed]
51 |
52 | def step(self, action):
53 | action = 1 if action[0] > .0 else 0
54 | # assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action))
55 | state = self.state
56 | obs = self.state
57 | reward = np.cos(obs[2]) - 0.01 * (obs[0] ** 2)
58 |
59 | x, x_dot, theta, theta_dot = state
60 | force = self.force_mag if action == 1 else -self.force_mag
61 | costheta = math.cos(theta)
62 | sintheta = math.sin(theta)
63 | temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
64 | thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass))
65 | xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
66 | x = x + self.tau * x_dot
67 | x_dot = x_dot + self.tau * xacc
68 | theta = theta + self.tau * theta_dot
69 | theta_dot = theta_dot + self.tau * thetaacc
70 | self.state = (x, x_dot, theta, theta_dot)
71 | done = False
72 | self.steps_beyond_done = None
73 |
74 | return np.array(self.state), reward, done, {}
75 |
76 | def _reset(self):
77 | self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
78 | self.steps_beyond_done = None
79 | return np.array(self.state)
80 |
81 | def render(self, mode='human', close=False):
82 | if close:
83 | if self.viewer is not None:
84 | self.viewer.close()
85 | self.viewer = None
86 | return
87 |
88 | screen_width = 600
89 | screen_height = 400
90 |
91 | world_width = self.x_threshold * 2
92 | scale = screen_width / world_width
93 | carty = 100 # TOP OF CART
94 | polewidth = 10.0
95 | polelen = scale * 1.0
96 | cartwidth = 50.0
97 | cartheight = 30.0
98 |
99 | if self.viewer is None:
100 | from gym.envs.classic_control import rendering
101 | self.viewer = rendering.Viewer(screen_width, screen_height)
102 | l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2
103 | axleoffset = cartheight / 4.0
104 | cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
105 | self.carttrans = rendering.Transform()
106 | cart.add_attr(self.carttrans)
107 | self.viewer.add_geom(cart)
108 | l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2
109 | pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
110 | pole.set_color(.8, .6, .4)
111 | self.poletrans = rendering.Transform(translation=(0, axleoffset))
112 | pole.add_attr(self.poletrans)
113 | pole.add_attr(self.carttrans)
114 | self.viewer.add_geom(pole)
115 | self.axle = rendering.make_circle(polewidth / 2)
116 | self.axle.add_attr(self.poletrans)
117 | self.axle.add_attr(self.carttrans)
118 | self.axle.set_color(.5, .5, .8)
119 | self.viewer.add_geom(self.axle)
120 | self.track = rendering.Line((0, carty), (screen_width, carty))
121 | self.track.set_color(0, 0, 0)
122 | self.viewer.add_geom(self.track)
123 |
124 | if self.state is None:
125 | return None
126 |
127 | x = self.state
128 | cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART
129 | self.carttrans.set_translation(cartx, carty)
130 | self.poletrans.set_rotation(-x[2])
131 |
132 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array'))
133 |
134 | def mb_step(self, states, actions, next_states):
135 | # returns rewards and dones
136 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
137 | if getattr(self, 'action_space', None):
138 | actions = np.clip(actions, self.action_space.low,
139 | self.action_space.high)
140 | rewards = - self.cost_np_vec(states, actions, next_states)
141 | return rewards, np.zeros_like(rewards, dtype=np.bool)
142 |
143 | def cost_np_vec(self, obs, acts, next_obs):
144 | x = obs[:, 0]
145 | theta = obs[:, 2]
146 | return -(np.cos(theta) - 0.01 * (x ** 2))
147 |
148 | def verify(self, n=2000, eps=1e-4):
149 | pass
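For reference, step above integrates the usual cart-pole equations with explicit Euler steps of size τ = 0.02, where m_c = 1, m_p = 0.1, l = 0.5 is the pole half-length, and F = ±10 according to the sign of the input action:

$$\tilde F = \frac{F + m_p l\,\dot\theta^{2}\sin\theta}{m_c + m_p}, \qquad \ddot\theta = \frac{g\sin\theta - \tilde F\cos\theta}{l\left(\frac{4}{3} - \frac{m_p\cos^{2}\theta}{m_c + m_p}\right)}, \qquad \ddot x = \tilde F - \frac{m_p l\,\ddot\theta\cos\theta}{m_c + m_p},$$

and the reward used by both step and cost_np_vec is r = cos θ - 0.01 x².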
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/gym_cartpoleO01.py:
--------------------------------------------------------------------------------
1 | import math
2 | from gym import spaces
3 | from gym.utils import seeding
4 | import numpy as np
5 |
6 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
7 |
8 |
9 | class CartPoleEnv(BaseModelBasedEnv):
10 | metadata = {
11 | 'render.modes': ['human', 'rgb_array'],
12 | 'video.frames_per_second': 50
13 | }
14 |
15 | def __init__(self):
16 | self.gravity = 9.8
17 | self.masscart = 1.0
18 | self.masspole = 0.1
19 | self.total_mass = (self.masspole + self.masscart)
20 | self.length = 0.5 # actually half the pole's length
21 | self.polemass_length = (self.masspole * self.length)
22 | self.force_mag = 10.0
23 | self.tau = 0.02 # seconds between state updates
24 |
25 | # Angle at which to fail the episode
26 | self.theta_threshold_radians = 12 * 2 * math.pi / 360
27 | self.x_threshold = 2.4
28 |
29 | # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds
30 | high = np.array([
31 | self.x_threshold * 2,
32 | np.finfo(np.float32).max,
33 | self.theta_threshold_radians * 2,
34 | np.finfo(np.float32).max])
35 |
36 | # self.action_space = spaces.Discrete(2)
37 | self.action_space = \
38 | spaces.Box(low=np.array([-1.0]), high=np.array([1.0]))
39 | self.observation_space = spaces.Box(-high, high)
40 |
41 | self._seed()
42 | self.viewer = None
43 | self.state = None
44 |
45 | self.steps_beyond_done = None
46 |
47 | def _seed(self, seed=None):
48 | self.np_random, seed = seeding.np_random(seed)
49 | return [seed]
50 |
51 | def step(self, action):
52 | action = 1 if action[0] > .0 else 0
53 | # assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action))
54 | state = self.state
55 | obs = self.state
56 | reward = np.cos(obs[2]) - 0.01 * (obs[0] ** 2)
57 |
58 | x, x_dot, theta, theta_dot = state
59 | force = self.force_mag if action == 1 else -self.force_mag
60 | costheta = math.cos(theta)
61 | sintheta = math.sin(theta)
62 | temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
63 | thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0 / 3.0 - self.masspole * costheta * costheta / self.total_mass))
64 | xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
65 | x = x + self.tau * x_dot
66 | x_dot = x_dot + self.tau * xacc
67 | theta = theta + self.tau * theta_dot
68 | theta_dot = theta_dot + self.tau * thetaacc
69 | self.state = (x, x_dot, theta, theta_dot)
70 |
71 | done = False
72 | self.steps_beyond_done = None
73 |
74 | ob = np.array(self.state)
75 | ob += np.random.uniform(low=-0.1, high=0.1, size=ob.shape)
76 |
77 | return ob, reward, done, {}
78 |
79 | def _reset(self):
80 | self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
81 | self.steps_beyond_done = None
82 | return np.array(self.state)
83 |
84 | def render(self, mode='human', close=False):
85 | if close:
86 | if self.viewer is not None:
87 | self.viewer.close()
88 | self.viewer = None
89 | return
90 |
91 | screen_width = 600
92 | screen_height = 400
93 |
94 | world_width = self.x_threshold * 2
95 | scale = screen_width / world_width
96 | carty = 100 # TOP OF CART
97 | polewidth = 10.0
98 | polelen = scale * 1.0
99 | cartwidth = 50.0
100 | cartheight = 30.0
101 |
102 | if self.viewer is None:
103 | from gym.envs.classic_control import rendering
104 | self.viewer = rendering.Viewer(screen_width, screen_height)
105 | l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2
106 | axleoffset = cartheight / 4.0
107 | cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
108 | self.carttrans = rendering.Transform()
109 | cart.add_attr(self.carttrans)
110 | self.viewer.add_geom(cart)
111 | l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2
112 | pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
113 | pole.set_color(.8, .6, .4)
114 | self.poletrans = rendering.Transform(translation=(0, axleoffset))
115 | pole.add_attr(self.poletrans)
116 | pole.add_attr(self.carttrans)
117 | self.viewer.add_geom(pole)
118 | self.axle = rendering.make_circle(polewidth / 2)
119 | self.axle.add_attr(self.poletrans)
120 | self.axle.add_attr(self.carttrans)
121 | self.axle.set_color(.5, .5, .8)
122 | self.viewer.add_geom(self.axle)
123 | self.track = rendering.Line((0, carty), (screen_width, carty))
124 | self.track.set_color(0, 0, 0)
125 | self.viewer.add_geom(self.track)
126 |
127 | if self.state is None:
128 | return None
129 |
130 | x = self.state
131 | cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART
132 | self.carttrans.set_translation(cartx, carty)
133 | self.poletrans.set_rotation(-x[2])
134 |
135 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array'))
136 |
137 | def cost_np_vec(self, obs, acts, next_obs):
138 | x = obs[:, 0]
139 | theta = obs[:, 2]
140 | return -(np.cos(theta) - 0.01 * (x ** 2))
141 |
142 | def mb_step(self, states, actions, next_states):
143 | # returns rewards and dones
144 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
145 | if getattr(self, 'action_space', None):
146 | actions = np.clip(actions, self.action_space.low, self.action_space.high)
147 | rewards = - self.cost_np_vec(states, actions, next_states)
148 | return rewards, np.zeros_like(rewards, dtype=np.bool)
149 |
150 |
151 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/mountain_car.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | from gym import spaces
4 | from gym.utils import seeding
5 | import numpy as np
6 |
7 | from mbpo_pytorch.envs.virtual_env import BaseModelBasedEnv
8 |
9 |
10 | class Continuous_MountainCarEnv(BaseModelBasedEnv):
11 | metadata = {
12 | 'render.modes': ['human', 'rgb_array'],
13 | 'video.frames_per_second': 30
14 | }
15 |
16 | def __init__(self):
17 | self.min_action = -1.0
18 | self.max_action = 1.0
19 | self.min_position = -1.2
20 | self.max_position = 0.6
21 | self.max_speed = 0.07
22 | self.goal_position = 0.45 # was 0.5 in gym, 0.45 in Arnaud de Broissia's version
23 | self.power = 0.0015
24 |
25 | self.low_state = np.array([self.min_position, -self.max_speed])
26 | self.high_state = np.array([self.max_position, self.max_speed])
27 |
28 | self.viewer = None
29 |
30 | self.action_space = spaces.Box(self.min_action, self.max_action, shape=(1,))
31 | self.observation_space = spaces.Box(self.low_state, self.high_state)
32 |
33 | self._seed()
34 | self.reset()
35 |
36 | def _seed(self, seed=None):
37 | self.np_random, seed = seeding.np_random(seed)
38 | return [seed]
39 |
40 | def step(self, action):
41 |
42 | position = self.state[0]
43 | velocity = self.state[1]
44 | force = min(max(action[0], -1.0), 1.0)
45 | reward = position
46 |
47 | velocity += force * self.power - 0.0025 * math.cos(3 * position)
48 | if (velocity > self.max_speed):
49 | velocity = self.max_speed
50 | if (velocity < -self.max_speed):
51 | velocity = -self.max_speed
52 | position += velocity
53 | if (position > self.max_position):
54 | position = self.max_position
55 | if (position < self.min_position):
56 | position = self.min_position
57 | if (position == self.min_position and velocity < 0):
58 | velocity = 0
59 |
60 | """
61 | done = bool(position >= self.goal_position)
62 |
63 | reward = 0
64 | if done:
65 | reward = 100.0
66 | reward -= math.pow(action[0], 2) * 0.1
67 |
68 | """
69 | done = False
70 | self.state = np.array([position, velocity])
71 | return self.state, reward, done, {}
72 |
73 | def _reset(self):
74 | self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0])
75 | return np.array(self.state)
76 |
77 | # def get_state(self):
78 | # return self.state
79 |
80 | def _height(self, xs):
81 | return np.sin(3 * xs) * .45 + .55
82 |
83 | def render(self, mode='human', close=False):
84 | if close:
85 | if self.viewer is not None:
86 | self.viewer.close()
87 | self.viewer = None
88 | return
89 |
90 | screen_width = 600
91 | screen_height = 400
92 |
93 | world_width = self.max_position - self.min_position
94 | scale = screen_width / world_width
95 | carwidth = 40
96 | carheight = 20
97 |
98 | if self.viewer is None:
99 | from gym.envs.classic_control import rendering
100 | self.viewer = rendering.Viewer(screen_width, screen_height)
101 | xs = np.linspace(self.min_position, self.max_position, 100)
102 | ys = self._height(xs)
103 | xys = list(zip((xs - self.min_position) * scale, ys * scale))
104 |
105 | self.track = rendering.make_polyline(xys)
106 | self.track.set_linewidth(4)
107 | self.viewer.add_geom(self.track)
108 |
109 | clearance = 10
110 |
111 | l, r, t, b = -carwidth / 2, carwidth / 2, carheight, 0
112 | car = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
113 | car.add_attr(rendering.Transform(translation=(0, clearance)))
114 | self.cartrans = rendering.Transform()
115 | car.add_attr(self.cartrans)
116 | self.viewer.add_geom(car)
117 | frontwheel = rendering.make_circle(carheight / 2.5)
118 | frontwheel.set_color(.5, .5, .5)
119 | frontwheel.add_attr(rendering.Transform(translation=(carwidth / 4, clearance)))
120 | frontwheel.add_attr(self.cartrans)
121 | self.viewer.add_geom(frontwheel)
122 | backwheel = rendering.make_circle(carheight / 2.5)
123 | backwheel.add_attr(rendering.Transform(translation=(-carwidth / 4, clearance)))
124 | backwheel.add_attr(self.cartrans)
125 | backwheel.set_color(.5, .5, .5)
126 | self.viewer.add_geom(backwheel)
127 | flagx = (self.goal_position - self.min_position) * scale
128 | flagy1 = self._height(self.goal_position) * scale
129 | flagy2 = flagy1 + 50
130 | flagpole = rendering.Line((flagx, flagy1), (flagx, flagy2))
131 | self.viewer.add_geom(flagpole)
132 | flag = rendering.FilledPolygon([(flagx, flagy2), (flagx, flagy2 - 10), (flagx + 25, flagy2 - 5)])
133 | flag.set_color(.8, .8, 0)
134 | self.viewer.add_geom(flag)
135 |
136 | pos = self.state[0]
137 | self.cartrans.set_translation((pos - self.min_position) * scale, self._height(pos) * scale)
138 | self.cartrans.set_rotation(math.cos(3 * pos))
139 |
140 | return self.viewer.render(return_rgb_array=(mode == 'rgb_array'))
141 |
142 | def mb_step(self, states, actions, next_states):
143 | # returns rewards and dones
144 | # forward rewards are calculated based on states, instead of next_states as in original SLBO envs
145 | if getattr(self, 'action_space', None):
146 | actions = np.clip(actions, self.action_space.low,
147 | self.action_space.high)
148 | rewards = - self.cost_np_vec(states, actions, next_states)
149 | return rewards, np.zeros_like(rewards, dtype=np.bool)
150 |
151 | def cost_np_vec(self, obs, acts, next_obs):
152 | """
153 | position = self.state[0]
154 | velocity = self.state[1]
155 | force = min(max(action[0], -1.0), 1.0)
156 | reward = position
157 | """
158 | position = obs[:, 0]
159 | return -position
160 |
--------------------------------------------------------------------------------
/mbpo_pytorch/envs/benchmarking_envs/gym/assets/pusher.xml:
--------------------------------------------------------------------------------
(MuJoCo XML model markup not preserved in this listing.)
--------------------------------------------------------------------------------
/mbpo_pytorch/algos/mfrl/sac.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from operator import itemgetter
4 | from typing import TYPE_CHECKING, Dict
5 |
6 | import torch
7 | from torch import nn as nn
8 |
9 | from mbpo_pytorch.models.utils import soft_update
10 |
11 | if TYPE_CHECKING:
12 | from mbpo_pytorch.models import Actor, QCritic
13 | from mbpo_pytorch.storages import SimpleUniversalBuffer as Buffer
14 |
15 |
16 | class SAC:
17 | def __init__(
18 | self,
19 | actor: Actor,
20 | q_critic1: QCritic,
21 | q_critic2: QCritic,
22 | target_q_critic1: QCritic,
23 | target_q_critic2: QCritic,
24 | batch_size: int,
25 | num_grad_steps: int,
26 | gamma=0.99,
27 | reward_scale=1.0,
28 | actor_lr=1e-3,
29 | critic_lr=1e-3,
30 | soft_target_tau=1e-2,
31 | target_update_interval=1,
32 | use_automatic_entropy_tuning=True,
33 | target_entropy=None,
34 | alpha=1.0,
35 | ):
36 | super().__init__()
37 | self.actor = actor
38 | self.q_critic1 = q_critic1
39 | self.q_critic2 = q_critic2
40 | self.target_q_critic1 = target_q_critic1
41 | self.target_q_critic2 = target_q_critic2
42 | self.soft_target_tau = soft_target_tau
43 | self.target_update_interval = target_update_interval
44 |
45 | self.batch_size = batch_size
46 | self.num_grad_steps = num_grad_steps
47 |
48 | self.use_automatic_entropy_tuning = use_automatic_entropy_tuning
49 | if self.use_automatic_entropy_tuning:
50 | self.target_entropy = torch.tensor(target_entropy)  # must be provided (commonly -action_dim) when auto-tuning is enabled
51 | self.log_alpha = torch.zeros(1, requires_grad=True)
52 | self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=actor_lr)
53 |
54 | self._alpha = torch.tensor(alpha)
55 |
56 | self.qf_criterion = nn.MSELoss()
57 |
58 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
59 | self.qf1_optimizer = torch.optim.Adam(self.q_critic1.parameters(), lr=critic_lr)
60 | self.qf2_optimizer = torch.optim.Adam(self.q_critic2.parameters(), lr=critic_lr)
61 |
62 | self.gamma = gamma
63 | self.reward_scale = reward_scale
64 | self.total_num_updates = 0
65 | self._need_to_update_eval_statistics = True
66 |
67 | self._sync_target_network()
68 |
69 | @staticmethod
70 | def check_buffer(buffer):
71 | assert {'states', 'actions', 'rewards', 'masks', 'next_states'}.issubset(buffer.entry_infos.keys())
72 |
73 | def _sync_target_network(self):
74 | soft_update(self.q_critic1, self.target_q_critic1, 1.0)  # tau=1.0 performs a hard copy of the online weights
75 | soft_update(self.q_critic2, self.target_q_critic2, 1.0)
76 |
77 | def update(self, policy_buffer: Buffer) -> Dict[str, float]:
78 |
79 | data_generator = policy_buffer.get_batch_generator_inf(self.batch_size)
80 |
81 | self.actor.train()
82 | self.q_critic1.train()
83 | self.q_critic2.train()
84 | self.target_q_critic1.train()
85 | self.target_q_critic2.train()
86 |
87 | policy_loss_epoch = 0.
88 | qf1_loss_epoch = 0.
89 | qf2_loss_epoch = 0.
90 | alpha_loss_epoch = 0.
91 |
92 | for _ in range(self.num_grad_steps):
93 |
94 | samples = next(data_generator)
95 |
96 | states, actions, rewards, masks, next_states = \
97 | itemgetter('states', 'actions', 'rewards', 'masks', 'next_states')(samples)
98 |
99 | new_actions, log_probs = itemgetter('actions', 'log_probs')(self.actor.act(states, reparameterize=True))
100 |
101 | if self.use_automatic_entropy_tuning:
102 | alpha_loss = -(self.log_alpha.to(log_probs.device) * (log_probs + self.target_entropy).detach()).mean()
103 | self.alpha_optimizer.zero_grad()
104 | alpha_loss.backward()
105 | self.alpha_optimizer.step()
106 | alpha = self.log_alpha.exp()
107 | else:
108 | alpha_loss = torch.tensor([0.])
109 | alpha = self._alpha
110 |
111 | alpha = alpha.to(log_probs.device)
112 |
113 | q_new_actions = torch.min(self.q_critic1(states, new_actions),
114 | self.q_critic2(states, new_actions))
115 | policy_loss = (alpha * log_probs - q_new_actions).mean()
116 |
117 | q1_pred = self.q_critic1(states, actions)
118 | q2_pred = self.q_critic2(states, actions)
119 |
120 | new_next_actions, new_next_log_probs = \
121 | itemgetter('actions', 'log_probs')(self.actor.act(next_states, reparameterize=True))
122 |
123 | target_q_values = torch.min(self.target_q_critic1(next_states, new_next_actions),
124 | self.target_q_critic2(next_states, new_next_actions)) \
125 | - alpha * new_next_log_probs
126 |
127 | q_target = self.reward_scale * rewards + masks * self.gamma * target_q_values  # masks zero the bootstrap term at terminal states
128 | qf1_loss = self.qf_criterion(q1_pred, q_target.detach())
129 | qf2_loss = self.qf_criterion(q2_pred, q_target.detach())
130 |
131 | self.actor_optimizer.zero_grad()
132 | policy_loss.backward()
133 | self.actor_optimizer.step()
134 |
135 | self.qf1_optimizer.zero_grad()
136 | qf1_loss.backward()
137 | self.qf1_optimizer.step()
138 |
139 | self.qf2_optimizer.zero_grad()
140 | qf2_loss.backward()
141 | self.qf2_optimizer.step()
142 |
143 | if self.total_num_updates % self.target_update_interval == 0:
144 | soft_update(self.q_critic1, self.target_q_critic1, self.soft_target_tau)
145 | soft_update(self.q_critic2, self.target_q_critic2, self.soft_target_tau)
146 |
147 | self.total_num_updates += 1
148 |
149 | policy_loss_epoch += policy_loss.item()
150 | qf1_loss_epoch += qf1_loss.item()
151 | qf2_loss_epoch += qf2_loss.item()
152 | alpha_loss_epoch += alpha_loss.item()
153 |
154 | policy_loss_epoch /= self.num_grad_steps
155 | qf1_loss_epoch /= self.num_grad_steps
156 | qf2_loss_epoch /= self.num_grad_steps
157 | alpha_loss_epoch /= self.num_grad_steps
158 |
159 | return {'policy_loss': policy_loss_epoch, 'qf1_loss': qf1_loss_epoch,
160 | 'qf2_loss': qf2_loss_epoch, 'alpha_loss': alpha_loss_epoch}
161 |
162 |
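# Usage sketch (illustrative only; the Actor/QCritic constructor arguments and the
# buffer are assumptions -- only the SAC interface itself is taken from this file):
#
#     actor, qf1, qf2 = Actor(...), QCritic(...), QCritic(...)
#     target_qf1, target_qf2 = QCritic(...), QCritic(...)
#     agent = SAC(actor, qf1, qf2, target_qf1, target_qf2,
#                 batch_size=256, num_grad_steps=20, target_entropy=-act_dim)
#     SAC.check_buffer(policy_buffer)       # buffer must expose states/actions/rewards/masks/next_states
#     losses = agent.update(policy_buffer)  # mean policy, twin-Q and alpha losses over num_grad_steps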
--------------------------------------------------------------------------------