├── .gitignore ├── README.md ├── example_run.sh ├── humanoid_fig.png ├── milo ├── milo │ ├── __init__.py │ ├── cost │ │ ├── __init__.py │ │ └── linear_cost.py │ ├── dataset │ │ ├── __init__.py │ │ └── datasets.py │ ├── dynamics_model │ │ ├── __init__.py │ │ └── mlp_dynamics.py │ ├── gym_env │ │ ├── __init__.py │ │ ├── ant.py │ │ ├── half_cheetah.py │ │ ├── hopper.py │ │ ├── humanoid.py │ │ ├── multiprocessing_env.py │ │ ├── walker2d.py │ │ └── wrappers.py │ ├── sampler │ │ ├── __init__.py │ │ └── sampler.py │ └── utils │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── evaluate.py │ │ ├── logger.py │ │ └── util.py └── setup.py ├── mjrl ├── .gitignore ├── LICENSE ├── README.md ├── examples │ ├── README.md │ ├── behavior_clone.py │ ├── example_configs │ │ ├── hopper_npg.txt │ │ ├── swimmer_npg.txt │ │ └── swimmer_ppo.txt │ ├── linear_nn_comparison.py │ └── policy_opt_job_script.py ├── mjrl │ ├── __init__.py │ ├── algos │ │ ├── __init__.py │ │ ├── batch_reinforce.py │ │ ├── behavior_cloning.py │ │ ├── dapg.py │ │ ├── mbac.py │ │ ├── model_accel │ │ │ ├── __init__.py │ │ │ ├── model_accel_npg.py │ │ │ ├── model_learning_mpc.py │ │ │ ├── nn_dynamics.py │ │ │ ├── run_experiments │ │ │ │ ├── configs │ │ │ │ │ ├── point_mass.txt │ │ │ │ │ └── reacher.txt │ │ │ │ ├── run_model_accel_npg.py │ │ │ │ ├── sandbox │ │ │ │ │ ├── example_config_mpc.txt │ │ │ │ │ └── run_model_learning_mpc.py │ │ │ │ └── utils │ │ │ │ │ ├── reward_functions │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── mjrl_point_mass.py │ │ │ │ │ ├── visualize_policy.py │ │ │ │ │ └── visualize_trajectories.py │ │ │ └── sampling.py │ │ ├── npg_cg.py │ │ ├── ppo_clip.py │ │ └── trpo.py │ ├── baselines │ │ ├── __init__.py │ │ ├── linear_baseline.py │ │ ├── mlp_baseline.py │ │ ├── quadratic_baseline.py │ │ └── zero_baseline.py │ ├── envs │ │ ├── __init__.py │ │ ├── assets │ │ │ ├── peg_insertion.xml │ │ │ ├── point_mass.xml │ │ │ ├── sawyer.xml │ │ │ └── swimmer.xml │ │ ├── mujoco_env.py │ │ ├── peg_insertion_sawyer.py │ │ ├── point_mass.py │ │ ├── reacher_sawyer.py │ │ └── swimmer.py │ ├── policies │ │ ├── __init__.py │ │ ├── gaussian_linear.py │ │ ├── gaussian_mlp.py │ │ └── mpc_actor.py │ ├── samplers │ │ ├── __init__.py │ │ └── core.py │ └── utils │ │ ├── __init__.py │ │ ├── cg_solve.py │ │ ├── fc_network.py │ │ ├── get_environment.py │ │ ├── gym_env.py │ │ ├── logger.py │ │ ├── make_train_plots.py │ │ ├── optimize_model.py │ │ ├── process_samples.py │ │ ├── tensor_utils.py │ │ ├── train_agent.py │ │ └── visualize_policy.py ├── setup.py ├── setup │ ├── README.md │ └── env.yml └── tests │ ├── point_mass_test.py │ └── visualizer_test.py ├── requirements.txt ├── run.py └── run_hand.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Source Code for Model-based Imitation Learning from Offline data (MILO) 2 | Implementation of MILO, a model-based, offline imitation learning algorithm. 3 | 4 | ![figure](https://github.com/jdchang1/milo/blob/main/humanoid_fig.png) 5 | 6 | Link to pdf: https://arxiv.org/abs/2106.03207 7 | 8 | ## Notes on Installation 9 | After cloning this repository and installing the requirements, please run 10 | 11 | `cd milo && pip install -e .` 12 | 13 | `cd mjrl && pip install -e .` 14 | 15 | The experiments are run using MuJoCo physics, which requires a license to install. Please follow the instructions on [MuJoCo Website](http://www.mujoco.org) 16 | 17 | ## Overview 18 | The `milo` package contains our imitation learning, model-based environment stack, and boilerplate code. We modified the `mjrl` package to interface with our cost functions when doing model-based policy gradient. This modification can be seen in `mjrl/mjrl/algos/batch_reinforce.py`. Note that we currently only support NPG/TRPO as our policy gradient algorithm; however, in principle one could replace this with other algorithms/repositories. 19 | 20 | ## Environments Supported 21 | This repository supports 5 modified MuJoCo environments that can be found in `milo/milo/gym_env`. They are 22 | 1. Hopper-v4 23 | 2. Walker2d-v4 24 | 3. HalfCheetah-v4 25 | 4. Ant-v4 26 | 5. 
Humanoid-v4 27 | 28 | If you would like to add an environment, register the environment in `/milo/milo/gym_env/__init__.py` according to [OpenAI Gym](http://gym.openai.com/docs/#environments) instructions. 29 | 30 | ## Downloading the Datasets 31 | Please download the datasets from this [google drive link](https://drive.google.com/drive/folders/1gG2WIgL1mdznhuel5uKRb6lepF7EVeFr?usp=sharing). Each environment will have 2 datasets: `[ENV]_expert.pt` and `[ENV]_offline.pt`. 32 | 33 | In the `data` directory, place the expert and offline datasets in the `data/expert_data` and `data/offline_data` direcotires respectively. 34 | 35 | ## Running an Experiment 36 | We provide an example run script for Hopper, `example_run.sh`, that can be modified to be used with any other registered environment. To view all the possible arguments you can run please see the argparse in `milo/milo/utils/arguments.py`. 37 | 38 | ## Bibliography 39 | To cite this work, please use the following citation. Note that this repository builds upon MJRL so please also cite any references noted in the README [here](https://github.com/aravindr93/mjrl). 40 | ``` 41 | @misc{chang2021mitigating, 42 | title={Mitigating Covariate Shift in Imitation Learning via Offline Data Without Great Coverage}, 43 | author={Jonathan D. Chang and Masatoshi Uehara and Dhruv Sreenivas and Rahul Kidambi and Wen Sun}, 44 | year={2021}, 45 | eprint={2106.03207}, 46 | archivePrefix={arXiv}, 47 | primaryClass={cs.LG} 48 | } 49 | ``` 50 | -------------------------------------------------------------------------------- /example_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run.py --env Hopper-v4 \ 4 | --seed 100 \ 5 | --expert_db Hopper-v6_expert.pt \ 6 | --offline_db Hopper-v6_offline.pt \ 7 | --n_models 4 \ 8 | --lambda_b 0.0025 \ 9 | --samples_per_step 40000 \ 10 | --pg_iter 1 \ 11 | --bw_quantile 0.1 \ 12 | --id 1 \ 13 | --subsample_expert \ 14 | --n_iter 300 \ 15 | --cg_iter 25 \ 16 | --bc_epochs 1 \ 17 | --do_bc_reg \ 18 | --bc_reg_coeff 0.1 19 | -------------------------------------------------------------------------------- /humanoid_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/humanoid_fig.png -------------------------------------------------------------------------------- /milo/milo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/milo/milo/__init__.py -------------------------------------------------------------------------------- /milo/milo/cost/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.cost.linear_cost import RBFLinearCost 2 | -------------------------------------------------------------------------------- /milo/milo/cost/linear_cost.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import numpy as np 5 | 6 | class RBFLinearCost: 7 | """ 8 | MMD cost implementation with rff feature representations 9 | 10 | NOTE: Currently hardcoded to cpu 11 | 12 | :param expert_data: (torch Tensor) expert data used for feature matching 13 | :param feature_dim: (int) feature dimension for rff 14 | :param input_type: (str) state (s), state-action (sa), state-next state (ss), 
15 | state-action-next state (sas) 16 | :param cost_range: (list) inclusive range of costs 17 | :param bw_quantile: (float) quantile used to fit bandwidth for rff kernel 18 | :param bw_samples: (int) number of samples used to fit bandwidth 19 | :param lambda_b: (float) weight parameter for bonus and cost 20 | :param lr: (float) learning rate for discriminator/cost update. 0.0 = closed form update 21 | :param seed: (int) random seed to set cost function 22 | """ 23 | def __init__(self, 24 | expert_data, 25 | feature_dim=1024, 26 | input_type='sa', 27 | cost_range=[-1.,0.], 28 | bw_quantile=0.1, 29 | bw_samples=100000, 30 | lambda_b=1.0, 31 | lr=0.0, 32 | seed=100): 33 | 34 | # Set Random Seed 35 | torch.manual_seed(seed) 36 | np.random.seed(seed) 37 | 38 | self.expert_data = expert_data 39 | input_dim = expert_data.size(1) 40 | self.input_type = input_type 41 | self.feature_dim = feature_dim 42 | self.cost_range = cost_range 43 | if cost_range is not None: 44 | self.c_min, self.c_max = cost_range 45 | self.lambda_b = lambda_b 46 | self.lr = lr 47 | 48 | # Fit Bandwidth 49 | self.quantile = bw_quantile 50 | self.bw_samples = bw_samples 51 | self.bw = self.fit_bandwidth(expert_data) 52 | 53 | # Define Phi and Cost weights 54 | self.rff = nn.Linear(input_dim, feature_dim) 55 | self.rff.bias.data = (torch.rand_like(self.rff.bias.data)-0.5)*2.0*np.pi 56 | self.rff.weight.data = torch.rand_like(self.rff.weight.data)/(self.bw+1e-8) 57 | 58 | # W Update Init 59 | self.w = None 60 | 61 | # Compute Expert Phi Mean 62 | self.expert_rep = self.get_rep(expert_data) 63 | self.phi_e = self.expert_rep.mean(dim=0) 64 | 65 | def get_rep(self, x): 66 | """ 67 | Returns an RFF representation given an input 68 | """ 69 | with torch.no_grad(): 70 | out = self.rff(x.cpu()) 71 | out = torch.cos(out)*np.sqrt(2/self.feature_dim) 72 | return out 73 | 74 | def fit_bandwidth(self, data): 75 | """ 76 | Uses the median trick to fit the bandwidth for the RFF kernel 77 | """ 78 | num_data = data.shape[0] 79 | idxs_0 = torch.randint(low=0, high=num_data, size=(self.bw_samples,)) 80 | idxs_1 = torch.randint(low=0, high=num_data, size=(self.bw_samples,)) 81 | norm = torch.norm(data[idxs_0, :]-data[idxs_1, :], dim=1) 82 | bw = torch.quantile(norm, q=self.quantile).item() 83 | return bw 84 | 85 | def fit_cost(self, data_pi): 86 | """ 87 | Updates the weights of the cost with the closed form solution 88 | """ 89 | phi = self.get_rep(data_pi).mean(0) 90 | feat_diff = phi - self.phi_e 91 | 92 | # Closed form solution 93 | self.w = feat_diff 94 | 95 | return torch.dot(self.w, feat_diff).item() 96 | 97 | def get_costs(self, x): 98 | """ 99 | Returrns the IPM (MMD) cost for a given input 100 | """ 101 | data = self.get_rep(x) 102 | if self.cost_range is not None: 103 | return torch.clamp(torch.mm(data, self.w.unsqueeze(1)), self.c_min, self.c_max) 104 | return torch.mm(data, self.w.unsqueeze(1)) 105 | 106 | def get_expert_cost(self): 107 | """ 108 | Returns the mean expert cost given our current discriminator weights and representations 109 | """ 110 | return (1-self.lambda_b)*torch.clamp(torch.mm(self.expert_rep, self.w.unsqueeze(1)), self.c_min, self.c_max).mean() 111 | 112 | def get_bonus_costs(self, states, actions, ensemble, next_states=None): 113 | """ 114 | Computes the cost with pessimism 115 | """ 116 | if self.input_type == 'sa': 117 | rff_input = torch.cat([states, actions], dim=1) 118 | elif self.input_type == 'ss': 119 | assert(next_states is not None) 120 | rff_input = torch.cat([states, next_states], dim=1) 121 | elif 
self.input_type == 'sas': 122 | rff_input = torch.cat([states, actions, next_states], dim=1) 123 | elif self.input_type == 's': 124 | rff_input = states 125 | else: 126 | raise NotImplementedError("Input type not implemented") 127 | 128 | # Get Linear Cost 129 | rff_cost = self.get_costs(rff_input) 130 | 131 | if self.cost_range is not None: 132 | # Get Bonus from Ensemble 133 | discrepancy = ensemble.get_action_discrepancy(states, actions)/ensemble.threshold 134 | discrepancy = discrepancy.view(-1, 1) 135 | discrepancy[discrepancy>1.0] = 1.0 136 | # Bonus is LOW if (s,a) is unknown 137 | bonus = discrepancy * self.c_min 138 | else: 139 | bonus = ensemble.get_action_discrepancy(states, actions).view(-1,1) 140 | 141 | # Weight cost components 142 | ipm = (1-self.lambda_b)*rff_cost 143 | 144 | # Conservative/Pessimism Penalty term 145 | weighted_bonus = self.lambda_b*bonus.cpu() # Note cpu hardcoding 146 | 147 | # Cost 148 | cost = ipm - weighted_bonus 149 | 150 | # Logging info 151 | info = {'bonus': weighted_bonus, 'ipm': ipm, 'v_targ': rff_cost, 'cost': cost} 152 | 153 | return cost, info 154 | 155 | -------------------------------------------------------------------------------- /milo/milo/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.dataset.datasets import OfflineDataset 2 | -------------------------------------------------------------------------------- /milo/milo/dataset/datasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym.spaces import Discrete, Box 4 | 5 | import torch 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class OfflineDataset(Dataset): 10 | """ 11 | Pytorch Dataset class for our offline dataset. Note we return (s,a,s') triples. 12 | :param env_name: (str) name of gym environment 13 | :param state: (torch Tensor) tensor with shape (number of samples, state dimension) with state data 14 | :param action: (torch Tensor) tensor with shape (number of samples, action dimension) with action data 15 | :param next_state: (torch Tensor) tensor with shape (number of samples, state dimension) with next state data 16 | :param device: (torch Device) device for pytorch. 
Currently hardcoded to cpu 17 | """ 18 | def __init__(self, env_name, state, action, next_state, device=torch.device('cpu')): 19 | self.device = device 20 | self.state = state 21 | self.action = action 22 | 23 | env = gym.make(env_name) 24 | if isinstance(env.action_space, Discrete): 25 | self.action = self.one_hot(action, env.action_space.n) 26 | elif isinstance(env.action_space, Box): 27 | self.action = action 28 | else: 29 | raise NotImplementedError( 30 | "Environment Action Space not yet supported") 31 | self.next_state = next_state 32 | del env 33 | 34 | def get_transformations(self): 35 | diff = self.next_state - self.state 36 | 37 | # Compute Means 38 | state_mean = self.state.mean(dim=0).float().requires_grad_(False) 39 | action_mean = self.action.mean(dim=0).float().requires_grad_(False) 40 | diff_mean = diff.mean(dim=0).float().requires_grad_(False) 41 | 42 | # Compute Scales 43 | state_scale = torch.abs( 44 | self.state - state_mean).mean(dim=0).float().requires_grad_(False) + 1e-8 45 | action_scale = torch.abs( 46 | self.action - action_mean).mean(dim=0).float().requires_grad_(False) + 1e-8 47 | diff_scale = torch.abs( 48 | diff - diff_mean).mean(dim=0).float().requires_grad_(False) + 1e-8 49 | 50 | return state_mean.to(self.device), state_scale.to(self.device), action_mean.to(self.device), \ 51 | action_scale.to(self.device), diff_mean.to( 52 | self.device), diff_scale.to(self.device) 53 | 54 | def one_hot(self, action, action_dim): 55 | db_size = action.size(0) 56 | one_hot_action = torch.eye(action_dim)[action] 57 | return one_hot_action.view(db_size, action_dim) 58 | 59 | def __len__(self): 60 | return self.state.size(0) 61 | 62 | def __getitem__(self, idx): 63 | return self.state[idx].float(), self.action[idx].float(), self.next_state[idx].float() 64 | -------------------------------------------------------------------------------- /milo/milo/dynamics_model/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.dynamics_model.mlp_dynamics import DynamicsEnsemble, DynamicsModel 2 | -------------------------------------------------------------------------------- /milo/milo/gym_env/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.gym_env.wrappers import model_based_env 2 | from milo.gym_env.multiprocessing_env import MujocoEnvProcess 3 | from gym.envs.registration import register 4 | 5 | register( 6 | id='Hopper-v4', 7 | entry_point='milo.gym_env.hopper:HopperEnv', 8 | max_episode_steps=400, 9 | reward_threshold=3800.0, 10 | ) 11 | 12 | register( 13 | id='Walker2d-v4', 14 | max_episode_steps=400, 15 | entry_point='milo.gym_env.walker2d:Walker2dEnv' 16 | ) 17 | 18 | register( 19 | id='HalfCheetah-v4', 20 | entry_point='milo.gym_env.half_cheetah:HalfCheetahEnv', 21 | max_episode_steps=500, 22 | reward_threshold=4800.0, 23 | ) 24 | 25 | register( 26 | id='Ant-v4', 27 | entry_point='milo.gym_env.ant:AntEnv', 28 | max_episode_steps=500, 29 | reward_threshold=6000.0, 30 | ) 31 | 32 | register( 33 | id='Humanoid-v4', 34 | entry_point='milo.gym_env.humanoid:HumanoidEnv', 35 | max_episode_steps=500, 36 | ) 37 | -------------------------------------------------------------------------------- /milo/milo/gym_env/ant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | 
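        # Track the torso x-y position before each simulation step so that
        # _get_obs can report the per-step displacement (delta) rather than the
        # absolute position. Frame skip is 10 (instead of the default 5 used by
        # the standard gym Ant); get_reward rescales the reward by 2.0 to compensate.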
self.pos_before = np.array([0.0, 0.0]) 8 | mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 10) 9 | # mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 5) 10 | utils.EzPickle.__init__(self) 11 | 12 | def step(self, a): 13 | self.pos_before = self.data.qpos[:2].copy() 14 | self.do_simulation(a, self.frame_skip) 15 | obs = self._get_obs() 16 | reward = self.get_reward(obs, a) 17 | done = self.get_done(obs) 18 | return obs, reward, done, {} 19 | 20 | def _get_obs(self): 21 | delta = self.data.qpos[:2] - self.pos_before 22 | return np.concatenate([ 23 | delta, 24 | self.sim.data.qpos.flat[2:], 25 | self.sim.data.qvel.ravel() * self.dt, 26 | # NOTE: We are throwing away contact related info, since it is often unnecessary 27 | # np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 28 | ]) 29 | 30 | def get_reward(self, obs, act): 31 | obs = np.clip(obs, -10.0, 10.0) 32 | if len(obs.shape) == 1: 33 | # vector obs, called when stepping the env 34 | vel_x = obs[0] / self.dt # recover velocity from delta 35 | power = np.square(act).sum() 36 | # NOTE: We will use the contact force penalties for actual reward 37 | # to be consistent with gym results 38 | cfrc_ext = np.clip(self.sim.data.cfrc_ext, -1, 1).ravel() 39 | height = obs[2] 40 | reward = - 0.5 * 1e-3 * np.square(cfrc_ext).sum() # contact cost 41 | else: 42 | # for imaginary rollouts using learned model 43 | vel_x = obs[:, :, 0] / self.dt # recover velocity from delta 44 | power = np.square(act).sum(axis=-1) 45 | height = obs[:, :, 2] 46 | # NOTE: WE will not consider contact costs for imaginary rollouts 47 | reward = 0.0 48 | survive_reward = 1.0 * (height > 0.2) * (height < 1.0) 49 | ctrl_cost = 0.5 * power 50 | reward += vel_x - ctrl_cost + survive_reward 51 | reward = reward * 2.0 # to account for scaling difference (skip 5 --> 10) 52 | return reward 53 | 54 | def compute_path_rewards(self, paths): 55 | # path has two keys: observations and actions 56 | # path["observations"] : (num_traj, horizon, obs_dim) 57 | # path["rewards"] should have shape (num_traj, horizon) 58 | obs = paths["observations"] 59 | act = paths["actions"] 60 | rewards = self.get_reward(obs, act) 61 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 62 | return paths 63 | 64 | def get_done(self, obs): 65 | done = not (np.isfinite(obs).all() and (obs[2] > 0.2) and (obs[2] < 1.0)) 66 | return done 67 | 68 | def truncate_paths(self, paths): 69 | for path in paths: 70 | obs = path["observations"] 71 | height = obs[:,2]#obs[:, 0] 72 | T = obs.shape[0] 73 | t = 0 74 | done = False 75 | while t < T and done is False: 76 | done = not (np.isfinite(obs[t]).all() and (height[t] > 0.2) and (height[t] < 1.0)) 77 | T = t if done else T 78 | t = t + 1 79 | path["observations"] = path["observations"][:T] 80 | path["actions"] = path["actions"][:T] 81 | path["rewards"] = path["rewards"][:T] 82 | path["terminated"] = done 83 | return paths 84 | 85 | def reset_model(self): 86 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 87 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 88 | self.set_state(qpos, qvel) 89 | return self._get_obs() 90 | 91 | def get_env_state(self): 92 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy()) 93 | 94 | def set_env_state(self, state): 95 | qpos = state['qpos'] 96 | qvel = state['qvel'] 97 | self.sim.reset() 98 | self.data.qpos[:] = qpos 99 | self.data.qvel[:] = qvel 100 | self.sim.forward() 101 | 102 | def viewer_setup(self): 103 | self.viewer.cam.distance = 
self.model.stat.extent * 0.5 104 | -------------------------------------------------------------------------------- /milo/milo/gym_env/half_cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | self.pos_before = 0.0 8 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 10) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, a): 12 | self.pos_before = self.data.qpos[0].copy() 13 | self.do_simulation(a, self.frame_skip) 14 | obs = self._get_obs() 15 | reward = self.get_reward(obs, a) 16 | done = False # no termination for this env 17 | return obs, reward, done, {} 18 | 19 | def _get_obs(self): 20 | delta = self.data.qpos[0] - self.pos_before 21 | return np.concatenate([ 22 | [delta], 23 | self.sim.data.qpos.ravel()[1:], 24 | self.sim.data.qvel.ravel() * self.dt, 25 | ]) 26 | 27 | def get_reward(self, obs, act): 28 | obs = np.clip(obs, -10.0, 10.0) 29 | if len(obs.shape) == 1: 30 | # vector obs, called when stepping the env 31 | # vel_x = obs[-9] / self.dt # recover velocity from delta 32 | vel_x = obs[0] / self.dt 33 | power = np.square(act).sum() 34 | else: 35 | # vel_x = obs[:, :, -9] / self.dt # recover velocity from delta 36 | vel_x = obs[:, :, 0] / self.dt 37 | power = np.square(act).sum(axis=-1) 38 | reward = vel_x - 0.1 * power 39 | reward = reward * 2.0 # to account for scaling difference (skip 5 --> 10) 40 | return reward 41 | 42 | def compute_path_rewards(self, paths): 43 | # path has two keys: observations and actions 44 | # path["observations"] : (num_traj, horizon, obs_dim) 45 | # path["rewards"] should have shape (num_traj, horizon) 46 | obs = paths["observations"] 47 | act = paths["actions"] 48 | rewards = self.get_reward(obs, act) 49 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 50 | 51 | def get_env_state(self): 52 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy()) 53 | 54 | def set_env_state(self, state): 55 | qpos = state['qpos'] 56 | qvel = state['qvel'] 57 | self.sim.reset() 58 | self.data.qpos[:] = qpos 59 | self.data.qvel[:] = qvel 60 | self.sim.forward() 61 | 62 | def reset_model(self): 63 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 64 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 65 | self.set_state(qpos, qvel) 66 | return self._get_obs() 67 | 68 | def viewer_setup(self): 69 | self.viewer.cam.distance = self.model.stat.extent * 0.5 70 | -------------------------------------------------------------------------------- /milo/milo/gym_env/hopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | self.pos_before = 0.0 8 | self.height_idx = 1 9 | self.ang_idx = 2 10 | self.ang_threshold = 1.0 11 | mujoco_env.MujocoEnv.__init__(self, 'hopper.xml', 10) 12 | # mujoco_env.MujocoEnv.__init__(self, 'hopper.xml', 4) 13 | utils.EzPickle.__init__(self) 14 | 15 | def step(self, a): 16 | self.pos_before = self.data.qpos[0].copy() 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self._get_obs() 19 | reward = self.get_reward(obs, a) 20 | done = self.get_done(obs) 21 | return obs, reward, done, {} 22 | 23 | def 
_get_obs(self): 24 | # I am using delta instead of velocity, 25 | # so that all obs are of similar magnitude 26 | delta = self.data.qpos[0] - self.pos_before 27 | return np.concatenate([ 28 | [delta], 29 | self.sim.data.qpos.ravel()[1:], 30 | self.sim.data.qvel.ravel() * self.dt, 31 | ]) 32 | 33 | def get_reward(self, obs, act): 34 | obs = np.clip(obs, -10.0, 10.0) 35 | if len(obs.shape) == 1: 36 | # vector obs, called when stepping the env 37 | # vel_x = (obs[1] - obs[0]) / self.dt # recover velocity from delta 38 | vel_x = obs[0] / self.dt 39 | power = np.square(act).sum() 40 | height, ang = obs[self.height_idx:(self.ang_idx+1)] 41 | else: 42 | # vel_x = (obs[:, :, 1] - obs[:, :, 0]) / self.dt # recover velocity from delta 43 | vel_x = obs[:, :, 0] / self.dt 44 | power = np.square(act).sum(axis=-1) 45 | height = obs[:, :, self.height_idx] 46 | ang = obs[:, :, self.ang_idx] 47 | alive_bonus = 1.0 * (height > .7) * (np.abs(ang) < self.ang_threshold) 48 | reward = vel_x + alive_bonus - 1e-3*power 49 | reward = reward * 2.5 # to account for scaling difference (skip 4 --> 10) 50 | return reward 51 | 52 | def compute_path_rewards(self, paths): 53 | # path has two keys: observations and actions 54 | # path["observations"] : (num_traj, horizon, obs_dim) 55 | # path["rewards"] should have shape (num_traj, horizon) 56 | obs = paths["observations"] 57 | act = paths["actions"] 58 | rewards = self.get_reward(obs, act) 59 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 60 | 61 | def get_done(self, obs): 62 | height, ang = obs[self.height_idx:(self.ang_idx+1)] 63 | done = not (np.isfinite(obs).all() and (np.abs(obs) < 100).all() and 64 | (height > .7) and (np.abs(ang) < self.ang_threshold)) 65 | return done 66 | 67 | def truncate_paths(self, paths): 68 | for path in paths: 69 | obs = path["observations"] 70 | height = obs[:, self.height_idx] 71 | angle = obs[:, self.ang_idx] 72 | T = obs.shape[0] 73 | t = 0 74 | done = False 75 | while t < T and done is False: 76 | done = not ((np.abs(obs[t]) < 100).all() and (height[t] > .7) and (np.abs(angle[t]) < self.ang_threshold)) 77 | t = t + 1 78 | T = t if done else T 79 | path["observations"] = path["observations"][:T] 80 | path["actions"] = path["actions"][:T] 81 | path["rewards"] = path["rewards"][:T] 82 | path["terminated"] = done 83 | return paths 84 | 85 | def get_env_state(self): 86 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy()) 87 | 88 | def set_env_state(self, state): 89 | qpos = state['qpos'] 90 | qvel = state['qvel'] 91 | self.sim.reset() 92 | self.data.qpos[:] = qpos 93 | self.data.qvel[:] = qvel 94 | self.sim.forward() 95 | 96 | def reset_model(self): 97 | qpos = self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq) 98 | qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 99 | self.set_state(qpos, qvel) 100 | return self._get_obs() 101 | 102 | def viewer_setup(self): 103 | self.viewer.cam.trackbodyid = 2 104 | self.viewer.cam.distance = self.model.stat.extent * 0.75 105 | self.viewer.cam.lookat[2] = 1.15 106 | self.viewer.cam.elevation = -20 107 | -------------------------------------------------------------------------------- /milo/milo/gym_env/humanoid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import mujoco_env 3 | from gym import utils 4 | 5 | DEFAULT_CAMERA_CONFIG = { 6 | 'trackbodyid': 1, 7 | 'distance': 4.0, 8 | 'lookat': np.array((0.0, 
0.0, 2.0)), 9 | 'elevation': -20.0, 10 | } 11 | 12 | def mass_center(model, sim): 13 | mass = np.expand_dims(model.body_mass, axis=1) 14 | xpos = sim.data.xipos 15 | return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy() 16 | 17 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): 18 | def __init__(self, 19 | xml_file='humanoid.xml', 20 | reset_noise_scale=1e-2): 21 | utils.EzPickle.__init__(**locals()) 22 | 23 | self._reset_noise_scale = reset_noise_scale 24 | 25 | #mujoco_env.MujocoEnv.__init__(self, xml_file, 5) 26 | mujoco_env.MujocoEnv.__init__(self, xml_file, 10) 27 | 28 | def step(self, action): 29 | self.xypos_before = mass_center(self.model, self.sim) 30 | self.do_simulation(action, self.frame_skip) 31 | 32 | observation = self._get_obs() 33 | reward = self.get_reward(observation, action) 34 | done = self.get_done(observation) 35 | 36 | return observation, reward, done, {} 37 | 38 | def _get_obs(self): 39 | position = self.sim.data.qpos.flat.copy() 40 | velocity = self.sim.data.qvel.flat.copy() 41 | # Add Difference of center of mass to get reward 42 | delta = mass_center(self.model, self.sim) - self.xypos_before 43 | 44 | return np.concatenate(( 45 | delta, 46 | position[2:], 47 | velocity*self.dt, 48 | )) 49 | 50 | def get_reward(self, obs, action): 51 | obs = np.clip(obs, -10.0, 10.0) 52 | ctrl = np.clip(action, -0.4, 0.4) 53 | 54 | x_velocity, y_velocity = obs[:2]/self.dt 55 | z = obs[2] 56 | forward_reward = 1.25 * x_velocity 57 | alive_reward = 5.0 58 | ctrl_cost = 0.1 * np.sum(np.square(ctrl)) 59 | reward = forward_reward + alive_reward - ctrl_cost 60 | 61 | return reward * 2.0 62 | 63 | def get_done(self, obs): 64 | healthy = 1.0 < obs[2] < 2.0 65 | return not healthy 66 | 67 | def reset_model(self): 68 | noise_low = -self._reset_noise_scale 69 | noise_high = self._reset_noise_scale 70 | 71 | qpos = self.init_qpos + self.np_random.uniform( 72 | low=noise_low, high=noise_high, size=self.model.nq) 73 | qvel = self.init_qvel + self.np_random.uniform( 74 | low=noise_low, high=noise_high, size=self.model.nv) 75 | self.set_state(qpos, qvel) 76 | 77 | observation = self._get_obs() 78 | return observation 79 | 80 | def viewer_setup(self): 81 | for key, value in DEFAULT_CAMERA_CONFIG.items(): 82 | if isinstance(value, np.ndarray): 83 | getattr(self.viewer.cam, key)[:] = value 84 | else: 85 | setattr(self.viewer.cam, key, value) 86 | -------------------------------------------------------------------------------- /milo/milo/gym_env/multiprocessing_env.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | from torch.multiprocessing import Process 4 | 5 | class MujocoEnvProcess(Process): 6 | """ 7 | Process class for model based environments that are compatible with MJRL 8 | """ 9 | def __init__(self, env, child_conn, seed, eval_mode=False, paths_per_process=25): 10 | super().__init__() 11 | self.daemon = True 12 | self.env = copy.deepcopy(env) 13 | self.horizon = env.horizon 14 | self.child_conn = child_conn 15 | self.paths_per_process = paths_per_process 16 | self.seed = seed 17 | self.eval_mode = eval_mode 18 | 19 | def run(self): 20 | super().run() 21 | while True: 22 | paths, ctr = [], 0 23 | policy = self.child_conn.recv() # Recieve policy 24 | for ep in range(self.paths_per_process): 25 | # Set new seed 26 | seed = self.seed + ep 27 | self.env.set_seed(seed) 28 | np.random.seed(seed) 29 | 30 | observations = [] 31 | actions = [] 32 | rewards = [] 33 | next_observations = [] 34 | agent_infos 
= [] 35 | env_infos = [] 36 | 37 | o = self.env.reset() 38 | done = False 39 | t = 0 40 | while t < self.horizon and done != True: 41 | a, agent_info = policy.get_action(o) 42 | if self.eval_mode: 43 | a = agent_info['evaluation'] 44 | next_o, r, done, info = self.env.step(a) # Take step 45 | 46 | observations.append(o) 47 | next_observations.append(next_o) 48 | actions.append(a) 49 | rewards.append(r) 50 | agent_infos.append(agent_info) 51 | env_infos.append(info) 52 | 53 | o = next_o 54 | t += 1 55 | 56 | path = dict( 57 | observations = np.array(observations), 58 | next_observations = np.array(next_observations), 59 | actions = np.array(actions), 60 | rewards = np.array(rewards), 61 | agent_infos = stack_tensor_dict_list(agent_infos), 62 | env_infos = stack_tensor_dict_list(env_infos), 63 | terminated = done 64 | ) 65 | 66 | paths.append(path) 67 | ctr += t 68 | 69 | self.child_conn.send([paths, ctr]) # Return num samples 70 | 71 | def close(self): 72 | super().close() 73 | 74 | def stack_tensor_list(tensor_list): 75 | return np.array(tensor_list) 76 | 77 | def stack_tensor_dict_list(tensor_dict_list): 78 | """ 79 | Stack a list of dictionaries of {tensors or dictionary of tensors}. 80 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}. 81 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors} 82 | """ 83 | keys = list(tensor_dict_list[0].keys()) 84 | ret = dict() 85 | for k in keys: 86 | example = tensor_dict_list[0][k] 87 | if isinstance(example, dict): 88 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list]) 89 | else: 90 | v = stack_tensor_list([x[k] for x in tensor_dict_list]) 91 | ret[k] = v 92 | return ret 93 | -------------------------------------------------------------------------------- /milo/milo/gym_env/walker2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | self.pos_before = 0.0 8 | self.height_idx, self.ang_idx = 1, 2 9 | mujoco_env.MujocoEnv.__init__(self, 'walker2d.xml', 10) 10 | # mujoco_env.MujocoEnv.__init__(self, "walker2d.xml", 4) 11 | utils.EzPickle.__init__(self) 12 | 13 | def step(self, a): 14 | self.pos_before = self.data.qpos[0].copy() 15 | self.do_simulation(a, self.frame_skip) 16 | obs = self._get_obs() 17 | reward = self.get_reward(obs, a) 18 | done = self.get_done(obs) 19 | return obs, reward, done, {} 20 | 21 | def _get_obs(self): 22 | # I am using delta instead of velocity, 23 | # so that all obs are of similar magnitude 24 | delta = self.data.qpos[0] - self.pos_before 25 | return np.concatenate([ 26 | [delta], 27 | self.sim.data.qpos.ravel()[1:], 28 | self.sim.data.qvel.ravel() * self.dt, 29 | ]) 30 | 31 | def get_reward(self, obs, act): 32 | obs = np.clip(obs, -10.0, 10.0) 33 | if len(obs.shape) == 1: 34 | # vector obs, called when stepping the env 35 | vel_x = obs[0] / self.dt # recover velocity from delta 36 | power = np.square(act).sum() 37 | height, ang = obs[self.height_idx:(self.ang_idx+1)] 38 | else: 39 | vel_x = obs[:, :, 0] / self.dt # recover velocity from delta 40 | power = np.square(act).sum(axis=-1) 41 | height = obs[:, :, self.height_idx] 42 | ang = obs[:, :, self.ang_idx] 43 | alive_bonus = 1.0 * (height > 0.8) * (height < 2.0) * (np.abs(ang) < 1.0) 44 | reward = vel_x + alive_bonus - 1e-3 * power 45 | reward = reward * 2.5 # to account for scaling 
difference (skip 4 --> 10) 46 | return reward 47 | 48 | def compute_path_rewards(self, paths): 49 | # path has two keys: observations and actions 50 | # path["observations"] : (num_traj, horizon, obs_dim) 51 | # path["rewards"] should have shape (num_traj, horizon) 52 | obs = paths["observations"] 53 | act = paths["actions"] 54 | rewards = self.get_reward(obs, act) 55 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 56 | return paths 57 | 58 | def get_done(self, obs): 59 | height, ang = obs[self.height_idx:(self.ang_idx+1)] 60 | done = not (np.isfinite(obs).all() and (np.abs(obs) < 100).all() and 61 | (height > 0.8) and (height < 2.0) and (np.abs(ang) < 1.0)) 62 | return done 63 | 64 | def truncate_paths(self, paths): 65 | for path in paths: 66 | obs = path["observations"] 67 | height = obs[:, self.height_idx] 68 | angle = obs[:, self.ang_idx] 69 | T = obs.shape[0] 70 | t = 0 71 | done = False 72 | while t < T and done is False: 73 | done = not ((np.abs(obs[t]) < 100).all() and (height[t] > 0.8) and \ 74 | (height[t] < 2.0) and (np.abs(angle[t]) < 1.0)) 75 | T = t if done else T 76 | t = t + 1 77 | path["observations"] = path["observations"][:T] 78 | path["actions"] = path["actions"][:T] 79 | path["rewards"] = path["rewards"][:T] 80 | path["terminated"] = done 81 | return paths 82 | 83 | def get_env_state(self): 84 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy()) 85 | 86 | def set_env_state(self, state): 87 | qpos = state['qpos'] 88 | qvel = state['qvel'] 89 | self.sim.reset() 90 | self.data.qpos[:] = qpos 91 | self.data.qvel[:] = qvel 92 | self.sim.forward() 93 | 94 | def reset_model(self): 95 | self.set_state( 96 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq), 97 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 98 | ) 99 | return self._get_obs() 100 | 101 | def viewer_setup(self): 102 | self.viewer.cam.trackbodyid = 2 103 | self.viewer.cam.distance = self.model.stat.extent * 0.5 104 | self.viewer.cam.lookat[2] = 1.15 105 | self.viewer.cam.elevation = -20 106 | -------------------------------------------------------------------------------- /milo/milo/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.sampler.sampler import mb_sampler 2 | -------------------------------------------------------------------------------- /milo/milo/sampler/sampler.py: -------------------------------------------------------------------------------- 1 | import time 2 | from copy import deepcopy 3 | from milo.gym_env import MujocoEnvProcess 4 | from torch.multiprocessing import Pipe 5 | 6 | def mb_sampler(env, 7 | policy, 8 | num_samples, 9 | base_seed, 10 | eval_mode=False, 11 | num_workers=4, 12 | paths_per_process=13, 13 | verbose=False): 14 | """ 15 | Multiprocess sampler for model-based rollouts. Note, this is only meant for CPU usage. 
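    :param env: model-based environment to roll out in
    :param policy: policy used for action selection (deep-copied and sent to each worker)
    :param num_samples: minimum total number of environment steps to collect
    :param base_seed: base random seed; worker i is seeded with 12345 + base_seed * i
    :param eval_mode: if True, workers use the policy's evaluation action (agent_info['evaluation']) instead of sampling
    :param num_workers: number of parallel MujocoEnvProcess workers
    :param paths_per_process: number of trajectories each worker collects per policy it receives
    :param verbose: if True, print running sample/trajectory counts and elapsed time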
16 | """ 17 | 18 | # Create Pipes and spawn jobs 19 | jobs, parent_conns, child_conns = [], [], [] 20 | for idx in range(num_workers): 21 | parent_conn, child_conn = Pipe() 22 | seed = 12345+base_seed*idx 23 | job = MujocoEnvProcess(env, child_conn, seed, eval_mode=eval_mode, paths_per_process=paths_per_process) 24 | job.start() 25 | jobs.append(job) 26 | parent_conns.append(parent_conn) 27 | child_conns.append(child_conn) 28 | 29 | # Run Jobs 30 | start_time = time.time() 31 | all_paths, curr_samples = [], 0 32 | while curr_samples < num_samples: 33 | for parent_conn in parent_conns: 34 | parent_conn.send(deepcopy(policy)) 35 | for parent_conn in parent_conns: 36 | paths, ctr = parent_conn.recv() 37 | all_paths.extend(paths) 38 | curr_samples += ctr 39 | if verbose: 40 | print(f"Collected {curr_samples} samples and {len(all_paths)} trajectories <<<<<< took {time.time()-start_time} seconds") 41 | 42 | for job in jobs: 43 | job.terminate() 44 | 45 | return all_paths 46 | -------------------------------------------------------------------------------- /milo/milo/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.utils.logger import init_logger 2 | from milo.utils.arguments import get_args 3 | from milo.utils.evaluate import evaluate 4 | from milo.utils.util import * 5 | -------------------------------------------------------------------------------- /milo/milo/utils/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_args(): 5 | # ====== Argument Parser ====== 6 | parser = argparse.ArgumentParser( 7 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 8 | ) 9 | 10 | # Logging/Environment Arguments 11 | parser.add_argument('--env', type=str, 12 | help='environment ID', default='Hopper-v6') 13 | parser.add_argument('--seed', type=int, help='seed', default=100) 14 | parser.add_argument('--num_cpu', type=int, 15 | help='number of processes used for inference', default=4) 16 | parser.add_argument('--num_trajs', type=int, 17 | help='number of expert trajs', default=10) 18 | parser.add_argument('--num_samples', type=int, 19 | help='number of expert samples', default=500) 20 | parser.add_argument('--subsample_freq', type=int, 21 | help='subsample frequency', default=8) 22 | parser.add_argument('--norm_thresh_coeff', type=float, 23 | help='Norm threshold', default=2) 24 | parser.add_argument('--include_expert', action='store_true', 25 | help='include expert data into offline db', default=False) 26 | parser.add_argument('--subsample_expert', action='store_true', 27 | help='subsample expert samples', default=False) 28 | parser.add_argument('--randomize_expert', action='store_true', 29 | help='randomize expert samples', default=False) 30 | parser.add_argument('--save_iter', type=int, 31 | help='Interval to Save checkpoints', default=10) 32 | 33 | # Path Arguments 34 | parser.add_argument('--root_path', type=str, 35 | help='Root dir to save outputs', default='./experiments') 36 | parser.add_argument('--data_path', type=str, 37 | help='Root data dir to get db', default='./data') 38 | parser.add_argument('--expert_db', type=str, 39 | help='expert db name', default='Hopper-v6_100_3012.62.pt') 40 | parser.add_argument('--offline_db', type=str, 41 | help='offline db name', default='Hopper-v6_100_3025.47.pt') 42 | parser.add_argument('--model_save_path', type=str, help='Path to save models', 43 | default='./experiments/dynamics_model_weights') 44 | 
parser.add_argument('--id', type=int, help='Experiment id', default=0) 45 | 46 | # Dynamics Model Ensemble Arguments 47 | parser.add_argument('--n_models', type=int, 48 | help='Number of dynamics models in ensemble', default=4) 49 | parser.add_argument('--n_epochs', type=int, 50 | help='Number of epochs to train models', default=5) 51 | parser.add_argument('--grad_clip', type=float, 52 | help='Max Gradient Norm', default=1.0) 53 | parser.add_argument('--dynamics_optim', type=str, 54 | help='Optimizer to use [sgd, adam]', default='sgd') 55 | 56 | # Cost Arguments 57 | parser.add_argument('--feature_dim', type=int, 58 | help='Feature dimension', default=512) 59 | parser.add_argument('--update_type', type=str, 60 | help='exact, geometric, decay, decay_sqrt, ficticious', default='exact') 61 | parser.add_argument('--bw_quantile', type=float, 62 | help='Quantile when fitting bandwidth', default=0.2) 63 | parser.add_argument('--lambda_b', type=float, 64 | help='Bonus/Penalty weighting param', default=0.1) 65 | parser.add_argument('--cost_lr', type=float, 66 | help='0.0 is exact update, otherwise learning rate', default=0.0) 67 | 68 | # Policy Gradient Arguments 69 | parser.add_argument('--planner', type=str, 70 | help='pg alg to use (trpo, ppo)', default='trpo') 71 | parser.add_argument('--actor_model_hidden', type=int, 72 | nargs='+', help='hidden dims for actor', default=[32, 32]) 73 | parser.add_argument('--critic_model_hidden', type=int, nargs='+', 74 | help='hidden dims for critic', default=[128, 128]) 75 | parser.add_argument('--gamma', type=float, 76 | help='discount factor for rewards (default: 0.99)', default=0.995) 77 | parser.add_argument('--gae_lambda', type=float, 78 | help='gae lambda val', default=0.97) 79 | parser.add_argument('--samples_per_step', type=int, 80 | help='Number of mb samples per pg step', default=512) 81 | parser.add_argument('--policy_init_log', type=float, 82 | help='policy init log', default=-0.25) 83 | parser.add_argument('--policy_min_log', type=float, 84 | help='policy min log', default=-2.0) 85 | parser.add_argument('--vf_iters', type=int, 86 | help='Number of value optim steps', default=2) 87 | parser.add_argument('--vf_batch_size', type=int, 88 | help='Critic batch size', default=64) 89 | parser.add_argument('--vf_lr', type=float, help='Value lr', default=1e-3) 90 | parser.add_argument('--vf_reg_coef', type=float, 91 | help='baseline regularization coeff', default=1e-3) 92 | 93 | # BC regularization Arguments 94 | parser.add_argument('--do_bc_reg', action='store_true', help='Add bc regularization to policy gradient', default=False) 95 | parser.add_argument('--bc_reg_coeff', type=float, help='Regularization coefficient for policy gradient', default=0.1) 96 | 97 | # TRPO Arguments 98 | parser.add_argument('--cg_iter', type=int, 99 | help='Number of CG iterations', default=10) 100 | parser.add_argument('--cg_damping', type=float, 101 | help='CG damping coefficient', default=1e-4) 102 | parser.add_argument('--kl_dist', type=float, 103 | help='Trust region', default=0.05) 104 | parser.add_argument('--hvp_sample_frac', type=float, 105 | help='Fraction of samples for FIM', default=1.0) 106 | 107 | # PPO Arguments 108 | parser.add_argument('--clip_coef', type=float, 109 | help='Clip Coefficient for PPO Trust region', default=0.2) 110 | parser.add_argument('--ppo_lr', type=float, 111 | help='PPO learning rate', default=3e-4) 112 | parser.add_argument('--ppo_epochs', type=int, 113 | help='Epochs per PPO step', default=10) 114 | parser.add_argument('--ppo_batch_size', 
type=int, 115 | help='Mini-batch size for PPO', default=64) 116 | 117 | # BC Arguments 118 | parser.add_argument('--bc_epochs', type=int, 119 | help='Number of BC epochs', default=3) 120 | parser.add_argument('--n_bc_iters', type=int, default=10, 121 | help='number of times to run BC iterations') 122 | 123 | # General Algorithm Arguments 124 | parser.add_argument('--n_iter', type=int, help='Number of offline IL iterations to run', default=300) 125 | parser.add_argument('--pg_iter', type=int, help='Number of pg steps', default=5) 126 | parser.add_argument('--use_ground_truth', action='store_true', help='use ground truth rewards', default=False) 127 | parser.add_argument('--do_model_free', action='store_true', help='do model free policy gradient', default=False) 128 | 129 | args = parser.parse_args() 130 | return args 131 | -------------------------------------------------------------------------------- /milo/milo/utils/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from mjrl.samplers.core import sample_paths 5 | 6 | # ======================== 7 | # === Evaluation Utils === 8 | # ======================== 9 | 10 | def evaluate(n_iter, logger, writer, args, env, policy, reward_func, num_traj=10, adroit=False): 11 | greedy_samples = sample_paths(num_traj=num_traj, env=env, policy=policy, \ 12 | num_cpu=args.num_cpu, base_seed=args.seed, eval_mode=True, suppress_print=True) 13 | samples = sample_paths(num_traj=num_traj, env=env, policy=policy, \ 14 | num_cpu=args.num_cpu, base_seed=args.seed, eval_mode=False, suppress_print=True) 15 | 16 | if adroit: 17 | greedy_success = env.evaluate_success(greedy_samples) 18 | sample_success = env.evaluate_success(samples) 19 | 20 | # Compute scores 21 | greedy_scores = np.array([np.sum(traj['rewards']) for traj in greedy_samples]) 22 | sample_scores = np.array([np.sum(traj['rewards']) for traj in samples]) 23 | greedy_mean_lengths = np.mean([len(traj['rewards']) for traj in greedy_samples]) 24 | sample_mean_lengths = np.mean([len(traj['rewards']) for traj in samples]) 25 | greedy_mean, greedy_max, greedy_min = greedy_scores.mean(), greedy_scores.max(), greedy_scores.min() 26 | sample_mean, sample_max, sample_min = sample_scores.mean(), sample_scores.max(), sample_scores.min() 27 | 28 | # Compute MMD (S, A) 29 | greedy_x = np.concatenate([np.concatenate([traj['observations'], traj['actions']], axis=1) for traj in greedy_samples], axis=0) 30 | sample_x = np.concatenate([np.concatenate([traj['observations'], traj['actions']], axis=1) for traj in samples], axis=0) 31 | greedy_x = torch.from_numpy(greedy_x).float() 32 | sample_x = torch.from_numpy(sample_x).float() 33 | 34 | greedy_diff = reward_func.get_rep(greedy_x).mean(0) - reward_func.phi_e 35 | sample_diff = reward_func.get_rep(sample_x).mean(0) - reward_func.phi_e 36 | 37 | greedy_mmd = torch.dot(greedy_diff, greedy_diff) 38 | sample_mmd = torch.dot(sample_diff, sample_diff) 39 | 40 | # Log 41 | logger.info(f'Greedy Evaluation Score mean (min, max): {greedy_mean:.2f} ({greedy_min:.2f}, {greedy_max:.2f})') 42 | logger.info(f'Greedy Evaluation Trajectory Lengths: {greedy_mean_lengths:.2f}') 43 | logger.info(f'Greedy MMD: {greedy_mmd}') 44 | if adroit: 45 | logger.info(f'Greedy Success %: {greedy_success}%') 46 | logger.info(f'Sampled Evaluation Score mean (min, max): {sample_mean:.2f} ({sample_min:.2f}, {sample_max:.2f})') 47 | logger.info(f'Sampled Evaluation Trajectory Lengths: {sample_mean_lengths:.2f}') 48 | 
logger.info(f'Sampled MMD: {sample_mmd}') 49 | if adroit: 50 | logger.info(f'Sampled Success %: {sample_success}%') 51 | 52 | # Tensorboard Logging 53 | writer.add_scalars('data/inf_greedy_reward', {'min_score': greedy_min, 54 | 'mean_score': greedy_mean, 55 | 'max_score': greedy_max}, n_iter+1) 56 | writer.add_scalar('data/inf_greedy_len', greedy_mean_lengths, n_iter+1) 57 | writer.add_scalar('data/greedy_mmd', greedy_mmd, n_iter+1) 58 | writer.add_scalars('data/inf_sampled_reward', {'min_score': sample_min, 59 | 'mean_score': sample_mean, 60 | 'max_score': sample_max}, n_iter+1) 61 | writer.add_scalar('data/inf_sampled_len', sample_mean_lengths, n_iter+1) 62 | writer.add_scalar('data/sampled_mmd', sample_mmd, n_iter+1) 63 | if adroit: 64 | writer.add_scalar('data/greedy_success_percen', greedy_success, n_iter+1) 65 | writer.add_scalar('data/sampled_success_percen', sample_success, n_iter+1) 66 | 67 | scores = {'greedy': greedy_mean, 'sample': sample_mean} 68 | mmds = {'greedy': greedy_mmd, 'sample': sample_mmd} 69 | 70 | return scores, mmds 71 | 72 | -------------------------------------------------------------------------------- /milo/milo/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logger singleton wrapper 3 | Default logger folder is `os.path.join(__file__, '..', '..', 'logs')` 4 | """ 5 | import logging 6 | import logging.handlers 7 | import os 8 | import sys 9 | 10 | 11 | __all__ = ['init_logger'] 12 | 13 | 14 | def init_logger(log_dir): 15 | os.makedirs(log_dir, exist_ok=True) 16 | log_level = logging.DEBUG 17 | log_format = '%(message)s' 18 | 19 | logger = logging.getLogger(log_dir) 20 | logger.setLevel(log_level) 21 | path = os.path.join(log_dir, 'main.log') 22 | 23 | # file handler (log file) 24 | log_handler = logging.handlers.RotatingFileHandler(filename=path) 25 | log_handler.setLevel(log_level) 26 | log_handler.setFormatter(logging.Formatter(log_format)) 27 | logger.addHandler(log_handler) 28 | 29 | # stream handler (default sys.stderr) 30 | log_handler = logging.StreamHandler() 31 | log_handler.setLevel(log_level) 32 | log_handler.setFormatter(logging.Formatter(log_format)) 33 | logger.addHandler(log_handler) 34 | 35 | return logger 36 | -------------------------------------------------------------------------------- /milo/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='milo', 5 | version='0.1.0', 6 | packages=find_packages(), 7 | description='Components for MILO: Model based Imitation Learning from Offline data', 8 | ) 9 | -------------------------------------------------------------------------------- /mjrl/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # idea 104 | *.idea/ 105 | 106 | # Mac OSX files 107 | *.DS_Store -------------------------------------------------------------------------------- /mjrl/README.md: -------------------------------------------------------------------------------- 1 | # RL for MuJoCo 2 | 3 | This package contains implementations of various RL algorithms for continuous control tasks simulated with [MuJoCo.](http://www.mujoco.org/) 4 | 5 | # Installation 6 | The main package dependencies are `MuJoCo`, `python=3.7`, `gym>=0.13`, `mujoco-py>=2.0`, and `pytorch>=1.0`. See `setup/README.md` ([link](https://github.com/aravindr93/mjrl/tree/master/setup#installation)) for detailed install instructions. 7 | 8 | # Bibliography 9 | If you find the package useful, please cite the following papers. 10 | ``` 11 | @INPROCEEDINGS{Rajeswaran-NIPS-17, 12 | AUTHOR = {Aravind Rajeswaran and Kendall Lowrey and Emanuel Todorov and Sham Kakade}, 13 | TITLE = "{Towards Generalization and Simplicity in Continuous Control}", 14 | BOOKTITLE = {NIPS}, 15 | YEAR = {2017}, 16 | } 17 | 18 | @INPROCEEDINGS{Rajeswaran-RSS-18, 19 | AUTHOR = {Aravind Rajeswaran AND Vikash Kumar AND Abhishek Gupta AND 20 | Giulia Vezzani AND John Schulman AND Emanuel Todorov AND Sergey Levine}, 21 | TITLE = "{Learning Complex Dexterous Manipulation with Deep Reinforcement Learning and Demonstrations}", 22 | BOOKTITLE = {Proceedings of Robotics: Science and Systems (RSS)}, 23 | YEAR = {2018}, 24 | } 25 | ``` 26 | 27 | # Credits 28 | This package is maintained by [Aravind Rajeswaran](http://homes.cs.washington.edu/~aravraj/) and other members of the [Movement Control Lab,](http://homes.cs.washington.edu/~todorov/) University of Washington Seattle. 29 | -------------------------------------------------------------------------------- /mjrl/examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | Here we provide a job script to illustrate policy optimization with incrimental learning methods like NPG and PPO. To run the experiments, use the commands below. The experiments are run through the job script provided which tasks two arguments: 4 | - `output`: path to directory where all the results will be saved 5 | - `config`: a config `.txt` file with all the experiment parameters (examples are provided) 6 | The script has to be run from this directory, i.e. `mjrl/examples` 7 | 8 | 1. To train an NPG agent on a task shipped with `mjrl` (e.g. 
swimmer) 9 | ``` 10 | $ python policy_opt_job_script.py --output swimmer_npg_exp --config example_configs/swimmer_npg.txt 11 | ``` 12 | 13 | 2. To train an NPG agent on an OpenAI gym benchmark task (e.g. hopper) 14 | ``` 15 | $ python policy_opt_job_script.py --output hopper_npg_exp --config example_configs/hopper_npg.txt 16 | ``` 17 | Note that since the Hopper env has termination conditions, we pick the sampling mode in the config to be `samples` rather than trajectories, so that per update we have 10K samples. 18 | 19 | 3. To train a PPO agent on the swimmer task 20 | ``` 21 | $ python policy_opt_job_script.py --output swimmer_ppo_exp --config example_configs/swimmer_ppo.txt 22 | ``` -------------------------------------------------------------------------------- /mjrl/examples/behavior_clone.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 4 | from mjrl.baselines.mlp_baseline import MLPBaseline 5 | from mjrl.algos.npg_cg import NPG 6 | from mjrl.algos.behavior_cloning import BC 7 | from mjrl.utils.train_agent import train_agent 8 | from mjrl.samplers.core import sample_paths 9 | import mjrl.envs 10 | import time as timer 11 | import pickle 12 | SEED = 500 13 | 14 | # ------------------------------ 15 | # Train expert policy first 16 | e = GymEnv('mjrl_swimmer-v0') 17 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 18 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=5, learn_rate=1e-3) 19 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 20 | 21 | ts = timer.time() 22 | print("========================================") 23 | print("Training expert policy") 24 | print("========================================") 25 | train_agent(job_name='swimmer_exp1', 26 | agent=agent, 27 | seed=SEED, 28 | niter=50, 29 | gamma=0.995, 30 | gae_lambda=0.97, 31 | num_cpu=1, 32 | sample_mode='trajectories', 33 | num_traj=10, 34 | save_freq=5, 35 | evaluation_rollouts=None) 36 | print("========================================") 37 | print("Expert policy training complete !!!") 38 | print("========================================") 39 | print("time taken = %f" % (timer.time()-ts)) 40 | print("========================================") 41 | 42 | # ------------------------------ 43 | # Get demonstrations 44 | print("========================================") 45 | print("Collecting expert demonstrations") 46 | print("========================================") 47 | expert_pol = pickle.load(open('swimmer_exp1/iterations/best_policy.pickle', 'rb')) 48 | demo_paths = sample_paths(num_traj=5, policy=expert_pol, env=e.env_id) 49 | 50 | # ------------------------------ 51 | # Train BC 52 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 53 | bc_agent = BC(demo_paths, policy=policy, epochs=20, batch_size=64, lr=1e-3) # will use Adam by default 54 | ts = timer.time() 55 | print("========================================") 56 | print("Running BC with expert demonstrations") 57 | print("========================================") 58 | bc_agent.train() 59 | print("========================================") 60 | print("BC training complete !!!") 61 | print("time taken = %f" % (timer.time()-ts)) 62 | print("========================================") 63 | 64 | # ------------------------------ 65 | # Evaluate Policies 66 | bc_pol_score = e.evaluate_policy(policy, 
num_episodes=5, mean_action=True) 67 | expert_score = e.evaluate_policy(expert_pol, num_episodes=5, mean_action=True) 68 | print("Expert policy performance (eval mode) = %f" % expert_score[0][0]) 69 | print("BC policy performance (eval mode) = %f" % bc_pol_score[0][0]) 70 | -------------------------------------------------------------------------------- /mjrl/examples/example_configs/hopper_npg.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'Hopper-v3', 6 | 'algorithm' : 'NPG', 7 | 'seed' : 123, 8 | 'sample_mode' : 'samples', 9 | 'rl_num_samples' : 10000, 10 | 'rl_num_iter' : 100, 11 | 'num_cpu' : 1, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with NPG on the OpenAI gym Hopper-v3 task.', 15 | 16 | # RL parameters (all params related to PG, value function etc.) 17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.05, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(), 31 | 32 | } 33 | 34 | -------------------------------------------------------------------------------- /mjrl/examples/example_configs/swimmer_npg.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'mjrl_swimmer-v0', 6 | 'algorithm' : 'NPG', 7 | 'seed' : 123, 8 | 'sample_mode' : 'trajectories', 9 | 'rl_num_traj' : 10, 10 | 'rl_num_iter' : 50, 11 | 'num_cpu' : 2, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with NPG on the mjrl swimmer task.', 15 | 16 | # RL parameters (all params related to PG, value function, DAPG etc.) 17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.1, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(), 31 | 32 | } -------------------------------------------------------------------------------- /mjrl/examples/example_configs/swimmer_ppo.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'mjrl_swimmer-v0', 6 | 'algorithm' : 'PPO', 7 | 'seed' : 123, 8 | 'sample_mode' : 'trajectories', 9 | 'rl_num_traj' : 10, 10 | 'rl_num_iter' : 50, 11 | 'num_cpu' : 2, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with PPO on the mjrl swimmer task.', 15 | 16 | # RL parameters (all params related to PG, value function, DAPG etc.) 
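# Quick reference for how examples/policy_opt_job_script.py consumes the keys below:
#   'policy_size'        : hidden layer sizes of the Gaussian MLP policy
#   'init_log_std'       : initial log standard deviation of the policy
#   'vf_*'               : MLP value-function baseline settings (hidden sizes, minibatch size, epochs, learning rate)
#   'rl_gamma', 'rl_gae' : discount factor and GAE lambda passed to train_agent
#   'rl_step_size'       : step size for NPG/VPG/NVPG; for PPO the optimizer settings come from 'alg_hyper_params' below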
17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.1, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(clip_coef=0.2, epochs=10, mb_size=64, learn_rate=5e-4), 31 | 32 | } -------------------------------------------------------------------------------- /mjrl/examples/linear_nn_comparison.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.policies.gaussian_linear import LinearPolicy 4 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 5 | from mjrl.baselines.mlp_baseline import MLPBaseline 6 | from mjrl.algos.npg_cg import NPG 7 | from mjrl.utils.train_agent import train_agent 8 | import mjrl.envs 9 | import time as timer 10 | SEED = 500 11 | 12 | # NN policy 13 | # ================================== 14 | e = GymEnv('mjrl_swimmer-v0') 15 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 16 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) 17 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 18 | 19 | ts = timer.time() 20 | train_agent(job_name='swimmer_nn_exp1', 21 | agent=agent, 22 | seed=SEED, 23 | niter=50, 24 | gamma=0.995, 25 | gae_lambda=0.97, 26 | num_cpu=1, 27 | sample_mode='trajectories', 28 | num_traj=10, 29 | save_freq=5, 30 | evaluation_rollouts=5) 31 | print("time taken for NN policy training = %f" % (timer.time()-ts)) 32 | 33 | 34 | # Linear policy 35 | # ================================== 36 | e = GymEnv('mjrl_swimmer-v0') 37 | policy = LinearPolicy(e.spec, seed=SEED) 38 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) 39 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 40 | 41 | ts = timer.time() 42 | train_agent(job_name='swimmer_linear_exp1', 43 | agent=agent, 44 | seed=SEED, 45 | niter=50, 46 | gamma=0.995, 47 | gae_lambda=0.97, 48 | num_cpu=1, 49 | sample_mode='trajectories', 50 | num_traj=10, 51 | save_freq=5, 52 | evaluation_rollouts=5) 53 | print("time taken for linear policy training = %f" % (timer.time()-ts)) 54 | -------------------------------------------------------------------------------- /mjrl/examples/policy_opt_job_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a job script for running policy gradient algorithms on gym tasks. 
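Example invocation (run from the mjrl/examples directory, as shown in examples/README.md):
    $ python policy_opt_job_script.py --output swimmer_npg_exp --config example_configs/swimmer_npg.txt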
3 | Separate job scripts are provided to run a few other algorithms: 4 | - For DAPG see here: https://github.com/aravindr93/hand_dapg/tree/master/dapg/examples 5 | - For model-based NPG see here: https://github.com/aravindr93/mjrl/tree/master/mjrl/algos/model_accel 6 | """ 7 | 8 | from mjrl.utils.gym_env import GymEnv 9 | from mjrl.policies.gaussian_mlp import MLP 10 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 11 | from mjrl.baselines.mlp_baseline import MLPBaseline 12 | from mjrl.algos.npg_cg import NPG 13 | from mjrl.algos.batch_reinforce import BatchREINFORCE 14 | from mjrl.algos.ppo_clip import PPO 15 | from mjrl.utils.train_agent import train_agent 16 | import os 17 | import json 18 | import gym 19 | import mjrl.envs 20 | import time as timer 21 | import pickle 22 | import argparse 23 | 24 | # =============================================================================== 25 | # Get command line arguments 26 | # =============================================================================== 27 | 28 | parser = argparse.ArgumentParser(description='Natural policy gradient from mjrl on mujoco environments') 29 | parser.add_argument('--output', type=str, required=True, help='location to store results') 30 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params') 31 | 32 | args = parser.parse_args() 33 | JOB_DIR = args.output 34 | if not os.path.exists(JOB_DIR): 35 | os.mkdir(JOB_DIR) 36 | with open(args.config, 'r') as f: 37 | job_data = eval(f.read()) 38 | assert 'algorithm' in job_data.keys() 39 | assert any([job_data['algorithm'] == a for a in ['NPG', 'NVPG', 'VPG', 'PPO']]) 40 | assert 'sample_mode' in job_data.keys() 41 | job_data['alg_hyper_params'] = dict() if 'alg_hyper_params' not in job_data.keys() else job_data['alg_hyper_params'] 42 | 43 | EXP_FILE = JOB_DIR + '/job_config.json' 44 | with open(EXP_FILE, 'w') as f: 45 | json.dump(job_data, f, indent=4) 46 | 47 | if job_data['sample_mode'] == 'trajectories': 48 | assert 'rl_num_traj' in job_data.keys() 49 | job_data['rl_num_samples'] = 0 # will be ignored 50 | elif job_data['sample_mode'] == 'samples': 51 | assert 'rl_num_samples' in job_data.keys() 52 | job_data['rl_num_traj'] = 0 # will be ignored 53 | else: 54 | print("Unknown sampling mode. 
Choose either trajectories or samples") 55 | exit() 56 | 57 | # =============================================================================== 58 | # Train Loop 59 | # =============================================================================== 60 | 61 | e = GymEnv(job_data['env']) 62 | policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=job_data['seed'], init_log_std=job_data['init_log_std']) 63 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=job_data['vf_batch_size'], hidden_sizes=job_data['vf_hidden_size'], 64 | epochs=job_data['vf_epochs'], learn_rate=job_data['vf_learn_rate']) 65 | 66 | # Construct the algorithm 67 | if job_data['algorithm'] == 'NPG': 68 | # Other hyperparameters (like number of CG steps) can be specified in config for pass through 69 | # or default hyperparameters will be used 70 | agent = NPG(e, policy, baseline, normalized_step_size=job_data['rl_step_size'], 71 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 72 | 73 | elif job_data['algorithm'] == 'VPG': 74 | agent = BatchREINFORCE(e, policy, baseline, learn_rate=job_data['rl_step_size'], 75 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 76 | 77 | elif job_data['algorithm'] == 'NVPG': 78 | agent = BatchREINFORCE(e, policy, baseline, desired_kl=job_data['rl_step_size'], 79 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 80 | 81 | elif job_data['algorithm'] == 'PPO': 82 | # There are many hyperparameters for PPO. They can be specified in config for pass through 83 | # or defaults in the PPO algorithm will be used 84 | agent = PPO(e, policy, baseline, save_logs=True, **job_data['alg_hyper_params']) 85 | 86 | print("========================================") 87 | print("Starting policy learning") 88 | print("========================================") 89 | 90 | ts = timer.time() 91 | train_agent(job_name=JOB_DIR, 92 | agent=agent, 93 | seed=job_data['seed'], 94 | niter=job_data['rl_num_iter'], 95 | gamma=job_data['rl_gamma'], 96 | gae_lambda=job_data['rl_gae'], 97 | num_cpu=job_data['num_cpu'], 98 | sample_mode=job_data['sample_mode'], 99 | num_traj=job_data['rl_num_traj'], 100 | num_samples=job_data['rl_num_samples'], 101 | save_freq=job_data['save_freq'], 102 | evaluation_rollouts=job_data['eval_rollouts']) 103 | print("time taken = %f" % (timer.time()-ts)) 104 | -------------------------------------------------------------------------------- /mjrl/mjrl/__init__.py: -------------------------------------------------------------------------------- 1 | import mjrl.envs -------------------------------------------------------------------------------- /mjrl/mjrl/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/algos/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/algos/behavior_cloning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Minimize bc loss (MLE, MSE, RWR etc.) 
with pytorch optimizers 3 | """ 4 | 5 | import logging 6 | #logging.disable(logging.CRITICAL) 7 | import numpy as np 8 | import time as timer 9 | import torch 10 | from torch.autograd import Variable 11 | from mjrl.utils.logger import DataLog 12 | from tqdm import tqdm 13 | 14 | 15 | class BC: 16 | def __init__(self, expert_paths, 17 | policy, 18 | epochs = 5, 19 | batch_size = 64, 20 | lr = 1e-3, 21 | optimizer = None, 22 | loss_type = 'MSE', # can be 'MLE' or 'MSE' 23 | save_logs = True, 24 | set_transforms = False, 25 | **kwargs, 26 | ): 27 | 28 | self.policy = policy 29 | self.expert_paths = expert_paths 30 | self.epochs = epochs 31 | self.mb_size = batch_size 32 | self.logger = DataLog() 33 | self.loss_type = loss_type 34 | self.save_logs = save_logs 35 | 36 | if set_transforms: 37 | in_shift, in_scale, out_shift, out_scale = self.compute_transformations() 38 | self.set_transformations(in_shift, in_scale, out_shift, out_scale) 39 | self.set_variance_with_data(out_scale) 40 | 41 | # construct optimizer 42 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=lr) if optimizer is None else optimizer 43 | 44 | # Loss criterion if required 45 | if loss_type == 'MSE': 46 | self.loss_criterion = torch.nn.MSELoss() 47 | 48 | # make logger 49 | if self.save_logs: 50 | self.logger = DataLog() 51 | 52 | def compute_transformations(self): 53 | # get transformations 54 | if self.expert_paths == [] or self.expert_paths is None: 55 | in_shift, in_scale, out_shift, out_scale = None, None, None, None 56 | else: 57 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 58 | actions = np.concatenate([path["actions"] for path in self.expert_paths]) 59 | in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0) 60 | out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0) 61 | return in_shift, in_scale, out_shift, out_scale 62 | 63 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None): 64 | # set scalings in the target policy 65 | self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale) 66 | self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale) 67 | 68 | def set_variance_with_data(self, out_scale): 69 | # set the variance of gaussian policy based on out_scale 70 | params = self.policy.get_param_values() 71 | params[-self.policy.m:] = np.log(out_scale + 1e-12) 72 | self.policy.set_param_values(params) 73 | 74 | def loss(self, data, idx=None): 75 | if self.loss_type == 'MLE': 76 | return self.mle_loss(data, idx) 77 | elif self.loss_type == 'MSE': 78 | return self.mse_loss(data, idx) 79 | else: 80 | print("Please use valid loss type") 81 | return None 82 | 83 | def mle_loss(self, data, idx): 84 | # use indices if provided (e.g. 
for mini-batching) 85 | # otherwise, use all the data 86 | idx = range(data['observations'].shape[0]) if idx is None else idx 87 | if type(data['observations']) == torch.Tensor: 88 | idx = torch.LongTensor(idx) 89 | obs = data['observations'][idx] 90 | act = data['expert_actions'][idx] 91 | LL, mu, log_std = self.policy.new_dist_info(obs, act) 92 | # minimize negative log likelihood 93 | return -torch.mean(LL) 94 | 95 | def mse_loss(self, data, idx=None): 96 | idx = range(data['observations'].shape[0]) if idx is None else idx 97 | if type(data['observations']) is torch.Tensor: 98 | idx = torch.LongTensor(idx) 99 | obs = data['observations'][idx] 100 | act_expert = data['expert_actions'][idx] 101 | if type(data['observations']) is not torch.Tensor: 102 | obs = Variable(torch.from_numpy(obs).float(), requires_grad=False) 103 | act_expert = Variable(torch.from_numpy(act_expert).float(), requires_grad=False) 104 | act_pi = self.policy.model(obs) 105 | return self.loss_criterion(act_pi, act_expert.detach()) 106 | 107 | def fit(self, data, suppress_fit_tqdm=False, **kwargs): 108 | # data is a dict 109 | # keys should have "observations" and "expert_actions" 110 | validate_keys = all([k in data.keys() for k in ["observations", "expert_actions"]]) 111 | assert validate_keys is True 112 | ts = timer.time() 113 | num_samples = data["observations"].shape[0] 114 | 115 | # log stats before 116 | if self.save_logs: 117 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0] 118 | self.logger.log_kv('loss_before', loss_val) 119 | 120 | # train loop 121 | for ep in config_tqdm(range(self.epochs), suppress_fit_tqdm): 122 | for mb in range(int(num_samples / self.mb_size)): 123 | rand_idx = np.random.choice(num_samples, size=self.mb_size) 124 | self.optimizer.zero_grad() 125 | loss = self.loss(data, idx=rand_idx) 126 | loss.backward() 127 | self.optimizer.step() 128 | params_after_opt = self.policy.get_param_values() 129 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True) 130 | 131 | # log stats after 132 | if self.save_logs: 133 | self.logger.log_kv('epoch', self.epochs) 134 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0] 135 | self.logger.log_kv('loss_after', loss_val) 136 | self.logger.log_kv('time', (timer.time()-ts)) 137 | 138 | def train(self, **kwargs): 139 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 140 | expert_actions = np.concatenate([path["actions"] for path in self.expert_paths]) 141 | data = dict(observations=observations, expert_actions=expert_actions) 142 | self.fit(data, **kwargs) 143 | 144 | 145 | def config_tqdm(range_inp, suppress_tqdm=False): 146 | if suppress_tqdm: 147 | return range_inp 148 | else: 149 | return tqdm(range_inp) 150 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/dapg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | 16 | # utility functions 17 | import mjrl.utils.process_samples as process_samples 18 | from mjrl.utils.logger import DataLog 19 | from mjrl.utils.cg_solve import cg_solve 20 | 21 | # Import Algs 22 | 
from mjrl.algos.npg_cg import NPG 23 | from mjrl.algos.behavior_cloning import BC 24 | 25 | class DAPG(NPG): 26 | def __init__(self, env, policy, baseline, 27 | demo_paths=None, 28 | normalized_step_size=0.01, 29 | FIM_invert_args={'iters': 10, 'damping': 1e-4}, 30 | hvp_sample_frac=1.0, 31 | seed=123, 32 | save_logs=False, 33 | kl_dist=None, 34 | lam_0=1.0, # demo coef 35 | lam_1=0.95, # decay coef 36 | **kwargs, 37 | ): 38 | 39 | self.env = env 40 | self.policy = policy 41 | self.baseline = baseline 42 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size 43 | self.seed = seed 44 | self.save_logs = save_logs 45 | self.FIM_invert_args = FIM_invert_args 46 | self.hvp_subsample = hvp_sample_frac 47 | self.running_score = None 48 | self.demo_paths = demo_paths 49 | self.lam_0 = lam_0 50 | self.lam_1 = lam_1 51 | self.iter_count = 0.0 52 | if save_logs: self.logger = DataLog() 53 | 54 | def train_from_paths(self, paths): 55 | 56 | # Concatenate from all the trajectories 57 | observations = np.concatenate([path["observations"] for path in paths]) 58 | actions = np.concatenate([path["actions"] for path in paths]) 59 | advantages = np.concatenate([path["advantages"] for path in paths]) 60 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 61 | 62 | if self.demo_paths is not None and self.lam_0 > 0.0: 63 | demo_obs = np.concatenate([path["observations"] for path in self.demo_paths]) 64 | demo_act = np.concatenate([path["actions"] for path in self.demo_paths]) 65 | demo_adv = self.lam_0 * (self.lam_1 ** self.iter_count) * np.ones(demo_obs.shape[0]) 66 | self.iter_count += 1 67 | # concatenate all 68 | all_obs = np.concatenate([observations, demo_obs]) 69 | all_act = np.concatenate([actions, demo_act]) 70 | all_adv = 1e-2*np.concatenate([advantages/(np.std(advantages) + 1e-8), demo_adv]) 71 | else: 72 | all_obs = observations 73 | all_act = actions 74 | all_adv = advantages 75 | 76 | # cache return distributions for the paths 77 | path_returns = [sum(p["rewards"]) for p in paths] 78 | mean_return = np.mean(path_returns) 79 | std_return = np.std(path_returns) 80 | min_return = np.amin(path_returns) 81 | max_return = np.amax(path_returns) 82 | base_stats = [mean_return, std_return, min_return, max_return] 83 | self.running_score = mean_return if self.running_score is None else \ 84 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 85 | if self.save_logs: self.log_rollout_statistics(paths) 86 | 87 | # Keep track of times for various computations 88 | t_gLL = 0.0 89 | t_FIM = 0.0 90 | 91 | # Optimization algorithm 92 | # -------------------------- 93 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 94 | 95 | # DAPG 96 | ts = timer.time() 97 | sample_coef = all_adv.shape[0]/advantages.shape[0] 98 | dapg_grad = sample_coef*self.flat_vpg(all_obs, all_act, all_adv) 99 | t_gLL += timer.time() - ts 100 | 101 | # NPG 102 | ts = timer.time() 103 | hvp = self.build_Hvp_eval([observations, actions], 104 | regu_coef=self.FIM_invert_args['damping']) 105 | npg_grad = cg_solve(hvp, dapg_grad, x_0=dapg_grad.copy(), 106 | cg_iters=self.FIM_invert_args['iters']) 107 | t_FIM += timer.time() - ts 108 | 109 | # Step size computation 110 | # -------------------------- 111 | n_step_size = 2.0*self.kl_dist 112 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(dapg_grad.T, npg_grad) + 1e-20))) 113 | 114 | # Policy update 115 | # -------------------------- 116 | curr_params = self.policy.get_param_values() 117 
| new_params = curr_params + alpha * npg_grad 118 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 119 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 120 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 121 | self.policy.set_param_values(new_params, set_new=True, set_old=True) 122 | 123 | # Log information 124 | if self.save_logs: 125 | self.logger.log_kv('alpha', alpha) 126 | self.logger.log_kv('delta', n_step_size) 127 | self.logger.log_kv('time_vpg', t_gLL) 128 | self.logger.log_kv('time_npg', t_FIM) 129 | self.logger.log_kv('kl_dist', kl_dist) 130 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 131 | self.logger.log_kv('running_score', self.running_score) 132 | try: 133 | self.env.env.env.evaluate_success(paths, self.logger) 134 | except: 135 | # nested logic for backwards compatibility. TODO: clean this up. 136 | try: 137 | success_rate = self.env.env.env.evaluate_success(paths) 138 | self.logger.log_kv('success_rate', success_rate) 139 | except: 140 | pass 141 | return base_stats 142 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/mbac.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import time as timer 5 | import torch 6 | import torch.nn as nn 7 | from torch.autograd import Variable 8 | from mjrl.utils.logger import DataLog 9 | from tqdm import tqdm 10 | from mjrl.utils.gym_env import GymEnv 11 | from mjrl.policies.mpc_actor import MPCActor 12 | from mjrl.algos.behavior_cloning import BC 13 | 14 | 15 | class MBAC(BC): 16 | def __init__(self, 17 | env_name, 18 | policy, 19 | expert_paths = None, # for the initial seeding 20 | epochs = 5, 21 | batch_size = 64, 22 | lr = 1e-3, 23 | optimizer = None, 24 | loss_type = 'MSE', # can be 'MLE' or 'MSE' 25 | seed = 123, 26 | buffer_size = 50, # measured in number of trajectories 27 | mpc_params = None, 28 | save_logs = True, 29 | ): 30 | 31 | super().__init__(expert_paths=expert_paths, 32 | policy=policy, 33 | epochs=epochs, 34 | batch_size=batch_size, 35 | lr=lr, 36 | optimizer=optimizer, 37 | loss_type=loss_type, 38 | save_logs=save_logs, 39 | ) 40 | self.expert_paths = [] if self.expert_paths is None else self.expert_paths 41 | self.buffer_size = buffer_size 42 | 43 | # For the MPC policy 44 | self.env = GymEnv(env_name) 45 | self.env.reset(seed=seed) 46 | if mpc_params is None: 47 | mean = np.zeros(self.env.action_dim) 48 | sigma = 1.0 * np.ones(self.env.action_dim) 49 | filter_coefs = [sigma, 0.05, 0.0, 0.0] 50 | mpc_params = dict(env=GymEnv(env_name), H=10, 51 | paths_per_cpu=25, num_cpu=1, 52 | kappa=10.0, gamma=1.0, 53 | mean=mean, filter_coefs=filter_coefs, 54 | seed=seed) 55 | else: 56 | mpc_params['env'] = GymEnv(env_name) 57 | mpc_params['seed'] = seed 58 | 59 | self.mpc_params = mpc_params 60 | self.mpc_policy = MPCActor(**mpc_params) 61 | 62 | def collect_paths(self, num_traj=10, 63 | mode='policy', 64 | horizon=None, 65 | render=False 66 | ): 67 | horizon = self.env.horizon if horizon is None else horizon 68 | paths = [] 69 | for i in tqdm(range(num_traj)): 70 | self.env.reset() 71 | obs, act_pi, act_mpc, rew, states = [], [], [], [], [] 72 | for t in range(horizon): 73 | o = self.env.get_obs() 74 | s = self.env.get_env_state() 75 | a_pi = self.policy.get_action(o)[0] 76 | a_mpc = self.mpc_policy.get_action(s) 77 | a = a_pi if mode == 'policy' 
else a_mpc 78 | next_o, r, done, _ = self.env.step(a) 79 | if render: 80 | self.env.render() 81 | # store data 82 | obs.append(o) 83 | rew.append(r) 84 | states.append(s) 85 | act_pi.append(a_pi) 86 | act_mpc.append(a_mpc) 87 | # kill if done 88 | if done: 89 | break 90 | path = dict(observations=np.array(obs), 91 | actions=np.array(act_pi), 92 | expert_actions=np.array(act_mpc), 93 | rewards=np.array(rew), 94 | states=states, 95 | ) 96 | paths.append(path) 97 | return paths 98 | 99 | def add_paths_to_buffer(self, paths): 100 | for path in paths: 101 | self.expert_paths.append(path) 102 | if len(self.expert_paths) > self.buffer_size: 103 | # keep recent trajectories 104 | # TODO: Also consider keeping best performing trajectories 105 | self.expert_paths = self.expert_paths[-self.buffer_size:] 106 | if self.save_logs: 107 | self.logger.log_kv('buffer_size', len(self.expert_paths)) 108 | 109 | def get_data_from_buffer(self): 110 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 111 | expert_actions = np.concatenate([path["expert_actions"] for path in self.expert_paths]) 112 | observations = torch.Tensor(observations).float() 113 | expert_actions = torch.Tensor(expert_actions).float() 114 | data = dict(observations=observations, expert_actions=expert_actions) 115 | return data 116 | 117 | def train_step(self, num_traj=10, **kwargs): 118 | # collect data using policy actions 119 | # fit policy to expert actions on these states 120 | new_paths = self.collect_paths(num_traj, mode='policy') 121 | self.add_paths_to_buffer(new_paths) 122 | data = self.get_data_from_buffer() 123 | self.fit(data, **kwargs) 124 | stoc_pol_perf = np.mean([np.sum(path['rewards']) for path in new_paths]) 125 | return stoc_pol_perf -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/algos/model_accel/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/model_learning_mpc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mjrl.algos.model_accel.sampling import generate_paths, generate_perturbed_actions, trajectory_rollout 3 | 4 | 5 | class MPCPolicy(object): 6 | def __init__(self, env, 7 | plan_horizon, 8 | plan_paths=10, 9 | kappa=1.0, 10 | gamma=1.0, 11 | mean=None, 12 | filter_coefs=None, 13 | seed=123, 14 | warmstart=True, 15 | fitted_model=None, 16 | omega=5.0, 17 | **kwargs, 18 | ): 19 | 20 | # initialize 21 | self.env, self.seed = env, seed 22 | self.n, self.m = env.observation_dim, env.action_dim 23 | self.plan_horizon, self.num_traj = plan_horizon, plan_paths 24 | 25 | if fitted_model is None: 26 | print("Policy requires a fitted dynamics model") 27 | quit() 28 | else: 29 | self.fitted_model = fitted_model 30 | 31 | # initialize other params 32 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma 33 | if mean is None: 34 | self.mean = np.zeros(self.m) 35 | if filter_coefs is None: 36 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0] 37 | self.act_sequence = np.ones((self.plan_horizon, self.m)) * self.mean 38 | self.init_act_sequence = self.act_sequence.copy() 39 | self.warmstart = warmstart 40 | self.omega = omega 41 | 42 | def get_action(self, obs): 43 | # 
generate paths 44 | if type(self.fitted_model) == list: 45 | 46 | # Ensemble case 47 | # Collect trajectories from different models with same action sequences 48 | base_act = self.act_sequence 49 | act_list = [generate_perturbed_actions(base_act, self.filter_coefs) 50 | for _ in range(self.num_traj)] 51 | actions = np.array(act_list) 52 | paths_list = [] 53 | for model in self.fitted_model: 54 | paths = trajectory_rollout(actions, model, obs) 55 | self.env.env.env.compute_path_rewards(paths) 56 | paths_list.append(paths) 57 | # consolidate paths 58 | paths = dict() 59 | for k in paths_list[0].keys(): 60 | v = np.vstack([p[k] for p in paths_list]) 61 | paths[k] = v 62 | R = self.score_trajectory_ensemble(paths, paths_list) 63 | 64 | else: 65 | paths = generate_paths(num_traj=self.num_traj, fitted_model=self.fitted_model, 66 | start_state=obs, base_act=self.act_sequence, filter_coefs=self.filter_coefs) 67 | self.env.env.env.compute_path_rewards(paths) # will populate path['rewards'] 68 | R = self.score_trajectory(paths) 69 | 70 | S = np.exp(self.kappa * (R - np.max(R))) 71 | act = paths["actions"] 72 | 73 | weighted_seq = S * act.T 74 | act_sequence = np.sum(weighted_seq.T, axis=0) / (np.sum(S) + 1e-6) 75 | action = act_sequence[0].copy() 76 | 77 | # get updated action sequence 78 | if self.warmstart: 79 | self.act_sequence[:-1] = act_sequence[1:] 80 | self.act_sequence[-1] = self.mean.copy() 81 | else: 82 | self.act_sequence = self.init_act_sequence.copy() 83 | return action 84 | 85 | def score_trajectory_ensemble(self, paths, paths_list): 86 | num_traj = self.num_traj 87 | num_models = len(paths_list) 88 | total_traj = paths['rewards'].shape[0] 89 | horizon = paths['rewards'].shape[1] 90 | predictions = [p['observations'] for p in paths_list] 91 | disagreement = np.std(predictions, axis=0) # (num_traj, horizon, state_dim) 92 | disagreement = np.sum(disagreement, axis=(1,2)) # (num_traj,) 93 | scores = np.zeros(total_traj) 94 | for i in range(total_traj): 95 | disagreement_score = disagreement[i // self.num_traj] 96 | scores[i] = self.omega * disagreement_score 97 | for t in range(horizon): 98 | scores[i] += (self.gamma ** t) * paths["rewards"][i][t] 99 | return scores 100 | 101 | def score_trajectory(self, paths): 102 | # rewards shape: (num_traj, horizon) 103 | num_traj = paths["rewards"].shape[0] 104 | horizon = paths["rewards"].shape[1] 105 | scores = np.zeros(num_traj) 106 | for i in range(num_traj): 107 | scores[i] = 0.0 108 | for t in range(horizon): 109 | scores[i] += (self.gamma**t)*paths["rewards"][i][t] 110 | return scores 111 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/configs/point_mass.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_point_mass-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 5, 9 | 'iter_samples' : 100, 10 | 'eval_rollouts' : 25, 11 | 'num_models' : 3, 12 | 'exp_notes' : 'Toy experiment for initial trial.', 13 | 'save_freq' : 1, 14 | 'device' : 'cpu', 15 | 'learn_reward' : False, 16 | 'reward_file' : 'utils/reward_functions/mjrl_point_mass.py', 17 | 18 | # dynamics learning 19 | 20 | 'hidden_size' : (256, 256), 21 | 'activation' : 'relu', 22 | 'fit_lr' : 1e-3, 23 | 'fit_wd' : 1e-5, 24 | 'buffer_size' : 10000, 25 | 'fit_mb_size' : 16, 26 | 'fit_epochs' : 25, 27 | 'refresh_fit' : False, 28 | 29 | # initial data 30 | 31 | 'init_log_std' : -0.5, 32 | 'min_log_std' : 
-2.0, 33 | 'init_samples' : 1000, 34 | 35 | # NPG params 36 | 37 | 'policy_size' : (32, 32), 38 | 'inner_steps' : 10, 39 | 'step_size' : 0.05, 40 | 'update_paths' : 250, 41 | 'start_state' : 'init', 42 | 'horizon' : 25, 43 | 44 | } 45 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/configs/reacher.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_reacher_7dof-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 25, 9 | 'iter_samples' : 500, 10 | 'eval_rollouts' : 10, 11 | 'num_models' : 4, 12 | 'save_freq' : 1, 13 | 'device' : 'cpu', 14 | 15 | # dynamics learning 16 | 17 | 'hidden_size' : (256, 256), 18 | 'activation' : 'relu', 19 | 'fit_lr' : 1e-3, 20 | 'fit_wd' : 0.0, 21 | 'buffer_size' : 20000, 22 | 'fit_mb_size' : 64, 23 | 'fit_epochs' : 20, 24 | 'refresh_fit' : False, 25 | 26 | # initial data 27 | 28 | 'init_log_std' : -0.5, 29 | 'min_log_std' : -2.5, 30 | 'init_samples' : 2500, 31 | 'init_policy' : None, 32 | 33 | 34 | # NPG params 35 | 36 | 'policy_size' : (64, 64), 37 | 'inner_steps' : 5, 38 | 'step_size' : 0.05, 39 | 'update_paths' : 250, 40 | 'start_state' : 'init', 41 | 'horizon' : 50, 42 | 43 | } -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/sandbox/example_config_mpc.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_point_mass-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 5, 9 | 'paths_per_iter': 5, 10 | 'eval_rollouts' : 10, 11 | 'num_models' : 3, 12 | 'exp_notes' : 'Toy experiment for initial trial.', 13 | 'save_freq' : 5, 14 | 'device' : 'cpu', 15 | 16 | # dynamics learning 17 | 18 | 'hidden_size' : (64, 64), 19 | 'activation' : 'relu', 20 | 'fit_lr' : 1e-3, 21 | 'fit_wd' : 1e-5, 22 | 'max_paths' : 1000, 23 | 'fit_mb_size' : 16, 24 | 'fit_epochs' : 25, 25 | 'refresh_fit' : True, 26 | 27 | # initial data 28 | 29 | 'init_log_std' : -0.5, 30 | 'n_init_paths' : 25, 31 | 'use_demos' : False, 32 | 'demo_file' : None, 33 | 34 | # model predictive control 35 | 36 | 'noisy_mpc' : True, # when collecting data for exploration 37 | 'noise_level' : 0.1, 38 | 'filter_coefs' : {'f1': 0.5, 'f2': 1.0, 'f3': 0.0, 'f4': 0.0}, 39 | 'plan_paths' : 200, 40 | 'plan_horizon' : 10, 41 | 'kappa' : 2.0, 42 | 'omega' : 0.0, 43 | 44 | } 45 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/sandbox/run_model_learning_mpc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Job script to optimize trajectories with fitted model 3 | """ 4 | 5 | import numpy as np 6 | import copy 7 | import torch 8 | import torch.nn as nn 9 | import pickle 10 | import mjrl.envs 11 | import time as timer 12 | import argparse 13 | import os 14 | import json 15 | import mjrl.samplers.core as trajectory_sampler 16 | import mjrl.utils.tensor_utils as tensor_utils 17 | from tqdm import tqdm 18 | from tabulate import tabulate 19 | from mjrl.policies.gaussian_mlp import MLP 20 | from mjrl.baselines.mlp_baseline import MLPBaseline 21 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 22 | from mjrl.utils.gym_env import GymEnv 23 | from mjrl.utils.logger import DataLog 24 | from mjrl.utils.make_train_plots import 
make_train_plots 25 | from mjrl.algos.model_accel.nn_dynamics import DynamicsModel 26 | from mjrl.algos.model_accel.model_learning_mpc import MPCPolicy 27 | from mjrl.algos.model_accel.sampling import sample_paths, evaluate_policy 28 | 29 | 30 | # =============================================================================== 31 | # Get command line arguments 32 | # =============================================================================== 33 | 34 | parser = argparse.ArgumentParser(description='Trajectory Optimization with fitted models.') 35 | parser.add_argument('--output', type=str, required=True, help='location to store results') 36 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params') 37 | args = parser.parse_args() 38 | OUT_DIR = args.output 39 | if not os.path.exists(OUT_DIR): 40 | os.mkdir(OUT_DIR) 41 | with open(args.config, 'r') as f: 42 | job_data = eval(f.read()) 43 | 44 | # Unpack args and make files for easy access 45 | logger = DataLog() 46 | ENV_NAME = job_data['env_name'] 47 | PICKLE_FILE = OUT_DIR + '/exp_results.pickle' 48 | EXP_FILE = OUT_DIR + '/job_data.json' 49 | SEED = job_data['seed'] 50 | job_data['filter_coefs'] = [job_data['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']] 51 | 52 | # base cases 53 | if 'num_models' not in job_data.keys(): 54 | job_data['num_models'] = 1 55 | if job_data['num_models'] == 1 or 'omega' not in job_data.keys(): 56 | job_data['omega'] = 0.0 57 | if 'eval_rollouts' not in job_data.keys(): 58 | job_data['eval_rollouts'] = 0 59 | if 'save_freq' not in job_data.keys(): 60 | job_data['save_freq'] = 10 61 | if 'device' not in job_data.keys(): 62 | job_data['device'] = 'cpu' 63 | if 'debug_mode' in job_data.keys(): 64 | DEBUG = job_data['debug_mode'] 65 | else: 66 | DEBUG =False 67 | if 'device_path' not in job_data.keys(): 68 | job_data['device_path'] = None 69 | with open(EXP_FILE, 'w') as f: 70 | json.dump(job_data, f, indent=4) 71 | 72 | del(job_data['seed']) 73 | job_data['base_seed'] = SEED 74 | 75 | # =============================================================================== 76 | # Train loop 77 | # =============================================================================== 78 | 79 | np.random.seed(SEED) 80 | torch.random.manual_seed(SEED) 81 | 82 | # TODO(Aravind): Map to hardware if device_path is specified 83 | 84 | e = GymEnv(ENV_NAME) 85 | e.set_seed(SEED) 86 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+i, **job_data) 87 | for i in range(job_data['num_models'])] 88 | exploratory_policy = MLP(e.spec, seed=SEED, init_log_std=job_data['init_log_std']) 89 | paths = [] 90 | 91 | for outer_iter in range(job_data['num_iter']): 92 | 93 | ts = timer.time() 94 | print("================> ITERATION : %i " % outer_iter) 95 | print("Getting interaction data from real dynamics ...") 96 | 97 | if outer_iter == 0: 98 | iter_paths = trajectory_sampler.sample_paths(job_data['n_init_paths'], e, 99 | exploratory_policy, 100 | eval_mode=False, base_seed=SEED) 101 | else: 102 | iter_paths = sample_paths(job_data['paths_per_iter'], 103 | mpc_policy.env, mpc_policy, 104 | eval_mode=(not job_data['noisy_mpc']), 105 | noise_level=job_data['noise_level'], 106 | base_seed=SEED + outer_iter) 107 | 108 | # reset the environment (good for hardware) 109 | e.reset() 110 | 111 | for p in iter_paths: 112 | paths.append(p) 113 | 114 | if len(paths) > job_data['max_paths']: 115 | diff = len(paths) - job_data['max_paths'] 116 | paths[:diff] = [] 117 | 118 | s = 
np.concatenate([p['observations'][:-1] for p in paths]) 119 | a = np.concatenate([p['actions'][:-1] for p in paths]) 120 | sp = np.concatenate([p['observations'][1:] for p in paths]) 121 | r = np.array([np.sum(p['rewards']) for p in iter_paths]) 122 | rollout_score = np.mean(r) 123 | 124 | logger.log_kv('fit_epochs', job_data['fit_epochs']) 125 | logger.log_kv('rollout_score', rollout_score) 126 | try: 127 | rollout_metric = e.env.env.evaluate_success(iter_paths) 128 | logger.log_kv('rollout_metric', rollout_metric) 129 | except: 130 | pass 131 | 132 | print("Data gathered, fitting model ...") 133 | if job_data['refresh_fit']: 134 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+123*outer_iter, 135 | **job_data) for i in range(job_data['num_models'])] 136 | 137 | for i, model in enumerate(models): 138 | epoch_loss = model.fit(s, a, sp, job_data['fit_mb_size'], job_data['fit_epochs']) 139 | logger.log_kv('loss_before_' + str(i), epoch_loss[0]) 140 | logger.log_kv('loss_after_' + str(i), epoch_loss[-1]) 141 | 142 | mpc_policy = MPCPolicy(env=e, fitted_model=models, seed=SEED+12345*outer_iter, **job_data) 143 | 144 | if job_data['eval_rollouts'] > 0: 145 | print("Performing validation rollouts ... ") 146 | eval_paths = evaluate_policy(mpc_policy.env, mpc_policy, mpc_policy.fitted_model[0], noise_level=0.0, 147 | real_step=True, num_episodes=job_data['eval_rollouts'], visualize=False) 148 | eval_score = np.mean([np.sum(p['rewards']) for p in eval_paths]) 149 | logger.log_kv('eval_score', eval_score) 150 | try: 151 | eval_metric = e.env.env.evaluate_success(eval_paths) 152 | logger.log_kv('eval_metric', eval_metric) 153 | except: 154 | pass 155 | else: 156 | eval_paths = [] 157 | 158 | exp_data = dict(policy=mpc_policy, fitted_model=mpc_policy.fitted_model, 159 | log=logger.log, rollout_paths=iter_paths, eval_paths=eval_paths) 160 | if outer_iter > 0 and outer_iter % job_data['save_freq'] == 0: 161 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb')) 162 | pickle.dump(exp_data, open(OUT_DIR + '/iteration_' + str(outer_iter) + '.pickle', 'wb')) 163 | 164 | tf = timer.time() 165 | logger.log_kv('iter_time', tf-ts) 166 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1, 167 | logger.get_current_log().items())) 168 | print(tabulate(print_data)) 169 | logger.save_log(OUT_DIR+'/') 170 | make_train_plots(log=logger.log, keys=['rollout_score', 'eval_score', 'rollout_metric', 'eval_metric'], 171 | save_loc=OUT_DIR+'/') 172 | 173 | if job_data['debug_mode']: 174 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], False, 5, True) 175 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], True, 5, True) 176 | 177 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb')) # final save -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/mjrl_point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def reward_function(paths): 4 | # path has 
two keys: observations and actions 5 | # path["observations"] : (num_traj, horizon, obs_dim) 6 | # return paths that contain rewards in path["rewards"] 7 | # path["rewards"] should have shape (num_traj, horizon) 8 | obs = paths["observations"] 9 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 10 | agent_pos = obs[:, :, :2] 11 | target_pos = obs[:, :, -2:] 12 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1) 13 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1) 14 | rewards = -1.0 * l1_dist - 0.5 * l2_dist 15 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s') 16 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 17 | return paths 18 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/utils/visualize_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import mjrl.envs 3 | import trajopt.envs 4 | import mj_envs 5 | import click 6 | import os 7 | import gym 8 | import numpy as np 9 | import pickle 10 | import torch 11 | from mjrl.utils.gym_env import GymEnv 12 | from mjrl.policies.gaussian_mlp import MLP 13 | import trajopt.envs 14 | 15 | DESC = ''' 16 | Helper script to visualize policy (in mjrl format).\n 17 | USAGE:\n 18 | Visualizes policy on the env\n 19 | $ python utils/visualize_policy --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n 20 | ''' 21 | 22 | # MAIN ========================================================= 23 | @click.command(help=DESC) 24 | @click.option('--env_name', type=str, help='environment to load', required= True) 25 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None) 26 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation') 27 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123) 28 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10) 29 | @click.option('--log_std', type=float, default=-0.5) 30 | @click.option('--terminate', type=bool, default=True) 31 | @click.option('--device_path', type=str, default=None) 32 | def main(env_name, policy, mode, seed, episodes, log_std, terminate, device_path): 33 | render = True 34 | 35 | # TODO(Aravind): Map to hardware if device_path is specified 36 | 37 | e = GymEnv(env_name) 38 | e.set_seed(seed) 39 | np.random.seed(seed) 40 | torch.manual_seed(seed) 41 | if policy is not None: 42 | policy = pickle.load(open(policy, 'rb')) 43 | else: 44 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=log_std) 45 | 46 | for ep in range(episodes): 47 | o = e.reset() 48 | rew = 0.0 49 | t = 0 50 | done = False 51 | while t < e.horizon and done is False: 52 | o = e.get_obs() 53 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 54 | next_o, r, done, ifo = e.step(a) 55 | if terminate is False: 56 | done = False 57 | rew = rew + r 58 | t = t + 1 59 | if render: 60 | e.render() 61 | if done and t < e.horizon - 1: 62 | print("Episode terminated early") 63 | print("episode score = %f " % rew) 64 | 65 | e.reset() 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/utils/visualize_trajectories.py: 
-------------------------------------------------------------------------------- 1 | import pickle 2 | import click 3 | import json 4 | import numpy as np 5 | import torch 6 | import mjrl.envs 7 | import trajopt.envs 8 | import mj_envs 9 | import mjrl.utils.tensor_utils as tensor_utils 10 | 11 | from mjrl.utils.gym_env import GymEnv 12 | from mjrl.algos.model_accel.sampling import evaluate_policy 13 | 14 | DESC = ''' 15 | Helper script to visualize optimized trajectories (list of trajectories in trajopt format).\n 16 | USAGE:\n 17 | $ python viz_trajectories.py --file path_to_file.pickle\n 18 | ''' 19 | @click.command(help=DESC) 20 | @click.option('--file', type=str, help='pickle file with trajectories', required= True) 21 | @click.option('--seed', type=int, default=123) 22 | @click.option('--noise_level', type=float, default=0.0) 23 | @click.option('--num_episodes', type=int, help='number of times to play trajectories', default=5) 24 | @click.option('--config', type=str, help='if provided MPC params from here will be used.', default=None) 25 | @click.option('--device_path', type=str, default=None) 26 | def main(file, seed, noise_level, num_episodes, config, device_path): 27 | exp_data = pickle.load(open(file, 'rb')) 28 | policy = exp_data['policy'] 29 | model = exp_data['fitted_model'] 30 | model = model[-1] if type(model) == list else model 31 | env_id = policy.env.env_id 32 | render = True 33 | 34 | # TODO(Aravind): Map to hardware if device_path is specified 35 | 36 | env = GymEnv(env_id) 37 | policy.env = env 38 | 39 | env.set_seed(seed) 40 | np.random.seed(seed) 41 | torch.manual_seed(seed) 42 | 43 | if config is not None: 44 | try: 45 | with open(config, 'r') as f: 46 | config = eval(f.read()) 47 | except: 48 | with open(config, 'r') as f: 49 | config = json.load(f) 50 | policy.plan_horizon = config['plan_horizon'] 51 | policy.num_traj = config['plan_paths'] 52 | policy.kappa = config['kappa'] 53 | policy.filter_coefs = [config['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']] 54 | policy.omega = config['omega'] if 'omega' in config.keys() else 0.0 55 | 56 | # TODO(Aravind): Implement capability to set predicted state for rendering purposes 57 | # evaluate_policy(env, policy, model, noise_level, real_step=False, num_episodes=num_episodes, visualize=render) 58 | evaluate_policy(env, policy, model, noise_level, real_step=True, num_episodes=num_episodes, visualize=render) 59 | 60 | # final close out 61 | env.reset() 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/ppo_clip.py: -------------------------------------------------------------------------------- 1 | import logging 2 | #logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | 16 | # utility functions 17 | import mjrl.utils.process_samples as process_samples 18 | from mjrl.utils.logger import DataLog 19 | from mjrl.utils.cg_solve import cg_solve 20 | from mjrl.algos.batch_reinforce import BatchREINFORCE 21 | 22 | 23 | class PPO(BatchREINFORCE): 24 | def __init__(self, env, policy, baseline, 25 | clip_coef = 0.2, 26 | epochs = 10, 27 | mb_size = 64, 28 | learn_rate = 3e-4, 29 | seed = 123, 30 | save_logs = False, 31 | **kwargs 32 | ): 33 | 
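        # clip_coef is the PPO clipping range: PPO_surrogate below clamps the likelihood ratio to
        # [1 - clip_coef, 1 + clip_coef]. Each call to train_from_paths runs `epochs` passes of Adam
        # updates (lr = learn_rate) over random minibatches of size mb_size.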
34 | self.env = env 35 | self.policy = policy 36 | self.baseline = baseline 37 | self.learn_rate = learn_rate 38 | self.seed = seed 39 | self.save_logs = save_logs 40 | self.clip_coef = clip_coef 41 | self.epochs = epochs 42 | self.mb_size = mb_size 43 | self.running_score = None 44 | if save_logs: self.logger = DataLog() 45 | 46 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=learn_rate) 47 | 48 | def PPO_surrogate(self, observations, actions, advantages): 49 | adv_var = Variable(torch.from_numpy(advantages).float(), requires_grad=False) 50 | old_dist_info = self.policy.old_dist_info(observations, actions) 51 | new_dist_info = self.policy.new_dist_info(observations, actions) 52 | LR = self.policy.likelihood_ratio(new_dist_info, old_dist_info) 53 | LR_clip = torch.clamp(LR, min=1-self.clip_coef, max=1+self.clip_coef) 54 | ppo_surr = torch.mean(torch.min(LR*adv_var,LR_clip*adv_var)) 55 | return ppo_surr 56 | 57 | # ---------------------------------------------------------- 58 | def train_from_paths(self, paths): 59 | 60 | # Concatenate from all the trajectories 61 | observations = np.concatenate([path["observations"] for path in paths]) 62 | actions = np.concatenate([path["actions"] for path in paths]) 63 | advantages = np.concatenate([path["advantages"] for path in paths]) 64 | # Advantage whitening 65 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 66 | # NOTE : advantage should be zero mean in expectation 67 | # normalized step size invariant to advantage scaling, 68 | # but scaling can help with least squares 69 | 70 | # cache return distributions for the paths 71 | path_returns = [sum(p["rewards"]) for p in paths] 72 | mean_return = np.mean(path_returns) 73 | std_return = np.std(path_returns) 74 | min_return = np.amin(path_returns) 75 | max_return = np.amax(path_returns) 76 | base_stats = [mean_return, std_return, min_return, max_return] 77 | self.running_score = mean_return if self.running_score is None else \ 78 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 79 | if self.save_logs: self.log_rollout_statistics(paths) 80 | 81 | # Optimization algorithm 82 | # -------------------------- 83 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 84 | params_before_opt = self.policy.get_param_values() 85 | 86 | ts = timer.time() 87 | num_samples = observations.shape[0] 88 | for ep in range(self.epochs): 89 | for mb in range(int(num_samples / self.mb_size)): 90 | rand_idx = np.random.choice(num_samples, size=self.mb_size) 91 | obs = observations[rand_idx] 92 | act = actions[rand_idx] 93 | adv = advantages[rand_idx] 94 | self.optimizer.zero_grad() 95 | loss = - self.PPO_surrogate(obs, act, adv) 96 | loss.backward() 97 | self.optimizer.step() 98 | 99 | params_after_opt = self.policy.get_param_values() 100 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 101 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 102 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True) 103 | t_opt = timer.time() - ts 104 | 105 | # Log information 106 | if self.save_logs: 107 | self.logger.log_kv('t_opt', t_opt) 108 | self.logger.log_kv('kl_dist', kl_dist) 109 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 110 | self.logger.log_kv('running_score', self.running_score) 111 | try: 112 | self.env.env.env.evaluate_success(paths, self.logger) 113 | except: 114 | # nested logic for backwards 
compatibility. TODO: clean this up. 115 | try: 116 | success_rate = self.env.env.env.evaluate_success(paths) 117 | self.logger.log_kv('success_rate', success_rate) 118 | except: 119 | pass 120 | 121 | return base_stats 122 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/trpo.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | import mjrl.samplers.batch_sampler as batch_sampler 16 | 17 | # utility functions 18 | import mjrl.utils.process_samples as process_samples 19 | from mjrl.utils.logger import DataLog 20 | from mjrl.utils.cg_solve import cg_solve 21 | 22 | # Import NPG 23 | from mjrl.algos.npg_cg import NPG 24 | 25 | class TRPO(NPG): 26 | def __init__(self, env, policy, baseline, 27 | kl_dist=0.01, 28 | FIM_invert_args={'iters': 10, 'damping': 1e-4}, 29 | hvp_sample_frac=1.0, 30 | seed=123, 31 | save_logs=False, 32 | normalized_step_size=0.01, 33 | **kwargs 34 | ): 35 | """ 36 | All inputs are expected in mjrl's format unless specified 37 | :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance 38 | :param kl_dist: desired KL distance between steps. Overrides normalized_step_size. 39 | :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well) 40 | :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG 41 | :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow) 42 | :param seed: random seed 43 | """ 44 | 45 | self.env = env 46 | self.policy = policy 47 | self.baseline = baseline 48 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size 49 | self.seed = seed 50 | self.save_logs = save_logs 51 | self.FIM_invert_args = FIM_invert_args 52 | self.hvp_subsample = hvp_sample_frac 53 | self.running_score = None 54 | if save_logs: self.logger = DataLog() 55 | 56 | def train_from_paths(self, paths): 57 | 58 | # Concatenate from all the trajectories 59 | observations = np.concatenate([path["observations"] for path in paths]) 60 | actions = np.concatenate([path["actions"] for path in paths]) 61 | advantages = np.concatenate([path["advantages"] for path in paths]) 62 | # Advantage whitening 63 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 64 | # NOTE : advantage should be zero mean in expectation 65 | # normalized step size invariant to advantage scaling, 66 | # but scaling can help with least squares 67 | 68 | # cache return distributions for the paths 69 | path_returns = [sum(p["rewards"]) for p in paths] 70 | mean_return = np.mean(path_returns) 71 | std_return = np.std(path_returns) 72 | min_return = np.amin(path_returns) 73 | max_return = np.amax(path_returns) 74 | base_stats = [mean_return, std_return, min_return, max_return] 75 | self.running_score = mean_return if self.running_score is None else \ 76 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 77 | if self.save_logs: self.log_rollout_statistics(paths) 78 | 79 | # Keep track of times for various computations 80 | t_gLL = 0.0 
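        # t_gLL accumulates time spent computing the vanilla policy gradient from log-likelihoods;
        # t_FIM below accumulates time spent on the conjugate-gradient solve against the Fisher information matrix.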
81 | t_FIM = 0.0 82 | 83 | # Optimization algorithm 84 | # -------------------------- 85 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 86 | 87 | # VPG 88 | ts = timer.time() 89 | vpg_grad = self.flat_vpg(observations, actions, advantages) 90 | t_gLL += timer.time() - ts 91 | 92 | # NPG 93 | ts = timer.time() 94 | hvp = self.build_Hvp_eval([observations, actions], 95 | regu_coef=self.FIM_invert_args['damping']) 96 | npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(), 97 | cg_iters=self.FIM_invert_args['iters']) 98 | t_FIM += timer.time() - ts 99 | 100 | # Step size computation 101 | # -------------------------- 102 | n_step_size = 2.0*self.kl_dist 103 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20))) 104 | 105 | # Policy update 106 | # -------------------------- 107 | curr_params = self.policy.get_param_values() 108 | for k in range(100): 109 | new_params = curr_params + alpha * npg_grad 110 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 111 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 112 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 113 | if kl_dist < self.kl_dist: 114 | break 115 | else: 116 | alpha = 0.9*alpha # backtrack 117 | print("Step size too high. Backtracking. | kl = %f | surr diff = %f" % \ 118 | (kl_dist, surr_after-surr_before) ) 119 | if k == 99: 120 | alpha = 0.0 121 | 122 | new_params = curr_params + alpha * npg_grad 123 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 124 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 125 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 126 | self.policy.set_param_values(new_params, set_new=True, set_old=True) 127 | 128 | # Log information 129 | if self.save_logs: 130 | self.logger.log_kv('alpha', alpha) 131 | self.logger.log_kv('delta', n_step_size) 132 | self.logger.log_kv('time_vpg', t_gLL) 133 | self.logger.log_kv('time_npg', t_FIM) 134 | self.logger.log_kv('kl_dist', kl_dist) 135 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 136 | self.logger.log_kv('running_score', self.running_score) 137 | try: 138 | self.env.env.env.evaluate_success(paths, self.logger) 139 | except: 140 | # nested logic for backwards compatibility. TODO: clean this up. 
141 | try: 142 | success_rate = self.env.env.env.evaluate_success(paths) 143 | self.logger.log_kv('success_rate', success_rate) 144 | except: 145 | pass 146 | 147 | return base_stats -------------------------------------------------------------------------------- /mjrl/mjrl/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/baselines/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/baselines/linear_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | 5 | class LinearBaseline: 6 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-5): 7 | self.inp = inp 8 | self._reg_coeff = reg_coeff 9 | self._coeffs = None 10 | 11 | def _features(self, paths): 12 | if self.inp == 'env_features': 13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 14 | else: 15 | o = np.concatenate([path["observations"] for path in paths]) 16 | o = np.clip(o, -10, 10)/10.0 17 | if o.ndim > 2: 18 | o = o.reshape(o.shape[0], -1) 19 | N, n = o.shape 20 | num_feat = int( n + 1 + 4 ) # linear + bias (1.0) + time till pow 4 21 | feat_mat = np.ones((N, num_feat)) 22 | 23 | # linear features 24 | feat_mat[:,:n] = o 25 | 26 | k = 0 # start from this row 27 | for i in range(len(paths)): 28 | l = len(paths[i]["rewards"]) 29 | al = np.arange(l)/1000.0 30 | for j in range(4): 31 | feat_mat[k:k+l, -4+j] = al**(j+1) 32 | k += l 33 | 34 | return feat_mat 35 | 36 | def fit(self, paths, return_errors=False): 37 | 38 | featmat = self._features(paths) 39 | returns = np.concatenate([path["returns"] for path in paths]) 40 | 41 | if return_errors: 42 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape) 43 | errors = returns - predictions 44 | error_before = np.sum(errors**2)/np.sum(returns**2) 45 | 46 | reg_coeff = copy.deepcopy(self._reg_coeff) 47 | for _ in range(10): 48 | self._coeffs = np.linalg.lstsq( 49 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 50 | featmat.T.dot(returns) 51 | )[0] 52 | if not np.any(np.isnan(self._coeffs)): 53 | break 54 | reg_coeff *= 10 55 | 56 | if return_errors: 57 | predictions = featmat.dot(self._coeffs) 58 | errors = returns - predictions 59 | error_after = np.sum(errors**2)/np.sum(returns**2) 60 | return error_before, error_after 61 | 62 | def predict(self, path): 63 | if self._coeffs is None: 64 | return np.zeros(len(path["rewards"])) 65 | return self._features([path]).dot(self._coeffs) 66 | -------------------------------------------------------------------------------- /mjrl/mjrl/baselines/mlp_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from mjrl.utils.optimize_model import fit_data 7 | 8 | import pickle 9 | 10 | class MLPBaseline: 11 | def __init__(self, env_spec, inp_dim=None, inp='obs', learn_rate=1e-3, reg_coef=0.0, 12 | batch_size=64, epochs=1, use_gpu=False, hidden_sizes=(128, 128)): 13 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim 14 | self.batch_size = batch_size 15 | self.epochs = epochs 16 | self.reg_coef = reg_coef 17 | self.use_gpu = use_gpu 18 | self.inp = inp 19 | self.hidden_sizes = 
hidden_sizes 20 | 21 | self.model = nn.Sequential() 22 | layer_sizes = (self.n + 4, ) + hidden_sizes + (1, ) 23 | for i in range(len(layer_sizes) - 1): 24 | layer_id = 'fc_' + str(i) 25 | relu_id = 'relu_' + str(i) 26 | self.model.add_module(layer_id, nn.Linear(layer_sizes[i], layer_sizes[i+1])) 27 | if i != len(layer_sizes) - 2: 28 | self.model.add_module(relu_id, nn.ReLU()) 29 | 30 | if self.use_gpu: 31 | self.model.cuda() 32 | 33 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learn_rate, weight_decay=reg_coef) 34 | self.loss_function = torch.nn.MSELoss() 35 | 36 | def _features(self, paths): 37 | if self.inp == 'env_features': 38 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 39 | else: 40 | o = np.concatenate([path["observations"] for path in paths]) 41 | o = np.clip(o, -10, 10)/10.0 42 | if o.ndim > 2: 43 | o = o.reshape(o.shape[0], -1) 44 | N, n = o.shape 45 | num_feat = int( n + 4 ) # linear + time till pow 4 46 | feat_mat = np.ones((N, num_feat)) # memory allocation 47 | 48 | # linear features 49 | feat_mat[:,:n] = o 50 | 51 | k = 0 # start from this row 52 | for i in range(len(paths)): 53 | l = len(paths[i]["rewards"]) 54 | al = np.arange(l)/1000.0 55 | for j in range(4): 56 | feat_mat[k:k+l, -4+j] = al**(j+1) 57 | k += l 58 | return feat_mat 59 | 60 | 61 | def fit(self, paths, return_errors=False, return_all_errors=False): 62 | 63 | featmat = self._features(paths) 64 | returns = np.concatenate([path["returns"] for path in paths]).reshape(-1, 1) 65 | featmat = featmat.astype('float32') 66 | returns = returns.astype('float32') 67 | num_samples = returns.shape[0] 68 | 69 | # Make variables with the above data 70 | if self.use_gpu: 71 | featmat_var = Variable(torch.from_numpy(featmat).cuda(), requires_grad=False) 72 | returns_var = Variable(torch.from_numpy(returns).cuda(), requires_grad=False) 73 | else: 74 | featmat_var = Variable(torch.from_numpy(featmat), requires_grad=False) 75 | returns_var = Variable(torch.from_numpy(returns), requires_grad=False) 76 | 77 | if return_errors: 78 | if self.use_gpu: 79 | predictions = self.model(featmat_var).cpu().data.numpy().ravel() 80 | else: 81 | predictions = self.model(featmat_var).data.numpy().ravel() 82 | errors = returns.ravel() - predictions 83 | error_before = np.sum(errors**2)/(np.sum(returns**2) + 1e-8) 84 | 85 | epoch_losses = fit_data(self.model, featmat_var, returns_var, self.optimizer, 86 | self.loss_function, self.batch_size, self.epochs) 87 | 88 | if return_errors: 89 | if self.use_gpu: 90 | predictions = self.model(featmat_var).cpu().data.numpy().ravel() 91 | else: 92 | predictions = self.model(featmat_var).data.numpy().ravel() 93 | errors = returns.ravel() - predictions 94 | error_after = np.sum(errors**2)/(np.sum(returns**2) + 1e-8) 95 | if return_all_errors: 96 | return error_before, error_after, epoch_losses 97 | return error_before, error_after 98 | 99 | def predict(self, path): 100 | featmat = self._features([path]).astype('float32') 101 | if self.use_gpu: 102 | feat_var = Variable(torch.from_numpy(featmat).float().cuda(), requires_grad=False) 103 | prediction = self.model(feat_var).cpu().data.numpy().ravel() 104 | else: 105 | feat_var = Variable(torch.from_numpy(featmat).float(), requires_grad=False) 106 | prediction = self.model(feat_var).data.numpy().ravel() 107 | return prediction 108 | -------------------------------------------------------------------------------- /mjrl/mjrl/baselines/quadratic_baseline.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class QuadraticBaseline: 5 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-3): 6 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim 7 | self.inp = inp 8 | self._reg_coeff = reg_coeff 9 | self._coeffs = None 10 | 11 | def _features(self, paths): 12 | if self.inp == 'env_features': 13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 14 | else: 15 | o = np.concatenate([path["observations"] for path in paths]) 16 | o = np.clip(o, -10, 10)/10.0 17 | if o.ndim > 2: 18 | o = o.reshape(o.shape[0], -1) 19 | N, n = o.shape 20 | num_feat = int( n + n*(n+1)/2 + 1 + 4 ) # linear + full quad (symmetric matrix) + bias (1.0) + time till pow 4 21 | feat_mat = np.ones((N, num_feat)) # memory allocation 22 | 23 | # linear features 24 | feat_mat[:,:n] = o 25 | 26 | # quadratic features 27 | k = n # starting from this column in feat_mat 28 | for i in range(n): 29 | for j in range(i, n): 30 | feat_mat[:,k] = o[:,i]*o[:,j] # element-wise product 31 | k += 1 32 | 33 | k = 0 # start from this row 34 | for i in range(len(paths)): 35 | l = len(paths[i]["rewards"]) 36 | al = np.arange(l)/1000.0 37 | for j in range(4): 38 | feat_mat[k:k+l, -4+j] = al**(j+1) 39 | k += l 40 | 41 | return feat_mat 42 | 43 | 44 | def fit(self, paths, return_errors=False): 45 | 46 | #featmat = np.concatenate([self._features(path) for path in paths]) 47 | featmat = self._features(paths) 48 | returns = np.concatenate([path["returns"] for path in paths]) 49 | 50 | if return_errors: 51 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape) 52 | errors = returns - predictions 53 | error_before = np.sum(errors**2)/np.sum(returns**2) 54 | 55 | reg_coeff = copy.deepcopy(self._reg_coeff) 56 | for _ in range(10): 57 | self._coeffs = np.linalg.lstsq( 58 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 59 | featmat.T.dot(returns) 60 | )[0] 61 | if not np.any(np.isnan(self._coeffs)): 62 | break 63 | reg_coeff *= 10 64 | 65 | if return_errors: 66 | predictions = featmat.dot(self._coeffs) 67 | errors = returns - predictions 68 | error_after = np.sum(errors**2)/np.sum(returns**2) 69 | return error_before, error_after 70 | 71 | def predict(self, path): 72 | if self._coeffs is None: 73 | return np.zeros(len(path["rewards"])) 74 | return self._features([path]).dot(self._coeffs) 75 | -------------------------------------------------------------------------------- /mjrl/mjrl/baselines/zero_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class ZeroBaseline: 5 | def __init__(self, env_spec, **kwargs): 6 | n = env_spec.observation_dim # number of states 7 | self._coeffs = None 8 | 9 | def fit(self, paths, return_errors=False): 10 | if return_errors: 11 | return 1.0, 1.0 12 | 13 | def predict(self, path): 14 | return np.zeros(len(path["rewards"])) 15 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # ---------------------------------------- 4 | # mjrl environments 5 | # ---------------------------------------- 6 | 7 | register( 8 | id='mjrl_point_mass-v0', 9 | entry_point='mjrl.envs:PointMassEnv', 10 | 
max_episode_steps=25, 11 | ) 12 | 13 | register( 14 | id='mjrl_swimmer-v0', 15 | entry_point='mjrl.envs:SwimmerEnv', 16 | max_episode_steps=500, 17 | ) 18 | 19 | register( 20 | id='mjrl_reacher_7dof-v0', 21 | entry_point='mjrl.envs:Reacher7DOFEnv', 22 | max_episode_steps=50, 23 | ) 24 | 25 | register( 26 | id='mjrl_peg_insertion-v0', 27 | entry_point='mjrl.envs:PegEnv', 28 | max_episode_steps=50, 29 | ) 30 | 31 | from mjrl.envs.mujoco_env import MujocoEnv 32 | # ^^^^^ so that user gets the correct error 33 | # message if mujoco is not installed correctly 34 | from mjrl.envs.point_mass import PointMassEnv 35 | from mjrl.envs.swimmer import SwimmerEnv 36 | from mjrl.envs.reacher_sawyer import Reacher7DOFEnv 37 | from mjrl.envs.peg_insertion_sawyer import PegEnv 38 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/assets/point_mass.xml: -------------------------------------------------------------------------------- (MuJoCo XML model definition for the point-mass environment; the markup is not preserved in this text dump) -------------------------------------------------------------------------------- /mjrl/mjrl/envs/assets/sawyer.xml: -------------------------------------------------------------------------------- (MuJoCo XML model definition for the Sawyer arm; markup not preserved in this text dump) -------------------------------------------------------------------------------- /mjrl/mjrl/envs/assets/swimmer.xml: -------------------------------------------------------------------------------- (MuJoCo XML model definition for the swimmer; markup not preserved in this text dump) -------------------------------------------------------------------------------- /mjrl/mjrl/envs/mujoco_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from gym import error, spaces 4 | from gym.utils import seeding 5 | import numpy as np 6 | from os import path 7 | import gym 8 | import six 9 | import time as timer 10 | 11 | try: 12 | import mujoco_py 13 | from mujoco_py import load_model_from_path, MjSim, MjViewer 14 | except ImportError as e: 15 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) 16 | 17 | def get_sim(model_path): 18 | if model_path.startswith("/"): 19 | fullpath = model_path 20 | else: 21 | fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path) 22 | if not path.exists(fullpath): 23 | raise IOError("File %s does not exist" % fullpath) 24 | model = load_model_from_path(fullpath) 25 | return MjSim(model) 26 | 27 | class MujocoEnv(gym.Env): 28 | """Superclass for all MuJoCo environments.
29 | """ 30 | 31 | def __init__(self, model_path=None, frame_skip=1, sim=None): 32 | 33 | if sim is None: 34 | self.sim = get_sim(model_path) 35 | else: 36 | self.sim = sim 37 | self.data = self.sim.data 38 | self.model = self.sim.model 39 | 40 | self.frame_skip = frame_skip 41 | self.metadata = { 42 | 'render.modes': ['human', 'rgb_array'], 43 | 'video.frames_per_second': int(np.round(1.0 / self.dt)) 44 | } 45 | self.mujoco_render_frames = False 46 | 47 | self.init_qpos = self.data.qpos.ravel().copy() 48 | self.init_qvel = self.data.qvel.ravel().copy() 49 | try: 50 | observation, _reward, done, _info = self.step(np.zeros(self.model.nu)) 51 | except NotImplementedError: 52 | observation, _reward, done, _info = self._step(np.zeros(self.model.nu)) 53 | assert not done 54 | self.obs_dim = np.sum([o.size for o in observation]) if type(observation) is tuple else observation.size 55 | 56 | bounds = self.model.actuator_ctrlrange.copy() 57 | low = bounds[:, 0] 58 | high = bounds[:, 1] 59 | self.action_space = spaces.Box(low, high, dtype=np.float32) 60 | 61 | high = np.inf*np.ones(self.obs_dim) 62 | low = -high 63 | self.observation_space = spaces.Box(low, high, dtype=np.float32) 64 | 65 | self.seed() 66 | 67 | def seed(self, seed=None): 68 | self.np_random, seed = seeding.np_random(seed) 69 | return [seed] 70 | 71 | # methods to override: 72 | # ---------------------------- 73 | 74 | def reset_model(self): 75 | """ 76 | Reset the robot degrees of freedom (qpos and qvel). 77 | Implement this in each subclass. 78 | """ 79 | raise NotImplementedError 80 | 81 | def mj_viewer_setup(self): 82 | """ 83 | Due to specifics of new mujoco rendering, the standard viewer cannot be used 84 | with this set-up. Instead we use this mujoco specific function. 85 | """ 86 | pass 87 | 88 | def viewer_setup(self): 89 | """ 90 | Does not work. 
Use mj_viewer_setup() instead 91 | """ 92 | pass 93 | 94 | def evaluate_success(self, paths, logger=None): 95 | """ 96 | Log various success metrics calculated based on input paths into the logger 97 | """ 98 | pass 99 | 100 | # ----------------------------- 101 | 102 | def reset(self): 103 | self.sim.reset() 104 | self.sim.forward() 105 | ob = self.reset_model() 106 | return ob 107 | 108 | def set_state(self, qpos, qvel): 109 | assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,) 110 | old_state = self.sim.get_state() 111 | new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel, 112 | old_state.act, old_state.udd_state) 113 | self.sim.set_state(new_state) 114 | self.sim.forward() 115 | 116 | @property 117 | def dt(self): 118 | return self.model.opt.timestep * self.frame_skip 119 | 120 | def do_simulation(self, ctrl, n_frames): 121 | for i in range(self.model.nu): 122 | self.sim.data.ctrl[i] = ctrl[i] 123 | for _ in range(n_frames): 124 | self.sim.step() 125 | if self.mujoco_render_frames is True: 126 | self.mj_render() 127 | 128 | def mj_render(self): 129 | try: 130 | self.viewer.render() 131 | except: 132 | self.mj_viewer_setup() 133 | self.viewer._run_speed = 0.5 134 | #self.viewer._run_speed /= self.frame_skip 135 | self.viewer.render() 136 | 137 | def render(self, *args, **kwargs): 138 | pass 139 | #return self.mj_render() 140 | 141 | def _get_viewer(self): 142 | pass 143 | #return None 144 | 145 | def state_vector(self): 146 | state = self.sim.get_state() 147 | return np.concatenate([ 148 | state.qpos.flat, state.qvel.flat]) 149 | 150 | # ----------------------------- 151 | 152 | def visualize_policy(self, policy, horizon=1000, num_episodes=1, mode='exploration'): 153 | self.mujoco_render_frames = True 154 | for ep in range(num_episodes): 155 | o = self.reset() 156 | d = False 157 | t = 0 158 | score = 0.0 159 | while t < horizon and d is False: 160 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 161 | o, r, d, _ = self.step(a) 162 | t = t+1 163 | score = score + r 164 | print("Episode score = %f" % score) 165 | self.mujoco_render_frames = False 166 | 167 | def visualize_policy_offscreen(self, policy, horizon=1000, 168 | num_episodes=1, 169 | frame_size=(640,480), 170 | mode='exploration', 171 | save_loc='/tmp/', 172 | filename='newvid', 173 | camera_name=None): 174 | import skvideo.io 175 | for ep in range(num_episodes): 176 | print("Episode %d: rendering offline " % ep, end='', flush=True) 177 | o = self.reset() 178 | d = False 179 | t = 0 180 | arrs = [] 181 | t0 = timer.time() 182 | while t < horizon and d is False: 183 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 184 | o, r, d, _ = self.step(a) 185 | t = t+1 186 | curr_frame = self.sim.render(width=frame_size[0], height=frame_size[1], 187 | mode='offscreen', camera_name=camera_name, device_id=0) 188 | arrs.append(curr_frame[::-1,:,:]) 189 | print(t, end=', ', flush=True) 190 | file_name = save_loc + filename + str(ep) + ".mp4" 191 | skvideo.io.vwrite( file_name, np.asarray(arrs)) 192 | print("saved", file_name) 193 | t1 = timer.time() 194 | print("time taken = %f"% (t1-t0)) 195 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/peg_insertion_sawyer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 
| 7 | class PegEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.peg_sid = -2 10 | self.target_sid = -1 11 | mujoco_env.MujocoEnv.__init__(self, 'peg_insertion.xml', 4) 12 | utils.EzPickle.__init__(self) 13 | self.peg_sid = self.model.site_name2id("peg_bottom") 14 | self.target_sid = self.model.site_name2id("target") 15 | self.init_body_pos = self.model.body_pos.copy() 16 | 17 | def step(self, a): 18 | self.do_simulation(a, self.frame_skip) 19 | obs = self.get_obs() 20 | reward = self.get_reward(obs, a) 21 | return obs, reward, False, self.get_env_infos() 22 | 23 | def get_obs(self): 24 | return np.concatenate([ 25 | self.data.qpos.flat, 26 | self.data.qvel.flat, 27 | self.data.site_xpos[self.peg_sid], 28 | self.data.site_xpos[self.target_sid], 29 | ]) 30 | 31 | def get_reward(self, obs, act=None): 32 | obs = np.clip(obs, -10.0, 10.0) 33 | if len(obs.shape) == 1: 34 | # vector obs, called when stepping the env 35 | hand_pos = obs[-6:-3] 36 | target_pos = obs[-3:] 37 | l1_dist = np.sum(np.abs(hand_pos - target_pos)) 38 | l2_dist = np.linalg.norm(hand_pos - target_pos) 39 | else: 40 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 41 | hand_pos = obs[:, :, -6:-3] 42 | target_pos = obs[:, :, -3:] 43 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1) 44 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1) 45 | bonus = 5.0 * (l2_dist < 0.06) 46 | reward = - l1_dist - 5.0 * l2_dist + bonus 47 | return reward 48 | 49 | def compute_path_rewards(self, paths): 50 | # path has two keys: observations and actions 51 | # path["observations"] : (num_traj, horizon, obs_dim) 52 | # path["rewards"] should have shape (num_traj, horizon) 53 | obs = paths["observations"] 54 | rewards = self.get_reward(obs) 55 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 56 | 57 | # -------------------------------- 58 | # resets and randomization 59 | # -------------------------------- 60 | 61 | def robot_reset(self): 62 | self.set_state(self.init_qpos, self.init_qvel) 63 | 64 | def target_reset(self): 65 | # Randomize goal position 66 | goal_y = self.np_random.uniform(low=0.1, high=0.5) 67 | try: 68 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29) 69 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29) 70 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29) 71 | self.sim.forward() 72 | except: 73 | pass 74 | 75 | def reset_model(self, seed=None): 76 | if seed is not None: 77 | self.seeding = True 78 | self.seed(seed) 79 | self.robot_reset() 80 | self.target_reset() 81 | return self.get_obs() 82 | 83 | # -------------------------------- 84 | # get and set states 85 | # -------------------------------- 86 | 87 | def get_env_state(self): 88 | target_pos = self.model.body_pos[-1].copy() 89 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 90 | target_pos=target_pos) 91 | 92 | def set_env_state(self, state): 93 | self.sim.reset() 94 | qp = state['qp'].copy() 95 | qv = state['qv'].copy() 96 | target_pos = state['target_pos'] 97 | self.model.body_pos[-1] = target_pos 98 | goal_y = target_pos[1] 99 | self.data.qpos[:] = qp 100 | self.data.qvel[:] = qv 101 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29) 102 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29) 103 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29) 104 | self.sim.forward() 105 | 106 | # -------------------------------- 107 | # utility functions 108 | # 
-------------------------------- 109 | 110 | def get_env_infos(self): 111 | return dict(state=self.get_env_state()) 112 | 113 | def mj_viewer_setup(self): 114 | self.viewer = MjViewer(self.sim) 115 | self.viewer.cam.azimuth += 200 116 | self.sim.forward() 117 | self.viewer.cam.distance = self.model.stat.extent*2.0 118 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 | class PointMassEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.agent_bid = 0 10 | self.target_sid = 0 11 | utils.EzPickle.__init__(self) 12 | mujoco_env.MujocoEnv.__init__(self, 'point_mass.xml', 5) 13 | self.agent_bid = self.sim.model.body_name2id('agent') 14 | self.target_sid = self.sim.model.site_name2id('target') 15 | 16 | def step(self, a): 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self.get_obs() 19 | reward = self.get_reward(obs) 20 | return obs, reward, False, dict(solved=(reward > -0.1), state=self.get_env_state()) 21 | 22 | def get_obs(self): 23 | agent_pos = self.data.body_xpos[self.agent_bid].ravel() 24 | target_pos = self.data.site_xpos[self.target_sid].ravel() 25 | return np.concatenate([agent_pos[:2], self.data.qvel.ravel(), target_pos[:2]]) 26 | 27 | def get_reward(self, obs, act=None): 28 | if len(obs.shape) == 1: 29 | # vector obs, called when stepping the env 30 | agent_pos = obs[:2] 31 | target_pos = obs[-2:] 32 | l1_dist = np.sum(np.abs(agent_pos - target_pos)) 33 | l2_dist = np.linalg.norm(agent_pos - target_pos) 34 | else: 35 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 36 | agent_pos = obs[:, :, :2] 37 | target_pos = obs[:, :, -2:] 38 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1) 39 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1) 40 | reward = -1.0 * l1_dist - 0.5 * l2_dist 41 | return reward 42 | 43 | def compute_path_rewards(self, paths): 44 | # path has two keys: observations and actions 45 | # path["observations"] : (num_traj, horizon, obs_dim) 46 | # path["rewards"] should have shape (num_traj, horizon) 47 | obs = paths["observations"] 48 | rewards = self.get_reward(obs) 49 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s') 50 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 51 | return paths 52 | 53 | def reset_model(self): 54 | # randomize the agent and goal 55 | agent_x = self.np_random.uniform(low=-1.0, high=1.0) 56 | agent_y = self.np_random.uniform(low=-1.0, high=1.0) 57 | goal_x = self.np_random.uniform(low=-1.0, high=1.0) 58 | goal_y = self.np_random.uniform(low=-1.0, high=1.0) 59 | qp = np.array([agent_x, agent_y]) 60 | qv = self.init_qvel.copy() 61 | self.set_state(qp, qv) 62 | self.model.site_pos[self.target_sid][0] = goal_x 63 | self.model.site_pos[self.target_sid][1] = goal_y 64 | self.sim.forward() 65 | return self.get_obs() 66 | 67 | def evaluate_success(self, paths, logger=None): 68 | success = 0.0 69 | for p in paths: 70 | if np.mean(p['env_infos']['solved'][-4:]) > 0.0: 71 | success += 1.0 72 | success_rate = 100.0*success/len(paths) 73 | if logger is None: 74 | # nowhere to log so return the value 75 | return success_rate 76 | else: 77 | # log the success 78 | # can log multiple statistics here if needed 79 | logger.log_kv('success_rate', success_rate) 80 | return 
None 81 | 82 | # -------------------------------- 83 | # get and set states 84 | # -------------------------------- 85 | 86 | def get_env_state(self): 87 | target_pos = self.model.site_pos[self.target_sid].copy() 88 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 89 | target_pos=target_pos) 90 | 91 | def set_env_state(self, state): 92 | self.sim.reset() 93 | qp = state['qp'].copy() 94 | qv = state['qv'].copy() 95 | target_pos = state['target_pos'] 96 | self.set_state(qp, qv) 97 | self.model.site_pos[self.target_sid] = target_pos 98 | self.sim.forward() 99 | 100 | # -------------------------------- 101 | # utility functions 102 | # -------------------------------- 103 | 104 | def get_env_infos(self): 105 | return dict(state=self.get_env_state()) 106 | 107 | def mj_viewer_setup(self): 108 | self.viewer = MjViewer(self.sim) 109 | self.sim.forward() 110 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/reacher_sawyer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.hand_sid = -2 10 | self.target_sid = -1 11 | mujoco_env.MujocoEnv.__init__(self, 'sawyer.xml', 4) 12 | utils.EzPickle.__init__(self) 13 | self.hand_sid = self.model.site_name2id("finger") 14 | self.target_sid = self.model.site_name2id("target") 15 | 16 | def step(self, a): 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self.get_obs() 19 | reward = self.get_reward(obs, a) 20 | return obs, reward, False, self.get_env_infos() 21 | 22 | def get_obs(self): 23 | return np.concatenate([ 24 | self.data.qpos.flat, 25 | self.data.qvel.ravel() * self.dt, # delta_x instead of velocity 26 | self.data.site_xpos[self.hand_sid], 27 | self.data.site_xpos[self.target_sid], 28 | ]) 29 | 30 | def get_reward(self, obs, act=None): 31 | obs = np.clip(obs, -10.0, 10.0) 32 | if len(obs.shape) == 1: 33 | # vector obs, called when stepping the env 34 | hand_pos = obs[-6:-3] 35 | target_pos = obs[-3:] 36 | l1_dist = np.sum(np.abs(hand_pos - target_pos)) 37 | l2_dist = np.linalg.norm(hand_pos - target_pos) 38 | else: 39 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 40 | hand_pos = obs[:, :, -6:-3] 41 | target_pos = obs[:, :, -3:] 42 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1) 43 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1) 44 | reward = - l1_dist - 5.0 * l2_dist 45 | return reward 46 | 47 | def compute_path_rewards(self, paths): 48 | # path has two keys: observations and actions 49 | # path["observations"] : (num_traj, horizon, obs_dim) 50 | # path["rewards"] should have shape (num_traj, horizon) 51 | obs = paths["observations"] 52 | rewards = self.get_reward(obs) 53 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 54 | 55 | # -------------------------------- 56 | # resets and randomization 57 | # -------------------------------- 58 | 59 | def robot_reset(self): 60 | self.set_state(self.init_qpos, self.init_qvel) 61 | 62 | def target_reset(self): 63 | target_pos = np.array([0.1, 0.1, 0.1]) 64 | target_pos[0] = self.np_random.uniform(low=-0.3, high=0.3) 65 | target_pos[1] = self.np_random.uniform(low=-0.2, high=0.2) 66 | target_pos[2] = self.np_random.uniform(low=-0.25, high=0.25) 67 | self.model.site_pos[self.target_sid] = target_pos 68 | 
self.sim.forward() 69 | 70 | def reset_model(self, seed=None): 71 | if seed is not None: 72 | self.seeding = True 73 | self.seed(seed) 74 | self.robot_reset() 75 | self.target_reset() 76 | return self.get_obs() 77 | 78 | # -------------------------------- 79 | # get and set states 80 | # -------------------------------- 81 | 82 | def get_env_state(self): 83 | target_pos = self.model.site_pos[self.target_sid].copy() 84 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 85 | target_pos=target_pos) 86 | 87 | def set_env_state(self, state): 88 | self.sim.reset() 89 | qp = state['qp'].copy() 90 | qv = state['qv'].copy() 91 | target_pos = state['target_pos'] 92 | self.model.site_pos[self.target_sid] = target_pos 93 | self.data.qpos[:] = qp 94 | self.data.qvel[:] = qv 95 | self.sim.forward() 96 | 97 | # -------------------------------- 98 | # utility functions 99 | # -------------------------------- 100 | 101 | def get_env_infos(self): 102 | return dict(state=self.get_env_state()) 103 | 104 | def mj_viewer_setup(self): 105 | self.viewer = MjViewer(self.sim) 106 | self.viewer.cam.trackbodyid = 1 107 | self.viewer.cam.type = 1 108 | self.sim.forward() 109 | self.viewer.cam.distance = self.model.stat.extent * 2.0 110 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/swimmer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'swimmer.xml', 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, a): 12 | xposbefore = self.data.qpos[0] 13 | self.do_simulation(a, self.frame_skip) 14 | xposafter = self.data.qpos[0] 15 | 16 | delta = (xposafter - xposbefore) 17 | # make agent move in the negative x direction 18 | reward = -10.0 * delta 19 | done = False 20 | 21 | ob = self.get_obs() 22 | return ob, reward, done, self.get_env_infos() 23 | 24 | def get_obs(self): 25 | return np.concatenate([ 26 | self.data.qpos.flat[2:], 27 | self.data.qvel.flat, 28 | ]) 29 | 30 | def reset_model(self): 31 | qpos_init = self.init_qpos.copy() 32 | qpos_init[2] = self.np_random.uniform(low=-np.pi, high=np.pi) 33 | self.set_state(qpos_init, self.init_qvel) 34 | self.sim.forward() 35 | return self.get_obs() 36 | 37 | # -------------------------------- 38 | # get and set states 39 | # -------------------------------- 40 | 41 | def get_env_state(self): 42 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy()) 43 | 44 | def set_env_state(self, state): 45 | self.sim.reset() 46 | qp = state['qp'].copy() 47 | qv = state['qv'].copy() 48 | self.set_state(qp, qv) 49 | self.sim.forward() 50 | 51 | # -------------------------------- 52 | # utility functions 53 | # -------------------------------- 54 | 55 | def get_env_infos(self): 56 | return dict(state=self.get_env_state()) 57 | 58 | def mj_viewer_setup(self): 59 | self.viewer = MjViewer(self.sim) 60 | self.viewer.cam.trackbodyid = 1 61 | self.viewer.cam.type = 1 62 | self.sim.forward() 63 | self.viewer.cam.distance = self.model.stat.extent*1.2 -------------------------------------------------------------------------------- /mjrl/mjrl/policies/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/policies/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/policies/gaussian_linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from mjrl.utils.fc_network import FCNetwork 7 | 8 | 9 | class LinearPolicy: 10 | def __init__(self, env_spec, 11 | min_log_std=-3, 12 | init_log_std=0, 13 | seed=None): 14 | """ 15 | :param env_spec: specifications of the env (see utils/gym_env.py) 16 | :param min_log_std: log_std is clamped at this value and can't go below 17 | :param init_log_std: initial log standard deviation 18 | :param seed: random seed 19 | """ 20 | self.n = env_spec.observation_dim # number of states 21 | self.m = env_spec.action_dim # number of actions 22 | self.min_log_std = min_log_std 23 | 24 | # Set seed 25 | # ------------------------ 26 | if seed is not None: 27 | torch.manual_seed(seed) 28 | np.random.seed(seed) 29 | 30 | # Policy network 31 | # ------------------------ 32 | self.model = FCNetwork(self.n, self.m, hidden_sizes=()) 33 | # make weights small 34 | for param in list(self.model.parameters())[-2:]: # only last layer 35 | param.data = 1e-2 * param.data 36 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True) 37 | self.trainable_params = list(self.model.parameters()) + [self.log_std] 38 | 39 | # Old Policy network 40 | # ------------------------ 41 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes=()) 42 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std) 43 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std] 44 | for idx, param in enumerate(self.old_params): 45 | param.data = self.trainable_params[idx].data.clone() 46 | 47 | # Easy access variables 48 | # ------------------------- 49 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 50 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params] 51 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params] 52 | self.d = np.sum(self.param_sizes) # total number of params 53 | 54 | # Placeholders 55 | # ------------------------ 56 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False) 57 | 58 | # Utility functions 59 | # ============================================ 60 | def get_param_values(self): 61 | params = np.concatenate([p.contiguous().view(-1).data.numpy() 62 | for p in self.trainable_params]) 63 | return params.copy() 64 | 65 | def set_param_values(self, new_params, set_new=True, set_old=True): 66 | if set_new: 67 | current_idx = 0 68 | for idx, param in enumerate(self.trainable_params): 69 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 70 | vals = vals.reshape(self.param_shapes[idx]) 71 | param.data = torch.from_numpy(vals).float() 72 | current_idx += self.param_sizes[idx] 73 | # clip std at minimum value 74 | self.trainable_params[-1].data = \ 75 | torch.clamp(self.trainable_params[-1], self.min_log_std).data 76 | # update log_std_val for sampling 77 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 78 | if set_old: 79 | current_idx = 0 80 | for idx, param in enumerate(self.old_params): 81 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 82 | vals = vals.reshape(self.param_shapes[idx]) 83 | 
param.data = torch.from_numpy(vals).float() 84 | current_idx += self.param_sizes[idx] 85 | # clip std at minimum value 86 | self.old_params[-1].data = \ 87 | torch.clamp(self.old_params[-1], self.min_log_std).data 88 | 89 | # Main functions 90 | # ============================================ 91 | def get_action(self, observation): 92 | o = np.float32(observation.reshape(1, -1)) 93 | self.obs_var.data = torch.from_numpy(o) 94 | mean = self.model(self.obs_var).data.numpy().ravel() 95 | noise = np.exp(self.log_std_val) * np.random.randn(self.m) 96 | action = mean + noise 97 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}] 98 | 99 | def mean_LL(self, observations, actions, model=None, log_std=None): 100 | model = self.model if model is None else model 101 | log_std = self.log_std if log_std is None else log_std 102 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False) 103 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False) 104 | mean = model(obs_var) 105 | zs = (act_var - mean) / torch.exp(log_std) 106 | LL = - 0.5 * torch.sum(zs ** 2, dim=1) + \ 107 | - torch.sum(log_std) + \ 108 | - 0.5 * self.m * np.log(2 * np.pi) 109 | return mean, LL 110 | 111 | def log_likelihood(self, observations, actions, model=None, log_std=None): 112 | mean, LL = self.mean_LL(observations, actions, model, log_std) 113 | return LL.data.numpy() 114 | 115 | def old_dist_info(self, observations, actions): 116 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std) 117 | return [LL, mean, self.old_log_std] 118 | 119 | def new_dist_info(self, observations, actions): 120 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std) 121 | return [LL, mean, self.log_std] 122 | 123 | def likelihood_ratio(self, new_dist_info, old_dist_info): 124 | LL_old = old_dist_info[0] 125 | LL_new = new_dist_info[0] 126 | LR = torch.exp(LL_new - LL_old) 127 | return LR 128 | 129 | def mean_kl(self, new_dist_info, old_dist_info): 130 | old_log_std = old_dist_info[2] 131 | new_log_std = new_dist_info[2] 132 | old_std = torch.exp(old_log_std) 133 | new_std = torch.exp(new_log_std) 134 | old_mean = old_dist_info[1] 135 | new_mean = new_dist_info[1] 136 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2 137 | Dr = 2 * new_std ** 2 + 1e-8 138 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1) 139 | return torch.mean(sample_kl) 140 | -------------------------------------------------------------------------------- /mjrl/mjrl/policies/gaussian_mlp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mjrl.utils.fc_network import FCNetwork 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class MLP: 8 | def __init__(self, env_spec, 9 | hidden_sizes=(64,64), 10 | min_log_std=-3, 11 | init_log_std=0, 12 | eps=0.0, 13 | seed=None): 14 | """ 15 | :param env_spec: specifications of the env (see utils/gym_env.py) 16 | :param hidden_sizes: network hidden layer sizes (currently 2 layers only) 17 | :param min_log_std: log_std is clamped at this value and can't go below 18 | :param init_log_std: initial log standard deviation 19 | :param seed: random seed 20 | """ 21 | self.n = env_spec.observation_dim # number of states 22 | self.m = env_spec.action_dim # number of actions 23 | self.min_log_std = min_log_std 24 | self.eps = eps 25 | 26 | # Set seed 27 | # ------------------------ 28 | if seed is not None: 29 | 
torch.manual_seed(seed) 30 | np.random.seed(seed) 31 | 32 | # Policy network 33 | # ------------------------ 34 | self.model = FCNetwork(self.n, self.m, hidden_sizes) 35 | # make weights small 36 | for param in list(self.model.parameters())[-2:]: # only last layer 37 | param.data = 1e-2 * param.data 38 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True) 39 | self.trainable_params = list(self.model.parameters()) + [self.log_std] 40 | 41 | # Old Policy network 42 | # ------------------------ 43 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes) 44 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std) 45 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std] 46 | for idx, param in enumerate(self.old_params): 47 | param.data = self.trainable_params[idx].data.clone() 48 | 49 | # Easy access variables 50 | # ------------------------- 51 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 52 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params] 53 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params] 54 | self.d = np.sum(self.param_sizes) # total number of params 55 | 56 | # Placeholders 57 | # ------------------------ 58 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False) 59 | 60 | # Utility functions 61 | # ============================================ 62 | def get_param_values(self): 63 | params = np.concatenate([p.contiguous().view(-1).data.numpy() 64 | for p in self.trainable_params]) 65 | return params.copy() 66 | 67 | def set_param_values(self, new_params, set_new=True, set_old=True): 68 | if set_new: 69 | current_idx = 0 70 | for idx, param in enumerate(self.trainable_params): 71 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 72 | vals = vals.reshape(self.param_shapes[idx]) 73 | param.data = torch.from_numpy(vals).float() 74 | current_idx += self.param_sizes[idx] 75 | # clip std at minimum value 76 | self.trainable_params[-1].data = \ 77 | torch.clamp(self.trainable_params[-1], self.min_log_std).data 78 | # update log_std_val for sampling 79 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 80 | if set_old: 81 | current_idx = 0 82 | for idx, param in enumerate(self.old_params): 83 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 84 | vals = vals.reshape(self.param_shapes[idx]) 85 | param.data = torch.from_numpy(vals).float() 86 | current_idx += self.param_sizes[idx] 87 | # clip std at minimum value 88 | self.old_params[-1].data = \ 89 | torch.clamp(self.old_params[-1], self.min_log_std).data 90 | 91 | # Main functions 92 | # ============================================ 93 | def get_action(self, observation): 94 | o = np.float32(observation.reshape(1, -1)) 95 | self.obs_var.data = torch.from_numpy(o) 96 | mean = self.model(self.obs_var).data.numpy().ravel() 97 | if np.random.uniform() < self.eps: 98 | action = np.random.uniform(0, 1, self.m) 99 | else: 100 | noise = np.exp(self.log_std_val) * np.random.randn(self.m) 101 | action = mean + noise 102 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}] 103 | 104 | def mean_LL(self, observations, actions, model=None, log_std=None): 105 | model = self.model if model is None else model 106 | log_std = self.log_std if log_std is None else log_std 107 | if type(observations) is not torch.Tensor: 108 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False) 109 | else: 110 | obs_var = observations 
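# NOTE: the log-likelihood computed below is the standard diagonal-Gaussian density
# LL = -0.5 * sum_i ((a_i - mu_i) / sigma_i)^2 - sum_i log(sigma_i) - 0.5 * m * log(2*pi),
# where mu = model(obs), sigma = exp(log_std), and m is the action dimension.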
111 | if type(actions) is not torch.Tensor: 112 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False) 113 | else: 114 | act_var = actions 115 | mean = model(obs_var) 116 | zs = (act_var - mean) / torch.exp(log_std) 117 | LL = - 0.5 * torch.sum(zs ** 2, dim=1) + \ 118 | - torch.sum(log_std) + \ 119 | - 0.5 * self.m * np.log(2 * np.pi) 120 | return mean, LL 121 | 122 | def log_likelihood(self, observations, actions, model=None, log_std=None): 123 | mean, LL = self.mean_LL(observations, actions, model, log_std) 124 | return LL.data.numpy() 125 | 126 | def old_dist_info(self, observations, actions): 127 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std) 128 | return [LL, mean, self.old_log_std] 129 | 130 | def new_dist_info(self, observations, actions): 131 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std) 132 | return [LL, mean, self.log_std] 133 | 134 | def likelihood_ratio(self, new_dist_info, old_dist_info): 135 | LL_old = old_dist_info[0] 136 | LL_new = new_dist_info[0] 137 | LR = torch.exp(LL_new - LL_old) 138 | return LR 139 | 140 | def mean_kl(self, new_dist_info, old_dist_info): 141 | old_log_std = old_dist_info[2] 142 | new_log_std = new_dist_info[2] 143 | old_std = torch.exp(old_log_std) 144 | new_std = torch.exp(new_log_std) 145 | old_mean = old_dist_info[1] 146 | new_mean = new_dist_info[1] 147 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2 148 | Dr = 2 * new_std ** 2 + 1e-8 149 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1) 150 | return torch.mean(sample_kl) 151 | -------------------------------------------------------------------------------- /mjrl/mjrl/policies/mpc_actor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from trajopt.utils import gather_paths_parallel 3 | 4 | 5 | class MPCActor(object): 6 | def __init__(self, env, H, paths_per_cpu, 7 | num_cpu=1, 8 | kappa=1.0, 9 | gamma=1.0, 10 | mean=None, 11 | filter_coefs=None, 12 | seed=123, 13 | ): 14 | 15 | self.env, self.seed = env, seed 16 | self.n, self.m = env.observation_dim, env.action_dim 17 | self.H, self.paths_per_cpu, self.num_cpu = H, paths_per_cpu, num_cpu 18 | 19 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma 20 | if mean is None: 21 | self.mean = np.zeros(self.m) 22 | if filter_coefs is None: 23 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0] 24 | 25 | self.env.reset() 26 | self.env.set_seed(seed) 27 | self.env.reset(seed=seed) 28 | self.act_sequence = np.ones((self.H, self.m)) * self.mean 29 | self.ctr = 1 30 | 31 | def score_trajectory(self, paths): 32 | scores = np.zeros(len(paths)) 33 | for i in range(len(paths)): 34 | scores[i] = 0.0 35 | for t in range(paths[i]["rewards"].shape[0]): 36 | scores[i] += (self.gamma**t)*paths[i]["rewards"][t] 37 | return scores 38 | 39 | def get_action(self, env_state): 40 | # Set to env_state 41 | # Shoot trajectories 42 | # Return optimal action 43 | seed = self.seed + self.ctr * 1000 44 | paths = gather_paths_parallel(self.env.env_id, 45 | env_state, 46 | self.act_sequence, 47 | self.filter_coefs, 48 | seed, 49 | self.paths_per_cpu, 50 | self.num_cpu, 51 | ) 52 | 53 | num_traj = len(paths) 54 | R = self.score_trajectory(paths) 55 | S = np.exp(self.kappa*(R-np.max(R))) 56 | act = np.sum([paths[i]["actions"][0] * S[i] for i in range(num_traj)], axis=0) 57 | act = act / (np.sum(S) + 1e-6) 58 | return act 
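# ----------------------------------------------------------------------------
# Illustrative usage sketch (an assumption, not part of the original file): it
# presumes an mjrl-style env wrapper exposing observation_dim, action_dim,
# env_id, set_seed(), and reset(), a get_env_state() method on the underlying
# MuJoCo env, and that the external `trajopt` package is installed.
#
#   from mjrl.utils.gym_env import GymEnv
#   e = GymEnv('mjrl_point_mass-v0')
#   actor = MPCActor(e, H=16, paths_per_cpu=32, num_cpu=1, kappa=5.0, gamma=1.0, seed=123)
#   state = e.env.env.get_env_state()   # hypothetical accessor path; depends on wrapper nesting
#   action = actor.get_action(state)
# ----------------------------------------------------------------------------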
-------------------------------------------------------------------------------- /mjrl/mjrl/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/samplers/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/utils/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/utils/cg_solve.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def cg_solve(f_Ax, b, x_0=None, cg_iters=10, residual_tol=1e-10): 4 | x = np.zeros_like(b) #if x_0 is None else x_0 5 | r = b.copy() #if x_0 is None else b-f_Ax(x_0) 6 | p = r.copy() 7 | rdotr = r.dot(r) 8 | 9 | for i in range(cg_iters): 10 | z = f_Ax(p) 11 | v = rdotr / p.dot(z) 12 | x += v * p 13 | r -= v * z 14 | newrdotr = r.dot(r) 15 | mu = newrdotr / rdotr 16 | p = r + mu * p 17 | 18 | rdotr = newrdotr 19 | if rdotr < residual_tol: 20 | break 21 | 22 | return x 23 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/fc_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class FCNetwork(nn.Module): 7 | def __init__(self, obs_dim, act_dim, 8 | hidden_sizes=(64,64), 9 | nonlinearity='tanh', # either 'tanh' or 'relu' 10 | in_shift = None, 11 | in_scale = None, 12 | out_shift = None, 13 | out_scale = None): 14 | super(FCNetwork, self).__init__() 15 | 16 | self.obs_dim = obs_dim 17 | self.act_dim = act_dim 18 | assert type(hidden_sizes) == tuple 19 | self.layer_sizes = (obs_dim, ) + hidden_sizes + (act_dim, ) 20 | self.set_transformations(in_shift, in_scale, out_shift, out_scale) 21 | 22 | # hidden layers 23 | self.fc_layers = nn.ModuleList([nn.Linear(self.layer_sizes[i], self.layer_sizes[i+1]) \ 24 | for i in range(len(self.layer_sizes) -1)]) 25 | self.nonlinearity = torch.relu if nonlinearity == 'relu' else torch.tanh 26 | 27 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None): 28 | # store native scales that can be used for resets 29 | self.transformations = dict(in_shift=in_shift, 30 | in_scale=in_scale, 31 | out_shift=out_shift, 32 | out_scale=out_scale 33 | ) 34 | self.in_shift = torch.from_numpy(np.float32(in_shift)) if in_shift is not None else torch.zeros(self.obs_dim) 35 | self.in_scale = torch.from_numpy(np.float32(in_scale)) if in_scale is not None else torch.ones(self.obs_dim) 36 | self.out_shift = torch.from_numpy(np.float32(out_shift)) if out_shift is not None else torch.zeros(self.act_dim) 37 | self.out_scale = torch.from_numpy(np.float32(out_scale)) if out_scale is not None else torch.ones(self.act_dim) 38 | 39 | def forward(self, x): 40 | # TODO(Aravind): Remove clamping to CPU 41 | # This is a temp change that should be fixed shortly 42 | if x.is_cuda: 43 | out = x.to('cpu') 44 | else: 45 | out = x 46 | out = (out - self.in_shift)/(self.in_scale + 1e-8) 47 | for i in range(len(self.fc_layers)-1): 48 | out = self.fc_layers[i](out) 49 | out = self.nonlinearity(out) 50 | out = self.fc_layers[-1](out) 51 | out = out * self.out_scale + 
self.out_shift 52 | return out 53 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/get_environment.py: -------------------------------------------------------------------------------- 1 | """ 2 | convenience function to generate env 3 | useful if we want some procedural env generation 4 | """ 5 | 6 | import gym 7 | from mjrl.utils.gym_env import GymEnv 8 | 9 | def get_environment(env_name=None, **kwargs): 10 | if env_name is None: print("Need to specify environment name") 11 | e = GymEnv(env_name) 12 | # can make procedural modifications here if needed using kwargs 13 | return e 14 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/logger.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import scipy 6 | import pickle 7 | import os 8 | import csv 9 | 10 | class DataLog: 11 | 12 | def __init__(self): 13 | self.log = {} 14 | self.max_len = 0 15 | 16 | def log_kv(self, key, value): 17 | # logs the (key, value) pair 18 | 19 | # TODO: This implementation is error-prone: 20 | # it would be NOT aligned if some keys are missing during one iteration. 21 | if key not in self.log: 22 | self.log[key] = [] 23 | self.log[key].append(value) 24 | if len(self.log[key]) > self.max_len: 25 | self.max_len = self.max_len + 1 26 | 27 | def save_log(self, save_path): 28 | # TODO: Validate all lengths are the same. 29 | pickle.dump(self.log, open(save_path + '/log.pickle', 'wb')) 30 | with open(save_path + '/log.csv', 'w') as csv_file: 31 | fieldnames = list(self.log.keys()) 32 | if 'iteration' not in fieldnames: 33 | fieldnames = ['iteration'] + fieldnames 34 | 35 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames) 36 | writer.writeheader() 37 | for row in range(self.max_len): 38 | row_dict = {'iteration': row} 39 | for key in self.log.keys(): 40 | if row < len(self.log[key]): 41 | row_dict[key] = self.log[key][row] 42 | writer.writerow(row_dict) 43 | 44 | def get_current_log(self): 45 | row_dict = {} 46 | for key in self.log.keys(): 47 | # TODO: this is very error-prone (alignment is not guaranteed) 48 | row_dict[key] = self.log[key][-1] 49 | return row_dict 50 | 51 | def shrink_to(self, num_entries): 52 | for key in self.log.keys(): 53 | self.log[key] = self.log[key][:num_entries] 54 | 55 | self.max_len = num_entries 56 | assert min([len(series) for series in self.log.values()]) == \ 57 | max([len(series) for series in self.log.values()]) 58 | 59 | def read_log(self, log_path): 60 | assert log_path.endswith('log.csv') 61 | 62 | with open(log_path) as csv_file: 63 | reader = csv.DictReader(csv_file) 64 | listr = list(reader) 65 | keys = reader.fieldnames 66 | data = {} 67 | for key in keys: 68 | data[key] = [] 69 | for row, row_dict in enumerate(listr): 70 | for key in keys: 71 | try: 72 | data[key].append(eval(row_dict[key])) 73 | except: 74 | print("ERROR on reading key {}: {}".format(key, row_dict[key])) 75 | 76 | if 'iteration' in data and data['iteration'][-1] != row: 77 | raise RuntimeError("Iteration %d mismatch -- possibly corrupted logfile?" 
% row) 78 | 79 | self.log = data 80 | self.max_len = max(len(v) for k, v in self.log.items()) 81 | print("Log read from {}: had {} entries".format(log_path, self.max_len)) 82 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/make_train_plots.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import scipy 6 | import csv 7 | from mjrl.utils.logger import DataLog 8 | import argparse 9 | 10 | def make_train_plots(log = None, 11 | log_path = None, 12 | keys = None, 13 | save_loc = None, 14 | sample_key = 'num_samples', 15 | x_scale = 1.0, 16 | y_scale = 1.0): 17 | if log is None and log_path is None: 18 | print("Need to provide either the log or path to a log file") 19 | if log is None: 20 | logger = DataLog() 21 | logger.read_log(log_path) 22 | log = logger.log 23 | # make plots for specified keys 24 | for key in keys: 25 | if key in log.keys(): 26 | fig = plt.figure(figsize=(10,6)) 27 | ax1 = fig.add_subplot(111) 28 | try: 29 | cum_samples = [np.sum(log[sample_key][:i]) * x_scale for i in range(len(log[sample_key]))] 30 | ax1.plot(cum_samples, [elem * y_scale for elem in log[key]]) 31 | ax1.set_xlabel('samples') 32 | # mark iteration on the top axis 33 | ax2 = ax1.twiny() 34 | ax2.set_xlabel('iterations', color=(.7,.7,.7)) 35 | ax2.tick_params(axis='x', labelcolor=(.7,.7,.7)) 36 | ax2.set_xlim([0, len(log[key])]) 37 | except: 38 | ax1.plot(log[key]) 39 | ax1.set_xlabel('iterations') 40 | ax1.set_title(key) 41 | plt.savefig(save_loc+'/'+key+'.png', dpi=100) 42 | plt.close() 43 | 44 | # MAIN ========================================================= 45 | # Example: python make_train_plots.py --log_path logs/log.csv --keys eval_score rollout_score save_loc logs 46 | def main(): 47 | # Parse arguments 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument( 50 | '-l', '--log_path', type=str, required=True, help='path file to log.csv') 51 | parser.add_argument( 52 | '-k', '--keys', type=str, action='append', nargs='+', required=True, help='keys to plot') 53 | parser.add_argument( 54 | '-s', '--save_loc', type=str, default='', help='Path for logs') 55 | args = parser.parse_args() 56 | 57 | make_train_plots(log_path=args.log_path, keys=args.keys[0], save_loc=args.save_loc) 58 | 59 | if __name__ == '__main__': 60 | main() 61 | 62 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/optimize_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | def fit_data(model, x, y, optimizer, loss_func, batch_size, epochs): 8 | """ 9 | :param model: pytorch model of form y_hat = f(x) (class) 10 | :param x: inputs to the model (tensor) 11 | :param y: desired outputs or targets (tensor) 12 | :param optimizer: optimizer to be used (class) 13 | :param loss_func: loss criterion (callable) 14 | :param batch_size: mini-batch size for optimization (int) 15 | :param epochs: number of epochs (int) 16 | :return: 17 | """ 18 | 19 | num_samples = x.shape[0] 20 | epoch_losses = [] 21 | for ep in range(epochs): 22 | rand_idx = torch.LongTensor(np.random.permutation(num_samples)) 23 | ep_loss = 0.0 24 | num_steps = int(num_samples / batch_size) - 1 25 | for mb in range(num_steps): 26 | data_idx = rand_idx[mb*batch_size:(mb+1)*batch_size] 27 | batch_x = 
x[data_idx] 28 | batch_y = y[data_idx] 29 | optimizer.zero_grad() 30 | yhat = model(batch_x) 31 | loss = loss_func(yhat, batch_y) 32 | loss.backward() 33 | optimizer.step() 34 | ep_loss += loss.detach() 35 | epoch_losses.append(ep_loss.to('cpu').data.numpy().ravel() / num_steps) 36 | return epoch_losses 37 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/process_samples.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def compute_returns(paths, gamma): 4 | for path in paths: 5 | path["returns"] = discount_sum(path["rewards"], gamma) 6 | 7 | def compute_advantages(paths, baseline, gamma, gae_lambda=None, normalize=False): 8 | # compute and store returns, advantages, and baseline 9 | # standard mode 10 | if gae_lambda == None or gae_lambda < 0.0 or gae_lambda > 1.0: 11 | for path in paths: 12 | path["baseline"] = baseline.predict(path) 13 | path["advantages"] = path["returns"] - path["baseline"] 14 | if normalize: 15 | alladv = np.concatenate([path["advantages"] for path in paths]) 16 | mean_adv = alladv.mean() 17 | std_adv = alladv.std() 18 | for path in paths: 19 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8) 20 | # GAE mode 21 | else: 22 | for path in paths: 23 | b = path["baseline"] = baseline.predict(path) 24 | if b.ndim == 1: 25 | b1 = np.append(path["baseline"], 0.0 if path["terminated"] else b[-1]) 26 | else: 27 | b1 = np.vstack((b, np.zeros(b.shape[1]) if path["terminated"] else b[-1])) 28 | td_deltas = path["rewards"] + gamma*b1[1:] - b1[:-1] 29 | path["advantages"] = discount_sum(td_deltas, gamma*gae_lambda) 30 | if normalize: 31 | alladv = np.concatenate([path["advantages"] for path in paths]) 32 | mean_adv = alladv.mean() 33 | std_adv = alladv.std() 34 | for path in paths: 35 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8) 36 | 37 | def discount_sum(x, gamma, terminal=0.0): 38 | y = [] 39 | run_sum = terminal 40 | for t in range( len(x)-1, -1, -1): 41 | run_sum = x[t] + gamma*run_sum 42 | y.append(run_sum) 43 | 44 | return np.array(y[::-1]) -------------------------------------------------------------------------------- /mjrl/mjrl/utils/tensor_utils.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | import numpy as np 4 | 5 | 6 | def flatten_tensors(tensors): 7 | if len(tensors) > 0: 8 | return np.concatenate([np.reshape(x, [-1]) for x in tensors]) 9 | else: 10 | return np.asarray([]) 11 | 12 | 13 | def unflatten_tensors(flattened, tensor_shapes): 14 | tensor_sizes = list(map(np.prod, tensor_shapes)) 15 | indices = np.cumsum(tensor_sizes)[:-1] 16 | return [np.reshape(pair[0], pair[1]) for pair in zip(np.split(flattened, indices), tensor_shapes)] 17 | 18 | 19 | def pad_tensor(x, max_len, mode='zero'): 20 | padding = np.zeros_like(x[0]) 21 | if mode == 'last': 22 | padding = x[-1] 23 | return np.concatenate([ 24 | x, 25 | np.tile(padding, (max_len - len(x),) + (1,) * np.ndim(x[0])) 26 | ]) 27 | 28 | 29 | def pad_tensor_n(xs, max_len): 30 | ret = np.zeros((len(xs), max_len) + xs[0].shape[1:], dtype=xs[0].dtype) 31 | for idx, x in enumerate(xs): 32 | ret[idx][:len(x)] = x 33 | return ret 34 | 35 | 36 | def pad_tensor_dict(tensor_dict, max_len, mode='zero'): 37 | keys = list(tensor_dict.keys()) 38 | ret = dict() 39 | for k in keys: 40 | if isinstance(tensor_dict[k], dict): 41 | ret[k] = pad_tensor_dict(tensor_dict[k], max_len, mode=mode) 42 | else: 43 | ret[k] = 
pad_tensor(tensor_dict[k], max_len, mode=mode) 44 | return ret 45 | 46 | 47 | def flatten_first_axis_tensor_dict(tensor_dict): 48 | keys = list(tensor_dict.keys()) 49 | ret = dict() 50 | for k in keys: 51 | if isinstance(tensor_dict[k], dict): 52 | ret[k] = flatten_first_axis_tensor_dict(tensor_dict[k]) 53 | else: 54 | old_shape = tensor_dict[k].shape 55 | ret[k] = tensor_dict[k].reshape((-1,) + old_shape[2:]) 56 | return ret 57 | 58 | 59 | def high_res_normalize(probs): 60 | return [x / sum(map(float, probs)) for x in list(map(float, probs))] 61 | 62 | 63 | def stack_tensor_list(tensor_list): 64 | return np.array(tensor_list) 65 | # tensor_shape = np.array(tensor_list[0]).shape 66 | # if tensor_shape is tuple(): 67 | # return np.array(tensor_list) 68 | # return np.vstack(tensor_list) 69 | 70 | 71 | def stack_tensor_dict_list(tensor_dict_list): 72 | """ 73 | Stack a list of dictionaries of {tensors or dictionary of tensors}. 74 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}. 75 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors} 76 | """ 77 | keys = list(tensor_dict_list[0].keys()) 78 | ret = dict() 79 | for k in keys: 80 | example = tensor_dict_list[0][k] 81 | if isinstance(example, dict): 82 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list]) 83 | else: 84 | v = stack_tensor_list([x[k] for x in tensor_dict_list]) 85 | ret[k] = v 86 | return ret 87 | 88 | 89 | def concat_tensor_list_subsample(tensor_list, f): 90 | return np.concatenate( 91 | [t[np.random.choice(len(t), int(np.ceil(len(t) * f)), replace=False)] for t in tensor_list], axis=0) 92 | 93 | 94 | def concat_tensor_dict_list_subsample(tensor_dict_list, f): 95 | keys = list(tensor_dict_list[0].keys()) 96 | ret = dict() 97 | for k in keys: 98 | example = tensor_dict_list[0][k] 99 | if isinstance(example, dict): 100 | v = concat_tensor_dict_list_subsample([x[k] for x in tensor_dict_list], f) 101 | else: 102 | v = concat_tensor_list_subsample([x[k] for x in tensor_dict_list], f) 103 | ret[k] = v 104 | return ret 105 | 106 | 107 | def concat_tensor_list(tensor_list): 108 | return np.concatenate(tensor_list, axis=0) 109 | 110 | 111 | def concat_tensor_dict_list(tensor_dict_list): 112 | keys = list(tensor_dict_list[0].keys()) 113 | ret = dict() 114 | for k in keys: 115 | example = tensor_dict_list[0][k] 116 | if isinstance(example, dict): 117 | v = concat_tensor_dict_list([x[k] for x in tensor_dict_list]) 118 | else: 119 | v = concat_tensor_list([x[k] for x in tensor_dict_list]) 120 | ret[k] = v 121 | return ret 122 | 123 | 124 | def split_tensor_dict_list(tensor_dict): 125 | keys = list(tensor_dict.keys()) 126 | ret = None 127 | for k in keys: 128 | vals = tensor_dict[k] 129 | if isinstance(vals, dict): 130 | vals = split_tensor_dict_list(vals) 131 | if ret is None: 132 | ret = [{k: v} for v in vals] 133 | else: 134 | for v, cur_dict in zip(vals, ret): 135 | cur_dict[k] = v 136 | return ret 137 | 138 | 139 | def truncate_tensor_list(tensor_list, truncated_len): 140 | return tensor_list[:truncated_len] 141 | 142 | 143 | def truncate_tensor_dict(tensor_dict, truncated_len): 144 | ret = dict() 145 | for k, v in tensor_dict.items(): 146 | if isinstance(v, dict): 147 | ret[k] = truncate_tensor_dict(v, truncated_len) 148 | else: 149 | ret[k] = truncate_tensor_list(v, truncated_len) 150 | return ret 151 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/train_agent.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | #logging.disable(logging.CRITICAL) 3 | 4 | from tabulate import tabulate 5 | from mjrl.utils.make_train_plots import make_train_plots 6 | from mjrl.utils.gym_env import GymEnv 7 | from mjrl.samplers.core import sample_paths 8 | import numpy as np 9 | import pickle 10 | import time as timer 11 | import os 12 | import copy 13 | 14 | 15 | def _load_latest_policy_and_logs(agent, *, policy_dir, logs_dir): 16 | """Loads the latest policy. 17 | Returns the next step number to begin with. 18 | """ 19 | assert os.path.isdir(policy_dir), str(policy_dir) 20 | assert os.path.isdir(logs_dir), str(logs_dir) 21 | 22 | log_csv_path = os.path.join(logs_dir, 'log.csv') 23 | if not os.path.exists(log_csv_path): 24 | return 0 # fresh start 25 | 26 | print("Reading: {}".format(log_csv_path)) 27 | agent.logger.read_log(log_csv_path) 28 | last_step = agent.logger.max_len - 1 29 | if last_step <= 0: 30 | return 0 # fresh start 31 | 32 | 33 | # find latest policy/baseline 34 | i = last_step 35 | while i >= 0: 36 | policy_path = os.path.join(policy_dir, 'policy_{}.pickle'.format(i)) 37 | baseline_path = os.path.join(policy_dir, 'baseline_{}.pickle'.format(i)) 38 | 39 | if not os.path.isfile(policy_path): 40 | i = i -1 41 | continue 42 | else: 43 | print("Loaded last saved iteration: {}".format(i)) 44 | 45 | with open(policy_path, 'rb') as fp: 46 | agent.policy = pickle.load(fp) 47 | with open(baseline_path, 'rb') as fp: 48 | agent.baseline = pickle.load(fp) 49 | 50 | # additional 51 | # global_status_path = os.path.join(policy_dir, 'global_status.pickle') 52 | # with open(global_status_path, 'rb') as fp: 53 | # agent.load_global_status( pickle.load(fp) ) 54 | 55 | agent.logger.shrink_to(i + 1) 56 | assert agent.logger.max_len == i + 1 57 | return agent.logger.max_len 58 | 59 | # cannot find any saved policy 60 | raise RuntimeError("Log file exists, but cannot find any saved policy.") 61 | 62 | def train_agent(job_name, agent, 63 | seed = 0, 64 | niter = 101, 65 | gamma = 0.995, 66 | gae_lambda = None, 67 | num_cpu = 1, 68 | sample_mode = 'trajectories', 69 | num_traj = 50, 70 | num_samples = 50000, # has precedence, used with sample_mode = 'samples' 71 | save_freq = 10, 72 | evaluation_rollouts = None, 73 | plot_keys = ['stoc_pol_mean'], 74 | reward_kwargs = None, 75 | adroit=False 76 | ): 77 | 78 | np.random.seed(seed) 79 | if os.path.isdir(job_name) == False: 80 | os.mkdir(job_name) 81 | previous_dir = os.getcwd() 82 | os.chdir(job_name) # important! we are now in the directory to save data 83 | if os.path.isdir('iterations') == False: os.mkdir('iterations') 84 | if os.path.isdir('logs') == False and agent.save_logs == True: os.mkdir('logs') 85 | best_policy = copy.deepcopy(agent.policy) 86 | best_perf = -1e8 87 | train_curve = best_perf*np.ones(niter) 88 | mean_pol_perf = 0.0 89 | e = GymEnv(agent.env.env.spec.id) 90 | 91 | # Load from any existing checkpoint, policy, statistics, etc. 92 | # Why no checkpointing.. 
:( 93 | #i_start = _load_latest_policy_and_logs(agent, 94 | # policy_dir='iterations', 95 | # logs_dir='logs') 96 | #if i_start: 97 | # print("Resuming from an existing job folder ...") 98 | i_start = 0 99 | for i in range(i_start, niter): 100 | print("......................................................................................") 101 | print("ITERATION : %i " % i) 102 | 103 | if train_curve[i-1] > best_perf: 104 | best_policy = copy.deepcopy(agent.policy) 105 | best_perf = train_curve[i-1] 106 | 107 | N = num_traj if sample_mode == 'trajectories' else num_samples 108 | 109 | args = dict(N=N, sample_mode=sample_mode, gamma=gamma, gae_lambda=gae_lambda, num_cpu=num_cpu, reward_kwargs=reward_kwargs) 110 | # NOTE: Samples are inputed here 111 | stats = agent.train_step(**args) 112 | train_curve[i] = stats[0] 113 | 114 | if evaluation_rollouts is not None and evaluation_rollouts > 0: 115 | print("Performing evaluation rollouts ........") 116 | eval_paths = sample_paths(num_traj=evaluation_rollouts, policy=agent.policy, num_cpu=num_cpu, 117 | env=e.env_id, eval_mode=True, base_seed=seed) 118 | mean_pol_perf = np.mean([np.sum(path['rewards']) for path in eval_paths]) 119 | if agent.save_logs: 120 | agent.logger.log_kv('eval_score', mean_pol_perf) 121 | 122 | if save_freq != 0 and i > 0 and i % save_freq == 0: 123 | if agent.save_logs: 124 | agent.logger.save_log('logs/') 125 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/') 126 | policy_file = 'policy_%i.pickle' % i 127 | baseline_file = 'baseline_%i.pickle' % i 128 | pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb')) 129 | pickle.dump(agent.baseline, open('iterations/' + baseline_file, 'wb')) 130 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb')) 131 | # pickle.dump(agent.global_status, open('iterations/global_status.pickle', 'wb')) 132 | 133 | # print results to console 134 | if i == 0: 135 | result_file = open('results.txt', 'w') 136 | print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n") 137 | result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n") 138 | result_file.close() 139 | if not adroit: 140 | print("[ %s ] %4i %5.2f %5.2f %5.2f " % (timer.asctime(timer.localtime(timer.time())), 141 | i, train_curve[i], mean_pol_perf, best_perf)) 142 | else: 143 | print("[ %s ] %4i %5.2f %5.2f %5.2f %5.2f" % (timer.asctime(timer.localtime(timer.time())), 144 | i, train_curve[i], mean_pol_perf, best_perf, stats[-1])) 145 | 146 | result_file = open('results.txt', 'a') 147 | result_file.write("%4i %5.2f %5.2f %5.2f \n" % (i, train_curve[i], mean_pol_perf, best_perf)) 148 | result_file.close() 149 | if agent.save_logs: 150 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1, 151 | agent.logger.get_current_log().items())) 152 | print(tabulate(print_data)) 153 | 154 | # final save 155 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb')) 156 | if agent.save_logs: 157 | agent.logger.save_log('logs/') 158 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/') 159 | os.chdir(previous_dir) 160 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/visualize_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import mjrl.envs 3 | import click 4 | import os 5 | import gym 6 | import numpy as np 7 | import pickle 8 | from mjrl.utils.gym_env import GymEnv 9 | from mjrl.policies.gaussian_mlp import MLP 10 | import 
trajopt.envs 11 | 12 | DESC = ''' 13 | Helper script to visualize policy (in mjrl format).\n 14 | USAGE:\n 15 | Visualizes policy on the env\n 16 | $ python utils/visualize_policy --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n 17 | ''' 18 | 19 | # MAIN ========================================================= 20 | @click.command(help=DESC) 21 | @click.option('--env_name', type=str, help='environment to load', required= True) 22 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None) 23 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation') 24 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123) 25 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10) 26 | 27 | def main(env_name, policy, mode, seed, episodes): 28 | e = GymEnv(env_name) 29 | e.set_seed(seed) 30 | if policy is not None: 31 | pi = pickle.load(open(policy, 'rb')) 32 | else: 33 | pi = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=-1.0) 34 | # render policy 35 | e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode) 36 | 37 | if __name__ == '__main__': 38 | main() 39 | 40 | -------------------------------------------------------------------------------- /mjrl/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from setuptools import setup, find_packages 4 | 5 | print("Installing mjrl. \n Package intended for use with provided conda env. See setup instructions here: https://github.com/aravindr93/mjrl/tree/master/setup") 6 | 7 | if sys.version_info.major != 3: 8 | print("This Python is only compatible with Python 3, but you are running " 9 | "Python {}. The installation will likely fail.".format(sys.version_info.major)) 10 | 11 | def read(fname): 12 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 13 | 14 | setup( 15 | name='mjrl', 16 | version='1.0.0', 17 | packages=find_packages(), 18 | description='RL algorithms for environments in MuJoCo', 19 | long_description=read('README.md'), 20 | url='https://github.com/aravindr93/mjrl.git', 21 | author='Aravind Rajeswaran', 22 | ) 23 | -------------------------------------------------------------------------------- /mjrl/setup/README.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | A short guide to install this package is below. The package relies on `mujoco-py` which might be the trickiest part of the installation. See `known issues` below and also instructions from the mujoco-py [page](https://github.com/openai/mujoco-py) if you are stuck with mujoco-py installation. 4 | 5 | The package can handle both `MuJoCo v1.5` as well as `MuJoCo v2.0`, but the former is not supported for future updates. We encourage you to use v2.0. 6 | 7 | ## Linux 8 | 9 | - Download MuJoCo v2.0 binaries from the official [website](http://www.mujoco.org/) and also obtain the license key. 10 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200`, and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`. Note that unzip of the MuJoCo binaries will generate `mujoco200_linux`. You need to rename the directory and place it at `~/.mujoco/mujoco200`. 
11 | - Install osmesa related dependencies: 12 | ``` 13 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev build-essential libglfw3 14 | ``` 15 | - Update `bashrc` by adding the following lines and source it 16 | ``` 17 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH" 18 | export MUJOCO_PY_FORCE_CPU=True 19 | alias MJPL='LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so:/usr/lib/nvidia-384/libGL.so' 20 | ``` 21 | - Install this package using 22 | ``` 23 | $ conda update conda 24 | $ cd path/to/mjrl 25 | $ conda env create -f setup/env.yml 26 | $ source activate mjrl-env 27 | $ pip install -e . 28 | ``` 29 | - *NOTE 1:* If there are issues with the installation of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly based on the specific version of CUDA (or CPU-only) you have. 30 | 31 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info. 32 | 33 | ## Mac OS 34 | 35 | - Download MuJoCo binaries from the official [website](http://www.mujoco.org/) and also obtain the license key. 36 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200` (rename unzipped directory to this), and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`. 37 | - Update `bashrc` by adding the following lines and source it 38 | ``` 39 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH" 40 | ``` 41 | - Install this package using 42 | ``` 43 | $ conda update conda 44 | $ cd path/to/mjrl 45 | $ conda env create -f setup/env.yml 46 | $ source activate mjrl-env 47 | $ pip install -e . 48 | ``` 49 | 50 | - *NOTE 1:* If there are issues with the installation of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly. 51 | 52 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info. 53 | 54 | 55 | ## Known Issues 56 | 57 | - Visualization in Linux: If the Linux system has a GPU, then mujoco-py does not automatically preload the correct drivers. We added an alias `MJPL` in `bashrc` (see instructions above), which stands for MuJoCo pre-load. When running any Python script that requires rendering, prepend the execution with `MJPL`. 58 | ``` 59 | $ MJPL python script.py 60 | ``` 61 | 62 | - Errors related to osmesa during installation. This is a `mujoco-py` build error and would likely go away if the following command is used before creating the conda environment. If the problem still persists, please contact the developers of mujoco-py. 63 | ``` 64 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev 65 | ``` 66 | 67 | - If conda environment creation gets interrupted for some reason, you can resume it with the following: 68 | ``` 69 | $ conda env update -n mjrl-env -f setup/env.yml 70 | ``` 71 | 72 | - GCC error in Mac OS: If you get a GCC error from mujoco-py, you can get the correct version mujoco-py expects with `brew install gcc --without-multilib`. This may require uninstalling other versions of GCC that may have been previously installed with `brew remove gcc@6` for example. You can see which brew packages were already installed with `brew list`. 
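## Verifying the install

A quick way to confirm that `mujoco-py`, `gym`, and this package are wired up correctly is to build one of the bundled environments from inside the activated `mjrl-env` environment. The snippet below is a minimal sketch (it assumes the MuJoCo binaries and `mjkey.txt` are in place as described above) and exercises the same `GymEnv` wrapper used by the scripts in `tests/`.
```
import mujoco_py                   # import should fail here if the MuJoCo binaries or license key are not found
import mjrl.envs                   # registers the mjrl_* environments with gym
from mjrl.utils.gym_env import GymEnv

e = GymEnv('mjrl_point_mass-v0')   # same environment used by tests/point_mass_test.py
e.set_seed(123)
obs = e.reset()
print("horizon:", e.horizon, "| initial observation shape:", obs.shape)
```
If this prints a horizon and an observation shape without errors, `tests/point_mass_test.py` is a good next smoke test.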
73 | 74 | -------------------------------------------------------------------------------- /mjrl/setup/env.yml: -------------------------------------------------------------------------------- 1 | name: mjrl-env 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - python=3.7 7 | - pip 8 | - ipython 9 | - mkl-service 10 | - pytorch==1.4 11 | - tabulate 12 | - termcolor 13 | - torchvision 14 | - patchelf 15 | - pip: 16 | - click 17 | - cloudpickle 18 | - gym==0.13 19 | - ipdb 20 | - matplotlib 21 | - mujoco-py<2.1,>=2.0 22 | - pip 23 | - pyyaml 24 | - tqdm 25 | - wheel 26 | - scipy 27 | - transforms3d 28 | -------------------------------------------------------------------------------- /mjrl/tests/point_mass_test.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 4 | from mjrl.baselines.mlp_baseline import MLPBaseline 5 | from mjrl.algos.npg_cg import NPG 6 | from mjrl.utils.train_agent import train_agent 7 | import mjrl.envs 8 | import time as timer 9 | SEED = 500 10 | 11 | e = GymEnv('mjrl_point_mass-v0') 12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 13 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=10, learn_rate=1e-3) 14 | agent = NPG(e, policy, baseline, normalized_step_size=0.05, seed=SEED, save_logs=True) 15 | 16 | ts = timer.time() 17 | train_agent(job_name='point_mass_exp1', 18 | agent=agent, 19 | seed=SEED, 20 | niter=50, 21 | gamma=0.95, 22 | gae_lambda=0.97, 23 | num_cpu=1, 24 | sample_mode='trajectories', 25 | num_traj=40, # samples = 40*25 = 1000 26 | save_freq=5, 27 | evaluation_rollouts=None, 28 | plot_keys=['stoc_pol_mean', 'running_score']) 29 | print("time taken = %f" % (timer.time()-ts)) 30 | -------------------------------------------------------------------------------- /mjrl/tests/visualizer_test.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 4 | from mjrl.baselines.mlp_baseline import MLPBaseline 5 | from mjrl.algos.npg_cg import NPG 6 | from mjrl.utils.train_agent import train_agent 7 | import mjrl.envs 8 | import time as timer 9 | SEED = 500 10 | 11 | e = GymEnv('mjrl_point_mass-v0') 12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 13 | baseline = QuadraticBaseline(e.spec) 14 | agent = NPG(e, policy, baseline, normalized_step_size=0.5, seed=SEED, save_logs=True) 15 | 16 | ts = timer.time() 17 | train_agent(job_name='vis_exp', 18 | agent=agent, 19 | seed=SEED, 20 | niter=10, 21 | gamma=0.95, 22 | gae_lambda=0.97, 23 | num_cpu=1, 24 | sample_mode='trajectories', 25 | num_traj=100, 26 | save_freq=5, 27 | evaluation_rollouts=None) 28 | print("time taken = %f" % (timer.time()-ts)) 29 | e.visualize_policy(policy, num_episodes=5, horizon=e.horizon, mode='exploration') 30 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.4.1 2 | mujoco-py==2.0.2.13 3 | numpy==1.20.2 4 | tabulate==0.8.9 5 | tensorboard==2.5.0 6 | tensorboard-data-server==0.6.1 7 | tensorboard-plugin-wit==1.8.0 8 | torch==1.8.1 9 | torchaudio==0.8.0a0+e4e171a 10 | torchvision==0.9.1 11 | tqdm==4.60.0 12 | 
--------------------------------------------------------------------------------