├── .gitignore
├── README.md
├── example_run.sh
├── humanoid_fig.png
├── milo
│ ├── milo
│ │ ├── __init__.py
│ │ ├── cost
│ │ │ ├── __init__.py
│ │ │ └── linear_cost.py
│ │ ├── dataset
│ │ │ ├── __init__.py
│ │ │ └── datasets.py
│ │ ├── dynamics_model
│ │ │ ├── __init__.py
│ │ │ └── mlp_dynamics.py
│ │ ├── gym_env
│ │ │ ├── __init__.py
│ │ │ ├── ant.py
│ │ │ ├── half_cheetah.py
│ │ │ ├── hopper.py
│ │ │ ├── humanoid.py
│ │ │ ├── multiprocessing_env.py
│ │ │ ├── walker2d.py
│ │ │ └── wrappers.py
│ │ ├── sampler
│ │ │ ├── __init__.py
│ │ │ └── sampler.py
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── arguments.py
│ │ │ ├── evaluate.py
│ │ │ ├── logger.py
│ │ │ └── util.py
│ └── setup.py
├── mjrl
│ ├── .gitignore
│ ├── LICENSE
│ ├── README.md
│ ├── examples
│ │ ├── README.md
│ │ ├── behavior_clone.py
│ │ ├── example_configs
│ │ │ ├── hopper_npg.txt
│ │ │ ├── swimmer_npg.txt
│ │ │ └── swimmer_ppo.txt
│ │ ├── linear_nn_comparison.py
│ │ └── policy_opt_job_script.py
│ ├── mjrl
│ │ ├── __init__.py
│ │ ├── algos
│ │ │ ├── __init__.py
│ │ │ ├── batch_reinforce.py
│ │ │ ├── behavior_cloning.py
│ │ │ ├── dapg.py
│ │ │ ├── mbac.py
│ │ │ ├── model_accel
│ │ │ │ ├── __init__.py
│ │ │ │ ├── model_accel_npg.py
│ │ │ │ ├── model_learning_mpc.py
│ │ │ │ ├── nn_dynamics.py
│ │ │ │ ├── run_experiments
│ │ │ │ │ ├── configs
│ │ │ │ │ │ ├── point_mass.txt
│ │ │ │ │ │ └── reacher.txt
│ │ │ │ │ ├── run_model_accel_npg.py
│ │ │ │ │ ├── sandbox
│ │ │ │ │ │ ├── example_config_mpc.txt
│ │ │ │ │ │ └── run_model_learning_mpc.py
│ │ │ │ │ └── utils
│ │ │ │ │ │ ├── reward_functions
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ └── mjrl_point_mass.py
│ │ │ │ │ │ ├── visualize_policy.py
│ │ │ │ │ │ └── visualize_trajectories.py
│ │ │ │ └── sampling.py
│ │ │ ├── npg_cg.py
│ │ │ ├── ppo_clip.py
│ │ │ └── trpo.py
│ │ ├── baselines
│ │ │ ├── __init__.py
│ │ │ ├── linear_baseline.py
│ │ │ ├── mlp_baseline.py
│ │ │ ├── quadratic_baseline.py
│ │ │ └── zero_baseline.py
│ │ ├── envs
│ │ │ ├── __init__.py
│ │ │ ├── assets
│ │ │ │ ├── peg_insertion.xml
│ │ │ │ ├── point_mass.xml
│ │ │ │ ├── sawyer.xml
│ │ │ │ └── swimmer.xml
│ │ │ ├── mujoco_env.py
│ │ │ ├── peg_insertion_sawyer.py
│ │ │ ├── point_mass.py
│ │ │ ├── reacher_sawyer.py
│ │ │ └── swimmer.py
│ │ ├── policies
│ │ │ ├── __init__.py
│ │ │ ├── gaussian_linear.py
│ │ │ ├── gaussian_mlp.py
│ │ │ └── mpc_actor.py
│ │ ├── samplers
│ │ │ ├── __init__.py
│ │ │ └── core.py
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── cg_solve.py
│ │ │ ├── fc_network.py
│ │ │ ├── get_environment.py
│ │ │ ├── gym_env.py
│ │ │ ├── logger.py
│ │ │ ├── make_train_plots.py
│ │ │ ├── optimize_model.py
│ │ │ ├── process_samples.py
│ │ │ ├── tensor_utils.py
│ │ │ ├── train_agent.py
│ │ │ └── visualize_policy.py
│ ├── setup.py
│ ├── setup
│ │ ├── README.md
│ │ └── env.yml
│ └── tests
│ │ ├── point_mass_test.py
│ │ └── visualizer_test.py
├── requirements.txt
├── run.py
└── run_hand.py
/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 |
129 | # Pyre type checker
130 | .pyre/
131 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Source Code for Model-based Imitation Learning from Offline data (MILO)
2 | Implementation of MILO, a model-based, offline imitation learning algorithm.
3 |
4 | 
5 |
6 | Link to the paper: https://arxiv.org/abs/2106.03207
7 |
8 | ## Notes on Installation
9 | After cloning this repository and installing the requirements, please run
10 |
11 | `cd milo && pip install -e .`
12 |
13 | `cd mjrl && pip install -e .`
14 |
15 | The experiments run on the MuJoCo physics engine, which requires a license to install. Please follow the instructions on the [MuJoCo website](http://www.mujoco.org).
16 |
17 | ## Overview
18 | The `milo` package contains our imitation learning code, the model-based environment stack, and boilerplate utilities. We modified the `mjrl` package to interface with our cost functions when running model-based policy gradient; the modification can be seen in `mjrl/mjrl/algos/batch_reinforce.py`. Note that we currently only support NPG/TRPO as the policy gradient algorithm, although in principle it could be replaced with other algorithms/repositories.
19 |
20 | ## Environments Supported
21 | This repository supports 5 modified MuJoCo environments that can be found in `milo/milo/gym_env`. They are
22 | 1. Hopper-v4
23 | 2. Walker2d-v4
24 | 3. HalfCheetah-v4
25 | 4. Ant-v4
26 | 5. Humanoid-v4
27 |
28 | If you would like to add an environment, register the environment in `/milo/milo/gym_env/__init__.py` according to [OpenAI Gym](http://gym.openai.com/docs/#environments) instructions.
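A registration entry follows the same pattern already used in `milo/milo/gym_env/__init__.py`. Here is a minimal sketch; the id, entry point, and episode length below are placeholders for your own environment, not part of MILO:

```python
# Hypothetical example: register a custom environment so gym.make can find it.
# 'MyEnv-v0', the entry_point path, and max_episode_steps are placeholders.
from gym.envs.registration import register

register(
    id='MyEnv-v0',
    entry_point='milo.gym_env.my_env:MyEnv',  # module path to your env class
    max_episode_steps=500,
)
```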
29 |
30 | ## Downloading the Datasets
31 | Please download the datasets from this [google drive link](https://drive.google.com/drive/folders/1gG2WIgL1mdznhuel5uKRb6lepF7EVeFr?usp=sharing). Each environment will have 2 datasets: `[ENV]_expert.pt` and `[ENV]_offline.pt`.
32 |
33 | In the `data` directory, place the expert and offline datasets in the `data/expert_data` and `data/offline_data` directories, respectively.
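As a quick sanity check after downloading, here is a small sketch (assuming `torch` is installed; the environment name is a placeholder following the `[ENV]_expert.pt` / `[ENV]_offline.pt` convention above, and the internal structure of the `.pt` files is not assumed):

```python
# Sketch: confirm the datasets sit in the locations the README expects.
from pathlib import Path
import torch

env_name = "Hopper-v4"  # placeholder; match the files you actually downloaded
expert_path = Path("data/expert_data") / f"{env_name}_expert.pt"
offline_path = Path("data/offline_data") / f"{env_name}_offline.pt"

for path in (expert_path, offline_path):
    assert path.exists(), f"Missing dataset: {path}"
    db = torch.load(path)  # load only to confirm the file is readable
    print(path, type(db))
```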
34 |
35 | ## Running an Experiment
36 | We provide an example run script for Hopper, `example_run.sh`, which can be modified to work with any other registered environment. For the full list of available arguments, see the argparse setup in `milo/milo/utils/arguments.py`.
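To preview what a particular flag combination resolves to without launching training, one option (a sketch, assuming the `milo` package is installed) is to call the argument parser directly:

```python
# Sketch: parse a flag combination without starting a run; run.py remains the real entry point.
import sys
from milo.utils.arguments import get_args

sys.argv = ["run.py", "--env", "Hopper-v4", "--seed", "100",
            "--n_models", "4", "--lambda_b", "0.0025"]
args = get_args()
print(args.env, args.seed, args.n_models, args.lambda_b)
```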
37 |
38 | ## Bibliography
39 | To cite this work, please use the following citation. Note that this repository builds upon MJRL so please also cite any references noted in the README [here](https://github.com/aravindr93/mjrl).
40 | ```
41 | @misc{chang2021mitigating,
42 | title={Mitigating Covariate Shift in Imitation Learning via Offline Data Without Great Coverage},
43 | author={Jonathan D. Chang and Masatoshi Uehara and Dhruv Sreenivas and Rahul Kidambi and Wen Sun},
44 | year={2021},
45 | eprint={2106.03207},
46 | archivePrefix={arXiv},
47 | primaryClass={cs.LG}
48 | }
49 | ```
50 |
--------------------------------------------------------------------------------
/example_run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python run.py --env Hopper-v4 \
4 | --seed 100 \
5 | --expert_db Hopper-v6_expert.pt \
6 | --offline_db Hopper-v6_offline.pt \
7 | --n_models 4 \
8 | --lambda_b 0.0025 \
9 | --samples_per_step 40000 \
10 | --pg_iter 1 \
11 | --bw_quantile 0.1 \
12 | --id 1 \
13 | --subsample_expert \
14 | --n_iter 300 \
15 | --cg_iter 25 \
16 | --bc_epochs 1 \
17 | --do_bc_reg \
18 | --bc_reg_coeff 0.1
19 |
--------------------------------------------------------------------------------
/humanoid_fig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/humanoid_fig.png
--------------------------------------------------------------------------------
/milo/milo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/milo/milo/__init__.py
--------------------------------------------------------------------------------
/milo/milo/cost/__init__.py:
--------------------------------------------------------------------------------
1 | from milo.cost.linear_cost import RBFLinearCost
2 |
--------------------------------------------------------------------------------
/milo/milo/cost/linear_cost.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | import numpy as np
5 |
6 | class RBFLinearCost:
7 | """
8 | MMD cost implementation with rff feature representations
9 |
10 | NOTE: Currently hardcoded to cpu
11 |
12 | :param expert_data: (torch Tensor) expert data used for feature matching
13 | :param feature_dim: (int) feature dimension for rff
14 | :param input_type: (str) state (s), state-action (sa), state-next state (ss),
15 | state-action-next state (sas)
16 | :param cost_range: (list) inclusive range of costs
17 | :param bw_quantile: (float) quantile used to fit bandwidth for rff kernel
18 | :param bw_samples: (int) number of samples used to fit bandwidth
19 | :param lambda_b: (float) weight parameter for bonus and cost
20 | :param lr: (float) learning rate for discriminator/cost update. 0.0 = closed form update
21 | :param seed: (int) random seed to set cost function
22 | """
23 | def __init__(self,
24 | expert_data,
25 | feature_dim=1024,
26 | input_type='sa',
27 | cost_range=[-1.,0.],
28 | bw_quantile=0.1,
29 | bw_samples=100000,
30 | lambda_b=1.0,
31 | lr=0.0,
32 | seed=100):
33 |
34 | # Set Random Seed
35 | torch.manual_seed(seed)
36 | np.random.seed(seed)
37 |
38 | self.expert_data = expert_data
39 | input_dim = expert_data.size(1)
40 | self.input_type = input_type
41 | self.feature_dim = feature_dim
42 | self.cost_range = cost_range
43 | if cost_range is not None:
44 | self.c_min, self.c_max = cost_range
45 | self.lambda_b = lambda_b
46 | self.lr = lr
47 |
48 | # Fit Bandwidth
49 | self.quantile = bw_quantile
50 | self.bw_samples = bw_samples
51 | self.bw = self.fit_bandwidth(expert_data)
52 |
53 | # Define Phi and Cost weights
54 | self.rff = nn.Linear(input_dim, feature_dim)
55 | self.rff.bias.data = (torch.rand_like(self.rff.bias.data)-0.5)*2.0*np.pi
56 | self.rff.weight.data = torch.rand_like(self.rff.weight.data)/(self.bw+1e-8)
57 |
58 | # W Update Init
59 | self.w = None
60 |
61 | # Compute Expert Phi Mean
62 | self.expert_rep = self.get_rep(expert_data)
63 | self.phi_e = self.expert_rep.mean(dim=0)
64 |
65 | def get_rep(self, x):
66 | """
67 | Returns an RFF representation given an input
68 | """
69 | with torch.no_grad():
70 | out = self.rff(x.cpu())
71 | out = torch.cos(out)*np.sqrt(2/self.feature_dim)
72 | return out
73 |
74 | def fit_bandwidth(self, data):
75 | """
76 | Uses the median trick to fit the bandwidth for the RFF kernel
77 | """
78 | num_data = data.shape[0]
79 | idxs_0 = torch.randint(low=0, high=num_data, size=(self.bw_samples,))
80 | idxs_1 = torch.randint(low=0, high=num_data, size=(self.bw_samples,))
81 | norm = torch.norm(data[idxs_0, :]-data[idxs_1, :], dim=1)
82 | bw = torch.quantile(norm, q=self.quantile).item()
83 | return bw
84 |
85 | def fit_cost(self, data_pi):
86 | """
87 | Updates the weights of the cost with the closed form solution
88 | """
89 | phi = self.get_rep(data_pi).mean(0)
90 | feat_diff = phi - self.phi_e
91 |
92 | # Closed form solution
93 | self.w = feat_diff
94 |
95 | return torch.dot(self.w, feat_diff).item()
96 |
97 | def get_costs(self, x):
98 | """
99 | Returns the IPM (MMD) cost for a given input
100 | """
101 | data = self.get_rep(x)
102 | if self.cost_range is not None:
103 | return torch.clamp(torch.mm(data, self.w.unsqueeze(1)), self.c_min, self.c_max)
104 | return torch.mm(data, self.w.unsqueeze(1))
105 |
106 | def get_expert_cost(self):
107 | """
108 | Returns the mean expert cost given our current discriminator weights and representations
109 | """
110 | return (1-self.lambda_b)*torch.clamp(torch.mm(self.expert_rep, self.w.unsqueeze(1)), self.c_min, self.c_max).mean()
111 |
112 | def get_bonus_costs(self, states, actions, ensemble, next_states=None):
113 | """
114 | Computes the cost with pessimism
115 | """
116 | if self.input_type == 'sa':
117 | rff_input = torch.cat([states, actions], dim=1)
118 | elif self.input_type == 'ss':
119 | assert(next_states is not None)
120 | rff_input = torch.cat([states, next_states], dim=1)
121 | elif self.input_type == 'sas':
122 | rff_input = torch.cat([states, actions, next_states], dim=1)
123 | elif self.input_type == 's':
124 | rff_input = states
125 | else:
126 | raise NotImplementedError("Input type not implemented")
127 |
128 | # Get Linear Cost
129 | rff_cost = self.get_costs(rff_input)
130 |
131 | if self.cost_range is not None:
132 | # Get Bonus from Ensemble
133 | discrepancy = ensemble.get_action_discrepancy(states, actions)/ensemble.threshold
134 | discrepancy = discrepancy.view(-1, 1)
135 | discrepancy[discrepancy>1.0] = 1.0
136 | # Bonus is LOW if (s,a) is unknown
137 | bonus = discrepancy * self.c_min
138 | else:
139 | bonus = ensemble.get_action_discrepancy(states, actions).view(-1,1)
140 |
141 | # Weight cost components
142 | ipm = (1-self.lambda_b)*rff_cost
143 |
144 | # Conservative/Pessimism Penalty term
145 | weighted_bonus = self.lambda_b*bonus.cpu() # Note cpu hardcoding
146 |
147 | # Cost
148 | cost = ipm - weighted_bonus
149 |
150 | # Logging info
151 | info = {'bonus': weighted_bonus, 'ipm': ipm, 'v_targ': rff_cost, 'cost': cost}
152 |
153 | return cost, info
154 |
155 |
--------------------------------------------------------------------------------
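`RBFLinearCost` above fits a closed-form linear cost in random Fourier feature space: `fit_cost` sets the weight vector to the difference between the policy and expert feature means, and `get_costs` returns the (optionally clipped) linear cost for new inputs. A minimal usage sketch with synthetic state-action tensors follows; the dimensions and data are placeholders, and the ensemble-based pessimism bonus via `get_bonus_costs` is omitted because it needs a trained dynamics ensemble:

```python
# Usage sketch for RBFLinearCost with synthetic (s, a) data; real usage passes
# expert and policy state-action tensors. Dimensions below are placeholders.
import torch
from milo.cost import RBFLinearCost

state_dim, action_dim = 11, 3
expert_sa = torch.randn(5000, state_dim + action_dim)
policy_sa = torch.randn(8000, state_dim + action_dim)

cost_fn = RBFLinearCost(expert_sa, feature_dim=512, input_type='sa', lambda_b=0.1)
mmd_sq = cost_fn.fit_cost(policy_sa)   # closed-form weight update; returns squared feature-mean gap
costs = cost_fn.get_costs(policy_sa)   # clipped linear costs, shape (N, 1)
print(mmd_sq, costs.shape)
```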
/milo/milo/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | from milo.dataset.datasets import OfflineDataset
2 |
--------------------------------------------------------------------------------
/milo/milo/dataset/datasets.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | from gym.spaces import Discrete, Box
4 |
5 | import torch
6 | from torch.utils.data import Dataset
7 |
8 |
9 | class OfflineDataset(Dataset):
10 | """
11 | Pytorch Dataset class for our offline dataset. Note we return (s,a,s') triples.
12 | :param env_name: (str) name of gym environment
13 | :param state: (torch Tensor) tensor with shape (number of samples, state dimension) with state data
14 | :param action: (torch Tensor) tensor with shape (number of samples, action dimension) with action data
15 | :param next_state: (torch Tensor) tensor with shape (number of samples, state dimension) with next state data
16 | :param device: (torch Device) device for pytorch. Currently hardcoded to cpu
17 | """
18 | def __init__(self, env_name, state, action, next_state, device=torch.device('cpu')):
19 | self.device = device
20 | self.state = state
21 | self.action = action
22 |
23 | env = gym.make(env_name)
24 | if isinstance(env.action_space, Discrete):
25 | self.action = self.one_hot(action, env.action_space.n)
26 | elif isinstance(env.action_space, Box):
27 | self.action = action
28 | else:
29 | raise NotImplementedError(
30 | "Environment Action Space not yet supported")
31 | self.next_state = next_state
32 | del env
33 |
34 | def get_transformations(self):
35 | diff = self.next_state - self.state
36 |
37 | # Compute Means
38 | state_mean = self.state.mean(dim=0).float().requires_grad_(False)
39 | action_mean = self.action.mean(dim=0).float().requires_grad_(False)
40 | diff_mean = diff.mean(dim=0).float().requires_grad_(False)
41 |
42 | # Compute Scales
43 | state_scale = torch.abs(
44 | self.state - state_mean).mean(dim=0).float().requires_grad_(False) + 1e-8
45 | action_scale = torch.abs(
46 | self.action - action_mean).mean(dim=0).float().requires_grad_(False) + 1e-8
47 | diff_scale = torch.abs(
48 | diff - diff_mean).mean(dim=0).float().requires_grad_(False) + 1e-8
49 |
50 | return state_mean.to(self.device), state_scale.to(self.device), action_mean.to(self.device), \
51 | action_scale.to(self.device), diff_mean.to(
52 | self.device), diff_scale.to(self.device)
53 |
54 | def one_hot(self, action, action_dim):
55 | db_size = action.size(0)
56 | one_hot_action = torch.eye(action_dim)[action]
57 | return one_hot_action.view(db_size, action_dim)
58 |
59 | def __len__(self):
60 | return self.state.size(0)
61 |
62 | def __getitem__(self, idx):
63 | return self.state[idx].float(), self.action[idx].float(), self.next_state[idx].float()
64 |
--------------------------------------------------------------------------------
/milo/milo/dynamics_model/__init__.py:
--------------------------------------------------------------------------------
1 | from milo.dynamics_model.mlp_dynamics import DynamicsEnsemble, DynamicsModel
2 |
--------------------------------------------------------------------------------
/milo/milo/gym_env/__init__.py:
--------------------------------------------------------------------------------
1 | from milo.gym_env.wrappers import model_based_env
2 | from milo.gym_env.multiprocessing_env import MujocoEnvProcess
3 | from gym.envs.registration import register
4 |
5 | register(
6 | id='Hopper-v4',
7 | entry_point='milo.gym_env.hopper:HopperEnv',
8 | max_episode_steps=400,
9 | reward_threshold=3800.0,
10 | )
11 |
12 | register(
13 | id='Walker2d-v4',
14 | max_episode_steps=400,
15 | entry_point='milo.gym_env.walker2d:Walker2dEnv'
16 | )
17 |
18 | register(
19 | id='HalfCheetah-v4',
20 | entry_point='milo.gym_env.half_cheetah:HalfCheetahEnv',
21 | max_episode_steps=500,
22 | reward_threshold=4800.0,
23 | )
24 |
25 | register(
26 | id='Ant-v4',
27 | entry_point='milo.gym_env.ant:AntEnv',
28 | max_episode_steps=500,
29 | reward_threshold=6000.0,
30 | )
31 |
32 | register(
33 | id='Humanoid-v4',
34 | entry_point='milo.gym_env.humanoid:HumanoidEnv',
35 | max_episode_steps=500,
36 | )
37 |
--------------------------------------------------------------------------------
/milo/milo/gym_env/ant.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from gym.envs.mujoco import mujoco_env
4 |
5 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle):
6 | def __init__(self):
7 | self.pos_before = np.array([0.0, 0.0])
8 | mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 10)
9 | # mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 5)
10 | utils.EzPickle.__init__(self)
11 |
12 | def step(self, a):
13 | self.pos_before = self.data.qpos[:2].copy()
14 | self.do_simulation(a, self.frame_skip)
15 | obs = self._get_obs()
16 | reward = self.get_reward(obs, a)
17 | done = self.get_done(obs)
18 | return obs, reward, done, {}
19 |
20 | def _get_obs(self):
21 | delta = self.data.qpos[:2] - self.pos_before
22 | return np.concatenate([
23 | delta,
24 | self.sim.data.qpos.flat[2:],
25 | self.sim.data.qvel.ravel() * self.dt,
26 | # NOTE: We are throwing away contact related info, since it is often unnecessary
27 | # np.clip(self.sim.data.cfrc_ext, -1, 1).flat,
28 | ])
29 |
30 | def get_reward(self, obs, act):
31 | obs = np.clip(obs, -10.0, 10.0)
32 | if len(obs.shape) == 1:
33 | # vector obs, called when stepping the env
34 | vel_x = obs[0] / self.dt # recover velocity from delta
35 | power = np.square(act).sum()
36 | # NOTE: We will use the contact force penalties for actual reward
37 | # to be consistent with gym results
38 | cfrc_ext = np.clip(self.sim.data.cfrc_ext, -1, 1).ravel()
39 | height = obs[2]
40 | reward = - 0.5 * 1e-3 * np.square(cfrc_ext).sum() # contact cost
41 | else:
42 | # for imaginary rollouts using learned model
43 | vel_x = obs[:, :, 0] / self.dt # recover velocity from delta
44 | power = np.square(act).sum(axis=-1)
45 | height = obs[:, :, 2]
46 | # NOTE: We will not consider contact costs for imaginary rollouts
47 | reward = 0.0
48 | survive_reward = 1.0 * (height > 0.2) * (height < 1.0)
49 | ctrl_cost = 0.5 * power
50 | reward += vel_x - ctrl_cost + survive_reward
51 | reward = reward * 2.0 # to account for scaling difference (skip 5 --> 10)
52 | return reward
53 |
54 | def compute_path_rewards(self, paths):
55 | # path has two keys: observations and actions
56 | # path["observations"] : (num_traj, horizon, obs_dim)
57 | # path["rewards"] should have shape (num_traj, horizon)
58 | obs = paths["observations"]
59 | act = paths["actions"]
60 | rewards = self.get_reward(obs, act)
61 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
62 | return paths
63 |
64 | def get_done(self, obs):
65 | done = not (np.isfinite(obs).all() and (obs[2] > 0.2) and (obs[2] < 1.0))
66 | return done
67 |
68 | def truncate_paths(self, paths):
69 | for path in paths:
70 | obs = path["observations"]
71 | height = obs[:,2]#obs[:, 0]
72 | T = obs.shape[0]
73 | t = 0
74 | done = False
75 | while t < T and done is False:
76 | done = not (np.isfinite(obs[t]).all() and (height[t] > 0.2) and (height[t] < 1.0))
77 | T = t if done else T
78 | t = t + 1
79 | path["observations"] = path["observations"][:T]
80 | path["actions"] = path["actions"][:T]
81 | path["rewards"] = path["rewards"][:T]
82 | path["terminated"] = done
83 | return paths
84 |
85 | def reset_model(self):
86 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1)
87 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
88 | self.set_state(qpos, qvel)
89 | return self._get_obs()
90 |
91 | def get_env_state(self):
92 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy())
93 |
94 | def set_env_state(self, state):
95 | qpos = state['qpos']
96 | qvel = state['qvel']
97 | self.sim.reset()
98 | self.data.qpos[:] = qpos
99 | self.data.qvel[:] = qvel
100 | self.sim.forward()
101 |
102 | def viewer_setup(self):
103 | self.viewer.cam.distance = self.model.stat.extent * 0.5
104 |
--------------------------------------------------------------------------------
/milo/milo/gym_env/half_cheetah.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from gym.envs.mujoco import mujoco_env
4 |
5 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
6 | def __init__(self):
7 | self.pos_before = 0.0
8 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 10)
9 | utils.EzPickle.__init__(self)
10 |
11 | def step(self, a):
12 | self.pos_before = self.data.qpos[0].copy()
13 | self.do_simulation(a, self.frame_skip)
14 | obs = self._get_obs()
15 | reward = self.get_reward(obs, a)
16 | done = False # no termination for this env
17 | return obs, reward, done, {}
18 |
19 | def _get_obs(self):
20 | delta = self.data.qpos[0] - self.pos_before
21 | return np.concatenate([
22 | [delta],
23 | self.sim.data.qpos.ravel()[1:],
24 | self.sim.data.qvel.ravel() * self.dt,
25 | ])
26 |
27 | def get_reward(self, obs, act):
28 | obs = np.clip(obs, -10.0, 10.0)
29 | if len(obs.shape) == 1:
30 | # vector obs, called when stepping the env
31 | # vel_x = obs[-9] / self.dt # recover velocity from delta
32 | vel_x = obs[0] / self.dt
33 | power = np.square(act).sum()
34 | else:
35 | # vel_x = obs[:, :, -9] / self.dt # recover velocity from delta
36 | vel_x = obs[:, :, 0] / self.dt
37 | power = np.square(act).sum(axis=-1)
38 | reward = vel_x - 0.1 * power
39 | reward = reward * 2.0 # to account for scaling difference (skip 5 --> 10)
40 | return reward
41 |
42 | def compute_path_rewards(self, paths):
43 | # path has two keys: observations and actions
44 | # path["observations"] : (num_traj, horizon, obs_dim)
45 | # path["rewards"] should have shape (num_traj, horizon)
46 | obs = paths["observations"]
47 | act = paths["actions"]
48 | rewards = self.get_reward(obs, act)
49 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
50 |
51 | def get_env_state(self):
52 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy())
53 |
54 | def set_env_state(self, state):
55 | qpos = state['qpos']
56 | qvel = state['qvel']
57 | self.sim.reset()
58 | self.data.qpos[:] = qpos
59 | self.data.qvel[:] = qvel
60 | self.sim.forward()
61 |
62 | def reset_model(self):
63 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq)
64 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
65 | self.set_state(qpos, qvel)
66 | return self._get_obs()
67 |
68 | def viewer_setup(self):
69 | self.viewer.cam.distance = self.model.stat.extent * 0.5
70 |
--------------------------------------------------------------------------------
/milo/milo/gym_env/hopper.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from gym.envs.mujoco import mujoco_env
4 |
5 | class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle):
6 | def __init__(self):
7 | self.pos_before = 0.0
8 | self.height_idx = 1
9 | self.ang_idx = 2
10 | self.ang_threshold = 1.0
11 | mujoco_env.MujocoEnv.__init__(self, 'hopper.xml', 10)
12 | # mujoco_env.MujocoEnv.__init__(self, 'hopper.xml', 4)
13 | utils.EzPickle.__init__(self)
14 |
15 | def step(self, a):
16 | self.pos_before = self.data.qpos[0].copy()
17 | self.do_simulation(a, self.frame_skip)
18 | obs = self._get_obs()
19 | reward = self.get_reward(obs, a)
20 | done = self.get_done(obs)
21 | return obs, reward, done, {}
22 |
23 | def _get_obs(self):
24 | # I am using delta instead of velocity,
25 | # so that all obs are of similar magnitude
26 | delta = self.data.qpos[0] - self.pos_before
27 | return np.concatenate([
28 | [delta],
29 | self.sim.data.qpos.ravel()[1:],
30 | self.sim.data.qvel.ravel() * self.dt,
31 | ])
32 |
33 | def get_reward(self, obs, act):
34 | obs = np.clip(obs, -10.0, 10.0)
35 | if len(obs.shape) == 1:
36 | # vector obs, called when stepping the env
37 | # vel_x = (obs[1] - obs[0]) / self.dt # recover velocity from delta
38 | vel_x = obs[0] / self.dt
39 | power = np.square(act).sum()
40 | height, ang = obs[self.height_idx:(self.ang_idx+1)]
41 | else:
42 | # vel_x = (obs[:, :, 1] - obs[:, :, 0]) / self.dt # recover velocity from delta
43 | vel_x = obs[:, :, 0] / self.dt
44 | power = np.square(act).sum(axis=-1)
45 | height = obs[:, :, self.height_idx]
46 | ang = obs[:, :, self.ang_idx]
47 | alive_bonus = 1.0 * (height > .7) * (np.abs(ang) < self.ang_threshold)
48 | reward = vel_x + alive_bonus - 1e-3*power
49 | reward = reward * 2.5 # to account for scaling difference (skip 4 --> 10)
50 | return reward
51 |
52 | def compute_path_rewards(self, paths):
53 | # path has two keys: observations and actions
54 | # path["observations"] : (num_traj, horizon, obs_dim)
55 | # path["rewards"] should have shape (num_traj, horizon)
56 | obs = paths["observations"]
57 | act = paths["actions"]
58 | rewards = self.get_reward(obs, act)
59 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
60 |
61 | def get_done(self, obs):
62 | height, ang = obs[self.height_idx:(self.ang_idx+1)]
63 | done = not (np.isfinite(obs).all() and (np.abs(obs) < 100).all() and
64 | (height > .7) and (np.abs(ang) < self.ang_threshold))
65 | return done
66 |
67 | def truncate_paths(self, paths):
68 | for path in paths:
69 | obs = path["observations"]
70 | height = obs[:, self.height_idx]
71 | angle = obs[:, self.ang_idx]
72 | T = obs.shape[0]
73 | t = 0
74 | done = False
75 | while t < T and done is False:
76 | done = not ((np.abs(obs[t]) < 100).all() and (height[t] > .7) and (np.abs(angle[t]) < self.ang_threshold))
77 | t = t + 1
78 | T = t if done else T
79 | path["observations"] = path["observations"][:T]
80 | path["actions"] = path["actions"][:T]
81 | path["rewards"] = path["rewards"][:T]
82 | path["terminated"] = done
83 | return paths
84 |
85 | def get_env_state(self):
86 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy())
87 |
88 | def set_env_state(self, state):
89 | qpos = state['qpos']
90 | qvel = state['qvel']
91 | self.sim.reset()
92 | self.data.qpos[:] = qpos
93 | self.data.qvel[:] = qvel
94 | self.sim.forward()
95 |
96 | def reset_model(self):
97 | qpos = self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq)
98 | qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv)
99 | self.set_state(qpos, qvel)
100 | return self._get_obs()
101 |
102 | def viewer_setup(self):
103 | self.viewer.cam.trackbodyid = 2
104 | self.viewer.cam.distance = self.model.stat.extent * 0.75
105 | self.viewer.cam.lookat[2] = 1.15
106 | self.viewer.cam.elevation = -20
107 |
--------------------------------------------------------------------------------
/milo/milo/gym_env/humanoid.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym.envs.mujoco import mujoco_env
3 | from gym import utils
4 |
5 | DEFAULT_CAMERA_CONFIG = {
6 | 'trackbodyid': 1,
7 | 'distance': 4.0,
8 | 'lookat': np.array((0.0, 0.0, 2.0)),
9 | 'elevation': -20.0,
10 | }
11 |
12 | def mass_center(model, sim):
13 | mass = np.expand_dims(model.body_mass, axis=1)
14 | xpos = sim.data.xipos
15 | return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy()
16 |
17 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
18 | def __init__(self,
19 | xml_file='humanoid.xml',
20 | reset_noise_scale=1e-2):
21 | utils.EzPickle.__init__(**locals())
22 |
23 | self._reset_noise_scale = reset_noise_scale
24 |
25 | #mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
26 | mujoco_env.MujocoEnv.__init__(self, xml_file, 10)
27 |
28 | def step(self, action):
29 | self.xypos_before = mass_center(self.model, self.sim)
30 | self.do_simulation(action, self.frame_skip)
31 |
32 | observation = self._get_obs()
33 | reward = self.get_reward(observation, action)
34 | done = self.get_done(observation)
35 |
36 | return observation, reward, done, {}
37 |
38 | def _get_obs(self):
39 | position = self.sim.data.qpos.flat.copy()
40 | velocity = self.sim.data.qvel.flat.copy()
41 | # Add Difference of center of mass to get reward
42 | delta = mass_center(self.model, self.sim) - self.xypos_before
43 |
44 | return np.concatenate((
45 | delta,
46 | position[2:],
47 | velocity*self.dt,
48 | ))
49 |
50 | def get_reward(self, obs, action):
51 | obs = np.clip(obs, -10.0, 10.0)
52 | ctrl = np.clip(action, -0.4, 0.4)
53 |
54 | x_velocity, y_velocity = obs[:2]/self.dt
55 | z = obs[2]
56 | forward_reward = 1.25 * x_velocity
57 | alive_reward = 5.0
58 | ctrl_cost = 0.1 * np.sum(np.square(ctrl))
59 | reward = forward_reward + alive_reward - ctrl_cost
60 |
61 | return reward * 2.0
62 |
63 | def get_done(self, obs):
64 | healthy = 1.0 < obs[2] < 2.0
65 | return not healthy
66 |
67 | def reset_model(self):
68 | noise_low = -self._reset_noise_scale
69 | noise_high = self._reset_noise_scale
70 |
71 | qpos = self.init_qpos + self.np_random.uniform(
72 | low=noise_low, high=noise_high, size=self.model.nq)
73 | qvel = self.init_qvel + self.np_random.uniform(
74 | low=noise_low, high=noise_high, size=self.model.nv)
75 | self.set_state(qpos, qvel)
76 |
77 | observation = self._get_obs()
78 | return observation
79 |
80 | def viewer_setup(self):
81 | for key, value in DEFAULT_CAMERA_CONFIG.items():
82 | if isinstance(value, np.ndarray):
83 | getattr(self.viewer.cam, key)[:] = value
84 | else:
85 | setattr(self.viewer.cam, key, value)
86 |
--------------------------------------------------------------------------------
/milo/milo/gym_env/multiprocessing_env.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import numpy as np
3 | from torch.multiprocessing import Process
4 |
5 | class MujocoEnvProcess(Process):
6 | """
7 | Process class for model based environments that are compatible with MJRL
8 | """
9 | def __init__(self, env, child_conn, seed, eval_mode=False, paths_per_process=25):
10 | super().__init__()
11 | self.daemon = True
12 | self.env = copy.deepcopy(env)
13 | self.horizon = env.horizon
14 | self.child_conn = child_conn
15 | self.paths_per_process = paths_per_process
16 | self.seed = seed
17 | self.eval_mode = eval_mode
18 |
19 | def run(self):
20 | super().run()
21 | while True:
22 | paths, ctr = [], 0
23 | policy = self.child_conn.recv() # Receive policy
24 | for ep in range(self.paths_per_process):
25 | # Set new seed
26 | seed = self.seed + ep
27 | self.env.set_seed(seed)
28 | np.random.seed(seed)
29 |
30 | observations = []
31 | actions = []
32 | rewards = []
33 | next_observations = []
34 | agent_infos = []
35 | env_infos = []
36 |
37 | o = self.env.reset()
38 | done = False
39 | t = 0
40 | while t < self.horizon and done != True:
41 | a, agent_info = policy.get_action(o)
42 | if self.eval_mode:
43 | a = agent_info['evaluation']
44 | next_o, r, done, info = self.env.step(a) # Take step
45 |
46 | observations.append(o)
47 | next_observations.append(next_o)
48 | actions.append(a)
49 | rewards.append(r)
50 | agent_infos.append(agent_info)
51 | env_infos.append(info)
52 |
53 | o = next_o
54 | t += 1
55 |
56 | path = dict(
57 | observations = np.array(observations),
58 | next_observations = np.array(next_observations),
59 | actions = np.array(actions),
60 | rewards = np.array(rewards),
61 | agent_infos = stack_tensor_dict_list(agent_infos),
62 | env_infos = stack_tensor_dict_list(env_infos),
63 | terminated = done
64 | )
65 |
66 | paths.append(path)
67 | ctr += t
68 |
69 | self.child_conn.send([paths, ctr]) # Return num samples
70 |
71 | def close(self):
72 | super().close()
73 |
74 | def stack_tensor_list(tensor_list):
75 | return np.array(tensor_list)
76 |
77 | def stack_tensor_dict_list(tensor_dict_list):
78 | """
79 | Stack a list of dictionaries of {tensors or dictionary of tensors}.
80 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}.
81 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors}
82 | """
83 | keys = list(tensor_dict_list[0].keys())
84 | ret = dict()
85 | for k in keys:
86 | example = tensor_dict_list[0][k]
87 | if isinstance(example, dict):
88 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list])
89 | else:
90 | v = stack_tensor_list([x[k] for x in tensor_dict_list])
91 | ret[k] = v
92 | return ret
93 |
--------------------------------------------------------------------------------
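To make the stacking behavior of `stack_tensor_dict_list` concrete, here is a tiny worked example (the dictionary contents are made up; assumes the `milo` package and its gym/MuJoCo dependencies are importable):

```python
# Worked example: a list of per-step info dicts becomes one dict of stacked arrays,
# with nested dicts stacked recursively.
import numpy as np
from milo.gym_env.multiprocessing_env import stack_tensor_dict_list

agent_infos = [
    {'mean': np.array([0.1, 0.2]), 'extra': {'log_std': np.array([-0.3])}},
    {'mean': np.array([0.3, 0.4]), 'extra': {'log_std': np.array([-0.2])}},
]
stacked = stack_tensor_dict_list(agent_infos)
print(stacked['mean'].shape)              # (2, 2)
print(stacked['extra']['log_std'].shape)  # (2, 1)
```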
/milo/milo/gym_env/walker2d.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from gym.envs.mujoco import mujoco_env
4 |
5 | class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle):
6 | def __init__(self):
7 | self.pos_before = 0.0
8 | self.height_idx, self.ang_idx = 1, 2
9 | mujoco_env.MujocoEnv.__init__(self, 'walker2d.xml', 10)
10 | # mujoco_env.MujocoEnv.__init__(self, "walker2d.xml", 4)
11 | utils.EzPickle.__init__(self)
12 |
13 | def step(self, a):
14 | self.pos_before = self.data.qpos[0].copy()
15 | self.do_simulation(a, self.frame_skip)
16 | obs = self._get_obs()
17 | reward = self.get_reward(obs, a)
18 | done = self.get_done(obs)
19 | return obs, reward, done, {}
20 |
21 | def _get_obs(self):
22 | # I am using delta instead of velocity,
23 | # so that all obs are of similar magnitude
24 | delta = self.data.qpos[0] - self.pos_before
25 | return np.concatenate([
26 | [delta],
27 | self.sim.data.qpos.ravel()[1:],
28 | self.sim.data.qvel.ravel() * self.dt,
29 | ])
30 |
31 | def get_reward(self, obs, act):
32 | obs = np.clip(obs, -10.0, 10.0)
33 | if len(obs.shape) == 1:
34 | # vector obs, called when stepping the env
35 | vel_x = obs[0] / self.dt # recover velocity from delta
36 | power = np.square(act).sum()
37 | height, ang = obs[self.height_idx:(self.ang_idx+1)]
38 | else:
39 | vel_x = obs[:, :, 0] / self.dt # recover velocity from delta
40 | power = np.square(act).sum(axis=-1)
41 | height = obs[:, :, self.height_idx]
42 | ang = obs[:, :, self.ang_idx]
43 | alive_bonus = 1.0 * (height > 0.8) * (height < 2.0) * (np.abs(ang) < 1.0)
44 | reward = vel_x + alive_bonus - 1e-3 * power
45 | reward = reward * 2.5 # to account for scaling difference (skip 4 --> 10)
46 | return reward
47 |
48 | def compute_path_rewards(self, paths):
49 | # path has two keys: observations and actions
50 | # path["observations"] : (num_traj, horizon, obs_dim)
51 | # path["rewards"] should have shape (num_traj, horizon)
52 | obs = paths["observations"]
53 | act = paths["actions"]
54 | rewards = self.get_reward(obs, act)
55 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
56 | return paths
57 |
58 | def get_done(self, obs):
59 | height, ang = obs[self.height_idx:(self.ang_idx+1)]
60 | done = not (np.isfinite(obs).all() and (np.abs(obs) < 100).all() and
61 | (height > 0.8) and (height < 2.0) and (np.abs(ang) < 1.0))
62 | return done
63 |
64 | def truncate_paths(self, paths):
65 | for path in paths:
66 | obs = path["observations"]
67 | height = obs[:, self.height_idx]
68 | angle = obs[:, self.ang_idx]
69 | T = obs.shape[0]
70 | t = 0
71 | done = False
72 | while t < T and done is False:
73 | done = not ((np.abs(obs[t]) < 100).all() and (height[t] > 0.8) and \
74 | (height[t] < 2.0) and (np.abs(angle[t]) < 1.0))
75 | T = t if done else T
76 | t = t + 1
77 | path["observations"] = path["observations"][:T]
78 | path["actions"] = path["actions"][:T]
79 | path["rewards"] = path["rewards"][:T]
80 | path["terminated"] = done
81 | return paths
82 |
83 | def get_env_state(self):
84 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy())
85 |
86 | def set_env_state(self, state):
87 | qpos = state['qpos']
88 | qvel = state['qvel']
89 | self.sim.reset()
90 | self.data.qpos[:] = qpos
91 | self.data.qvel[:] = qvel
92 | self.sim.forward()
93 |
94 | def reset_model(self):
95 | self.set_state(
96 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq),
97 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv)
98 | )
99 | return self._get_obs()
100 |
101 | def viewer_setup(self):
102 | self.viewer.cam.trackbodyid = 2
103 | self.viewer.cam.distance = self.model.stat.extent * 0.5
104 | self.viewer.cam.lookat[2] = 1.15
105 | self.viewer.cam.elevation = -20
106 |
--------------------------------------------------------------------------------
/milo/milo/sampler/__init__.py:
--------------------------------------------------------------------------------
1 | from milo.sampler.sampler import mb_sampler
2 |
--------------------------------------------------------------------------------
/milo/milo/sampler/sampler.py:
--------------------------------------------------------------------------------
1 | import time
2 | from copy import deepcopy
3 | from milo.gym_env import MujocoEnvProcess
4 | from torch.multiprocessing import Pipe
5 |
6 | def mb_sampler(env,
7 | policy,
8 | num_samples,
9 | base_seed,
10 | eval_mode=False,
11 | num_workers=4,
12 | paths_per_process=13,
13 | verbose=False):
14 | """
15 | Multiprocess sampler for model-based rollouts. Note, this is only meant for CPU usage.
16 | """
17 |
18 | # Create Pipes and spawn jobs
19 | jobs, parent_conns, child_conns = [], [], []
20 | for idx in range(num_workers):
21 | parent_conn, child_conn = Pipe()
22 | seed = 12345+base_seed*idx
23 | job = MujocoEnvProcess(env, child_conn, seed, eval_mode=eval_mode, paths_per_process=paths_per_process)
24 | job.start()
25 | jobs.append(job)
26 | parent_conns.append(parent_conn)
27 | child_conns.append(child_conn)
28 |
29 | # Run Jobs
30 | start_time = time.time()
31 | all_paths, curr_samples = [], 0
32 | while curr_samples < num_samples:
33 | for parent_conn in parent_conns:
34 | parent_conn.send(deepcopy(policy))
35 | for parent_conn in parent_conns:
36 | paths, ctr = parent_conn.recv()
37 | all_paths.extend(paths)
38 | curr_samples += ctr
39 | if verbose:
40 | print(f"Collected {curr_samples} samples and {len(all_paths)} trajectories <<<<<< took {time.time()-start_time} seconds")
41 |
42 | for job in jobs:
43 | job.terminate()
44 |
45 | return all_paths
46 |
--------------------------------------------------------------------------------
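`mb_sampler` expects an environment object exposing `horizon`, `set_seed`, `reset`, and `step`, plus an MJRL-style policy whose `get_action(obs)` returns an action and an info dict with an `'evaluation'` entry. Below is a runnable sketch with stand-in objects; in MILO proper the env would come from the `model_based_env` wrapper and the policy from `mjrl`, so the dummies here only illustrate the interface:

```python
# Runnable sketch of mb_sampler with placeholder env/policy objects.
import numpy as np
from milo.sampler import mb_sampler

class DummyModelEnv:
    """Placeholder env exposing the attributes the sampler workers rely on."""
    horizon = 50
    def set_seed(self, seed):
        np.random.seed(seed)
    def reset(self):
        return np.zeros(4)
    def step(self, action):
        return np.zeros(4), 0.0, False, {}

class DummyPolicy:
    """Placeholder MJRL-style policy: get_action returns (action, info dict)."""
    def get_action(self, obs):
        action = np.zeros(2)
        return action, {'evaluation': action}

if __name__ == '__main__':
    paths = mb_sampler(DummyModelEnv(), DummyPolicy(), num_samples=2000,
                       base_seed=100, num_workers=2, paths_per_process=5)
    print(len(paths), paths[0]['observations'].shape)
```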
/milo/milo/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from milo.utils.logger import init_logger
2 | from milo.utils.arguments import get_args
3 | from milo.utils.evaluate import evaluate
4 | from milo.utils.util import *
5 |
--------------------------------------------------------------------------------
/milo/milo/utils/arguments.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
4 | def get_args():
5 | # ====== Argument Parser ======
6 | parser = argparse.ArgumentParser(
7 | formatter_class=argparse.ArgumentDefaultsHelpFormatter
8 | )
9 |
10 | # Logging/Environment Arguments
11 | parser.add_argument('--env', type=str,
12 | help='environment ID', default='Hopper-v6')
13 | parser.add_argument('--seed', type=int, help='seed', default=100)
14 | parser.add_argument('--num_cpu', type=int,
15 | help='number of processes used for inference', default=4)
16 | parser.add_argument('--num_trajs', type=int,
17 | help='number of expert trajs', default=10)
18 | parser.add_argument('--num_samples', type=int,
19 | help='number of expert samples', default=500)
20 | parser.add_argument('--subsample_freq', type=int,
21 | help='subsample frequency', default=8)
22 | parser.add_argument('--norm_thresh_coeff', type=float,
23 | help='Norm threshold', default=2)
24 | parser.add_argument('--include_expert', action='store_true',
25 | help='include expert data into offline db', default=False)
26 | parser.add_argument('--subsample_expert', action='store_true',
27 | help='subsample expert samples', default=False)
28 | parser.add_argument('--randomize_expert', action='store_true',
29 | help='randomize expert samples', default=False)
30 | parser.add_argument('--save_iter', type=int,
31 | help='Interval to Save checkpoints', default=10)
32 |
33 | # Path Arguments
34 | parser.add_argument('--root_path', type=str,
35 | help='Root dir to save outputs', default='./experiments')
36 | parser.add_argument('--data_path', type=str,
37 | help='Root data dir to get db', default='./data')
38 | parser.add_argument('--expert_db', type=str,
39 | help='expert db name', default='Hopper-v6_100_3012.62.pt')
40 | parser.add_argument('--offline_db', type=str,
41 | help='offline db name', default='Hopper-v6_100_3025.47.pt')
42 | parser.add_argument('--model_save_path', type=str, help='Path to save models',
43 | default='./experiments/dynamics_model_weights')
44 | parser.add_argument('--id', type=int, help='Experiment id', default=0)
45 |
46 | # Dynamics Model Ensemble Arguments
47 | parser.add_argument('--n_models', type=int,
48 | help='Number of dynamics models in ensemble', default=4)
49 | parser.add_argument('--n_epochs', type=int,
50 | help='Number of epochs to train models', default=5)
51 | parser.add_argument('--grad_clip', type=float,
52 | help='Max Gradient Norm', default=1.0)
53 | parser.add_argument('--dynamics_optim', type=str,
54 | help='Optimizer to use [sgd, adam]', default='sgd')
55 |
56 | # Cost Arguments
57 | parser.add_argument('--feature_dim', type=int,
58 | help='Feature dimension', default=512)
59 | parser.add_argument('--update_type', type=str,
60 | help='exact, geometric, decay, decay_sqrt, ficticious', default='exact')
61 | parser.add_argument('--bw_quantile', type=float,
62 | help='Quantile when fitting bandwidth', default=0.2)
63 | parser.add_argument('--lambda_b', type=float,
64 | help='Bonus/Penalty weighting param', default=0.1)
65 | parser.add_argument('--cost_lr', type=float,
66 | help='0.0 is exact update, otherwise learning rate', default=0.0)
67 |
68 | # Policy Gradient Arguments
69 | parser.add_argument('--planner', type=str,
70 | help='pg alg to use (trpo, ppo)', default='trpo')
71 | parser.add_argument('--actor_model_hidden', type=int,
72 | nargs='+', help='hidden dims for actor', default=[32, 32])
73 | parser.add_argument('--critic_model_hidden', type=int, nargs='+',
74 | help='hidden dims for critic', default=[128, 128])
75 | parser.add_argument('--gamma', type=float,
76 | help='discount factor for rewards (default: 0.99)', default=0.995)
77 | parser.add_argument('--gae_lambda', type=float,
78 | help='gae lambda val', default=0.97)
79 | parser.add_argument('--samples_per_step', type=int,
80 | help='Number of mb samples per pg step', default=512)
81 | parser.add_argument('--policy_init_log', type=float,
82 | help='policy init log', default=-0.25)
83 | parser.add_argument('--policy_min_log', type=float,
84 | help='policy min log', default=-2.0)
85 | parser.add_argument('--vf_iters', type=int,
86 | help='Number of value optim steps', default=2)
87 | parser.add_argument('--vf_batch_size', type=int,
88 | help='Critic batch size', default=64)
89 | parser.add_argument('--vf_lr', type=float, help='Value lr', default=1e-3)
90 | parser.add_argument('--vf_reg_coef', type=float,
91 | help='baseline regularization coeff', default=1e-3)
92 |
93 | # BC regularization Arguments
94 | parser.add_argument('--do_bc_reg', action='store_true', help='Add bc regularization to policy gradient', default=False)
95 | parser.add_argument('--bc_reg_coeff', type=float, help='Regularization coefficient for policy gradient', default=0.1)
96 |
97 | # TRPO Arguments
98 | parser.add_argument('--cg_iter', type=int,
99 | help='Number of CG iterations', default=10)
100 | parser.add_argument('--cg_damping', type=float,
101 | help='CG damping coefficient', default=1e-4)
102 | parser.add_argument('--kl_dist', type=float,
103 | help='Trust region', default=0.05)
104 | parser.add_argument('--hvp_sample_frac', type=float,
105 | help='Fraction of samples for FIM', default=1.0)
106 |
107 | # PPO Arguments
108 | parser.add_argument('--clip_coef', type=float,
109 | help='Clip Coefficient for PPO Trust region', default=0.2)
110 | parser.add_argument('--ppo_lr', type=float,
111 | help='PPO learning rate', default=3e-4)
112 | parser.add_argument('--ppo_epochs', type=int,
113 | help='Epochs per PPO step', default=10)
114 | parser.add_argument('--ppo_batch_size', type=int,
115 | help='Mini-batch size for PPO', default=64)
116 |
117 | # BC Arguments
118 | parser.add_argument('--bc_epochs', type=int,
119 | help='Number of BC epochs', default=3)
120 | parser.add_argument('--n_bc_iters', type=int, default=10,
121 | help='number of times to run BC iterations')
122 |
123 | # General Algorithm Arguments
124 | parser.add_argument('--n_iter', type=int, help='Number of offline IL iterations to run', default=300)
125 | parser.add_argument('--pg_iter', type=int, help='Number of pg steps', default=5)
126 | parser.add_argument('--use_ground_truth', action='store_true', help='use ground truth rewards', default=False)
127 | parser.add_argument('--do_model_free', action='store_true', help='do model free policy gradient', default=False)
128 |
129 | args = parser.parse_args()
130 | return args
131 |
--------------------------------------------------------------------------------
/milo/milo/utils/evaluate.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | from mjrl.samplers.core import sample_paths
5 |
6 | # ========================
7 | # === Evaluation Utils ===
8 | # ========================
9 |
10 | def evaluate(n_iter, logger, writer, args, env, policy, reward_func, num_traj=10, adroit=False):
11 | greedy_samples = sample_paths(num_traj=num_traj, env=env, policy=policy, \
12 | num_cpu=args.num_cpu, base_seed=args.seed, eval_mode=True, suppress_print=True)
13 | samples = sample_paths(num_traj=num_traj, env=env, policy=policy, \
14 | num_cpu=args.num_cpu, base_seed=args.seed, eval_mode=False, suppress_print=True)
15 |
16 | if adroit:
17 | greedy_success = env.evaluate_success(greedy_samples)
18 | sample_success = env.evaluate_success(samples)
19 |
20 | # Compute scores
21 | greedy_scores = np.array([np.sum(traj['rewards']) for traj in greedy_samples])
22 | sample_scores = np.array([np.sum(traj['rewards']) for traj in samples])
23 | greedy_mean_lengths = np.mean([len(traj['rewards']) for traj in greedy_samples])
24 | sample_mean_lengths = np.mean([len(traj['rewards']) for traj in samples])
25 | greedy_mean, greedy_max, greedy_min = greedy_scores.mean(), greedy_scores.max(), greedy_scores.min()
26 | sample_mean, sample_max, sample_min = sample_scores.mean(), sample_scores.max(), sample_scores.min()
27 |
28 | # Compute MMD (S, A)
29 | greedy_x = np.concatenate([np.concatenate([traj['observations'], traj['actions']], axis=1) for traj in greedy_samples], axis=0)
30 | sample_x = np.concatenate([np.concatenate([traj['observations'], traj['actions']], axis=1) for traj in samples], axis=0)
31 | greedy_x = torch.from_numpy(greedy_x).float()
32 | sample_x = torch.from_numpy(sample_x).float()
33 |
34 | greedy_diff = reward_func.get_rep(greedy_x).mean(0) - reward_func.phi_e
35 | sample_diff = reward_func.get_rep(sample_x).mean(0) - reward_func.phi_e
36 |
37 | greedy_mmd = torch.dot(greedy_diff, greedy_diff)
38 | sample_mmd = torch.dot(sample_diff, sample_diff)
39 |
40 | # Log
41 | logger.info(f'Greedy Evaluation Score mean (min, max): {greedy_mean:.2f} ({greedy_min:.2f}, {greedy_max:.2f})')
42 | logger.info(f'Greedy Evaluation Trajectory Lengths: {greedy_mean_lengths:.2f}')
43 | logger.info(f'Greedy MMD: {greedy_mmd}')
44 | if adroit:
45 | logger.info(f'Greedy Success %: {greedy_success}%')
46 | logger.info(f'Sampled Evaluation Score mean (min, max): {sample_mean:.2f} ({sample_min:.2f}, {sample_max:.2f})')
47 | logger.info(f'Sampled Evaluation Trajectory Lengths: {sample_mean_lengths:.2f}')
48 | logger.info(f'Sampled MMD: {sample_mmd}')
49 | if adroit:
50 | logger.info(f'Sampled Success %: {sample_success}%')
51 |
52 | # Tensorboard Logging
53 | writer.add_scalars('data/inf_greedy_reward', {'min_score': greedy_min,
54 | 'mean_score': greedy_mean,
55 | 'max_score': greedy_max}, n_iter+1)
56 | writer.add_scalar('data/inf_greedy_len', greedy_mean_lengths, n_iter+1)
57 | writer.add_scalar('data/greedy_mmd', greedy_mmd, n_iter+1)
58 | writer.add_scalars('data/inf_sampled_reward', {'min_score': sample_min,
59 | 'mean_score': sample_mean,
60 | 'max_score': sample_max}, n_iter+1)
61 | writer.add_scalar('data/inf_sampled_len', sample_mean_lengths, n_iter+1)
62 | writer.add_scalar('data/sampled_mmd', sample_mmd, n_iter+1)
63 | if adroit:
64 | writer.add_scalar('data/greedy_success_percen', greedy_success, n_iter+1)
65 | writer.add_scalar('data/sampled_success_percen', sample_success, n_iter+1)
66 |
67 | scores = {'greedy': greedy_mean, 'sample': sample_mean}
68 | mmds = {'greedy': greedy_mmd, 'sample': sample_mmd}
69 |
70 | return scores, mmds
71 |
72 |
--------------------------------------------------------------------------------
/milo/milo/utils/logger.py:
--------------------------------------------------------------------------------
1 | """
2 | Logger singleton wrapper
3 | Default logger folder is `os.path.join(__file__, '..', '..', 'logs')`
4 | """
5 | import logging
6 | import logging.handlers
7 | import os
8 | import sys
9 |
10 |
11 | __all__ = ['init_logger']
12 |
13 |
14 | def init_logger(log_dir):
15 | os.makedirs(log_dir, exist_ok=True)
16 | log_level = logging.DEBUG
17 | log_format = '%(message)s'
18 |
19 | logger = logging.getLogger(log_dir)
20 | logger.setLevel(log_level)
21 | path = os.path.join(log_dir, 'main.log')
22 |
23 | # file handler (log file)
24 | log_handler = logging.handlers.RotatingFileHandler(filename=path)
25 | log_handler.setLevel(log_level)
26 | log_handler.setFormatter(logging.Formatter(log_format))
27 | logger.addHandler(log_handler)
28 |
29 | # stream handler (default sys.stderr)
30 | log_handler = logging.StreamHandler()
31 | log_handler.setLevel(log_level)
32 | log_handler.setFormatter(logging.Formatter(log_format))
33 | logger.addHandler(log_handler)
34 |
35 | return logger
36 |
--------------------------------------------------------------------------------
/milo/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 | name='milo',
5 | version='0.1.0',
6 | packages=find_packages(),
7 | description='Components for MILO: Model based Imitation Learning from Offline data',
8 | )
9 |
--------------------------------------------------------------------------------
/mjrl/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | # idea
104 | *.idea/
105 |
106 | # Mac OSX files
107 | *.DS_Store
--------------------------------------------------------------------------------
/mjrl/README.md:
--------------------------------------------------------------------------------
1 | # RL for MuJoCo
2 |
3 | This package contains implementations of various RL algorithms for continuous control tasks simulated with [MuJoCo.](http://www.mujoco.org/)
4 |
5 | # Installation
6 | The main package dependencies are `MuJoCo`, `python=3.7`, `gym>=0.13`, `mujoco-py>=2.0`, and `pytorch>=1.0`. See `setup/README.md` ([link](https://github.com/aravindr93/mjrl/tree/master/setup#installation)) for detailed install instructions.
7 |
8 | # Bibliography
9 | If you find the package useful, please cite the following papers.
10 | ```
11 | @INPROCEEDINGS{Rajeswaran-NIPS-17,
12 | AUTHOR = {Aravind Rajeswaran and Kendall Lowrey and Emanuel Todorov and Sham Kakade},
13 | TITLE = "{Towards Generalization and Simplicity in Continuous Control}",
14 | BOOKTITLE = {NIPS},
15 | YEAR = {2017},
16 | }
17 |
18 | @INPROCEEDINGS{Rajeswaran-RSS-18,
19 | AUTHOR = {Aravind Rajeswaran AND Vikash Kumar AND Abhishek Gupta AND
20 | Giulia Vezzani AND John Schulman AND Emanuel Todorov AND Sergey Levine},
21 | TITLE = "{Learning Complex Dexterous Manipulation with Deep Reinforcement Learning and Demonstrations}",
22 | BOOKTITLE = {Proceedings of Robotics: Science and Systems (RSS)},
23 | YEAR = {2018},
24 | }
25 | ```
26 |
27 | # Credits
28 | This package is maintained by [Aravind Rajeswaran](http://homes.cs.washington.edu/~aravraj/) and other members of the [Movement Control Lab](http://homes.cs.washington.edu/~todorov/), University of Washington, Seattle.
29 |
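30 | # Example usage
31 | A minimal training sketch, mirroring `examples/linear_nn_comparison.py` in this repo. The job name below is arbitrary; `train_agent` saves its outputs (e.g. policy snapshots) under a directory with that name in the current working directory.
32 | ```
33 | from mjrl.utils.gym_env import GymEnv
34 | from mjrl.policies.gaussian_mlp import MLP
35 | from mjrl.baselines.mlp_baseline import MLPBaseline
36 | from mjrl.algos.npg_cg import NPG
37 | from mjrl.utils.train_agent import train_agent
38 | import mjrl.envs  # registers the mjrl_* environments with gym
39 |
40 | e = GymEnv('mjrl_swimmer-v0')
41 | policy = MLP(e.spec, hidden_sizes=(32, 32), seed=123)
42 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
43 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=123, save_logs=True)
44 | train_agent(job_name='swimmer_npg_quickstart', agent=agent, seed=123, niter=50,
45 |             gamma=0.995, gae_lambda=0.97, num_cpu=1, sample_mode='trajectories',
46 |             num_traj=10, save_freq=5, evaluation_rollouts=5)
47 | ```
48 |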
--------------------------------------------------------------------------------
/mjrl/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | Here we provide a job script to illustrate policy optimization with incremental learning methods like NPG and PPO. To run the experiments, use the commands below. The experiments are run through the provided job script, which takes two arguments:
4 | - `output`: path to the directory where all results will be saved
5 | - `config`: a config `.txt` file with all the experiment parameters (examples are provided)
6 | The script has to be run from this directory, i.e. `mjrl/examples`.
7 |
8 | 1. To train an NPG agent on a task shipped with `mjrl` (e.g. swimmer)
9 | ```
10 | $ python policy_opt_job_script.py --output swimmer_npg_exp --config example_configs/swimmer_npg.txt
11 | ```
12 |
13 | 2. To train an NPG agent on an OpenAI gym benchmark task (e.g. hopper)
14 | ```
15 | $ python policy_opt_job_script.py --output hopper_npg_exp --config example_configs/hopper_npg.txt
16 | ```
17 | Note that since the Hopper env has termination conditions, the config sets the sampling mode to `samples` rather than `trajectories`, so that each update uses 10K samples regardless of episode length (see the config excerpt at the end of this README).
18 |
19 | 3. To train a PPO agent on the swimmer task
20 | ```
21 | $ python policy_opt_job_script.py --output swimmer_ppo_exp --config example_configs/swimmer_ppo.txt
22 | ```
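23 |
24 | For reference, the sampling-related entries from the two NPG configs in `example_configs/` are excerpted below; switching between the two sampling modes only requires changing these keys.
25 | ```
26 | # hopper_npg.txt : episodes may terminate early, so fix a sample budget per update
27 | 'sample_mode' : 'samples',
28 | 'rl_num_samples' : 10000,
29 |
30 | # swimmer_npg.txt : data counted in whole trajectories
31 | 'sample_mode' : 'trajectories',
32 | 'rl_num_traj' : 10,
33 | ```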
--------------------------------------------------------------------------------
/mjrl/examples/behavior_clone.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
4 | from mjrl.baselines.mlp_baseline import MLPBaseline
5 | from mjrl.algos.npg_cg import NPG
6 | from mjrl.algos.behavior_cloning import BC
7 | from mjrl.utils.train_agent import train_agent
8 | from mjrl.samplers.core import sample_paths
9 | import mjrl.envs
10 | import time as timer
11 | import pickle
12 | SEED = 500
13 |
14 | # ------------------------------
15 | # Train expert policy first
16 | e = GymEnv('mjrl_swimmer-v0')
17 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
18 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=5, learn_rate=1e-3)
19 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True)
20 |
21 | ts = timer.time()
22 | print("========================================")
23 | print("Training expert policy")
24 | print("========================================")
25 | train_agent(job_name='swimmer_exp1',
26 | agent=agent,
27 | seed=SEED,
28 | niter=50,
29 | gamma=0.995,
30 | gae_lambda=0.97,
31 | num_cpu=1,
32 | sample_mode='trajectories',
33 | num_traj=10,
34 | save_freq=5,
35 | evaluation_rollouts=None)
36 | print("========================================")
37 | print("Expert policy training complete !!!")
38 | print("========================================")
39 | print("time taken = %f" % (timer.time()-ts))
40 | print("========================================")
41 |
42 | # ------------------------------
43 | # Get demonstrations
44 | print("========================================")
45 | print("Collecting expert demonstrations")
46 | print("========================================")
47 | expert_pol = pickle.load(open('swimmer_exp1/iterations/best_policy.pickle', 'rb'))
48 | demo_paths = sample_paths(num_traj=5, policy=expert_pol, env=e.env_id)
49 |
50 | # ------------------------------
51 | # Train BC
52 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
53 | bc_agent = BC(demo_paths, policy=policy, epochs=20, batch_size=64, lr=1e-3) # will use Adam by default
54 | ts = timer.time()
55 | print("========================================")
56 | print("Running BC with expert demonstrations")
57 | print("========================================")
58 | bc_agent.train()
59 | print("========================================")
60 | print("BC training complete !!!")
61 | print("time taken = %f" % (timer.time()-ts))
62 | print("========================================")
63 |
64 | # ------------------------------
65 | # Evaluate Policies
66 | bc_pol_score = e.evaluate_policy(policy, num_episodes=5, mean_action=True)
67 | expert_score = e.evaluate_policy(expert_pol, num_episodes=5, mean_action=True)
68 | print("Expert policy performance (eval mode) = %f" % expert_score[0][0])
69 | print("BC policy performance (eval mode) = %f" % bc_pol_score[0][0])
70 |
--------------------------------------------------------------------------------
/mjrl/examples/example_configs/hopper_npg.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env' : 'Hopper-v3',
6 | 'algorithm' : 'NPG',
7 | 'seed' : 123,
8 | 'sample_mode' : 'samples',
9 | 'rl_num_samples' : 10000,
10 | 'rl_num_iter' : 100,
11 | 'num_cpu' : 1,
12 | 'save_freq' : 25,
13 | 'eval_rollouts' : None,
14 | 'exp_notes' : 'Example config for training policy with NPG on the OpenAI gym Hopper-v3 task.',
15 |
16 | # RL parameters (all params related to PG, value function etc.)
17 |
18 | 'policy_size' : (32, 32),
19 | 'init_log_std' : -0.5,
20 | 'vf_hidden_size' : (128, 128),
21 | 'vf_batch_size' : 64,
22 | 'vf_epochs' : 2,
23 | 'vf_learn_rate' : 1e-3,
24 | 'rl_step_size' : 0.05,
25 | 'rl_gamma' : 0.995,
26 | 'rl_gae' : 0.97,
27 |
28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used)
29 |
30 | 'alg_hyper_params' : dict(),
31 |
32 | }
33 |
34 |
--------------------------------------------------------------------------------
/mjrl/examples/example_configs/swimmer_npg.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env' : 'mjrl_swimmer-v0',
6 | 'algorithm' : 'NPG',
7 | 'seed' : 123,
8 | 'sample_mode' : 'trajectories',
9 | 'rl_num_traj' : 10,
10 | 'rl_num_iter' : 50,
11 | 'num_cpu' : 2,
12 | 'save_freq' : 25,
13 | 'eval_rollouts' : None,
14 | 'exp_notes' : 'Example config for training policy with NPG on the mjrl swimmer task.',
15 |
16 | # RL parameters (all params related to PG, value function, DAPG etc.)
17 |
18 | 'policy_size' : (32, 32),
19 | 'init_log_std' : -0.5,
20 | 'vf_hidden_size' : (128, 128),
21 | 'vf_batch_size' : 64,
22 | 'vf_epochs' : 2,
23 | 'vf_learn_rate' : 1e-3,
24 | 'rl_step_size' : 0.1,
25 | 'rl_gamma' : 0.995,
26 | 'rl_gae' : 0.97,
27 |
28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used)
29 |
30 | 'alg_hyper_params' : dict(),
31 |
32 | }
--------------------------------------------------------------------------------
/mjrl/examples/example_configs/swimmer_ppo.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env' : 'mjrl_swimmer-v0',
6 | 'algorithm' : 'PPO',
7 | 'seed' : 123,
8 | 'sample_mode' : 'trajectories',
9 | 'rl_num_traj' : 10,
10 | 'rl_num_iter' : 50,
11 | 'num_cpu' : 2,
12 | 'save_freq' : 25,
13 | 'eval_rollouts' : None,
14 | 'exp_notes' : 'Example config for training policy with PPO on the mjrl swimmer task.',
15 |
16 | # RL parameters (all params related to PG, value function, DAPG etc.)
17 |
18 | 'policy_size' : (32, 32),
19 | 'init_log_std' : -0.5,
20 | 'vf_hidden_size' : (128, 128),
21 | 'vf_batch_size' : 64,
22 | 'vf_epochs' : 2,
23 | 'vf_learn_rate' : 1e-3,
24 | 'rl_step_size' : 0.1,
25 | 'rl_gamma' : 0.995,
26 | 'rl_gae' : 0.97,
27 |
28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used)
29 |
30 | 'alg_hyper_params' : dict(clip_coef=0.2, epochs=10, mb_size=64, learn_rate=5e-4),
31 |
32 | }
--------------------------------------------------------------------------------
/mjrl/examples/linear_nn_comparison.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.policies.gaussian_linear import LinearPolicy
4 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
5 | from mjrl.baselines.mlp_baseline import MLPBaseline
6 | from mjrl.algos.npg_cg import NPG
7 | from mjrl.utils.train_agent import train_agent
8 | import mjrl.envs
9 | import time as timer
10 | SEED = 500
11 |
12 | # NN policy
13 | # ==================================
14 | e = GymEnv('mjrl_swimmer-v0')
15 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
16 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
17 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True)
18 |
19 | ts = timer.time()
20 | train_agent(job_name='swimmer_nn_exp1',
21 | agent=agent,
22 | seed=SEED,
23 | niter=50,
24 | gamma=0.995,
25 | gae_lambda=0.97,
26 | num_cpu=1,
27 | sample_mode='trajectories',
28 | num_traj=10,
29 | save_freq=5,
30 | evaluation_rollouts=5)
31 | print("time taken for NN policy training = %f" % (timer.time()-ts))
32 |
33 |
34 | # Linear policy
35 | # ==================================
36 | e = GymEnv('mjrl_swimmer-v0')
37 | policy = LinearPolicy(e.spec, seed=SEED)
38 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
39 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True)
40 |
41 | ts = timer.time()
42 | train_agent(job_name='swimmer_linear_exp1',
43 | agent=agent,
44 | seed=SEED,
45 | niter=50,
46 | gamma=0.995,
47 | gae_lambda=0.97,
48 | num_cpu=1,
49 | sample_mode='trajectories',
50 | num_traj=10,
51 | save_freq=5,
52 | evaluation_rollouts=5)
53 | print("time taken for linear policy training = %f" % (timer.time()-ts))
54 |
--------------------------------------------------------------------------------
/mjrl/examples/policy_opt_job_script.py:
--------------------------------------------------------------------------------
1 | """
2 | This is a job script for running policy gradient algorithms on gym tasks.
3 | Separate job scripts are provided to run a few other algorithms:
4 | - For DAPG see here: https://github.com/aravindr93/hand_dapg/tree/master/dapg/examples
5 | - For model-based NPG see here: https://github.com/aravindr93/mjrl/tree/master/mjrl/algos/model_accel
6 | """
7 |
8 | from mjrl.utils.gym_env import GymEnv
9 | from mjrl.policies.gaussian_mlp import MLP
10 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
11 | from mjrl.baselines.mlp_baseline import MLPBaseline
12 | from mjrl.algos.npg_cg import NPG
13 | from mjrl.algos.batch_reinforce import BatchREINFORCE
14 | from mjrl.algos.ppo_clip import PPO
15 | from mjrl.utils.train_agent import train_agent
16 | import os
17 | import json
18 | import gym
19 | import mjrl.envs
20 | import time as timer
21 | import pickle
22 | import argparse
23 |
24 | # ===============================================================================
25 | # Get command line arguments
26 | # ===============================================================================
27 |
28 | parser = argparse.ArgumentParser(description='Natural policy gradient from mjrl on mujoco environments')
29 | parser.add_argument('--output', type=str, required=True, help='location to store results')
30 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params')
31 |
32 | args = parser.parse_args()
33 | JOB_DIR = args.output
34 | if not os.path.exists(JOB_DIR):
35 | os.mkdir(JOB_DIR)
36 | with open(args.config, 'r') as f:
37 | job_data = eval(f.read())
38 | assert 'algorithm' in job_data.keys()
39 | assert any([job_data['algorithm'] == a for a in ['NPG', 'NVPG', 'VPG', 'PPO']])
40 | assert 'sample_mode' in job_data.keys()
41 | job_data['alg_hyper_params'] = dict() if 'alg_hyper_params' not in job_data.keys() else job_data['alg_hyper_params']
42 |
43 | EXP_FILE = JOB_DIR + '/job_config.json'
44 | with open(EXP_FILE, 'w') as f:
45 | json.dump(job_data, f, indent=4)
46 |
47 | if job_data['sample_mode'] == 'trajectories':
48 | assert 'rl_num_traj' in job_data.keys()
49 | job_data['rl_num_samples'] = 0 # will be ignored
50 | elif job_data['sample_mode'] == 'samples':
51 | assert 'rl_num_samples' in job_data.keys()
52 | job_data['rl_num_traj'] = 0 # will be ignored
53 | else:
54 | print("Unknown sampling mode. Choose either trajectories or samples")
55 | exit()
56 |
57 | # ===============================================================================
58 | # Train Loop
59 | # ===============================================================================
60 |
61 | e = GymEnv(job_data['env'])
62 | policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=job_data['seed'], init_log_std=job_data['init_log_std'])
63 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=job_data['vf_batch_size'], hidden_sizes=job_data['vf_hidden_size'],
64 | epochs=job_data['vf_epochs'], learn_rate=job_data['vf_learn_rate'])
65 |
66 | # Construct the algorithm
67 | if job_data['algorithm'] == 'NPG':
68 | # Other hyperparameters (like number of CG steps) can be specified in config for pass through
69 | # or default hyperparameters will be used
70 | agent = NPG(e, policy, baseline, normalized_step_size=job_data['rl_step_size'],
71 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params'])
72 |
73 | elif job_data['algorithm'] == 'VPG':
74 | agent = BatchREINFORCE(e, policy, baseline, learn_rate=job_data['rl_step_size'],
75 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params'])
76 |
77 | elif job_data['algorithm'] == 'NVPG':
78 | agent = BatchREINFORCE(e, policy, baseline, desired_kl=job_data['rl_step_size'],
79 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params'])
80 |
81 | elif job_data['algorithm'] == 'PPO':
82 | # There are many hyperparameters for PPO. They can be specified in config for pass through
83 | # or defaults in the PPO algorithm will be used
84 | agent = PPO(e, policy, baseline, save_logs=True, **job_data['alg_hyper_params'])
85 |
86 | print("========================================")
87 | print("Starting policy learning")
88 | print("========================================")
89 |
90 | ts = timer.time()
91 | train_agent(job_name=JOB_DIR,
92 | agent=agent,
93 | seed=job_data['seed'],
94 | niter=job_data['rl_num_iter'],
95 | gamma=job_data['rl_gamma'],
96 | gae_lambda=job_data['rl_gae'],
97 | num_cpu=job_data['num_cpu'],
98 | sample_mode=job_data['sample_mode'],
99 | num_traj=job_data['rl_num_traj'],
100 | num_samples=job_data['rl_num_samples'],
101 | save_freq=job_data['save_freq'],
102 | evaluation_rollouts=job_data['eval_rollouts'])
103 | print("time taken = %f" % (timer.time()-ts))
104 |
--------------------------------------------------------------------------------
/mjrl/mjrl/__init__.py:
--------------------------------------------------------------------------------
1 | import mjrl.envs
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/algos/__init__.py
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/behavior_cloning.py:
--------------------------------------------------------------------------------
1 | """
2 | Minimize bc loss (MLE, MSE, RWR etc.) with pytorch optimizers
3 | """
4 |
5 | import logging
6 | #logging.disable(logging.CRITICAL)
7 | import numpy as np
8 | import time as timer
9 | import torch
10 | from torch.autograd import Variable
11 | from mjrl.utils.logger import DataLog
12 | from tqdm import tqdm
13 |
14 |
15 | class BC:
16 | def __init__(self, expert_paths,
17 | policy,
18 | epochs = 5,
19 | batch_size = 64,
20 | lr = 1e-3,
21 | optimizer = None,
22 | loss_type = 'MSE', # can be 'MLE' or 'MSE'
23 | save_logs = True,
24 | set_transforms = False,
25 | **kwargs,
26 | ):
27 |
28 | self.policy = policy
29 | self.expert_paths = expert_paths
30 | self.epochs = epochs
31 | self.mb_size = batch_size
32 | self.logger = DataLog()
33 | self.loss_type = loss_type
34 | self.save_logs = save_logs
35 |
36 | if set_transforms:
37 | in_shift, in_scale, out_shift, out_scale = self.compute_transformations()
38 | self.set_transformations(in_shift, in_scale, out_shift, out_scale)
39 | self.set_variance_with_data(out_scale)
40 |
41 | # construct optimizer
42 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=lr) if optimizer is None else optimizer
43 |
44 | # Loss criterion if required
45 | if loss_type == 'MSE':
46 | self.loss_criterion = torch.nn.MSELoss()
47 |
48 | # make logger
49 | if self.save_logs:
50 | self.logger = DataLog()
51 |
52 | def compute_transformations(self):
53 | # get transformations
54 | if self.expert_paths == [] or self.expert_paths is None:
55 | in_shift, in_scale, out_shift, out_scale = None, None, None, None
56 | else:
57 | observations = np.concatenate([path["observations"] for path in self.expert_paths])
58 | actions = np.concatenate([path["actions"] for path in self.expert_paths])
59 | in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0)
60 | out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0)
61 | return in_shift, in_scale, out_shift, out_scale
62 |
63 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None):
64 | # set scalings in the target policy
65 | self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale)
66 | self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale)
67 |
68 | def set_variance_with_data(self, out_scale):
69 | # set the variance of gaussian policy based on out_scale
70 | params = self.policy.get_param_values()
71 | params[-self.policy.m:] = np.log(out_scale + 1e-12)
72 | self.policy.set_param_values(params)
73 |
74 | def loss(self, data, idx=None):
75 | if self.loss_type == 'MLE':
76 | return self.mle_loss(data, idx)
77 | elif self.loss_type == 'MSE':
78 | return self.mse_loss(data, idx)
79 | else:
80 | print("Please use valid loss type")
81 | return None
82 |
83 | def mle_loss(self, data, idx):
84 | # use indices if provided (e.g. for mini-batching)
85 | # otherwise, use all the data
86 | idx = range(data['observations'].shape[0]) if idx is None else idx
87 | if type(data['observations']) == torch.Tensor:
88 | idx = torch.LongTensor(idx)
89 | obs = data['observations'][idx]
90 | act = data['expert_actions'][idx]
91 | LL, mu, log_std = self.policy.new_dist_info(obs, act)
92 | # minimize negative log likelihood
93 | return -torch.mean(LL)
94 |
95 | def mse_loss(self, data, idx=None):
96 | idx = range(data['observations'].shape[0]) if idx is None else idx
97 | if type(data['observations']) is torch.Tensor:
98 | idx = torch.LongTensor(idx)
99 | obs = data['observations'][idx]
100 | act_expert = data['expert_actions'][idx]
101 | if type(data['observations']) is not torch.Tensor:
102 | obs = Variable(torch.from_numpy(obs).float(), requires_grad=False)
103 | act_expert = Variable(torch.from_numpy(act_expert).float(), requires_grad=False)
104 | act_pi = self.policy.model(obs)
105 | return self.loss_criterion(act_pi, act_expert.detach())
106 |
107 | def fit(self, data, suppress_fit_tqdm=False, **kwargs):
108 | # data is a dict
109 | # keys should have "observations" and "expert_actions"
110 | validate_keys = all([k in data.keys() for k in ["observations", "expert_actions"]])
111 | assert validate_keys is True
112 | ts = timer.time()
113 | num_samples = data["observations"].shape[0]
114 |
115 | # log stats before
116 | if self.save_logs:
117 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0]
118 | self.logger.log_kv('loss_before', loss_val)
119 |
120 | # train loop
121 | for ep in config_tqdm(range(self.epochs), suppress_fit_tqdm):
122 | for mb in range(int(num_samples / self.mb_size)):
123 | rand_idx = np.random.choice(num_samples, size=self.mb_size)
124 | self.optimizer.zero_grad()
125 | loss = self.loss(data, idx=rand_idx)
126 | loss.backward()
127 | self.optimizer.step()
128 | params_after_opt = self.policy.get_param_values()
129 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)
130 |
131 | # log stats after
132 | if self.save_logs:
133 | self.logger.log_kv('epoch', self.epochs)
134 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0]
135 | self.logger.log_kv('loss_after', loss_val)
136 | self.logger.log_kv('time', (timer.time()-ts))
137 |
138 | def train(self, **kwargs):
139 | observations = np.concatenate([path["observations"] for path in self.expert_paths])
140 | expert_actions = np.concatenate([path["actions"] for path in self.expert_paths])
141 | data = dict(observations=observations, expert_actions=expert_actions)
142 | self.fit(data, **kwargs)
143 |
144 |
145 | def config_tqdm(range_inp, suppress_tqdm=False):
146 | if suppress_tqdm:
147 | return range_inp
148 | else:
149 | return tqdm(range_inp)
150 |
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/dapg.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.disable(logging.CRITICAL)
3 | import numpy as np
4 | import scipy as sp
5 | import scipy.sparse.linalg as spLA
6 | import copy
7 | import time as timer
8 | import torch
9 | import torch.nn as nn
10 | from torch.autograd import Variable
11 | import copy
12 |
13 | # samplers
14 | import mjrl.samplers.core as trajectory_sampler
15 |
16 | # utility functions
17 | import mjrl.utils.process_samples as process_samples
18 | from mjrl.utils.logger import DataLog
19 | from mjrl.utils.cg_solve import cg_solve
20 |
21 | # Import Algs
22 | from mjrl.algos.npg_cg import NPG
23 | from mjrl.algos.behavior_cloning import BC
24 |
25 | class DAPG(NPG):
26 | def __init__(self, env, policy, baseline,
27 | demo_paths=None,
28 | normalized_step_size=0.01,
29 | FIM_invert_args={'iters': 10, 'damping': 1e-4},
30 | hvp_sample_frac=1.0,
31 | seed=123,
32 | save_logs=False,
33 | kl_dist=None,
34 | lam_0=1.0, # demo coef
35 | lam_1=0.95, # decay coef
36 | **kwargs,
37 | ):
38 |
39 | self.env = env
40 | self.policy = policy
41 | self.baseline = baseline
42 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size
43 | self.seed = seed
44 | self.save_logs = save_logs
45 | self.FIM_invert_args = FIM_invert_args
46 | self.hvp_subsample = hvp_sample_frac
47 | self.running_score = None
48 | self.demo_paths = demo_paths
49 | self.lam_0 = lam_0
50 | self.lam_1 = lam_1
51 | self.iter_count = 0.0
52 | if save_logs: self.logger = DataLog()
53 |
54 | def train_from_paths(self, paths):
55 |
56 | # Concatenate from all the trajectories
57 | observations = np.concatenate([path["observations"] for path in paths])
58 | actions = np.concatenate([path["actions"] for path in paths])
59 | advantages = np.concatenate([path["advantages"] for path in paths])
60 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
61 |
62 | if self.demo_paths is not None and self.lam_0 > 0.0:
63 | demo_obs = np.concatenate([path["observations"] for path in self.demo_paths])
64 | demo_act = np.concatenate([path["actions"] for path in self.demo_paths])
65 | demo_adv = self.lam_0 * (self.lam_1 ** self.iter_count) * np.ones(demo_obs.shape[0])
66 | self.iter_count += 1
67 | # concatenate all
68 | all_obs = np.concatenate([observations, demo_obs])
69 | all_act = np.concatenate([actions, demo_act])
70 | all_adv = 1e-2*np.concatenate([advantages/(np.std(advantages) + 1e-8), demo_adv])
71 | else:
72 | all_obs = observations
73 | all_act = actions
74 | all_adv = advantages
75 |
76 | # cache return distributions for the paths
77 | path_returns = [sum(p["rewards"]) for p in paths]
78 | mean_return = np.mean(path_returns)
79 | std_return = np.std(path_returns)
80 | min_return = np.amin(path_returns)
81 | max_return = np.amax(path_returns)
82 | base_stats = [mean_return, std_return, min_return, max_return]
83 | self.running_score = mean_return if self.running_score is None else \
84 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters
85 | if self.save_logs: self.log_rollout_statistics(paths)
86 |
87 | # Keep track of times for various computations
88 | t_gLL = 0.0
89 | t_FIM = 0.0
90 |
91 | # Optimization algorithm
92 | # --------------------------
93 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
94 |
95 | # DAPG
96 | ts = timer.time()
97 | sample_coef = all_adv.shape[0]/advantages.shape[0]
98 | dapg_grad = sample_coef*self.flat_vpg(all_obs, all_act, all_adv)
99 | t_gLL += timer.time() - ts
100 |
101 | # NPG
102 | ts = timer.time()
103 | hvp = self.build_Hvp_eval([observations, actions],
104 | regu_coef=self.FIM_invert_args['damping'])
105 | npg_grad = cg_solve(hvp, dapg_grad, x_0=dapg_grad.copy(),
106 | cg_iters=self.FIM_invert_args['iters'])
107 | t_FIM += timer.time() - ts
108 |
109 | # Step size computation
110 | # --------------------------
111 | n_step_size = 2.0*self.kl_dist
112 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(dapg_grad.T, npg_grad) + 1e-20)))
113 |
114 | # Policy update
115 | # --------------------------
116 | curr_params = self.policy.get_param_values()
117 | new_params = curr_params + alpha * npg_grad
118 | self.policy.set_param_values(new_params, set_new=True, set_old=False)
119 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
120 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
121 | self.policy.set_param_values(new_params, set_new=True, set_old=True)
122 |
123 | # Log information
124 | if self.save_logs:
125 | self.logger.log_kv('alpha', alpha)
126 | self.logger.log_kv('delta', n_step_size)
127 | self.logger.log_kv('time_vpg', t_gLL)
128 | self.logger.log_kv('time_npg', t_FIM)
129 | self.logger.log_kv('kl_dist', kl_dist)
130 | self.logger.log_kv('surr_improvement', surr_after - surr_before)
131 | self.logger.log_kv('running_score', self.running_score)
132 | try:
133 | self.env.env.env.evaluate_success(paths, self.logger)
134 | except:
135 | # nested logic for backwards compatibility. TODO: clean this up.
136 | try:
137 | success_rate = self.env.env.env.evaluate_success(paths)
138 | self.logger.log_kv('success_rate', success_rate)
139 | except:
140 | pass
141 | return base_stats
142 |
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/mbac.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.disable(logging.CRITICAL)
3 | import numpy as np
4 | import time as timer
5 | import torch
6 | import torch.nn as nn
7 | from torch.autograd import Variable
8 | from mjrl.utils.logger import DataLog
9 | from tqdm import tqdm
10 | from mjrl.utils.gym_env import GymEnv
11 | from mjrl.policies.mpc_actor import MPCActor
12 | from mjrl.algos.behavior_cloning import BC
13 |
14 |
15 | class MBAC(BC):
16 | def __init__(self,
17 | env_name,
18 | policy,
19 | expert_paths = None, # for the initial seeding
20 | epochs = 5,
21 | batch_size = 64,
22 | lr = 1e-3,
23 | optimizer = None,
24 | loss_type = 'MSE', # can be 'MLE' or 'MSE'
25 | seed = 123,
26 | buffer_size = 50, # measured in number of trajectories
27 | mpc_params = None,
28 | save_logs = True,
29 | ):
30 |
31 | super().__init__(expert_paths=expert_paths,
32 | policy=policy,
33 | epochs=epochs,
34 | batch_size=batch_size,
35 | lr=lr,
36 | optimizer=optimizer,
37 | loss_type=loss_type,
38 | save_logs=save_logs,
39 | )
40 | self.expert_paths = [] if self.expert_paths is None else self.expert_paths
41 | self.buffer_size = buffer_size
42 |
43 | # For the MPC policy
44 | self.env = GymEnv(env_name)
45 | self.env.reset(seed=seed)
46 | if mpc_params is None:
47 | mean = np.zeros(self.env.action_dim)
48 | sigma = 1.0 * np.ones(self.env.action_dim)
49 | filter_coefs = [sigma, 0.05, 0.0, 0.0]
50 | mpc_params = dict(env=GymEnv(env_name), H=10,
51 | paths_per_cpu=25, num_cpu=1,
52 | kappa=10.0, gamma=1.0,
53 | mean=mean, filter_coefs=filter_coefs,
54 | seed=seed)
55 | else:
56 | mpc_params['env'] = GymEnv(env_name)
57 | mpc_params['seed'] = seed
58 |
59 | self.mpc_params = mpc_params
60 | self.mpc_policy = MPCActor(**mpc_params)
61 |
62 | def collect_paths(self, num_traj=10,
63 | mode='policy',
64 | horizon=None,
65 | render=False
66 | ):
67 | horizon = self.env.horizon if horizon is None else horizon
68 | paths = []
69 | for i in tqdm(range(num_traj)):
70 | self.env.reset()
71 | obs, act_pi, act_mpc, rew, states = [], [], [], [], []
72 | for t in range(horizon):
73 | o = self.env.get_obs()
74 | s = self.env.get_env_state()
75 | a_pi = self.policy.get_action(o)[0]
76 | a_mpc = self.mpc_policy.get_action(s)
77 | a = a_pi if mode == 'policy' else a_mpc
78 | next_o, r, done, _ = self.env.step(a)
79 | if render:
80 | self.env.render()
81 | # store data
82 | obs.append(o)
83 | rew.append(r)
84 | states.append(s)
85 | act_pi.append(a_pi)
86 | act_mpc.append(a_mpc)
87 | # kill if done
88 | if done:
89 | break
90 | path = dict(observations=np.array(obs),
91 | actions=np.array(act_pi),
92 | expert_actions=np.array(act_mpc),
93 | rewards=np.array(rew),
94 | states=states,
95 | )
96 | paths.append(path)
97 | return paths
98 |
99 | def add_paths_to_buffer(self, paths):
100 | for path in paths:
101 | self.expert_paths.append(path)
102 | if len(self.expert_paths) > self.buffer_size:
103 | # keep recent trajectories
104 | # TODO: Also consider keeping best performing trajectories
105 | self.expert_paths = self.expert_paths[-self.buffer_size:]
106 | if self.save_logs:
107 | self.logger.log_kv('buffer_size', len(self.expert_paths))
108 |
109 | def get_data_from_buffer(self):
110 | observations = np.concatenate([path["observations"] for path in self.expert_paths])
111 | expert_actions = np.concatenate([path["expert_actions"] for path in self.expert_paths])
112 | observations = torch.Tensor(observations).float()
113 | expert_actions = torch.Tensor(expert_actions).float()
114 | data = dict(observations=observations, expert_actions=expert_actions)
115 | return data
116 |
117 | def train_step(self, num_traj=10, **kwargs):
118 | # collect data using policy actions
119 | # fit policy to expert actions on these states
120 | new_paths = self.collect_paths(num_traj, mode='policy')
121 | self.add_paths_to_buffer(new_paths)
122 | data = self.get_data_from_buffer()
123 | self.fit(data, **kwargs)
124 | stoc_pol_perf = np.mean([np.sum(path['rewards']) for path in new_paths])
125 | return stoc_pol_perf
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/model_accel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/algos/model_accel/__init__.py
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/model_accel/model_learning_mpc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mjrl.algos.model_accel.sampling import generate_paths, generate_perturbed_actions, trajectory_rollout
3 |
4 |
5 | class MPCPolicy(object):
6 | def __init__(self, env,
7 | plan_horizon,
8 | plan_paths=10,
9 | kappa=1.0,
10 | gamma=1.0,
11 | mean=None,
12 | filter_coefs=None,
13 | seed=123,
14 | warmstart=True,
15 | fitted_model=None,
16 | omega=5.0,
17 | **kwargs,
18 | ):
19 |
20 | # initialize
21 | self.env, self.seed = env, seed
22 | self.n, self.m = env.observation_dim, env.action_dim
23 | self.plan_horizon, self.num_traj = plan_horizon, plan_paths
24 |
25 | if fitted_model is None:
26 | print("Policy requires a fitted dynamics model")
27 | quit()
28 | else:
29 | self.fitted_model = fitted_model
30 |
31 | # initialize other params
32 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma
33 | if mean is None:
34 | self.mean = np.zeros(self.m)
35 | if filter_coefs is None:
36 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0]
37 | self.act_sequence = np.ones((self.plan_horizon, self.m)) * self.mean
38 | self.init_act_sequence = self.act_sequence.copy()
39 | self.warmstart = warmstart
40 | self.omega = omega
41 |
42 | def get_action(self, obs):
43 | # generate paths
44 | if type(self.fitted_model) == list:
45 |
46 | # Ensemble case
47 | # Collect trajectories from different models with same action sequences
48 | base_act = self.act_sequence
49 | act_list = [generate_perturbed_actions(base_act, self.filter_coefs)
50 | for _ in range(self.num_traj)]
51 | actions = np.array(act_list)
52 | paths_list = []
53 | for model in self.fitted_model:
54 | paths = trajectory_rollout(actions, model, obs)
55 | self.env.env.env.compute_path_rewards(paths)
56 | paths_list.append(paths)
57 | # consolidate paths
58 | paths = dict()
59 | for k in paths_list[0].keys():
60 | v = np.vstack([p[k] for p in paths_list])
61 | paths[k] = v
62 | R = self.score_trajectory_ensemble(paths, paths_list)
63 |
64 | else:
65 | paths = generate_paths(num_traj=self.num_traj, fitted_model=self.fitted_model,
66 | start_state=obs, base_act=self.act_sequence, filter_coefs=self.filter_coefs)
67 | self.env.env.env.compute_path_rewards(paths) # will populate path['rewards']
68 | R = self.score_trajectory(paths)
69 |
70 | S = np.exp(self.kappa * (R - np.max(R)))
71 | act = paths["actions"]
72 |
73 | weighted_seq = S * act.T
74 | act_sequence = np.sum(weighted_seq.T, axis=0) / (np.sum(S) + 1e-6)
75 | action = act_sequence[0].copy()
76 |
77 | # get updated action sequence
78 | if self.warmstart:
79 | self.act_sequence[:-1] = act_sequence[1:]
80 | self.act_sequence[-1] = self.mean.copy()
81 | else:
82 | self.act_sequence = self.init_act_sequence.copy()
83 | return action
84 |
85 | def score_trajectory_ensemble(self, paths, paths_list):
86 | num_traj = self.num_traj
87 | num_models = len(paths_list)
88 | total_traj = paths['rewards'].shape[0]
89 | horizon = paths['rewards'].shape[1]
90 | predictions = [p['observations'] for p in paths_list]
91 | disagreement = np.std(predictions, axis=0) # (num_traj, horizon, state_dim)
92 | disagreement = np.sum(disagreement, axis=(1,2)) # (num_traj,)
93 | scores = np.zeros(total_traj)
94 | for i in range(total_traj):
95 | disagreement_score = disagreement[i % self.num_traj]  # disagreement is per action sequence (trajectories are stacked model-major)
96 | scores[i] = self.omega * disagreement_score
97 | for t in range(horizon):
98 | scores[i] += (self.gamma ** t) * paths["rewards"][i][t]
99 | return scores
100 |
101 | def score_trajectory(self, paths):
102 | # rewards shape: (num_traj, horizon)
103 | num_traj = paths["rewards"].shape[0]
104 | horizon = paths["rewards"].shape[1]
105 | scores = np.zeros(num_traj)
106 | for i in range(num_traj):
107 | scores[i] = 0.0
108 | for t in range(horizon):
109 | scores[i] += (self.gamma**t)*paths["rewards"][i][t]
110 | return scores
111 |
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/model_accel/run_experiments/configs/point_mass.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env_name' : 'mjrl_point_mass-v0',
6 | 'seed' : 123,
7 | 'debug_mode' : False,
8 | 'num_iter' : 5,
9 | 'iter_samples' : 100,
10 | 'eval_rollouts' : 25,
11 | 'num_models' : 3,
12 | 'exp_notes' : 'Toy experiment for initial trial.',
13 | 'save_freq' : 1,
14 | 'device' : 'cpu',
15 | 'learn_reward' : False,
16 | 'reward_file' : 'utils/reward_functions/mjrl_point_mass.py',
17 |
18 | # dynamics learning
19 |
20 | 'hidden_size' : (256, 256),
21 | 'activation' : 'relu',
22 | 'fit_lr' : 1e-3,
23 | 'fit_wd' : 1e-5,
24 | 'buffer_size' : 10000,
25 | 'fit_mb_size' : 16,
26 | 'fit_epochs' : 25,
27 | 'refresh_fit' : False,
28 |
29 | # initial data
30 |
31 | 'init_log_std' : -0.5,
32 | 'min_log_std' : -2.0,
33 | 'init_samples' : 1000,
34 |
35 | # NPG params
36 |
37 | 'policy_size' : (32, 32),
38 | 'inner_steps' : 10,
39 | 'step_size' : 0.05,
40 | 'update_paths' : 250,
41 | 'start_state' : 'init',
42 | 'horizon' : 25,
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/model_accel/run_experiments/configs/reacher.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env_name' : 'mjrl_reacher_7dof-v0',
6 | 'seed' : 123,
7 | 'debug_mode' : False,
8 | 'num_iter' : 25,
9 | 'iter_samples' : 500,
10 | 'eval_rollouts' : 10,
11 | 'num_models' : 4,
12 | 'save_freq' : 1,
13 | 'device' : 'cpu',
14 |
15 | # dynamics learning
16 |
17 | 'hidden_size' : (256, 256),
18 | 'activation' : 'relu',
19 | 'fit_lr' : 1e-3,
20 | 'fit_wd' : 0.0,
21 | 'buffer_size' : 20000,
22 | 'fit_mb_size' : 64,
23 | 'fit_epochs' : 20,
24 | 'refresh_fit' : False,
25 |
26 | # initial data
27 |
28 | 'init_log_std' : -0.5,
29 | 'min_log_std' : -2.5,
30 | 'init_samples' : 2500,
31 | 'init_policy' : None,
32 |
33 |
34 | # NPG params
35 |
36 | 'policy_size' : (64, 64),
37 | 'inner_steps' : 5,
38 | 'step_size' : 0.05,
39 | 'update_paths' : 250,
40 | 'start_state' : 'init',
41 | 'horizon' : 50,
42 |
43 | }
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/model_accel/run_experiments/sandbox/example_config_mpc.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env_name' : 'mjrl_point_mass-v0',
6 | 'seed' : 123,
7 | 'debug_mode' : False,
8 | 'num_iter' : 5,
9 | 'paths_per_iter': 5,
10 | 'eval_rollouts' : 10,
11 | 'num_models' : 3,
12 | 'exp_notes' : 'Toy experiment for initial trial.',
13 | 'save_freq' : 5,
14 | 'device' : 'cpu',
15 |
16 | # dynamics learning
17 |
18 | 'hidden_size' : (64, 64),
19 | 'activation' : 'relu',
20 | 'fit_lr' : 1e-3,
21 | 'fit_wd' : 1e-5,
22 | 'max_paths' : 1000,
23 | 'fit_mb_size' : 16,
24 | 'fit_epochs' : 25,
25 | 'refresh_fit' : True,
26 |
27 | # initial data
28 |
29 | 'init_log_std' : -0.5,
30 | 'n_init_paths' : 25,
31 | 'use_demos' : False,
32 | 'demo_file' : None,
33 |
34 | # model predictive control
35 |
36 | 'noisy_mpc' : True, # when collecting data for exploration
37 | 'noise_level' : 0.1,
38 | 'filter_coefs' : {'f1': 0.5, 'f2': 1.0, 'f3': 0.0, 'f4': 0.0},
39 | 'plan_paths' : 200,
40 | 'plan_horizon' : 10,
41 | 'kappa' : 2.0,
42 | 'omega' : 0.0,
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/model_accel/run_experiments/sandbox/run_model_learning_mpc.py:
--------------------------------------------------------------------------------
1 | """
2 | Job script to optimize trajectories with fitted model
3 | """
4 |
5 | import numpy as np
6 | import copy
7 | import torch
8 | import torch.nn as nn
9 | import pickle
10 | import mjrl.envs
11 | import time as timer
12 | import argparse
13 | import os
14 | import json
15 | import mjrl.samplers.core as trajectory_sampler
16 | import mjrl.utils.tensor_utils as tensor_utils
17 | from tqdm import tqdm
18 | from tabulate import tabulate
19 | from mjrl.policies.gaussian_mlp import MLP
20 | from mjrl.baselines.mlp_baseline import MLPBaseline
21 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
22 | from mjrl.utils.gym_env import GymEnv
23 | from mjrl.utils.logger import DataLog
24 | from mjrl.utils.make_train_plots import make_train_plots
25 | from mjrl.algos.model_accel.nn_dynamics import DynamicsModel
26 | from mjrl.algos.model_accel.model_learning_mpc import MPCPolicy
27 | from mjrl.algos.model_accel.sampling import sample_paths, evaluate_policy
28 |
29 |
30 | # ===============================================================================
31 | # Get command line arguments
32 | # ===============================================================================
33 |
34 | parser = argparse.ArgumentParser(description='Trajectory Optimization with fitted models.')
35 | parser.add_argument('--output', type=str, required=True, help='location to store results')
36 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params')
37 | args = parser.parse_args()
38 | OUT_DIR = args.output
39 | if not os.path.exists(OUT_DIR):
40 | os.mkdir(OUT_DIR)
41 | with open(args.config, 'r') as f:
42 | job_data = eval(f.read())
43 |
44 | # Unpack args and make files for easy access
45 | logger = DataLog()
46 | ENV_NAME = job_data['env_name']
47 | PICKLE_FILE = OUT_DIR + '/exp_results.pickle'
48 | EXP_FILE = OUT_DIR + '/job_data.json'
49 | SEED = job_data['seed']
50 | job_data['filter_coefs'] = [job_data['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']]
51 |
52 | # base cases
53 | if 'num_models' not in job_data.keys():
54 | job_data['num_models'] = 1
55 | if job_data['num_models'] == 1 or 'omega' not in job_data.keys():
56 | job_data['omega'] = 0.0
57 | if 'eval_rollouts' not in job_data.keys():
58 | job_data['eval_rollouts'] = 0
59 | if 'save_freq' not in job_data.keys():
60 | job_data['save_freq'] = 10
61 | if 'device' not in job_data.keys():
62 | job_data['device'] = 'cpu'
63 | if 'debug_mode' in job_data.keys():
64 | DEBUG = job_data['debug_mode']
65 | else:
66 | DEBUG =False
67 | if 'device_path' not in job_data.keys():
68 | job_data['device_path'] = None
69 | with open(EXP_FILE, 'w') as f:
70 | json.dump(job_data, f, indent=4)
71 |
72 | del(job_data['seed'])
73 | job_data['base_seed'] = SEED
74 |
75 | # ===============================================================================
76 | # Train loop
77 | # ===============================================================================
78 |
79 | np.random.seed(SEED)
80 | torch.random.manual_seed(SEED)
81 |
82 | # TODO(Aravind): Map to hardware if device_path is specified
83 |
84 | e = GymEnv(ENV_NAME)
85 | e.set_seed(SEED)
86 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+i, **job_data)
87 | for i in range(job_data['num_models'])]
88 | exploratory_policy = MLP(e.spec, seed=SEED, init_log_std=job_data['init_log_std'])
89 | paths = []
90 |
91 | for outer_iter in range(job_data['num_iter']):
92 |
93 | ts = timer.time()
94 | print("================> ITERATION : %i " % outer_iter)
95 | print("Getting interaction data from real dynamics ...")
96 |
97 | if outer_iter == 0:
98 | iter_paths = trajectory_sampler.sample_paths(job_data['n_init_paths'], e,
99 | exploratory_policy,
100 | eval_mode=False, base_seed=SEED)
101 | else:
102 | iter_paths = sample_paths(job_data['paths_per_iter'],
103 | mpc_policy.env, mpc_policy,
104 | eval_mode=(not job_data['noisy_mpc']),
105 | noise_level=job_data['noise_level'],
106 | base_seed=SEED + outer_iter)
107 |
108 | # reset the environment (good for hardware)
109 | e.reset()
110 |
111 | for p in iter_paths:
112 | paths.append(p)
113 |
114 | if len(paths) > job_data['max_paths']:
115 | diff = len(paths) - job_data['max_paths']
116 | paths[:diff] = []
117 |
118 | s = np.concatenate([p['observations'][:-1] for p in paths])
119 | a = np.concatenate([p['actions'][:-1] for p in paths])
120 | sp = np.concatenate([p['observations'][1:] for p in paths])
121 | r = np.array([np.sum(p['rewards']) for p in iter_paths])
122 | rollout_score = np.mean(r)
123 |
124 | logger.log_kv('fit_epochs', job_data['fit_epochs'])
125 | logger.log_kv('rollout_score', rollout_score)
126 | try:
127 | rollout_metric = e.env.env.evaluate_success(iter_paths)
128 | logger.log_kv('rollout_metric', rollout_metric)
129 | except:
130 | pass
131 |
132 | print("Data gathered, fitting model ...")
133 | if job_data['refresh_fit']:
134 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+123*outer_iter,
135 | **job_data) for i in range(job_data['num_models'])]
136 |
137 | for i, model in enumerate(models):
138 | epoch_loss = model.fit(s, a, sp, job_data['fit_mb_size'], job_data['fit_epochs'])
139 | logger.log_kv('loss_before_' + str(i), epoch_loss[0])
140 | logger.log_kv('loss_after_' + str(i), epoch_loss[-1])
141 |
142 | mpc_policy = MPCPolicy(env=e, fitted_model=models, seed=SEED+12345*outer_iter, **job_data)
143 |
144 | if job_data['eval_rollouts'] > 0:
145 | print("Performing validation rollouts ... ")
146 | eval_paths = evaluate_policy(mpc_policy.env, mpc_policy, mpc_policy.fitted_model[0], noise_level=0.0,
147 | real_step=True, num_episodes=job_data['eval_rollouts'], visualize=False)
148 | eval_score = np.mean([np.sum(p['rewards']) for p in eval_paths])
149 | logger.log_kv('eval_score', eval_score)
150 | try:
151 | eval_metric = e.env.env.evaluate_success(eval_paths)
152 | logger.log_kv('eval_metric', eval_metric)
153 | except:
154 | pass
155 | else:
156 | eval_paths = []
157 |
158 | exp_data = dict(policy=mpc_policy, fitted_model=mpc_policy.fitted_model,
159 | log=logger.log, rollout_paths=iter_paths, eval_paths=eval_paths)
160 | if outer_iter > 0 and outer_iter % job_data['save_freq'] == 0:
161 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb'))
162 | pickle.dump(exp_data, open(OUT_DIR + '/iteration_' + str(outer_iter) + '.pickle', 'wb'))
163 |
164 | tf = timer.time()
165 | logger.log_kv('iter_time', tf-ts)
166 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1,
167 | logger.get_current_log().items()))
168 | print(tabulate(print_data))
169 | logger.save_log(OUT_DIR+'/')
170 | make_train_plots(log=logger.log, keys=['rollout_score', 'eval_score', 'rollout_metric', 'eval_metric'],
171 | save_loc=OUT_DIR+'/')
172 |
173 | if job_data['debug_mode']:
174 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], False, 5, True)
175 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], True, 5, True)
176 |
177 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb')) # final save
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/mjrl_point_mass.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def reward_function(paths):
4 | # path has two keys: observations and actions
5 | # path["observations"] : (num_traj, horizon, obs_dim)
6 | # return paths that contain rewards in path["rewards"]
7 | # path["rewards"] should have shape (num_traj, horizon)
8 | obs = paths["observations"]
9 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs
10 | agent_pos = obs[:, :, :2]
11 | target_pos = obs[:, :, -2:]
12 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1)
13 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1)
14 | rewards = -1.0 * l1_dist - 0.5 * l2_dist
15 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s')
16 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
17 | return paths
18 |
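19 | # A minimal shape-check sketch of the contract documented above (hypothetical toy values;
20 | # obs_dim=6 is an assumption, and the block is commented out so it is never executed):
21 | # if __name__ == '__main__':
22 | #     dummy = dict(observations=np.random.randn(4, 25, 6), actions=np.random.randn(4, 25, 2))
23 | #     out = reward_function(dummy)
24 | #     assert out["rewards"].shape == (4, 25)  # (num_traj, horizon)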
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/model_accel/run_experiments/utils/visualize_policy.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import mjrl.envs
3 | import trajopt.envs
4 | import mj_envs
5 | import click
6 | import os
7 | import gym
8 | import numpy as np
9 | import pickle
10 | import torch
11 | from mjrl.utils.gym_env import GymEnv
12 | from mjrl.policies.gaussian_mlp import MLP
13 | import trajopt.envs
14 |
15 | DESC = '''
16 | Helper script to visualize policy (in mjrl format).\n
17 | USAGE:\n
18 | Visualizes policy on the env\n
19 | $ python utils/visualize_policy.py --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n
20 | '''
21 |
22 | # MAIN =========================================================
23 | @click.command(help=DESC)
24 | @click.option('--env_name', type=str, help='environment to load', required=True)
25 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None)
26 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation')
27 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123)
28 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10)
29 | @click.option('--log_std', type=float, default=-0.5)
30 | @click.option('--terminate', type=bool, default=True)
31 | @click.option('--device_path', type=str, default=None)
32 | def main(env_name, policy, mode, seed, episodes, log_std, terminate, device_path):
33 | render = True
34 |
35 | # TODO(Aravind): Map to hardware if device_path is specified
36 |
37 | e = GymEnv(env_name)
38 | e.set_seed(seed)
39 | np.random.seed(seed)
40 | torch.manual_seed(seed)
41 | if policy is not None:
42 | policy = pickle.load(open(policy, 'rb'))
43 | else:
44 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=log_std)
45 |
46 | for ep in range(episodes):
47 | o = e.reset()
48 | rew = 0.0
49 | t = 0
50 | done = False
51 | while t < e.horizon and done is False:
52 | o = e.get_obs()
53 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation']
54 | next_o, r, done, ifo = e.step(a)
55 | if terminate is False:
56 | done = False
57 | rew = rew + r
58 | t = t + 1
59 | if render:
60 | e.render()
61 | if done and t < e.horizon - 1:
62 | print("Episode terminated early")
63 | print("episode score = %f " % rew)
64 |
65 | e.reset()
66 |
67 |
68 | if __name__ == '__main__':
69 | main()
70 |
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/model_accel/run_experiments/utils/visualize_trajectories.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import click
3 | import json
4 | import numpy as np
5 | import torch
6 | import mjrl.envs
7 | import trajopt.envs
8 | import mj_envs
9 | import mjrl.utils.tensor_utils as tensor_utils
10 |
11 | from mjrl.utils.gym_env import GymEnv
12 | from mjrl.algos.model_accel.sampling import evaluate_policy
13 |
14 | DESC = '''
15 | Helper script to visualize optimized trajectories (list of trajectories in trajopt format).\n
16 | USAGE:\n
17 | $ python visualize_trajectories.py --file path_to_file.pickle\n
18 | '''
19 | @click.command(help=DESC)
20 | @click.option('--file', type=str, help='pickle file with trajectories', required=True)
21 | @click.option('--seed', type=int, default=123)
22 | @click.option('--noise_level', type=float, default=0.0)
23 | @click.option('--num_episodes', type=int, help='number of times to play trajectories', default=5)
24 | @click.option('--config', type=str, help='if provided MPC params from here will be used.', default=None)
25 | @click.option('--device_path', type=str, default=None)
26 | def main(file, seed, noise_level, num_episodes, config, device_path):
27 | exp_data = pickle.load(open(file, 'rb'))
28 | policy = exp_data['policy']
29 | model = exp_data['fitted_model']
30 | model = model[-1] if type(model) == list else model
31 | env_id = policy.env.env_id
32 | render = True
33 |
34 | # TODO(Aravind): Map to hardware if device_path is specified
35 |
36 | env = GymEnv(env_id)
37 | policy.env = env
38 |
39 | env.set_seed(seed)
40 | np.random.seed(seed)
41 | torch.manual_seed(seed)
42 |
43 | if config is not None:
44 | try:
45 | with open(config, 'r') as f:
46 | config = eval(f.read())
47 | except:
48 | with open(config, 'r') as f:
49 | config = json.load(f)
50 | policy.plan_horizon = config['plan_horizon']
51 | policy.num_traj = config['plan_paths']
52 | policy.kappa = config['kappa']
53 | policy.filter_coefs = [config['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']]
54 | policy.omega = config['omega'] if 'omega' in config.keys() else 0.0
55 |
56 | # TODO(Aravind): Implement capability to set predicted state for rendering purposes
57 | # evaluate_policy(env, policy, model, noise_level, real_step=False, num_episodes=num_episodes, visualize=render)
58 | evaluate_policy(env, policy, model, noise_level, real_step=True, num_episodes=num_episodes, visualize=render)
59 |
60 | # final close out
61 | env.reset()
62 |
63 |
64 | if __name__ == '__main__':
65 | main()
66 |
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/ppo_clip.py:
--------------------------------------------------------------------------------
1 | import logging
2 | #logging.disable(logging.CRITICAL)
3 | import numpy as np
4 | import scipy as sp
5 | import scipy.sparse.linalg as spLA
6 | import copy
7 | import time as timer
8 | import torch
9 | import torch.nn as nn
10 | from torch.autograd import Variable
11 | import copy
12 |
13 | # samplers
14 | import mjrl.samplers.core as trajectory_sampler
15 |
16 | # utility functions
17 | import mjrl.utils.process_samples as process_samples
18 | from mjrl.utils.logger import DataLog
19 | from mjrl.utils.cg_solve import cg_solve
20 | from mjrl.algos.batch_reinforce import BatchREINFORCE
21 |
22 |
23 | class PPO(BatchREINFORCE):
24 | def __init__(self, env, policy, baseline,
25 | clip_coef = 0.2,
26 | epochs = 10,
27 | mb_size = 64,
28 | learn_rate = 3e-4,
29 | seed = 123,
30 | save_logs = False,
31 | **kwargs
32 | ):
33 |
34 | self.env = env
35 | self.policy = policy
36 | self.baseline = baseline
37 | self.learn_rate = learn_rate
38 | self.seed = seed
39 | self.save_logs = save_logs
40 | self.clip_coef = clip_coef
41 | self.epochs = epochs
42 | self.mb_size = mb_size
43 | self.running_score = None
44 | if save_logs: self.logger = DataLog()
45 |
46 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=learn_rate)
47 |
48 | def PPO_surrogate(self, observations, actions, advantages):
49 | adv_var = Variable(torch.from_numpy(advantages).float(), requires_grad=False)
50 | old_dist_info = self.policy.old_dist_info(observations, actions)
51 | new_dist_info = self.policy.new_dist_info(observations, actions)
52 | LR = self.policy.likelihood_ratio(new_dist_info, old_dist_info)
53 | LR_clip = torch.clamp(LR, min=1-self.clip_coef, max=1+self.clip_coef)
54 | ppo_surr = torch.mean(torch.min(LR*adv_var,LR_clip*adv_var))
55 | return ppo_surr
56 |
57 | # ----------------------------------------------------------
58 | def train_from_paths(self, paths):
59 |
60 | # Concatenate from all the trajectories
61 | observations = np.concatenate([path["observations"] for path in paths])
62 | actions = np.concatenate([path["actions"] for path in paths])
63 | advantages = np.concatenate([path["advantages"] for path in paths])
64 | # Advantage whitening
65 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
66 | # NOTE : advantage should be zero mean in expectation
67 | # normalized step size invariant to advantage scaling,
68 | # but scaling can help with least squares
69 |
70 | # cache return distributions for the paths
71 | path_returns = [sum(p["rewards"]) for p in paths]
72 | mean_return = np.mean(path_returns)
73 | std_return = np.std(path_returns)
74 | min_return = np.amin(path_returns)
75 | max_return = np.amax(path_returns)
76 | base_stats = [mean_return, std_return, min_return, max_return]
77 | self.running_score = mean_return if self.running_score is None else \
78 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters
79 | if self.save_logs: self.log_rollout_statistics(paths)
80 |
81 | # Optimization algorithm
82 | # --------------------------
83 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
84 | params_before_opt = self.policy.get_param_values()
85 |
86 | ts = timer.time()
87 | num_samples = observations.shape[0]
88 | for ep in range(self.epochs):
89 | for mb in range(int(num_samples / self.mb_size)):
90 | rand_idx = np.random.choice(num_samples, size=self.mb_size)
91 | obs = observations[rand_idx]
92 | act = actions[rand_idx]
93 | adv = advantages[rand_idx]
94 | self.optimizer.zero_grad()
95 | loss = - self.PPO_surrogate(obs, act, adv)
96 | loss.backward()
97 | self.optimizer.step()
98 |
99 | params_after_opt = self.policy.get_param_values()
100 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
101 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
102 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)
103 | t_opt = timer.time() - ts
104 |
105 | # Log information
106 | if self.save_logs:
107 | self.logger.log_kv('t_opt', t_opt)
108 | self.logger.log_kv('kl_dist', kl_dist)
109 | self.logger.log_kv('surr_improvement', surr_after - surr_before)
110 | self.logger.log_kv('running_score', self.running_score)
111 | try:
112 | self.env.env.env.evaluate_success(paths, self.logger)
113 | except:
114 | # nested logic for backwards compatibility. TODO: clean this up.
115 | try:
116 | success_rate = self.env.env.env.evaluate_success(paths)
117 | self.logger.log_kv('success_rate', success_rate)
118 | except:
119 | pass
120 |
121 | return base_stats
122 |
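Note: PPO_surrogate above is the standard clipped PPO objective, mean(min(r*A, clip(r, 1-eps, 1+eps)*A)), maximized with Adam over minibatches. A minimal, self-contained sketch of the same computation on synthetic tensors (the log-probabilities and advantages below are made up and do not come from this repo's policy classes):

# Sketch of the clipped surrogate on toy data; only torch is assumed.
import torch

clip_coef = 0.2
log_prob_new = torch.tensor([-1.0, -0.8, -1.3])   # log pi_new(a|s)
log_prob_old = torch.tensor([-1.1, -0.9, -1.0])   # log pi_old(a|s)
advantages   = torch.tensor([ 0.5, -0.2,  1.0])   # whitened advantages

ratio = torch.exp(log_prob_new - log_prob_old)                 # likelihood ratio
ratio_clip = torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
ppo_surr = torch.mean(torch.min(ratio * advantages,
                                ratio_clip * advantages))      # objective to maximize
loss = -ppo_surr                                               # what the Adam step minimizes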
--------------------------------------------------------------------------------
/mjrl/mjrl/algos/trpo.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.disable(logging.CRITICAL)
3 | import numpy as np
4 | import scipy as sp
5 | import scipy.sparse.linalg as spLA
6 | import copy
7 | import time as timer
8 | import torch
9 | import torch.nn as nn
10 | from torch.autograd import Variable
11 | import copy
12 |
13 | # samplers
14 | import mjrl.samplers.core as trajectory_sampler
15 | # import mjrl.samplers.batch_sampler as batch_sampler  # no batch_sampler module ships with this repo; unused below
16 |
17 | # utility functions
18 | import mjrl.utils.process_samples as process_samples
19 | from mjrl.utils.logger import DataLog
20 | from mjrl.utils.cg_solve import cg_solve
21 |
22 | # Import NPG
23 | from mjrl.algos.npg_cg import NPG
24 |
25 | class TRPO(NPG):
26 | def __init__(self, env, policy, baseline,
27 | kl_dist=0.01,
28 | FIM_invert_args={'iters': 10, 'damping': 1e-4},
29 | hvp_sample_frac=1.0,
30 | seed=123,
31 | save_logs=False,
32 | normalized_step_size=0.01,
33 | **kwargs
34 | ):
35 | """
36 | All inputs are expected in mjrl's format unless specified
37 | :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
38 | :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
39 | :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
40 |         :param FIM_invert_args: {'iters': number of CG iterations, 'damping': regularization amount used when solving with CG}
41 | :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
42 | :param seed: random seed
43 | """
44 |
45 | self.env = env
46 | self.policy = policy
47 | self.baseline = baseline
48 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size
49 | self.seed = seed
50 | self.save_logs = save_logs
51 | self.FIM_invert_args = FIM_invert_args
52 | self.hvp_subsample = hvp_sample_frac
53 | self.running_score = None
54 | if save_logs: self.logger = DataLog()
55 |
56 | def train_from_paths(self, paths):
57 |
58 | # Concatenate from all the trajectories
59 | observations = np.concatenate([path["observations"] for path in paths])
60 | actions = np.concatenate([path["actions"] for path in paths])
61 | advantages = np.concatenate([path["advantages"] for path in paths])
62 | # Advantage whitening
63 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
64 | # NOTE : advantage should be zero mean in expectation
65 | # normalized step size invariant to advantage scaling,
66 | # but scaling can help with least squares
67 |
68 | # cache return distributions for the paths
69 | path_returns = [sum(p["rewards"]) for p in paths]
70 | mean_return = np.mean(path_returns)
71 | std_return = np.std(path_returns)
72 | min_return = np.amin(path_returns)
73 | max_return = np.amax(path_returns)
74 | base_stats = [mean_return, std_return, min_return, max_return]
75 | self.running_score = mean_return if self.running_score is None else \
76 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters
77 | if self.save_logs: self.log_rollout_statistics(paths)
78 |
79 | # Keep track of times for various computations
80 | t_gLL = 0.0
81 | t_FIM = 0.0
82 |
83 | # Optimization algorithm
84 | # --------------------------
85 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
86 |
87 | # VPG
88 | ts = timer.time()
89 | vpg_grad = self.flat_vpg(observations, actions, advantages)
90 | t_gLL += timer.time() - ts
91 |
92 | # NPG
93 | ts = timer.time()
94 | hvp = self.build_Hvp_eval([observations, actions],
95 | regu_coef=self.FIM_invert_args['damping'])
96 | npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(),
97 | cg_iters=self.FIM_invert_args['iters'])
98 | t_FIM += timer.time() - ts
99 |
100 | # Step size computation
101 | # --------------------------
102 | n_step_size = 2.0*self.kl_dist
103 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))
104 |
105 | # Policy update
106 | # --------------------------
107 | curr_params = self.policy.get_param_values()
108 | for k in range(100):
109 | new_params = curr_params + alpha * npg_grad
110 | self.policy.set_param_values(new_params, set_new=True, set_old=False)
111 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
112 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
113 | if kl_dist < self.kl_dist:
114 | break
115 | else:
116 | alpha = 0.9*alpha # backtrack
117 | print("Step size too high. Backtracking. | kl = %f | surr diff = %f" % \
118 | (kl_dist, surr_after-surr_before) )
119 | if k == 99:
120 | alpha = 0.0
121 |
122 | new_params = curr_params + alpha * npg_grad
123 | self.policy.set_param_values(new_params, set_new=True, set_old=False)
124 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
125 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
126 | self.policy.set_param_values(new_params, set_new=True, set_old=True)
127 |
128 | # Log information
129 | if self.save_logs:
130 | self.logger.log_kv('alpha', alpha)
131 | self.logger.log_kv('delta', n_step_size)
132 | self.logger.log_kv('time_vpg', t_gLL)
133 | self.logger.log_kv('time_npg', t_FIM)
134 | self.logger.log_kv('kl_dist', kl_dist)
135 | self.logger.log_kv('surr_improvement', surr_after - surr_before)
136 | self.logger.log_kv('running_score', self.running_score)
137 | try:
138 | self.env.env.env.evaluate_success(paths, self.logger)
139 | except:
140 | # nested logic for backwards compatibility. TODO: clean this up.
141 | try:
142 | success_rate = self.env.env.env.evaluate_success(paths)
143 | self.logger.log_kv('success_rate', success_rate)
144 | except:
145 | pass
146 |
147 | return base_stats
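Note: the step-size rule above is alpha = sqrt(2*kl_dist / (g^T F^{-1} g)), where F^{-1} g is approximated by cg_solve over Fisher-vector products, followed by backtracking until the realized KL falls below kl_dist. A toy sketch of the step-size computation with a hand-built SPD matrix standing in for the Fisher information (an assumption made only for illustration):

# Normalized step size under a known SPD "Fisher" matrix (toy values).
import numpy as np

kl_dist = 0.01                           # desired KL between successive policies
g = np.array([0.3, -0.1, 0.4])           # vanilla policy gradient (synthetic)
F = np.array([[2.0, 0.1, 0.0],
              [0.1, 1.5, 0.2],
              [0.0, 0.2, 1.0]])          # SPD stand-in for the FIM

npg_dir = np.linalg.solve(F, g)          # what cg_solve approximates iteratively
n_step_size = 2.0 * kl_dist
alpha = np.sqrt(np.abs(n_step_size / (g.dot(npg_dir) + 1e-20)))
print(alpha)                             # initial step size, before any backtracking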
--------------------------------------------------------------------------------
/mjrl/mjrl/baselines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/baselines/__init__.py
--------------------------------------------------------------------------------
/mjrl/mjrl/baselines/linear_baseline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 |
4 |
5 | class LinearBaseline:
6 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-5):
7 | self.inp = inp
8 | self._reg_coeff = reg_coeff
9 | self._coeffs = None
10 |
11 | def _features(self, paths):
12 | if self.inp == 'env_features':
13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths])
14 | else:
15 | o = np.concatenate([path["observations"] for path in paths])
16 | o = np.clip(o, -10, 10)/10.0
17 | if o.ndim > 2:
18 | o = o.reshape(o.shape[0], -1)
19 | N, n = o.shape
20 | num_feat = int( n + 1 + 4 ) # linear + bias (1.0) + time till pow 4
21 | feat_mat = np.ones((N, num_feat))
22 |
23 | # linear features
24 | feat_mat[:,:n] = o
25 |
26 | k = 0 # start from this row
27 | for i in range(len(paths)):
28 | l = len(paths[i]["rewards"])
29 | al = np.arange(l)/1000.0
30 | for j in range(4):
31 | feat_mat[k:k+l, -4+j] = al**(j+1)
32 | k += l
33 |
34 | return feat_mat
35 |
36 | def fit(self, paths, return_errors=False):
37 |
38 | featmat = self._features(paths)
39 | returns = np.concatenate([path["returns"] for path in paths])
40 |
41 | if return_errors:
42 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape)
43 | errors = returns - predictions
44 | error_before = np.sum(errors**2)/np.sum(returns**2)
45 |
46 | reg_coeff = copy.deepcopy(self._reg_coeff)
47 | for _ in range(10):
48 | self._coeffs = np.linalg.lstsq(
49 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]),
50 | featmat.T.dot(returns)
51 | )[0]
52 | if not np.any(np.isnan(self._coeffs)):
53 | break
54 | reg_coeff *= 10
55 |
56 | if return_errors:
57 | predictions = featmat.dot(self._coeffs)
58 | errors = returns - predictions
59 | error_after = np.sum(errors**2)/np.sum(returns**2)
60 | return error_before, error_after
61 |
62 | def predict(self, path):
63 | if self._coeffs is None:
64 | return np.zeros(len(path["rewards"]))
65 | return self._features([path]).dot(self._coeffs)
66 |
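Note: fit() above solves the ridge-regularized normal equations (X^T X + lambda*I) w = X^T y, multiplying lambda by 10 whenever the solution comes back with NaNs. A standalone sketch on random data (toy shapes, no mjrl objects; rcond=None is added here only to silence numpy's lstsq warning and is not part of the repo code):

# Ridge-style normal-equations solve mirroring LinearBaseline.fit.
import numpy as np

rng = np.random.default_rng(0)
featmat = rng.normal(size=(100, 8))        # features for 100 timesteps
returns = rng.normal(size=100)             # Monte Carlo returns

reg_coeff = 1e-5
for _ in range(10):
    coeffs = np.linalg.lstsq(
        featmat.T @ featmat + reg_coeff * np.eye(featmat.shape[1]),
        featmat.T @ returns, rcond=None)[0]
    if not np.any(np.isnan(coeffs)):
        break
    reg_coeff *= 10                        # escalate regularization on failure
print(coeffs.shape)                        # (8,): one weight per feature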
--------------------------------------------------------------------------------
/mjrl/mjrl/baselines/mlp_baseline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 | import torch
4 | import torch.nn as nn
5 | from torch.autograd import Variable
6 | from mjrl.utils.optimize_model import fit_data
7 |
8 | import pickle
9 |
10 | class MLPBaseline:
11 | def __init__(self, env_spec, inp_dim=None, inp='obs', learn_rate=1e-3, reg_coef=0.0,
12 | batch_size=64, epochs=1, use_gpu=False, hidden_sizes=(128, 128)):
13 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim
14 | self.batch_size = batch_size
15 | self.epochs = epochs
16 | self.reg_coef = reg_coef
17 | self.use_gpu = use_gpu
18 | self.inp = inp
19 | self.hidden_sizes = hidden_sizes
20 |
21 | self.model = nn.Sequential()
22 | layer_sizes = (self.n + 4, ) + hidden_sizes + (1, )
23 | for i in range(len(layer_sizes) - 1):
24 | layer_id = 'fc_' + str(i)
25 | relu_id = 'relu_' + str(i)
26 | self.model.add_module(layer_id, nn.Linear(layer_sizes[i], layer_sizes[i+1]))
27 | if i != len(layer_sizes) - 2:
28 | self.model.add_module(relu_id, nn.ReLU())
29 |
30 | if self.use_gpu:
31 | self.model.cuda()
32 |
33 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learn_rate, weight_decay=reg_coef)
34 | self.loss_function = torch.nn.MSELoss()
35 |
36 | def _features(self, paths):
37 | if self.inp == 'env_features':
38 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths])
39 | else:
40 | o = np.concatenate([path["observations"] for path in paths])
41 | o = np.clip(o, -10, 10)/10.0
42 | if o.ndim > 2:
43 | o = o.reshape(o.shape[0], -1)
44 | N, n = o.shape
45 | num_feat = int( n + 4 ) # linear + time till pow 4
46 | feat_mat = np.ones((N, num_feat)) # memory allocation
47 |
48 | # linear features
49 | feat_mat[:,:n] = o
50 |
51 | k = 0 # start from this row
52 | for i in range(len(paths)):
53 | l = len(paths[i]["rewards"])
54 | al = np.arange(l)/1000.0
55 | for j in range(4):
56 | feat_mat[k:k+l, -4+j] = al**(j+1)
57 | k += l
58 | return feat_mat
59 |
60 |
61 | def fit(self, paths, return_errors=False, return_all_errors=False):
62 |
63 | featmat = self._features(paths)
64 | returns = np.concatenate([path["returns"] for path in paths]).reshape(-1, 1)
65 | featmat = featmat.astype('float32')
66 | returns = returns.astype('float32')
67 | num_samples = returns.shape[0]
68 |
69 | # Make variables with the above data
70 | if self.use_gpu:
71 | featmat_var = Variable(torch.from_numpy(featmat).cuda(), requires_grad=False)
72 | returns_var = Variable(torch.from_numpy(returns).cuda(), requires_grad=False)
73 | else:
74 | featmat_var = Variable(torch.from_numpy(featmat), requires_grad=False)
75 | returns_var = Variable(torch.from_numpy(returns), requires_grad=False)
76 |
77 | if return_errors:
78 | if self.use_gpu:
79 | predictions = self.model(featmat_var).cpu().data.numpy().ravel()
80 | else:
81 | predictions = self.model(featmat_var).data.numpy().ravel()
82 | errors = returns.ravel() - predictions
83 | error_before = np.sum(errors**2)/(np.sum(returns**2) + 1e-8)
84 |
85 | epoch_losses = fit_data(self.model, featmat_var, returns_var, self.optimizer,
86 | self.loss_function, self.batch_size, self.epochs)
87 |
88 | if return_errors:
89 | if self.use_gpu:
90 | predictions = self.model(featmat_var).cpu().data.numpy().ravel()
91 | else:
92 | predictions = self.model(featmat_var).data.numpy().ravel()
93 | errors = returns.ravel() - predictions
94 | error_after = np.sum(errors**2)/(np.sum(returns**2) + 1e-8)
95 | if return_all_errors:
96 | return error_before, error_after, epoch_losses
97 | return error_before, error_after
98 |
99 | def predict(self, path):
100 | featmat = self._features([path]).astype('float32')
101 | if self.use_gpu:
102 | feat_var = Variable(torch.from_numpy(featmat).float().cuda(), requires_grad=False)
103 | prediction = self.model(feat_var).cpu().data.numpy().ravel()
104 | else:
105 | feat_var = Variable(torch.from_numpy(featmat).float(), requires_grad=False)
106 | prediction = self.model(feat_var).data.numpy().ravel()
107 | return prediction
108 |
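Note: _features above (and LinearBaseline._features) lays out clipped, scaled observations followed by four powers of normalized time, t = step_index / 1000. A standalone sketch of that layout for one short toy path:

# Feature layout: [obs_scaled | t, t^2, t^3, t^4] for a 5-step path with obs_dim=3.
import numpy as np

obs = np.random.randn(5, 3)
o = np.clip(obs, -10, 10) / 10.0
N, n = o.shape
feat = np.ones((N, n + 4))
feat[:, :n] = o
al = np.arange(N) / 1000.0
for j in range(4):
    feat[:, -4 + j] = al ** (j + 1)        # time features fill the last 4 columns
print(feat.shape)                          # (5, 7)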
--------------------------------------------------------------------------------
/mjrl/mjrl/baselines/quadratic_baseline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 |
4 | class QuadraticBaseline:
5 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-3):
6 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim
7 | self.inp = inp
8 | self._reg_coeff = reg_coeff
9 | self._coeffs = None
10 |
11 | def _features(self, paths):
12 | if self.inp == 'env_features':
13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths])
14 | else:
15 | o = np.concatenate([path["observations"] for path in paths])
16 | o = np.clip(o, -10, 10)/10.0
17 | if o.ndim > 2:
18 | o = o.reshape(o.shape[0], -1)
19 | N, n = o.shape
20 | num_feat = int( n + n*(n+1)/2 + 1 + 4 ) # linear + full quad (symmetric matrix) + bias (1.0) + time till pow 4
21 | feat_mat = np.ones((N, num_feat)) # memory allocation
22 |
23 | # linear features
24 | feat_mat[:,:n] = o
25 |
26 | # quadratic features
27 | k = n # starting from this column in feat_mat
28 | for i in range(n):
29 | for j in range(i, n):
30 | feat_mat[:,k] = o[:,i]*o[:,j] # element-wise product
31 | k += 1
32 |
33 | k = 0 # start from this row
34 | for i in range(len(paths)):
35 | l = len(paths[i]["rewards"])
36 | al = np.arange(l)/1000.0
37 | for j in range(4):
38 | feat_mat[k:k+l, -4+j] = al**(j+1)
39 | k += l
40 |
41 | return feat_mat
42 |
43 |
44 | def fit(self, paths, return_errors=False):
45 |
46 | #featmat = np.concatenate([self._features(path) for path in paths])
47 | featmat = self._features(paths)
48 | returns = np.concatenate([path["returns"] for path in paths])
49 |
50 | if return_errors:
51 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape)
52 | errors = returns - predictions
53 | error_before = np.sum(errors**2)/np.sum(returns**2)
54 |
55 | reg_coeff = copy.deepcopy(self._reg_coeff)
56 | for _ in range(10):
57 | self._coeffs = np.linalg.lstsq(
58 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]),
59 | featmat.T.dot(returns)
60 | )[0]
61 | if not np.any(np.isnan(self._coeffs)):
62 | break
63 | reg_coeff *= 10
64 |
65 | if return_errors:
66 | predictions = featmat.dot(self._coeffs)
67 | errors = returns - predictions
68 | error_after = np.sum(errors**2)/np.sum(returns**2)
69 | return error_before, error_after
70 |
71 | def predict(self, path):
72 | if self._coeffs is None:
73 | return np.zeros(len(path["rewards"]))
74 | return self._features([path]).dot(self._coeffs)
75 |
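Note: the feature count n + n*(n+1)/2 + 1 + 4 above comes from n linear terms, the upper triangle of the quadratic terms, one bias, and four powers of normalized time. A quick check for a small observation dimension:

# Feature-count check for QuadraticBaseline with a toy obs_dim.
n = 3
num_feat = n + n * (n + 1) // 2 + 1 + 4    # linear + upper-tri quadratic + bias + t..t^4
assert num_feat == 3 + 6 + 1 + 4 == 14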
--------------------------------------------------------------------------------
/mjrl/mjrl/baselines/zero_baseline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 |
4 | class ZeroBaseline:
5 | def __init__(self, env_spec, **kwargs):
6 | n = env_spec.observation_dim # number of states
7 | self._coeffs = None
8 |
9 | def fit(self, paths, return_errors=False):
10 | if return_errors:
11 | return 1.0, 1.0
12 |
13 | def predict(self, path):
14 | return np.zeros(len(path["rewards"]))
15 |
--------------------------------------------------------------------------------
/mjrl/mjrl/envs/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 |
3 | # ----------------------------------------
4 | # mjrl environments
5 | # ----------------------------------------
6 |
7 | register(
8 | id='mjrl_point_mass-v0',
9 | entry_point='mjrl.envs:PointMassEnv',
10 | max_episode_steps=25,
11 | )
12 |
13 | register(
14 | id='mjrl_swimmer-v0',
15 | entry_point='mjrl.envs:SwimmerEnv',
16 | max_episode_steps=500,
17 | )
18 |
19 | register(
20 | id='mjrl_reacher_7dof-v0',
21 | entry_point='mjrl.envs:Reacher7DOFEnv',
22 | max_episode_steps=50,
23 | )
24 |
25 | register(
26 | id='mjrl_peg_insertion-v0',
27 | entry_point='mjrl.envs:PegEnv',
28 | max_episode_steps=50,
29 | )
30 |
31 | from mjrl.envs.mujoco_env import MujocoEnv
32 | # ^^^^^ so that user gets the correct error
33 | # message if mujoco is not installed correctly
34 | from mjrl.envs.point_mass import PointMassEnv
35 | from mjrl.envs.swimmer import SwimmerEnv
36 | from mjrl.envs.reacher_sawyer import Reacher7DOFEnv
37 | from mjrl.envs.peg_insertion_sawyer import PegEnv
38 |
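Note: once mjrl.envs is imported, the ids registered above resolve through gym.make. A minimal usage sketch, assuming mujoco_py is installed and the older 4-tuple gym step API used throughout this repo:

# Usage sketch: the register() calls run as an import side effect.
import gym
import mjrl.envs  # noqa: F401  (registers the mjrl_* ids)

env = gym.make('mjrl_point_mass-v0')
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())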
--------------------------------------------------------------------------------
/mjrl/mjrl/envs/assets/point_mass.xml:
--------------------------------------------------------------------------------
[MuJoCo XML markup for the point-mass model was stripped in this export and cannot be reconstructed here.]
--------------------------------------------------------------------------------
/mjrl/mjrl/envs/assets/sawyer.xml:
--------------------------------------------------------------------------------
[MuJoCo XML markup for the Sawyer arm model was stripped in this export and cannot be reconstructed here.]
--------------------------------------------------------------------------------
/mjrl/mjrl/envs/assets/swimmer.xml:
--------------------------------------------------------------------------------
[MuJoCo XML markup for the swimmer model was stripped in this export and cannot be reconstructed here.]
--------------------------------------------------------------------------------
/mjrl/mjrl/envs/mujoco_env.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from gym import error, spaces
4 | from gym.utils import seeding
5 | import numpy as np
6 | from os import path
7 | import gym
8 | import six
9 | import time as timer
10 |
11 | try:
12 | import mujoco_py
13 | from mujoco_py import load_model_from_path, MjSim, MjViewer
14 | except ImportError as e:
15 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e))
16 |
17 | def get_sim(model_path):
18 | if model_path.startswith("/"):
19 | fullpath = model_path
20 | else:
21 | fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path)
22 | if not path.exists(fullpath):
23 | raise IOError("File %s does not exist" % fullpath)
24 | model = load_model_from_path(fullpath)
25 | return MjSim(model)
26 |
27 | class MujocoEnv(gym.Env):
28 | """Superclass for all MuJoCo environments.
29 | """
30 |
31 | def __init__(self, model_path=None, frame_skip=1, sim=None):
32 |
33 | if sim is None:
34 | self.sim = get_sim(model_path)
35 | else:
36 | self.sim = sim
37 | self.data = self.sim.data
38 | self.model = self.sim.model
39 |
40 | self.frame_skip = frame_skip
41 | self.metadata = {
42 | 'render.modes': ['human', 'rgb_array'],
43 | 'video.frames_per_second': int(np.round(1.0 / self.dt))
44 | }
45 | self.mujoco_render_frames = False
46 |
47 | self.init_qpos = self.data.qpos.ravel().copy()
48 | self.init_qvel = self.data.qvel.ravel().copy()
49 | try:
50 | observation, _reward, done, _info = self.step(np.zeros(self.model.nu))
51 | except NotImplementedError:
52 | observation, _reward, done, _info = self._step(np.zeros(self.model.nu))
53 | assert not done
54 | self.obs_dim = np.sum([o.size for o in observation]) if type(observation) is tuple else observation.size
55 |
56 | bounds = self.model.actuator_ctrlrange.copy()
57 | low = bounds[:, 0]
58 | high = bounds[:, 1]
59 | self.action_space = spaces.Box(low, high, dtype=np.float32)
60 |
61 | high = np.inf*np.ones(self.obs_dim)
62 | low = -high
63 | self.observation_space = spaces.Box(low, high, dtype=np.float32)
64 |
65 | self.seed()
66 |
67 | def seed(self, seed=None):
68 | self.np_random, seed = seeding.np_random(seed)
69 | return [seed]
70 |
71 | # methods to override:
72 | # ----------------------------
73 |
74 | def reset_model(self):
75 | """
76 | Reset the robot degrees of freedom (qpos and qvel).
77 | Implement this in each subclass.
78 | """
79 | raise NotImplementedError
80 |
81 | def mj_viewer_setup(self):
82 | """
83 | Due to specifics of new mujoco rendering, the standard viewer cannot be used
84 | with this set-up. Instead we use this mujoco specific function.
85 | """
86 | pass
87 |
88 | def viewer_setup(self):
89 | """
90 | Does not work. Use mj_viewer_setup() instead
91 | """
92 | pass
93 |
94 | def evaluate_success(self, paths, logger=None):
95 | """
96 | Log various success metrics calculated based on input paths into the logger
97 | """
98 | pass
99 |
100 | # -----------------------------
101 |
102 | def reset(self):
103 | self.sim.reset()
104 | self.sim.forward()
105 | ob = self.reset_model()
106 | return ob
107 |
108 | def set_state(self, qpos, qvel):
109 | assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
110 | old_state = self.sim.get_state()
111 | new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel,
112 | old_state.act, old_state.udd_state)
113 | self.sim.set_state(new_state)
114 | self.sim.forward()
115 |
116 | @property
117 | def dt(self):
118 | return self.model.opt.timestep * self.frame_skip
119 |
120 | def do_simulation(self, ctrl, n_frames):
121 | for i in range(self.model.nu):
122 | self.sim.data.ctrl[i] = ctrl[i]
123 | for _ in range(n_frames):
124 | self.sim.step()
125 | if self.mujoco_render_frames is True:
126 | self.mj_render()
127 |
128 | def mj_render(self):
129 | try:
130 | self.viewer.render()
131 | except:
132 | self.mj_viewer_setup()
133 | self.viewer._run_speed = 0.5
134 | #self.viewer._run_speed /= self.frame_skip
135 | self.viewer.render()
136 |
137 | def render(self, *args, **kwargs):
138 | pass
139 | #return self.mj_render()
140 |
141 | def _get_viewer(self):
142 | pass
143 | #return None
144 |
145 | def state_vector(self):
146 | state = self.sim.get_state()
147 | return np.concatenate([
148 | state.qpos.flat, state.qvel.flat])
149 |
150 | # -----------------------------
151 |
152 | def visualize_policy(self, policy, horizon=1000, num_episodes=1, mode='exploration'):
153 | self.mujoco_render_frames = True
154 | for ep in range(num_episodes):
155 | o = self.reset()
156 | d = False
157 | t = 0
158 | score = 0.0
159 | while t < horizon and d is False:
160 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation']
161 | o, r, d, _ = self.step(a)
162 | t = t+1
163 | score = score + r
164 | print("Episode score = %f" % score)
165 | self.mujoco_render_frames = False
166 |
167 | def visualize_policy_offscreen(self, policy, horizon=1000,
168 | num_episodes=1,
169 | frame_size=(640,480),
170 | mode='exploration',
171 | save_loc='/tmp/',
172 | filename='newvid',
173 | camera_name=None):
174 | import skvideo.io
175 | for ep in range(num_episodes):
176 | print("Episode %d: rendering offline " % ep, end='', flush=True)
177 | o = self.reset()
178 | d = False
179 | t = 0
180 | arrs = []
181 | t0 = timer.time()
182 | while t < horizon and d is False:
183 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation']
184 | o, r, d, _ = self.step(a)
185 | t = t+1
186 | curr_frame = self.sim.render(width=frame_size[0], height=frame_size[1],
187 | mode='offscreen', camera_name=camera_name, device_id=0)
188 | arrs.append(curr_frame[::-1,:,:])
189 | print(t, end=', ', flush=True)
190 | file_name = save_loc + filename + str(ep) + ".mp4"
191 | skvideo.io.vwrite( file_name, np.asarray(arrs))
192 | print("saved", file_name)
193 | t1 = timer.time()
194 | print("time taken = %f"% (t1-t0))
195 |
--------------------------------------------------------------------------------
/mjrl/mjrl/envs/peg_insertion_sawyer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from mjrl.envs import mujoco_env
4 | from mujoco_py import MjViewer
5 |
6 |
7 | class PegEnv(mujoco_env.MujocoEnv, utils.EzPickle):
8 | def __init__(self):
9 | self.peg_sid = -2
10 | self.target_sid = -1
11 | mujoco_env.MujocoEnv.__init__(self, 'peg_insertion.xml', 4)
12 | utils.EzPickle.__init__(self)
13 | self.peg_sid = self.model.site_name2id("peg_bottom")
14 | self.target_sid = self.model.site_name2id("target")
15 | self.init_body_pos = self.model.body_pos.copy()
16 |
17 | def step(self, a):
18 | self.do_simulation(a, self.frame_skip)
19 | obs = self.get_obs()
20 | reward = self.get_reward(obs, a)
21 | return obs, reward, False, self.get_env_infos()
22 |
23 | def get_obs(self):
24 | return np.concatenate([
25 | self.data.qpos.flat,
26 | self.data.qvel.flat,
27 | self.data.site_xpos[self.peg_sid],
28 | self.data.site_xpos[self.target_sid],
29 | ])
30 |
31 | def get_reward(self, obs, act=None):
32 | obs = np.clip(obs, -10.0, 10.0)
33 | if len(obs.shape) == 1:
34 | # vector obs, called when stepping the env
35 | hand_pos = obs[-6:-3]
36 | target_pos = obs[-3:]
37 | l1_dist = np.sum(np.abs(hand_pos - target_pos))
38 | l2_dist = np.linalg.norm(hand_pos - target_pos)
39 | else:
40 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs
41 | hand_pos = obs[:, :, -6:-3]
42 | target_pos = obs[:, :, -3:]
43 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1)
44 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1)
45 | bonus = 5.0 * (l2_dist < 0.06)
46 | reward = - l1_dist - 5.0 * l2_dist + bonus
47 | return reward
48 |
49 | def compute_path_rewards(self, paths):
50 | # path has two keys: observations and actions
51 | # path["observations"] : (num_traj, horizon, obs_dim)
52 | # path["rewards"] should have shape (num_traj, horizon)
53 | obs = paths["observations"]
54 | rewards = self.get_reward(obs)
55 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
56 |
57 | # --------------------------------
58 | # resets and randomization
59 | # --------------------------------
60 |
61 | def robot_reset(self):
62 | self.set_state(self.init_qpos, self.init_qvel)
63 |
64 | def target_reset(self):
65 | # Randomize goal position
66 | goal_y = self.np_random.uniform(low=0.1, high=0.5)
67 | try:
68 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29)
69 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29)
70 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29)
71 | self.sim.forward()
72 | except:
73 | pass
74 |
75 | def reset_model(self, seed=None):
76 | if seed is not None:
77 | self.seeding = True
78 | self.seed(seed)
79 | self.robot_reset()
80 | self.target_reset()
81 | return self.get_obs()
82 |
83 | # --------------------------------
84 | # get and set states
85 | # --------------------------------
86 |
87 | def get_env_state(self):
88 | target_pos = self.model.body_pos[-1].copy()
89 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(),
90 | target_pos=target_pos)
91 |
92 | def set_env_state(self, state):
93 | self.sim.reset()
94 | qp = state['qp'].copy()
95 | qv = state['qv'].copy()
96 | target_pos = state['target_pos']
97 | self.model.body_pos[-1] = target_pos
98 | goal_y = target_pos[1]
99 | self.data.qpos[:] = qp
100 | self.data.qvel[:] = qv
101 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29)
102 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29)
103 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29)
104 | self.sim.forward()
105 |
106 | # --------------------------------
107 | # utility functions
108 | # --------------------------------
109 |
110 | def get_env_infos(self):
111 | return dict(state=self.get_env_state())
112 |
113 | def mj_viewer_setup(self):
114 | self.viewer = MjViewer(self.sim)
115 | self.viewer.cam.azimuth += 200
116 | self.sim.forward()
117 | self.viewer.cam.distance = self.model.stat.extent*2.0
118 |
--------------------------------------------------------------------------------
/mjrl/mjrl/envs/point_mass.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from mjrl.envs import mujoco_env
4 | from mujoco_py import MjViewer
5 |
6 |
7 | class PointMassEnv(mujoco_env.MujocoEnv, utils.EzPickle):
8 | def __init__(self):
9 | self.agent_bid = 0
10 | self.target_sid = 0
11 | utils.EzPickle.__init__(self)
12 | mujoco_env.MujocoEnv.__init__(self, 'point_mass.xml', 5)
13 | self.agent_bid = self.sim.model.body_name2id('agent')
14 | self.target_sid = self.sim.model.site_name2id('target')
15 |
16 | def step(self, a):
17 | self.do_simulation(a, self.frame_skip)
18 | obs = self.get_obs()
19 | reward = self.get_reward(obs)
20 | return obs, reward, False, dict(solved=(reward > -0.1), state=self.get_env_state())
21 |
22 | def get_obs(self):
23 | agent_pos = self.data.body_xpos[self.agent_bid].ravel()
24 | target_pos = self.data.site_xpos[self.target_sid].ravel()
25 | return np.concatenate([agent_pos[:2], self.data.qvel.ravel(), target_pos[:2]])
26 |
27 | def get_reward(self, obs, act=None):
28 | if len(obs.shape) == 1:
29 | # vector obs, called when stepping the env
30 | agent_pos = obs[:2]
31 | target_pos = obs[-2:]
32 | l1_dist = np.sum(np.abs(agent_pos - target_pos))
33 | l2_dist = np.linalg.norm(agent_pos - target_pos)
34 | else:
35 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs
36 | agent_pos = obs[:, :, :2]
37 | target_pos = obs[:, :, -2:]
38 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1)
39 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1)
40 | reward = -1.0 * l1_dist - 0.5 * l2_dist
41 | return reward
42 |
43 | def compute_path_rewards(self, paths):
44 | # path has two keys: observations and actions
45 | # path["observations"] : (num_traj, horizon, obs_dim)
46 | # path["rewards"] should have shape (num_traj, horizon)
47 | obs = paths["observations"]
48 | rewards = self.get_reward(obs)
49 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s')
50 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
51 | return paths
52 |
53 | def reset_model(self):
54 | # randomize the agent and goal
55 | agent_x = self.np_random.uniform(low=-1.0, high=1.0)
56 | agent_y = self.np_random.uniform(low=-1.0, high=1.0)
57 | goal_x = self.np_random.uniform(low=-1.0, high=1.0)
58 | goal_y = self.np_random.uniform(low=-1.0, high=1.0)
59 | qp = np.array([agent_x, agent_y])
60 | qv = self.init_qvel.copy()
61 | self.set_state(qp, qv)
62 | self.model.site_pos[self.target_sid][0] = goal_x
63 | self.model.site_pos[self.target_sid][1] = goal_y
64 | self.sim.forward()
65 | return self.get_obs()
66 |
67 | def evaluate_success(self, paths, logger=None):
68 | success = 0.0
69 | for p in paths:
70 | if np.mean(p['env_infos']['solved'][-4:]) > 0.0:
71 | success += 1.0
72 | success_rate = 100.0*success/len(paths)
73 | if logger is None:
74 | # nowhere to log so return the value
75 | return success_rate
76 | else:
77 | # log the success
78 | # can log multiple statistics here if needed
79 | logger.log_kv('success_rate', success_rate)
80 | return None
81 |
82 | # --------------------------------
83 | # get and set states
84 | # --------------------------------
85 |
86 | def get_env_state(self):
87 | target_pos = self.model.site_pos[self.target_sid].copy()
88 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(),
89 | target_pos=target_pos)
90 |
91 | def set_env_state(self, state):
92 | self.sim.reset()
93 | qp = state['qp'].copy()
94 | qv = state['qv'].copy()
95 | target_pos = state['target_pos']
96 | self.set_state(qp, qv)
97 | self.model.site_pos[self.target_sid] = target_pos
98 | self.sim.forward()
99 |
100 | # --------------------------------
101 | # utility functions
102 | # --------------------------------
103 |
104 | def get_env_infos(self):
105 | return dict(state=self.get_env_state())
106 |
107 | def mj_viewer_setup(self):
108 | self.viewer = MjViewer(self.sim)
109 | self.sim.forward()
110 |
--------------------------------------------------------------------------------
/mjrl/mjrl/envs/reacher_sawyer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from mjrl.envs import mujoco_env
4 | from mujoco_py import MjViewer
5 |
6 |
7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle):
8 | def __init__(self):
9 | self.hand_sid = -2
10 | self.target_sid = -1
11 | mujoco_env.MujocoEnv.__init__(self, 'sawyer.xml', 4)
12 | utils.EzPickle.__init__(self)
13 | self.hand_sid = self.model.site_name2id("finger")
14 | self.target_sid = self.model.site_name2id("target")
15 |
16 | def step(self, a):
17 | self.do_simulation(a, self.frame_skip)
18 | obs = self.get_obs()
19 | reward = self.get_reward(obs, a)
20 | return obs, reward, False, self.get_env_infos()
21 |
22 | def get_obs(self):
23 | return np.concatenate([
24 | self.data.qpos.flat,
25 | self.data.qvel.ravel() * self.dt, # delta_x instead of velocity
26 | self.data.site_xpos[self.hand_sid],
27 | self.data.site_xpos[self.target_sid],
28 | ])
29 |
30 | def get_reward(self, obs, act=None):
31 | obs = np.clip(obs, -10.0, 10.0)
32 | if len(obs.shape) == 1:
33 | # vector obs, called when stepping the env
34 | hand_pos = obs[-6:-3]
35 | target_pos = obs[-3:]
36 | l1_dist = np.sum(np.abs(hand_pos - target_pos))
37 | l2_dist = np.linalg.norm(hand_pos - target_pos)
38 | else:
39 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs
40 | hand_pos = obs[:, :, -6:-3]
41 | target_pos = obs[:, :, -3:]
42 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1)
43 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1)
44 | reward = - l1_dist - 5.0 * l2_dist
45 | return reward
46 |
47 | def compute_path_rewards(self, paths):
48 | # path has two keys: observations and actions
49 | # path["observations"] : (num_traj, horizon, obs_dim)
50 | # path["rewards"] should have shape (num_traj, horizon)
51 | obs = paths["observations"]
52 | rewards = self.get_reward(obs)
53 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
54 |
55 | # --------------------------------
56 | # resets and randomization
57 | # --------------------------------
58 |
59 | def robot_reset(self):
60 | self.set_state(self.init_qpos, self.init_qvel)
61 |
62 | def target_reset(self):
63 | target_pos = np.array([0.1, 0.1, 0.1])
64 | target_pos[0] = self.np_random.uniform(low=-0.3, high=0.3)
65 | target_pos[1] = self.np_random.uniform(low=-0.2, high=0.2)
66 | target_pos[2] = self.np_random.uniform(low=-0.25, high=0.25)
67 | self.model.site_pos[self.target_sid] = target_pos
68 | self.sim.forward()
69 |
70 | def reset_model(self, seed=None):
71 | if seed is not None:
72 | self.seeding = True
73 | self.seed(seed)
74 | self.robot_reset()
75 | self.target_reset()
76 | return self.get_obs()
77 |
78 | # --------------------------------
79 | # get and set states
80 | # --------------------------------
81 |
82 | def get_env_state(self):
83 | target_pos = self.model.site_pos[self.target_sid].copy()
84 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(),
85 | target_pos=target_pos)
86 |
87 | def set_env_state(self, state):
88 | self.sim.reset()
89 | qp = state['qp'].copy()
90 | qv = state['qv'].copy()
91 | target_pos = state['target_pos']
92 | self.model.site_pos[self.target_sid] = target_pos
93 | self.data.qpos[:] = qp
94 | self.data.qvel[:] = qv
95 | self.sim.forward()
96 |
97 | # --------------------------------
98 | # utility functions
99 | # --------------------------------
100 |
101 | def get_env_infos(self):
102 | return dict(state=self.get_env_state())
103 |
104 | def mj_viewer_setup(self):
105 | self.viewer = MjViewer(self.sim)
106 | self.viewer.cam.trackbodyid = 1
107 | self.viewer.cam.type = 1
108 | self.sim.forward()
109 | self.viewer.cam.distance = self.model.stat.extent * 2.0
110 |
--------------------------------------------------------------------------------
/mjrl/mjrl/envs/swimmer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from mjrl.envs import mujoco_env
4 | from mujoco_py import MjViewer
5 |
6 | class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
7 | def __init__(self):
8 | mujoco_env.MujocoEnv.__init__(self, 'swimmer.xml', 5)
9 | utils.EzPickle.__init__(self)
10 |
11 | def step(self, a):
12 | xposbefore = self.data.qpos[0]
13 | self.do_simulation(a, self.frame_skip)
14 | xposafter = self.data.qpos[0]
15 |
16 | delta = (xposafter - xposbefore)
17 | # make agent move in the negative x direction
18 | reward = -10.0 * delta
19 | done = False
20 |
21 | ob = self.get_obs()
22 | return ob, reward, done, self.get_env_infos()
23 |
24 | def get_obs(self):
25 | return np.concatenate([
26 | self.data.qpos.flat[2:],
27 | self.data.qvel.flat,
28 | ])
29 |
30 | def reset_model(self):
31 | qpos_init = self.init_qpos.copy()
32 | qpos_init[2] = self.np_random.uniform(low=-np.pi, high=np.pi)
33 | self.set_state(qpos_init, self.init_qvel)
34 | self.sim.forward()
35 | return self.get_obs()
36 |
37 | # --------------------------------
38 | # get and set states
39 | # --------------------------------
40 |
41 | def get_env_state(self):
42 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy())
43 |
44 | def set_env_state(self, state):
45 | self.sim.reset()
46 | qp = state['qp'].copy()
47 | qv = state['qv'].copy()
48 | self.set_state(qp, qv)
49 | self.sim.forward()
50 |
51 | # --------------------------------
52 | # utility functions
53 | # --------------------------------
54 |
55 | def get_env_infos(self):
56 | return dict(state=self.get_env_state())
57 |
58 | def mj_viewer_setup(self):
59 | self.viewer = MjViewer(self.sim)
60 | self.viewer.cam.trackbodyid = 1
61 | self.viewer.cam.type = 1
62 | self.sim.forward()
63 | self.viewer.cam.distance = self.model.stat.extent*1.2
--------------------------------------------------------------------------------
/mjrl/mjrl/policies/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/policies/__init__.py
--------------------------------------------------------------------------------
/mjrl/mjrl/policies/gaussian_linear.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.autograd import Variable
6 | from mjrl.utils.fc_network import FCNetwork
7 |
8 |
9 | class LinearPolicy:
10 | def __init__(self, env_spec,
11 | min_log_std=-3,
12 | init_log_std=0,
13 | seed=None):
14 | """
15 | :param env_spec: specifications of the env (see utils/gym_env.py)
16 | :param min_log_std: log_std is clamped at this value and can't go below
17 | :param init_log_std: initial log standard deviation
18 | :param seed: random seed
19 | """
20 | self.n = env_spec.observation_dim # number of states
21 | self.m = env_spec.action_dim # number of actions
22 | self.min_log_std = min_log_std
23 |
24 | # Set seed
25 | # ------------------------
26 | if seed is not None:
27 | torch.manual_seed(seed)
28 | np.random.seed(seed)
29 |
30 | # Policy network
31 | # ------------------------
32 | self.model = FCNetwork(self.n, self.m, hidden_sizes=())
33 | # make weights small
34 | for param in list(self.model.parameters())[-2:]: # only last layer
35 | param.data = 1e-2 * param.data
36 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True)
37 | self.trainable_params = list(self.model.parameters()) + [self.log_std]
38 |
39 | # Old Policy network
40 | # ------------------------
41 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes=())
42 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std)
43 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std]
44 | for idx, param in enumerate(self.old_params):
45 | param.data = self.trainable_params[idx].data.clone()
46 |
47 | # Easy access variables
48 | # -------------------------
49 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel())
50 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params]
51 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params]
52 | self.d = np.sum(self.param_sizes) # total number of params
53 |
54 | # Placeholders
55 | # ------------------------
56 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False)
57 |
58 | # Utility functions
59 | # ============================================
60 | def get_param_values(self):
61 | params = np.concatenate([p.contiguous().view(-1).data.numpy()
62 | for p in self.trainable_params])
63 | return params.copy()
64 |
65 | def set_param_values(self, new_params, set_new=True, set_old=True):
66 | if set_new:
67 | current_idx = 0
68 | for idx, param in enumerate(self.trainable_params):
69 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]]
70 | vals = vals.reshape(self.param_shapes[idx])
71 | param.data = torch.from_numpy(vals).float()
72 | current_idx += self.param_sizes[idx]
73 | # clip std at minimum value
74 | self.trainable_params[-1].data = \
75 | torch.clamp(self.trainable_params[-1], self.min_log_std).data
76 | # update log_std_val for sampling
77 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel())
78 | if set_old:
79 | current_idx = 0
80 | for idx, param in enumerate(self.old_params):
81 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]]
82 | vals = vals.reshape(self.param_shapes[idx])
83 | param.data = torch.from_numpy(vals).float()
84 | current_idx += self.param_sizes[idx]
85 | # clip std at minimum value
86 | self.old_params[-1].data = \
87 | torch.clamp(self.old_params[-1], self.min_log_std).data
88 |
89 | # Main functions
90 | # ============================================
91 | def get_action(self, observation):
92 | o = np.float32(observation.reshape(1, -1))
93 | self.obs_var.data = torch.from_numpy(o)
94 | mean = self.model(self.obs_var).data.numpy().ravel()
95 | noise = np.exp(self.log_std_val) * np.random.randn(self.m)
96 | action = mean + noise
97 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}]
98 |
99 | def mean_LL(self, observations, actions, model=None, log_std=None):
100 | model = self.model if model is None else model
101 | log_std = self.log_std if log_std is None else log_std
102 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False)
103 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False)
104 | mean = model(obs_var)
105 | zs = (act_var - mean) / torch.exp(log_std)
106 | LL = - 0.5 * torch.sum(zs ** 2, dim=1) + \
107 | - torch.sum(log_std) + \
108 | - 0.5 * self.m * np.log(2 * np.pi)
109 | return mean, LL
110 |
111 | def log_likelihood(self, observations, actions, model=None, log_std=None):
112 | mean, LL = self.mean_LL(observations, actions, model, log_std)
113 | return LL.data.numpy()
114 |
115 | def old_dist_info(self, observations, actions):
116 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std)
117 | return [LL, mean, self.old_log_std]
118 |
119 | def new_dist_info(self, observations, actions):
120 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std)
121 | return [LL, mean, self.log_std]
122 |
123 | def likelihood_ratio(self, new_dist_info, old_dist_info):
124 | LL_old = old_dist_info[0]
125 | LL_new = new_dist_info[0]
126 | LR = torch.exp(LL_new - LL_old)
127 | return LR
128 |
129 | def mean_kl(self, new_dist_info, old_dist_info):
130 | old_log_std = old_dist_info[2]
131 | new_log_std = new_dist_info[2]
132 | old_std = torch.exp(old_log_std)
133 | new_std = torch.exp(new_log_std)
134 | old_mean = old_dist_info[1]
135 | new_mean = new_dist_info[1]
136 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2
137 | Dr = 2 * new_std ** 2 + 1e-8
138 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1)
139 | return torch.mean(sample_kl)
140 |
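Note: mean_kl above is the closed-form KL between diagonal Gaussians, KL(old || new) = sum_i [ ((mu_old - mu_new)^2 + sigma_old^2 - sigma_new^2) / (2 sigma_new^2) + log sigma_new - log sigma_old ]_i, averaged over samples. A small numerical sketch on toy tensors, independent of the policy classes:

# Diagonal-Gaussian KL, matching the mean_kl arithmetic above.
import torch

old_mean, old_log_std = torch.tensor([[0.0, 0.5]]), torch.tensor([0.0, 0.0])
new_mean, new_log_std = torch.tensor([[0.1, 0.4]]), torch.tensor([-0.1, 0.0])

old_std, new_std = old_log_std.exp(), new_log_std.exp()
Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2
Dr = 2 * new_std ** 2 + 1e-8
kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1).mean()
print(float(kl))   # non-negative, small for nearby distributions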
--------------------------------------------------------------------------------
/mjrl/mjrl/policies/gaussian_mlp.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mjrl.utils.fc_network import FCNetwork
3 | import torch
4 | from torch.autograd import Variable
5 |
6 |
7 | class MLP:
8 | def __init__(self, env_spec,
9 | hidden_sizes=(64,64),
10 | min_log_std=-3,
11 | init_log_std=0,
12 | eps=0.0,
13 | seed=None):
14 | """
15 | :param env_spec: specifications of the env (see utils/gym_env.py)
16 | :param hidden_sizes: network hidden layer sizes (currently 2 layers only)
17 | :param min_log_std: log_std is clamped at this value and can't go below
18 | :param init_log_std: initial log standard deviation
19 | :param seed: random seed
20 | """
21 | self.n = env_spec.observation_dim # number of states
22 | self.m = env_spec.action_dim # number of actions
23 | self.min_log_std = min_log_std
24 | self.eps = eps
25 |
26 | # Set seed
27 | # ------------------------
28 | if seed is not None:
29 | torch.manual_seed(seed)
30 | np.random.seed(seed)
31 |
32 | # Policy network
33 | # ------------------------
34 | self.model = FCNetwork(self.n, self.m, hidden_sizes)
35 | # make weights small
36 | for param in list(self.model.parameters())[-2:]: # only last layer
37 | param.data = 1e-2 * param.data
38 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True)
39 | self.trainable_params = list(self.model.parameters()) + [self.log_std]
40 |
41 | # Old Policy network
42 | # ------------------------
43 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes)
44 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std)
45 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std]
46 | for idx, param in enumerate(self.old_params):
47 | param.data = self.trainable_params[idx].data.clone()
48 |
49 | # Easy access variables
50 | # -------------------------
51 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel())
52 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params]
53 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params]
54 | self.d = np.sum(self.param_sizes) # total number of params
55 |
56 | # Placeholders
57 | # ------------------------
58 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False)
59 |
60 | # Utility functions
61 | # ============================================
62 | def get_param_values(self):
63 | params = np.concatenate([p.contiguous().view(-1).data.numpy()
64 | for p in self.trainable_params])
65 | return params.copy()
66 |
67 | def set_param_values(self, new_params, set_new=True, set_old=True):
68 | if set_new:
69 | current_idx = 0
70 | for idx, param in enumerate(self.trainable_params):
71 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]]
72 | vals = vals.reshape(self.param_shapes[idx])
73 | param.data = torch.from_numpy(vals).float()
74 | current_idx += self.param_sizes[idx]
75 | # clip std at minimum value
76 | self.trainable_params[-1].data = \
77 | torch.clamp(self.trainable_params[-1], self.min_log_std).data
78 | # update log_std_val for sampling
79 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel())
80 | if set_old:
81 | current_idx = 0
82 | for idx, param in enumerate(self.old_params):
83 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]]
84 | vals = vals.reshape(self.param_shapes[idx])
85 | param.data = torch.from_numpy(vals).float()
86 | current_idx += self.param_sizes[idx]
87 | # clip std at minimum value
88 | self.old_params[-1].data = \
89 | torch.clamp(self.old_params[-1], self.min_log_std).data
90 |
91 | # Main functions
92 | # ============================================
93 | def get_action(self, observation):
94 | o = np.float32(observation.reshape(1, -1))
95 | self.obs_var.data = torch.from_numpy(o)
96 | mean = self.model(self.obs_var).data.numpy().ravel()
97 | if np.random.uniform() < self.eps:
98 | action = np.random.uniform(0, 1, self.m)
99 | else:
100 | noise = np.exp(self.log_std_val) * np.random.randn(self.m)
101 | action = mean + noise
102 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}]
103 |
104 | def mean_LL(self, observations, actions, model=None, log_std=None):
105 | model = self.model if model is None else model
106 | log_std = self.log_std if log_std is None else log_std
107 | if type(observations) is not torch.Tensor:
108 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False)
109 | else:
110 | obs_var = observations
111 | if type(actions) is not torch.Tensor:
112 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False)
113 | else:
114 | act_var = actions
115 | mean = model(obs_var)
116 | zs = (act_var - mean) / torch.exp(log_std)
117 | LL = - 0.5 * torch.sum(zs ** 2, dim=1) + \
118 | - torch.sum(log_std) + \
119 | - 0.5 * self.m * np.log(2 * np.pi)
120 | return mean, LL
121 |
122 | def log_likelihood(self, observations, actions, model=None, log_std=None):
123 | mean, LL = self.mean_LL(observations, actions, model, log_std)
124 | return LL.data.numpy()
125 |
126 | def old_dist_info(self, observations, actions):
127 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std)
128 | return [LL, mean, self.old_log_std]
129 |
130 | def new_dist_info(self, observations, actions):
131 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std)
132 | return [LL, mean, self.log_std]
133 |
134 | def likelihood_ratio(self, new_dist_info, old_dist_info):
135 | LL_old = old_dist_info[0]
136 | LL_new = new_dist_info[0]
137 | LR = torch.exp(LL_new - LL_old)
138 | return LR
139 |
140 | def mean_kl(self, new_dist_info, old_dist_info):
141 | old_log_std = old_dist_info[2]
142 | new_log_std = new_dist_info[2]
143 | old_std = torch.exp(old_log_std)
144 | new_std = torch.exp(new_log_std)
145 | old_mean = old_dist_info[1]
146 | new_mean = new_dist_info[1]
147 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2
148 | Dr = 2 * new_std ** 2 + 1e-8
149 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1)
150 | return torch.mean(sample_kl)
151 |
--------------------------------------------------------------------------------
/mjrl/mjrl/policies/mpc_actor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from trajopt.utils import gather_paths_parallel
3 |
4 |
5 | class MPCActor(object):
6 | def __init__(self, env, H, paths_per_cpu,
7 | num_cpu=1,
8 | kappa=1.0,
9 | gamma=1.0,
10 | mean=None,
11 | filter_coefs=None,
12 | seed=123,
13 | ):
14 |
15 | self.env, self.seed = env, seed
16 | self.n, self.m = env.observation_dim, env.action_dim
17 | self.H, self.paths_per_cpu, self.num_cpu = H, paths_per_cpu, num_cpu
18 |
19 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma
20 | if mean is None:
21 | self.mean = np.zeros(self.m)
22 | if filter_coefs is None:
23 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0]
24 |
25 | self.env.reset()
26 | self.env.set_seed(seed)
27 | self.env.reset(seed=seed)
28 | self.act_sequence = np.ones((self.H, self.m)) * self.mean
29 | self.ctr = 1
30 |
31 | def score_trajectory(self, paths):
32 | scores = np.zeros(len(paths))
33 | for i in range(len(paths)):
34 | scores[i] = 0.0
35 | for t in range(paths[i]["rewards"].shape[0]):
36 | scores[i] += (self.gamma**t)*paths[i]["rewards"][t]
37 | return scores
38 |
39 | def get_action(self, env_state):
40 | # Set to env_state
41 | # Shoot trajectories
42 | # Return optimal action
43 | seed = self.seed + self.ctr * 1000
44 | paths = gather_paths_parallel(self.env.env_id,
45 | env_state,
46 | self.act_sequence,
47 | self.filter_coefs,
48 | seed,
49 | self.paths_per_cpu,
50 | self.num_cpu,
51 | )
52 |
53 | num_traj = len(paths)
54 | R = self.score_trajectory(paths)
55 | S = np.exp(self.kappa*(R-np.max(R)))
56 | act = np.sum([paths[i]["actions"][0] * S[i] for i in range(num_traj)], axis=0)
57 | act = act / (np.sum(S) + 1e-6)
58 | return act
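Note: get_action above averages the first action of each sampled trajectory with exponentiated, max-shifted returns (an MPPI-style softmax weighting controlled by kappa). A self-contained sketch of just that weighting step on synthetic scores, with no trajopt dependency:

# Exponential reweighting of first actions (toy returns and actions).
import numpy as np

kappa = 1.0
R = np.array([4.0, 2.5, 5.0])                  # per-trajectory discounted returns
first_actions = np.array([[0.2, -0.1],
                          [0.5,  0.0],
                          [0.1,  0.3]])        # first action of each rollout

S = np.exp(kappa * (R - np.max(R)))            # max-shifted for numerical stability
act = (first_actions * S[:, None]).sum(axis=0) / (S.sum() + 1e-6)
print(act)                                     # weighted-average action to execute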
--------------------------------------------------------------------------------
/mjrl/mjrl/samplers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/samplers/__init__.py
--------------------------------------------------------------------------------
/mjrl/mjrl/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/utils/__init__.py
--------------------------------------------------------------------------------
/mjrl/mjrl/utils/cg_solve.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def cg_solve(f_Ax, b, x_0=None, cg_iters=10, residual_tol=1e-10):
4 | x = np.zeros_like(b) #if x_0 is None else x_0
5 | r = b.copy() #if x_0 is None else b-f_Ax(x_0)
6 | p = r.copy()
7 | rdotr = r.dot(r)
8 |
9 | for i in range(cg_iters):
10 | z = f_Ax(p)
11 | v = rdotr / p.dot(z)
12 | x += v * p
13 | r -= v * z
14 | newrdotr = r.dot(r)
15 | mu = newrdotr / rdotr
16 | p = r + mu * p
17 |
18 | rdotr = newrdotr
19 | if rdotr < residual_tol:
20 | break
21 |
22 | return x
23 |
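Note: cg_solve is a textbook conjugate-gradient loop for symmetric positive-definite systems, taking a matrix-vector product callback rather than an explicit matrix. A quick sanity check against a direct solve (assumes mjrl is importable; the 2x2 system is arbitrary):

# CG vs. direct solve on a small SPD system.
import numpy as np
from mjrl.utils.cg_solve import cg_solve

A = np.array([[4.0, 1.0],
              [1.0, 3.0]])                 # symmetric positive definite
b = np.array([1.0, 2.0])

x_cg = cg_solve(lambda v: A @ v, b, cg_iters=10)
x_direct = np.linalg.solve(A, b)
assert np.allclose(x_cg, x_direct, atol=1e-8)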
--------------------------------------------------------------------------------
/mjrl/mjrl/utils/fc_network.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class FCNetwork(nn.Module):
7 | def __init__(self, obs_dim, act_dim,
8 | hidden_sizes=(64,64),
9 | nonlinearity='tanh', # either 'tanh' or 'relu'
10 | in_shift = None,
11 | in_scale = None,
12 | out_shift = None,
13 | out_scale = None):
14 | super(FCNetwork, self).__init__()
15 |
16 | self.obs_dim = obs_dim
17 | self.act_dim = act_dim
18 | assert type(hidden_sizes) == tuple
19 | self.layer_sizes = (obs_dim, ) + hidden_sizes + (act_dim, )
20 | self.set_transformations(in_shift, in_scale, out_shift, out_scale)
21 |
22 | # hidden layers
23 | self.fc_layers = nn.ModuleList([nn.Linear(self.layer_sizes[i], self.layer_sizes[i+1]) \
24 | for i in range(len(self.layer_sizes) -1)])
25 | self.nonlinearity = torch.relu if nonlinearity == 'relu' else torch.tanh
26 |
27 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None):
28 | # store native scales that can be used for resets
29 | self.transformations = dict(in_shift=in_shift,
30 | in_scale=in_scale,
31 | out_shift=out_shift,
32 | out_scale=out_scale
33 | )
34 | self.in_shift = torch.from_numpy(np.float32(in_shift)) if in_shift is not None else torch.zeros(self.obs_dim)
35 | self.in_scale = torch.from_numpy(np.float32(in_scale)) if in_scale is not None else torch.ones(self.obs_dim)
36 | self.out_shift = torch.from_numpy(np.float32(out_shift)) if out_shift is not None else torch.zeros(self.act_dim)
37 | self.out_scale = torch.from_numpy(np.float32(out_scale)) if out_scale is not None else torch.ones(self.act_dim)
38 |
39 | def forward(self, x):
40 | # TODO(Aravind): Remove clamping to CPU
41 | # This is a temp change that should be fixed shortly
42 | if x.is_cuda:
43 | out = x.to('cpu')
44 | else:
45 | out = x
46 | out = (out - self.in_shift)/(self.in_scale + 1e-8)
47 | for i in range(len(self.fc_layers)-1):
48 | out = self.fc_layers[i](out)
49 | out = self.nonlinearity(out)
50 | out = self.fc_layers[-1](out)
51 | out = out * self.out_scale + self.out_shift
52 | return out
53 |
--------------------------------------------------------------------------------
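
For reference, a minimal forward pass through `FCNetwork` (the dimensions below are arbitrary, not tied to any particular environment):

```
import torch
from mjrl.utils.fc_network import FCNetwork

# Map 11-dim observations to 3-dim actions through two tanh hidden layers.
net = FCNetwork(obs_dim=11, act_dim=3, hidden_sizes=(64, 64), nonlinearity='tanh')

obs = torch.randn(5, 11)   # batch of 5 observations
act = net(obs)             # shape: (5, 3)
print(act.shape)
```
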
/mjrl/mjrl/utils/get_environment.py:
--------------------------------------------------------------------------------
1 | """
2 | convenience function to generate env
3 | useful if we want some procedural env generation
4 | """
5 |
6 | import gym
7 | from mjrl.utils.gym_env import GymEnv
8 |
9 | def get_environment(env_name=None, **kwargs):
10 |     if env_name is None: raise ValueError("Need to specify environment name")
11 | e = GymEnv(env_name)
12 | # can make procedural modifications here if needed using kwargs
13 | return e
14 |
--------------------------------------------------------------------------------
/mjrl/mjrl/utils/logger.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import scipy
6 | import pickle
7 | import os
8 | import csv
9 |
10 | class DataLog:
11 |
12 | def __init__(self):
13 | self.log = {}
14 | self.max_len = 0
15 |
16 | def log_kv(self, key, value):
17 | # logs the (key, value) pair
18 |
19 | # TODO: This implementation is error-prone:
20 | # it would be NOT aligned if some keys are missing during one iteration.
21 | if key not in self.log:
22 | self.log[key] = []
23 | self.log[key].append(value)
24 | if len(self.log[key]) > self.max_len:
25 |             self.max_len = len(self.log[key])
26 |
27 | def save_log(self, save_path):
28 | # TODO: Validate all lengths are the same.
29 | pickle.dump(self.log, open(save_path + '/log.pickle', 'wb'))
30 | with open(save_path + '/log.csv', 'w') as csv_file:
31 | fieldnames = list(self.log.keys())
32 | if 'iteration' not in fieldnames:
33 | fieldnames = ['iteration'] + fieldnames
34 |
35 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
36 | writer.writeheader()
37 | for row in range(self.max_len):
38 | row_dict = {'iteration': row}
39 | for key in self.log.keys():
40 | if row < len(self.log[key]):
41 | row_dict[key] = self.log[key][row]
42 | writer.writerow(row_dict)
43 |
44 | def get_current_log(self):
45 | row_dict = {}
46 | for key in self.log.keys():
47 | # TODO: this is very error-prone (alignment is not guaranteed)
48 | row_dict[key] = self.log[key][-1]
49 | return row_dict
50 |
51 | def shrink_to(self, num_entries):
52 | for key in self.log.keys():
53 | self.log[key] = self.log[key][:num_entries]
54 |
55 | self.max_len = num_entries
56 | assert min([len(series) for series in self.log.values()]) == \
57 | max([len(series) for series in self.log.values()])
58 |
59 | def read_log(self, log_path):
60 | assert log_path.endswith('log.csv')
61 |
62 | with open(log_path) as csv_file:
63 | reader = csv.DictReader(csv_file)
64 | listr = list(reader)
65 | keys = reader.fieldnames
66 | data = {}
67 | for key in keys:
68 | data[key] = []
69 | for row, row_dict in enumerate(listr):
70 | for key in keys:
71 | try:
72 | data[key].append(eval(row_dict[key]))
73 | except:
74 | print("ERROR on reading key {}: {}".format(key, row_dict[key]))
75 |
76 | if 'iteration' in data and data['iteration'][-1] != row:
77 | raise RuntimeError("Iteration %d mismatch -- possibly corrupted logfile?" % row)
78 |
79 | self.log = data
80 | self.max_len = max(len(v) for k, v in self.log.items())
81 | print("Log read from {}: had {} entries".format(log_path, self.max_len))
82 |
--------------------------------------------------------------------------------
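
A short sketch of the intended `DataLog` usage: log one scalar per key per iteration so the CSV columns stay aligned (the keys and values here are illustrative):

```
from mjrl.utils.logger import DataLog

logger = DataLog()
for it in range(3):
    logger.log_kv('stoc_pol_mean', 100.0 + it)
    logger.log_kv('eval_score', 90.0 + it)

print(logger.get_current_log())   # latest value for every key
logger.save_log('.')              # writes ./log.pickle and ./log.csv
```
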
/mjrl/mjrl/utils/make_train_plots.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import scipy
6 | import csv
7 | from mjrl.utils.logger import DataLog
8 | import argparse
9 |
10 | def make_train_plots(log = None,
11 | log_path = None,
12 | keys = None,
13 | save_loc = None,
14 | sample_key = 'num_samples',
15 | x_scale = 1.0,
16 | y_scale = 1.0):
17 | if log is None and log_path is None:
18 |         raise ValueError("Need to provide either the log or the path to a log file")
19 | if log is None:
20 | logger = DataLog()
21 | logger.read_log(log_path)
22 | log = logger.log
23 | # make plots for specified keys
24 | for key in keys:
25 | if key in log.keys():
26 | fig = plt.figure(figsize=(10,6))
27 | ax1 = fig.add_subplot(111)
28 | try:
29 | cum_samples = [np.sum(log[sample_key][:i]) * x_scale for i in range(len(log[sample_key]))]
30 | ax1.plot(cum_samples, [elem * y_scale for elem in log[key]])
31 | ax1.set_xlabel('samples')
32 | # mark iteration on the top axis
33 | ax2 = ax1.twiny()
34 | ax2.set_xlabel('iterations', color=(.7,.7,.7))
35 | ax2.tick_params(axis='x', labelcolor=(.7,.7,.7))
36 | ax2.set_xlim([0, len(log[key])])
37 | except:
38 | ax1.plot(log[key])
39 | ax1.set_xlabel('iterations')
40 | ax1.set_title(key)
41 | plt.savefig(save_loc+'/'+key+'.png', dpi=100)
42 | plt.close()
43 |
44 | # MAIN =========================================================
45 | # Example: python make_train_plots.py --log_path logs/log.csv --keys eval_score rollout_score --save_loc logs
46 | def main():
47 | # Parse arguments
48 | parser = argparse.ArgumentParser()
49 | parser.add_argument(
50 | '-l', '--log_path', type=str, required=True, help='path file to log.csv')
51 | parser.add_argument(
52 | '-k', '--keys', type=str, action='append', nargs='+', required=True, help='keys to plot')
53 | parser.add_argument(
54 | '-s', '--save_loc', type=str, default='', help='Path for logs')
55 | args = parser.parse_args()
56 |
57 | make_train_plots(log_path=args.log_path, keys=args.keys[0], save_loc=args.save_loc)
58 |
59 | if __name__ == '__main__':
60 | main()
61 |
62 |
--------------------------------------------------------------------------------
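
Besides the CLI entry point, `make_train_plots` can be called directly on an in-memory `DataLog`, which is how `train_agent` below uses it (the keys and values here are placeholders):

```
from mjrl.utils.logger import DataLog
from mjrl.utils.make_train_plots import make_train_plots

logger = DataLog()
for it in range(5):
    logger.log_kv('num_samples', 1000)        # enables the samples x-axis
    logger.log_kv('stoc_pol_mean', float(it))

make_train_plots(log=logger.log, keys=['stoc_pol_mean'], save_loc='.')
# writes ./stoc_pol_mean.png
```
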
/mjrl/mjrl/utils/optimize_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 | import torch
4 | import torch.nn as nn
5 |
6 |
7 | def fit_data(model, x, y, optimizer, loss_func, batch_size, epochs):
8 | """
9 | :param model: pytorch model of form y_hat = f(x) (class)
10 | :param x: inputs to the model (tensor)
11 | :param y: desired outputs or targets (tensor)
12 | :param optimizer: optimizer to be used (class)
13 | :param loss_func: loss criterion (callable)
14 | :param batch_size: mini-batch size for optimization (int)
15 | :param epochs: number of epochs (int)
16 | :return:
17 | """
18 |
19 | num_samples = x.shape[0]
20 | epoch_losses = []
21 | for ep in range(epochs):
22 | rand_idx = torch.LongTensor(np.random.permutation(num_samples))
23 | ep_loss = 0.0
24 | num_steps = int(num_samples / batch_size) - 1
25 | for mb in range(num_steps):
26 | data_idx = rand_idx[mb*batch_size:(mb+1)*batch_size]
27 | batch_x = x[data_idx]
28 | batch_y = y[data_idx]
29 | optimizer.zero_grad()
30 | yhat = model(batch_x)
31 | loss = loss_func(yhat, batch_y)
32 | loss.backward()
33 | optimizer.step()
34 | ep_loss += loss.detach()
35 | epoch_losses.append(ep_loss.to('cpu').data.numpy().ravel() / num_steps)
36 | return epoch_losses
37 |
--------------------------------------------------------------------------------
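
A small, self-contained check of `fit_data` on a synthetic regression problem (the network, optimizer settings, and data are arbitrary choices for the sketch):

```
import torch
import torch.nn as nn
from mjrl.utils.optimize_model import fit_data

# Synthetic 1-D regression data: y = 3x + noise.
x = torch.linspace(-1.0, 1.0, 256).reshape(-1, 1)
y = 3.0 * x + 0.1 * torch.randn_like(x)

model = nn.Sequential(nn.Linear(1, 32), nn.Tanh(), nn.Linear(32, 1))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

losses = fit_data(model, x, y, optimizer, nn.MSELoss(),
                  batch_size=32, epochs=5)
print([l.item() for l in losses])   # per-epoch average mini-batch loss
```
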
/mjrl/mjrl/utils/process_samples.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def compute_returns(paths, gamma):
4 | for path in paths:
5 | path["returns"] = discount_sum(path["rewards"], gamma)
6 |
7 | def compute_advantages(paths, baseline, gamma, gae_lambda=None, normalize=False):
8 | # compute and store returns, advantages, and baseline
9 | # standard mode
10 |     if gae_lambda is None or gae_lambda < 0.0 or gae_lambda > 1.0:
11 | for path in paths:
12 | path["baseline"] = baseline.predict(path)
13 | path["advantages"] = path["returns"] - path["baseline"]
14 | if normalize:
15 | alladv = np.concatenate([path["advantages"] for path in paths])
16 | mean_adv = alladv.mean()
17 | std_adv = alladv.std()
18 | for path in paths:
19 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8)
20 | # GAE mode
21 | else:
22 | for path in paths:
23 | b = path["baseline"] = baseline.predict(path)
24 | if b.ndim == 1:
25 | b1 = np.append(path["baseline"], 0.0 if path["terminated"] else b[-1])
26 | else:
27 | b1 = np.vstack((b, np.zeros(b.shape[1]) if path["terminated"] else b[-1]))
28 | td_deltas = path["rewards"] + gamma*b1[1:] - b1[:-1]
29 | path["advantages"] = discount_sum(td_deltas, gamma*gae_lambda)
30 | if normalize:
31 | alladv = np.concatenate([path["advantages"] for path in paths])
32 | mean_adv = alladv.mean()
33 | std_adv = alladv.std()
34 | for path in paths:
35 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8)
36 |
37 | def discount_sum(x, gamma, terminal=0.0):
38 | y = []
39 | run_sum = terminal
40 | for t in range( len(x)-1, -1, -1):
41 | run_sum = x[t] + gamma*run_sum
42 | y.append(run_sum)
43 |
44 | return np.array(y[::-1])
--------------------------------------------------------------------------------
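
`discount_sum` computes the discounted return G_t = r_t + gamma * G_{t+1} for every timestep in a single backward pass; `compute_returns` just stores it on each path. A tiny worked example with hand-picked rewards:

```
import numpy as np
from mjrl.utils.process_samples import discount_sum

rewards = np.array([1.0, 1.0, 1.0])
gamma = 0.9

# Backward recursion: G_2 = 1.0, G_1 = 1 + 0.9*1 = 1.9, G_0 = 1 + 0.9*1.9 = 2.71
print(discount_sum(rewards, gamma))   # [2.71, 1.9, 1.0]
```
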
/mjrl/mjrl/utils/tensor_utils.py:
--------------------------------------------------------------------------------
1 | import operator
2 |
3 | import numpy as np
4 |
5 |
6 | def flatten_tensors(tensors):
7 | if len(tensors) > 0:
8 | return np.concatenate([np.reshape(x, [-1]) for x in tensors])
9 | else:
10 | return np.asarray([])
11 |
12 |
13 | def unflatten_tensors(flattened, tensor_shapes):
14 | tensor_sizes = list(map(np.prod, tensor_shapes))
15 | indices = np.cumsum(tensor_sizes)[:-1]
16 | return [np.reshape(pair[0], pair[1]) for pair in zip(np.split(flattened, indices), tensor_shapes)]
17 |
18 |
19 | def pad_tensor(x, max_len, mode='zero'):
20 | padding = np.zeros_like(x[0])
21 | if mode == 'last':
22 | padding = x[-1]
23 | return np.concatenate([
24 | x,
25 | np.tile(padding, (max_len - len(x),) + (1,) * np.ndim(x[0]))
26 | ])
27 |
28 |
29 | def pad_tensor_n(xs, max_len):
30 | ret = np.zeros((len(xs), max_len) + xs[0].shape[1:], dtype=xs[0].dtype)
31 | for idx, x in enumerate(xs):
32 | ret[idx][:len(x)] = x
33 | return ret
34 |
35 |
36 | def pad_tensor_dict(tensor_dict, max_len, mode='zero'):
37 | keys = list(tensor_dict.keys())
38 | ret = dict()
39 | for k in keys:
40 | if isinstance(tensor_dict[k], dict):
41 | ret[k] = pad_tensor_dict(tensor_dict[k], max_len, mode=mode)
42 | else:
43 | ret[k] = pad_tensor(tensor_dict[k], max_len, mode=mode)
44 | return ret
45 |
46 |
47 | def flatten_first_axis_tensor_dict(tensor_dict):
48 | keys = list(tensor_dict.keys())
49 | ret = dict()
50 | for k in keys:
51 | if isinstance(tensor_dict[k], dict):
52 | ret[k] = flatten_first_axis_tensor_dict(tensor_dict[k])
53 | else:
54 | old_shape = tensor_dict[k].shape
55 | ret[k] = tensor_dict[k].reshape((-1,) + old_shape[2:])
56 | return ret
57 |
58 |
59 | def high_res_normalize(probs):
60 | return [x / sum(map(float, probs)) for x in list(map(float, probs))]
61 |
62 |
63 | def stack_tensor_list(tensor_list):
64 | return np.array(tensor_list)
65 | # tensor_shape = np.array(tensor_list[0]).shape
66 | # if tensor_shape is tuple():
67 | # return np.array(tensor_list)
68 | # return np.vstack(tensor_list)
69 |
70 |
71 | def stack_tensor_dict_list(tensor_dict_list):
72 | """
73 | Stack a list of dictionaries of {tensors or dictionary of tensors}.
74 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}.
75 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors}
76 | """
77 | keys = list(tensor_dict_list[0].keys())
78 | ret = dict()
79 | for k in keys:
80 | example = tensor_dict_list[0][k]
81 | if isinstance(example, dict):
82 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list])
83 | else:
84 | v = stack_tensor_list([x[k] for x in tensor_dict_list])
85 | ret[k] = v
86 | return ret
87 |
88 |
89 | def concat_tensor_list_subsample(tensor_list, f):
90 | return np.concatenate(
91 | [t[np.random.choice(len(t), int(np.ceil(len(t) * f)), replace=False)] for t in tensor_list], axis=0)
92 |
93 |
94 | def concat_tensor_dict_list_subsample(tensor_dict_list, f):
95 | keys = list(tensor_dict_list[0].keys())
96 | ret = dict()
97 | for k in keys:
98 | example = tensor_dict_list[0][k]
99 | if isinstance(example, dict):
100 | v = concat_tensor_dict_list_subsample([x[k] for x in tensor_dict_list], f)
101 | else:
102 | v = concat_tensor_list_subsample([x[k] for x in tensor_dict_list], f)
103 | ret[k] = v
104 | return ret
105 |
106 |
107 | def concat_tensor_list(tensor_list):
108 | return np.concatenate(tensor_list, axis=0)
109 |
110 |
111 | def concat_tensor_dict_list(tensor_dict_list):
112 | keys = list(tensor_dict_list[0].keys())
113 | ret = dict()
114 | for k in keys:
115 | example = tensor_dict_list[0][k]
116 | if isinstance(example, dict):
117 | v = concat_tensor_dict_list([x[k] for x in tensor_dict_list])
118 | else:
119 | v = concat_tensor_list([x[k] for x in tensor_dict_list])
120 | ret[k] = v
121 | return ret
122 |
123 |
124 | def split_tensor_dict_list(tensor_dict):
125 | keys = list(tensor_dict.keys())
126 | ret = None
127 | for k in keys:
128 | vals = tensor_dict[k]
129 | if isinstance(vals, dict):
130 | vals = split_tensor_dict_list(vals)
131 | if ret is None:
132 | ret = [{k: v} for v in vals]
133 | else:
134 | for v, cur_dict in zip(vals, ret):
135 | cur_dict[k] = v
136 | return ret
137 |
138 |
139 | def truncate_tensor_list(tensor_list, truncated_len):
140 | return tensor_list[:truncated_len]
141 |
142 |
143 | def truncate_tensor_dict(tensor_dict, truncated_len):
144 | ret = dict()
145 | for k, v in tensor_dict.items():
146 | if isinstance(v, dict):
147 | ret[k] = truncate_tensor_dict(v, truncated_len)
148 | else:
149 | ret[k] = truncate_tensor_list(v, truncated_len)
150 | return ret
151 |
--------------------------------------------------------------------------------
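
For reference, how the two main helpers combine per-path dictionaries (the path contents below are invented):

```
import numpy as np
from mjrl.utils.tensor_utils import stack_tensor_dict_list, concat_tensor_dict_list

paths = [
    {'observations': np.zeros((5, 3)), 'rewards': np.zeros(5)},
    {'observations': np.ones((5, 3)),  'rewards': np.ones(5)},
]

stacked = stack_tensor_dict_list(paths)   # adds a leading per-path axis
print(stacked['observations'].shape)      # (2, 5, 3)

flat = concat_tensor_dict_list(paths)     # concatenates along the time axis
print(flat['observations'].shape)         # (10, 3)
```
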
/mjrl/mjrl/utils/train_agent.py:
--------------------------------------------------------------------------------
1 | import logging
2 | #logging.disable(logging.CRITICAL)
3 |
4 | from tabulate import tabulate
5 | from mjrl.utils.make_train_plots import make_train_plots
6 | from mjrl.utils.gym_env import GymEnv
7 | from mjrl.samplers.core import sample_paths
8 | import numpy as np
9 | import pickle
10 | import time as timer
11 | import os
12 | import copy
13 |
14 |
15 | def _load_latest_policy_and_logs(agent, *, policy_dir, logs_dir):
16 | """Loads the latest policy.
17 | Returns the next step number to begin with.
18 | """
19 | assert os.path.isdir(policy_dir), str(policy_dir)
20 | assert os.path.isdir(logs_dir), str(logs_dir)
21 |
22 | log_csv_path = os.path.join(logs_dir, 'log.csv')
23 | if not os.path.exists(log_csv_path):
24 | return 0 # fresh start
25 |
26 | print("Reading: {}".format(log_csv_path))
27 | agent.logger.read_log(log_csv_path)
28 | last_step = agent.logger.max_len - 1
29 | if last_step <= 0:
30 | return 0 # fresh start
31 |
32 |
33 | # find latest policy/baseline
34 | i = last_step
35 | while i >= 0:
36 | policy_path = os.path.join(policy_dir, 'policy_{}.pickle'.format(i))
37 | baseline_path = os.path.join(policy_dir, 'baseline_{}.pickle'.format(i))
38 |
39 | if not os.path.isfile(policy_path):
40 | i = i -1
41 | continue
42 | else:
43 | print("Loaded last saved iteration: {}".format(i))
44 |
45 | with open(policy_path, 'rb') as fp:
46 | agent.policy = pickle.load(fp)
47 | with open(baseline_path, 'rb') as fp:
48 | agent.baseline = pickle.load(fp)
49 |
50 | # additional
51 | # global_status_path = os.path.join(policy_dir, 'global_status.pickle')
52 | # with open(global_status_path, 'rb') as fp:
53 | # agent.load_global_status( pickle.load(fp) )
54 |
55 | agent.logger.shrink_to(i + 1)
56 | assert agent.logger.max_len == i + 1
57 | return agent.logger.max_len
58 |
59 | # cannot find any saved policy
60 | raise RuntimeError("Log file exists, but cannot find any saved policy.")
61 |
62 | def train_agent(job_name, agent,
63 | seed = 0,
64 | niter = 101,
65 | gamma = 0.995,
66 | gae_lambda = None,
67 | num_cpu = 1,
68 | sample_mode = 'trajectories',
69 | num_traj = 50,
70 | num_samples = 50000, # has precedence, used with sample_mode = 'samples'
71 | save_freq = 10,
72 | evaluation_rollouts = None,
73 | plot_keys = ['stoc_pol_mean'],
74 | reward_kwargs = None,
75 | adroit=False
76 | ):
77 |
78 | np.random.seed(seed)
79 |     if not os.path.isdir(job_name):
80 |         os.mkdir(job_name)
81 |     previous_dir = os.getcwd()
82 |     os.chdir(job_name)  # important! we are now in the directory to save data
83 |     if not os.path.isdir('iterations'): os.mkdir('iterations')
84 |     if not os.path.isdir('logs') and agent.save_logs: os.mkdir('logs')
85 | best_policy = copy.deepcopy(agent.policy)
86 | best_perf = -1e8
87 | train_curve = best_perf*np.ones(niter)
88 | mean_pol_perf = 0.0
89 | e = GymEnv(agent.env.env.spec.id)
90 |
91 | # Load from any existing checkpoint, policy, statistics, etc.
92 | # Why no checkpointing.. :(
93 | #i_start = _load_latest_policy_and_logs(agent,
94 | # policy_dir='iterations',
95 | # logs_dir='logs')
96 | #if i_start:
97 | # print("Resuming from an existing job folder ...")
98 | i_start = 0
99 | for i in range(i_start, niter):
100 | print("......................................................................................")
101 | print("ITERATION : %i " % i)
102 |
103 | if train_curve[i-1] > best_perf:
104 | best_policy = copy.deepcopy(agent.policy)
105 | best_perf = train_curve[i-1]
106 |
107 | N = num_traj if sample_mode == 'trajectories' else num_samples
108 |
109 | args = dict(N=N, sample_mode=sample_mode, gamma=gamma, gae_lambda=gae_lambda, num_cpu=num_cpu, reward_kwargs=reward_kwargs)
110 |         # NOTE: samples are collected and fed to the agent here
111 | stats = agent.train_step(**args)
112 | train_curve[i] = stats[0]
113 |
114 | if evaluation_rollouts is not None and evaluation_rollouts > 0:
115 | print("Performing evaluation rollouts ........")
116 | eval_paths = sample_paths(num_traj=evaluation_rollouts, policy=agent.policy, num_cpu=num_cpu,
117 | env=e.env_id, eval_mode=True, base_seed=seed)
118 | mean_pol_perf = np.mean([np.sum(path['rewards']) for path in eval_paths])
119 | if agent.save_logs:
120 | agent.logger.log_kv('eval_score', mean_pol_perf)
121 |
122 | if save_freq != 0 and i > 0 and i % save_freq == 0:
123 | if agent.save_logs:
124 | agent.logger.save_log('logs/')
125 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
126 | policy_file = 'policy_%i.pickle' % i
127 | baseline_file = 'baseline_%i.pickle' % i
128 | pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb'))
129 | pickle.dump(agent.baseline, open('iterations/' + baseline_file, 'wb'))
130 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
131 | # pickle.dump(agent.global_status, open('iterations/global_status.pickle', 'wb'))
132 |
133 | # print results to console
134 | if i == 0:
135 | result_file = open('results.txt', 'w')
136 | print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
137 | result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
138 | result_file.close()
139 | if not adroit:
140 | print("[ %s ] %4i %5.2f %5.2f %5.2f " % (timer.asctime(timer.localtime(timer.time())),
141 | i, train_curve[i], mean_pol_perf, best_perf))
142 | else:
143 | print("[ %s ] %4i %5.2f %5.2f %5.2f %5.2f" % (timer.asctime(timer.localtime(timer.time())),
144 | i, train_curve[i], mean_pol_perf, best_perf, stats[-1]))
145 |
146 | result_file = open('results.txt', 'a')
147 | result_file.write("%4i %5.2f %5.2f %5.2f \n" % (i, train_curve[i], mean_pol_perf, best_perf))
148 | result_file.close()
149 | if agent.save_logs:
150 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1,
151 | agent.logger.get_current_log().items()))
152 | print(tabulate(print_data))
153 |
154 | # final save
155 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
156 | if agent.save_logs:
157 | agent.logger.save_log('logs/')
158 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
159 | os.chdir(previous_dir)
160 |
--------------------------------------------------------------------------------
/mjrl/mjrl/utils/visualize_policy.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import mjrl.envs
3 | import click
4 | import os
5 | import gym
6 | import numpy as np
7 | import pickle
8 | from mjrl.utils.gym_env import GymEnv
9 | from mjrl.policies.gaussian_mlp import MLP
10 | import trajopt.envs
11 |
12 | DESC = '''
13 | Helper script to visualize policy (in mjrl format).\n
14 | USAGE:\n
15 | Visualizes policy on the env\n
16 | $ python utils/visualize_policy --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n
17 | '''
18 |
19 | # MAIN =========================================================
20 | @click.command(help=DESC)
21 | @click.option('--env_name', type=str, help='environment to load', required= True)
22 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None)
23 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation')
24 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123)
25 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10)
26 |
27 | def main(env_name, policy, mode, seed, episodes):
28 | e = GymEnv(env_name)
29 | e.set_seed(seed)
30 | if policy is not None:
31 | pi = pickle.load(open(policy, 'rb'))
32 | else:
33 | pi = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=-1.0)
34 | # render policy
35 | e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode)
36 |
37 | if __name__ == '__main__':
38 | main()
39 |
40 |
--------------------------------------------------------------------------------
/mjrl/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from setuptools import setup, find_packages
4 |
5 | print("Installing mjrl. \n Package intended for use with provided conda env. See setup instructions here: https://github.com/aravindr93/mjrl/tree/master/setup")
6 |
7 | if sys.version_info.major != 3:
8 | print("This Python is only compatible with Python 3, but you are running "
9 | "Python {}. The installation will likely fail.".format(sys.version_info.major))
10 |
11 | def read(fname):
12 | return open(os.path.join(os.path.dirname(__file__), fname)).read()
13 |
14 | setup(
15 | name='mjrl',
16 | version='1.0.0',
17 | packages=find_packages(),
18 | description='RL algorithms for environments in MuJoCo',
19 | long_description=read('README.md'),
20 | url='https://github.com/aravindr93/mjrl.git',
21 | author='Aravind Rajeswaran',
22 | )
23 |
--------------------------------------------------------------------------------
/mjrl/setup/README.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | A short guide to installing this package is below. The package relies on `mujoco-py`, which is often the trickiest part of the installation. See `Known Issues` below and the instructions on the mujoco-py [page](https://github.com/openai/mujoco-py) if you get stuck with the mujoco-py installation.
4 |
5 | The package can handle both `MuJoCo v1.5` and `MuJoCo v2.0`, but the former will not be supported in future updates. We encourage you to use v2.0.
6 |
7 | ## Linux
8 |
9 | - Download MuJoCo v2.0 binaries from the official [website](http://www.mujoco.org/) and also obtain the license key.
10 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200`, and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`. Note that unzipping the MuJoCo binaries will generate `mujoco200_linux`; rename this directory to `mujoco200` and place it at `~/.mujoco/mujoco200`.
11 | - Install osmesa related dependencies:
12 | ```
13 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev build-essential libglfw3
14 | ```
15 | - Update `bashrc` by adding the following lines and source it
16 | ```
17 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH"
18 | export MUJOCO_PY_FORCE_CPU=True
19 | alias MJPL='LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so:/usr/lib/nvidia-384/libGL.so'
20 | ```
21 | - Install this package using
22 | ```
23 | $ conda update conda
24 | $ cd path/to/mjrl
25 | $ conda env create -f setup/env.yml
26 | $ source activate mjrl-env
27 | $ pip install -e .
28 | ```
29 | - *NOTE 1:* If there are issues with install of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly based on the specific version of CUDA (or CPU-only) you have.
30 |
31 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info.
32 |
33 | ## Mac OS
34 |
35 | - Download MuJoCo binaries from the official [website](http://www.mujoco.org/) and also obtain the license key.
36 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200` (rename unzipped directory to this), and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`.
37 | - Update `bashrc` by adding the following lines and source it
38 | ```
39 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH"
40 | ```
41 | - Install this package using
42 | ```
43 | $ conda update conda
44 | $ cd path/to/mjrl
45 | $ conda env create -f setup/env.yml
46 | $ source activate mjrl-env
47 | $ pip install -e .
48 | ```
49 |
50 | - *NOTE 1:* If there are issues with install of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly.
51 |
52 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info.
53 |
54 |
55 | ## Known Issues
56 |
57 | - Visualization in Linux: If the Linux system has a GPU, then mujoco-py does not automatically preload the correct drivers. We added an alias `MJPL` in bashrc (see instructions above), which stands for MuJoCo pre-load. When running any python script that requires rendering, prefix the command with `MJPL`.
58 | ```
59 | $ MJPL python script.py
60 | ```
61 |
62 | - Errors related to osmesa during installation: this is a `mujoco-py` build error that usually goes away if the following command is run before creating the conda environment. If the problem persists, please contact the developers of mujoco-py.
63 | ```
64 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev
65 | ```
66 |
67 | - If conda environment creation gets interrupted for some reason, you can resume it with the following:
68 | ```
69 | $ conda env update -n mjrl-env -f setup/env.yml
70 | ```
71 |
72 | - GCC error in Mac OS: If you get a GCC error from mujoco-py, you can install the version mujoco-py expects with `brew install gcc --without-multilib`. This may require uninstalling previously installed versions of GCC, for example with `brew remove gcc@6`. You can see which brew packages are already installed with `brew list`.
73 |
74 |
--------------------------------------------------------------------------------
/mjrl/setup/env.yml:
--------------------------------------------------------------------------------
1 | name: mjrl-env
2 | channels:
3 | - pytorch
4 | - defaults
5 | dependencies:
6 | - python=3.7
7 | - pip
8 | - ipython
9 | - mkl-service
10 | - pytorch==1.4
11 | - tabulate
12 | - termcolor
13 | - torchvision
14 | - patchelf
15 | - pip:
16 | - click
17 | - cloudpickle
18 | - gym==0.13
19 | - ipdb
20 | - matplotlib
21 | - mujoco-py<2.1,>=2.0
22 | - pip
23 | - pyyaml
24 | - tqdm
25 | - wheel
26 | - scipy
27 | - transforms3d
28 |
--------------------------------------------------------------------------------
/mjrl/tests/point_mass_test.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
4 | from mjrl.baselines.mlp_baseline import MLPBaseline
5 | from mjrl.algos.npg_cg import NPG
6 | from mjrl.utils.train_agent import train_agent
7 | import mjrl.envs
8 | import time as timer
9 | SEED = 500
10 |
11 | e = GymEnv('mjrl_point_mass-v0')
12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
13 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=10, learn_rate=1e-3)
14 | agent = NPG(e, policy, baseline, normalized_step_size=0.05, seed=SEED, save_logs=True)
15 |
16 | ts = timer.time()
17 | train_agent(job_name='point_mass_exp1',
18 | agent=agent,
19 | seed=SEED,
20 | niter=50,
21 | gamma=0.95,
22 | gae_lambda=0.97,
23 | num_cpu=1,
24 | sample_mode='trajectories',
25 | num_traj=40, # samples = 40*25 = 1000
26 | save_freq=5,
27 | evaluation_rollouts=None,
28 | plot_keys=['stoc_pol_mean', 'running_score'])
29 | print("time taken = %f" % (timer.time()-ts))
30 |
--------------------------------------------------------------------------------
/mjrl/tests/visualizer_test.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
4 | from mjrl.baselines.mlp_baseline import MLPBaseline
5 | from mjrl.algos.npg_cg import NPG
6 | from mjrl.utils.train_agent import train_agent
7 | import mjrl.envs
8 | import time as timer
9 | SEED = 500
10 |
11 | e = GymEnv('mjrl_point_mass-v0')
12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
13 | baseline = QuadraticBaseline(e.spec)
14 | agent = NPG(e, policy, baseline, normalized_step_size=0.5, seed=SEED, save_logs=True)
15 |
16 | ts = timer.time()
17 | train_agent(job_name='vis_exp',
18 | agent=agent,
19 | seed=SEED,
20 | niter=10,
21 | gamma=0.95,
22 | gae_lambda=0.97,
23 | num_cpu=1,
24 | sample_mode='trajectories',
25 | num_traj=100,
26 | save_freq=5,
27 | evaluation_rollouts=None)
28 | print("time taken = %f" % (timer.time()-ts))
29 | e.visualize_policy(policy, num_episodes=5, horizon=e.horizon, mode='exploration')
30 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib==3.4.1
2 | mujoco-py==2.0.2.13
3 | numpy==1.20.2
4 | tabulate==0.8.9
5 | tensorboard==2.5.0
6 | tensorboard-data-server==0.6.1
7 | tensorboard-plugin-wit==1.8.0
8 | torch==1.8.1
9 | torchaudio==0.8.0a0+e4e171a
10 | torchvision==0.9.1
11 | tqdm==4.60.0
12 |
--------------------------------------------------------------------------------