├── .gitignore ├── README.md ├── example_run.sh ├── humanoid_fig.png ├── milo ├── milo │ ├── __init__.py │ ├── cost │ │ ├── __init__.py │ │ └── linear_cost.py │ ├── dataset │ │ ├── __init__.py │ │ └── datasets.py │ ├── dynamics_model │ │ ├── __init__.py │ │ └── mlp_dynamics.py │ ├── gym_env │ │ ├── __init__.py │ │ ├── ant.py │ │ ├── half_cheetah.py │ │ ├── hopper.py │ │ ├── humanoid.py │ │ ├── multiprocessing_env.py │ │ ├── walker2d.py │ │ └── wrappers.py │ ├── sampler │ │ ├── __init__.py │ │ └── sampler.py │ └── utils │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── evaluate.py │ │ ├── logger.py │ │ └── util.py └── setup.py ├── mjrl ├── .gitignore ├── LICENSE ├── README.md ├── examples │ ├── README.md │ ├── behavior_clone.py │ ├── example_configs │ │ ├── hopper_npg.txt │ │ ├── swimmer_npg.txt │ │ └── swimmer_ppo.txt │ ├── linear_nn_comparison.py │ └── policy_opt_job_script.py ├── mjrl │ ├── __init__.py │ ├── algos │ │ ├── __init__.py │ │ ├── batch_reinforce.py │ │ ├── behavior_cloning.py │ │ ├── dapg.py │ │ ├── mbac.py │ │ ├── model_accel │ │ │ ├── __init__.py │ │ │ ├── model_accel_npg.py │ │ │ ├── model_learning_mpc.py │ │ │ ├── nn_dynamics.py │ │ │ ├── run_experiments │ │ │ │ ├── configs │ │ │ │ │ ├── point_mass.txt │ │ │ │ │ └── reacher.txt │ │ │ │ ├── run_model_accel_npg.py │ │ │ │ ├── sandbox │ │ │ │ │ ├── example_config_mpc.txt │ │ │ │ │ └── run_model_learning_mpc.py │ │ │ │ └── utils │ │ │ │ │ ├── reward_functions │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── mjrl_point_mass.py │ │ │ │ │ ├── visualize_policy.py │ │ │ │ │ └── visualize_trajectories.py │ │ │ └── sampling.py │ │ ├── npg_cg.py │ │ ├── ppo_clip.py │ │ └── trpo.py │ ├── baselines │ │ ├── __init__.py │ │ ├── linear_baseline.py │ │ ├── mlp_baseline.py │ │ ├── quadratic_baseline.py │ │ └── zero_baseline.py │ ├── envs │ │ ├── __init__.py │ │ ├── assets │ │ │ ├── peg_insertion.xml │ │ │ ├── point_mass.xml │ │ │ ├── sawyer.xml │ │ │ └── swimmer.xml │ │ ├── mujoco_env.py │ │ ├── peg_insertion_sawyer.py │ │ ├── point_mass.py │ │ ├── reacher_sawyer.py │ │ └── swimmer.py │ ├── policies │ │ ├── __init__.py │ │ ├── gaussian_linear.py │ │ ├── gaussian_mlp.py │ │ └── mpc_actor.py │ ├── samplers │ │ ├── __init__.py │ │ └── core.py │ └── utils │ │ ├── __init__.py │ │ ├── cg_solve.py │ │ ├── fc_network.py │ │ ├── get_environment.py │ │ ├── gym_env.py │ │ ├── logger.py │ │ ├── make_train_plots.py │ │ ├── optimize_model.py │ │ ├── process_samples.py │ │ ├── tensor_utils.py │ │ ├── train_agent.py │ │ └── visualize_policy.py ├── setup.py ├── setup │ ├── README.md │ └── env.yml └── tests │ ├── point_mass_test.py │ └── visualizer_test.py ├── requirements.txt ├── run.py └── run_hand.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Source Code for Model-based Imitation Learning from Offline data (MILO) 2 | Implementation of MILO, a model-based, offline imitation learning algorithm. 3 | 4 | ![figure](https://github.com/jdchang1/milo/blob/main/humanoid_fig.png) 5 | 6 | Link to pdf: https://arxiv.org/abs/2106.03207 7 | 8 | ## Notes on Installation 9 | After cloning this repository and installing the requirements, please run 10 | 11 | `cd milo && pip install -e .` 12 | 13 | `cd mjrl && pip install -e .` 14 | 15 | The experiments are run using MuJoCo physics, which requires a license to install. Please follow the instructions on [MuJoCo Website](http://www.mujoco.org) 16 | 17 | ## Overview 18 | The `milo` package contains our imitation learning, model-based environment stack, and boilerplate code. We modified the `mjrl` package to interface with our cost functions when doing model-based policy gradient. This modification can be seen in `mjrl/mjrl/algos/batch_reinforce.py`. Note that we currently only support NPG/TRPO as our policy gradient algorithm; however, in principle one could replace this with other algorithms/repositories. 19 | 20 | ## Environments Supported 21 | This repository supports 5 modified MuJoCo environments that can be found in `milo/milo/gym_env`. They are 22 | 1. Hopper-v4 23 | 2. Walker2d-v4 24 | 3. HalfCheetah-v4 25 | 4. Ant-v4 26 | 5. 
Humanoid-v4 27 | 28 | If you would like to add an environment, register the environment in `/milo/milo/gym_env/__init__.py` according to [OpenAI Gym](http://gym.openai.com/docs/#environments) instructions. 29 | 30 | ## Downloading the Datasets 31 | Please download the datasets from this [google drive link](https://drive.google.com/drive/folders/1gG2WIgL1mdznhuel5uKRb6lepF7EVeFr?usp=sharing). Each environment will have 2 datasets: `[ENV]_expert.pt` and `[ENV]_offline.pt`. 32 | 33 | In the `data` directory, place the expert and offline datasets in the `data/expert_data` and `data/offline_data` direcotires respectively. 34 | 35 | ## Running an Experiment 36 | We provide an example run script for Hopper, `example_run.sh`, that can be modified to be used with any other registered environment. To view all the possible arguments you can run please see the argparse in `milo/milo/utils/arguments.py`. 37 | 38 | ## Bibliography 39 | To cite this work, please use the following citation. Note that this repository builds upon MJRL so please also cite any references noted in the README [here](https://github.com/aravindr93/mjrl). 40 | ``` 41 | @misc{chang2021mitigating, 42 | title={Mitigating Covariate Shift in Imitation Learning via Offline Data Without Great Coverage}, 43 | author={Jonathan D. Chang and Masatoshi Uehara and Dhruv Sreenivas and Rahul Kidambi and Wen Sun}, 44 | year={2021}, 45 | eprint={2106.03207}, 46 | archivePrefix={arXiv}, 47 | primaryClass={cs.LG} 48 | } 49 | ``` 50 | -------------------------------------------------------------------------------- /example_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run.py --env Hopper-v4 \ 4 | --seed 100 \ 5 | --expert_db Hopper-v6_expert.pt \ 6 | --offline_db Hopper-v6_offline.pt \ 7 | --n_models 4 \ 8 | --lambda_b 0.0025 \ 9 | --samples_per_step 40000 \ 10 | --pg_iter 1 \ 11 | --bw_quantile 0.1 \ 12 | --id 1 \ 13 | --subsample_expert \ 14 | --n_iter 300 \ 15 | --cg_iter 25 \ 16 | --bc_epochs 1 \ 17 | --do_bc_reg \ 18 | --bc_reg_coeff 0.1 19 | -------------------------------------------------------------------------------- /humanoid_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/humanoid_fig.png -------------------------------------------------------------------------------- /milo/milo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/milo/milo/__init__.py -------------------------------------------------------------------------------- /milo/milo/cost/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.cost.linear_cost import RBFLinearCost 2 | -------------------------------------------------------------------------------- /milo/milo/cost/linear_cost.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import numpy as np 5 | 6 | class RBFLinearCost: 7 | """ 8 | MMD cost implementation with rff feature representations 9 | 10 | NOTE: Currently hardcoded to cpu 11 | 12 | :param expert_data: (torch Tensor) expert data used for feature matching 13 | :param feature_dim: (int) feature dimension for rff 14 | :param input_type: (str) state (s), state-action (sa), state-next state (ss), 
15 | state-action-next state (sas) 16 | :param cost_range: (list) inclusive range of costs 17 | :param bw_quantile: (float) quantile used to fit bandwidth for rff kernel 18 | :param bw_samples: (int) number of samples used to fit bandwidth 19 | :param lambda_b: (float) weight parameter for bonus and cost 20 | :param lr: (float) learning rate for discriminator/cost update. 0.0 = closed form update 21 | :param seed: (int) random seed to set cost function 22 | """ 23 | def __init__(self, 24 | expert_data, 25 | feature_dim=1024, 26 | input_type='sa', 27 | cost_range=[-1.,0.], 28 | bw_quantile=0.1, 29 | bw_samples=100000, 30 | lambda_b=1.0, 31 | lr=0.0, 32 | seed=100): 33 | 34 | # Set Random Seed 35 | torch.manual_seed(seed) 36 | np.random.seed(seed) 37 | 38 | self.expert_data = expert_data 39 | input_dim = expert_data.size(1) 40 | self.input_type = input_type 41 | self.feature_dim = feature_dim 42 | self.cost_range = cost_range 43 | if cost_range is not None: 44 | self.c_min, self.c_max = cost_range 45 | self.lambda_b = lambda_b 46 | self.lr = lr 47 | 48 | # Fit Bandwidth 49 | self.quantile = bw_quantile 50 | self.bw_samples = bw_samples 51 | self.bw = self.fit_bandwidth(expert_data) 52 | 53 | # Define Phi and Cost weights 54 | self.rff = nn.Linear(input_dim, feature_dim) 55 | self.rff.bias.data = (torch.rand_like(self.rff.bias.data)-0.5)*2.0*np.pi 56 | self.rff.weight.data = torch.rand_like(self.rff.weight.data)/(self.bw+1e-8) 57 | 58 | # W Update Init 59 | self.w = None 60 | 61 | # Compute Expert Phi Mean 62 | self.expert_rep = self.get_rep(expert_data) 63 | self.phi_e = self.expert_rep.mean(dim=0) 64 | 65 | def get_rep(self, x): 66 | """ 67 | Returns an RFF representation given an input 68 | """ 69 | with torch.no_grad(): 70 | out = self.rff(x.cpu()) 71 | out = torch.cos(out)*np.sqrt(2/self.feature_dim) 72 | return out 73 | 74 | def fit_bandwidth(self, data): 75 | """ 76 | Uses the median trick to fit the bandwidth for the RFF kernel 77 | """ 78 | num_data = data.shape[0] 79 | idxs_0 = torch.randint(low=0, high=num_data, size=(self.bw_samples,)) 80 | idxs_1 = torch.randint(low=0, high=num_data, size=(self.bw_samples,)) 81 | norm = torch.norm(data[idxs_0, :]-data[idxs_1, :], dim=1) 82 | bw = torch.quantile(norm, q=self.quantile).item() 83 | return bw 84 | 85 | def fit_cost(self, data_pi): 86 | """ 87 | Updates the weights of the cost with the closed form solution 88 | """ 89 | phi = self.get_rep(data_pi).mean(0) 90 | feat_diff = phi - self.phi_e 91 | 92 | # Closed form solution 93 | self.w = feat_diff 94 | 95 | return torch.dot(self.w, feat_diff).item() 96 | 97 | def get_costs(self, x): 98 | """ 99 | Returrns the IPM (MMD) cost for a given input 100 | """ 101 | data = self.get_rep(x) 102 | if self.cost_range is not None: 103 | return torch.clamp(torch.mm(data, self.w.unsqueeze(1)), self.c_min, self.c_max) 104 | return torch.mm(data, self.w.unsqueeze(1)) 105 | 106 | def get_expert_cost(self): 107 | """ 108 | Returns the mean expert cost given our current discriminator weights and representations 109 | """ 110 | return (1-self.lambda_b)*torch.clamp(torch.mm(self.expert_rep, self.w.unsqueeze(1)), self.c_min, self.c_max).mean() 111 | 112 | def get_bonus_costs(self, states, actions, ensemble, next_states=None): 113 | """ 114 | Computes the cost with pessimism 115 | """ 116 | if self.input_type == 'sa': 117 | rff_input = torch.cat([states, actions], dim=1) 118 | elif self.input_type == 'ss': 119 | assert(next_states is not None) 120 | rff_input = torch.cat([states, next_states], dim=1) 121 | elif 
self.input_type == 'sas': 122 | rff_input = torch.cat([states, actions, next_states], dim=1) 123 | elif self.input_type == 's': 124 | rff_input = states 125 | else: 126 | raise NotImplementedError("Input type not implemented") 127 | 128 | # Get Linear Cost 129 | rff_cost = self.get_costs(rff_input) 130 | 131 | if self.cost_range is not None: 132 | # Get Bonus from Ensemble 133 | discrepancy = ensemble.get_action_discrepancy(states, actions)/ensemble.threshold 134 | discrepancy = discrepancy.view(-1, 1) 135 | discrepancy[discrepancy>1.0] = 1.0 136 | # Bonus is LOW if (s,a) is unknown 137 | bonus = discrepancy * self.c_min 138 | else: 139 | bonus = ensemble.get_action_discrepancy(states, actions).view(-1,1) 140 | 141 | # Weight cost components 142 | ipm = (1-self.lambda_b)*rff_cost 143 | 144 | # Conservative/Pessimism Penalty term 145 | weighted_bonus = self.lambda_b*bonus.cpu() # Note cpu hardcoding 146 | 147 | # Cost 148 | cost = ipm - weighted_bonus 149 | 150 | # Logging info 151 | info = {'bonus': weighted_bonus, 'ipm': ipm, 'v_targ': rff_cost, 'cost': cost} 152 | 153 | return cost, info 154 | 155 | -------------------------------------------------------------------------------- /milo/milo/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.dataset.datasets import OfflineDataset 2 | -------------------------------------------------------------------------------- /milo/milo/dataset/datasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym.spaces import Discrete, Box 4 | 5 | import torch 6 | from torch.utils.data import Dataset 7 | 8 | 9 | class OfflineDataset(Dataset): 10 | """ 11 | Pytorch Dataset class for our offline dataset. Note we return (s,a,s') triples. 12 | :param env_name: (str) name of gym environment 13 | :param state: (torch Tensor) tensor with shape (number of samples, state dimension) with state data 14 | :param action: (torch Tensor) tensor with shape (number of samples, action dimension) with action data 15 | :param next_state: (torch Tensor) tensor with shape (number of samples, state dimension) with next state data 16 | :param device: (torch Device) device for pytorch. 
Currently hardcoded to cpu 17 | """ 18 | def __init__(self, env_name, state, action, next_state, device=torch.device('cpu')): 19 | self.device = device 20 | self.state = state 21 | self.action = action 22 | 23 | env = gym.make(env_name) 24 | if isinstance(env.action_space, Discrete): 25 | self.action = self.one_hot(action, env.action_space.n) 26 | elif isinstance(env.action_space, Box): 27 | self.action = action 28 | else: 29 | raise NotImplementedError( 30 | "Environment Action Space not yet supported") 31 | self.next_state = next_state 32 | del env 33 | 34 | def get_transformations(self): 35 | diff = self.next_state - self.state 36 | 37 | # Compute Means 38 | state_mean = self.state.mean(dim=0).float().requires_grad_(False) 39 | action_mean = self.action.mean(dim=0).float().requires_grad_(False) 40 | diff_mean = diff.mean(dim=0).float().requires_grad_(False) 41 | 42 | # Compute Scales 43 | state_scale = torch.abs( 44 | self.state - state_mean).mean(dim=0).float().requires_grad_(False) + 1e-8 45 | action_scale = torch.abs( 46 | self.action - action_mean).mean(dim=0).float().requires_grad_(False) + 1e-8 47 | diff_scale = torch.abs( 48 | diff - diff_mean).mean(dim=0).float().requires_grad_(False) + 1e-8 49 | 50 | return state_mean.to(self.device), state_scale.to(self.device), action_mean.to(self.device), \ 51 | action_scale.to(self.device), diff_mean.to( 52 | self.device), diff_scale.to(self.device) 53 | 54 | def one_hot(self, action, action_dim): 55 | db_size = action.size(0) 56 | one_hot_action = torch.eye(action_dim)[action] 57 | return one_hot_action.view(db_size, action_dim) 58 | 59 | def __len__(self): 60 | return self.state.size(0) 61 | 62 | def __getitem__(self, idx): 63 | return self.state[idx].float(), self.action[idx].float(), self.next_state[idx].float() 64 | -------------------------------------------------------------------------------- /milo/milo/dynamics_model/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.dynamics_model.mlp_dynamics import DynamicsEnsemble, DynamicsModel 2 | -------------------------------------------------------------------------------- /milo/milo/gym_env/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.gym_env.wrappers import model_based_env 2 | from milo.gym_env.multiprocessing_env import MujocoEnvProcess 3 | from gym.envs.registration import register 4 | 5 | register( 6 | id='Hopper-v4', 7 | entry_point='milo.gym_env.hopper:HopperEnv', 8 | max_episode_steps=400, 9 | reward_threshold=3800.0, 10 | ) 11 | 12 | register( 13 | id='Walker2d-v4', 14 | max_episode_steps=400, 15 | entry_point='milo.gym_env.walker2d:Walker2dEnv' 16 | ) 17 | 18 | register( 19 | id='HalfCheetah-v4', 20 | entry_point='milo.gym_env.half_cheetah:HalfCheetahEnv', 21 | max_episode_steps=500, 22 | reward_threshold=4800.0, 23 | ) 24 | 25 | register( 26 | id='Ant-v4', 27 | entry_point='milo.gym_env.ant:AntEnv', 28 | max_episode_steps=500, 29 | reward_threshold=6000.0, 30 | ) 31 | 32 | register( 33 | id='Humanoid-v4', 34 | entry_point='milo.gym_env.humanoid:HumanoidEnv', 35 | max_episode_steps=500, 36 | ) 37 | -------------------------------------------------------------------------------- /milo/milo/gym_env/ant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | 
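        # Track the torso x-y position before each simulation step so that
        # _get_obs can report the per-step displacement (delta) rather than the
        # absolute position. Frame skip is 10 (instead of the default 5 used by
        # the standard gym Ant); get_reward rescales the reward by 2.0 to compensate.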
self.pos_before = np.array([0.0, 0.0]) 8 | mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 10) 9 | # mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 5) 10 | utils.EzPickle.__init__(self) 11 | 12 | def step(self, a): 13 | self.pos_before = self.data.qpos[:2].copy() 14 | self.do_simulation(a, self.frame_skip) 15 | obs = self._get_obs() 16 | reward = self.get_reward(obs, a) 17 | done = self.get_done(obs) 18 | return obs, reward, done, {} 19 | 20 | def _get_obs(self): 21 | delta = self.data.qpos[:2] - self.pos_before 22 | return np.concatenate([ 23 | delta, 24 | self.sim.data.qpos.flat[2:], 25 | self.sim.data.qvel.ravel() * self.dt, 26 | # NOTE: We are throwing away contact related info, since it is often unnecessary 27 | # np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 28 | ]) 29 | 30 | def get_reward(self, obs, act): 31 | obs = np.clip(obs, -10.0, 10.0) 32 | if len(obs.shape) == 1: 33 | # vector obs, called when stepping the env 34 | vel_x = obs[0] / self.dt # recover velocity from delta 35 | power = np.square(act).sum() 36 | # NOTE: We will use the contact force penalties for actual reward 37 | # to be consistent with gym results 38 | cfrc_ext = np.clip(self.sim.data.cfrc_ext, -1, 1).ravel() 39 | height = obs[2] 40 | reward = - 0.5 * 1e-3 * np.square(cfrc_ext).sum() # contact cost 41 | else: 42 | # for imaginary rollouts using learned model 43 | vel_x = obs[:, :, 0] / self.dt # recover velocity from delta 44 | power = np.square(act).sum(axis=-1) 45 | height = obs[:, :, 2] 46 | # NOTE: WE will not consider contact costs for imaginary rollouts 47 | reward = 0.0 48 | survive_reward = 1.0 * (height > 0.2) * (height < 1.0) 49 | ctrl_cost = 0.5 * power 50 | reward += vel_x - ctrl_cost + survive_reward 51 | reward = reward * 2.0 # to account for scaling difference (skip 5 --> 10) 52 | return reward 53 | 54 | def compute_path_rewards(self, paths): 55 | # path has two keys: observations and actions 56 | # path["observations"] : (num_traj, horizon, obs_dim) 57 | # path["rewards"] should have shape (num_traj, horizon) 58 | obs = paths["observations"] 59 | act = paths["actions"] 60 | rewards = self.get_reward(obs, act) 61 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 62 | return paths 63 | 64 | def get_done(self, obs): 65 | done = not (np.isfinite(obs).all() and (obs[2] > 0.2) and (obs[2] < 1.0)) 66 | return done 67 | 68 | def truncate_paths(self, paths): 69 | for path in paths: 70 | obs = path["observations"] 71 | height = obs[:,2]#obs[:, 0] 72 | T = obs.shape[0] 73 | t = 0 74 | done = False 75 | while t < T and done is False: 76 | done = not (np.isfinite(obs[t]).all() and (height[t] > 0.2) and (height[t] < 1.0)) 77 | T = t if done else T 78 | t = t + 1 79 | path["observations"] = path["observations"][:T] 80 | path["actions"] = path["actions"][:T] 81 | path["rewards"] = path["rewards"][:T] 82 | path["terminated"] = done 83 | return paths 84 | 85 | def reset_model(self): 86 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 87 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 88 | self.set_state(qpos, qvel) 89 | return self._get_obs() 90 | 91 | def get_env_state(self): 92 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy()) 93 | 94 | def set_env_state(self, state): 95 | qpos = state['qpos'] 96 | qvel = state['qvel'] 97 | self.sim.reset() 98 | self.data.qpos[:] = qpos 99 | self.data.qvel[:] = qvel 100 | self.sim.forward() 101 | 102 | def viewer_setup(self): 103 | self.viewer.cam.distance = 
self.model.stat.extent * 0.5 104 | -------------------------------------------------------------------------------- /milo/milo/gym_env/half_cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | self.pos_before = 0.0 8 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 10) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, a): 12 | self.pos_before = self.data.qpos[0].copy() 13 | self.do_simulation(a, self.frame_skip) 14 | obs = self._get_obs() 15 | reward = self.get_reward(obs, a) 16 | done = False # no termination for this env 17 | return obs, reward, done, {} 18 | 19 | def _get_obs(self): 20 | delta = self.data.qpos[0] - self.pos_before 21 | return np.concatenate([ 22 | [delta], 23 | self.sim.data.qpos.ravel()[1:], 24 | self.sim.data.qvel.ravel() * self.dt, 25 | ]) 26 | 27 | def get_reward(self, obs, act): 28 | obs = np.clip(obs, -10.0, 10.0) 29 | if len(obs.shape) == 1: 30 | # vector obs, called when stepping the env 31 | # vel_x = obs[-9] / self.dt # recover velocity from delta 32 | vel_x = obs[0] / self.dt 33 | power = np.square(act).sum() 34 | else: 35 | # vel_x = obs[:, :, -9] / self.dt # recover velocity from delta 36 | vel_x = obs[:, :, 0] / self.dt 37 | power = np.square(act).sum(axis=-1) 38 | reward = vel_x - 0.1 * power 39 | reward = reward * 2.0 # to account for scaling difference (skip 5 --> 10) 40 | return reward 41 | 42 | def compute_path_rewards(self, paths): 43 | # path has two keys: observations and actions 44 | # path["observations"] : (num_traj, horizon, obs_dim) 45 | # path["rewards"] should have shape (num_traj, horizon) 46 | obs = paths["observations"] 47 | act = paths["actions"] 48 | rewards = self.get_reward(obs, act) 49 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 50 | 51 | def get_env_state(self): 52 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy()) 53 | 54 | def set_env_state(self, state): 55 | qpos = state['qpos'] 56 | qvel = state['qvel'] 57 | self.sim.reset() 58 | self.data.qpos[:] = qpos 59 | self.data.qvel[:] = qvel 60 | self.sim.forward() 61 | 62 | def reset_model(self): 63 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 64 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 65 | self.set_state(qpos, qvel) 66 | return self._get_obs() 67 | 68 | def viewer_setup(self): 69 | self.viewer.cam.distance = self.model.stat.extent * 0.5 70 | -------------------------------------------------------------------------------- /milo/milo/gym_env/hopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HopperEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | self.pos_before = 0.0 8 | self.height_idx = 1 9 | self.ang_idx = 2 10 | self.ang_threshold = 1.0 11 | mujoco_env.MujocoEnv.__init__(self, 'hopper.xml', 10) 12 | # mujoco_env.MujocoEnv.__init__(self, 'hopper.xml', 4) 13 | utils.EzPickle.__init__(self) 14 | 15 | def step(self, a): 16 | self.pos_before = self.data.qpos[0].copy() 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self._get_obs() 19 | reward = self.get_reward(obs, a) 20 | done = self.get_done(obs) 21 | return obs, reward, done, {} 22 | 23 | def 
_get_obs(self): 24 | # I am using delta instead of velocity, 25 | # so that all obs are of similar magnitude 26 | delta = self.data.qpos[0] - self.pos_before 27 | return np.concatenate([ 28 | [delta], 29 | self.sim.data.qpos.ravel()[1:], 30 | self.sim.data.qvel.ravel() * self.dt, 31 | ]) 32 | 33 | def get_reward(self, obs, act): 34 | obs = np.clip(obs, -10.0, 10.0) 35 | if len(obs.shape) == 1: 36 | # vector obs, called when stepping the env 37 | # vel_x = (obs[1] - obs[0]) / self.dt # recover velocity from delta 38 | vel_x = obs[0] / self.dt 39 | power = np.square(act).sum() 40 | height, ang = obs[self.height_idx:(self.ang_idx+1)] 41 | else: 42 | # vel_x = (obs[:, :, 1] - obs[:, :, 0]) / self.dt # recover velocity from delta 43 | vel_x = obs[:, :, 0] / self.dt 44 | power = np.square(act).sum(axis=-1) 45 | height = obs[:, :, self.height_idx] 46 | ang = obs[:, :, self.ang_idx] 47 | alive_bonus = 1.0 * (height > .7) * (np.abs(ang) < self.ang_threshold) 48 | reward = vel_x + alive_bonus - 1e-3*power 49 | reward = reward * 2.5 # to account for scaling difference (skip 4 --> 10) 50 | return reward 51 | 52 | def compute_path_rewards(self, paths): 53 | # path has two keys: observations and actions 54 | # path["observations"] : (num_traj, horizon, obs_dim) 55 | # path["rewards"] should have shape (num_traj, horizon) 56 | obs = paths["observations"] 57 | act = paths["actions"] 58 | rewards = self.get_reward(obs, act) 59 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 60 | 61 | def get_done(self, obs): 62 | height, ang = obs[self.height_idx:(self.ang_idx+1)] 63 | done = not (np.isfinite(obs).all() and (np.abs(obs) < 100).all() and 64 | (height > .7) and (np.abs(ang) < self.ang_threshold)) 65 | return done 66 | 67 | def truncate_paths(self, paths): 68 | for path in paths: 69 | obs = path["observations"] 70 | height = obs[:, self.height_idx] 71 | angle = obs[:, self.ang_idx] 72 | T = obs.shape[0] 73 | t = 0 74 | done = False 75 | while t < T and done is False: 76 | done = not ((np.abs(obs[t]) < 100).all() and (height[t] > .7) and (np.abs(angle[t]) < self.ang_threshold)) 77 | t = t + 1 78 | T = t if done else T 79 | path["observations"] = path["observations"][:T] 80 | path["actions"] = path["actions"][:T] 81 | path["rewards"] = path["rewards"][:T] 82 | path["terminated"] = done 83 | return paths 84 | 85 | def get_env_state(self): 86 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy()) 87 | 88 | def set_env_state(self, state): 89 | qpos = state['qpos'] 90 | qvel = state['qvel'] 91 | self.sim.reset() 92 | self.data.qpos[:] = qpos 93 | self.data.qvel[:] = qvel 94 | self.sim.forward() 95 | 96 | def reset_model(self): 97 | qpos = self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq) 98 | qvel = self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 99 | self.set_state(qpos, qvel) 100 | return self._get_obs() 101 | 102 | def viewer_setup(self): 103 | self.viewer.cam.trackbodyid = 2 104 | self.viewer.cam.distance = self.model.stat.extent * 0.75 105 | self.viewer.cam.lookat[2] = 1.15 106 | self.viewer.cam.elevation = -20 107 | -------------------------------------------------------------------------------- /milo/milo/gym_env/humanoid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import mujoco_env 3 | from gym import utils 4 | 5 | DEFAULT_CAMERA_CONFIG = { 6 | 'trackbodyid': 1, 7 | 'distance': 4.0, 8 | 'lookat': np.array((0.0, 
0.0, 2.0)), 9 | 'elevation': -20.0, 10 | } 11 | 12 | def mass_center(model, sim): 13 | mass = np.expand_dims(model.body_mass, axis=1) 14 | xpos = sim.data.xipos 15 | return (np.sum(mass * xpos, axis=0) / np.sum(mass))[0:2].copy() 16 | 17 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): 18 | def __init__(self, 19 | xml_file='humanoid.xml', 20 | reset_noise_scale=1e-2): 21 | utils.EzPickle.__init__(**locals()) 22 | 23 | self._reset_noise_scale = reset_noise_scale 24 | 25 | #mujoco_env.MujocoEnv.__init__(self, xml_file, 5) 26 | mujoco_env.MujocoEnv.__init__(self, xml_file, 10) 27 | 28 | def step(self, action): 29 | self.xypos_before = mass_center(self.model, self.sim) 30 | self.do_simulation(action, self.frame_skip) 31 | 32 | observation = self._get_obs() 33 | reward = self.get_reward(observation, action) 34 | done = self.get_done(observation) 35 | 36 | return observation, reward, done, {} 37 | 38 | def _get_obs(self): 39 | position = self.sim.data.qpos.flat.copy() 40 | velocity = self.sim.data.qvel.flat.copy() 41 | # Add Difference of center of mass to get reward 42 | delta = mass_center(self.model, self.sim) - self.xypos_before 43 | 44 | return np.concatenate(( 45 | delta, 46 | position[2:], 47 | velocity*self.dt, 48 | )) 49 | 50 | def get_reward(self, obs, action): 51 | obs = np.clip(obs, -10.0, 10.0) 52 | ctrl = np.clip(action, -0.4, 0.4) 53 | 54 | x_velocity, y_velocity = obs[:2]/self.dt 55 | z = obs[2] 56 | forward_reward = 1.25 * x_velocity 57 | alive_reward = 5.0 58 | ctrl_cost = 0.1 * np.sum(np.square(ctrl)) 59 | reward = forward_reward + alive_reward - ctrl_cost 60 | 61 | return reward * 2.0 62 | 63 | def get_done(self, obs): 64 | healthy = 1.0 < obs[2] < 2.0 65 | return not healthy 66 | 67 | def reset_model(self): 68 | noise_low = -self._reset_noise_scale 69 | noise_high = self._reset_noise_scale 70 | 71 | qpos = self.init_qpos + self.np_random.uniform( 72 | low=noise_low, high=noise_high, size=self.model.nq) 73 | qvel = self.init_qvel + self.np_random.uniform( 74 | low=noise_low, high=noise_high, size=self.model.nv) 75 | self.set_state(qpos, qvel) 76 | 77 | observation = self._get_obs() 78 | return observation 79 | 80 | def viewer_setup(self): 81 | for key, value in DEFAULT_CAMERA_CONFIG.items(): 82 | if isinstance(value, np.ndarray): 83 | getattr(self.viewer.cam, key)[:] = value 84 | else: 85 | setattr(self.viewer.cam, key, value) 86 | -------------------------------------------------------------------------------- /milo/milo/gym_env/multiprocessing_env.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | from torch.multiprocessing import Process 4 | 5 | class MujocoEnvProcess(Process): 6 | """ 7 | Process class for model based environments that are compatible with MJRL 8 | """ 9 | def __init__(self, env, child_conn, seed, eval_mode=False, paths_per_process=25): 10 | super().__init__() 11 | self.daemon = True 12 | self.env = copy.deepcopy(env) 13 | self.horizon = env.horizon 14 | self.child_conn = child_conn 15 | self.paths_per_process = paths_per_process 16 | self.seed = seed 17 | self.eval_mode = eval_mode 18 | 19 | def run(self): 20 | super().run() 21 | while True: 22 | paths, ctr = [], 0 23 | policy = self.child_conn.recv() # Recieve policy 24 | for ep in range(self.paths_per_process): 25 | # Set new seed 26 | seed = self.seed + ep 27 | self.env.set_seed(seed) 28 | np.random.seed(seed) 29 | 30 | observations = [] 31 | actions = [] 32 | rewards = [] 33 | next_observations = [] 34 | agent_infos 
= [] 35 | env_infos = [] 36 | 37 | o = self.env.reset() 38 | done = False 39 | t = 0 40 | while t < self.horizon and done != True: 41 | a, agent_info = policy.get_action(o) 42 | if self.eval_mode: 43 | a = agent_info['evaluation'] 44 | next_o, r, done, info = self.env.step(a) # Take step 45 | 46 | observations.append(o) 47 | next_observations.append(next_o) 48 | actions.append(a) 49 | rewards.append(r) 50 | agent_infos.append(agent_info) 51 | env_infos.append(info) 52 | 53 | o = next_o 54 | t += 1 55 | 56 | path = dict( 57 | observations = np.array(observations), 58 | next_observations = np.array(next_observations), 59 | actions = np.array(actions), 60 | rewards = np.array(rewards), 61 | agent_infos = stack_tensor_dict_list(agent_infos), 62 | env_infos = stack_tensor_dict_list(env_infos), 63 | terminated = done 64 | ) 65 | 66 | paths.append(path) 67 | ctr += t 68 | 69 | self.child_conn.send([paths, ctr]) # Return num samples 70 | 71 | def close(self): 72 | super().close() 73 | 74 | def stack_tensor_list(tensor_list): 75 | return np.array(tensor_list) 76 | 77 | def stack_tensor_dict_list(tensor_dict_list): 78 | """ 79 | Stack a list of dictionaries of {tensors or dictionary of tensors}. 80 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}. 81 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors} 82 | """ 83 | keys = list(tensor_dict_list[0].keys()) 84 | ret = dict() 85 | for k in keys: 86 | example = tensor_dict_list[0][k] 87 | if isinstance(example, dict): 88 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list]) 89 | else: 90 | v = stack_tensor_list([x[k] for x in tensor_dict_list]) 91 | ret[k] = v 92 | return ret 93 | -------------------------------------------------------------------------------- /milo/milo/gym_env/walker2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class Walker2dEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | self.pos_before = 0.0 8 | self.height_idx, self.ang_idx = 1, 2 9 | mujoco_env.MujocoEnv.__init__(self, 'walker2d.xml', 10) 10 | # mujoco_env.MujocoEnv.__init__(self, "walker2d.xml", 4) 11 | utils.EzPickle.__init__(self) 12 | 13 | def step(self, a): 14 | self.pos_before = self.data.qpos[0].copy() 15 | self.do_simulation(a, self.frame_skip) 16 | obs = self._get_obs() 17 | reward = self.get_reward(obs, a) 18 | done = self.get_done(obs) 19 | return obs, reward, done, {} 20 | 21 | def _get_obs(self): 22 | # I am using delta instead of velocity, 23 | # so that all obs are of similar magnitude 24 | delta = self.data.qpos[0] - self.pos_before 25 | return np.concatenate([ 26 | [delta], 27 | self.sim.data.qpos.ravel()[1:], 28 | self.sim.data.qvel.ravel() * self.dt, 29 | ]) 30 | 31 | def get_reward(self, obs, act): 32 | obs = np.clip(obs, -10.0, 10.0) 33 | if len(obs.shape) == 1: 34 | # vector obs, called when stepping the env 35 | vel_x = obs[0] / self.dt # recover velocity from delta 36 | power = np.square(act).sum() 37 | height, ang = obs[self.height_idx:(self.ang_idx+1)] 38 | else: 39 | vel_x = obs[:, :, 0] / self.dt # recover velocity from delta 40 | power = np.square(act).sum(axis=-1) 41 | height = obs[:, :, self.height_idx] 42 | ang = obs[:, :, self.ang_idx] 43 | alive_bonus = 1.0 * (height > 0.8) * (height < 2.0) * (np.abs(ang) < 1.0) 44 | reward = vel_x + alive_bonus - 1e-3 * power 45 | reward = reward * 2.5 # to account for scaling 
difference (skip 4 --> 10) 46 | return reward 47 | 48 | def compute_path_rewards(self, paths): 49 | # path has two keys: observations and actions 50 | # path["observations"] : (num_traj, horizon, obs_dim) 51 | # path["rewards"] should have shape (num_traj, horizon) 52 | obs = paths["observations"] 53 | act = paths["actions"] 54 | rewards = self.get_reward(obs, act) 55 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 56 | return paths 57 | 58 | def get_done(self, obs): 59 | height, ang = obs[self.height_idx:(self.ang_idx+1)] 60 | done = not (np.isfinite(obs).all() and (np.abs(obs) < 100).all() and 61 | (height > 0.8) and (height < 2.0) and (np.abs(ang) < 1.0)) 62 | return done 63 | 64 | def truncate_paths(self, paths): 65 | for path in paths: 66 | obs = path["observations"] 67 | height = obs[:, self.height_idx] 68 | angle = obs[:, self.ang_idx] 69 | T = obs.shape[0] 70 | t = 0 71 | done = False 72 | while t < T and done is False: 73 | done = not ((np.abs(obs[t]) < 100).all() and (height[t] > 0.8) and \ 74 | (height[t] < 2.0) and (np.abs(angle[t]) < 1.0)) 75 | T = t if done else T 76 | t = t + 1 77 | path["observations"] = path["observations"][:T] 78 | path["actions"] = path["actions"][:T] 79 | path["rewards"] = path["rewards"][:T] 80 | path["terminated"] = done 81 | return paths 82 | 83 | def get_env_state(self): 84 | return dict(qpos=self.data.qpos.copy(), qvel=self.data.qvel.copy()) 85 | 86 | def set_env_state(self, state): 87 | qpos = state['qpos'] 88 | qvel = state['qvel'] 89 | self.sim.reset() 90 | self.data.qpos[:] = qpos 91 | self.data.qvel[:] = qvel 92 | self.sim.forward() 93 | 94 | def reset_model(self): 95 | self.set_state( 96 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq), 97 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 98 | ) 99 | return self._get_obs() 100 | 101 | def viewer_setup(self): 102 | self.viewer.cam.trackbodyid = 2 103 | self.viewer.cam.distance = self.model.stat.extent * 0.5 104 | self.viewer.cam.lookat[2] = 1.15 105 | self.viewer.cam.elevation = -20 106 | -------------------------------------------------------------------------------- /milo/milo/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.sampler.sampler import mb_sampler 2 | -------------------------------------------------------------------------------- /milo/milo/sampler/sampler.py: -------------------------------------------------------------------------------- 1 | import time 2 | from copy import deepcopy 3 | from milo.gym_env import MujocoEnvProcess 4 | from torch.multiprocessing import Pipe 5 | 6 | def mb_sampler(env, 7 | policy, 8 | num_samples, 9 | base_seed, 10 | eval_mode=False, 11 | num_workers=4, 12 | paths_per_process=13, 13 | verbose=False): 14 | """ 15 | Multiprocess sampler for model-based rollouts. Note, this is only meant for CPU usage. 
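    :param env: model-based environment to roll out in
    :param policy: policy used for action selection (deep-copied and sent to each worker)
    :param num_samples: minimum total number of environment steps to collect
    :param base_seed: base random seed; worker i is seeded with 12345 + base_seed * i
    :param eval_mode: if True, workers use the policy's evaluation action (agent_info['evaluation']) instead of sampling
    :param num_workers: number of parallel MujocoEnvProcess workers
    :param paths_per_process: number of trajectories each worker collects per policy it receives
    :param verbose: if True, print running sample/trajectory counts and elapsed time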
16 | """ 17 | 18 | # Create Pipes and spawn jobs 19 | jobs, parent_conns, child_conns = [], [], [] 20 | for idx in range(num_workers): 21 | parent_conn, child_conn = Pipe() 22 | seed = 12345+base_seed*idx 23 | job = MujocoEnvProcess(env, child_conn, seed, eval_mode=eval_mode, paths_per_process=paths_per_process) 24 | job.start() 25 | jobs.append(job) 26 | parent_conns.append(parent_conn) 27 | child_conns.append(child_conn) 28 | 29 | # Run Jobs 30 | start_time = time.time() 31 | all_paths, curr_samples = [], 0 32 | while curr_samples < num_samples: 33 | for parent_conn in parent_conns: 34 | parent_conn.send(deepcopy(policy)) 35 | for parent_conn in parent_conns: 36 | paths, ctr = parent_conn.recv() 37 | all_paths.extend(paths) 38 | curr_samples += ctr 39 | if verbose: 40 | print(f"Collected {curr_samples} samples and {len(all_paths)} trajectories <<<<<< took {time.time()-start_time} seconds") 41 | 42 | for job in jobs: 43 | job.terminate() 44 | 45 | return all_paths 46 | -------------------------------------------------------------------------------- /milo/milo/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from milo.utils.logger import init_logger 2 | from milo.utils.arguments import get_args 3 | from milo.utils.evaluate import evaluate 4 | from milo.utils.util import * 5 | -------------------------------------------------------------------------------- /milo/milo/utils/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def get_args(): 5 | # ====== Argument Parser ====== 6 | parser = argparse.ArgumentParser( 7 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 8 | ) 9 | 10 | # Logging/Environment Arguments 11 | parser.add_argument('--env', type=str, 12 | help='environment ID', default='Hopper-v6') 13 | parser.add_argument('--seed', type=int, help='seed', default=100) 14 | parser.add_argument('--num_cpu', type=int, 15 | help='number of processes used for inference', default=4) 16 | parser.add_argument('--num_trajs', type=int, 17 | help='number of expert trajs', default=10) 18 | parser.add_argument('--num_samples', type=int, 19 | help='number of expert samples', default=500) 20 | parser.add_argument('--subsample_freq', type=int, 21 | help='subsample frequency', default=8) 22 | parser.add_argument('--norm_thresh_coeff', type=float, 23 | help='Norm threshold', default=2) 24 | parser.add_argument('--include_expert', action='store_true', 25 | help='include expert data into offline db', default=False) 26 | parser.add_argument('--subsample_expert', action='store_true', 27 | help='subsample expert samples', default=False) 28 | parser.add_argument('--randomize_expert', action='store_true', 29 | help='randomize expert samples', default=False) 30 | parser.add_argument('--save_iter', type=int, 31 | help='Interval to Save checkpoints', default=10) 32 | 33 | # Path Arguments 34 | parser.add_argument('--root_path', type=str, 35 | help='Root dir to save outputs', default='./experiments') 36 | parser.add_argument('--data_path', type=str, 37 | help='Root data dir to get db', default='./data') 38 | parser.add_argument('--expert_db', type=str, 39 | help='expert db name', default='Hopper-v6_100_3012.62.pt') 40 | parser.add_argument('--offline_db', type=str, 41 | help='offline db name', default='Hopper-v6_100_3025.47.pt') 42 | parser.add_argument('--model_save_path', type=str, help='Path to save models', 43 | default='./experiments/dynamics_model_weights') 44 | 
parser.add_argument('--id', type=int, help='Experiment id', default=0) 45 | 46 | # Dynamics Model Ensemble Arguments 47 | parser.add_argument('--n_models', type=int, 48 | help='Number of dynamics models in ensemble', default=4) 49 | parser.add_argument('--n_epochs', type=int, 50 | help='Number of epochs to train models', default=5) 51 | parser.add_argument('--grad_clip', type=float, 52 | help='Max Gradient Norm', default=1.0) 53 | parser.add_argument('--dynamics_optim', type=str, 54 | help='Optimizer to use [sgd, adam]', default='sgd') 55 | 56 | # Cost Arguments 57 | parser.add_argument('--feature_dim', type=int, 58 | help='Feature dimension', default=512) 59 | parser.add_argument('--update_type', type=str, 60 | help='exact, geometric, decay, decay_sqrt, ficticious', default='exact') 61 | parser.add_argument('--bw_quantile', type=float, 62 | help='Quantile when fitting bandwidth', default=0.2) 63 | parser.add_argument('--lambda_b', type=float, 64 | help='Bonus/Penalty weighting param', default=0.1) 65 | parser.add_argument('--cost_lr', type=float, 66 | help='0.0 is exact update, otherwise learning rate', default=0.0) 67 | 68 | # Policy Gradient Arguments 69 | parser.add_argument('--planner', type=str, 70 | help='pg alg to use (trpo, ppo)', default='trpo') 71 | parser.add_argument('--actor_model_hidden', type=int, 72 | nargs='+', help='hidden dims for actor', default=[32, 32]) 73 | parser.add_argument('--critic_model_hidden', type=int, nargs='+', 74 | help='hidden dims for critic', default=[128, 128]) 75 | parser.add_argument('--gamma', type=float, 76 | help='discount factor for rewards (default: 0.99)', default=0.995) 77 | parser.add_argument('--gae_lambda', type=float, 78 | help='gae lambda val', default=0.97) 79 | parser.add_argument('--samples_per_step', type=int, 80 | help='Number of mb samples per pg step', default=512) 81 | parser.add_argument('--policy_init_log', type=float, 82 | help='policy init log', default=-0.25) 83 | parser.add_argument('--policy_min_log', type=float, 84 | help='policy min log', default=-2.0) 85 | parser.add_argument('--vf_iters', type=int, 86 | help='Number of value optim steps', default=2) 87 | parser.add_argument('--vf_batch_size', type=int, 88 | help='Critic batch size', default=64) 89 | parser.add_argument('--vf_lr', type=float, help='Value lr', default=1e-3) 90 | parser.add_argument('--vf_reg_coef', type=float, 91 | help='baseline regularization coeff', default=1e-3) 92 | 93 | # BC regularization Arguments 94 | parser.add_argument('--do_bc_reg', action='store_true', help='Add bc regularization to policy gradient', default=False) 95 | parser.add_argument('--bc_reg_coeff', type=float, help='Regularization coefficient for policy gradient', default=0.1) 96 | 97 | # TRPO Arguments 98 | parser.add_argument('--cg_iter', type=int, 99 | help='Number of CG iterations', default=10) 100 | parser.add_argument('--cg_damping', type=float, 101 | help='CG damping coefficient', default=1e-4) 102 | parser.add_argument('--kl_dist', type=float, 103 | help='Trust region', default=0.05) 104 | parser.add_argument('--hvp_sample_frac', type=float, 105 | help='Fraction of samples for FIM', default=1.0) 106 | 107 | # PPO Arguments 108 | parser.add_argument('--clip_coef', type=float, 109 | help='Clip Coefficient for PPO Trust region', default=0.2) 110 | parser.add_argument('--ppo_lr', type=float, 111 | help='PPO learning rate', default=3e-4) 112 | parser.add_argument('--ppo_epochs', type=int, 113 | help='Epochs per PPO step', default=10) 114 | parser.add_argument('--ppo_batch_size', 
type=int, 115 | help='Mini-batch size for PPO', default=64) 116 | 117 | # BC Arguments 118 | parser.add_argument('--bc_epochs', type=int, 119 | help='Number of BC epochs', default=3) 120 | parser.add_argument('--n_bc_iters', type=int, default=10, 121 | help='number of times to run BC iterations') 122 | 123 | # General Algorithm Arguments 124 | parser.add_argument('--n_iter', type=int, help='Number of offline IL iterations to run', default=300) 125 | parser.add_argument('--pg_iter', type=int, help='Number of pg steps', default=5) 126 | parser.add_argument('--use_ground_truth', action='store_true', help='use ground truth rewards', default=False) 127 | parser.add_argument('--do_model_free', action='store_true', help='do model free policy gradient', default=False) 128 | 129 | args = parser.parse_args() 130 | return args 131 | -------------------------------------------------------------------------------- /milo/milo/utils/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from mjrl.samplers.core import sample_paths 5 | 6 | # ======================== 7 | # === Evaluation Utils === 8 | # ======================== 9 | 10 | def evaluate(n_iter, logger, writer, args, env, policy, reward_func, num_traj=10, adroit=False): 11 | greedy_samples = sample_paths(num_traj=num_traj, env=env, policy=policy, \ 12 | num_cpu=args.num_cpu, base_seed=args.seed, eval_mode=True, suppress_print=True) 13 | samples = sample_paths(num_traj=num_traj, env=env, policy=policy, \ 14 | num_cpu=args.num_cpu, base_seed=args.seed, eval_mode=False, suppress_print=True) 15 | 16 | if adroit: 17 | greedy_success = env.evaluate_success(greedy_samples) 18 | sample_success = env.evaluate_success(samples) 19 | 20 | # Compute scores 21 | greedy_scores = np.array([np.sum(traj['rewards']) for traj in greedy_samples]) 22 | sample_scores = np.array([np.sum(traj['rewards']) for traj in samples]) 23 | greedy_mean_lengths = np.mean([len(traj['rewards']) for traj in greedy_samples]) 24 | sample_mean_lengths = np.mean([len(traj['rewards']) for traj in samples]) 25 | greedy_mean, greedy_max, greedy_min = greedy_scores.mean(), greedy_scores.max(), greedy_scores.min() 26 | sample_mean, sample_max, sample_min = sample_scores.mean(), sample_scores.max(), sample_scores.min() 27 | 28 | # Compute MMD (S, A) 29 | greedy_x = np.concatenate([np.concatenate([traj['observations'], traj['actions']], axis=1) for traj in greedy_samples], axis=0) 30 | sample_x = np.concatenate([np.concatenate([traj['observations'], traj['actions']], axis=1) for traj in samples], axis=0) 31 | greedy_x = torch.from_numpy(greedy_x).float() 32 | sample_x = torch.from_numpy(sample_x).float() 33 | 34 | greedy_diff = reward_func.get_rep(greedy_x).mean(0) - reward_func.phi_e 35 | sample_diff = reward_func.get_rep(sample_x).mean(0) - reward_func.phi_e 36 | 37 | greedy_mmd = torch.dot(greedy_diff, greedy_diff) 38 | sample_mmd = torch.dot(sample_diff, sample_diff) 39 | 40 | # Log 41 | logger.info(f'Greedy Evaluation Score mean (min, max): {greedy_mean:.2f} ({greedy_min:.2f}, {greedy_max:.2f})') 42 | logger.info(f'Greedy Evaluation Trajectory Lengths: {greedy_mean_lengths:.2f}') 43 | logger.info(f'Greedy MMD: {greedy_mmd}') 44 | if adroit: 45 | logger.info(f'Greedy Success %: {greedy_success}%') 46 | logger.info(f'Sampled Evaluation Score mean (min, max): {sample_mean:.2f} ({sample_min:.2f}, {sample_max:.2f})') 47 | logger.info(f'Sampled Evaluation Trajectory Lengths: {sample_mean_lengths:.2f}') 48 | 
logger.info(f'Sampled MMD: {sample_mmd}') 49 | if adroit: 50 | logger.info(f'Sampled Success %: {sample_success}%') 51 | 52 | # Tensorboard Logging 53 | writer.add_scalars('data/inf_greedy_reward', {'min_score': greedy_min, 54 | 'mean_score': greedy_mean, 55 | 'max_score': greedy_max}, n_iter+1) 56 | writer.add_scalar('data/inf_greedy_len', greedy_mean_lengths, n_iter+1) 57 | writer.add_scalar('data/greedy_mmd', greedy_mmd, n_iter+1) 58 | writer.add_scalars('data/inf_sampled_reward', {'min_score': sample_min, 59 | 'mean_score': sample_mean, 60 | 'max_score': sample_max}, n_iter+1) 61 | writer.add_scalar('data/inf_sampled_len', sample_mean_lengths, n_iter+1) 62 | writer.add_scalar('data/sampled_mmd', sample_mmd, n_iter+1) 63 | if adroit: 64 | writer.add_scalar('data/greedy_success_percen', greedy_success, n_iter+1) 65 | writer.add_scalar('data/sampled_success_percen', sample_success, n_iter+1) 66 | 67 | scores = {'greedy': greedy_mean, 'sample': sample_mean} 68 | mmds = {'greedy': greedy_mmd, 'sample': sample_mmd} 69 | 70 | return scores, mmds 71 | 72 | -------------------------------------------------------------------------------- /milo/milo/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logger singleton wrapper 3 | Default logger folder is `os.path.join(__file__, '..', '..', 'logs')` 4 | """ 5 | import logging 6 | import logging.handlers 7 | import os 8 | import sys 9 | 10 | 11 | __all__ = ['init_logger'] 12 | 13 | 14 | def init_logger(log_dir): 15 | os.makedirs(log_dir, exist_ok=True) 16 | log_level = logging.DEBUG 17 | log_format = '%(message)s' 18 | 19 | logger = logging.getLogger(log_dir) 20 | logger.setLevel(log_level) 21 | path = os.path.join(log_dir, 'main.log') 22 | 23 | # file handler (log file) 24 | log_handler = logging.handlers.RotatingFileHandler(filename=path) 25 | log_handler.setLevel(log_level) 26 | log_handler.setFormatter(logging.Formatter(log_format)) 27 | logger.addHandler(log_handler) 28 | 29 | # stream handler (default sys.stderr) 30 | log_handler = logging.StreamHandler() 31 | log_handler.setLevel(log_level) 32 | log_handler.setFormatter(logging.Formatter(log_format)) 33 | logger.addHandler(log_handler) 34 | 35 | return logger 36 | -------------------------------------------------------------------------------- /milo/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='milo', 5 | version='0.1.0', 6 | packages=find_packages(), 7 | description='Components for MILO: Model based Imitation Learning from Offline data', 8 | ) 9 | -------------------------------------------------------------------------------- /mjrl/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # idea 104 | *.idea/ 105 | 106 | # Mac OSX files 107 | *.DS_Store -------------------------------------------------------------------------------- /mjrl/README.md: -------------------------------------------------------------------------------- 1 | # RL for MuJoCo 2 | 3 | This package contains implementations of various RL algorithms for continuous control tasks simulated with [MuJoCo.](http://www.mujoco.org/) 4 | 5 | # Installation 6 | The main package dependencies are `MuJoCo`, `python=3.7`, `gym>=0.13`, `mujoco-py>=2.0`, and `pytorch>=1.0`. See `setup/README.md` ([link](https://github.com/aravindr93/mjrl/tree/master/setup#installation)) for detailed install instructions. 7 | 8 | # Bibliography 9 | If you find the package useful, please cite the following papers. 10 | ``` 11 | @INPROCEEDINGS{Rajeswaran-NIPS-17, 12 | AUTHOR = {Aravind Rajeswaran and Kendall Lowrey and Emanuel Todorov and Sham Kakade}, 13 | TITLE = "{Towards Generalization and Simplicity in Continuous Control}", 14 | BOOKTITLE = {NIPS}, 15 | YEAR = {2017}, 16 | } 17 | 18 | @INPROCEEDINGS{Rajeswaran-RSS-18, 19 | AUTHOR = {Aravind Rajeswaran AND Vikash Kumar AND Abhishek Gupta AND 20 | Giulia Vezzani AND John Schulman AND Emanuel Todorov AND Sergey Levine}, 21 | TITLE = "{Learning Complex Dexterous Manipulation with Deep Reinforcement Learning and Demonstrations}", 22 | BOOKTITLE = {Proceedings of Robotics: Science and Systems (RSS)}, 23 | YEAR = {2018}, 24 | } 25 | ``` 26 | 27 | # Credits 28 | This package is maintained by [Aravind Rajeswaran](http://homes.cs.washington.edu/~aravraj/) and other members of the [Movement Control Lab,](http://homes.cs.washington.edu/~todorov/) University of Washington Seattle. 29 | -------------------------------------------------------------------------------- /mjrl/examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | Here we provide a job script to illustrate policy optimization with incrimental learning methods like NPG and PPO. To run the experiments, use the commands below. The experiments are run through the job script provided which tasks two arguments: 4 | - `output`: path to directory where all the results will be saved 5 | - `config`: a config `.txt` file with all the experiment parameters (examples are provided) 6 | The script has to be run from this directory, i.e. `mjrl/examples` 7 | 8 | 1. To train an NPG agent on a task shipped with `mjrl` (e.g. 
swimmer) 9 | ``` 10 | $ python policy_opt_job_script.py --output swimmer_npg_exp --config example_configs/swimmer_npg.txt 11 | ``` 12 | 13 | 2. To train an NPG agent on an OpenAI gym benchmark task (e.g. hopper) 14 | ``` 15 | $ python policy_opt_job_script.py --output hopper_npg_exp --config example_configs/hopper_npg.txt 16 | ``` 17 | Note that since the Hopper env has termination conditions, we pick the sampling mode in the config to be `samples` rather than trajectories, so that per update we have 10K samples. 18 | 19 | 3. To train a PPO agent on the swimmer task 20 | ``` 21 | $ python policy_opt_job_script.py --output swimmer_ppo_exp --config example_configs/swimmer_ppo.txt 22 | ``` -------------------------------------------------------------------------------- /mjrl/examples/behavior_clone.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 4 | from mjrl.baselines.mlp_baseline import MLPBaseline 5 | from mjrl.algos.npg_cg import NPG 6 | from mjrl.algos.behavior_cloning import BC 7 | from mjrl.utils.train_agent import train_agent 8 | from mjrl.samplers.core import sample_paths 9 | import mjrl.envs 10 | import time as timer 11 | import pickle 12 | SEED = 500 13 | 14 | # ------------------------------ 15 | # Train expert policy first 16 | e = GymEnv('mjrl_swimmer-v0') 17 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 18 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=5, learn_rate=1e-3) 19 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 20 | 21 | ts = timer.time() 22 | print("========================================") 23 | print("Training expert policy") 24 | print("========================================") 25 | train_agent(job_name='swimmer_exp1', 26 | agent=agent, 27 | seed=SEED, 28 | niter=50, 29 | gamma=0.995, 30 | gae_lambda=0.97, 31 | num_cpu=1, 32 | sample_mode='trajectories', 33 | num_traj=10, 34 | save_freq=5, 35 | evaluation_rollouts=None) 36 | print("========================================") 37 | print("Expert policy training complete !!!") 38 | print("========================================") 39 | print("time taken = %f" % (timer.time()-ts)) 40 | print("========================================") 41 | 42 | # ------------------------------ 43 | # Get demonstrations 44 | print("========================================") 45 | print("Collecting expert demonstrations") 46 | print("========================================") 47 | expert_pol = pickle.load(open('swimmer_exp1/iterations/best_policy.pickle', 'rb')) 48 | demo_paths = sample_paths(num_traj=5, policy=expert_pol, env=e.env_id) 49 | 50 | # ------------------------------ 51 | # Train BC 52 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 53 | bc_agent = BC(demo_paths, policy=policy, epochs=20, batch_size=64, lr=1e-3) # will use Adam by default 54 | ts = timer.time() 55 | print("========================================") 56 | print("Running BC with expert demonstrations") 57 | print("========================================") 58 | bc_agent.train() 59 | print("========================================") 60 | print("BC training complete !!!") 61 | print("time taken = %f" % (timer.time()-ts)) 62 | print("========================================") 63 | 64 | # ------------------------------ 65 | # Evaluate Policies 66 | bc_pol_score = e.evaluate_policy(policy, 
num_episodes=5, mean_action=True) 67 | expert_score = e.evaluate_policy(expert_pol, num_episodes=5, mean_action=True) 68 | print("Expert policy performance (eval mode) = %f" % expert_score[0][0]) 69 | print("BC policy performance (eval mode) = %f" % bc_pol_score[0][0]) 70 | -------------------------------------------------------------------------------- /mjrl/examples/example_configs/hopper_npg.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'Hopper-v3', 6 | 'algorithm' : 'NPG', 7 | 'seed' : 123, 8 | 'sample_mode' : 'samples', 9 | 'rl_num_samples' : 10000, 10 | 'rl_num_iter' : 100, 11 | 'num_cpu' : 1, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with NPG on the OpenAI gym Hopper-v3 task.', 15 | 16 | # RL parameters (all params related to PG, value function etc.) 17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.05, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(), 31 | 32 | } 33 | 34 | -------------------------------------------------------------------------------- /mjrl/examples/example_configs/swimmer_npg.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'mjrl_swimmer-v0', 6 | 'algorithm' : 'NPG', 7 | 'seed' : 123, 8 | 'sample_mode' : 'trajectories', 9 | 'rl_num_traj' : 10, 10 | 'rl_num_iter' : 50, 11 | 'num_cpu' : 2, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with NPG on the mjrl swimmer task.', 15 | 16 | # RL parameters (all params related to PG, value function, DAPG etc.) 17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.1, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(), 31 | 32 | } -------------------------------------------------------------------------------- /mjrl/examples/example_configs/swimmer_ppo.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'mjrl_swimmer-v0', 6 | 'algorithm' : 'PPO', 7 | 'seed' : 123, 8 | 'sample_mode' : 'trajectories', 9 | 'rl_num_traj' : 10, 10 | 'rl_num_iter' : 50, 11 | 'num_cpu' : 2, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with PPO on the mjrl swimmer task.', 15 | 16 | # RL parameters (all params related to PG, value function, DAPG etc.) 
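# Quick reference for how examples/policy_opt_job_script.py consumes the keys below:
#   'policy_size'        : hidden layer sizes of the Gaussian MLP policy
#   'init_log_std'       : initial log standard deviation of the policy
#   'vf_*'               : MLP value-function baseline settings (hidden sizes, minibatch size, epochs, learning rate)
#   'rl_gamma', 'rl_gae' : discount factor and GAE lambda passed to train_agent
#   'rl_step_size'       : step size for NPG/VPG/NVPG; for PPO the optimizer settings come from 'alg_hyper_params' below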
17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.1, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(clip_coef=0.2, epochs=10, mb_size=64, learn_rate=5e-4), 31 | 32 | } -------------------------------------------------------------------------------- /mjrl/examples/linear_nn_comparison.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.policies.gaussian_linear import LinearPolicy 4 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 5 | from mjrl.baselines.mlp_baseline import MLPBaseline 6 | from mjrl.algos.npg_cg import NPG 7 | from mjrl.utils.train_agent import train_agent 8 | import mjrl.envs 9 | import time as timer 10 | SEED = 500 11 | 12 | # NN policy 13 | # ================================== 14 | e = GymEnv('mjrl_swimmer-v0') 15 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 16 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) 17 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 18 | 19 | ts = timer.time() 20 | train_agent(job_name='swimmer_nn_exp1', 21 | agent=agent, 22 | seed=SEED, 23 | niter=50, 24 | gamma=0.995, 25 | gae_lambda=0.97, 26 | num_cpu=1, 27 | sample_mode='trajectories', 28 | num_traj=10, 29 | save_freq=5, 30 | evaluation_rollouts=5) 31 | print("time taken for NN policy training = %f" % (timer.time()-ts)) 32 | 33 | 34 | # Linear policy 35 | # ================================== 36 | e = GymEnv('mjrl_swimmer-v0') 37 | policy = LinearPolicy(e.spec, seed=SEED) 38 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) 39 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 40 | 41 | ts = timer.time() 42 | train_agent(job_name='swimmer_linear_exp1', 43 | agent=agent, 44 | seed=SEED, 45 | niter=50, 46 | gamma=0.995, 47 | gae_lambda=0.97, 48 | num_cpu=1, 49 | sample_mode='trajectories', 50 | num_traj=10, 51 | save_freq=5, 52 | evaluation_rollouts=5) 53 | print("time taken for linear policy training = %f" % (timer.time()-ts)) 54 | -------------------------------------------------------------------------------- /mjrl/examples/policy_opt_job_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a job script for running policy gradient algorithms on gym tasks. 
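Example invocation (run from the mjrl/examples directory, as shown in examples/README.md):
    $ python policy_opt_job_script.py --output swimmer_npg_exp --config example_configs/swimmer_npg.txt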
3 | Separate job scripts are provided to run a few other algorithms: 4 | - For DAPG see here: https://github.com/aravindr93/hand_dapg/tree/master/dapg/examples 5 | - For model-based NPG see here: https://github.com/aravindr93/mjrl/tree/master/mjrl/algos/model_accel 6 | """ 7 | 8 | from mjrl.utils.gym_env import GymEnv 9 | from mjrl.policies.gaussian_mlp import MLP 10 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 11 | from mjrl.baselines.mlp_baseline import MLPBaseline 12 | from mjrl.algos.npg_cg import NPG 13 | from mjrl.algos.batch_reinforce import BatchREINFORCE 14 | from mjrl.algos.ppo_clip import PPO 15 | from mjrl.utils.train_agent import train_agent 16 | import os 17 | import json 18 | import gym 19 | import mjrl.envs 20 | import time as timer 21 | import pickle 22 | import argparse 23 | 24 | # =============================================================================== 25 | # Get command line arguments 26 | # =============================================================================== 27 | 28 | parser = argparse.ArgumentParser(description='Natural policy gradient from mjrl on mujoco environments') 29 | parser.add_argument('--output', type=str, required=True, help='location to store results') 30 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params') 31 | 32 | args = parser.parse_args() 33 | JOB_DIR = args.output 34 | if not os.path.exists(JOB_DIR): 35 | os.mkdir(JOB_DIR) 36 | with open(args.config, 'r') as f: 37 | job_data = eval(f.read()) 38 | assert 'algorithm' in job_data.keys() 39 | assert any([job_data['algorithm'] == a for a in ['NPG', 'NVPG', 'VPG', 'PPO']]) 40 | assert 'sample_mode' in job_data.keys() 41 | job_data['alg_hyper_params'] = dict() if 'alg_hyper_params' not in job_data.keys() else job_data['alg_hyper_params'] 42 | 43 | EXP_FILE = JOB_DIR + '/job_config.json' 44 | with open(EXP_FILE, 'w') as f: 45 | json.dump(job_data, f, indent=4) 46 | 47 | if job_data['sample_mode'] == 'trajectories': 48 | assert 'rl_num_traj' in job_data.keys() 49 | job_data['rl_num_samples'] = 0 # will be ignored 50 | elif job_data['sample_mode'] == 'samples': 51 | assert 'rl_num_samples' in job_data.keys() 52 | job_data['rl_num_traj'] = 0 # will be ignored 53 | else: 54 | print("Unknown sampling mode. 
Choose either trajectories or samples") 55 | exit() 56 | 57 | # =============================================================================== 58 | # Train Loop 59 | # =============================================================================== 60 | 61 | e = GymEnv(job_data['env']) 62 | policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=job_data['seed'], init_log_std=job_data['init_log_std']) 63 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=job_data['vf_batch_size'], hidden_sizes=job_data['vf_hidden_size'], 64 | epochs=job_data['vf_epochs'], learn_rate=job_data['vf_learn_rate']) 65 | 66 | # Construct the algorithm 67 | if job_data['algorithm'] == 'NPG': 68 | # Other hyperparameters (like number of CG steps) can be specified in config for pass through 69 | # or default hyperparameters will be used 70 | agent = NPG(e, policy, baseline, normalized_step_size=job_data['rl_step_size'], 71 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 72 | 73 | elif job_data['algorithm'] == 'VPG': 74 | agent = BatchREINFORCE(e, policy, baseline, learn_rate=job_data['rl_step_size'], 75 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 76 | 77 | elif job_data['algorithm'] == 'NVPG': 78 | agent = BatchREINFORCE(e, policy, baseline, desired_kl=job_data['rl_step_size'], 79 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 80 | 81 | elif job_data['algorithm'] == 'PPO': 82 | # There are many hyperparameters for PPO. They can be specified in config for pass through 83 | # or defaults in the PPO algorithm will be used 84 | agent = PPO(e, policy, baseline, save_logs=True, **job_data['alg_hyper_params']) 85 | 86 | print("========================================") 87 | print("Starting policy learning") 88 | print("========================================") 89 | 90 | ts = timer.time() 91 | train_agent(job_name=JOB_DIR, 92 | agent=agent, 93 | seed=job_data['seed'], 94 | niter=job_data['rl_num_iter'], 95 | gamma=job_data['rl_gamma'], 96 | gae_lambda=job_data['rl_gae'], 97 | num_cpu=job_data['num_cpu'], 98 | sample_mode=job_data['sample_mode'], 99 | num_traj=job_data['rl_num_traj'], 100 | num_samples=job_data['rl_num_samples'], 101 | save_freq=job_data['save_freq'], 102 | evaluation_rollouts=job_data['eval_rollouts']) 103 | print("time taken = %f" % (timer.time()-ts)) 104 | -------------------------------------------------------------------------------- /mjrl/mjrl/__init__.py: -------------------------------------------------------------------------------- 1 | import mjrl.envs -------------------------------------------------------------------------------- /mjrl/mjrl/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/algos/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/algos/behavior_cloning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Minimize bc loss (MLE, MSE, RWR etc.) 
with pytorch optimizers 3 | """ 4 | 5 | import logging 6 | #logging.disable(logging.CRITICAL) 7 | import numpy as np 8 | import time as timer 9 | import torch 10 | from torch.autograd import Variable 11 | from mjrl.utils.logger import DataLog 12 | from tqdm import tqdm 13 | 14 | 15 | class BC: 16 | def __init__(self, expert_paths, 17 | policy, 18 | epochs = 5, 19 | batch_size = 64, 20 | lr = 1e-3, 21 | optimizer = None, 22 | loss_type = 'MSE', # can be 'MLE' or 'MSE' 23 | save_logs = True, 24 | set_transforms = False, 25 | **kwargs, 26 | ): 27 | 28 | self.policy = policy 29 | self.expert_paths = expert_paths 30 | self.epochs = epochs 31 | self.mb_size = batch_size 32 | self.logger = DataLog() 33 | self.loss_type = loss_type 34 | self.save_logs = save_logs 35 | 36 | if set_transforms: 37 | in_shift, in_scale, out_shift, out_scale = self.compute_transformations() 38 | self.set_transformations(in_shift, in_scale, out_shift, out_scale) 39 | self.set_variance_with_data(out_scale) 40 | 41 | # construct optimizer 42 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=lr) if optimizer is None else optimizer 43 | 44 | # Loss criterion if required 45 | if loss_type == 'MSE': 46 | self.loss_criterion = torch.nn.MSELoss() 47 | 48 | # make logger 49 | if self.save_logs: 50 | self.logger = DataLog() 51 | 52 | def compute_transformations(self): 53 | # get transformations 54 | if self.expert_paths == [] or self.expert_paths is None: 55 | in_shift, in_scale, out_shift, out_scale = None, None, None, None 56 | else: 57 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 58 | actions = np.concatenate([path["actions"] for path in self.expert_paths]) 59 | in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0) 60 | out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0) 61 | return in_shift, in_scale, out_shift, out_scale 62 | 63 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None): 64 | # set scalings in the target policy 65 | self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale) 66 | self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale) 67 | 68 | def set_variance_with_data(self, out_scale): 69 | # set the variance of gaussian policy based on out_scale 70 | params = self.policy.get_param_values() 71 | params[-self.policy.m:] = np.log(out_scale + 1e-12) 72 | self.policy.set_param_values(params) 73 | 74 | def loss(self, data, idx=None): 75 | if self.loss_type == 'MLE': 76 | return self.mle_loss(data, idx) 77 | elif self.loss_type == 'MSE': 78 | return self.mse_loss(data, idx) 79 | else: 80 | print("Please use valid loss type") 81 | return None 82 | 83 | def mle_loss(self, data, idx): 84 | # use indices if provided (e.g. 
for mini-batching) 85 | # otherwise, use all the data 86 | idx = range(data['observations'].shape[0]) if idx is None else idx 87 | if type(data['observations']) == torch.Tensor: 88 | idx = torch.LongTensor(idx) 89 | obs = data['observations'][idx] 90 | act = data['expert_actions'][idx] 91 | LL, mu, log_std = self.policy.new_dist_info(obs, act) 92 | # minimize negative log likelihood 93 | return -torch.mean(LL) 94 | 95 | def mse_loss(self, data, idx=None): 96 | idx = range(data['observations'].shape[0]) if idx is None else idx 97 | if type(data['observations']) is torch.Tensor: 98 | idx = torch.LongTensor(idx) 99 | obs = data['observations'][idx] 100 | act_expert = data['expert_actions'][idx] 101 | if type(data['observations']) is not torch.Tensor: 102 | obs = Variable(torch.from_numpy(obs).float(), requires_grad=False) 103 | act_expert = Variable(torch.from_numpy(act_expert).float(), requires_grad=False) 104 | act_pi = self.policy.model(obs) 105 | return self.loss_criterion(act_pi, act_expert.detach()) 106 | 107 | def fit(self, data, suppress_fit_tqdm=False, **kwargs): 108 | # data is a dict 109 | # keys should have "observations" and "expert_actions" 110 | validate_keys = all([k in data.keys() for k in ["observations", "expert_actions"]]) 111 | assert validate_keys is True 112 | ts = timer.time() 113 | num_samples = data["observations"].shape[0] 114 | 115 | # log stats before 116 | if self.save_logs: 117 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0] 118 | self.logger.log_kv('loss_before', loss_val) 119 | 120 | # train loop 121 | for ep in config_tqdm(range(self.epochs), suppress_fit_tqdm): 122 | for mb in range(int(num_samples / self.mb_size)): 123 | rand_idx = np.random.choice(num_samples, size=self.mb_size) 124 | self.optimizer.zero_grad() 125 | loss = self.loss(data, idx=rand_idx) 126 | loss.backward() 127 | self.optimizer.step() 128 | params_after_opt = self.policy.get_param_values() 129 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True) 130 | 131 | # log stats after 132 | if self.save_logs: 133 | self.logger.log_kv('epoch', self.epochs) 134 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0] 135 | self.logger.log_kv('loss_after', loss_val) 136 | self.logger.log_kv('time', (timer.time()-ts)) 137 | 138 | def train(self, **kwargs): 139 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 140 | expert_actions = np.concatenate([path["actions"] for path in self.expert_paths]) 141 | data = dict(observations=observations, expert_actions=expert_actions) 142 | self.fit(data, **kwargs) 143 | 144 | 145 | def config_tqdm(range_inp, suppress_tqdm=False): 146 | if suppress_tqdm: 147 | return range_inp 148 | else: 149 | return tqdm(range_inp) 150 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/dapg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | 16 | # utility functions 17 | import mjrl.utils.process_samples as process_samples 18 | from mjrl.utils.logger import DataLog 19 | from mjrl.utils.cg_solve import cg_solve 20 | 21 | # Import Algs 22 | 
from mjrl.algos.npg_cg import NPG 23 | from mjrl.algos.behavior_cloning import BC 24 | 25 | class DAPG(NPG): 26 | def __init__(self, env, policy, baseline, 27 | demo_paths=None, 28 | normalized_step_size=0.01, 29 | FIM_invert_args={'iters': 10, 'damping': 1e-4}, 30 | hvp_sample_frac=1.0, 31 | seed=123, 32 | save_logs=False, 33 | kl_dist=None, 34 | lam_0=1.0, # demo coef 35 | lam_1=0.95, # decay coef 36 | **kwargs, 37 | ): 38 | 39 | self.env = env 40 | self.policy = policy 41 | self.baseline = baseline 42 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size 43 | self.seed = seed 44 | self.save_logs = save_logs 45 | self.FIM_invert_args = FIM_invert_args 46 | self.hvp_subsample = hvp_sample_frac 47 | self.running_score = None 48 | self.demo_paths = demo_paths 49 | self.lam_0 = lam_0 50 | self.lam_1 = lam_1 51 | self.iter_count = 0.0 52 | if save_logs: self.logger = DataLog() 53 | 54 | def train_from_paths(self, paths): 55 | 56 | # Concatenate from all the trajectories 57 | observations = np.concatenate([path["observations"] for path in paths]) 58 | actions = np.concatenate([path["actions"] for path in paths]) 59 | advantages = np.concatenate([path["advantages"] for path in paths]) 60 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 61 | 62 | if self.demo_paths is not None and self.lam_0 > 0.0: 63 | demo_obs = np.concatenate([path["observations"] for path in self.demo_paths]) 64 | demo_act = np.concatenate([path["actions"] for path in self.demo_paths]) 65 | demo_adv = self.lam_0 * (self.lam_1 ** self.iter_count) * np.ones(demo_obs.shape[0]) 66 | self.iter_count += 1 67 | # concatenate all 68 | all_obs = np.concatenate([observations, demo_obs]) 69 | all_act = np.concatenate([actions, demo_act]) 70 | all_adv = 1e-2*np.concatenate([advantages/(np.std(advantages) + 1e-8), demo_adv]) 71 | else: 72 | all_obs = observations 73 | all_act = actions 74 | all_adv = advantages 75 | 76 | # cache return distributions for the paths 77 | path_returns = [sum(p["rewards"]) for p in paths] 78 | mean_return = np.mean(path_returns) 79 | std_return = np.std(path_returns) 80 | min_return = np.amin(path_returns) 81 | max_return = np.amax(path_returns) 82 | base_stats = [mean_return, std_return, min_return, max_return] 83 | self.running_score = mean_return if self.running_score is None else \ 84 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 85 | if self.save_logs: self.log_rollout_statistics(paths) 86 | 87 | # Keep track of times for various computations 88 | t_gLL = 0.0 89 | t_FIM = 0.0 90 | 91 | # Optimization algorithm 92 | # -------------------------- 93 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 94 | 95 | # DAPG 96 | ts = timer.time() 97 | sample_coef = all_adv.shape[0]/advantages.shape[0] 98 | dapg_grad = sample_coef*self.flat_vpg(all_obs, all_act, all_adv) 99 | t_gLL += timer.time() - ts 100 | 101 | # NPG 102 | ts = timer.time() 103 | hvp = self.build_Hvp_eval([observations, actions], 104 | regu_coef=self.FIM_invert_args['damping']) 105 | npg_grad = cg_solve(hvp, dapg_grad, x_0=dapg_grad.copy(), 106 | cg_iters=self.FIM_invert_args['iters']) 107 | t_FIM += timer.time() - ts 108 | 109 | # Step size computation 110 | # -------------------------- 111 | n_step_size = 2.0*self.kl_dist 112 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(dapg_grad.T, npg_grad) + 1e-20))) 113 | 114 | # Policy update 115 | # -------------------------- 116 | curr_params = self.policy.get_param_values() 117 
| new_params = curr_params + alpha * npg_grad 118 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 119 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 120 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 121 | self.policy.set_param_values(new_params, set_new=True, set_old=True) 122 | 123 | # Log information 124 | if self.save_logs: 125 | self.logger.log_kv('alpha', alpha) 126 | self.logger.log_kv('delta', n_step_size) 127 | self.logger.log_kv('time_vpg', t_gLL) 128 | self.logger.log_kv('time_npg', t_FIM) 129 | self.logger.log_kv('kl_dist', kl_dist) 130 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 131 | self.logger.log_kv('running_score', self.running_score) 132 | try: 133 | self.env.env.env.evaluate_success(paths, self.logger) 134 | except: 135 | # nested logic for backwards compatibility. TODO: clean this up. 136 | try: 137 | success_rate = self.env.env.env.evaluate_success(paths) 138 | self.logger.log_kv('success_rate', success_rate) 139 | except: 140 | pass 141 | return base_stats 142 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/mbac.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import time as timer 5 | import torch 6 | import torch.nn as nn 7 | from torch.autograd import Variable 8 | from mjrl.utils.logger import DataLog 9 | from tqdm import tqdm 10 | from mjrl.utils.gym_env import GymEnv 11 | from mjrl.policies.mpc_actor import MPCActor 12 | from mjrl.algos.behavior_cloning import BC 13 | 14 | 15 | class MBAC(BC): 16 | def __init__(self, 17 | env_name, 18 | policy, 19 | expert_paths = None, # for the initial seeding 20 | epochs = 5, 21 | batch_size = 64, 22 | lr = 1e-3, 23 | optimizer = None, 24 | loss_type = 'MSE', # can be 'MLE' or 'MSE' 25 | seed = 123, 26 | buffer_size = 50, # measured in number of trajectories 27 | mpc_params = None, 28 | save_logs = True, 29 | ): 30 | 31 | super().__init__(expert_paths=expert_paths, 32 | policy=policy, 33 | epochs=epochs, 34 | batch_size=batch_size, 35 | lr=lr, 36 | optimizer=optimizer, 37 | loss_type=loss_type, 38 | save_logs=save_logs, 39 | ) 40 | self.expert_paths = [] if self.expert_paths is None else self.expert_paths 41 | self.buffer_size = buffer_size 42 | 43 | # For the MPC policy 44 | self.env = GymEnv(env_name) 45 | self.env.reset(seed=seed) 46 | if mpc_params is None: 47 | mean = np.zeros(self.env.action_dim) 48 | sigma = 1.0 * np.ones(self.env.action_dim) 49 | filter_coefs = [sigma, 0.05, 0.0, 0.0] 50 | mpc_params = dict(env=GymEnv(env_name), H=10, 51 | paths_per_cpu=25, num_cpu=1, 52 | kappa=10.0, gamma=1.0, 53 | mean=mean, filter_coefs=filter_coefs, 54 | seed=seed) 55 | else: 56 | mpc_params['env'] = GymEnv(env_name) 57 | mpc_params['seed'] = seed 58 | 59 | self.mpc_params = mpc_params 60 | self.mpc_policy = MPCActor(**mpc_params) 61 | 62 | def collect_paths(self, num_traj=10, 63 | mode='policy', 64 | horizon=None, 65 | render=False 66 | ): 67 | horizon = self.env.horizon if horizon is None else horizon 68 | paths = [] 69 | for i in tqdm(range(num_traj)): 70 | self.env.reset() 71 | obs, act_pi, act_mpc, rew, states = [], [], [], [], [] 72 | for t in range(horizon): 73 | o = self.env.get_obs() 74 | s = self.env.get_env_state() 75 | a_pi = self.policy.get_action(o)[0] 76 | a_mpc = self.mpc_policy.get_action(s) 77 | a = a_pi if mode == 'policy' 
else a_mpc 78 | next_o, r, done, _ = self.env.step(a) 79 | if render: 80 | self.env.render() 81 | # store data 82 | obs.append(o) 83 | rew.append(r) 84 | states.append(s) 85 | act_pi.append(a_pi) 86 | act_mpc.append(a_mpc) 87 | # kill if done 88 | if done: 89 | break 90 | path = dict(observations=np.array(obs), 91 | actions=np.array(act_pi), 92 | expert_actions=np.array(act_mpc), 93 | rewards=np.array(rew), 94 | states=states, 95 | ) 96 | paths.append(path) 97 | return paths 98 | 99 | def add_paths_to_buffer(self, paths): 100 | for path in paths: 101 | self.expert_paths.append(path) 102 | if len(self.expert_paths) > self.buffer_size: 103 | # keep recent trajectories 104 | # TODO: Also consider keeping best performing trajectories 105 | self.expert_paths = self.expert_paths[-self.buffer_size:] 106 | if self.save_logs: 107 | self.logger.log_kv('buffer_size', len(self.expert_paths)) 108 | 109 | def get_data_from_buffer(self): 110 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 111 | expert_actions = np.concatenate([path["expert_actions"] for path in self.expert_paths]) 112 | observations = torch.Tensor(observations).float() 113 | expert_actions = torch.Tensor(expert_actions).float() 114 | data = dict(observations=observations, expert_actions=expert_actions) 115 | return data 116 | 117 | def train_step(self, num_traj=10, **kwargs): 118 | # collect data using policy actions 119 | # fit policy to expert actions on these states 120 | new_paths = self.collect_paths(num_traj, mode='policy') 121 | self.add_paths_to_buffer(new_paths) 122 | data = self.get_data_from_buffer() 123 | self.fit(data, **kwargs) 124 | stoc_pol_perf = np.mean([np.sum(path['rewards']) for path in new_paths]) 125 | return stoc_pol_perf -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/algos/model_accel/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/model_learning_mpc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mjrl.algos.model_accel.sampling import generate_paths, generate_perturbed_actions, trajectory_rollout 3 | 4 | 5 | class MPCPolicy(object): 6 | def __init__(self, env, 7 | plan_horizon, 8 | plan_paths=10, 9 | kappa=1.0, 10 | gamma=1.0, 11 | mean=None, 12 | filter_coefs=None, 13 | seed=123, 14 | warmstart=True, 15 | fitted_model=None, 16 | omega=5.0, 17 | **kwargs, 18 | ): 19 | 20 | # initialize 21 | self.env, self.seed = env, seed 22 | self.n, self.m = env.observation_dim, env.action_dim 23 | self.plan_horizon, self.num_traj = plan_horizon, plan_paths 24 | 25 | if fitted_model is None: 26 | print("Policy requires a fitted dynamics model") 27 | quit() 28 | else: 29 | self.fitted_model = fitted_model 30 | 31 | # initialize other params 32 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma 33 | if mean is None: 34 | self.mean = np.zeros(self.m) 35 | if filter_coefs is None: 36 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0] 37 | self.act_sequence = np.ones((self.plan_horizon, self.m)) * self.mean 38 | self.init_act_sequence = self.act_sequence.copy() 39 | self.warmstart = warmstart 40 | self.omega = omega 41 | 42 | def get_action(self, obs): 43 | # 
generate paths 44 | if type(self.fitted_model) == list: 45 | 46 | # Ensemble case 47 | # Collect trajectories from different models with same action sequences 48 | base_act = self.act_sequence 49 | act_list = [generate_perturbed_actions(base_act, self.filter_coefs) 50 | for _ in range(self.num_traj)] 51 | actions = np.array(act_list) 52 | paths_list = [] 53 | for model in self.fitted_model: 54 | paths = trajectory_rollout(actions, model, obs) 55 | self.env.env.env.compute_path_rewards(paths) 56 | paths_list.append(paths) 57 | # consolidate paths 58 | paths = dict() 59 | for k in paths_list[0].keys(): 60 | v = np.vstack([p[k] for p in paths_list]) 61 | paths[k] = v 62 | R = self.score_trajectory_ensemble(paths, paths_list) 63 | 64 | else: 65 | paths = generate_paths(num_traj=self.num_traj, fitted_model=self.fitted_model, 66 | start_state=obs, base_act=self.act_sequence, filter_coefs=self.filter_coefs) 67 | self.env.env.env.compute_path_rewards(paths) # will populate path['rewards'] 68 | R = self.score_trajectory(paths) 69 | 70 | S = np.exp(self.kappa * (R - np.max(R))) 71 | act = paths["actions"] 72 | 73 | weighted_seq = S * act.T 74 | act_sequence = np.sum(weighted_seq.T, axis=0) / (np.sum(S) + 1e-6) 75 | action = act_sequence[0].copy() 76 | 77 | # get updated action sequence 78 | if self.warmstart: 79 | self.act_sequence[:-1] = act_sequence[1:] 80 | self.act_sequence[-1] = self.mean.copy() 81 | else: 82 | self.act_sequence = self.init_act_sequence.copy() 83 | return action 84 | 85 | def score_trajectory_ensemble(self, paths, paths_list): 86 | num_traj = self.num_traj 87 | num_models = len(paths_list) 88 | total_traj = paths['rewards'].shape[0] 89 | horizon = paths['rewards'].shape[1] 90 | predictions = [p['observations'] for p in paths_list] 91 | disagreement = np.std(predictions, axis=0) # (num_traj, horizon, state_dim) 92 | disagreement = np.sum(disagreement, axis=(1,2)) # (num_traj,) 93 | scores = np.zeros(total_traj) 94 | for i in range(total_traj): 95 | disagreement_score = disagreement[i // self.num_traj] 96 | scores[i] = self.omega * disagreement_score 97 | for t in range(horizon): 98 | scores[i] += (self.gamma ** t) * paths["rewards"][i][t] 99 | return scores 100 | 101 | def score_trajectory(self, paths): 102 | # rewards shape: (num_traj, horizon) 103 | num_traj = paths["rewards"].shape[0] 104 | horizon = paths["rewards"].shape[1] 105 | scores = np.zeros(num_traj) 106 | for i in range(num_traj): 107 | scores[i] = 0.0 108 | for t in range(horizon): 109 | scores[i] += (self.gamma**t)*paths["rewards"][i][t] 110 | return scores 111 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/configs/point_mass.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_point_mass-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 5, 9 | 'iter_samples' : 100, 10 | 'eval_rollouts' : 25, 11 | 'num_models' : 3, 12 | 'exp_notes' : 'Toy experiment for initial trial.', 13 | 'save_freq' : 1, 14 | 'device' : 'cpu', 15 | 'learn_reward' : False, 16 | 'reward_file' : 'utils/reward_functions/mjrl_point_mass.py', 17 | 18 | # dynamics learning 19 | 20 | 'hidden_size' : (256, 256), 21 | 'activation' : 'relu', 22 | 'fit_lr' : 1e-3, 23 | 'fit_wd' : 1e-5, 24 | 'buffer_size' : 10000, 25 | 'fit_mb_size' : 16, 26 | 'fit_epochs' : 25, 27 | 'refresh_fit' : False, 28 | 29 | # initial data 30 | 31 | 'init_log_std' : -0.5, 32 | 'min_log_std' : 
-2.0, 33 | 'init_samples' : 1000, 34 | 35 | # NPG params 36 | 37 | 'policy_size' : (32, 32), 38 | 'inner_steps' : 10, 39 | 'step_size' : 0.05, 40 | 'update_paths' : 250, 41 | 'start_state' : 'init', 42 | 'horizon' : 25, 43 | 44 | } 45 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/configs/reacher.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_reacher_7dof-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 25, 9 | 'iter_samples' : 500, 10 | 'eval_rollouts' : 10, 11 | 'num_models' : 4, 12 | 'save_freq' : 1, 13 | 'device' : 'cpu', 14 | 15 | # dynamics learning 16 | 17 | 'hidden_size' : (256, 256), 18 | 'activation' : 'relu', 19 | 'fit_lr' : 1e-3, 20 | 'fit_wd' : 0.0, 21 | 'buffer_size' : 20000, 22 | 'fit_mb_size' : 64, 23 | 'fit_epochs' : 20, 24 | 'refresh_fit' : False, 25 | 26 | # initial data 27 | 28 | 'init_log_std' : -0.5, 29 | 'min_log_std' : -2.5, 30 | 'init_samples' : 2500, 31 | 'init_policy' : None, 32 | 33 | 34 | # NPG params 35 | 36 | 'policy_size' : (64, 64), 37 | 'inner_steps' : 5, 38 | 'step_size' : 0.05, 39 | 'update_paths' : 250, 40 | 'start_state' : 'init', 41 | 'horizon' : 50, 42 | 43 | } -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/sandbox/example_config_mpc.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_point_mass-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 5, 9 | 'paths_per_iter': 5, 10 | 'eval_rollouts' : 10, 11 | 'num_models' : 3, 12 | 'exp_notes' : 'Toy experiment for initial trial.', 13 | 'save_freq' : 5, 14 | 'device' : 'cpu', 15 | 16 | # dynamics learning 17 | 18 | 'hidden_size' : (64, 64), 19 | 'activation' : 'relu', 20 | 'fit_lr' : 1e-3, 21 | 'fit_wd' : 1e-5, 22 | 'max_paths' : 1000, 23 | 'fit_mb_size' : 16, 24 | 'fit_epochs' : 25, 25 | 'refresh_fit' : True, 26 | 27 | # initial data 28 | 29 | 'init_log_std' : -0.5, 30 | 'n_init_paths' : 25, 31 | 'use_demos' : False, 32 | 'demo_file' : None, 33 | 34 | # model predictive control 35 | 36 | 'noisy_mpc' : True, # when collecting data for exploration 37 | 'noise_level' : 0.1, 38 | 'filter_coefs' : {'f1': 0.5, 'f2': 1.0, 'f3': 0.0, 'f4': 0.0}, 39 | 'plan_paths' : 200, 40 | 'plan_horizon' : 10, 41 | 'kappa' : 2.0, 42 | 'omega' : 0.0, 43 | 44 | } 45 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/sandbox/run_model_learning_mpc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Job script to optimize trajectories with fitted model 3 | """ 4 | 5 | import numpy as np 6 | import copy 7 | import torch 8 | import torch.nn as nn 9 | import pickle 10 | import mjrl.envs 11 | import time as timer 12 | import argparse 13 | import os 14 | import json 15 | import mjrl.samplers.core as trajectory_sampler 16 | import mjrl.utils.tensor_utils as tensor_utils 17 | from tqdm import tqdm 18 | from tabulate import tabulate 19 | from mjrl.policies.gaussian_mlp import MLP 20 | from mjrl.baselines.mlp_baseline import MLPBaseline 21 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 22 | from mjrl.utils.gym_env import GymEnv 23 | from mjrl.utils.logger import DataLog 24 | from mjrl.utils.make_train_plots import 
make_train_plots 25 | from mjrl.algos.model_accel.nn_dynamics import DynamicsModel 26 | from mjrl.algos.model_accel.model_learning_mpc import MPCPolicy 27 | from mjrl.algos.model_accel.sampling import sample_paths, evaluate_policy 28 | 29 | 30 | # =============================================================================== 31 | # Get command line arguments 32 | # =============================================================================== 33 | 34 | parser = argparse.ArgumentParser(description='Trajectory Optimization with fitted models.') 35 | parser.add_argument('--output', type=str, required=True, help='location to store results') 36 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params') 37 | args = parser.parse_args() 38 | OUT_DIR = args.output 39 | if not os.path.exists(OUT_DIR): 40 | os.mkdir(OUT_DIR) 41 | with open(args.config, 'r') as f: 42 | job_data = eval(f.read()) 43 | 44 | # Unpack args and make files for easy access 45 | logger = DataLog() 46 | ENV_NAME = job_data['env_name'] 47 | PICKLE_FILE = OUT_DIR + '/exp_results.pickle' 48 | EXP_FILE = OUT_DIR + '/job_data.json' 49 | SEED = job_data['seed'] 50 | job_data['filter_coefs'] = [job_data['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']] 51 | 52 | # base cases 53 | if 'num_models' not in job_data.keys(): 54 | job_data['num_models'] = 1 55 | if job_data['num_models'] == 1 or 'omega' not in job_data.keys(): 56 | job_data['omega'] = 0.0 57 | if 'eval_rollouts' not in job_data.keys(): 58 | job_data['eval_rollouts'] = 0 59 | if 'save_freq' not in job_data.keys(): 60 | job_data['save_freq'] = 10 61 | if 'device' not in job_data.keys(): 62 | job_data['device'] = 'cpu' 63 | if 'debug_mode' in job_data.keys(): 64 | DEBUG = job_data['debug_mode'] 65 | else: 66 | DEBUG =False 67 | if 'device_path' not in job_data.keys(): 68 | job_data['device_path'] = None 69 | with open(EXP_FILE, 'w') as f: 70 | json.dump(job_data, f, indent=4) 71 | 72 | del(job_data['seed']) 73 | job_data['base_seed'] = SEED 74 | 75 | # =============================================================================== 76 | # Train loop 77 | # =============================================================================== 78 | 79 | np.random.seed(SEED) 80 | torch.random.manual_seed(SEED) 81 | 82 | # TODO(Aravind): Map to hardware if device_path is specified 83 | 84 | e = GymEnv(ENV_NAME) 85 | e.set_seed(SEED) 86 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+i, **job_data) 87 | for i in range(job_data['num_models'])] 88 | exploratory_policy = MLP(e.spec, seed=SEED, init_log_std=job_data['init_log_std']) 89 | paths = [] 90 | 91 | for outer_iter in range(job_data['num_iter']): 92 | 93 | ts = timer.time() 94 | print("================> ITERATION : %i " % outer_iter) 95 | print("Getting interaction data from real dynamics ...") 96 | 97 | if outer_iter == 0: 98 | iter_paths = trajectory_sampler.sample_paths(job_data['n_init_paths'], e, 99 | exploratory_policy, 100 | eval_mode=False, base_seed=SEED) 101 | else: 102 | iter_paths = sample_paths(job_data['paths_per_iter'], 103 | mpc_policy.env, mpc_policy, 104 | eval_mode=(not job_data['noisy_mpc']), 105 | noise_level=job_data['noise_level'], 106 | base_seed=SEED + outer_iter) 107 | 108 | # reset the environment (good for hardware) 109 | e.reset() 110 | 111 | for p in iter_paths: 112 | paths.append(p) 113 | 114 | if len(paths) > job_data['max_paths']: 115 | diff = len(paths) - job_data['max_paths'] 116 | paths[:diff] = [] 117 | 118 | s = 
np.concatenate([p['observations'][:-1] for p in paths]) 119 | a = np.concatenate([p['actions'][:-1] for p in paths]) 120 | sp = np.concatenate([p['observations'][1:] for p in paths]) 121 | r = np.array([np.sum(p['rewards']) for p in iter_paths]) 122 | rollout_score = np.mean(r) 123 | 124 | logger.log_kv('fit_epochs', job_data['fit_epochs']) 125 | logger.log_kv('rollout_score', rollout_score) 126 | try: 127 | rollout_metric = e.env.env.evaluate_success(iter_paths) 128 | logger.log_kv('rollout_metric', rollout_metric) 129 | except: 130 | pass 131 | 132 | print("Data gathered, fitting model ...") 133 | if job_data['refresh_fit']: 134 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+123*outer_iter, 135 | **job_data) for i in range(job_data['num_models'])] 136 | 137 | for i, model in enumerate(models): 138 | epoch_loss = model.fit(s, a, sp, job_data['fit_mb_size'], job_data['fit_epochs']) 139 | logger.log_kv('loss_before_' + str(i), epoch_loss[0]) 140 | logger.log_kv('loss_after_' + str(i), epoch_loss[-1]) 141 | 142 | mpc_policy = MPCPolicy(env=e, fitted_model=models, seed=SEED+12345*outer_iter, **job_data) 143 | 144 | if job_data['eval_rollouts'] > 0: 145 | print("Performing validation rollouts ... ") 146 | eval_paths = evaluate_policy(mpc_policy.env, mpc_policy, mpc_policy.fitted_model[0], noise_level=0.0, 147 | real_step=True, num_episodes=job_data['eval_rollouts'], visualize=False) 148 | eval_score = np.mean([np.sum(p['rewards']) for p in eval_paths]) 149 | logger.log_kv('eval_score', eval_score) 150 | try: 151 | eval_metric = e.env.env.evaluate_success(eval_paths) 152 | logger.log_kv('eval_metric', eval_metric) 153 | except: 154 | pass 155 | else: 156 | eval_paths = [] 157 | 158 | exp_data = dict(policy=mpc_policy, fitted_model=mpc_policy.fitted_model, 159 | log=logger.log, rollout_paths=iter_paths, eval_paths=eval_paths) 160 | if outer_iter > 0 and outer_iter % job_data['save_freq'] == 0: 161 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb')) 162 | pickle.dump(exp_data, open(OUT_DIR + '/iteration_' + str(outer_iter) + '.pickle', 'wb')) 163 | 164 | tf = timer.time() 165 | logger.log_kv('iter_time', tf-ts) 166 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1, 167 | logger.get_current_log().items())) 168 | print(tabulate(print_data)) 169 | logger.save_log(OUT_DIR+'/') 170 | make_train_plots(log=logger.log, keys=['rollout_score', 'eval_score', 'rollout_metric', 'eval_metric'], 171 | save_loc=OUT_DIR+'/') 172 | 173 | if job_data['debug_mode']: 174 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], False, 5, True) 175 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], True, 5, True) 176 | 177 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb')) # final save -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/utils/reward_functions/mjrl_point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def reward_function(paths): 4 | # path has 
two keys: observations and actions 5 | # path["observations"] : (num_traj, horizon, obs_dim) 6 | # return paths that contain rewards in path["rewards"] 7 | # path["rewards"] should have shape (num_traj, horizon) 8 | obs = paths["observations"] 9 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 10 | agent_pos = obs[:, :, :2] 11 | target_pos = obs[:, :, -2:] 12 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1) 13 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1) 14 | rewards = -1.0 * l1_dist - 0.5 * l2_dist 15 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s') 16 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 17 | return paths 18 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/utils/visualize_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import mjrl.envs 3 | import trajopt.envs 4 | import mj_envs 5 | import click 6 | import os 7 | import gym 8 | import numpy as np 9 | import pickle 10 | import torch 11 | from mjrl.utils.gym_env import GymEnv 12 | from mjrl.policies.gaussian_mlp import MLP 13 | import trajopt.envs 14 | 15 | DESC = ''' 16 | Helper script to visualize policy (in mjrl format).\n 17 | USAGE:\n 18 | Visualizes policy on the env\n 19 | $ python utils/visualize_policy --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n 20 | ''' 21 | 22 | # MAIN ========================================================= 23 | @click.command(help=DESC) 24 | @click.option('--env_name', type=str, help='environment to load', required= True) 25 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None) 26 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation') 27 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123) 28 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10) 29 | @click.option('--log_std', type=float, default=-0.5) 30 | @click.option('--terminate', type=bool, default=True) 31 | @click.option('--device_path', type=str, default=None) 32 | def main(env_name, policy, mode, seed, episodes, log_std, terminate, device_path): 33 | render = True 34 | 35 | # TODO(Aravind): Map to hardware if device_path is specified 36 | 37 | e = GymEnv(env_name) 38 | e.set_seed(seed) 39 | np.random.seed(seed) 40 | torch.manual_seed(seed) 41 | if policy is not None: 42 | policy = pickle.load(open(policy, 'rb')) 43 | else: 44 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=log_std) 45 | 46 | for ep in range(episodes): 47 | o = e.reset() 48 | rew = 0.0 49 | t = 0 50 | done = False 51 | while t < e.horizon and done is False: 52 | o = e.get_obs() 53 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 54 | next_o, r, done, ifo = e.step(a) 55 | if terminate is False: 56 | done = False 57 | rew = rew + r 58 | t = t + 1 59 | if render: 60 | e.render() 61 | if done and t < e.horizon - 1: 62 | print("Episode terminated early") 63 | print("episode score = %f " % rew) 64 | 65 | e.reset() 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/model_accel/run_experiments/utils/visualize_trajectories.py: 
-------------------------------------------------------------------------------- 1 | import pickle 2 | import click 3 | import json 4 | import numpy as np 5 | import torch 6 | import mjrl.envs 7 | import trajopt.envs 8 | import mj_envs 9 | import mjrl.utils.tensor_utils as tensor_utils 10 | 11 | from mjrl.utils.gym_env import GymEnv 12 | from mjrl.algos.model_accel.sampling import evaluate_policy 13 | 14 | DESC = ''' 15 | Helper script to visualize optimized trajectories (list of trajectories in trajopt format).\n 16 | USAGE:\n 17 | $ python viz_trajectories.py --file path_to_file.pickle\n 18 | ''' 19 | @click.command(help=DESC) 20 | @click.option('--file', type=str, help='pickle file with trajectories', required= True) 21 | @click.option('--seed', type=int, default=123) 22 | @click.option('--noise_level', type=float, default=0.0) 23 | @click.option('--num_episodes', type=int, help='number of times to play trajectories', default=5) 24 | @click.option('--config', type=str, help='if provided MPC params from here will be used.', default=None) 25 | @click.option('--device_path', type=str, default=None) 26 | def main(file, seed, noise_level, num_episodes, config, device_path): 27 | exp_data = pickle.load(open(file, 'rb')) 28 | policy = exp_data['policy'] 29 | model = exp_data['fitted_model'] 30 | model = model[-1] if type(model) == list else model 31 | env_id = policy.env.env_id 32 | render = True 33 | 34 | # TODO(Aravind): Map to hardware if device_path is specified 35 | 36 | env = GymEnv(env_id) 37 | policy.env = env 38 | 39 | env.set_seed(seed) 40 | np.random.seed(seed) 41 | torch.manual_seed(seed) 42 | 43 | if config is not None: 44 | try: 45 | with open(config, 'r') as f: 46 | config = eval(f.read()) 47 | except: 48 | with open(config, 'r') as f: 49 | config = json.load(f) 50 | policy.plan_horizon = config['plan_horizon'] 51 | policy.num_traj = config['plan_paths'] 52 | policy.kappa = config['kappa'] 53 | policy.filter_coefs = [config['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']] 54 | policy.omega = config['omega'] if 'omega' in config.keys() else 0.0 55 | 56 | # TODO(Aravind): Implement capability to set predicted state for rendering purposes 57 | # evaluate_policy(env, policy, model, noise_level, real_step=False, num_episodes=num_episodes, visualize=render) 58 | evaluate_policy(env, policy, model, noise_level, real_step=True, num_episodes=num_episodes, visualize=render) 59 | 60 | # final close out 61 | env.reset() 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/ppo_clip.py: -------------------------------------------------------------------------------- 1 | import logging 2 | #logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | 16 | # utility functions 17 | import mjrl.utils.process_samples as process_samples 18 | from mjrl.utils.logger import DataLog 19 | from mjrl.utils.cg_solve import cg_solve 20 | from mjrl.algos.batch_reinforce import BatchREINFORCE 21 | 22 | 23 | class PPO(BatchREINFORCE): 24 | def __init__(self, env, policy, baseline, 25 | clip_coef = 0.2, 26 | epochs = 10, 27 | mb_size = 64, 28 | learn_rate = 3e-4, 29 | seed = 123, 30 | save_logs = False, 31 | **kwargs 32 | ): 33 | 
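        # clip_coef is the PPO clipping range: PPO_surrogate below clamps the likelihood ratio to
        # [1 - clip_coef, 1 + clip_coef]. Each call to train_from_paths runs `epochs` passes of Adam
        # updates (lr = learn_rate) over random minibatches of size mb_size.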
34 | self.env = env 35 | self.policy = policy 36 | self.baseline = baseline 37 | self.learn_rate = learn_rate 38 | self.seed = seed 39 | self.save_logs = save_logs 40 | self.clip_coef = clip_coef 41 | self.epochs = epochs 42 | self.mb_size = mb_size 43 | self.running_score = None 44 | if save_logs: self.logger = DataLog() 45 | 46 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=learn_rate) 47 | 48 | def PPO_surrogate(self, observations, actions, advantages): 49 | adv_var = Variable(torch.from_numpy(advantages).float(), requires_grad=False) 50 | old_dist_info = self.policy.old_dist_info(observations, actions) 51 | new_dist_info = self.policy.new_dist_info(observations, actions) 52 | LR = self.policy.likelihood_ratio(new_dist_info, old_dist_info) 53 | LR_clip = torch.clamp(LR, min=1-self.clip_coef, max=1+self.clip_coef) 54 | ppo_surr = torch.mean(torch.min(LR*adv_var,LR_clip*adv_var)) 55 | return ppo_surr 56 | 57 | # ---------------------------------------------------------- 58 | def train_from_paths(self, paths): 59 | 60 | # Concatenate from all the trajectories 61 | observations = np.concatenate([path["observations"] for path in paths]) 62 | actions = np.concatenate([path["actions"] for path in paths]) 63 | advantages = np.concatenate([path["advantages"] for path in paths]) 64 | # Advantage whitening 65 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 66 | # NOTE : advantage should be zero mean in expectation 67 | # normalized step size invariant to advantage scaling, 68 | # but scaling can help with least squares 69 | 70 | # cache return distributions for the paths 71 | path_returns = [sum(p["rewards"]) for p in paths] 72 | mean_return = np.mean(path_returns) 73 | std_return = np.std(path_returns) 74 | min_return = np.amin(path_returns) 75 | max_return = np.amax(path_returns) 76 | base_stats = [mean_return, std_return, min_return, max_return] 77 | self.running_score = mean_return if self.running_score is None else \ 78 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 79 | if self.save_logs: self.log_rollout_statistics(paths) 80 | 81 | # Optimization algorithm 82 | # -------------------------- 83 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 84 | params_before_opt = self.policy.get_param_values() 85 | 86 | ts = timer.time() 87 | num_samples = observations.shape[0] 88 | for ep in range(self.epochs): 89 | for mb in range(int(num_samples / self.mb_size)): 90 | rand_idx = np.random.choice(num_samples, size=self.mb_size) 91 | obs = observations[rand_idx] 92 | act = actions[rand_idx] 93 | adv = advantages[rand_idx] 94 | self.optimizer.zero_grad() 95 | loss = - self.PPO_surrogate(obs, act, adv) 96 | loss.backward() 97 | self.optimizer.step() 98 | 99 | params_after_opt = self.policy.get_param_values() 100 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 101 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 102 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True) 103 | t_opt = timer.time() - ts 104 | 105 | # Log information 106 | if self.save_logs: 107 | self.logger.log_kv('t_opt', t_opt) 108 | self.logger.log_kv('kl_dist', kl_dist) 109 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 110 | self.logger.log_kv('running_score', self.running_score) 111 | try: 112 | self.env.env.env.evaluate_success(paths, self.logger) 113 | except: 114 | # nested logic for backwards 
compatibility. TODO: clean this up. 115 | try: 116 | success_rate = self.env.env.env.evaluate_success(paths) 117 | self.logger.log_kv('success_rate', success_rate) 118 | except: 119 | pass 120 | 121 | return base_stats 122 | -------------------------------------------------------------------------------- /mjrl/mjrl/algos/trpo.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | import mjrl.samplers.batch_sampler as batch_sampler 16 | 17 | # utility functions 18 | import mjrl.utils.process_samples as process_samples 19 | from mjrl.utils.logger import DataLog 20 | from mjrl.utils.cg_solve import cg_solve 21 | 22 | # Import NPG 23 | from mjrl.algos.npg_cg import NPG 24 | 25 | class TRPO(NPG): 26 | def __init__(self, env, policy, baseline, 27 | kl_dist=0.01, 28 | FIM_invert_args={'iters': 10, 'damping': 1e-4}, 29 | hvp_sample_frac=1.0, 30 | seed=123, 31 | save_logs=False, 32 | normalized_step_size=0.01, 33 | **kwargs 34 | ): 35 | """ 36 | All inputs are expected in mjrl's format unless specified 37 | :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance 38 | :param kl_dist: desired KL distance between steps. Overrides normalized_step_size. 39 | :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well) 40 | :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG 41 | :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow) 42 | :param seed: random seed 43 | """ 44 | 45 | self.env = env 46 | self.policy = policy 47 | self.baseline = baseline 48 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size 49 | self.seed = seed 50 | self.save_logs = save_logs 51 | self.FIM_invert_args = FIM_invert_args 52 | self.hvp_subsample = hvp_sample_frac 53 | self.running_score = None 54 | if save_logs: self.logger = DataLog() 55 | 56 | def train_from_paths(self, paths): 57 | 58 | # Concatenate from all the trajectories 59 | observations = np.concatenate([path["observations"] for path in paths]) 60 | actions = np.concatenate([path["actions"] for path in paths]) 61 | advantages = np.concatenate([path["advantages"] for path in paths]) 62 | # Advantage whitening 63 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 64 | # NOTE : advantage should be zero mean in expectation 65 | # normalized step size invariant to advantage scaling, 66 | # but scaling can help with least squares 67 | 68 | # cache return distributions for the paths 69 | path_returns = [sum(p["rewards"]) for p in paths] 70 | mean_return = np.mean(path_returns) 71 | std_return = np.std(path_returns) 72 | min_return = np.amin(path_returns) 73 | max_return = np.amax(path_returns) 74 | base_stats = [mean_return, std_return, min_return, max_return] 75 | self.running_score = mean_return if self.running_score is None else \ 76 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 77 | if self.save_logs: self.log_rollout_statistics(paths) 78 | 79 | # Keep track of times for various computations 80 | t_gLL = 0.0 
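        # t_gLL accumulates time spent computing the vanilla policy gradient from log-likelihoods;
        # t_FIM below accumulates time spent on the conjugate-gradient solve against the Fisher information matrix.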
81 | t_FIM = 0.0 82 | 83 | # Optimization algorithm 84 | # -------------------------- 85 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 86 | 87 | # VPG 88 | ts = timer.time() 89 | vpg_grad = self.flat_vpg(observations, actions, advantages) 90 | t_gLL += timer.time() - ts 91 | 92 | # NPG 93 | ts = timer.time() 94 | hvp = self.build_Hvp_eval([observations, actions], 95 | regu_coef=self.FIM_invert_args['damping']) 96 | npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(), 97 | cg_iters=self.FIM_invert_args['iters']) 98 | t_FIM += timer.time() - ts 99 | 100 | # Step size computation 101 | # -------------------------- 102 | n_step_size = 2.0*self.kl_dist 103 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20))) 104 | 105 | # Policy update 106 | # -------------------------- 107 | curr_params = self.policy.get_param_values() 108 | for k in range(100): 109 | new_params = curr_params + alpha * npg_grad 110 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 111 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 112 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 113 | if kl_dist < self.kl_dist: 114 | break 115 | else: 116 | alpha = 0.9*alpha # backtrack 117 | print("Step size too high. Backtracking. | kl = %f | surr diff = %f" % \ 118 | (kl_dist, surr_after-surr_before) ) 119 | if k == 99: 120 | alpha = 0.0 121 | 122 | new_params = curr_params + alpha * npg_grad 123 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 124 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 125 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 126 | self.policy.set_param_values(new_params, set_new=True, set_old=True) 127 | 128 | # Log information 129 | if self.save_logs: 130 | self.logger.log_kv('alpha', alpha) 131 | self.logger.log_kv('delta', n_step_size) 132 | self.logger.log_kv('time_vpg', t_gLL) 133 | self.logger.log_kv('time_npg', t_FIM) 134 | self.logger.log_kv('kl_dist', kl_dist) 135 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 136 | self.logger.log_kv('running_score', self.running_score) 137 | try: 138 | self.env.env.env.evaluate_success(paths, self.logger) 139 | except: 140 | # nested logic for backwards compatibility. TODO: clean this up. 
141 | try: 142 | success_rate = self.env.env.env.evaluate_success(paths) 143 | self.logger.log_kv('success_rate', success_rate) 144 | except: 145 | pass 146 | 147 | return base_stats -------------------------------------------------------------------------------- /mjrl/mjrl/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/baselines/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/baselines/linear_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | 5 | class LinearBaseline: 6 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-5): 7 | self.inp = inp 8 | self._reg_coeff = reg_coeff 9 | self._coeffs = None 10 | 11 | def _features(self, paths): 12 | if self.inp == 'env_features': 13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 14 | else: 15 | o = np.concatenate([path["observations"] for path in paths]) 16 | o = np.clip(o, -10, 10)/10.0 17 | if o.ndim > 2: 18 | o = o.reshape(o.shape[0], -1) 19 | N, n = o.shape 20 | num_feat = int( n + 1 + 4 ) # linear + bias (1.0) + time till pow 4 21 | feat_mat = np.ones((N, num_feat)) 22 | 23 | # linear features 24 | feat_mat[:,:n] = o 25 | 26 | k = 0 # start from this row 27 | for i in range(len(paths)): 28 | l = len(paths[i]["rewards"]) 29 | al = np.arange(l)/1000.0 30 | for j in range(4): 31 | feat_mat[k:k+l, -4+j] = al**(j+1) 32 | k += l 33 | 34 | return feat_mat 35 | 36 | def fit(self, paths, return_errors=False): 37 | 38 | featmat = self._features(paths) 39 | returns = np.concatenate([path["returns"] for path in paths]) 40 | 41 | if return_errors: 42 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape) 43 | errors = returns - predictions 44 | error_before = np.sum(errors**2)/np.sum(returns**2) 45 | 46 | reg_coeff = copy.deepcopy(self._reg_coeff) 47 | for _ in range(10): 48 | self._coeffs = np.linalg.lstsq( 49 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 50 | featmat.T.dot(returns) 51 | )[0] 52 | if not np.any(np.isnan(self._coeffs)): 53 | break 54 | reg_coeff *= 10 55 | 56 | if return_errors: 57 | predictions = featmat.dot(self._coeffs) 58 | errors = returns - predictions 59 | error_after = np.sum(errors**2)/np.sum(returns**2) 60 | return error_before, error_after 61 | 62 | def predict(self, path): 63 | if self._coeffs is None: 64 | return np.zeros(len(path["rewards"])) 65 | return self._features([path]).dot(self._coeffs) 66 | -------------------------------------------------------------------------------- /mjrl/mjrl/baselines/mlp_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from mjrl.utils.optimize_model import fit_data 7 | 8 | import pickle 9 | 10 | class MLPBaseline: 11 | def __init__(self, env_spec, inp_dim=None, inp='obs', learn_rate=1e-3, reg_coef=0.0, 12 | batch_size=64, epochs=1, use_gpu=False, hidden_sizes=(128, 128)): 13 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim 14 | self.batch_size = batch_size 15 | self.epochs = epochs 16 | self.reg_coef = reg_coef 17 | self.use_gpu = use_gpu 18 | self.inp = inp 19 | self.hidden_sizes = 
hidden_sizes 20 | 21 | self.model = nn.Sequential() 22 | layer_sizes = (self.n + 4, ) + hidden_sizes + (1, ) 23 | for i in range(len(layer_sizes) - 1): 24 | layer_id = 'fc_' + str(i) 25 | relu_id = 'relu_' + str(i) 26 | self.model.add_module(layer_id, nn.Linear(layer_sizes[i], layer_sizes[i+1])) 27 | if i != len(layer_sizes) - 2: 28 | self.model.add_module(relu_id, nn.ReLU()) 29 | 30 | if self.use_gpu: 31 | self.model.cuda() 32 | 33 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learn_rate, weight_decay=reg_coef) 34 | self.loss_function = torch.nn.MSELoss() 35 | 36 | def _features(self, paths): 37 | if self.inp == 'env_features': 38 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 39 | else: 40 | o = np.concatenate([path["observations"] for path in paths]) 41 | o = np.clip(o, -10, 10)/10.0 42 | if o.ndim > 2: 43 | o = o.reshape(o.shape[0], -1) 44 | N, n = o.shape 45 | num_feat = int( n + 4 ) # linear + time till pow 4 46 | feat_mat = np.ones((N, num_feat)) # memory allocation 47 | 48 | # linear features 49 | feat_mat[:,:n] = o 50 | 51 | k = 0 # start from this row 52 | for i in range(len(paths)): 53 | l = len(paths[i]["rewards"]) 54 | al = np.arange(l)/1000.0 55 | for j in range(4): 56 | feat_mat[k:k+l, -4+j] = al**(j+1) 57 | k += l 58 | return feat_mat 59 | 60 | 61 | def fit(self, paths, return_errors=False, return_all_errors=False): 62 | 63 | featmat = self._features(paths) 64 | returns = np.concatenate([path["returns"] for path in paths]).reshape(-1, 1) 65 | featmat = featmat.astype('float32') 66 | returns = returns.astype('float32') 67 | num_samples = returns.shape[0] 68 | 69 | # Make variables with the above data 70 | if self.use_gpu: 71 | featmat_var = Variable(torch.from_numpy(featmat).cuda(), requires_grad=False) 72 | returns_var = Variable(torch.from_numpy(returns).cuda(), requires_grad=False) 73 | else: 74 | featmat_var = Variable(torch.from_numpy(featmat), requires_grad=False) 75 | returns_var = Variable(torch.from_numpy(returns), requires_grad=False) 76 | 77 | if return_errors: 78 | if self.use_gpu: 79 | predictions = self.model(featmat_var).cpu().data.numpy().ravel() 80 | else: 81 | predictions = self.model(featmat_var).data.numpy().ravel() 82 | errors = returns.ravel() - predictions 83 | error_before = np.sum(errors**2)/(np.sum(returns**2) + 1e-8) 84 | 85 | epoch_losses = fit_data(self.model, featmat_var, returns_var, self.optimizer, 86 | self.loss_function, self.batch_size, self.epochs) 87 | 88 | if return_errors: 89 | if self.use_gpu: 90 | predictions = self.model(featmat_var).cpu().data.numpy().ravel() 91 | else: 92 | predictions = self.model(featmat_var).data.numpy().ravel() 93 | errors = returns.ravel() - predictions 94 | error_after = np.sum(errors**2)/(np.sum(returns**2) + 1e-8) 95 | if return_all_errors: 96 | return error_before, error_after, epoch_losses 97 | return error_before, error_after 98 | 99 | def predict(self, path): 100 | featmat = self._features([path]).astype('float32') 101 | if self.use_gpu: 102 | feat_var = Variable(torch.from_numpy(featmat).float().cuda(), requires_grad=False) 103 | prediction = self.model(feat_var).cpu().data.numpy().ravel() 104 | else: 105 | feat_var = Variable(torch.from_numpy(featmat).float(), requires_grad=False) 106 | prediction = self.model(feat_var).data.numpy().ravel() 107 | return prediction 108 | -------------------------------------------------------------------------------- /mjrl/mjrl/baselines/quadratic_baseline.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class QuadraticBaseline: 5 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-3): 6 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim 7 | self.inp = inp 8 | self._reg_coeff = reg_coeff 9 | self._coeffs = None 10 | 11 | def _features(self, paths): 12 | if self.inp == 'env_features': 13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 14 | else: 15 | o = np.concatenate([path["observations"] for path in paths]) 16 | o = np.clip(o, -10, 10)/10.0 17 | if o.ndim > 2: 18 | o = o.reshape(o.shape[0], -1) 19 | N, n = o.shape 20 | num_feat = int( n + n*(n+1)/2 + 1 + 4 ) # linear + full quad (symmetric matrix) + bias (1.0) + time till pow 4 21 | feat_mat = np.ones((N, num_feat)) # memory allocation 22 | 23 | # linear features 24 | feat_mat[:,:n] = o 25 | 26 | # quadratic features 27 | k = n # starting from this column in feat_mat 28 | for i in range(n): 29 | for j in range(i, n): 30 | feat_mat[:,k] = o[:,i]*o[:,j] # element-wise product 31 | k += 1 32 | 33 | k = 0 # start from this row 34 | for i in range(len(paths)): 35 | l = len(paths[i]["rewards"]) 36 | al = np.arange(l)/1000.0 37 | for j in range(4): 38 | feat_mat[k:k+l, -4+j] = al**(j+1) 39 | k += l 40 | 41 | return feat_mat 42 | 43 | 44 | def fit(self, paths, return_errors=False): 45 | 46 | #featmat = np.concatenate([self._features(path) for path in paths]) 47 | featmat = self._features(paths) 48 | returns = np.concatenate([path["returns"] for path in paths]) 49 | 50 | if return_errors: 51 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape) 52 | errors = returns - predictions 53 | error_before = np.sum(errors**2)/np.sum(returns**2) 54 | 55 | reg_coeff = copy.deepcopy(self._reg_coeff) 56 | for _ in range(10): 57 | self._coeffs = np.linalg.lstsq( 58 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 59 | featmat.T.dot(returns) 60 | )[0] 61 | if not np.any(np.isnan(self._coeffs)): 62 | break 63 | reg_coeff *= 10 64 | 65 | if return_errors: 66 | predictions = featmat.dot(self._coeffs) 67 | errors = returns - predictions 68 | error_after = np.sum(errors**2)/np.sum(returns**2) 69 | return error_before, error_after 70 | 71 | def predict(self, path): 72 | if self._coeffs is None: 73 | return np.zeros(len(path["rewards"])) 74 | return self._features([path]).dot(self._coeffs) 75 | -------------------------------------------------------------------------------- /mjrl/mjrl/baselines/zero_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class ZeroBaseline: 5 | def __init__(self, env_spec, **kwargs): 6 | n = env_spec.observation_dim # number of states 7 | self._coeffs = None 8 | 9 | def fit(self, paths, return_errors=False): 10 | if return_errors: 11 | return 1.0, 1.0 12 | 13 | def predict(self, path): 14 | return np.zeros(len(path["rewards"])) 15 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # ---------------------------------------- 4 | # mjrl environments 5 | # ---------------------------------------- 6 | 7 | register( 8 | id='mjrl_point_mass-v0', 9 | entry_point='mjrl.envs:PointMassEnv', 10 | 
max_episode_steps=25, 11 | ) 12 | 13 | register( 14 | id='mjrl_swimmer-v0', 15 | entry_point='mjrl.envs:SwimmerEnv', 16 | max_episode_steps=500, 17 | ) 18 | 19 | register( 20 | id='mjrl_reacher_7dof-v0', 21 | entry_point='mjrl.envs:Reacher7DOFEnv', 22 | max_episode_steps=50, 23 | ) 24 | 25 | register( 26 | id='mjrl_peg_insertion-v0', 27 | entry_point='mjrl.envs:PegEnv', 28 | max_episode_steps=50, 29 | ) 30 | 31 | from mjrl.envs.mujoco_env import MujocoEnv 32 | # ^^^^^ so that user gets the correct error 33 | # message if mujoco is not installed correctly 34 | from mjrl.envs.point_mass import PointMassEnv 35 | from mjrl.envs.swimmer import SwimmerEnv 36 | from mjrl.envs.reacher_sawyer import Reacher7DOFEnv 37 | from mjrl.envs.peg_insertion_sawyer import PegEnv 38 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/assets/point_mass.xml: -------------------------------------------------------------------------------- (MuJoCo XML model definition for the point-mass environment; the markup is not preserved in this text dump) -------------------------------------------------------------------------------- /mjrl/mjrl/envs/assets/sawyer.xml: -------------------------------------------------------------------------------- (MuJoCo XML model definition for the Sawyer arm; markup not preserved in this text dump) -------------------------------------------------------------------------------- /mjrl/mjrl/envs/assets/swimmer.xml: -------------------------------------------------------------------------------- (MuJoCo XML model definition for the swimmer; markup not preserved in this text dump) -------------------------------------------------------------------------------- /mjrl/mjrl/envs/mujoco_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from gym import error, spaces 4 | from gym.utils import seeding 5 | import numpy as np 6 | from os import path 7 | import gym 8 | import six 9 | import time as timer 10 | 11 | try: 12 | import mujoco_py 13 | from mujoco_py import load_model_from_path, MjSim, MjViewer 14 | except ImportError as e: 15 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) 16 | 17 | def get_sim(model_path): 18 | if model_path.startswith("/"): 19 | fullpath = model_path 20 | else: 21 | fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path) 22 | if not path.exists(fullpath): 23 | raise IOError("File %s does not exist" % fullpath) 24 | model = load_model_from_path(fullpath) 25 | return MjSim(model) 26 | 27 | class MujocoEnv(gym.Env): 28 | """Superclass for all MuJoCo environments.
29 | """ 30 | 31 | def __init__(self, model_path=None, frame_skip=1, sim=None): 32 | 33 | if sim is None: 34 | self.sim = get_sim(model_path) 35 | else: 36 | self.sim = sim 37 | self.data = self.sim.data 38 | self.model = self.sim.model 39 | 40 | self.frame_skip = frame_skip 41 | self.metadata = { 42 | 'render.modes': ['human', 'rgb_array'], 43 | 'video.frames_per_second': int(np.round(1.0 / self.dt)) 44 | } 45 | self.mujoco_render_frames = False 46 | 47 | self.init_qpos = self.data.qpos.ravel().copy() 48 | self.init_qvel = self.data.qvel.ravel().copy() 49 | try: 50 | observation, _reward, done, _info = self.step(np.zeros(self.model.nu)) 51 | except NotImplementedError: 52 | observation, _reward, done, _info = self._step(np.zeros(self.model.nu)) 53 | assert not done 54 | self.obs_dim = np.sum([o.size for o in observation]) if type(observation) is tuple else observation.size 55 | 56 | bounds = self.model.actuator_ctrlrange.copy() 57 | low = bounds[:, 0] 58 | high = bounds[:, 1] 59 | self.action_space = spaces.Box(low, high, dtype=np.float32) 60 | 61 | high = np.inf*np.ones(self.obs_dim) 62 | low = -high 63 | self.observation_space = spaces.Box(low, high, dtype=np.float32) 64 | 65 | self.seed() 66 | 67 | def seed(self, seed=None): 68 | self.np_random, seed = seeding.np_random(seed) 69 | return [seed] 70 | 71 | # methods to override: 72 | # ---------------------------- 73 | 74 | def reset_model(self): 75 | """ 76 | Reset the robot degrees of freedom (qpos and qvel). 77 | Implement this in each subclass. 78 | """ 79 | raise NotImplementedError 80 | 81 | def mj_viewer_setup(self): 82 | """ 83 | Due to specifics of new mujoco rendering, the standard viewer cannot be used 84 | with this set-up. Instead we use this mujoco specific function. 85 | """ 86 | pass 87 | 88 | def viewer_setup(self): 89 | """ 90 | Does not work. 
Use mj_viewer_setup() instead 91 | """ 92 | pass 93 | 94 | def evaluate_success(self, paths, logger=None): 95 | """ 96 | Log various success metrics calculated based on input paths into the logger 97 | """ 98 | pass 99 | 100 | # ----------------------------- 101 | 102 | def reset(self): 103 | self.sim.reset() 104 | self.sim.forward() 105 | ob = self.reset_model() 106 | return ob 107 | 108 | def set_state(self, qpos, qvel): 109 | assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,) 110 | old_state = self.sim.get_state() 111 | new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel, 112 | old_state.act, old_state.udd_state) 113 | self.sim.set_state(new_state) 114 | self.sim.forward() 115 | 116 | @property 117 | def dt(self): 118 | return self.model.opt.timestep * self.frame_skip 119 | 120 | def do_simulation(self, ctrl, n_frames): 121 | for i in range(self.model.nu): 122 | self.sim.data.ctrl[i] = ctrl[i] 123 | for _ in range(n_frames): 124 | self.sim.step() 125 | if self.mujoco_render_frames is True: 126 | self.mj_render() 127 | 128 | def mj_render(self): 129 | try: 130 | self.viewer.render() 131 | except: 132 | self.mj_viewer_setup() 133 | self.viewer._run_speed = 0.5 134 | #self.viewer._run_speed /= self.frame_skip 135 | self.viewer.render() 136 | 137 | def render(self, *args, **kwargs): 138 | pass 139 | #return self.mj_render() 140 | 141 | def _get_viewer(self): 142 | pass 143 | #return None 144 | 145 | def state_vector(self): 146 | state = self.sim.get_state() 147 | return np.concatenate([ 148 | state.qpos.flat, state.qvel.flat]) 149 | 150 | # ----------------------------- 151 | 152 | def visualize_policy(self, policy, horizon=1000, num_episodes=1, mode='exploration'): 153 | self.mujoco_render_frames = True 154 | for ep in range(num_episodes): 155 | o = self.reset() 156 | d = False 157 | t = 0 158 | score = 0.0 159 | while t < horizon and d is False: 160 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 161 | o, r, d, _ = self.step(a) 162 | t = t+1 163 | score = score + r 164 | print("Episode score = %f" % score) 165 | self.mujoco_render_frames = False 166 | 167 | def visualize_policy_offscreen(self, policy, horizon=1000, 168 | num_episodes=1, 169 | frame_size=(640,480), 170 | mode='exploration', 171 | save_loc='/tmp/', 172 | filename='newvid', 173 | camera_name=None): 174 | import skvideo.io 175 | for ep in range(num_episodes): 176 | print("Episode %d: rendering offline " % ep, end='', flush=True) 177 | o = self.reset() 178 | d = False 179 | t = 0 180 | arrs = [] 181 | t0 = timer.time() 182 | while t < horizon and d is False: 183 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 184 | o, r, d, _ = self.step(a) 185 | t = t+1 186 | curr_frame = self.sim.render(width=frame_size[0], height=frame_size[1], 187 | mode='offscreen', camera_name=camera_name, device_id=0) 188 | arrs.append(curr_frame[::-1,:,:]) 189 | print(t, end=', ', flush=True) 190 | file_name = save_loc + filename + str(ep) + ".mp4" 191 | skvideo.io.vwrite( file_name, np.asarray(arrs)) 192 | print("saved", file_name) 193 | t1 = timer.time() 194 | print("time taken = %f"% (t1-t0)) 195 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/peg_insertion_sawyer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 
| 7 | class PegEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.peg_sid = -2 10 | self.target_sid = -1 11 | mujoco_env.MujocoEnv.__init__(self, 'peg_insertion.xml', 4) 12 | utils.EzPickle.__init__(self) 13 | self.peg_sid = self.model.site_name2id("peg_bottom") 14 | self.target_sid = self.model.site_name2id("target") 15 | self.init_body_pos = self.model.body_pos.copy() 16 | 17 | def step(self, a): 18 | self.do_simulation(a, self.frame_skip) 19 | obs = self.get_obs() 20 | reward = self.get_reward(obs, a) 21 | return obs, reward, False, self.get_env_infos() 22 | 23 | def get_obs(self): 24 | return np.concatenate([ 25 | self.data.qpos.flat, 26 | self.data.qvel.flat, 27 | self.data.site_xpos[self.peg_sid], 28 | self.data.site_xpos[self.target_sid], 29 | ]) 30 | 31 | def get_reward(self, obs, act=None): 32 | obs = np.clip(obs, -10.0, 10.0) 33 | if len(obs.shape) == 1: 34 | # vector obs, called when stepping the env 35 | hand_pos = obs[-6:-3] 36 | target_pos = obs[-3:] 37 | l1_dist = np.sum(np.abs(hand_pos - target_pos)) 38 | l2_dist = np.linalg.norm(hand_pos - target_pos) 39 | else: 40 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 41 | hand_pos = obs[:, :, -6:-3] 42 | target_pos = obs[:, :, -3:] 43 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1) 44 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1) 45 | bonus = 5.0 * (l2_dist < 0.06) 46 | reward = - l1_dist - 5.0 * l2_dist + bonus 47 | return reward 48 | 49 | def compute_path_rewards(self, paths): 50 | # path has two keys: observations and actions 51 | # path["observations"] : (num_traj, horizon, obs_dim) 52 | # path["rewards"] should have shape (num_traj, horizon) 53 | obs = paths["observations"] 54 | rewards = self.get_reward(obs) 55 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 56 | 57 | # -------------------------------- 58 | # resets and randomization 59 | # -------------------------------- 60 | 61 | def robot_reset(self): 62 | self.set_state(self.init_qpos, self.init_qvel) 63 | 64 | def target_reset(self): 65 | # Randomize goal position 66 | goal_y = self.np_random.uniform(low=0.1, high=0.5) 67 | try: 68 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29) 69 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29) 70 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29) 71 | self.sim.forward() 72 | except: 73 | pass 74 | 75 | def reset_model(self, seed=None): 76 | if seed is not None: 77 | self.seeding = True 78 | self.seed(seed) 79 | self.robot_reset() 80 | self.target_reset() 81 | return self.get_obs() 82 | 83 | # -------------------------------- 84 | # get and set states 85 | # -------------------------------- 86 | 87 | def get_env_state(self): 88 | target_pos = self.model.body_pos[-1].copy() 89 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 90 | target_pos=target_pos) 91 | 92 | def set_env_state(self, state): 93 | self.sim.reset() 94 | qp = state['qp'].copy() 95 | qv = state['qv'].copy() 96 | target_pos = state['target_pos'] 97 | self.model.body_pos[-1] = target_pos 98 | goal_y = target_pos[1] 99 | self.data.qpos[:] = qp 100 | self.data.qvel[:] = qv 101 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29) 102 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29) 103 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29) 104 | self.sim.forward() 105 | 106 | # -------------------------------- 107 | # utility functions 108 | # 
-------------------------------- 109 | 110 | def get_env_infos(self): 111 | return dict(state=self.get_env_state()) 112 | 113 | def mj_viewer_setup(self): 114 | self.viewer = MjViewer(self.sim) 115 | self.viewer.cam.azimuth += 200 116 | self.sim.forward() 117 | self.viewer.cam.distance = self.model.stat.extent*2.0 118 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 | class PointMassEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.agent_bid = 0 10 | self.target_sid = 0 11 | utils.EzPickle.__init__(self) 12 | mujoco_env.MujocoEnv.__init__(self, 'point_mass.xml', 5) 13 | self.agent_bid = self.sim.model.body_name2id('agent') 14 | self.target_sid = self.sim.model.site_name2id('target') 15 | 16 | def step(self, a): 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self.get_obs() 19 | reward = self.get_reward(obs) 20 | return obs, reward, False, dict(solved=(reward > -0.1), state=self.get_env_state()) 21 | 22 | def get_obs(self): 23 | agent_pos = self.data.body_xpos[self.agent_bid].ravel() 24 | target_pos = self.data.site_xpos[self.target_sid].ravel() 25 | return np.concatenate([agent_pos[:2], self.data.qvel.ravel(), target_pos[:2]]) 26 | 27 | def get_reward(self, obs, act=None): 28 | if len(obs.shape) == 1: 29 | # vector obs, called when stepping the env 30 | agent_pos = obs[:2] 31 | target_pos = obs[-2:] 32 | l1_dist = np.sum(np.abs(agent_pos - target_pos)) 33 | l2_dist = np.linalg.norm(agent_pos - target_pos) 34 | else: 35 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 36 | agent_pos = obs[:, :, :2] 37 | target_pos = obs[:, :, -2:] 38 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1) 39 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1) 40 | reward = -1.0 * l1_dist - 0.5 * l2_dist 41 | return reward 42 | 43 | def compute_path_rewards(self, paths): 44 | # path has two keys: observations and actions 45 | # path["observations"] : (num_traj, horizon, obs_dim) 46 | # path["rewards"] should have shape (num_traj, horizon) 47 | obs = paths["observations"] 48 | rewards = self.get_reward(obs) 49 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s') 50 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 51 | return paths 52 | 53 | def reset_model(self): 54 | # randomize the agent and goal 55 | agent_x = self.np_random.uniform(low=-1.0, high=1.0) 56 | agent_y = self.np_random.uniform(low=-1.0, high=1.0) 57 | goal_x = self.np_random.uniform(low=-1.0, high=1.0) 58 | goal_y = self.np_random.uniform(low=-1.0, high=1.0) 59 | qp = np.array([agent_x, agent_y]) 60 | qv = self.init_qvel.copy() 61 | self.set_state(qp, qv) 62 | self.model.site_pos[self.target_sid][0] = goal_x 63 | self.model.site_pos[self.target_sid][1] = goal_y 64 | self.sim.forward() 65 | return self.get_obs() 66 | 67 | def evaluate_success(self, paths, logger=None): 68 | success = 0.0 69 | for p in paths: 70 | if np.mean(p['env_infos']['solved'][-4:]) > 0.0: 71 | success += 1.0 72 | success_rate = 100.0*success/len(paths) 73 | if logger is None: 74 | # nowhere to log so return the value 75 | return success_rate 76 | else: 77 | # log the success 78 | # can log multiple statistics here if needed 79 | logger.log_kv('success_rate', success_rate) 80 | return 
None 81 | 82 | # -------------------------------- 83 | # get and set states 84 | # -------------------------------- 85 | 86 | def get_env_state(self): 87 | target_pos = self.model.site_pos[self.target_sid].copy() 88 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 89 | target_pos=target_pos) 90 | 91 | def set_env_state(self, state): 92 | self.sim.reset() 93 | qp = state['qp'].copy() 94 | qv = state['qv'].copy() 95 | target_pos = state['target_pos'] 96 | self.set_state(qp, qv) 97 | self.model.site_pos[self.target_sid] = target_pos 98 | self.sim.forward() 99 | 100 | # -------------------------------- 101 | # utility functions 102 | # -------------------------------- 103 | 104 | def get_env_infos(self): 105 | return dict(state=self.get_env_state()) 106 | 107 | def mj_viewer_setup(self): 108 | self.viewer = MjViewer(self.sim) 109 | self.sim.forward() 110 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/reacher_sawyer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.hand_sid = -2 10 | self.target_sid = -1 11 | mujoco_env.MujocoEnv.__init__(self, 'sawyer.xml', 4) 12 | utils.EzPickle.__init__(self) 13 | self.hand_sid = self.model.site_name2id("finger") 14 | self.target_sid = self.model.site_name2id("target") 15 | 16 | def step(self, a): 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self.get_obs() 19 | reward = self.get_reward(obs, a) 20 | return obs, reward, False, self.get_env_infos() 21 | 22 | def get_obs(self): 23 | return np.concatenate([ 24 | self.data.qpos.flat, 25 | self.data.qvel.ravel() * self.dt, # delta_x instead of velocity 26 | self.data.site_xpos[self.hand_sid], 27 | self.data.site_xpos[self.target_sid], 28 | ]) 29 | 30 | def get_reward(self, obs, act=None): 31 | obs = np.clip(obs, -10.0, 10.0) 32 | if len(obs.shape) == 1: 33 | # vector obs, called when stepping the env 34 | hand_pos = obs[-6:-3] 35 | target_pos = obs[-3:] 36 | l1_dist = np.sum(np.abs(hand_pos - target_pos)) 37 | l2_dist = np.linalg.norm(hand_pos - target_pos) 38 | else: 39 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 40 | hand_pos = obs[:, :, -6:-3] 41 | target_pos = obs[:, :, -3:] 42 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1) 43 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1) 44 | reward = - l1_dist - 5.0 * l2_dist 45 | return reward 46 | 47 | def compute_path_rewards(self, paths): 48 | # path has two keys: observations and actions 49 | # path["observations"] : (num_traj, horizon, obs_dim) 50 | # path["rewards"] should have shape (num_traj, horizon) 51 | obs = paths["observations"] 52 | rewards = self.get_reward(obs) 53 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 54 | 55 | # -------------------------------- 56 | # resets and randomization 57 | # -------------------------------- 58 | 59 | def robot_reset(self): 60 | self.set_state(self.init_qpos, self.init_qvel) 61 | 62 | def target_reset(self): 63 | target_pos = np.array([0.1, 0.1, 0.1]) 64 | target_pos[0] = self.np_random.uniform(low=-0.3, high=0.3) 65 | target_pos[1] = self.np_random.uniform(low=-0.2, high=0.2) 66 | target_pos[2] = self.np_random.uniform(low=-0.25, high=0.25) 67 | self.model.site_pos[self.target_sid] = target_pos 68 | 
self.sim.forward() 69 | 70 | def reset_model(self, seed=None): 71 | if seed is not None: 72 | self.seeding = True 73 | self.seed(seed) 74 | self.robot_reset() 75 | self.target_reset() 76 | return self.get_obs() 77 | 78 | # -------------------------------- 79 | # get and set states 80 | # -------------------------------- 81 | 82 | def get_env_state(self): 83 | target_pos = self.model.site_pos[self.target_sid].copy() 84 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 85 | target_pos=target_pos) 86 | 87 | def set_env_state(self, state): 88 | self.sim.reset() 89 | qp = state['qp'].copy() 90 | qv = state['qv'].copy() 91 | target_pos = state['target_pos'] 92 | self.model.site_pos[self.target_sid] = target_pos 93 | self.data.qpos[:] = qp 94 | self.data.qvel[:] = qv 95 | self.sim.forward() 96 | 97 | # -------------------------------- 98 | # utility functions 99 | # -------------------------------- 100 | 101 | def get_env_infos(self): 102 | return dict(state=self.get_env_state()) 103 | 104 | def mj_viewer_setup(self): 105 | self.viewer = MjViewer(self.sim) 106 | self.viewer.cam.trackbodyid = 1 107 | self.viewer.cam.type = 1 108 | self.sim.forward() 109 | self.viewer.cam.distance = self.model.stat.extent * 2.0 110 | -------------------------------------------------------------------------------- /mjrl/mjrl/envs/swimmer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'swimmer.xml', 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, a): 12 | xposbefore = self.data.qpos[0] 13 | self.do_simulation(a, self.frame_skip) 14 | xposafter = self.data.qpos[0] 15 | 16 | delta = (xposafter - xposbefore) 17 | # make agent move in the negative x direction 18 | reward = -10.0 * delta 19 | done = False 20 | 21 | ob = self.get_obs() 22 | return ob, reward, done, self.get_env_infos() 23 | 24 | def get_obs(self): 25 | return np.concatenate([ 26 | self.data.qpos.flat[2:], 27 | self.data.qvel.flat, 28 | ]) 29 | 30 | def reset_model(self): 31 | qpos_init = self.init_qpos.copy() 32 | qpos_init[2] = self.np_random.uniform(low=-np.pi, high=np.pi) 33 | self.set_state(qpos_init, self.init_qvel) 34 | self.sim.forward() 35 | return self.get_obs() 36 | 37 | # -------------------------------- 38 | # get and set states 39 | # -------------------------------- 40 | 41 | def get_env_state(self): 42 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy()) 43 | 44 | def set_env_state(self, state): 45 | self.sim.reset() 46 | qp = state['qp'].copy() 47 | qv = state['qv'].copy() 48 | self.set_state(qp, qv) 49 | self.sim.forward() 50 | 51 | # -------------------------------- 52 | # utility functions 53 | # -------------------------------- 54 | 55 | def get_env_infos(self): 56 | return dict(state=self.get_env_state()) 57 | 58 | def mj_viewer_setup(self): 59 | self.viewer = MjViewer(self.sim) 60 | self.viewer.cam.trackbodyid = 1 61 | self.viewer.cam.type = 1 62 | self.sim.forward() 63 | self.viewer.cam.distance = self.model.stat.extent*1.2 -------------------------------------------------------------------------------- /mjrl/mjrl/policies/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/policies/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/policies/gaussian_linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from mjrl.utils.fc_network import FCNetwork 7 | 8 | 9 | class LinearPolicy: 10 | def __init__(self, env_spec, 11 | min_log_std=-3, 12 | init_log_std=0, 13 | seed=None): 14 | """ 15 | :param env_spec: specifications of the env (see utils/gym_env.py) 16 | :param min_log_std: log_std is clamped at this value and can't go below 17 | :param init_log_std: initial log standard deviation 18 | :param seed: random seed 19 | """ 20 | self.n = env_spec.observation_dim # number of states 21 | self.m = env_spec.action_dim # number of actions 22 | self.min_log_std = min_log_std 23 | 24 | # Set seed 25 | # ------------------------ 26 | if seed is not None: 27 | torch.manual_seed(seed) 28 | np.random.seed(seed) 29 | 30 | # Policy network 31 | # ------------------------ 32 | self.model = FCNetwork(self.n, self.m, hidden_sizes=()) 33 | # make weights small 34 | for param in list(self.model.parameters())[-2:]: # only last layer 35 | param.data = 1e-2 * param.data 36 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True) 37 | self.trainable_params = list(self.model.parameters()) + [self.log_std] 38 | 39 | # Old Policy network 40 | # ------------------------ 41 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes=()) 42 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std) 43 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std] 44 | for idx, param in enumerate(self.old_params): 45 | param.data = self.trainable_params[idx].data.clone() 46 | 47 | # Easy access variables 48 | # ------------------------- 49 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 50 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params] 51 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params] 52 | self.d = np.sum(self.param_sizes) # total number of params 53 | 54 | # Placeholders 55 | # ------------------------ 56 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False) 57 | 58 | # Utility functions 59 | # ============================================ 60 | def get_param_values(self): 61 | params = np.concatenate([p.contiguous().view(-1).data.numpy() 62 | for p in self.trainable_params]) 63 | return params.copy() 64 | 65 | def set_param_values(self, new_params, set_new=True, set_old=True): 66 | if set_new: 67 | current_idx = 0 68 | for idx, param in enumerate(self.trainable_params): 69 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 70 | vals = vals.reshape(self.param_shapes[idx]) 71 | param.data = torch.from_numpy(vals).float() 72 | current_idx += self.param_sizes[idx] 73 | # clip std at minimum value 74 | self.trainable_params[-1].data = \ 75 | torch.clamp(self.trainable_params[-1], self.min_log_std).data 76 | # update log_std_val for sampling 77 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 78 | if set_old: 79 | current_idx = 0 80 | for idx, param in enumerate(self.old_params): 81 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 82 | vals = vals.reshape(self.param_shapes[idx]) 83 | 
param.data = torch.from_numpy(vals).float() 84 | current_idx += self.param_sizes[idx] 85 | # clip std at minimum value 86 | self.old_params[-1].data = \ 87 | torch.clamp(self.old_params[-1], self.min_log_std).data 88 | 89 | # Main functions 90 | # ============================================ 91 | def get_action(self, observation): 92 | o = np.float32(observation.reshape(1, -1)) 93 | self.obs_var.data = torch.from_numpy(o) 94 | mean = self.model(self.obs_var).data.numpy().ravel() 95 | noise = np.exp(self.log_std_val) * np.random.randn(self.m) 96 | action = mean + noise 97 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}] 98 | 99 | def mean_LL(self, observations, actions, model=None, log_std=None): 100 | model = self.model if model is None else model 101 | log_std = self.log_std if log_std is None else log_std 102 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False) 103 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False) 104 | mean = model(obs_var) 105 | zs = (act_var - mean) / torch.exp(log_std) 106 | LL = - 0.5 * torch.sum(zs ** 2, dim=1) + \ 107 | - torch.sum(log_std) + \ 108 | - 0.5 * self.m * np.log(2 * np.pi) 109 | return mean, LL 110 | 111 | def log_likelihood(self, observations, actions, model=None, log_std=None): 112 | mean, LL = self.mean_LL(observations, actions, model, log_std) 113 | return LL.data.numpy() 114 | 115 | def old_dist_info(self, observations, actions): 116 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std) 117 | return [LL, mean, self.old_log_std] 118 | 119 | def new_dist_info(self, observations, actions): 120 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std) 121 | return [LL, mean, self.log_std] 122 | 123 | def likelihood_ratio(self, new_dist_info, old_dist_info): 124 | LL_old = old_dist_info[0] 125 | LL_new = new_dist_info[0] 126 | LR = torch.exp(LL_new - LL_old) 127 | return LR 128 | 129 | def mean_kl(self, new_dist_info, old_dist_info): 130 | old_log_std = old_dist_info[2] 131 | new_log_std = new_dist_info[2] 132 | old_std = torch.exp(old_log_std) 133 | new_std = torch.exp(new_log_std) 134 | old_mean = old_dist_info[1] 135 | new_mean = new_dist_info[1] 136 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2 137 | Dr = 2 * new_std ** 2 + 1e-8 138 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1) 139 | return torch.mean(sample_kl) 140 | -------------------------------------------------------------------------------- /mjrl/mjrl/policies/gaussian_mlp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mjrl.utils.fc_network import FCNetwork 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class MLP: 8 | def __init__(self, env_spec, 9 | hidden_sizes=(64,64), 10 | min_log_std=-3, 11 | init_log_std=0, 12 | eps=0.0, 13 | seed=None): 14 | """ 15 | :param env_spec: specifications of the env (see utils/gym_env.py) 16 | :param hidden_sizes: network hidden layer sizes (currently 2 layers only) 17 | :param min_log_std: log_std is clamped at this value and can't go below 18 | :param init_log_std: initial log standard deviation 19 | :param seed: random seed 20 | """ 21 | self.n = env_spec.observation_dim # number of states 22 | self.m = env_spec.action_dim # number of actions 23 | self.min_log_std = min_log_std 24 | self.eps = eps 25 | 26 | # Set seed 27 | # ------------------------ 28 | if seed is not None: 29 | 
torch.manual_seed(seed) 30 | np.random.seed(seed) 31 | 32 | # Policy network 33 | # ------------------------ 34 | self.model = FCNetwork(self.n, self.m, hidden_sizes) 35 | # make weights small 36 | for param in list(self.model.parameters())[-2:]: # only last layer 37 | param.data = 1e-2 * param.data 38 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True) 39 | self.trainable_params = list(self.model.parameters()) + [self.log_std] 40 | 41 | # Old Policy network 42 | # ------------------------ 43 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes) 44 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std) 45 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std] 46 | for idx, param in enumerate(self.old_params): 47 | param.data = self.trainable_params[idx].data.clone() 48 | 49 | # Easy access variables 50 | # ------------------------- 51 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 52 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params] 53 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params] 54 | self.d = np.sum(self.param_sizes) # total number of params 55 | 56 | # Placeholders 57 | # ------------------------ 58 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False) 59 | 60 | # Utility functions 61 | # ============================================ 62 | def get_param_values(self): 63 | params = np.concatenate([p.contiguous().view(-1).data.numpy() 64 | for p in self.trainable_params]) 65 | return params.copy() 66 | 67 | def set_param_values(self, new_params, set_new=True, set_old=True): 68 | if set_new: 69 | current_idx = 0 70 | for idx, param in enumerate(self.trainable_params): 71 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 72 | vals = vals.reshape(self.param_shapes[idx]) 73 | param.data = torch.from_numpy(vals).float() 74 | current_idx += self.param_sizes[idx] 75 | # clip std at minimum value 76 | self.trainable_params[-1].data = \ 77 | torch.clamp(self.trainable_params[-1], self.min_log_std).data 78 | # update log_std_val for sampling 79 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 80 | if set_old: 81 | current_idx = 0 82 | for idx, param in enumerate(self.old_params): 83 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 84 | vals = vals.reshape(self.param_shapes[idx]) 85 | param.data = torch.from_numpy(vals).float() 86 | current_idx += self.param_sizes[idx] 87 | # clip std at minimum value 88 | self.old_params[-1].data = \ 89 | torch.clamp(self.old_params[-1], self.min_log_std).data 90 | 91 | # Main functions 92 | # ============================================ 93 | def get_action(self, observation): 94 | o = np.float32(observation.reshape(1, -1)) 95 | self.obs_var.data = torch.from_numpy(o) 96 | mean = self.model(self.obs_var).data.numpy().ravel() 97 | if np.random.uniform() < self.eps: 98 | action = np.random.uniform(0, 1, self.m) 99 | else: 100 | noise = np.exp(self.log_std_val) * np.random.randn(self.m) 101 | action = mean + noise 102 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}] 103 | 104 | def mean_LL(self, observations, actions, model=None, log_std=None): 105 | model = self.model if model is None else model 106 | log_std = self.log_std if log_std is None else log_std 107 | if type(observations) is not torch.Tensor: 108 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False) 109 | else: 110 | obs_var = observations 
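# NOTE: the log-likelihood computed below is the standard diagonal-Gaussian density
# LL = -0.5 * sum_i ((a_i - mu_i) / sigma_i)^2 - sum_i log(sigma_i) - 0.5 * m * log(2*pi),
# where mu = model(obs), sigma = exp(log_std), and m is the action dimension.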
111 | if type(actions) is not torch.Tensor: 112 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False) 113 | else: 114 | act_var = actions 115 | mean = model(obs_var) 116 | zs = (act_var - mean) / torch.exp(log_std) 117 | LL = - 0.5 * torch.sum(zs ** 2, dim=1) + \ 118 | - torch.sum(log_std) + \ 119 | - 0.5 * self.m * np.log(2 * np.pi) 120 | return mean, LL 121 | 122 | def log_likelihood(self, observations, actions, model=None, log_std=None): 123 | mean, LL = self.mean_LL(observations, actions, model, log_std) 124 | return LL.data.numpy() 125 | 126 | def old_dist_info(self, observations, actions): 127 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std) 128 | return [LL, mean, self.old_log_std] 129 | 130 | def new_dist_info(self, observations, actions): 131 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std) 132 | return [LL, mean, self.log_std] 133 | 134 | def likelihood_ratio(self, new_dist_info, old_dist_info): 135 | LL_old = old_dist_info[0] 136 | LL_new = new_dist_info[0] 137 | LR = torch.exp(LL_new - LL_old) 138 | return LR 139 | 140 | def mean_kl(self, new_dist_info, old_dist_info): 141 | old_log_std = old_dist_info[2] 142 | new_log_std = new_dist_info[2] 143 | old_std = torch.exp(old_log_std) 144 | new_std = torch.exp(new_log_std) 145 | old_mean = old_dist_info[1] 146 | new_mean = new_dist_info[1] 147 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2 148 | Dr = 2 * new_std ** 2 + 1e-8 149 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1) 150 | return torch.mean(sample_kl) 151 | -------------------------------------------------------------------------------- /mjrl/mjrl/policies/mpc_actor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from trajopt.utils import gather_paths_parallel 3 | 4 | 5 | class MPCActor(object): 6 | def __init__(self, env, H, paths_per_cpu, 7 | num_cpu=1, 8 | kappa=1.0, 9 | gamma=1.0, 10 | mean=None, 11 | filter_coefs=None, 12 | seed=123, 13 | ): 14 | 15 | self.env, self.seed = env, seed 16 | self.n, self.m = env.observation_dim, env.action_dim 17 | self.H, self.paths_per_cpu, self.num_cpu = H, paths_per_cpu, num_cpu 18 | 19 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma 20 | if mean is None: 21 | self.mean = np.zeros(self.m) 22 | if filter_coefs is None: 23 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0] 24 | 25 | self.env.reset() 26 | self.env.set_seed(seed) 27 | self.env.reset(seed=seed) 28 | self.act_sequence = np.ones((self.H, self.m)) * self.mean 29 | self.ctr = 1 30 | 31 | def score_trajectory(self, paths): 32 | scores = np.zeros(len(paths)) 33 | for i in range(len(paths)): 34 | scores[i] = 0.0 35 | for t in range(paths[i]["rewards"].shape[0]): 36 | scores[i] += (self.gamma**t)*paths[i]["rewards"][t] 37 | return scores 38 | 39 | def get_action(self, env_state): 40 | # Set to env_state 41 | # Shoot trajectories 42 | # Return optimal action 43 | seed = self.seed + self.ctr * 1000 44 | paths = gather_paths_parallel(self.env.env_id, 45 | env_state, 46 | self.act_sequence, 47 | self.filter_coefs, 48 | seed, 49 | self.paths_per_cpu, 50 | self.num_cpu, 51 | ) 52 | 53 | num_traj = len(paths) 54 | R = self.score_trajectory(paths) 55 | S = np.exp(self.kappa*(R-np.max(R))) 56 | act = np.sum([paths[i]["actions"][0] * S[i] for i in range(num_traj)], axis=0) 57 | act = act / (np.sum(S) + 1e-6) 58 | return act 
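# ----------------------------------------------------------------------------
# Illustrative usage sketch (an assumption, not part of the original file): it
# presumes an mjrl-style env wrapper exposing observation_dim, action_dim,
# env_id, set_seed(), and reset(), a get_env_state() method on the underlying
# MuJoCo env, and that the external `trajopt` package is installed.
#
#   from mjrl.utils.gym_env import GymEnv
#   e = GymEnv('mjrl_point_mass-v0')
#   actor = MPCActor(e, H=16, paths_per_cpu=32, num_cpu=1, kappa=5.0, gamma=1.0, seed=123)
#   state = e.env.env.get_env_state()   # hypothetical accessor path; depends on wrapper nesting
#   action = actor.get_action(state)
# ----------------------------------------------------------------------------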
-------------------------------------------------------------------------------- /mjrl/mjrl/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/samplers/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jdchang1/milo/0bd49fc2287329f4c3e11c77bd37dd060c0aa1a7/mjrl/mjrl/utils/__init__.py -------------------------------------------------------------------------------- /mjrl/mjrl/utils/cg_solve.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def cg_solve(f_Ax, b, x_0=None, cg_iters=10, residual_tol=1e-10): 4 | x = np.zeros_like(b) #if x_0 is None else x_0 5 | r = b.copy() #if x_0 is None else b-f_Ax(x_0) 6 | p = r.copy() 7 | rdotr = r.dot(r) 8 | 9 | for i in range(cg_iters): 10 | z = f_Ax(p) 11 | v = rdotr / p.dot(z) 12 | x += v * p 13 | r -= v * z 14 | newrdotr = r.dot(r) 15 | mu = newrdotr / rdotr 16 | p = r + mu * p 17 | 18 | rdotr = newrdotr 19 | if rdotr < residual_tol: 20 | break 21 | 22 | return x 23 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/fc_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class FCNetwork(nn.Module): 7 | def __init__(self, obs_dim, act_dim, 8 | hidden_sizes=(64,64), 9 | nonlinearity='tanh', # either 'tanh' or 'relu' 10 | in_shift = None, 11 | in_scale = None, 12 | out_shift = None, 13 | out_scale = None): 14 | super(FCNetwork, self).__init__() 15 | 16 | self.obs_dim = obs_dim 17 | self.act_dim = act_dim 18 | assert type(hidden_sizes) == tuple 19 | self.layer_sizes = (obs_dim, ) + hidden_sizes + (act_dim, ) 20 | self.set_transformations(in_shift, in_scale, out_shift, out_scale) 21 | 22 | # hidden layers 23 | self.fc_layers = nn.ModuleList([nn.Linear(self.layer_sizes[i], self.layer_sizes[i+1]) \ 24 | for i in range(len(self.layer_sizes) -1)]) 25 | self.nonlinearity = torch.relu if nonlinearity == 'relu' else torch.tanh 26 | 27 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None): 28 | # store native scales that can be used for resets 29 | self.transformations = dict(in_shift=in_shift, 30 | in_scale=in_scale, 31 | out_shift=out_shift, 32 | out_scale=out_scale 33 | ) 34 | self.in_shift = torch.from_numpy(np.float32(in_shift)) if in_shift is not None else torch.zeros(self.obs_dim) 35 | self.in_scale = torch.from_numpy(np.float32(in_scale)) if in_scale is not None else torch.ones(self.obs_dim) 36 | self.out_shift = torch.from_numpy(np.float32(out_shift)) if out_shift is not None else torch.zeros(self.act_dim) 37 | self.out_scale = torch.from_numpy(np.float32(out_scale)) if out_scale is not None else torch.ones(self.act_dim) 38 | 39 | def forward(self, x): 40 | # TODO(Aravind): Remove clamping to CPU 41 | # This is a temp change that should be fixed shortly 42 | if x.is_cuda: 43 | out = x.to('cpu') 44 | else: 45 | out = x 46 | out = (out - self.in_shift)/(self.in_scale + 1e-8) 47 | for i in range(len(self.fc_layers)-1): 48 | out = self.fc_layers[i](out) 49 | out = self.nonlinearity(out) 50 | out = self.fc_layers[-1](out) 51 | out = out * self.out_scale + 
self.out_shift 52 | return out 53 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/get_environment.py: -------------------------------------------------------------------------------- 1 | """ 2 | convenience function to generate env 3 | useful if we want some procedural env generation 4 | """ 5 | 6 | import gym 7 | from mjrl.utils.gym_env import GymEnv 8 | 9 | def get_environment(env_name=None, **kwargs): 10 | if env_name is None: print("Need to specify environment name") 11 | e = GymEnv(env_name) 12 | # can make procedural modifications here if needed using kwargs 13 | return e 14 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/logger.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import scipy 6 | import pickle 7 | import os 8 | import csv 9 | 10 | class DataLog: 11 | 12 | def __init__(self): 13 | self.log = {} 14 | self.max_len = 0 15 | 16 | def log_kv(self, key, value): 17 | # logs the (key, value) pair 18 | 19 | # TODO: This implementation is error-prone: 20 | # it would be NOT aligned if some keys are missing during one iteration. 21 | if key not in self.log: 22 | self.log[key] = [] 23 | self.log[key].append(value) 24 | if len(self.log[key]) > self.max_len: 25 | self.max_len = self.max_len + 1 26 | 27 | def save_log(self, save_path): 28 | # TODO: Validate all lengths are the same. 29 | pickle.dump(self.log, open(save_path + '/log.pickle', 'wb')) 30 | with open(save_path + '/log.csv', 'w') as csv_file: 31 | fieldnames = list(self.log.keys()) 32 | if 'iteration' not in fieldnames: 33 | fieldnames = ['iteration'] + fieldnames 34 | 35 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames) 36 | writer.writeheader() 37 | for row in range(self.max_len): 38 | row_dict = {'iteration': row} 39 | for key in self.log.keys(): 40 | if row < len(self.log[key]): 41 | row_dict[key] = self.log[key][row] 42 | writer.writerow(row_dict) 43 | 44 | def get_current_log(self): 45 | row_dict = {} 46 | for key in self.log.keys(): 47 | # TODO: this is very error-prone (alignment is not guaranteed) 48 | row_dict[key] = self.log[key][-1] 49 | return row_dict 50 | 51 | def shrink_to(self, num_entries): 52 | for key in self.log.keys(): 53 | self.log[key] = self.log[key][:num_entries] 54 | 55 | self.max_len = num_entries 56 | assert min([len(series) for series in self.log.values()]) == \ 57 | max([len(series) for series in self.log.values()]) 58 | 59 | def read_log(self, log_path): 60 | assert log_path.endswith('log.csv') 61 | 62 | with open(log_path) as csv_file: 63 | reader = csv.DictReader(csv_file) 64 | listr = list(reader) 65 | keys = reader.fieldnames 66 | data = {} 67 | for key in keys: 68 | data[key] = [] 69 | for row, row_dict in enumerate(listr): 70 | for key in keys: 71 | try: 72 | data[key].append(eval(row_dict[key])) 73 | except: 74 | print("ERROR on reading key {}: {}".format(key, row_dict[key])) 75 | 76 | if 'iteration' in data and data['iteration'][-1] != row: 77 | raise RuntimeError("Iteration %d mismatch -- possibly corrupted logfile?" 
% row) 78 | 79 | self.log = data 80 | self.max_len = max(len(v) for k, v in self.log.items()) 81 | print("Log read from {}: had {} entries".format(log_path, self.max_len)) 82 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/make_train_plots.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import scipy 6 | import csv 7 | from mjrl.utils.logger import DataLog 8 | import argparse 9 | 10 | def make_train_plots(log = None, 11 | log_path = None, 12 | keys = None, 13 | save_loc = None, 14 | sample_key = 'num_samples', 15 | x_scale = 1.0, 16 | y_scale = 1.0): 17 | if log is None and log_path is None: 18 | print("Need to provide either the log or path to a log file") 19 | if log is None: 20 | logger = DataLog() 21 | logger.read_log(log_path) 22 | log = logger.log 23 | # make plots for specified keys 24 | for key in keys: 25 | if key in log.keys(): 26 | fig = plt.figure(figsize=(10,6)) 27 | ax1 = fig.add_subplot(111) 28 | try: 29 | cum_samples = [np.sum(log[sample_key][:i]) * x_scale for i in range(len(log[sample_key]))] 30 | ax1.plot(cum_samples, [elem * y_scale for elem in log[key]]) 31 | ax1.set_xlabel('samples') 32 | # mark iteration on the top axis 33 | ax2 = ax1.twiny() 34 | ax2.set_xlabel('iterations', color=(.7,.7,.7)) 35 | ax2.tick_params(axis='x', labelcolor=(.7,.7,.7)) 36 | ax2.set_xlim([0, len(log[key])]) 37 | except: 38 | ax1.plot(log[key]) 39 | ax1.set_xlabel('iterations') 40 | ax1.set_title(key) 41 | plt.savefig(save_loc+'/'+key+'.png', dpi=100) 42 | plt.close() 43 | 44 | # MAIN ========================================================= 45 | # Example: python make_train_plots.py --log_path logs/log.csv --keys eval_score rollout_score save_loc logs 46 | def main(): 47 | # Parse arguments 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument( 50 | '-l', '--log_path', type=str, required=True, help='path file to log.csv') 51 | parser.add_argument( 52 | '-k', '--keys', type=str, action='append', nargs='+', required=True, help='keys to plot') 53 | parser.add_argument( 54 | '-s', '--save_loc', type=str, default='', help='Path for logs') 55 | args = parser.parse_args() 56 | 57 | make_train_plots(log_path=args.log_path, keys=args.keys[0], save_loc=args.save_loc) 58 | 59 | if __name__ == '__main__': 60 | main() 61 | 62 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/optimize_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | def fit_data(model, x, y, optimizer, loss_func, batch_size, epochs): 8 | """ 9 | :param model: pytorch model of form y_hat = f(x) (class) 10 | :param x: inputs to the model (tensor) 11 | :param y: desired outputs or targets (tensor) 12 | :param optimizer: optimizer to be used (class) 13 | :param loss_func: loss criterion (callable) 14 | :param batch_size: mini-batch size for optimization (int) 15 | :param epochs: number of epochs (int) 16 | :return: 17 | """ 18 | 19 | num_samples = x.shape[0] 20 | epoch_losses = [] 21 | for ep in range(epochs): 22 | rand_idx = torch.LongTensor(np.random.permutation(num_samples)) 23 | ep_loss = 0.0 24 | num_steps = int(num_samples / batch_size) - 1 25 | for mb in range(num_steps): 26 | data_idx = rand_idx[mb*batch_size:(mb+1)*batch_size] 27 | batch_x = 
x[data_idx] 28 | batch_y = y[data_idx] 29 | optimizer.zero_grad() 30 | yhat = model(batch_x) 31 | loss = loss_func(yhat, batch_y) 32 | loss.backward() 33 | optimizer.step() 34 | ep_loss += loss.detach() 35 | epoch_losses.append(ep_loss.to('cpu').data.numpy().ravel() / num_steps) 36 | return epoch_losses 37 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/process_samples.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def compute_returns(paths, gamma): 4 | for path in paths: 5 | path["returns"] = discount_sum(path["rewards"], gamma) 6 | 7 | def compute_advantages(paths, baseline, gamma, gae_lambda=None, normalize=False): 8 | # compute and store returns, advantages, and baseline 9 | # standard mode 10 | if gae_lambda == None or gae_lambda < 0.0 or gae_lambda > 1.0: 11 | for path in paths: 12 | path["baseline"] = baseline.predict(path) 13 | path["advantages"] = path["returns"] - path["baseline"] 14 | if normalize: 15 | alladv = np.concatenate([path["advantages"] for path in paths]) 16 | mean_adv = alladv.mean() 17 | std_adv = alladv.std() 18 | for path in paths: 19 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8) 20 | # GAE mode 21 | else: 22 | for path in paths: 23 | b = path["baseline"] = baseline.predict(path) 24 | if b.ndim == 1: 25 | b1 = np.append(path["baseline"], 0.0 if path["terminated"] else b[-1]) 26 | else: 27 | b1 = np.vstack((b, np.zeros(b.shape[1]) if path["terminated"] else b[-1])) 28 | td_deltas = path["rewards"] + gamma*b1[1:] - b1[:-1] 29 | path["advantages"] = discount_sum(td_deltas, gamma*gae_lambda) 30 | if normalize: 31 | alladv = np.concatenate([path["advantages"] for path in paths]) 32 | mean_adv = alladv.mean() 33 | std_adv = alladv.std() 34 | for path in paths: 35 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8) 36 | 37 | def discount_sum(x, gamma, terminal=0.0): 38 | y = [] 39 | run_sum = terminal 40 | for t in range( len(x)-1, -1, -1): 41 | run_sum = x[t] + gamma*run_sum 42 | y.append(run_sum) 43 | 44 | return np.array(y[::-1]) -------------------------------------------------------------------------------- /mjrl/mjrl/utils/tensor_utils.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | import numpy as np 4 | 5 | 6 | def flatten_tensors(tensors): 7 | if len(tensors) > 0: 8 | return np.concatenate([np.reshape(x, [-1]) for x in tensors]) 9 | else: 10 | return np.asarray([]) 11 | 12 | 13 | def unflatten_tensors(flattened, tensor_shapes): 14 | tensor_sizes = list(map(np.prod, tensor_shapes)) 15 | indices = np.cumsum(tensor_sizes)[:-1] 16 | return [np.reshape(pair[0], pair[1]) for pair in zip(np.split(flattened, indices), tensor_shapes)] 17 | 18 | 19 | def pad_tensor(x, max_len, mode='zero'): 20 | padding = np.zeros_like(x[0]) 21 | if mode == 'last': 22 | padding = x[-1] 23 | return np.concatenate([ 24 | x, 25 | np.tile(padding, (max_len - len(x),) + (1,) * np.ndim(x[0])) 26 | ]) 27 | 28 | 29 | def pad_tensor_n(xs, max_len): 30 | ret = np.zeros((len(xs), max_len) + xs[0].shape[1:], dtype=xs[0].dtype) 31 | for idx, x in enumerate(xs): 32 | ret[idx][:len(x)] = x 33 | return ret 34 | 35 | 36 | def pad_tensor_dict(tensor_dict, max_len, mode='zero'): 37 | keys = list(tensor_dict.keys()) 38 | ret = dict() 39 | for k in keys: 40 | if isinstance(tensor_dict[k], dict): 41 | ret[k] = pad_tensor_dict(tensor_dict[k], max_len, mode=mode) 42 | else: 43 | ret[k] = 
pad_tensor(tensor_dict[k], max_len, mode=mode) 44 | return ret 45 | 46 | 47 | def flatten_first_axis_tensor_dict(tensor_dict): 48 | keys = list(tensor_dict.keys()) 49 | ret = dict() 50 | for k in keys: 51 | if isinstance(tensor_dict[k], dict): 52 | ret[k] = flatten_first_axis_tensor_dict(tensor_dict[k]) 53 | else: 54 | old_shape = tensor_dict[k].shape 55 | ret[k] = tensor_dict[k].reshape((-1,) + old_shape[2:]) 56 | return ret 57 | 58 | 59 | def high_res_normalize(probs): 60 | return [x / sum(map(float, probs)) for x in list(map(float, probs))] 61 | 62 | 63 | def stack_tensor_list(tensor_list): 64 | return np.array(tensor_list) 65 | # tensor_shape = np.array(tensor_list[0]).shape 66 | # if tensor_shape is tuple(): 67 | # return np.array(tensor_list) 68 | # return np.vstack(tensor_list) 69 | 70 | 71 | def stack_tensor_dict_list(tensor_dict_list): 72 | """ 73 | Stack a list of dictionaries of {tensors or dictionary of tensors}. 74 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}. 75 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors} 76 | """ 77 | keys = list(tensor_dict_list[0].keys()) 78 | ret = dict() 79 | for k in keys: 80 | example = tensor_dict_list[0][k] 81 | if isinstance(example, dict): 82 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list]) 83 | else: 84 | v = stack_tensor_list([x[k] for x in tensor_dict_list]) 85 | ret[k] = v 86 | return ret 87 | 88 | 89 | def concat_tensor_list_subsample(tensor_list, f): 90 | return np.concatenate( 91 | [t[np.random.choice(len(t), int(np.ceil(len(t) * f)), replace=False)] for t in tensor_list], axis=0) 92 | 93 | 94 | def concat_tensor_dict_list_subsample(tensor_dict_list, f): 95 | keys = list(tensor_dict_list[0].keys()) 96 | ret = dict() 97 | for k in keys: 98 | example = tensor_dict_list[0][k] 99 | if isinstance(example, dict): 100 | v = concat_tensor_dict_list_subsample([x[k] for x in tensor_dict_list], f) 101 | else: 102 | v = concat_tensor_list_subsample([x[k] for x in tensor_dict_list], f) 103 | ret[k] = v 104 | return ret 105 | 106 | 107 | def concat_tensor_list(tensor_list): 108 | return np.concatenate(tensor_list, axis=0) 109 | 110 | 111 | def concat_tensor_dict_list(tensor_dict_list): 112 | keys = list(tensor_dict_list[0].keys()) 113 | ret = dict() 114 | for k in keys: 115 | example = tensor_dict_list[0][k] 116 | if isinstance(example, dict): 117 | v = concat_tensor_dict_list([x[k] for x in tensor_dict_list]) 118 | else: 119 | v = concat_tensor_list([x[k] for x in tensor_dict_list]) 120 | ret[k] = v 121 | return ret 122 | 123 | 124 | def split_tensor_dict_list(tensor_dict): 125 | keys = list(tensor_dict.keys()) 126 | ret = None 127 | for k in keys: 128 | vals = tensor_dict[k] 129 | if isinstance(vals, dict): 130 | vals = split_tensor_dict_list(vals) 131 | if ret is None: 132 | ret = [{k: v} for v in vals] 133 | else: 134 | for v, cur_dict in zip(vals, ret): 135 | cur_dict[k] = v 136 | return ret 137 | 138 | 139 | def truncate_tensor_list(tensor_list, truncated_len): 140 | return tensor_list[:truncated_len] 141 | 142 | 143 | def truncate_tensor_dict(tensor_dict, truncated_len): 144 | ret = dict() 145 | for k, v in tensor_dict.items(): 146 | if isinstance(v, dict): 147 | ret[k] = truncate_tensor_dict(v, truncated_len) 148 | else: 149 | ret[k] = truncate_tensor_list(v, truncated_len) 150 | return ret 151 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/train_agent.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | #logging.disable(logging.CRITICAL) 3 | 4 | from tabulate import tabulate 5 | from mjrl.utils.make_train_plots import make_train_plots 6 | from mjrl.utils.gym_env import GymEnv 7 | from mjrl.samplers.core import sample_paths 8 | import numpy as np 9 | import pickle 10 | import time as timer 11 | import os 12 | import copy 13 | 14 | 15 | def _load_latest_policy_and_logs(agent, *, policy_dir, logs_dir): 16 | """Loads the latest policy. 17 | Returns the next step number to begin with. 18 | """ 19 | assert os.path.isdir(policy_dir), str(policy_dir) 20 | assert os.path.isdir(logs_dir), str(logs_dir) 21 | 22 | log_csv_path = os.path.join(logs_dir, 'log.csv') 23 | if not os.path.exists(log_csv_path): 24 | return 0 # fresh start 25 | 26 | print("Reading: {}".format(log_csv_path)) 27 | agent.logger.read_log(log_csv_path) 28 | last_step = agent.logger.max_len - 1 29 | if last_step <= 0: 30 | return 0 # fresh start 31 | 32 | 33 | # find latest policy/baseline 34 | i = last_step 35 | while i >= 0: 36 | policy_path = os.path.join(policy_dir, 'policy_{}.pickle'.format(i)) 37 | baseline_path = os.path.join(policy_dir, 'baseline_{}.pickle'.format(i)) 38 | 39 | if not os.path.isfile(policy_path): 40 | i = i -1 41 | continue 42 | else: 43 | print("Loaded last saved iteration: {}".format(i)) 44 | 45 | with open(policy_path, 'rb') as fp: 46 | agent.policy = pickle.load(fp) 47 | with open(baseline_path, 'rb') as fp: 48 | agent.baseline = pickle.load(fp) 49 | 50 | # additional 51 | # global_status_path = os.path.join(policy_dir, 'global_status.pickle') 52 | # with open(global_status_path, 'rb') as fp: 53 | # agent.load_global_status( pickle.load(fp) ) 54 | 55 | agent.logger.shrink_to(i + 1) 56 | assert agent.logger.max_len == i + 1 57 | return agent.logger.max_len 58 | 59 | # cannot find any saved policy 60 | raise RuntimeError("Log file exists, but cannot find any saved policy.") 61 | 62 | def train_agent(job_name, agent, 63 | seed = 0, 64 | niter = 101, 65 | gamma = 0.995, 66 | gae_lambda = None, 67 | num_cpu = 1, 68 | sample_mode = 'trajectories', 69 | num_traj = 50, 70 | num_samples = 50000, # has precedence, used with sample_mode = 'samples' 71 | save_freq = 10, 72 | evaluation_rollouts = None, 73 | plot_keys = ['stoc_pol_mean'], 74 | reward_kwargs = None, 75 | adroit=False 76 | ): 77 | 78 | np.random.seed(seed) 79 | if os.path.isdir(job_name) == False: 80 | os.mkdir(job_name) 81 | previous_dir = os.getcwd() 82 | os.chdir(job_name) # important! we are now in the directory to save data 83 | if os.path.isdir('iterations') == False: os.mkdir('iterations') 84 | if os.path.isdir('logs') == False and agent.save_logs == True: os.mkdir('logs') 85 | best_policy = copy.deepcopy(agent.policy) 86 | best_perf = -1e8 87 | train_curve = best_perf*np.ones(niter) 88 | mean_pol_perf = 0.0 89 | e = GymEnv(agent.env.env.spec.id) 90 | 91 | # Load from any existing checkpoint, policy, statistics, etc. 92 | # Why no checkpointing.. 
:( 93 | #i_start = _load_latest_policy_and_logs(agent, 94 | # policy_dir='iterations', 95 | # logs_dir='logs') 96 | #if i_start: 97 | # print("Resuming from an existing job folder ...") 98 | i_start = 0 99 | for i in range(i_start, niter): 100 | print("......................................................................................") 101 | print("ITERATION : %i " % i) 102 | 103 | if train_curve[i-1] > best_perf: 104 | best_policy = copy.deepcopy(agent.policy) 105 | best_perf = train_curve[i-1] 106 | 107 | N = num_traj if sample_mode == 'trajectories' else num_samples 108 | 109 | args = dict(N=N, sample_mode=sample_mode, gamma=gamma, gae_lambda=gae_lambda, num_cpu=num_cpu, reward_kwargs=reward_kwargs) 110 | # NOTE: Samples are inputed here 111 | stats = agent.train_step(**args) 112 | train_curve[i] = stats[0] 113 | 114 | if evaluation_rollouts is not None and evaluation_rollouts > 0: 115 | print("Performing evaluation rollouts ........") 116 | eval_paths = sample_paths(num_traj=evaluation_rollouts, policy=agent.policy, num_cpu=num_cpu, 117 | env=e.env_id, eval_mode=True, base_seed=seed) 118 | mean_pol_perf = np.mean([np.sum(path['rewards']) for path in eval_paths]) 119 | if agent.save_logs: 120 | agent.logger.log_kv('eval_score', mean_pol_perf) 121 | 122 | if save_freq != 0 and i > 0 and i % save_freq == 0: 123 | if agent.save_logs: 124 | agent.logger.save_log('logs/') 125 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/') 126 | policy_file = 'policy_%i.pickle' % i 127 | baseline_file = 'baseline_%i.pickle' % i 128 | pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb')) 129 | pickle.dump(agent.baseline, open('iterations/' + baseline_file, 'wb')) 130 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb')) 131 | # pickle.dump(agent.global_status, open('iterations/global_status.pickle', 'wb')) 132 | 133 | # print results to console 134 | if i == 0: 135 | result_file = open('results.txt', 'w') 136 | print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n") 137 | result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n") 138 | result_file.close() 139 | if not adroit: 140 | print("[ %s ] %4i %5.2f %5.2f %5.2f " % (timer.asctime(timer.localtime(timer.time())), 141 | i, train_curve[i], mean_pol_perf, best_perf)) 142 | else: 143 | print("[ %s ] %4i %5.2f %5.2f %5.2f %5.2f" % (timer.asctime(timer.localtime(timer.time())), 144 | i, train_curve[i], mean_pol_perf, best_perf, stats[-1])) 145 | 146 | result_file = open('results.txt', 'a') 147 | result_file.write("%4i %5.2f %5.2f %5.2f \n" % (i, train_curve[i], mean_pol_perf, best_perf)) 148 | result_file.close() 149 | if agent.save_logs: 150 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1, 151 | agent.logger.get_current_log().items())) 152 | print(tabulate(print_data)) 153 | 154 | # final save 155 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb')) 156 | if agent.save_logs: 157 | agent.logger.save_log('logs/') 158 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/') 159 | os.chdir(previous_dir) 160 | -------------------------------------------------------------------------------- /mjrl/mjrl/utils/visualize_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import mjrl.envs 3 | import click 4 | import os 5 | import gym 6 | import numpy as np 7 | import pickle 8 | from mjrl.utils.gym_env import GymEnv 9 | from mjrl.policies.gaussian_mlp import MLP 10 | import 
trajopt.envs 11 | 12 | DESC = ''' 13 | Helper script to visualize policy (in mjrl format).\n 14 | USAGE:\n 15 | Visualizes policy on the env\n 16 | $ python utils/visualize_policy --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n 17 | ''' 18 | 19 | # MAIN ========================================================= 20 | @click.command(help=DESC) 21 | @click.option('--env_name', type=str, help='environment to load', required= True) 22 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None) 23 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation') 24 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123) 25 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10) 26 | 27 | def main(env_name, policy, mode, seed, episodes): 28 | e = GymEnv(env_name) 29 | e.set_seed(seed) 30 | if policy is not None: 31 | pi = pickle.load(open(policy, 'rb')) 32 | else: 33 | pi = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=-1.0) 34 | # render policy 35 | e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode) 36 | 37 | if __name__ == '__main__': 38 | main() 39 | 40 | -------------------------------------------------------------------------------- /mjrl/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from setuptools import setup, find_packages 4 | 5 | print("Installing mjrl. \n Package intended for use with provided conda env. See setup instructions here: https://github.com/aravindr93/mjrl/tree/master/setup") 6 | 7 | if sys.version_info.major != 3: 8 | print("This Python is only compatible with Python 3, but you are running " 9 | "Python {}. The installation will likely fail.".format(sys.version_info.major)) 10 | 11 | def read(fname): 12 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 13 | 14 | setup( 15 | name='mjrl', 16 | version='1.0.0', 17 | packages=find_packages(), 18 | description='RL algorithms for environments in MuJoCo', 19 | long_description=read('README.md'), 20 | url='https://github.com/aravindr93/mjrl.git', 21 | author='Aravind Rajeswaran', 22 | ) 23 | -------------------------------------------------------------------------------- /mjrl/setup/README.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | A short guide to install this package is below. The package relies on `mujoco-py` which might be the trickiest part of the installation. See `known issues` below and also instructions from the mujoco-py [page](https://github.com/openai/mujoco-py) if you are stuck with mujoco-py installation. 4 | 5 | The package can handle both `MuJoCo v1.5` as well as `MuJoCo v2.0`, but the former is not supported for future updates. We encourage you to use v2.0. 6 | 7 | ## Linux 8 | 9 | - Download MuJoCo v2.0 binaries from the official [website](http://www.mujoco.org/) and also obtain the license key. 10 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200`, and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`. Note that unzip of the MuJoCo binaries will generate `mujoco200_linux`. You need to rename the directory and place it at `~/.mujoco/mujoco200`. 
11 | - Install osmesa related dependencies: 12 | ``` 13 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev build-essential libglfw3 14 | ``` 15 | - Update `bashrc` by adding the following lines and source it 16 | ``` 17 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH" 18 | export MUJOCO_PY_FORCE_CPU=True 19 | alias MJPL='LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so:/usr/lib/nvidia-384/libGL.so' 20 | ``` 21 | - Install this package using 22 | ``` 23 | $ conda update conda 24 | $ cd path/to/mjrl 25 | $ conda env create -f setup/env.yml 26 | $ source activate mjrl-env 27 | $ pip install -e . 28 | ``` 29 | - *NOTE 1:* If there are issues with the installation of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly based on the specific version of CUDA (or CPU-only) you have. 30 | 31 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info. 32 | 33 | ## Mac OS 34 | 35 | - Download MuJoCo binaries from the official [website](http://www.mujoco.org/) and also obtain the license key. 36 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200` (rename unzipped directory to this), and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`. 37 | - Update `bashrc` by adding the following lines and source it 38 | ``` 39 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH" 40 | ``` 41 | - Install this package using 42 | ``` 43 | $ conda update conda 44 | $ cd path/to/mjrl 45 | $ conda env create -f setup/env.yml 46 | $ source activate mjrl-env 47 | $ pip install -e . 48 | ``` 49 | 50 | - *NOTE 1:* If there are issues with the installation of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly. 51 | 52 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info. 53 | 54 | 55 | ## Known Issues 56 | 57 | - Visualization in Linux: If the Linux system has a GPU, then mujoco-py does not automatically preload the correct drivers. We added an alias `MJPL` in `bashrc` (see instructions above), which stands for MuJoCo pre-load. When running any Python script that requires rendering, prepend the execution with `MJPL`. 58 | ``` 59 | $ MJPL python script.py 60 | ``` 61 | 62 | - Errors related to osmesa during installation. This is a `mujoco-py` build error and would likely go away if the following command is used before creating the conda environment. If the problem still persists, please contact the developers of mujoco-py. 63 | ``` 64 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev 65 | ``` 66 | 67 | - If conda environment creation gets interrupted for some reason, you can resume it with the following: 68 | ``` 69 | $ conda env update -n mjrl-env -f setup/env.yml 70 | ``` 71 | 72 | - GCC error in Mac OS: If you get a GCC error from mujoco-py, you can get the correct version mujoco-py expects with `brew install gcc --without-multilib`. This may require uninstalling other versions of GCC that may have been previously installed with `brew remove gcc@6` for example. You can see which brew packages were already installed with `brew list`. 
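## Verifying the install

A quick way to confirm that `mujoco-py`, `gym`, and this package are wired up correctly is to build one of the bundled environments from inside the activated `mjrl-env` environment. The snippet below is a minimal sketch (it assumes the MuJoCo binaries and `mjkey.txt` are in place as described above) and exercises the same `GymEnv` wrapper used by the scripts in `tests/`.
```
import mujoco_py                   # import should fail here if the MuJoCo binaries or license key are not found
import mjrl.envs                   # registers the mjrl_* environments with gym
from mjrl.utils.gym_env import GymEnv

e = GymEnv('mjrl_point_mass-v0')   # same environment used by tests/point_mass_test.py
e.set_seed(123)
obs = e.reset()
print("horizon:", e.horizon, "| initial observation shape:", obs.shape)
```
If this prints a horizon and an observation shape without errors, `tests/point_mass_test.py` is a good next smoke test.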
73 | 74 | -------------------------------------------------------------------------------- /mjrl/setup/env.yml: -------------------------------------------------------------------------------- 1 | name: mjrl-env 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - python=3.7 7 | - pip 8 | - ipython 9 | - mkl-service 10 | - pytorch==1.4 11 | - tabulate 12 | - termcolor 13 | - torchvision 14 | - patchelf 15 | - pip: 16 | - click 17 | - cloudpickle 18 | - gym==0.13 19 | - ipdb 20 | - matplotlib 21 | - mujoco-py<2.1,>=2.0 22 | - pip 23 | - pyyaml 24 | - tqdm 25 | - wheel 26 | - scipy 27 | - transforms3d 28 | -------------------------------------------------------------------------------- /mjrl/tests/point_mass_test.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 4 | from mjrl.baselines.mlp_baseline import MLPBaseline 5 | from mjrl.algos.npg_cg import NPG 6 | from mjrl.utils.train_agent import train_agent 7 | import mjrl.envs 8 | import time as timer 9 | SEED = 500 10 | 11 | e = GymEnv('mjrl_point_mass-v0') 12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 13 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=10, learn_rate=1e-3) 14 | agent = NPG(e, policy, baseline, normalized_step_size=0.05, seed=SEED, save_logs=True) 15 | 16 | ts = timer.time() 17 | train_agent(job_name='point_mass_exp1', 18 | agent=agent, 19 | seed=SEED, 20 | niter=50, 21 | gamma=0.95, 22 | gae_lambda=0.97, 23 | num_cpu=1, 24 | sample_mode='trajectories', 25 | num_traj=40, # samples = 40*25 = 1000 26 | save_freq=5, 27 | evaluation_rollouts=None, 28 | plot_keys=['stoc_pol_mean', 'running_score']) 29 | print("time taken = %f" % (timer.time()-ts)) 30 | -------------------------------------------------------------------------------- /mjrl/tests/visualizer_test.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 4 | from mjrl.baselines.mlp_baseline import MLPBaseline 5 | from mjrl.algos.npg_cg import NPG 6 | from mjrl.utils.train_agent import train_agent 7 | import mjrl.envs 8 | import time as timer 9 | SEED = 500 10 | 11 | e = GymEnv('mjrl_point_mass-v0') 12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 13 | baseline = QuadraticBaseline(e.spec) 14 | agent = NPG(e, policy, baseline, normalized_step_size=0.5, seed=SEED, save_logs=True) 15 | 16 | ts = timer.time() 17 | train_agent(job_name='vis_exp', 18 | agent=agent, 19 | seed=SEED, 20 | niter=10, 21 | gamma=0.95, 22 | gae_lambda=0.97, 23 | num_cpu=1, 24 | sample_mode='trajectories', 25 | num_traj=100, 26 | save_freq=5, 27 | evaluation_rollouts=None) 28 | print("time taken = %f" % (timer.time()-ts)) 29 | e.visualize_policy(policy, num_episodes=5, horizon=e.horizon, mode='exploration') 30 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.4.1 2 | mujoco-py==2.0.2.13 3 | numpy==1.20.2 4 | tabulate==0.8.9 5 | tensorboard==2.5.0 6 | tensorboard-data-server==0.6.1 7 | tensorboard-plugin-wit==1.8.0 8 | torch==1.8.1 9 | torchaudio==0.8.0a0+e4e171a 10 | torchvision==0.9.1 11 | tqdm==4.60.0 12 | 
--------------------------------------------------------------------------------