├── .gitignore
├── LICENSE
├── README.md
├── examples
│   ├── README.md
│   ├── behavior_clone.py
│   ├── example_configs
│   │   ├── hopper_npg.txt
│   │   ├── swimmer_npg.txt
│   │   └── swimmer_ppo.txt
│   ├── linear_nn_comparison.py
│   └── policy_opt_job_script.py
├── mjrl
│   ├── __init__.py
│   ├── algos
│   │   ├── __init__.py
│   │   ├── batch_reinforce.py
│   │   ├── behavior_cloning.py
│   │   ├── dapg.py
│   │   ├── mbac.py
│   │   ├── model_accel
│   │   │   ├── __init__.py
│   │   │   ├── model_accel_npg.py
│   │   │   ├── model_learning_mpc.py
│   │   │   ├── nn_dynamics.py
│   │   │   ├── run_experiments
│   │   │   │   ├── configs
│   │   │   │   │   ├── point_mass.txt
│   │   │   │   │   └── reacher.txt
│   │   │   │   ├── run_model_accel_npg.py
│   │   │   │   ├── sandbox
│   │   │   │   │   ├── example_config_mpc.txt
│   │   │   │   │   └── run_model_learning_mpc.py
│   │   │   │   └── utils
│   │   │   │       ├── reward_functions
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   └── mjrl_point_mass.py
│   │   │   │       ├── visualize_policy.py
│   │   │   │       └── visualize_trajectories.py
│   │   │   └── sampling.py
│   │   ├── npg_cg.py
│   │   ├── ppo_clip.py
│   │   └── trpo.py
│   ├── baselines
│   │   ├── __init__.py
│   │   ├── linear_baseline.py
│   │   ├── mlp_baseline.py
│   │   ├── quadratic_baseline.py
│   │   └── zero_baseline.py
│   ├── envs
│   │   ├── __init__.py
│   │   ├── assets
│   │   │   ├── peg_insertion.xml
│   │   │   ├── point_mass.xml
│   │   │   ├── sawyer.xml
│   │   │   └── swimmer.xml
│   │   ├── mujoco_env.py
│   │   ├── peg_insertion_sawyer.py
│   │   ├── point_mass.py
│   │   ├── reacher_sawyer.py
│   │   └── swimmer.py
│   ├── policies
│   │   ├── __init__.py
│   │   ├── gaussian_linear.py
│   │   ├── gaussian_mlp.py
│   │   └── mpc_actor.py
│   ├── samplers
│   │   ├── __init__.py
│   │   └── core.py
│   └── utils
│       ├── __init__.py
│       ├── cg_solve.py
│       ├── fc_network.py
│       ├── get_environment.py
│       ├── gym_env.py
│       ├── logger.py
│       ├── make_train_plots.py
│       ├── optimize_model.py
│       ├── plot_from_logs.py
│       ├── process_samples.py
│       ├── tensor_utils.py
│       ├── train_agent.py
│       └── visualize_policy.py
├── setup.py
├── setup
│   ├── README.md
│   └── env.yml
└── tests
    ├── hydra
    │   ├── config
    │   │   └── hydra_npg_config.yaml
    │   └── hydra_policy_opt_job_script.py
    ├── point_mass_test.py
    └── visualizer_test.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | # idea
104 | *.idea/
105 |
106 | # Mac OSX files
107 | *.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RL for MuJoCo
2 |
3 | This package contains implementations of various RL algorithms for continuous control tasks simulated with [MuJoCo](http://www.mujoco.org/).
4 |
5 | # Installation
6 | The main package dependencies are `MuJoCo`, `python=3.7`, `gym>=0.13`, `mujoco-py>=2.0`, and `pytorch>=1.0`. See `setup/README.md` ([link](https://github.com/aravindr93/mjrl/tree/master/setup#installation)) for detailed install instructions.
7 |
8 | # Bibliography
9 | If you find the package useful, please cite the following papers.
10 | ```
11 | @INPROCEEDINGS{Rajeswaran-NIPS-17,
12 | AUTHOR = {Aravind Rajeswaran and Kendall Lowrey and Emanuel Todorov and Sham Kakade},
13 | TITLE = "{Towards Generalization and Simplicity in Continuous Control}",
14 | BOOKTITLE = {NIPS},
15 | YEAR = {2017},
16 | }
17 |
18 | @INPROCEEDINGS{Rajeswaran-RSS-18,
19 | AUTHOR = {Aravind Rajeswaran AND Vikash Kumar AND Abhishek Gupta AND
20 | Giulia Vezzani AND John Schulman AND Emanuel Todorov AND Sergey Levine},
21 | TITLE = "{Learning Complex Dexterous Manipulation with Deep Reinforcement Learning and Demonstrations}",
22 | BOOKTITLE = {Proceedings of Robotics: Science and Systems (RSS)},
23 | YEAR = {2018},
24 | }
25 | ```
26 |
27 | # Credits
28 | This package is maintained by [Aravind Rajeswaran](http://homes.cs.washington.edu/~aravraj/) and other members of the [Movement Control Lab](http://homes.cs.washington.edu/~todorov/), University of Washington, Seattle.
29 |
--------------------------------------------------------------------------------
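For a quick start, the sketch below condenses the policy-optimization examples that appear later under `examples/` (e.g. `linear_nn_comparison.py`); it assumes the package and its MuJoCo dependencies are already installed, and the hyperparameters shown are illustrative.

```
# Minimal NPG training sketch (mirrors examples/linear_nn_comparison.py)
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import mjrl.envs

e = GymEnv('mjrl_swimmer-v0')
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=123)
baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=123, save_logs=True)
train_agent(job_name='swimmer_quickstart', agent=agent, seed=123, niter=10,
            gamma=0.995, gae_lambda=0.97, num_cpu=1, sample_mode='trajectories',
            num_traj=10, save_freq=5, evaluation_rollouts=None)
```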
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | Here we provide a job script to illustrate policy optimization with incremental learning methods like NPG and PPO. To run the experiments, use the commands below. The experiments are run through the provided job script, which takes two arguments:
4 | - `output`: path to directory where all the results will be saved
5 | - `config`: a config `.txt` file with all the experiment parameters (examples are provided)
6 | The script must be run from this directory, i.e. `mjrl/examples`.
7 |
8 | 1. To train an NPG agent on a task shipped with `mjrl` (e.g. swimmer)
9 | ```
10 | $ python policy_opt_job_script.py --output swimmer_npg_exp --config example_configs/swimmer_npg.txt
11 | ```
12 |
13 | 2. To train an NPG agent on an OpenAI gym benchmark task (e.g. hopper)
14 | ```
15 | $ python policy_opt_job_script.py --output hopper_npg_exp --config example_configs/hopper_npg.txt
16 | ```
17 | Note that since the Hopper env has termination conditions, we set the sampling mode in the config to `samples` rather than `trajectories`, so that each update uses 10K samples.
18 |
19 | 3. To train a PPO agent on the swimmer task
20 | ```
21 | $ python policy_opt_job_script.py --output swimmer_ppo_exp --config example_configs/swimmer_ppo.txt
22 | ```
--------------------------------------------------------------------------------
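The config `.txt` files referenced above are plain Python dict literals; `policy_opt_job_script.py` loads them with `eval`, as in this small sketch (the file path is illustrative):

```
# Sketch of how the job script parses a config file
with open('example_configs/swimmer_npg.txt', 'r') as f:
    job_data = eval(f.read())   # the .txt file is a Python dict literal
print(job_data['algorithm'], job_data['sample_mode'], job_data['rl_step_size'])
```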
/examples/behavior_clone.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
4 | from mjrl.baselines.mlp_baseline import MLPBaseline
5 | from mjrl.algos.npg_cg import NPG
6 | from mjrl.algos.behavior_cloning import BC
7 | from mjrl.utils.train_agent import train_agent
8 | from mjrl.samplers.core import sample_paths
9 | import mjrl.envs
10 | import time as timer
11 | import pickle
12 | SEED = 500
13 |
14 | # ------------------------------
15 | # Train expert policy first
16 | e = GymEnv('mjrl_swimmer-v0')
17 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
18 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=5, learn_rate=1e-3)
19 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True)
20 |
21 | ts = timer.time()
22 | print("========================================")
23 | print("Training expert policy")
24 | print("========================================")
25 | train_agent(job_name='swimmer_exp1',
26 | agent=agent,
27 | seed=SEED,
28 | niter=50,
29 | gamma=0.995,
30 | gae_lambda=0.97,
31 | num_cpu=1,
32 | sample_mode='trajectories',
33 | num_traj=10,
34 | save_freq=5,
35 | evaluation_rollouts=None)
36 | print("========================================")
37 | print("Expert policy training complete !!!")
38 | print("========================================")
39 | print("time taken = %f" % (timer.time()-ts))
40 | print("========================================")
41 |
42 | # ------------------------------
43 | # Get demonstrations
44 | print("========================================")
45 | print("Collecting expert demonstrations")
46 | print("========================================")
47 | expert_pol = pickle.load(open('swimmer_exp1/iterations/best_policy.pickle', 'rb'))
48 | demo_paths = sample_paths(num_traj=5, policy=expert_pol, env=e.env_id)
49 |
50 | # ------------------------------
51 | # Train BC
52 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
53 | bc_agent = BC(demo_paths, policy=policy, epochs=20, batch_size=64, lr=1e-3) # will use Adam by default
54 | ts = timer.time()
55 | print("========================================")
56 | print("Running BC with expert demonstrations")
57 | print("========================================")
58 | bc_agent.train()
59 | print("========================================")
60 | print("BC training complete !!!")
61 | print("time taken = %f" % (timer.time()-ts))
62 | print("========================================")
63 |
64 | # ------------------------------
65 | # Evaluate Policies
66 | bc_pol_score = e.evaluate_policy(policy, num_episodes=5, mean_action=True)
67 | expert_score = e.evaluate_policy(expert_pol, num_episodes=5, mean_action=True)
68 | print("Expert policy performance (eval mode) = %f" % expert_score[0][0])
69 | print("BC policy performance (eval mode) = %f" % bc_pol_score[0][0])
70 |
--------------------------------------------------------------------------------
/examples/example_configs/hopper_npg.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env' : 'Hopper-v3',
6 | 'algorithm' : 'NPG',
7 | 'seed' : 123,
8 | 'sample_mode' : 'samples',
9 | 'rl_num_samples' : 10000,
10 | 'rl_num_iter' : 100,
11 | 'num_cpu' : 1,
12 | 'save_freq' : 25,
13 | 'eval_rollouts' : None,
14 | 'exp_notes' : 'Example config for training policy with NPG on the OpenAI gym Hopper-v3 task.',
15 |
16 | # RL parameters (all params related to PG, value function etc.)
17 |
18 | 'policy_size' : (32, 32),
19 | 'init_log_std' : -0.5,
20 | 'vf_hidden_size' : (128, 128),
21 | 'vf_batch_size' : 64,
22 | 'vf_epochs' : 2,
23 | 'vf_learn_rate' : 1e-3,
24 | 'rl_step_size' : 0.05,
25 | 'rl_gamma' : 0.995,
26 | 'rl_gae' : 0.97,
27 |
28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used)
29 |
30 | 'alg_hyper_params' : dict(),
31 |
32 | }
33 |
34 |
--------------------------------------------------------------------------------
/examples/example_configs/swimmer_npg.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env' : 'mjrl_swimmer-v0',
6 | 'algorithm' : 'NPG',
7 | 'seed' : 123,
8 | 'sample_mode' : 'trajectories',
9 | 'rl_num_traj' : 10,
10 | 'rl_num_iter' : 50,
11 | 'num_cpu' : 2,
12 | 'save_freq' : 25,
13 | 'eval_rollouts' : None,
14 | 'exp_notes' : 'Example config for training policy with NPG on the mjrl swimmer task.',
15 |
16 | # RL parameters (all params related to PG, value function, DAPG etc.)
17 |
18 | 'policy_size' : (32, 32),
19 | 'init_log_std' : -0.5,
20 | 'vf_hidden_size' : (128, 128),
21 | 'vf_batch_size' : 64,
22 | 'vf_epochs' : 2,
23 | 'vf_learn_rate' : 1e-3,
24 | 'rl_step_size' : 0.1,
25 | 'rl_gamma' : 0.995,
26 | 'rl_gae' : 0.97,
27 |
28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used)
29 |
30 | 'alg_hyper_params' : dict(),
31 |
32 | }
--------------------------------------------------------------------------------
/examples/example_configs/swimmer_ppo.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env' : 'mjrl_swimmer-v0',
6 | 'algorithm' : 'PPO',
7 | 'seed' : 123,
8 | 'sample_mode' : 'trajectories',
9 | 'rl_num_traj' : 10,
10 | 'rl_num_iter' : 50,
11 | 'num_cpu' : 2,
12 | 'save_freq' : 25,
13 | 'eval_rollouts' : None,
14 | 'exp_notes' : 'Example config for training policy with PPO on the mjrl swimmer task.',
15 |
16 | # RL parameters (all params related to PG, value function, DAPG etc.)
17 |
18 | 'policy_size' : (32, 32),
19 | 'init_log_std' : -0.5,
20 | 'vf_hidden_size' : (128, 128),
21 | 'vf_batch_size' : 64,
22 | 'vf_epochs' : 2,
23 | 'vf_learn_rate' : 1e-3,
24 | 'rl_step_size' : 0.1,
25 | 'rl_gamma' : 0.995,
26 | 'rl_gae' : 0.97,
27 |
28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used)
29 |
30 | 'alg_hyper_params' : dict(clip_coef=0.2, epochs=10, mb_size=64, learn_rate=5e-4),
31 |
32 | }
--------------------------------------------------------------------------------
/examples/linear_nn_comparison.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.policies.gaussian_linear import LinearPolicy
4 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
5 | from mjrl.baselines.mlp_baseline import MLPBaseline
6 | from mjrl.algos.npg_cg import NPG
7 | from mjrl.utils.train_agent import train_agent
8 | import mjrl.envs
9 | import time as timer
10 | SEED = 500
11 |
12 | # NN policy
13 | # ==================================
14 | e = GymEnv('mjrl_swimmer-v0')
15 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
16 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
17 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True)
18 |
19 | ts = timer.time()
20 | train_agent(job_name='swimmer_nn_exp1',
21 | agent=agent,
22 | seed=SEED,
23 | niter=50,
24 | gamma=0.995,
25 | gae_lambda=0.97,
26 | num_cpu=1,
27 | sample_mode='trajectories',
28 | num_traj=10,
29 | save_freq=5,
30 | evaluation_rollouts=5)
31 | print("time taken for NN policy training = %f" % (timer.time()-ts))
32 |
33 |
34 | # Linear policy
35 | # ==================================
36 | e = GymEnv('mjrl_swimmer-v0')
37 | policy = LinearPolicy(e.spec, seed=SEED)
38 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3)
39 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True)
40 |
41 | ts = timer.time()
42 | train_agent(job_name='swimmer_linear_exp1',
43 | agent=agent,
44 | seed=SEED,
45 | niter=50,
46 | gamma=0.995,
47 | gae_lambda=0.97,
48 | num_cpu=1,
49 | sample_mode='trajectories',
50 | num_traj=10,
51 | save_freq=5,
52 | evaluation_rollouts=5)
53 | print("time taken for linear policy training = %f" % (timer.time()-ts))
54 |
--------------------------------------------------------------------------------
/examples/policy_opt_job_script.py:
--------------------------------------------------------------------------------
1 | """
2 | This is a job script for running policy gradient algorithms on gym tasks.
3 | Separate job scripts are provided to run a few other algorithms:
4 | - For DAPG see here: https://github.com/aravindr93/hand_dapg/tree/master/dapg/examples
5 | - For model-based NPG see here: https://github.com/aravindr93/mjrl/tree/master/mjrl/algos/model_accel
6 | """
7 |
8 | from mjrl.utils.gym_env import GymEnv
9 | from mjrl.policies.gaussian_mlp import MLP
10 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
11 | from mjrl.baselines.mlp_baseline import MLPBaseline
12 | from mjrl.algos.npg_cg import NPG
13 | from mjrl.algos.batch_reinforce import BatchREINFORCE
14 | from mjrl.algos.ppo_clip import PPO
15 | from mjrl.utils.train_agent import train_agent
16 | import os
17 | import json
18 | import gym
19 | import mjrl.envs
20 | import time as timer
21 | import pickle
22 | import argparse
23 |
24 | # ===============================================================================
25 | # Get command line arguments
26 | # ===============================================================================
27 |
28 | parser = argparse.ArgumentParser(description='Natural policy gradient from mjrl on mujoco environments')
29 | parser.add_argument('--output', type=str, required=True, help='location to store results')
30 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params')
31 | args = parser.parse_args()
32 | JOB_DIR = args.output
33 | if not os.path.exists(JOB_DIR):
34 | os.mkdir(JOB_DIR)
35 | with open(args.config, 'r') as f:
36 | job_data = eval(f.read())
37 | assert 'algorithm' in job_data.keys()
38 | assert any([job_data['algorithm'] == a for a in ['NPG', 'NVPG', 'VPG', 'PPO']])
39 | assert 'sample_mode' in job_data.keys()
40 | job_data['alg_hyper_params'] = dict() if 'alg_hyper_params' not in job_data.keys() else job_data['alg_hyper_params']
41 |
42 | EXP_FILE = JOB_DIR + '/job_config.json'
43 | with open(EXP_FILE, 'w') as f:
44 | json.dump(job_data, f, indent=4)
45 |
46 | if job_data['sample_mode'] == 'trajectories':
47 | assert 'rl_num_traj' in job_data.keys()
48 | job_data['rl_num_samples'] = 0 # will be ignored
49 | elif job_data['sample_mode'] == 'samples':
50 | assert 'rl_num_samples' in job_data.keys()
51 | job_data['rl_num_traj'] = 0 # will be ignored
52 | else:
53 | print("Unknown sampling mode. Choose either trajectories or samples")
54 | exit()
55 |
56 | # ===============================================================================
57 | # Train Loop
58 | # ===============================================================================
59 |
60 | e = GymEnv(job_data['env'])
61 | policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=job_data['seed'], init_log_std=job_data['init_log_std'])
62 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=job_data['vf_batch_size'], hidden_sizes=job_data['vf_hidden_size'],
63 | epochs=job_data['vf_epochs'], learn_rate=job_data['vf_learn_rate'])
64 |
65 | # Construct the algorithm
66 | if job_data['algorithm'] == 'NPG':
67 | # Other hyperparameters (like number of CG steps) can be specified in config for pass through
68 | # or default hyperparameters will be used
69 | agent = NPG(e, policy, baseline, normalized_step_size=job_data['rl_step_size'],
70 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params'])
71 |
72 | elif job_data['algorithm'] == 'VPG':
73 | agent = BatchREINFORCE(e, policy, baseline, learn_rate=job_data['rl_step_size'],
74 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params'])
75 |
76 | elif job_data['algorithm'] == 'NVPG':
77 | agent = BatchREINFORCE(e, policy, baseline, desired_kl=job_data['rl_step_size'],
78 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params'])
79 |
80 | elif job_data['algorithm'] == 'PPO':
81 | # There are many hyperparameters for PPO. They can be specified in config for pass through
82 | # or defaults in the PPO algorithm will be used
83 | agent = PPO(e, policy, baseline, save_logs=True, **job_data['alg_hyper_params'])
84 |
85 | print("========================================")
86 | print("Starting policy learning")
87 | print("========================================")
88 |
89 | ts = timer.time()
90 | train_agent(job_name=JOB_DIR,
91 | agent=agent,
92 | seed=job_data['seed'],
93 | niter=job_data['rl_num_iter'],
94 | gamma=job_data['rl_gamma'],
95 | gae_lambda=job_data['rl_gae'],
96 | num_cpu=job_data['num_cpu'],
97 | sample_mode=job_data['sample_mode'],
98 | num_traj=job_data['rl_num_traj'],
99 | num_samples=job_data['rl_num_samples'],
100 | save_freq=job_data['save_freq'],
101 | evaluation_rollouts=job_data['eval_rollouts'])
102 | print("time taken = %f" % (timer.time()-ts))
103 |
--------------------------------------------------------------------------------
/mjrl/__init__.py:
--------------------------------------------------------------------------------
1 | import mjrl.envs
--------------------------------------------------------------------------------
/mjrl/algos/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/algos/__init__.py
--------------------------------------------------------------------------------
/mjrl/algos/behavior_cloning.py:
--------------------------------------------------------------------------------
1 | """
2 | Minimize BC loss (MLE, MSE, RWR etc.) with PyTorch optimizers
3 | """
4 |
5 | import logging
6 | logging.disable(logging.CRITICAL)
7 | import numpy as np
8 | import time as timer
9 | import torch
10 | from torch.autograd import Variable
11 | from mjrl.utils.logger import DataLog
12 | from tqdm import tqdm
13 |
14 |
15 | class BC:
16 | def __init__(self, expert_paths,
17 | policy,
18 | epochs = 5,
19 | batch_size = 64,
20 | lr = 1e-3,
21 | optimizer = None,
22 | loss_type = 'MSE', # can be 'MLE' or 'MSE'
23 | save_logs = True,
24 | set_transforms = False,
25 | **kwargs,
26 | ):
27 |
28 | self.policy = policy
29 | self.expert_paths = expert_paths
30 | self.epochs = epochs
31 | self.mb_size = batch_size
32 | self.logger = DataLog()
33 | self.loss_type = loss_type
34 | self.save_logs = save_logs
35 |
36 | if set_transforms:
37 | in_shift, in_scale, out_shift, out_scale = self.compute_transformations()
38 | self.set_transformations(in_shift, in_scale, out_shift, out_scale)
39 | self.set_variance_with_data(out_scale)
40 |
41 | # construct optimizer
42 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=lr) if optimizer is None else optimizer
43 |
44 | # Loss criterion if required
45 | if loss_type == 'MSE':
46 | self.loss_criterion = torch.nn.MSELoss()
47 |
48 | # make logger
49 | if self.save_logs:
50 | self.logger = DataLog()
51 |
52 | def compute_transformations(self):
53 | # get transformations
54 | if self.expert_paths == [] or self.expert_paths is None:
55 | in_shift, in_scale, out_shift, out_scale = None, None, None, None
56 | else:
57 | observations = np.concatenate([path["observations"] for path in self.expert_paths])
58 | actions = np.concatenate([path["actions"] for path in self.expert_paths])
59 | in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0)
60 | out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0)
61 | return in_shift, in_scale, out_shift, out_scale
62 |
63 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None):
64 | # set scalings in the target policy
65 | self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale)
66 | self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale)
67 |
68 | def set_variance_with_data(self, out_scale):
69 | # set the variance of gaussian policy based on out_scale
70 | params = self.policy.get_param_values()
71 | params[-self.policy.m:] = np.log(out_scale + 1e-12)
72 | self.policy.set_param_values(params)
73 |
74 | def loss(self, data, idx=None):
75 | if self.loss_type == 'MLE':
76 | return self.mle_loss(data, idx)
77 | elif self.loss_type == 'MSE':
78 | return self.mse_loss(data, idx)
79 | else:
80 | print("Please use valid loss type")
81 | return None
82 |
83 | def mle_loss(self, data, idx):
84 | # use indices if provided (e.g. for mini-batching)
85 | # otherwise, use all the data
86 | idx = range(data['observations'].shape[0]) if idx is None else idx
87 | if type(data['observations']) == torch.Tensor:
88 | idx = torch.LongTensor(idx)
89 | obs = data['observations'][idx]
90 | act = data['expert_actions'][idx]
91 | LL, mu, log_std = self.policy.new_dist_info(obs, act)
92 | # minimize negative log likelihood
93 | return -torch.mean(LL)
94 |
95 | def mse_loss(self, data, idx=None):
96 | idx = range(data['observations'].shape[0]) if idx is None else idx
97 | if type(data['observations']) is torch.Tensor:
98 | idx = torch.LongTensor(idx)
99 | obs = data['observations'][idx]
100 | act_expert = data['expert_actions'][idx]
101 | if type(data['observations']) is not torch.Tensor:
102 | obs = Variable(torch.from_numpy(obs).float(), requires_grad=False)
103 | act_expert = Variable(torch.from_numpy(act_expert).float(), requires_grad=False)
104 | act_pi = self.policy.model(obs)
105 | return self.loss_criterion(act_pi, act_expert.detach())
106 |
107 | def fit(self, data, suppress_fit_tqdm=False, **kwargs):
108 | # data is a dict
109 | # keys should have "observations" and "expert_actions"
110 | validate_keys = all([k in data.keys() for k in ["observations", "expert_actions"]])
111 | assert validate_keys is True
112 | ts = timer.time()
113 | num_samples = data["observations"].shape[0]
114 |
115 | # log stats before
116 | if self.save_logs:
117 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0]
118 | self.logger.log_kv('loss_before', loss_val)
119 |
120 | # train loop
121 | for ep in config_tqdm(range(self.epochs), suppress_fit_tqdm):
122 | for mb in range(int(num_samples / self.mb_size)):
123 | rand_idx = np.random.choice(num_samples, size=self.mb_size)
124 | self.optimizer.zero_grad()
125 | loss = self.loss(data, idx=rand_idx)
126 | loss.backward()
127 | self.optimizer.step()
128 | params_after_opt = self.policy.get_param_values()
129 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)
130 |
131 | # log stats after
132 | if self.save_logs:
133 | self.logger.log_kv('epoch', self.epochs)
134 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0]
135 | self.logger.log_kv('loss_after', loss_val)
136 | self.logger.log_kv('time', (timer.time()-ts))
137 |
138 | def train(self, **kwargs):
139 | observations = np.concatenate([path["observations"] for path in self.expert_paths])
140 | expert_actions = np.concatenate([path["actions"] for path in self.expert_paths])
141 | data = dict(observations=observations, expert_actions=expert_actions)
142 | self.fit(data, **kwargs)
143 |
144 |
145 | def config_tqdm(range_inp, suppress_tqdm=False):
146 | if suppress_tqdm:
147 | return range_inp
148 | else:
149 | return tqdm(range_inp)
--------------------------------------------------------------------------------
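For reference, `BC.train()` expects `expert_paths` to be a list of path dicts with `observations` and `actions` arrays; the sketch below (random placeholder data and shapes) shows the format and how `train()` flattens it into the `data` dict consumed by `fit()`:

```
import numpy as np

# two fake expert trajectories: 100 steps, 10-dim observations, 3-dim actions (illustrative shapes)
expert_paths = [dict(observations=np.random.randn(100, 10),
                     actions=np.random.randn(100, 3)) for _ in range(2)]

# BC.train() concatenates the paths into a single dataset like this
data = dict(observations=np.concatenate([p["observations"] for p in expert_paths]),
            expert_actions=np.concatenate([p["actions"] for p in expert_paths]))
print(data["observations"].shape, data["expert_actions"].shape)   # (200, 10) (200, 3)
```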
/mjrl/algos/dapg.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.disable(logging.CRITICAL)
3 | import numpy as np
4 | import scipy as sp
5 | import scipy.sparse.linalg as spLA
6 | import copy
7 | import time as timer
8 | import torch
9 | import torch.nn as nn
10 | from torch.autograd import Variable
11 | import copy
12 |
13 | # samplers
14 | import mjrl.samplers.core as trajectory_sampler
15 |
16 | # utility functions
17 | import mjrl.utils.process_samples as process_samples
18 | from mjrl.utils.logger import DataLog
19 | from mjrl.utils.cg_solve import cg_solve
20 |
21 | # Import Algs
22 | from mjrl.algos.npg_cg import NPG
23 | from mjrl.algos.behavior_cloning import BC
24 |
25 | class DAPG(NPG):
26 | def __init__(self, env, policy, baseline,
27 | demo_paths=None,
28 | normalized_step_size=0.01,
29 | FIM_invert_args={'iters': 10, 'damping': 1e-4},
30 | hvp_sample_frac=1.0,
31 | seed=123,
32 | save_logs=False,
33 | kl_dist=None,
34 | lam_0=1.0, # demo coef
35 | lam_1=0.95, # decay coef
36 | **kwargs,
37 | ):
38 |
39 | self.env = env
40 | self.policy = policy
41 | self.baseline = baseline
42 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size
43 | self.seed = seed
44 | self.save_logs = save_logs
45 | self.FIM_invert_args = FIM_invert_args
46 | self.hvp_subsample = hvp_sample_frac
47 | self.running_score = None
48 | self.demo_paths = demo_paths
49 | self.lam_0 = lam_0
50 | self.lam_1 = lam_1
51 | self.iter_count = 0.0
52 | if save_logs: self.logger = DataLog()
53 |
54 | def train_from_paths(self, paths):
55 |
56 | # Concatenate from all the trajectories
57 | observations = np.concatenate([path["observations"] for path in paths])
58 | actions = np.concatenate([path["actions"] for path in paths])
59 | advantages = np.concatenate([path["advantages"] for path in paths])
60 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
61 |
62 | if self.demo_paths is not None and self.lam_0 > 0.0:
63 | demo_obs = np.concatenate([path["observations"] for path in self.demo_paths])
64 | demo_act = np.concatenate([path["actions"] for path in self.demo_paths])
65 | demo_adv = self.lam_0 * (self.lam_1 ** self.iter_count) * np.ones(demo_obs.shape[0])
66 | self.iter_count += 1
67 | # concatenate all
68 | all_obs = np.concatenate([observations, demo_obs])
69 | all_act = np.concatenate([actions, demo_act])
70 | all_adv = 1e-2*np.concatenate([advantages/(np.std(advantages) + 1e-8), demo_adv])
71 | else:
72 | all_obs = observations
73 | all_act = actions
74 | all_adv = advantages
75 |
76 | # cache return distributions for the paths
77 | path_returns = [sum(p["rewards"]) for p in paths]
78 | mean_return = np.mean(path_returns)
79 | std_return = np.std(path_returns)
80 | min_return = np.amin(path_returns)
81 | max_return = np.amax(path_returns)
82 | base_stats = [mean_return, std_return, min_return, max_return]
83 | self.running_score = mean_return if self.running_score is None else \
84 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters
85 | if self.save_logs: self.log_rollout_statistics(paths)
86 |
87 | # Keep track of times for various computations
88 | t_gLL = 0.0
89 | t_FIM = 0.0
90 |
91 | # Optimization algorithm
92 | # --------------------------
93 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
94 |
95 | # DAPG
96 | ts = timer.time()
97 | sample_coef = all_adv.shape[0]/advantages.shape[0]
98 | dapg_grad = sample_coef*self.flat_vpg(all_obs, all_act, all_adv)
99 | t_gLL += timer.time() - ts
100 |
101 | # NPG
102 | ts = timer.time()
103 | hvp = self.build_Hvp_eval([observations, actions],
104 | regu_coef=self.FIM_invert_args['damping'])
105 | npg_grad = cg_solve(hvp, dapg_grad, x_0=dapg_grad.copy(),
106 | cg_iters=self.FIM_invert_args['iters'])
107 | t_FIM += timer.time() - ts
108 |
109 | # Step size computation
110 | # --------------------------
111 | n_step_size = 2.0*self.kl_dist
112 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(dapg_grad.T, npg_grad) + 1e-20)))
113 |
114 | # Policy update
115 | # --------------------------
116 | curr_params = self.policy.get_param_values()
117 | new_params = curr_params + alpha * npg_grad
118 | self.policy.set_param_values(new_params, set_new=True, set_old=False)
119 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
120 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
121 | self.policy.set_param_values(new_params, set_new=True, set_old=True)
122 |
123 | # Log information
124 | if self.save_logs:
125 | self.logger.log_kv('alpha', alpha)
126 | self.logger.log_kv('delta', n_step_size)
127 | self.logger.log_kv('time_vpg', t_gLL)
128 | self.logger.log_kv('time_npg', t_FIM)
129 | self.logger.log_kv('kl_dist', kl_dist)
130 | self.logger.log_kv('surr_improvement', surr_after - surr_before)
131 | self.logger.log_kv('running_score', self.running_score)
132 | try:
133 | self.env.env.env.evaluate_success(paths, self.logger)
134 | except:
135 | # nested logic for backwards compatibility. TODO: clean this up.
136 | try:
137 | success_rate = self.env.env.env.evaluate_success(paths)
138 | self.logger.log_kv('success_rate', success_rate)
139 | except:
140 | pass
141 | return base_stats
142 |
--------------------------------------------------------------------------------
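To make the demonstration-weighting schedule in `train_from_paths` concrete, the short sketch below evaluates the demo advantage weight `lam_0 * lam_1 ** iter_count` over a few iterations (the default coefficients from `DAPG.__init__`; values are purely illustrative):

```
lam_0, lam_1 = 1.0, 0.95                 # demo coef and decay coef, as in DAPG.__init__
for iter_count in range(0, 50, 10):
    demo_weight = lam_0 * (lam_1 ** iter_count)
    print("iter %2d: demo advantage weight = %.3f" % (iter_count, demo_weight))
# each demo sample gets this constant advantage, concatenated with the normalized
# on-policy advantages before the DAPG policy gradient is computed
```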
/mjrl/algos/mbac.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.disable(logging.CRITICAL)
3 | import numpy as np
4 | import time as timer
5 | import torch
6 | import torch.nn as nn
7 | from torch.autograd import Variable
8 | from mjrl.utils.logger import DataLog
9 | from tqdm import tqdm
10 | from mjrl.utils.gym_env import GymEnv
11 | from mjrl.policies.mpc_actor import MPCActor
12 | from mjrl.algos.behavior_cloning import BC
13 |
14 |
15 | class MBAC(BC):
16 | def __init__(self,
17 | env_name,
18 | policy,
19 | expert_paths = None, # for the initial seeding
20 | epochs = 5,
21 | batch_size = 64,
22 | lr = 1e-3,
23 | optimizer = None,
24 | loss_type = 'MSE', # can be 'MLE' or 'MSE'
25 | seed = 123,
26 | buffer_size = 50, # measured in number of trajectories
27 | mpc_params = None,
28 | save_logs = True,
29 | ):
30 |
31 | super().__init__(expert_paths=expert_paths,
32 | policy=policy,
33 | epochs=epochs,
34 | batch_size=batch_size,
35 | lr=lr,
36 | optimizer=optimizer,
37 | loss_type=loss_type,
38 | save_logs=save_logs,
39 | )
40 | self.expert_paths = [] if self.expert_paths is None else self.expert_paths
41 | self.buffer_size = buffer_size
42 |
43 | # For the MPC policy
44 | self.env = GymEnv(env_name)
45 | self.env.reset(seed=seed)
46 | if mpc_params is None:
47 | mean = np.zeros(self.env.action_dim)
48 | sigma = 1.0 * np.ones(self.env.action_dim)
49 | filter_coefs = [sigma, 0.05, 0.0, 0.0]
50 | mpc_params = dict(env=GymEnv(env_name), H=10,
51 | paths_per_cpu=25, num_cpu=1,
52 | kappa=10.0, gamma=1.0,
53 | mean=mean, filter_coefs=filter_coefs,
54 | seed=seed)
55 | else:
56 | mpc_params['env'] = GymEnv(env_name)
57 | mpc_params['seed'] = seed
58 |
59 | self.mpc_params = mpc_params
60 | self.mpc_policy = MPCActor(**mpc_params)
61 |
62 | def collect_paths(self, num_traj=10,
63 | mode='policy',
64 | horizon=None,
65 | render=False
66 | ):
67 | horizon = self.env.horizon if horizon is None else horizon
68 | paths = []
69 | for i in tqdm(range(num_traj)):
70 | self.env.reset()
71 | obs, act_pi, act_mpc, rew, states = [], [], [], [], []
72 | for t in range(horizon):
73 | o = self.env.get_obs()
74 | s = self.env.get_env_state()
75 | a_pi = self.policy.get_action(o)[0]
76 | a_mpc = self.mpc_policy.get_action(s)
77 | a = a_pi if mode == 'policy' else a_mpc
78 | next_o, r, done, _ = self.env.step(a)
79 | if render:
80 | self.env.render()
81 | # store data
82 | obs.append(o)
83 | rew.append(r)
84 | states.append(s)
85 | act_pi.append(a_pi)
86 | act_mpc.append(a_mpc)
87 | # kill if done
88 | if done:
89 | break
90 | path = dict(observations=np.array(obs),
91 | actions=np.array(act_pi),
92 | expert_actions=np.array(act_mpc),
93 | rewards=np.array(rew),
94 | states=states,
95 | )
96 | paths.append(path)
97 | return paths
98 |
99 | def add_paths_to_buffer(self, paths):
100 | for path in paths:
101 | self.expert_paths.append(path)
102 | if len(self.expert_paths) > self.buffer_size:
103 | # keep recent trajectories
104 | # TODO: Also consider keeping best performing trajectories
105 | self.expert_paths = self.expert_paths[-self.buffer_size:]
106 | if self.save_logs:
107 | self.logger.log_kv('buffer_size', len(self.expert_paths))
108 |
109 | def get_data_from_buffer(self):
110 | observations = np.concatenate([path["observations"] for path in self.expert_paths])
111 | expert_actions = np.concatenate([path["expert_actions"] for path in self.expert_paths])
112 | observations = torch.Tensor(observations).float()
113 | expert_actions = torch.Tensor(expert_actions).float()
114 | data = dict(observations=observations, expert_actions=expert_actions)
115 | return data
116 |
117 | def train_step(self, num_traj=10, **kwargs):
118 | # collect data using policy actions
119 | # fit policy to expert actions on these states
120 | new_paths = self.collect_paths(num_traj, mode='policy')
121 | self.add_paths_to_buffer(new_paths)
122 | data = self.get_data_from_buffer()
123 | self.fit(data, **kwargs)
124 | stoc_pol_perf = np.mean([np.sum(path['rewards']) for path in new_paths])
125 | return stoc_pol_perf
--------------------------------------------------------------------------------
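A hypothetical outer loop for `MBAC` might look like the sketch below; the environment name and hyperparameters are placeholders, and the chosen env must expose the `get_obs`/`get_env_state` methods used by `collect_paths`:

```
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.algos.mbac import MBAC
import mjrl.envs

env_name = 'mjrl_point_mass-v0'            # illustrative choice
e = GymEnv(env_name)
policy = MLP(e.spec, hidden_sizes=(32, 32), seed=123)
agent = MBAC(env_name, policy, epochs=2, batch_size=64, seed=123)

for it in range(5):
    score = agent.train_step(num_traj=5)   # collect with policy, fit to MPC expert actions
    print("iteration %d: mean return of policy rollouts = %.2f" % (it, score))
```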
/mjrl/algos/model_accel/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/algos/model_accel/__init__.py
--------------------------------------------------------------------------------
/mjrl/algos/model_accel/model_accel_npg.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 | import torch
4 | import torch.nn as nn
5 | import pickle
6 | import mjrl.envs
7 | import os
8 | import time as timer
9 | from torch.autograd import Variable
10 | from mjrl.utils.gym_env import GymEnv
11 | from mjrl.algos.model_accel.nn_dynamics import WorldModel
12 | import mjrl.samplers.core as trajectory_sampler
13 |
14 | # utility functions
15 | import mjrl.utils.process_samples as process_samples
16 | from mjrl.utils.logger import DataLog
17 | from mjrl.algos.model_accel.sampling import policy_rollout
18 |
19 | # Import NPG
20 | from mjrl.algos.npg_cg import NPG
21 |
22 |
23 | class ModelAccelNPG(NPG):
24 | def __init__(self, learned_model=None,
25 | refine=False,
26 | kappa=5.0,
27 | plan_horizon=10,
28 | plan_paths=100,
29 | reward_function=None,
30 | termination_function=None,
31 | **kwargs):
32 | super(ModelAccelNPG, self).__init__(**kwargs)
33 | if learned_model is None:
34 | print("Algorithm requires a (list of) learned dynamics model")
35 | quit()
36 | elif isinstance(learned_model, WorldModel):
37 | self.learned_model = [learned_model]
38 | else:
39 | self.learned_model = learned_model
40 | self.refine, self.kappa, self.plan_horizon, self.plan_paths = refine, kappa, plan_horizon, plan_paths
41 | self.reward_function, self.termination_function = reward_function, termination_function
42 |
43 | def to(self, device):
44 | # Convert all the networks (except policy network which is clamped to CPU)
45 | # to the specified device
46 | for model in self.learned_model:
47 | model.to(device)
48 | try: self.baseline.model.to(device)
49 | except: pass
50 |
51 | def is_cuda(self):
52 | # Check if any of the networks are on GPU
53 | model_cuda = [model.is_cuda() for model in self.learned_model]
54 | model_cuda = any(model_cuda)
55 | baseline_cuda = next(self.baseline.model.parameters()).is_cuda
56 | return any([model_cuda, baseline_cuda])
57 |
58 | def train_step(self, N,
59 | env=None,
60 | sample_mode='trajectories',
61 | horizon=1e6,
62 | gamma=0.995,
63 | gae_lambda=0.97,
64 | num_cpu='max',
65 | env_kwargs=None,
66 | init_states=None,
67 | reward_function=None,
68 | termination_function=None,
69 | truncate_lim=None,
70 | truncate_reward=0.0,
71 | **kwargs,
72 | ):
73 |
74 | ts = timer.time()
75 |
76 | # get the correct env behavior
77 | if env is None:
78 | env = self.env
79 | elif type(env) == str:
80 | env = GymEnv(env)
81 | elif isinstance(env, GymEnv):
82 | env = env
83 | elif callable(env):
84 | env = env(**env_kwargs)
85 | else:
86 | print("Unsupported environment format")
87 | raise AttributeError
88 |
89 | # get correct behavior for reward and termination
90 | reward_function = self.reward_function if reward_function is None else reward_function
91 | termination_function = self.termination_function if termination_function is None else termination_function
92 | if reward_function: assert callable(reward_function)
93 | if termination_function: assert callable(termination_function)
94 |
95 | # simulate trajectories with the learned model(s)
96 | # we want to use the same task instances (e.g. goal locations) for each model in the ensemble
97 | paths = []
98 |
99 | # NOTE: We can optionally specify a set of initial states to perform the rollouts from
100 | # This is useful for starting rollouts from the states in the replay buffer
101 | init_states = [env.reset() for _ in range(N)] if init_states is None else init_states
102 | assert type(init_states) == list
103 | assert len(init_states) == N
104 |
105 | for model in self.learned_model:
106 | # don't set seed explicitly -- this will make rollouts follow the global seed
107 | rollouts = policy_rollout(num_traj=N, env=env, policy=self.policy,
108 | learned_model=model, eval_mode=False, horizon=horizon,
109 | init_state=init_states, seed=None)
110 | # use learned reward function if available
111 | if model.learn_reward:
112 | model.compute_path_rewards(rollouts)
113 | else:
114 | rollouts = reward_function(rollouts)
115 | num_traj, horizon, state_dim = rollouts['observations'].shape
116 | for i in range(num_traj):
117 | path = dict()
118 | obs = rollouts['observations'][i, :, :]
119 | act = rollouts['actions'][i, :, :]
120 | rew = rollouts['rewards'][i, :]
121 | path['observations'] = obs
122 | path['actions'] = act
123 | path['rewards'] = rew
124 | path['terminated'] = False
125 | paths.append(path)
126 |
127 | # NOTE: If tasks have termination condition, we will assume that the env has
128 | # a function that can terminate paths appropriately.
129 | # Otherwise, termination is not considered.
130 |
131 | if callable(termination_function): paths = termination_function(paths)
132 |
133 | # remove paths that are too short
134 | paths = [path for path in paths if path['observations'].shape[0] >= 5]
135 |
136 | # additional truncation based on error in the ensembles
137 | if truncate_lim is not None and len(self.learned_model) > 1:
138 | for path in paths:
139 | pred_err = np.zeros(path['observations'].shape[0] - 1)
140 | for model in self.learned_model:
141 | s = path['observations'][:-1]
142 | a = path['actions'][:-1]
143 | s_next = path['observations'][1:]
144 | pred = model.predict(s, a)
145 | model_err = np.mean((s_next - pred)**2, axis=-1)
146 | pred_err = np.maximum(pred_err, model_err)
147 | violations = np.where(pred_err > truncate_lim)[0]
148 | truncated = (not len(violations) == 0)
149 | T = violations[0] + 1 if truncated else obs.shape[0]
150 | T = max(4, T) # we don't want corner cases of very short truncation
151 | path["observations"] = path["observations"][:T]
152 | path["actions"] = path["actions"][:T]
153 | path["rewards"] = path["rewards"][:T]
154 | if truncated: path["rewards"][-1] += truncate_reward
155 | path["terminated"] = False if T == obs.shape[0] else True
156 |
157 | if self.save_logs:
158 | self.logger.log_kv('time_sampling', timer.time() - ts)
159 |
160 | self.seed = self.seed + N if self.seed is not None else self.seed
161 |
162 | # compute returns
163 | process_samples.compute_returns(paths, gamma)
164 | # compute advantages
165 | process_samples.compute_advantages(paths, self.baseline, gamma, gae_lambda)
166 | # train from paths
167 | eval_statistics = self.train_from_paths(paths)
168 | eval_statistics.append(N)
169 | # log number of samples
170 | if self.save_logs:
171 | num_samples = np.sum([p["rewards"].shape[0] for p in paths])
172 | self.logger.log_kv('num_samples', num_samples)
173 | # fit baseline
174 | if self.save_logs:
175 | ts = timer.time()
176 | error_before, error_after = self.baseline.fit(paths, return_errors=True)
177 | self.logger.log_kv('time_VF', timer.time()-ts)
178 | self.logger.log_kv('VF_error_before', error_before)
179 | self.logger.log_kv('VF_error_after', error_after)
180 | else:
181 | self.baseline.fit(paths)
182 |
183 | return eval_statistics
184 |
185 | def get_action(self, observation):
186 | if self.refine is False:
187 | return self.policy.get_action(observation)
188 | else:
189 | return self.get_refined_action(observation)
190 |
191 | def get_refined_action(self, observation):
192 | # TODO(Aravind): Implement this
193 | # This function should rollout many trajectories according to the learned
194 | # dynamics model and the policy, and should refine around the policy by
195 | # incorporating reward based refinement
196 | raise NotImplementedError
197 |
--------------------------------------------------------------------------------
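The ensemble-disagreement truncation in `train_step` can be seen in isolation with random stand-in predictions; the array sizes and `truncate_lim` below are illustrative:

```
import numpy as np

np.random.seed(0)
T, state_dim, truncate_lim = 20, 5, 0.05                  # illustrative sizes and threshold
s_next = np.random.randn(T, state_dim)                    # "true" next states along a path
preds = [s_next + 0.1 * (k + 1) * np.random.randn(T, state_dim) for k in range(3)]  # 3 models

# worst-case (max over models) per-step one-step prediction error, as in train_step
pred_err = np.zeros(T)
for pred in preds:
    model_err = np.mean((s_next - pred) ** 2, axis=-1)
    pred_err = np.maximum(pred_err, model_err)

violations = np.where(pred_err > truncate_lim)[0]
T_trunc = violations[0] + 1 if len(violations) > 0 else T
T_trunc = max(4, T_trunc)                                 # avoid very short truncations
print("truncate the simulated path at step", T_trunc)
```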
/mjrl/algos/model_accel/model_learning_mpc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mjrl.algos.model_accel.sampling import generate_paths, generate_perturbed_actions, trajectory_rollout
3 |
4 |
5 | class MPCPolicy(object):
6 | def __init__(self, env,
7 | plan_horizon,
8 | plan_paths=10,
9 | kappa=1.0,
10 | gamma=1.0,
11 | mean=None,
12 | filter_coefs=None,
13 | seed=123,
14 | warmstart=True,
15 | fitted_model=None,
16 | omega=5.0,
17 | **kwargs,
18 | ):
19 |
20 | # initialize
21 | self.env, self.seed = env, seed
22 | self.n, self.m = env.observation_dim, env.action_dim
23 | self.plan_horizon, self.num_traj = plan_horizon, plan_paths
24 |
25 | if fitted_model is None:
26 | print("Policy requires a fitted dynamics model")
27 | quit()
28 | else:
29 | self.fitted_model = fitted_model
30 |
31 | # initialize other params
32 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma
33 | if mean is None:
34 | self.mean = np.zeros(self.m)
35 | if filter_coefs is None:
36 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0]
37 | self.act_sequence = np.ones((self.plan_horizon, self.m)) * self.mean
38 | self.init_act_sequence = self.act_sequence.copy()
39 | self.warmstart = warmstart
40 | self.omega = omega
41 |
42 | def get_action(self, obs):
43 | # generate paths
44 | if type(self.fitted_model) == list:
45 |
46 | # Ensemble case
47 | # Collect trajectories from different models with same action sequences
48 | base_act = self.act_sequence
49 | act_list = [generate_perturbed_actions(base_act, self.filter_coefs)
50 | for _ in range(self.num_traj)]
51 | actions = np.array(act_list)
52 | paths_list = []
53 | for model in self.fitted_model:
54 | paths = trajectory_rollout(actions, model, obs)
55 | self.env.env.env.compute_path_rewards(paths)
56 | paths_list.append(paths)
57 | # consolidate paths
58 | paths = dict()
59 | for k in paths_list[0].keys():
60 | v = np.vstack([p[k] for p in paths_list])
61 | paths[k] = v
62 | R = self.score_trajectory_ensemble(paths, paths_list)
63 |
64 | else:
65 | paths = generate_paths(num_traj=self.num_traj, fitted_model=self.fitted_model,
66 | start_state=obs, base_act=self.act_sequence, filter_coefs=self.filter_coefs)
67 | self.env.env.env.compute_path_rewards(paths) # will populate path['rewards']
68 | R = self.score_trajectory(paths)
69 |
70 | S = np.exp(self.kappa * (R - np.max(R)))
71 | act = paths["actions"]
72 |
73 | weighted_seq = S * act.T
74 | act_sequence = np.sum(weighted_seq.T, axis=0) / (np.sum(S) + 1e-6)
75 | action = act_sequence[0].copy()
76 |
77 | # get updated action sequence
78 | if self.warmstart:
79 | self.act_sequence[:-1] = act_sequence[1:]
80 | self.act_sequence[-1] = self.mean.copy()
81 | else:
82 | self.act_sequence = self.init_act_sequence.copy()
83 | return action
84 |
85 | def score_trajectory_ensemble(self, paths, paths_list):
86 | num_traj = self.num_traj
87 | num_models = len(paths_list)
88 | total_traj = paths['rewards'].shape[0]
89 | horizon = paths['rewards'].shape[1]
90 | predictions = [p['observations'] for p in paths_list]
91 | disagreement = np.std(predictions, axis=0) # (num_traj, horizon, state_dim)
92 | disagreement = np.sum(disagreement, axis=(1,2)) # (num_traj,)
93 | scores = np.zeros(total_traj)
94 | for i in range(total_traj):
95 | disagreement_score = disagreement[i // self.num_traj]
96 | scores[i] = self.omega * disagreement_score
97 | for t in range(horizon):
98 | scores[i] += (self.gamma ** t) * paths["rewards"][i][t]
99 | return scores
100 |
101 | def score_trajectory(self, paths):
102 | # rewards shape: (num_traj, horizon)
103 | num_traj = paths["rewards"].shape[0]
104 | horizon = paths["rewards"].shape[1]
105 | scores = np.zeros(num_traj)
106 | for i in range(num_traj):
107 | scores[i] = 0.0
108 | for t in range(horizon):
109 | scores[i] += (self.gamma**t)*paths["rewards"][i][t]
110 | return scores
111 |
--------------------------------------------------------------------------------
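The exponentially weighted action averaging in `MPCPolicy.get_action` (an MPPI-style update) is easy to see with toy inputs; the returns and action sequences below are random placeholders:

```
import numpy as np

np.random.seed(1)
num_traj, horizon, act_dim, kappa = 8, 10, 2, 1.0    # illustrative sizes
R = np.random.randn(num_traj)                        # per-trajectory scores
act = np.random.randn(num_traj, horizon, act_dim)    # sampled action sequences

# softmax-style weights over trajectories, exactly as in get_action
S = np.exp(kappa * (R - np.max(R)))
weighted_seq = S * act.T                             # broadcasts over the trajectory axis
act_sequence = np.sum(weighted_seq.T, axis=0) / (np.sum(S) + 1e-6)
print(act_sequence.shape)                            # (horizon, act_dim); act_sequence[0] is executed
```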
/mjrl/algos/model_accel/run_experiments/configs/point_mass.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env_name' : 'mjrl_point_mass-v0',
6 | 'seed' : 123,
7 | 'debug_mode' : False,
8 | 'num_iter' : 5,
9 | 'iter_samples' : 100,
10 | 'eval_rollouts' : 25,
11 | 'num_models' : 3,
12 | 'exp_notes' : 'Toy experiment for initial trial.',
13 | 'save_freq' : 1,
14 | 'device' : 'cpu',
15 | 'learn_reward' : False,
16 | 'reward_file' : 'utils/reward_functions/mjrl_point_mass.py',
17 |
18 | # dynamics learning
19 |
20 | 'hidden_size' : (256, 256),
21 | 'activation' : 'relu',
22 | 'fit_lr' : 1e-3,
23 | 'fit_wd' : 1e-5,
24 | 'buffer_size' : 10000,
25 | 'fit_mb_size' : 16,
26 | 'fit_epochs' : 25,
27 | 'refresh_fit' : False,
28 |
29 | # initial data
30 |
31 | 'init_log_std' : -0.5,
32 | 'min_log_std' : -2.0,
33 | 'init_samples' : 1000,
34 |
35 | # NPG params
36 |
37 | 'policy_size' : (32, 32),
38 | 'inner_steps' : 10,
39 | 'step_size' : 0.05,
40 | 'update_paths' : 250,
41 | 'start_state' : 'init',
42 | 'horizon' : 25,
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/mjrl/algos/model_accel/run_experiments/configs/reacher.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env_name' : 'mjrl_reacher_7dof-v0',
6 | 'seed' : 123,
7 | 'debug_mode' : False,
8 | 'num_iter' : 25,
9 | 'iter_samples' : 500,
10 | 'eval_rollouts' : 10,
11 | 'num_models' : 4,
12 | 'save_freq' : 1,
13 | 'device' : 'cpu',
14 |
15 | # dynamics learning
16 |
17 | 'hidden_size' : (256, 256),
18 | 'activation' : 'relu',
19 | 'fit_lr' : 1e-3,
20 | 'fit_wd' : 0.0,
21 | 'buffer_size' : 20000,
22 | 'fit_mb_size' : 64,
23 | 'fit_epochs' : 20,
24 | 'refresh_fit' : False,
25 |
26 | # initial data
27 |
28 | 'init_log_std' : -0.5,
29 | 'min_log_std' : -2.5,
30 | 'init_samples' : 2500,
31 | 'init_policy' : None,
32 |
33 |
34 | # NPG params
35 |
36 | 'policy_size' : (64, 64),
37 | 'inner_steps' : 5,
38 | 'step_size' : 0.05,
39 | 'update_paths' : 250,
40 | 'start_state' : 'init',
41 | 'horizon' : 50,
42 |
43 | }
--------------------------------------------------------------------------------
/mjrl/algos/model_accel/run_experiments/sandbox/example_config_mpc.txt:
--------------------------------------------------------------------------------
1 | {
2 |
3 | # general inputs
4 |
5 | 'env_name' : 'mjrl_point_mass-v0',
6 | 'seed' : 123,
7 | 'debug_mode' : False,
8 | 'num_iter' : 5,
9 | 'paths_per_iter': 5,
10 | 'eval_rollouts' : 10,
11 | 'num_models' : 3,
12 | 'exp_notes' : 'Toy experiment for initial trial.',
13 | 'save_freq' : 5,
14 | 'device' : 'cpu',
15 |
16 | # dynamics learning
17 |
18 | 'hidden_size' : (64, 64),
19 | 'activation' : 'relu',
20 | 'fit_lr' : 1e-3,
21 | 'fit_wd' : 1e-5,
22 | 'max_paths' : 1000,
23 | 'fit_mb_size' : 16,
24 | 'fit_epochs' : 25,
25 | 'refresh_fit' : True,
26 |
27 | # initial data
28 |
29 | 'init_log_std' : -0.5,
30 | 'n_init_paths' : 25,
31 | 'use_demos' : False,
32 | 'demo_file' : None,
33 |
34 | # model predictive control
35 |
36 | 'noisy_mpc' : True, # when collecting data for exploration
37 | 'noise_level' : 0.1,
38 | 'filter_coefs' : {'f1': 0.5, 'f2': 1.0, 'f3': 0.0, 'f4': 0.0},
39 | 'plan_paths' : 200,
40 | 'plan_horizon' : 10,
41 | 'kappa' : 2.0,
42 | 'omega' : 0.0,
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/mjrl/algos/model_accel/run_experiments/sandbox/run_model_learning_mpc.py:
--------------------------------------------------------------------------------
1 | """
2 | Job script to optimize trajectories with fitted model
3 | """
4 |
5 | import numpy as np
6 | import copy
7 | import torch
8 | import torch.nn as nn
9 | import pickle
10 | import mjrl.envs
11 | import time as timer
12 | import argparse
13 | import os
14 | import json
15 | import mjrl.samplers.core as trajectory_sampler
16 | import mjrl.utils.tensor_utils as tensor_utils
17 | from tqdm import tqdm
18 | from tabulate import tabulate
19 | from mjrl.policies.gaussian_mlp import MLP
20 | from mjrl.baselines.mlp_baseline import MLPBaseline
21 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
22 | from mjrl.utils.gym_env import GymEnv
23 | from mjrl.utils.logger import DataLog
24 | from mjrl.utils.make_train_plots import make_train_plots
25 | from mjrl.algos.model_accel.nn_dynamics import DynamicsModel
26 | from mjrl.algos.model_accel.model_learning_mpc import MPCPolicy
27 | from mjrl.algos.model_accel.sampling import sample_paths, evaluate_policy
28 |
29 |
30 | # ===============================================================================
31 | # Get command line arguments
32 | # ===============================================================================
33 |
34 | parser = argparse.ArgumentParser(description='Trajectory Optimization with fitted models.')
35 | parser.add_argument('--output', type=str, required=True, help='location to store results')
36 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params')
37 | args = parser.parse_args()
38 | OUT_DIR = args.output
39 | if not os.path.exists(OUT_DIR):
40 | os.mkdir(OUT_DIR)
41 | with open(args.config, 'r') as f:
42 | job_data = eval(f.read())
43 |
44 | # Unpack args and make files for easy access
45 | logger = DataLog()
46 | ENV_NAME = job_data['env_name']
47 | PICKLE_FILE = OUT_DIR + '/exp_results.pickle'
48 | EXP_FILE = OUT_DIR + '/job_data.json'
49 | SEED = job_data['seed']
50 | job_data['filter_coefs'] = [job_data['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']]
51 |
52 | # base cases
53 | if 'num_models' not in job_data.keys():
54 | job_data['num_models'] = 1
55 | if job_data['num_models'] == 1 or 'omega' not in job_data.keys():
56 | job_data['omega'] = 0.0
57 | if 'eval_rollouts' not in job_data.keys():
58 | job_data['eval_rollouts'] = 0
59 | if 'save_freq' not in job_data.keys():
60 | job_data['save_freq'] = 10
61 | if 'device' not in job_data.keys():
62 | job_data['device'] = 'cpu'
63 | if 'debug_mode' in job_data.keys():
64 | DEBUG = job_data['debug_mode']
65 | else:
66 | DEBUG = False
67 | if 'device_path' not in job_data.keys():
68 | job_data['device_path'] = None
69 | with open(EXP_FILE, 'w') as f:
70 | json.dump(job_data, f, indent=4)
71 |
72 | del(job_data['seed'])
73 | job_data['base_seed'] = SEED
74 |
75 | # ===============================================================================
76 | # Train loop
77 | # ===============================================================================
78 |
79 | np.random.seed(SEED)
80 | torch.random.manual_seed(SEED)
81 |
82 | # TODO(Aravind): Map to hardware if device_path is specified
83 |
84 | e = GymEnv(ENV_NAME)
85 | e.set_seed(SEED)
86 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+i, **job_data)
87 | for i in range(job_data['num_models'])]
88 | exploratory_policy = MLP(e.spec, seed=SEED, init_log_std=job_data['init_log_std'])
89 | paths = []
90 |
91 | for outer_iter in range(job_data['num_iter']):
92 |
93 | ts = timer.time()
94 | print("================> ITERATION : %i " % outer_iter)
95 | print("Getting interaction data from real dynamics ...")
96 |
97 | if outer_iter == 0:
98 | iter_paths = trajectory_sampler.sample_paths(job_data['n_init_paths'], e,
99 | exploratory_policy,
100 | eval_mode=False, base_seed=SEED)
101 | else:
102 | iter_paths = sample_paths(job_data['paths_per_iter'],
103 | mpc_policy.env, mpc_policy,
104 | eval_mode=(not job_data['noisy_mpc']),
105 | noise_level=job_data['noise_level'],
106 | base_seed=SEED + outer_iter)
107 |
108 | # reset the environment (good for hardware)
109 | e.reset()
110 |
111 | for p in iter_paths:
112 | paths.append(p)
113 |
114 | if len(paths) > job_data['max_paths']:
115 | diff = len(paths) - job_data['max_paths']
116 | paths[:diff] = []
117 |
118 | s = np.concatenate([p['observations'][:-1] for p in paths])
119 | a = np.concatenate([p['actions'][:-1] for p in paths])
120 | sp = np.concatenate([p['observations'][1:] for p in paths])
121 | r = np.array([np.sum(p['rewards']) for p in iter_paths])
122 | rollout_score = np.mean(r)
123 |
124 | logger.log_kv('fit_epochs', job_data['fit_epochs'])
125 | logger.log_kv('rollout_score', rollout_score)
126 | try:
127 | rollout_metric = e.env.env.evaluate_success(iter_paths)
128 | logger.log_kv('rollout_metric', rollout_metric)
129 | except:
130 | pass
131 |
132 | print("Data gathered, fitting model ...")
133 | if job_data['refresh_fit']:
134 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+123*outer_iter,
135 | **job_data) for i in range(job_data['num_models'])]
136 |
137 | for i, model in enumerate(models):
138 | epoch_loss = model.fit(s, a, sp, job_data['fit_mb_size'], job_data['fit_epochs'])
139 | logger.log_kv('loss_before_' + str(i), epoch_loss[0])
140 | logger.log_kv('loss_after_' + str(i), epoch_loss[-1])
141 |
142 | mpc_policy = MPCPolicy(env=e, fitted_model=models, seed=SEED+12345*outer_iter, **job_data)
143 |
144 | if job_data['eval_rollouts'] > 0:
145 | print("Performing validation rollouts ... ")
146 | eval_paths = evaluate_policy(mpc_policy.env, mpc_policy, mpc_policy.fitted_model[0], noise_level=0.0,
147 | real_step=True, num_episodes=job_data['eval_rollouts'], visualize=False)
148 | eval_score = np.mean([np.sum(p['rewards']) for p in eval_paths])
149 | logger.log_kv('eval_score', eval_score)
150 | try:
151 | eval_metric = e.env.env.evaluate_success(eval_paths)
152 | logger.log_kv('eval_metric', eval_metric)
153 | except:
154 | pass
155 | else:
156 | eval_paths = []
157 |
158 | exp_data = dict(policy=mpc_policy, fitted_model=mpc_policy.fitted_model,
159 | log=logger.log, rollout_paths=iter_paths, eval_paths=eval_paths)
160 | if outer_iter > 0 and outer_iter % job_data['save_freq'] == 0:
161 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb'))
162 | pickle.dump(exp_data, open(OUT_DIR + '/iteration_' + str(outer_iter) + '.pickle', 'wb'))
163 |
164 | tf = timer.time()
165 | logger.log_kv('iter_time', tf-ts)
166 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1,
167 | logger.get_current_log().items()))
168 | print(tabulate(print_data))
169 | logger.save_log(OUT_DIR+'/')
170 | make_train_plots(log=logger.log, keys=['rollout_score', 'eval_score', 'rollout_metric', 'eval_metric'],
171 | save_loc=OUT_DIR+'/')
172 |
173 | if DEBUG:
174 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], False, 5, True)
175 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], True, 5, True)
176 |
177 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb')) # final save
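For reference, run_model_accel_npg.py reads its configuration with eval(), so the config file is a plain Python dict literal. Below is an illustrative sketch covering only the keys the script above actually reads; the values are placeholders, not the settings shipped in configs/point_mass.txt or configs/reacher.txt.

    # Illustrative job_data dict for run_model_accel_npg.py (values are placeholders).
    {
        'env_name'      : 'mjrl_point_mass-v0',
        'seed'          : 123,
        'num_iter'      : 10,        # outer iterations of the model-based loop
        'n_init_paths'  : 10,        # rollouts with the exploratory policy at iteration 0
        'paths_per_iter': 5,         # MPC rollouts per subsequent iteration
        'max_paths'     : 500,       # cap on stored paths (oldest are dropped)
        'init_log_std'  : -0.5,      # log std of the initial exploratory MLP policy
        'noisy_mpc'     : True,
        'noise_level'   : 0.1,
        'fit_mb_size'   : 64,        # minibatch size for dynamics model fitting
        'fit_epochs'    : 25,
        'refresh_fit'   : False,     # re-initialize the models every iteration if True
        'num_models'    : 3,         # ensemble size (defaults to 1 if omitted)
        'omega'         : 0.0,       # forced to 0.0 when num_models == 1
        'eval_rollouts' : 5,
        'save_freq'     : 10,
        'device'        : 'cpu',
        'debug_mode'    : False,
        # MPC parameters forwarded to MPCPolicy via **job_data
        'plan_horizon'  : 16,
        'plan_paths'    : 100,
        'kappa'         : 2.0,
        'filter_coefs'  : {'f1': 0.25, 'f2': 0.8, 'f3': 0.0, 'f4': 0.0},
    }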
--------------------------------------------------------------------------------
/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py
--------------------------------------------------------------------------------
/mjrl/algos/model_accel/run_experiments/utils/reward_functions/mjrl_point_mass.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def reward_function(paths):
4 | # path has two keys: observations and actions
5 | # path["observations"] : (num_traj, horizon, obs_dim)
6 | # return paths that contain rewards in path["rewards"]
7 | # path["rewards"] should have shape (num_traj, horizon)
8 | obs = paths["observations"]
9 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs
10 | agent_pos = obs[:, :, :2]
11 | target_pos = obs[:, :, -2:]
12 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1)
13 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1)
14 | rewards = -1.0 * l1_dist - 0.5 * l2_dist
15 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s')
16 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
17 | return paths
18 |
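A quick shape check for the helper above, using random data; obs_dim of 6 matches the point mass observation layout (agent x/y, two velocity components, target x/y), and reward_function is the function defined above.

    import numpy as np
    # reward_function as defined in mjrl_point_mass.py above

    num_traj, horizon, obs_dim = 4, 25, 6
    paths = {
        "observations": np.random.randn(num_traj, horizon, obs_dim),
        "actions":      np.random.randn(num_traj, horizon, 2),
    }
    paths = reward_function(paths)
    assert paths["rewards"].shape == (num_traj, horizon)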
--------------------------------------------------------------------------------
/mjrl/algos/model_accel/run_experiments/utils/visualize_policy.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import mjrl.envs
3 | import trajopt.envs
4 | import mj_envs
5 | import click
6 | import os
7 | import gym
8 | import numpy as np
9 | import pickle
10 | import torch
11 | from mjrl.utils.gym_env import GymEnv
12 | from mjrl.policies.gaussian_mlp import MLP
13 | import trajopt.envs
14 |
15 | DESC = '''
16 | Helper script to visualize policy (in mjrl format).\n
17 | USAGE:\n
18 | Visualizes policy on the env\n
19 | $ python utils/visualize_policy.py --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n
20 | '''
21 |
22 | # MAIN =========================================================
23 | @click.command(help=DESC)
24 | @click.option('--env_name', type=str, help='environment to load', required=True)
25 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None)
26 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation')
27 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123)
28 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10)
29 | @click.option('--log_std', type=float, default=-0.5)
30 | @click.option('--terminate', type=bool, default=True)
31 | @click.option('--device_path', type=str, default=None)
32 | def main(env_name, policy, mode, seed, episodes, log_std, terminate, device_path):
33 | render = True
34 |
35 | # TODO(Aravind): Map to hardware if device_path is specified
36 |
37 | e = GymEnv(env_name)
38 | e.set_seed(seed)
39 | np.random.seed(seed)
40 | torch.manual_seed(seed)
41 | if policy is not None:
42 | policy = pickle.load(open(policy, 'rb'))
43 | else:
44 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=log_std)
45 |
46 | for ep in range(episodes):
47 | o = e.reset()
48 | rew = 0.0
49 | t = 0
50 | done = False
51 | while t < e.horizon and done is False:
52 | o = e.get_obs()
53 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation']
54 | next_o, r, done, info = e.step(a)
55 | if terminate is False:
56 | done = False
57 | rew = rew + r
58 | t = t + 1
59 | if render:
60 | e.render()
61 | if done and t < e.horizon - 1:
62 | print("Episode terminated early")
63 | print("episode score = %f " % rew)
64 |
65 | e.reset()
66 |
67 |
68 | if __name__ == '__main__':
69 | main()
70 |
--------------------------------------------------------------------------------
/mjrl/algos/model_accel/run_experiments/utils/visualize_trajectories.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import click
3 | import json
4 | import numpy as np
5 | import torch
6 | import mjrl.envs
7 | import trajopt.envs
8 | import mj_envs
9 | import mjrl.utils.tensor_utils as tensor_utils
10 |
11 | from mjrl.utils.gym_env import GymEnv
12 | from mjrl.algos.model_accel.sampling import evaluate_policy
13 |
14 | DESC = '''
15 | Helper script to visualize optimized trajectories (list of trajectories in trajopt format).\n
16 | USAGE:\n
17 | $ python utils/visualize_trajectories.py --file path_to_file.pickle\n
18 | '''
19 | @click.command(help=DESC)
20 | @click.option('--file', type=str, help='pickle file with trajectories', required=True)
21 | @click.option('--seed', type=int, default=123)
22 | @click.option('--noise_level', type=float, default=0.0)
23 | @click.option('--num_episodes', type=int, help='number of times to play trajectories', default=5)
24 | @click.option('--config', type=str, help='if provided MPC params from here will be used.', default=None)
25 | @click.option('--device_path', type=str, default=None)
26 | def main(file, seed, noise_level, num_episodes, config, device_path):
27 | exp_data = pickle.load(open(file, 'rb'))
28 | policy = exp_data['policy']
29 | model = exp_data['fitted_model']
30 | model = model[-1] if type(model) == list else model
31 | env_id = policy.env.env_id
32 | render = True
33 |
34 | # TODO(Aravind): Map to hardware if device_path is specified
35 |
36 | env = GymEnv(env_id)
37 | policy.env = env
38 |
39 | env.set_seed(seed)
40 | np.random.seed(seed)
41 | torch.manual_seed(seed)
42 |
43 | if config is not None:
44 | try:
45 | with open(config, 'r') as f:
46 | config = eval(f.read())
47 | except:
48 | with open(config, 'r') as f:
49 | config = json.load(f)
50 | policy.plan_horizon = config['plan_horizon']
51 | policy.num_traj = config['plan_paths']
52 | policy.kappa = config['kappa']
53 | policy.filter_coefs = [config['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']]
54 | policy.omega = config['omega'] if 'omega' in config.keys() else 0.0
55 |
56 | # TODO(Aravind): Implement capability to set predicted state for rendering purposes
57 | # evaluate_policy(env, policy, model, noise_level, real_step=False, num_episodes=num_episodes, visualize=render)
58 | evaluate_policy(env, policy, model, noise_level, real_step=True, num_episodes=num_episodes, visualize=render)
59 |
60 | # final close out
61 | env.reset()
62 |
63 |
64 | if __name__ == '__main__':
65 | main()
66 |
--------------------------------------------------------------------------------
/mjrl/algos/npg_cg.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.disable(logging.CRITICAL)
3 | import numpy as np
4 | import scipy as sp
5 | import scipy.sparse.linalg as spLA
6 | import copy
7 | import time as timer
8 | import torch
9 | import torch.nn as nn
10 | from torch.autograd import Variable
11 | import copy
12 |
13 | # samplers
14 | import mjrl.samplers.core as trajectory_sampler
15 |
16 | # utility functions
17 | import mjrl.utils.process_samples as process_samples
18 | from mjrl.utils.logger import DataLog
19 | from mjrl.utils.cg_solve import cg_solve
20 | from mjrl.algos.batch_reinforce import BatchREINFORCE
21 |
22 |
23 | class NPG(BatchREINFORCE):
24 | def __init__(self, env, policy, baseline,
25 | normalized_step_size=0.01,
26 | const_learn_rate=None,
27 | FIM_invert_args={'iters': 10, 'damping': 1e-4},
28 | hvp_sample_frac=1.0,
29 | seed=123,
30 | save_logs=False,
31 | kl_dist=None,
32 | input_normalization=None,
33 | **kwargs
34 | ):
35 | """
36 | All inputs are expected in mjrl's format unless specified
37 | :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
38 | :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
39 | :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
40 | :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG}
41 | :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
42 | :param seed: random seed
43 | """
44 |
45 | self.env = env
46 | self.policy = policy
47 | self.baseline = baseline
48 | self.alpha = const_learn_rate
49 | self.n_step_size = normalized_step_size if kl_dist is None else 2.0 * kl_dist
50 | self.seed = seed
51 | self.save_logs = save_logs
52 | self.FIM_invert_args = FIM_invert_args
53 | self.hvp_subsample = hvp_sample_frac
54 | self.running_score = None
55 | if save_logs: self.logger = DataLog()
56 | # input normalization (running average)
57 | self.input_normalization = input_normalization
58 | if self.input_normalization is not None:
59 | if self.input_normalization > 1 or self.input_normalization <= 0:
60 | self.input_normalization = None
61 |
62 | def HVP(self, observations, actions, vector, regu_coef=None):
63 | regu_coef = self.FIM_invert_args['damping'] if regu_coef is None else regu_coef
64 | vec = Variable(torch.from_numpy(vector).float(), requires_grad=False)
65 | if self.hvp_subsample is not None and self.hvp_subsample < 0.99:
66 | num_samples = observations.shape[0]
67 | rand_idx = np.random.choice(num_samples, size=int(self.hvp_subsample*num_samples))
68 | obs = observations[rand_idx]
69 | act = actions[rand_idx]
70 | else:
71 | obs = observations
72 | act = actions
73 | old_dist_info = self.policy.old_dist_info(obs, act)
74 | new_dist_info = self.policy.new_dist_info(obs, act)
75 | mean_kl = self.policy.mean_kl(new_dist_info, old_dist_info)
76 | grad_fo = torch.autograd.grad(mean_kl, self.policy.trainable_params, create_graph=True)
77 | flat_grad = torch.cat([g.contiguous().view(-1) for g in grad_fo])
78 | h = torch.sum(flat_grad*vec)
79 | hvp = torch.autograd.grad(h, self.policy.trainable_params)
80 | hvp_flat = np.concatenate([g.contiguous().view(-1).data.numpy() for g in hvp])
81 | return hvp_flat + regu_coef*vector
82 |
83 | def build_Hvp_eval(self, inputs, regu_coef=None):
84 | def eval(v):
85 | full_inp = inputs + [v] + [regu_coef]
86 | Hvp = self.HVP(*full_inp)
87 | return Hvp
88 | return eval
89 |
90 | # ----------------------------------------------------------
91 | def train_from_paths(self, paths):
92 |
93 | observations, actions, advantages, base_stats, self.running_score = self.process_paths(paths)
94 | if self.save_logs: self.log_rollout_statistics(paths)
95 |
96 | # Keep track of times for various computations
97 | t_gLL = 0.0
98 | t_FIM = 0.0
99 |
100 | # normalize inputs if necessary
101 | if self.input_normalization:
102 | data_in_shift, data_in_scale = np.mean(observations, axis=0), np.std(observations, axis=0)
103 | pi_in_shift, pi_in_scale = self.policy.model.in_shift.data.numpy(), self.policy.model.in_scale.data.numpy()
104 | pi_out_shift, pi_out_scale = self.policy.model.out_shift.data.numpy(), self.policy.model.out_scale.data.numpy()
105 | pi_in_shift = self.input_normalization * pi_in_shift + (1-self.input_normalization) * data_in_shift
106 | pi_in_scale = self.input_normalization * pi_in_scale + (1-self.input_normalization) * data_in_scale
107 | self.policy.model.set_transformations(pi_in_shift, pi_in_scale, pi_out_shift, pi_out_scale)
108 |
109 | # Optimization algorithm
110 | # --------------------------
111 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
112 |
113 | # VPG
114 | ts = timer.time()
115 | vpg_grad = self.flat_vpg(observations, actions, advantages)
116 | t_gLL += timer.time() - ts
117 |
118 | # NPG
119 | ts = timer.time()
120 | hvp = self.build_Hvp_eval([observations, actions],
121 | regu_coef=self.FIM_invert_args['damping'])
122 | npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(),
123 | cg_iters=self.FIM_invert_args['iters'])
124 | t_FIM += timer.time() - ts
125 |
126 | # Step size computation
127 | # --------------------------
128 | if self.alpha is not None:
129 | alpha = self.alpha
130 | n_step_size = (alpha ** 2) * np.dot(vpg_grad.T, npg_grad)
131 | else:
132 | n_step_size = self.n_step_size
133 | alpha = np.sqrt(np.abs(self.n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))
134 |
135 | # Policy update
136 | # --------------------------
137 | curr_params = self.policy.get_param_values()
138 | new_params = curr_params + alpha * npg_grad
139 | self.policy.set_param_values(new_params, set_new=True, set_old=False)
140 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
141 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
142 | self.policy.set_param_values(new_params, set_new=True, set_old=True)
143 |
144 | # Log information
145 | if self.save_logs:
146 | self.logger.log_kv('alpha', alpha)
147 | self.logger.log_kv('delta', n_step_size)
148 | self.logger.log_kv('time_vpg', t_gLL)
149 | self.logger.log_kv('time_npg', t_FIM)
150 | self.logger.log_kv('kl_dist', kl_dist)
151 | self.logger.log_kv('surr_improvement', surr_after - surr_before)
152 | self.logger.log_kv('running_score', self.running_score)
153 | try:
154 | self.env.env.env.evaluate_success(paths, self.logger)
155 | except:
156 | # nested logic for backwards compatibility. TODO: clean this up.
157 | try:
158 | success_rate = self.env.env.env.evaluate_success(paths)
159 | self.logger.log_kv('success_rate', success_rate)
160 | except:
161 | pass
162 |
163 | return base_stats
164 |
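The constructor above slots into the rest of mjrl. A minimal wiring sketch follows; the NPG, MLP, and MLPBaseline signatures come from this repository, while the train_agent keyword names are assumptions about mjrl.utils.train_agent and may need adjusting.

    import mjrl.envs                                   # registers the mjrl_* environments
    from mjrl.utils.gym_env import GymEnv
    from mjrl.policies.gaussian_mlp import MLP
    from mjrl.baselines.mlp_baseline import MLPBaseline
    from mjrl.algos.npg_cg import NPG
    from mjrl.utils.train_agent import train_agent

    SEED = 123
    e = GymEnv('mjrl_swimmer-v0')
    policy = MLP(e.spec, hidden_sizes=(32, 32), seed=SEED)
    baseline = MLPBaseline(e.spec, learn_rate=1e-3, batch_size=64, epochs=2)
    agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True)

    # Keyword names below are assumed, not verified against mjrl.utils.train_agent.
    train_agent(job_name='swimmer_npg_example', agent=agent, seed=SEED,
                niter=50, gamma=0.995, gae_lambda=0.97, num_cpu=1,
                sample_mode='trajectories', num_traj=40,
                save_freq=10, evaluation_rollouts=5)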
--------------------------------------------------------------------------------
/mjrl/algos/ppo_clip.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.disable(logging.CRITICAL)
3 | import numpy as np
4 | import scipy as sp
5 | import scipy.sparse.linalg as spLA
6 | import copy
7 | import time as timer
8 | import torch
9 | import torch.nn as nn
10 | from torch.autograd import Variable
11 | import copy
12 |
13 | # samplers
14 | import mjrl.samplers.core as trajectory_sampler
15 |
16 | # utility functions
17 | import mjrl.utils.process_samples as process_samples
18 | from mjrl.utils.logger import DataLog
19 | from mjrl.utils.cg_solve import cg_solve
20 | from mjrl.algos.batch_reinforce import BatchREINFORCE
21 |
22 |
23 | class PPO(BatchREINFORCE):
24 | def __init__(self, env, policy, baseline,
25 | clip_coef = 0.2,
26 | epochs = 10,
27 | mb_size = 64,
28 | learn_rate = 3e-4,
29 | seed = 123,
30 | save_logs = False,
31 | **kwargs
32 | ):
33 |
34 | self.env = env
35 | self.policy = policy
36 | self.baseline = baseline
37 | self.learn_rate = learn_rate
38 | self.seed = seed
39 | self.save_logs = save_logs
40 | self.clip_coef = clip_coef
41 | self.epochs = epochs
42 | self.mb_size = mb_size
43 | self.running_score = None
44 | if save_logs: self.logger = DataLog()
45 |
46 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=learn_rate)
47 |
48 | def PPO_surrogate(self, observations, actions, advantages):
49 | adv_var = Variable(torch.from_numpy(advantages).float(), requires_grad=False)
50 | old_dist_info = self.policy.old_dist_info(observations, actions)
51 | new_dist_info = self.policy.new_dist_info(observations, actions)
52 | LR = self.policy.likelihood_ratio(new_dist_info, old_dist_info)
53 | LR_clip = torch.clamp(LR, min=1-self.clip_coef, max=1+self.clip_coef)
54 | ppo_surr = torch.mean(torch.min(LR*adv_var,LR_clip*adv_var))
55 | return ppo_surr
56 |
57 | # ----------------------------------------------------------
58 | def train_from_paths(self, paths):
59 |
60 | # Concatenate from all the trajectories
61 | observations = np.concatenate([path["observations"] for path in paths])
62 | actions = np.concatenate([path["actions"] for path in paths])
63 | advantages = np.concatenate([path["advantages"] for path in paths])
64 | # Advantage whitening
65 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
66 | # NOTE : advantage should be zero mean in expectation
67 | # normalized step size invariant to advantage scaling,
68 | # but scaling can help with least squares
69 |
70 | # cache return distributions for the paths
71 | path_returns = [sum(p["rewards"]) for p in paths]
72 | mean_return = np.mean(path_returns)
73 | std_return = np.std(path_returns)
74 | min_return = np.amin(path_returns)
75 | max_return = np.amax(path_returns)
76 | base_stats = [mean_return, std_return, min_return, max_return]
77 | self.running_score = mean_return if self.running_score is None else \
78 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters
79 | if self.save_logs: self.log_rollout_statistics(paths)
80 |
81 | # Optimization algorithm
82 | # --------------------------
83 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
84 | params_before_opt = self.policy.get_param_values()
85 |
86 | ts = timer.time()
87 | num_samples = observations.shape[0]
88 | for ep in range(self.epochs):
89 | for mb in range(int(num_samples / self.mb_size)):
90 | rand_idx = np.random.choice(num_samples, size=self.mb_size)
91 | obs = observations[rand_idx]
92 | act = actions[rand_idx]
93 | adv = advantages[rand_idx]
94 | self.optimizer.zero_grad()
95 | loss = - self.PPO_surrogate(obs, act, adv)
96 | loss.backward()
97 | self.optimizer.step()
98 |
99 | params_after_opt = self.policy.get_param_values()
100 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
101 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
102 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True)
103 | t_opt = timer.time() - ts
104 |
105 | # Log information
106 | if self.save_logs:
107 | self.logger.log_kv('t_opt', t_opt)
108 | self.logger.log_kv('kl_dist', kl_dist)
109 | self.logger.log_kv('surr_improvement', surr_after - surr_before)
110 | self.logger.log_kv('running_score', self.running_score)
111 | try:
112 | self.env.env.env.evaluate_success(paths, self.logger)
113 | except:
114 | # nested logic for backwards compatibility. TODO: clean this up.
115 | try:
116 | success_rate = self.env.env.env.evaluate_success(paths)
117 | self.logger.log_kv('success_rate', success_rate)
118 | except:
119 | pass
120 |
121 | return base_stats
122 |
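PPO_surrogate above is the standard clipped surrogate; the same computation on toy tensors, independent of any policy class:

    import torch

    clip_coef = 0.2
    LR  = torch.tensor([0.5, 1.0, 1.5, 3.0])      # likelihood ratios pi_new / pi_old
    adv = torch.tensor([1.0, -1.0, 2.0, -2.0])    # advantages

    LR_clip = torch.clamp(LR, min=1 - clip_coef, max=1 + clip_coef)
    ppo_surr = torch.mean(torch.min(LR * adv, LR_clip * adv))   # pessimistic (min) of clipped and unclipped terms
    print(ppo_surr)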
--------------------------------------------------------------------------------
/mjrl/algos/trpo.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.disable(logging.CRITICAL)
3 | import numpy as np
4 | import scipy as sp
5 | import scipy.sparse.linalg as spLA
6 | import copy
7 | import time as timer
8 | import torch
9 | import torch.nn as nn
10 | from torch.autograd import Variable
11 | import copy
12 |
13 | # samplers
14 | import mjrl.samplers.core as trajectory_sampler
15 |
16 |
17 | # utility functions
18 | import mjrl.utils.process_samples as process_samples
19 | from mjrl.utils.logger import DataLog
20 | from mjrl.utils.cg_solve import cg_solve
21 |
22 | # Import NPG
23 | from mjrl.algos.npg_cg import NPG
24 |
25 | class TRPO(NPG):
26 | def __init__(self, env, policy, baseline,
27 | kl_dist=0.01,
28 | FIM_invert_args={'iters': 10, 'damping': 1e-4},
29 | hvp_sample_frac=1.0,
30 | seed=123,
31 | save_logs=False,
32 | normalized_step_size=0.01,
33 | **kwargs
34 | ):
35 | """
36 | All inputs are expected in mjrl's format unless specified
37 | :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance
38 | :param kl_dist: desired KL distance between steps. Overrides normalized_step_size.
39 | :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well)
40 | :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG}
41 | :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow)
42 | :param seed: random seed
43 | """
44 |
45 | self.env = env
46 | self.policy = policy
47 | self.baseline = baseline
48 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size
49 | self.seed = seed
50 | self.save_logs = save_logs
51 | self.FIM_invert_args = FIM_invert_args
52 | self.hvp_subsample = hvp_sample_frac
53 | self.running_score = None
54 | if save_logs: self.logger = DataLog()
55 |
56 | def train_from_paths(self, paths):
57 |
58 | # Concatenate from all the trajectories
59 | observations = np.concatenate([path["observations"] for path in paths])
60 | actions = np.concatenate([path["actions"] for path in paths])
61 | advantages = np.concatenate([path["advantages"] for path in paths])
62 | # Advantage whitening
63 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6)
64 | # NOTE : advantage should be zero mean in expectation
65 | # normalized step size invariant to advantage scaling,
66 | # but scaling can help with least squares
67 |
68 | # cache return distributions for the paths
69 | path_returns = [sum(p["rewards"]) for p in paths]
70 | mean_return = np.mean(path_returns)
71 | std_return = np.std(path_returns)
72 | min_return = np.amin(path_returns)
73 | max_return = np.amax(path_returns)
74 | base_stats = [mean_return, std_return, min_return, max_return]
75 | self.running_score = mean_return if self.running_score is None else \
76 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters
77 | if self.save_logs: self.log_rollout_statistics(paths)
78 |
79 | # Keep track of times for various computations
80 | t_gLL = 0.0
81 | t_FIM = 0.0
82 |
83 | # Optimization algorithm
84 | # --------------------------
85 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
86 |
87 | # VPG
88 | ts = timer.time()
89 | vpg_grad = self.flat_vpg(observations, actions, advantages)
90 | t_gLL += timer.time() - ts
91 |
92 | # NPG
93 | ts = timer.time()
94 | hvp = self.build_Hvp_eval([observations, actions],
95 | regu_coef=self.FIM_invert_args['damping'])
96 | npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(),
97 | cg_iters=self.FIM_invert_args['iters'])
98 | t_FIM += timer.time() - ts
99 |
100 | # Step size computation
101 | # --------------------------
102 | n_step_size = 2.0*self.kl_dist
103 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))
104 |
105 | # Policy update
106 | # --------------------------
107 | curr_params = self.policy.get_param_values()
108 | for k in range(100):
109 | new_params = curr_params + alpha * npg_grad
110 | self.policy.set_param_values(new_params, set_new=True, set_old=False)
111 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
112 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
113 | if kl_dist < self.kl_dist:
114 | break
115 | else:
116 | alpha = 0.9*alpha # backtrack
117 | print("Step size too high. Backtracking. | kl = %f | surr diff = %f" % \
118 | (kl_dist, surr_after-surr_before) )
119 | if k == 99:
120 | alpha = 0.0
121 |
122 | new_params = curr_params + alpha * npg_grad
123 | self.policy.set_param_values(new_params, set_new=True, set_old=False)
124 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0]
125 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0]
126 | self.policy.set_param_values(new_params, set_new=True, set_old=True)
127 |
128 | # Log information
129 | if self.save_logs:
130 | self.logger.log_kv('alpha', alpha)
131 | self.logger.log_kv('delta', n_step_size)
132 | self.logger.log_kv('time_vpg', t_gLL)
133 | self.logger.log_kv('time_npg', t_FIM)
134 | self.logger.log_kv('kl_dist', kl_dist)
135 | self.logger.log_kv('surr_improvement', surr_after - surr_before)
136 | self.logger.log_kv('running_score', self.running_score)
137 | try:
138 | self.env.env.env.evaluate_success(paths, self.logger)
139 | except:
140 | # nested logic for backwards compatibility. TODO: clean this up.
141 | try:
142 | success_rate = self.env.env.env.evaluate_success(paths)
143 | self.logger.log_kv('success_rate', success_rate)
144 | except:
145 | pass
146 |
147 | return base_stats
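The only change relative to NPG is the backtracking line search: the step size is shrunk by 0.9 until the measured KL falls below kl_dist, and zeroed if 100 attempts fail. A standalone sketch of that schedule, where kl_of is a hypothetical callable standing in for kl_old_new evaluated at a candidate step:

    import numpy as np

    def backtrack(alpha0, kl_of, kl_limit, max_tries=100, shrink=0.9):
        # kl_of(alpha): hypothetical helper returning the KL between the old policy
        # and the policy after a step of size alpha along the NPG direction.
        alpha = alpha0
        for _ in range(max_tries):
            if kl_of(alpha) < kl_limit:
                return alpha
            alpha *= shrink
        return 0.0                       # give up, as train_from_paths does

    # toy example where the KL grows quadratically with the step size
    alpha = backtrack(alpha0=1.0, kl_of=lambda a: 0.5 * a ** 2, kl_limit=0.01)
    print(alpha)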
--------------------------------------------------------------------------------
/mjrl/baselines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/baselines/__init__.py
--------------------------------------------------------------------------------
/mjrl/baselines/linear_baseline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 |
4 |
5 | class LinearBaseline:
6 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-5):
7 | self.inp = inp
8 | self._reg_coeff = reg_coeff
9 | self._coeffs = None
10 |
11 | def _features(self, paths):
12 | if self.inp == 'env_features':
13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths])
14 | else:
15 | o = np.concatenate([path["observations"] for path in paths])
16 | o = np.clip(o, -10, 10)/10.0
17 | if o.ndim > 2:
18 | o = o.reshape(o.shape[0], -1)
19 | N, n = o.shape
20 | num_feat = int( n + 1 + 4 ) # linear + bias (1.0) + time till pow 4
21 | feat_mat = np.ones((N, num_feat))
22 |
23 | # linear features
24 | feat_mat[:,:n] = o
25 |
26 | k = 0 # start from this row
27 | for i in range(len(paths)):
28 | l = len(paths[i]["rewards"])
29 | al = np.arange(l)/1000.0
30 | for j in range(4):
31 | feat_mat[k:k+l, -4+j] = al**(j+1)
32 | k += l
33 |
34 | return feat_mat
35 |
36 | def fit(self, paths, return_errors=False):
37 |
38 | featmat = self._features(paths)
39 | returns = np.concatenate([path["returns"] for path in paths])
40 |
41 | if return_errors:
42 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape)
43 | errors = returns - predictions
44 | error_before = np.sum(errors**2)/np.sum(returns**2)
45 |
46 | reg_coeff = copy.deepcopy(self._reg_coeff)
47 | for _ in range(10):
48 | self._coeffs = np.linalg.lstsq(
49 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]),
50 | featmat.T.dot(returns)
51 | )[0]
52 | if not np.any(np.isnan(self._coeffs)):
53 | break
54 | reg_coeff *= 10
55 |
56 | if return_errors:
57 | predictions = featmat.dot(self._coeffs)
58 | errors = returns - predictions
59 | error_after = np.sum(errors**2)/np.sum(returns**2)
60 | return error_before, error_after
61 |
62 | def predict(self, path):
63 | if self._coeffs is None:
64 | return np.zeros(len(path["rewards"]))
65 | return self._features([path]).dot(self._coeffs)
66 |
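A small smoke test of the baseline above on synthetic paths; the only fields it touches are "observations", "rewards" (for path lengths), and "returns". The toy path construction is hypothetical, not mjrl's sampler output.

    import numpy as np
    # LinearBaseline as defined above

    def toy_path(horizon=20, obs_dim=5):
        obs = np.random.randn(horizon, obs_dim)
        rewards = np.random.randn(horizon)
        returns = np.cumsum(rewards[::-1])[::-1]   # undiscounted reward-to-go
        return dict(observations=obs, rewards=rewards, returns=returns)

    paths = [toy_path() for _ in range(10)]
    baseline = LinearBaseline(env_spec=None)       # env_spec is not used by this class
    err_before, err_after = baseline.fit(paths, return_errors=True)
    values = baseline.predict(paths[0])            # one value prediction per timestep
    assert values.shape == (20,)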
--------------------------------------------------------------------------------
/mjrl/baselines/mlp_baseline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 | import torch
4 | import torch.nn as nn
5 | from torch.autograd import Variable
6 | from mjrl.utils.optimize_model import fit_data
7 |
8 | import pickle
9 |
10 | class MLPBaseline:
11 | def __init__(self, env_spec, inp_dim=None, inp='obs', learn_rate=1e-3, reg_coef=0.0,
12 | batch_size=64, epochs=1, use_gpu=False, hidden_sizes=(128, 128)):
13 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim
14 | self.batch_size = batch_size
15 | self.epochs = epochs
16 | self.reg_coef = reg_coef
17 | self.use_gpu = use_gpu
18 | self.inp = inp
19 | self.hidden_sizes = hidden_sizes
20 |
21 | self.model = nn.Sequential()
22 | layer_sizes = (self.n + 4, ) + hidden_sizes + (1, )
23 | for i in range(len(layer_sizes) - 1):
24 | layer_id = 'fc_' + str(i)
25 | relu_id = 'relu_' + str(i)
26 | self.model.add_module(layer_id, nn.Linear(layer_sizes[i], layer_sizes[i+1]))
27 | if i != len(layer_sizes) - 2:
28 | self.model.add_module(relu_id, nn.ReLU())
29 |
30 | if self.use_gpu:
31 | self.model.cuda()
32 |
33 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learn_rate, weight_decay=reg_coef)
34 | self.loss_function = torch.nn.MSELoss()
35 |
36 | def _features(self, paths):
37 | if self.inp == 'env_features':
38 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths])
39 | else:
40 | o = np.concatenate([path["observations"] for path in paths])
41 | o = np.clip(o, -10, 10)/10.0
42 | if o.ndim > 2:
43 | o = o.reshape(o.shape[0], -1)
44 | N, n = o.shape
45 | num_feat = int( n + 4 ) # linear + time till pow 4
46 | feat_mat = np.ones((N, num_feat)) # memory allocation
47 |
48 | # linear features
49 | feat_mat[:,:n] = o
50 |
51 | k = 0 # start from this row
52 | for i in range(len(paths)):
53 | l = len(paths[i]["rewards"])
54 | al = np.arange(l)/1000.0
55 | for j in range(4):
56 | feat_mat[k:k+l, -4+j] = al**(j+1)
57 | k += l
58 | return feat_mat
59 |
60 |
61 | def fit(self, paths, return_errors=False):
62 |
63 | featmat = self._features(paths)
64 | returns = np.concatenate([path["returns"] for path in paths]).reshape(-1, 1)
65 | featmat = featmat.astype('float32')
66 | returns = returns.astype('float32')
67 | num_samples = returns.shape[0]
68 |
69 | # Make variables with the above data
70 | if self.use_gpu:
71 | featmat_var = Variable(torch.from_numpy(featmat).cuda(), requires_grad=False)
72 | returns_var = Variable(torch.from_numpy(returns).cuda(), requires_grad=False)
73 | else:
74 | featmat_var = Variable(torch.from_numpy(featmat), requires_grad=False)
75 | returns_var = Variable(torch.from_numpy(returns), requires_grad=False)
76 |
77 | if return_errors:
78 | if self.use_gpu:
79 | predictions = self.model(featmat_var).cpu().data.numpy().ravel()
80 | else:
81 | predictions = self.model(featmat_var).data.numpy().ravel()
82 | errors = returns.ravel() - predictions
83 | error_before = np.sum(errors**2)/(np.sum(returns**2) + 1e-8)
84 |
85 | epoch_losses = fit_data(self.model, featmat_var, returns_var, self.optimizer,
86 | self.loss_function, self.batch_size, self.epochs)
87 |
88 | if return_errors:
89 | if self.use_gpu:
90 | predictions = self.model(featmat_var).cpu().data.numpy().ravel()
91 | else:
92 | predictions = self.model(featmat_var).data.numpy().ravel()
93 | errors = returns.ravel() - predictions
94 | error_after = np.sum(errors**2)/(np.sum(returns**2) + 1e-8)
95 | return error_before, error_after
96 |
97 | def predict(self, path):
98 | featmat = self._features([path]).astype('float32')
99 | if self.use_gpu:
100 | feat_var = Variable(torch.from_numpy(featmat).float().cuda(), requires_grad=False)
101 | prediction = self.model(feat_var).cpu().data.numpy().ravel()
102 | else:
103 | feat_var = Variable(torch.from_numpy(featmat).float(), requires_grad=False)
104 | prediction = self.model(feat_var).data.numpy().ravel()
105 | return prediction
106 |
--------------------------------------------------------------------------------
/mjrl/baselines/quadratic_baseline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 |
4 | class QuadraticBaseline:
5 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-3):
6 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim
7 | self.inp = inp
8 | self._reg_coeff = reg_coeff
9 | self._coeffs = None
10 |
11 | def _features(self, paths):
12 | if self.inp == 'env_features':
13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths])
14 | else:
15 | o = np.concatenate([path["observations"] for path in paths])
16 | o = np.clip(o, -10, 10)/10.0
17 | if o.ndim > 2:
18 | o = o.reshape(o.shape[0], -1)
19 | N, n = o.shape
20 | num_feat = int( n + n*(n+1)/2 + 1 + 4 ) # linear + full quad (symmetric matrix) + bias (1.0) + time till pow 4
21 | feat_mat = np.ones((N, num_feat)) # memory allocation
22 |
23 | # linear features
24 | feat_mat[:,:n] = o
25 |
26 | # quadratic features
27 | k = n # starting from this column in feat_mat
28 | for i in range(n):
29 | for j in range(i, n):
30 | feat_mat[:,k] = o[:,i]*o[:,j] # element-wise product
31 | k += 1
32 |
33 | k = 0 # start from this row
34 | for i in range(len(paths)):
35 | l = len(paths[i]["rewards"])
36 | al = np.arange(l)/1000.0
37 | for j in range(4):
38 | feat_mat[k:k+l, -4+j] = al**(j+1)
39 | k += l
40 |
41 | return feat_mat
42 |
43 |
44 | def fit(self, paths, return_errors=False):
45 |
46 | #featmat = np.concatenate([self._features(path) for path in paths])
47 | featmat = self._features(paths)
48 | returns = np.concatenate([path["returns"] for path in paths])
49 |
50 | if return_errors:
51 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape)
52 | errors = returns - predictions
53 | error_before = np.sum(errors**2)/np.sum(returns**2)
54 |
55 | reg_coeff = copy.deepcopy(self._reg_coeff)
56 | for _ in range(10):
57 | self._coeffs = np.linalg.lstsq(
58 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]),
59 | featmat.T.dot(returns)
60 | )[0]
61 | if not np.any(np.isnan(self._coeffs)):
62 | break
63 | reg_coeff *= 10
64 |
65 | if return_errors:
66 | predictions = featmat.dot(self._coeffs)
67 | errors = returns - predictions
68 | error_after = np.sum(errors**2)/np.sum(returns**2)
69 | return error_before, error_after
70 |
71 | def predict(self, path):
72 | if self._coeffs is None:
73 | return np.zeros(len(path["rewards"]))
74 | return self._features([path]).dot(self._coeffs)
75 |
--------------------------------------------------------------------------------
/mjrl/baselines/zero_baseline.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 |
4 | class ZeroBaseline:
5 | def __init__(self, env_spec, **kwargs):
6 | n = env_spec.observation_dim # number of states
7 | self._coeffs = None
8 |
9 | def fit(self, paths, return_errors=False):
10 | if return_errors:
11 | return 1.0, 1.0
12 |
13 | def predict(self, path):
14 | return np.zeros(len(path["rewards"]))
15 |
--------------------------------------------------------------------------------
/mjrl/envs/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 |
3 | # ----------------------------------------
4 | # mjrl environments
5 | # ----------------------------------------
6 |
7 | register(
8 | id='mjrl_point_mass-v0',
9 | entry_point='mjrl.envs:PointMassEnv',
10 | max_episode_steps=25,
11 | )
12 |
13 | register(
14 | id='mjrl_swimmer-v0',
15 | entry_point='mjrl.envs:SwimmerEnv',
16 | max_episode_steps=500,
17 | )
18 |
19 | register(
20 | id='mjrl_reacher_7dof-v0',
21 | entry_point='mjrl.envs:Reacher7DOFEnv',
22 | max_episode_steps=50,
23 | )
24 |
25 | register(
26 | id='mjrl_peg_insertion-v0',
27 | entry_point='mjrl.envs:PegEnv',
28 | max_episode_steps=50,
29 | )
30 |
31 | from mjrl.envs.mujoco_env import MujocoEnv
32 | # ^^^^^ so that user gets the correct error
33 | # message if mujoco is not installed correctly
34 | from mjrl.envs.point_mass import PointMassEnv
35 | from mjrl.envs.swimmer import SwimmerEnv
36 | from mjrl.envs.reacher_sawyer import Reacher7DOFEnv
37 | from mjrl.envs.peg_insertion_sawyer import PegEnv
38 |
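Once this module is imported, the ids above are available in Gym's registry, so environments can be created either through gym directly or through the GymEnv wrapper used throughout the repo:

    import gym
    import mjrl.envs                       # runs the register() calls above
    from mjrl.utils.gym_env import GymEnv

    env = gym.make('mjrl_point_mass-v0')   # plain gym handle, 25-step episodes
    e = GymEnv('mjrl_swimmer-v0')          # mjrl wrapper exposing .spec, .horizon, etc.
    print(e.horizon, e.observation_dim, e.action_dim)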
--------------------------------------------------------------------------------
/mjrl/envs/assets/peg_insertion.xml:
--------------------------------------------------------------------------------
(MuJoCo model XML for the Sawyer peg insertion task)
--------------------------------------------------------------------------------
/mjrl/envs/assets/point_mass.xml:
--------------------------------------------------------------------------------
(MuJoCo model XML for the point mass environment)
--------------------------------------------------------------------------------
/mjrl/envs/assets/sawyer.xml:
--------------------------------------------------------------------------------
(MuJoCo model XML for the 7-DOF Sawyer reacher)
--------------------------------------------------------------------------------
/mjrl/envs/assets/swimmer.xml:
--------------------------------------------------------------------------------
(MuJoCo model XML for the swimmer environment)
--------------------------------------------------------------------------------
/mjrl/envs/mujoco_env.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from gym import error, spaces
4 | from gym.utils import seeding
5 | import numpy as np
6 | from os import path
7 | import gym
8 | import six
9 | import time as timer
10 |
11 | try:
12 | import mujoco_py
13 | from mujoco_py import load_model_from_path, MjSim, MjViewer
14 | except ImportError as e:
15 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e))
16 |
17 | def get_sim(model_path):
18 | if model_path.startswith("/"):
19 | fullpath = model_path
20 | else:
21 | fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path)
22 | if not path.exists(fullpath):
23 | raise IOError("File %s does not exist" % fullpath)
24 | model = load_model_from_path(fullpath)
25 | return MjSim(model)
26 |
27 | class MujocoEnv(gym.Env):
28 | """Superclass for all MuJoCo environments.
29 | """
30 |
31 | def __init__(self, model_path=None, frame_skip=1, sim=None):
32 |
33 | if sim is None:
34 | self.sim = get_sim(model_path)
35 | else:
36 | self.sim = sim
37 | self.data = self.sim.data
38 | self.model = self.sim.model
39 |
40 | self.frame_skip = frame_skip
41 | self.metadata = {
42 | 'render.modes': ['human', 'rgb_array'],
43 | 'video.frames_per_second': int(np.round(1.0 / self.dt))
44 | }
45 | self.mujoco_render_frames = False
46 |
47 | self.init_qpos = self.data.qpos.ravel().copy()
48 | self.init_qvel = self.data.qvel.ravel().copy()
49 | try:
50 | observation, _reward, done, _info = self.step(np.zeros(self.model.nu))
51 | except NotImplementedError:
52 | observation, _reward, done, _info = self._step(np.zeros(self.model.nu))
53 | assert not done
54 | self.obs_dim = np.sum([o.size for o in observation]) if type(observation) is tuple else observation.size
55 |
56 | bounds = self.model.actuator_ctrlrange.copy()
57 | low = bounds[:, 0]
58 | high = bounds[:, 1]
59 | self.action_space = spaces.Box(low, high, dtype=np.float32)
60 |
61 | high = np.inf*np.ones(self.obs_dim)
62 | low = -high
63 | self.observation_space = spaces.Box(low, high, dtype=np.float32)
64 |
65 | self.seed()
66 |
67 | def seed(self, seed=None):
68 | self.np_random, seed = seeding.np_random(seed)
69 | return [seed]
70 |
71 | # methods to override:
72 | # ----------------------------
73 |
74 | def reset_model(self):
75 | """
76 | Reset the robot degrees of freedom (qpos and qvel).
77 | Implement this in each subclass.
78 | """
79 | raise NotImplementedError
80 |
81 | def mj_viewer_setup(self):
82 | """
83 | Due to specifics of new mujoco rendering, the standard viewer cannot be used
84 | with this set-up. Instead we use this mujoco specific function.
85 | """
86 | pass
87 |
88 | def viewer_setup(self):
89 | """
90 | Does not work. Use mj_viewer_setup() instead
91 | """
92 | pass
93 |
94 | def evaluate_success(self, paths, logger=None):
95 | """
96 | Log various success metrics calculated based on input paths into the logger
97 | """
98 | pass
99 |
100 | # -----------------------------
101 |
102 | def reset(self):
103 | self.sim.reset()
104 | self.sim.forward()
105 | ob = self.reset_model()
106 | return ob
107 |
108 | def set_state(self, qpos, qvel):
109 | assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
110 | old_state = self.sim.get_state()
111 | new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel,
112 | old_state.act, old_state.udd_state)
113 | self.sim.set_state(new_state)
114 | self.sim.forward()
115 |
116 | @property
117 | def dt(self):
118 | return self.model.opt.timestep * self.frame_skip
119 |
120 | def do_simulation(self, ctrl, n_frames):
121 | for i in range(self.model.nu):
122 | self.sim.data.ctrl[i] = ctrl[i]
123 | for _ in range(n_frames):
124 | self.sim.step()
125 | if self.mujoco_render_frames is True:
126 | self.mj_render()
127 |
128 | def mj_render(self):
129 | try:
130 | self.viewer.render()
131 | except:
132 | self.mj_viewer_setup()
133 | self.viewer._run_speed = 0.5
134 | #self.viewer._run_speed /= self.frame_skip
135 | self.viewer.render()
136 |
137 | def render(self, *args, **kwargs):
138 | pass
139 | #return self.mj_render()
140 |
141 | def _get_viewer(self):
142 | pass
143 | #return None
144 |
145 | def state_vector(self):
146 | state = self.sim.get_state()
147 | return np.concatenate([
148 | state.qpos.flat, state.qvel.flat])
149 |
150 | # -----------------------------
151 |
152 | def visualize_policy(self, policy, horizon=1000, num_episodes=1, mode='exploration'):
153 | self.mujoco_render_frames = True
154 | for ep in range(num_episodes):
155 | o = self.reset()
156 | d = False
157 | t = 0
158 | score = 0.0
159 | while t < horizon and d is False:
160 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation']
161 | o, r, d, _ = self.step(a)
162 | t = t+1
163 | score = score + r
164 | print("Episode score = %f" % score)
165 | self.mujoco_render_frames = False
166 |
167 | def visualize_policy_offscreen(self, policy, horizon=1000,
168 | num_episodes=1,
169 | frame_size=(640,480),
170 | mode='exploration',
171 | save_loc='/tmp/',
172 | filename='newvid',
173 | camera_name=None):
174 | import skvideo.io
175 | for ep in range(num_episodes):
176 | print("Episode %d: rendering offline " % ep, end='', flush=True)
177 | o = self.reset()
178 | d = False
179 | t = 0
180 | arrs = []
181 | t0 = timer.time()
182 | while t < horizon and d is False:
183 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation']
184 | o, r, d, _ = self.step(a)
185 | t = t+1
186 | curr_frame = self.sim.render(width=frame_size[0], height=frame_size[1],
187 | mode='offscreen', camera_name=camera_name, device_id=0)
188 | arrs.append(curr_frame[::-1,:,:])
189 | print(t, end=', ', flush=True)
190 | file_name = save_loc + filename + str(ep) + ".mp4"
191 | skvideo.io.vwrite( file_name, np.asarray(arrs))
192 | print("saved", file_name)
193 | t1 = timer.time()
194 | print("time taken = %f"% (t1-t0))
195 |
--------------------------------------------------------------------------------
/mjrl/envs/peg_insertion_sawyer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from mjrl.envs import mujoco_env
4 | from mujoco_py import MjViewer
5 |
6 |
7 | class PegEnv(mujoco_env.MujocoEnv, utils.EzPickle):
8 | def __init__(self):
9 | self.peg_sid = -2
10 | self.target_sid = -1
11 | mujoco_env.MujocoEnv.__init__(self, 'peg_insertion.xml', 4)
12 | utils.EzPickle.__init__(self)
13 | self.peg_sid = self.model.site_name2id("peg_bottom")
14 | self.target_sid = self.model.site_name2id("target")
15 | self.init_body_pos = self.model.body_pos.copy()
16 |
17 | def step(self, a):
18 | self.do_simulation(a, self.frame_skip)
19 | obs = self.get_obs()
20 | reward = self.get_reward(obs, a)
21 | return obs, reward, False, self.get_env_infos()
22 |
23 | def get_obs(self):
24 | return np.concatenate([
25 | self.data.qpos.flat,
26 | self.data.qvel.flat,
27 | self.data.site_xpos[self.peg_sid],
28 | self.data.site_xpos[self.target_sid],
29 | ])
30 |
31 | def get_reward(self, obs, act=None):
32 | obs = np.clip(obs, -10.0, 10.0)
33 | if len(obs.shape) == 1:
34 | # vector obs, called when stepping the env
35 | hand_pos = obs[-6:-3]
36 | target_pos = obs[-3:]
37 | l1_dist = np.sum(np.abs(hand_pos - target_pos))
38 | l2_dist = np.linalg.norm(hand_pos - target_pos)
39 | else:
40 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs
41 | hand_pos = obs[:, :, -6:-3]
42 | target_pos = obs[:, :, -3:]
43 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1)
44 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1)
45 | bonus = 5.0 * (l2_dist < 0.06)
46 | reward = - l1_dist - 5.0 * l2_dist + bonus
47 | return reward
48 |
49 | def compute_path_rewards(self, paths):
50 | # path has two keys: observations and actions
51 | # path["observations"] : (num_traj, horizon, obs_dim)
52 | # path["rewards"] should have shape (num_traj, horizon)
53 | obs = paths["observations"]
54 | rewards = self.get_reward(obs)
55 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
56 |
57 | # --------------------------------
58 | # resets and randomization
59 | # --------------------------------
60 |
61 | def robot_reset(self):
62 | self.set_state(self.init_qpos, self.init_qvel)
63 |
64 | def target_reset(self):
65 | # Randomize goal position
66 | goal_y = self.np_random.uniform(low=0.1, high=0.5)
67 | try:
68 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29)
69 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29)
70 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29)
71 | self.sim.forward()
72 | except:
73 | pass
74 |
75 | def reset_model(self, seed=None):
76 | if seed is not None:
77 | self.seeding = True
78 | self.seed(seed)
79 | self.robot_reset()
80 | self.target_reset()
81 | return self.get_obs()
82 |
83 | # --------------------------------
84 | # get and set states
85 | # --------------------------------
86 |
87 | def get_env_state(self):
88 | target_pos = self.model.body_pos[-1].copy()
89 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(),
90 | target_pos=target_pos)
91 |
92 | def set_env_state(self, state):
93 | self.sim.reset()
94 | qp = state['qp'].copy()
95 | qv = state['qv'].copy()
96 | target_pos = state['target_pos']
97 | self.model.body_pos[-1] = target_pos
98 | goal_y = target_pos[1]
99 | self.data.qpos[:] = qp
100 | self.data.qvel[:] = qv
101 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29)
102 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29)
103 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29)
104 | self.sim.forward()
105 |
106 | # --------------------------------
107 | # utility functions
108 | # --------------------------------
109 |
110 | def get_env_infos(self):
111 | return dict(state=self.get_env_state())
112 |
113 | def mj_viewer_setup(self):
114 | self.viewer = MjViewer(self.sim)
115 | self.viewer.cam.azimuth += 200
116 | self.sim.forward()
117 | self.viewer.cam.distance = self.model.stat.extent*2.0
118 |
--------------------------------------------------------------------------------
/mjrl/envs/point_mass.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from mjrl.envs import mujoco_env
4 | from mujoco_py import MjViewer
5 |
6 |
7 | class PointMassEnv(mujoco_env.MujocoEnv, utils.EzPickle):
8 | def __init__(self):
9 | self.agent_bid = 0
10 | self.target_sid = 0
11 | utils.EzPickle.__init__(self)
12 | mujoco_env.MujocoEnv.__init__(self, 'point_mass.xml', 5)
13 | self.agent_bid = self.sim.model.body_name2id('agent')
14 | self.target_sid = self.sim.model.site_name2id('target')
15 |
16 | def step(self, a):
17 | self.do_simulation(a, self.frame_skip)
18 | obs = self.get_obs()
19 | reward = self.get_reward(obs)
20 | return obs, reward, False, dict(solved=(reward > -0.1), state=self.get_env_state())
21 |
22 | def get_obs(self):
23 | agent_pos = self.data.body_xpos[self.agent_bid].ravel()
24 | target_pos = self.data.site_xpos[self.target_sid].ravel()
25 | return np.concatenate([agent_pos[:2], self.data.qvel.ravel(), target_pos[:2]])
26 |
27 | def get_reward(self, obs, act=None):
28 | if len(obs.shape) == 1:
29 | # vector obs, called when stepping the env
30 | agent_pos = obs[:2]
31 | target_pos = obs[-2:]
32 | l1_dist = np.sum(np.abs(agent_pos - target_pos))
33 | l2_dist = np.linalg.norm(agent_pos - target_pos)
34 | else:
35 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs
36 | agent_pos = obs[:, :, :2]
37 | target_pos = obs[:, :, -2:]
38 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1)
39 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1)
40 | reward = -1.0 * l1_dist - 0.5 * l2_dist
41 | return reward
42 |
43 | def compute_path_rewards(self, paths):
44 | # path has two keys: observations and actions
45 | # path["observations"] : (num_traj, horizon, obs_dim)
46 | # path["rewards"] should have shape (num_traj, horizon)
47 | obs = paths["observations"]
48 | rewards = self.get_reward(obs)
49 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s')
50 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
51 | return paths
52 |
53 | def reset_model(self):
54 | # randomize the agent and goal
55 | agent_x = self.np_random.uniform(low=-1.0, high=1.0)
56 | agent_y = self.np_random.uniform(low=-1.0, high=1.0)
57 | goal_x = self.np_random.uniform(low=-1.0, high=1.0)
58 | goal_y = self.np_random.uniform(low=-1.0, high=1.0)
59 | qp = np.array([agent_x, agent_y])
60 | qv = self.init_qvel.copy()
61 | self.set_state(qp, qv)
62 | self.model.site_pos[self.target_sid][0] = goal_x
63 | self.model.site_pos[self.target_sid][1] = goal_y
64 | self.sim.forward()
65 | return self.get_obs()
66 |
67 | def evaluate_success(self, paths, logger=None):
68 | success = 0.0
69 | for p in paths:
70 | if np.mean(p['env_infos']['solved'][-4:]) > 0.0:
71 | success += 1.0
72 | success_rate = 100.0*success/len(paths)
73 | if logger is None:
74 | # nowhere to log so return the value
75 | return success_rate
76 | else:
77 | # log the success
78 | # can log multiple statistics here if needed
79 | logger.log_kv('success_rate', success_rate)
80 | return None
81 |
82 | # --------------------------------
83 | # get and set states
84 | # --------------------------------
85 |
86 | def get_env_state(self):
87 | target_pos = self.model.site_pos[self.target_sid].copy()
88 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(),
89 | target_pos=target_pos)
90 |
91 | def set_env_state(self, state):
92 | self.sim.reset()
93 | qp = state['qp'].copy()
94 | qv = state['qv'].copy()
95 | target_pos = state['target_pos']
96 | self.set_state(qp, qv)
97 | self.model.site_pos[self.target_sid] = target_pos
98 | self.sim.forward()
99 |
100 | # --------------------------------
101 | # utility functions
102 | # --------------------------------
103 |
104 | def get_env_infos(self):
105 | return dict(state=self.get_env_state())
106 |
107 | def mj_viewer_setup(self):
108 | self.viewer = MjViewer(self.sim)
109 | self.sim.forward()
110 |
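The branching in get_reward above lets the same function serve both per-step rewards and batched model-based planning. A short shape check of the batched path (random data; with older gym versions, .env unwraps the TimeLimit wrapper to reach PointMassEnv):

    import gym
    import numpy as np
    import mjrl.envs                                 # registers mjrl_point_mass-v0

    env = gym.make('mjrl_point_mass-v0').env         # unwrapped PointMassEnv
    num_traj, horizon, obs_dim = 8, 25, 6
    paths = {"observations": np.random.randn(num_traj, horizon, obs_dim),
             "actions":      np.random.randn(num_traj, horizon, 2)}
    paths = env.compute_path_rewards(paths)
    assert paths["rewards"].shape == (num_traj, horizon)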
--------------------------------------------------------------------------------
/mjrl/envs/reacher_sawyer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from mjrl.envs import mujoco_env
4 | from mujoco_py import MjViewer
5 |
6 |
7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle):
8 | def __init__(self):
9 | self.hand_sid = -2
10 | self.target_sid = -1
11 | mujoco_env.MujocoEnv.__init__(self, 'sawyer.xml', 4)
12 | utils.EzPickle.__init__(self)
13 | self.hand_sid = self.model.site_name2id("finger")
14 | self.target_sid = self.model.site_name2id("target")
15 |
16 | def step(self, a):
17 | self.do_simulation(a, self.frame_skip)
18 | obs = self.get_obs()
19 | reward = self.get_reward(obs, a)
20 | return obs, reward, False, self.get_env_infos()
21 |
22 | def get_obs(self):
23 | return np.concatenate([
24 | self.data.qpos.flat,
25 | self.data.qvel.ravel() * self.dt, # delta_x instead of velocity
26 | self.data.site_xpos[self.hand_sid],
27 | self.data.site_xpos[self.target_sid],
28 | ])
29 |
30 | def get_reward(self, obs, act=None):
31 | obs = np.clip(obs, -10.0, 10.0)
32 | if len(obs.shape) == 1:
33 | # vector obs, called when stepping the env
34 | hand_pos = obs[-6:-3]
35 | target_pos = obs[-3:]
36 | l1_dist = np.sum(np.abs(hand_pos - target_pos))
37 | l2_dist = np.linalg.norm(hand_pos - target_pos)
38 | else:
39 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs
40 | hand_pos = obs[:, :, -6:-3]
41 | target_pos = obs[:, :, -3:]
42 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1)
43 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1)
44 | reward = - l1_dist - 5.0 * l2_dist
45 | return reward
46 |
47 | def compute_path_rewards(self, paths):
48 | # path has two keys: observations and actions
49 | # path["observations"] : (num_traj, horizon, obs_dim)
50 | # path["rewards"] should have shape (num_traj, horizon)
51 | obs = paths["observations"]
52 | rewards = self.get_reward(obs)
53 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel()
54 |
55 | # --------------------------------
56 | # resets and randomization
57 | # --------------------------------
58 |
59 | def robot_reset(self):
60 | self.set_state(self.init_qpos, self.init_qvel)
61 |
62 | def target_reset(self):
63 | target_pos = np.array([0.1, 0.1, 0.1])
64 | target_pos[0] = self.np_random.uniform(low=-0.3, high=0.3)
65 | target_pos[1] = self.np_random.uniform(low=-0.2, high=0.2)
66 | target_pos[2] = self.np_random.uniform(low=-0.25, high=0.25)
67 | self.model.site_pos[self.target_sid] = target_pos
68 | self.sim.forward()
69 |
70 | def reset_model(self, seed=None):
71 | if seed is not None:
72 | self.seeding = True
73 | self.seed(seed)
74 | self.robot_reset()
75 | self.target_reset()
76 | return self.get_obs()
77 |
78 | # --------------------------------
79 | # get and set states
80 | # --------------------------------
81 |
82 | def get_env_state(self):
83 | target_pos = self.model.site_pos[self.target_sid].copy()
84 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(),
85 | target_pos=target_pos)
86 |
87 | def set_env_state(self, state):
88 | self.sim.reset()
89 | qp = state['qp'].copy()
90 | qv = state['qv'].copy()
91 | target_pos = state['target_pos']
92 | self.model.site_pos[self.target_sid] = target_pos
93 | self.data.qpos[:] = qp
94 | self.data.qvel[:] = qv
95 | self.sim.forward()
96 |
97 | # --------------------------------
98 | # utility functions
99 | # --------------------------------
100 |
101 | def get_env_infos(self):
102 | return dict(state=self.get_env_state())
103 |
104 | def mj_viewer_setup(self):
105 | self.viewer = MjViewer(self.sim)
106 | self.viewer.cam.trackbodyid = 1
107 | self.viewer.cam.type = 1
108 | self.sim.forward()
109 | self.viewer.cam.distance = self.model.stat.extent * 2.0
110 |
--------------------------------------------------------------------------------
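The reward above is shared between the per-step case (1-D obs from `get_obs`) and the batched case used by `compute_path_rewards` for model-based rollouts. A standalone numpy sketch of the batched branch, with purely illustrative shapes:

import numpy as np

obs = np.zeros((8, 100, 20))  # (num_traj, horizon, obs_dim); the obs_dim value here is illustrative
obs = np.clip(obs, -10.0, 10.0)
hand_pos, target_pos = obs[:, :, -6:-3], obs[:, :, -3:]
l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1)
l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1)
rewards = -l1_dist - 5.0 * l2_dist  # shape (8, 100): one reward per (trajectory, timestep)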
/mjrl/envs/swimmer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from mjrl.envs import mujoco_env
4 | from mujoco_py import MjViewer
5 |
6 | class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle):
7 | def __init__(self):
8 | mujoco_env.MujocoEnv.__init__(self, 'swimmer.xml', 5)
9 | utils.EzPickle.__init__(self)
10 |
11 | def step(self, a):
12 | xposbefore = self.data.qpos[0]
13 | self.do_simulation(a, self.frame_skip)
14 | xposafter = self.data.qpos[0]
15 |
16 | delta = (xposafter - xposbefore)
17 | # make agent move in the negative x direction
18 | reward = -10.0 * delta
19 | done = False
20 |
21 | ob = self.get_obs()
22 | return ob, reward, done, self.get_env_infos()
23 |
24 | def get_obs(self):
25 | return np.concatenate([
26 | self.data.qpos.flat[2:],
27 | self.data.qvel.flat,
28 | ])
29 |
30 | def reset_model(self):
31 | qpos_init = self.init_qpos.copy()
32 | qpos_init[2] = self.np_random.uniform(low=-np.pi, high=np.pi)
33 | self.set_state(qpos_init, self.init_qvel)
34 | self.sim.forward()
35 | return self.get_obs()
36 |
37 | # --------------------------------
38 | # get and set states
39 | # --------------------------------
40 |
41 | def get_env_state(self):
42 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy())
43 |
44 | def set_env_state(self, state):
45 | self.sim.reset()
46 | qp = state['qp'].copy()
47 | qv = state['qv'].copy()
48 | self.set_state(qp, qv)
49 | self.sim.forward()
50 |
51 | # --------------------------------
52 | # utility functions
53 | # --------------------------------
54 |
55 | def get_env_infos(self):
56 | return dict(state=self.get_env_state())
57 |
58 | def mj_viewer_setup(self):
59 | self.viewer = MjViewer(self.sim)
60 | self.viewer.cam.trackbodyid = 1
61 | self.viewer.cam.type = 1
62 | self.sim.forward()
63 | self.viewer.cam.distance = self.model.stat.extent*1.2
--------------------------------------------------------------------------------
/mjrl/policies/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/policies/__init__.py
--------------------------------------------------------------------------------
/mjrl/policies/gaussian_linear.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.autograd import Variable
6 | from mjrl.utils.fc_network import FCNetwork
7 |
8 |
9 | class LinearPolicy:
10 | def __init__(self, env_spec,
11 | min_log_std=-3,
12 | init_log_std=0,
13 | seed=None):
14 | """
15 | :param env_spec: specifications of the env (see utils/gym_env.py)
16 | :param min_log_std: log_std is clamped at this value and can't go below
17 | :param init_log_std: initial log standard deviation
18 | :param seed: random seed
19 | """
20 | self.n = env_spec.observation_dim # number of states
21 | self.m = env_spec.action_dim # number of actions
22 | self.min_log_std = min_log_std
23 |
24 | # Set seed
25 | # ------------------------
26 | if seed is not None:
27 | torch.manual_seed(seed)
28 | np.random.seed(seed)
29 |
30 | # Policy network
31 | # ------------------------
32 | self.model = FCNetwork(self.n, self.m, hidden_sizes=())
33 | # make weights small
34 | for param in list(self.model.parameters())[-2:]: # only last layer
35 | param.data = 1e-2 * param.data
36 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True)
37 | self.trainable_params = list(self.model.parameters()) + [self.log_std]
38 |
39 | # Old Policy network
40 | # ------------------------
41 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes=())
42 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std)
43 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std]
44 | for idx, param in enumerate(self.old_params):
45 | param.data = self.trainable_params[idx].data.clone()
46 |
47 | # Easy access variables
48 | # -------------------------
49 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel())
50 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params]
51 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params]
52 | self.d = np.sum(self.param_sizes) # total number of params
53 |
54 | # Placeholders
55 | # ------------------------
56 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False)
57 |
58 | # Utility functions
59 | # ============================================
60 | def get_param_values(self):
61 | params = np.concatenate([p.contiguous().view(-1).data.numpy()
62 | for p in self.trainable_params])
63 | return params.copy()
64 |
65 | def set_param_values(self, new_params, set_new=True, set_old=True):
66 | if set_new:
67 | current_idx = 0
68 | for idx, param in enumerate(self.trainable_params):
69 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]]
70 | vals = vals.reshape(self.param_shapes[idx])
71 | param.data = torch.from_numpy(vals).float()
72 | current_idx += self.param_sizes[idx]
73 | # clip std at minimum value
74 | self.trainable_params[-1].data = \
75 | torch.clamp(self.trainable_params[-1], self.min_log_std).data
76 | # update log_std_val for sampling
77 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel())
78 | if set_old:
79 | current_idx = 0
80 | for idx, param in enumerate(self.old_params):
81 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]]
82 | vals = vals.reshape(self.param_shapes[idx])
83 | param.data = torch.from_numpy(vals).float()
84 | current_idx += self.param_sizes[idx]
85 | # clip std at minimum value
86 | self.old_params[-1].data = \
87 | torch.clamp(self.old_params[-1], self.min_log_std).data
88 |
89 | # Main functions
90 | # ============================================
91 | def get_action(self, observation):
92 | o = np.float32(observation.reshape(1, -1))
93 | self.obs_var.data = torch.from_numpy(o)
94 | mean = self.model(self.obs_var).data.numpy().ravel()
95 | noise = np.exp(self.log_std_val) * np.random.randn(self.m)
96 | action = mean + noise
97 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}]
98 |
99 | def mean_LL(self, observations, actions, model=None, log_std=None):
100 | model = self.model if model is None else model
101 | log_std = self.log_std if log_std is None else log_std
102 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False)
103 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False)
104 | mean = model(obs_var)
105 | zs = (act_var - mean) / torch.exp(log_std)
106 | LL = - 0.5 * torch.sum(zs ** 2, dim=1) + \
107 | - torch.sum(log_std) + \
108 | - 0.5 * self.m * np.log(2 * np.pi)
109 | return mean, LL
110 |
111 | def log_likelihood(self, observations, actions, model=None, log_std=None):
112 | mean, LL = self.mean_LL(observations, actions, model, log_std)
113 | return LL.data.numpy()
114 |
115 | def old_dist_info(self, observations, actions):
116 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std)
117 | return [LL, mean, self.old_log_std]
118 |
119 | def new_dist_info(self, observations, actions):
120 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std)
121 | return [LL, mean, self.log_std]
122 |
123 | def likelihood_ratio(self, new_dist_info, old_dist_info):
124 | LL_old = old_dist_info[0]
125 | LL_new = new_dist_info[0]
126 | LR = torch.exp(LL_new - LL_old)
127 | return LR
128 |
129 | def mean_kl(self, new_dist_info, old_dist_info):
130 | old_log_std = old_dist_info[2]
131 | new_log_std = new_dist_info[2]
132 | old_std = torch.exp(old_log_std)
133 | new_std = torch.exp(new_log_std)
134 | old_mean = old_dist_info[1]
135 | new_mean = new_dist_info[1]
136 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2
137 | Dr = 2 * new_std ** 2 + 1e-8
138 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1)
139 | return torch.mean(sample_kl)
140 |
--------------------------------------------------------------------------------
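For reference, `mean_LL` above is the diagonal-Gaussian log-density of the actions, and `mean_kl` is the closed-form KL(old || new) between diagonal Gaussians, averaged over the batch. A small numpy check of the per-dimension term (hypothetical numbers):

import numpy as np

mu_old, log_s_old = 0.3, -0.5  # old policy N(mu_old, s_old^2), illustrative values
mu_new, log_s_new = 0.1, -0.7  # new policy N(mu_new, s_new^2)
s_old, s_new = np.exp(log_s_old), np.exp(log_s_new)

# Textbook KL(old || new) for univariate Gaussians
kl_closed = np.log(s_new / s_old) + (s_old**2 + (mu_old - mu_new)**2) / (2 * s_new**2) - 0.5

# The same quantity arranged as in mean_kl (up to the 1e-8 stabilizer in Dr)
Nr = (mu_old - mu_new)**2 + s_old**2 - s_new**2
Dr = 2 * s_new**2
kl_code = Nr / Dr + log_s_new - log_s_old

assert np.isclose(kl_closed, kl_code)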
/mjrl/policies/gaussian_mlp.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from mjrl.utils.fc_network import FCNetwork
3 | import torch
4 | from torch.autograd import Variable
5 |
6 |
7 | class MLP:
8 | def __init__(self, env_spec,
9 | hidden_sizes=(64,64),
10 | min_log_std=-3,
11 | init_log_std=0,
12 | seed=None):
13 | """
14 | :param env_spec: specifications of the env (see utils/gym_env.py)
15 | :param hidden_sizes: network hidden layer sizes (currently 2 layers only)
16 | :param min_log_std: log_std is clamped at this value and can't go below
17 | :param init_log_std: initial log standard deviation
18 | :param seed: random seed
19 | """
20 | self.n = env_spec.observation_dim # number of states
21 | self.m = env_spec.action_dim # number of actions
22 | self.min_log_std = min_log_std
23 |
24 | # Set seed
25 | # ------------------------
26 | if seed is not None:
27 | torch.manual_seed(seed)
28 | np.random.seed(seed)
29 |
30 | # Policy network
31 | # ------------------------
32 | self.model = FCNetwork(self.n, self.m, hidden_sizes)
33 | # make weights small
34 | for param in list(self.model.parameters())[-2:]: # only last layer
35 | param.data = 1e-2 * param.data
36 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True)
37 | self.trainable_params = list(self.model.parameters()) + [self.log_std]
38 |
39 | # Old Policy network
40 | # ------------------------
41 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes)
42 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std)
43 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std]
44 | for idx, param in enumerate(self.old_params):
45 | param.data = self.trainable_params[idx].data.clone()
46 |
47 | # Easy access variables
48 | # -------------------------
49 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel())
50 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params]
51 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params]
52 | self.d = np.sum(self.param_sizes) # total number of params
53 |
54 | # Placeholders
55 | # ------------------------
56 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False)
57 |
58 | # Utility functions
59 | # ============================================
60 | def get_param_values(self):
61 | params = np.concatenate([p.contiguous().view(-1).data.numpy()
62 | for p in self.trainable_params])
63 | return params.copy()
64 |
65 | def set_param_values(self, new_params, set_new=True, set_old=True):
66 | if set_new:
67 | current_idx = 0
68 | for idx, param in enumerate(self.trainable_params):
69 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]]
70 | vals = vals.reshape(self.param_shapes[idx])
71 | param.data = torch.from_numpy(vals).float()
72 | current_idx += self.param_sizes[idx]
73 | # clip std at minimum value
74 | self.trainable_params[-1].data = \
75 | torch.clamp(self.trainable_params[-1], self.min_log_std).data
76 | # update log_std_val for sampling
77 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel())
78 | if set_old:
79 | current_idx = 0
80 | for idx, param in enumerate(self.old_params):
81 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]]
82 | vals = vals.reshape(self.param_shapes[idx])
83 | param.data = torch.from_numpy(vals).float()
84 | current_idx += self.param_sizes[idx]
85 | # clip std at minimum value
86 | self.old_params[-1].data = \
87 | torch.clamp(self.old_params[-1], self.min_log_std).data
88 |
89 | # Main functions
90 | # ============================================
91 | def get_action(self, observation):
92 | o = np.float32(observation.reshape(1, -1))
93 | self.obs_var.data = torch.from_numpy(o)
94 | mean = self.model(self.obs_var).data.numpy().ravel()
95 | noise = np.exp(self.log_std_val) * np.random.randn(self.m)
96 | action = mean + noise
97 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}]
98 |
99 | def mean_LL(self, observations, actions, model=None, log_std=None):
100 | model = self.model if model is None else model
101 | log_std = self.log_std if log_std is None else log_std
102 | if type(observations) is not torch.Tensor:
103 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False)
104 | else:
105 | obs_var = observations
106 | if type(actions) is not torch.Tensor:
107 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False)
108 | else:
109 | act_var = actions
110 | mean = model(obs_var)
111 | zs = (act_var - mean) / torch.exp(log_std)
112 | LL = - 0.5 * torch.sum(zs ** 2, dim=1) + \
113 | - torch.sum(log_std) + \
114 | - 0.5 * self.m * np.log(2 * np.pi)
115 | return mean, LL
116 |
117 | def log_likelihood(self, observations, actions, model=None, log_std=None):
118 | mean, LL = self.mean_LL(observations, actions, model, log_std)
119 | return LL.data.numpy()
120 |
121 | def old_dist_info(self, observations, actions):
122 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std)
123 | return [LL, mean, self.old_log_std]
124 |
125 | def new_dist_info(self, observations, actions):
126 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std)
127 | return [LL, mean, self.log_std]
128 |
129 | def likelihood_ratio(self, new_dist_info, old_dist_info):
130 | LL_old = old_dist_info[0]
131 | LL_new = new_dist_info[0]
132 | LR = torch.exp(LL_new - LL_old)
133 | return LR
134 |
135 | def mean_kl(self, new_dist_info, old_dist_info):
136 | old_log_std = old_dist_info[2]
137 | new_log_std = new_dist_info[2]
138 | old_std = torch.exp(old_log_std)
139 | new_std = torch.exp(new_log_std)
140 | old_mean = old_dist_info[1]
141 | new_mean = new_dist_info[1]
142 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2
143 | Dr = 2 * new_std ** 2 + 1e-8
144 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1)
145 | return torch.mean(sample_kl)
146 |
--------------------------------------------------------------------------------
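A minimal sketch that exercises the MLP policy without a simulator, using the EnvSpec container from `mjrl/utils/gym_env.py`; the dimensions and the manual density check are illustrative, not part of the library:

import numpy as np
import torch
from mjrl.utils.gym_env import EnvSpec
from mjrl.policies.gaussian_mlp import MLP

spec = EnvSpec(obs_dim=4, act_dim=2, horizon=100)  # toy spec, no MuJoCo needed
pi = MLP(spec, hidden_sizes=(16, 16), seed=0, init_log_std=-0.5)

obs = np.random.randn(10, 4).astype(np.float32)
acts = np.random.randn(10, 2).astype(np.float32)
ll = pi.log_likelihood(obs, acts)  # shape (10,)

# Same diagonal-Gaussian log-density computed by hand from the network mean and log_std
mean = pi.model(torch.from_numpy(obs)).data.numpy()
std = np.exp(pi.log_std_val)
ll_manual = (-0.5 * np.sum(((acts - mean) / std) ** 2, axis=1)
             - np.sum(np.log(std)) - 0.5 * 2 * np.log(2 * np.pi))
assert np.allclose(ll, ll_manual, atol=1e-4)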
/mjrl/policies/mpc_actor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from trajopt.utils import gather_paths_parallel
3 |
4 |
5 | class MPCActor(object):
6 | def __init__(self, env, H, paths_per_cpu,
7 | num_cpu=1,
8 | kappa=1.0,
9 | gamma=1.0,
10 | mean=None,
11 | filter_coefs=None,
12 | seed=123,
13 | ):
14 |
15 | self.env, self.seed = env, seed
16 | self.n, self.m = env.observation_dim, env.action_dim
17 | self.H, self.paths_per_cpu, self.num_cpu = H, paths_per_cpu, num_cpu
18 |
19 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma
20 | if mean is None:
21 | self.mean = np.zeros(self.m)
22 | if filter_coefs is None:
23 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0]
24 |
25 | self.env.reset()
26 | self.env.set_seed(seed)
27 | self.env.reset(seed=seed)
28 | self.act_sequence = np.ones((self.H, self.m)) * self.mean
29 | self.ctr = 1
30 |
31 | def score_trajectory(self, paths):
32 | scores = np.zeros(len(paths))
33 | for i in range(len(paths)):
34 | scores[i] = 0.0
35 | for t in range(paths[i]["rewards"].shape[0]):
36 | scores[i] += (self.gamma**t)*paths[i]["rewards"][t]
37 | return scores
38 |
39 | def get_action(self, env_state):
40 | # Set to env_state
41 | # Shoot trajectories
42 | # Return optimal action
43 | seed = self.seed + self.ctr * 1000
44 | paths = gather_paths_parallel(self.env.env_id,
45 | env_state,
46 | self.act_sequence,
47 | self.filter_coefs,
48 | seed,
49 | self.paths_per_cpu,
50 | self.num_cpu,
51 | )
52 |
53 | num_traj = len(paths)
54 | R = self.score_trajectory(paths)
55 | S = np.exp(self.kappa*(R-np.max(R)))
56 | act = np.sum([paths[i]["actions"][0] * S[i] for i in range(num_traj)], axis=0)
57 | act = act / (np.sum(S) + 1e-6)
58 | return act
--------------------------------------------------------------------------------
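The action returned above is a softmax-weighted average over the sampled trajectories: with returns R_i, the weights are S_i = exp(kappa * (R_i - max_j R_j)) and the executed action is the S-weighted mean of each trajectory's first action. A standalone numpy sketch of that weighting step with made-up numbers (no trajopt dependency):

import numpy as np

kappa = 1.0
R = np.array([1.0, 2.5, 2.0])  # returns of three sampled trajectories
first_actions = np.array([[0.1], [0.4], [0.3]])  # each trajectory's first action (action dim m=1)

S = np.exp(kappa * (R - np.max(R)))  # stabilized exponential weights
act = np.sum(first_actions * S[:, None], axis=0) / (np.sum(S) + 1e-6)
print(act)  # pulled toward the first action of the highest-return trajectory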
/mjrl/samplers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/samplers/__init__.py
--------------------------------------------------------------------------------
/mjrl/samplers/core.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import numpy as np
3 | from mjrl.utils.gym_env import GymEnv
4 | from mjrl.utils import tensor_utils
5 | logging.disable(logging.CRITICAL)
6 | import multiprocessing as mp
7 | import time as timer
8 | logging.disable(logging.CRITICAL)
9 |
10 |
11 | # Single core rollout to sample trajectories
12 | # =======================================================
13 | def do_rollout(
14 | num_traj,
15 | env,
16 | policy,
17 | eval_mode = False,
18 | horizon = 1e6,
19 | base_seed = None,
20 | env_kwargs=None,
21 | ):
22 | """
23 | :param num_traj: number of trajectories (int)
24 | :param env: environment (env class, str with env_name, or factory function)
25 | :param policy: policy to use for action selection
26 | :param eval_mode: use evaluation mode for action computation (bool)
27 | :param horizon: max horizon length for rollout (<= env.horizon)
28 | :param base_seed: base seed for rollouts (int)
29 | :param env_kwargs: dictionary with parameters, will be passed to env generator
30 | :return:
31 | """
32 |
33 | # get the correct env behavior
34 | if type(env) == str:
35 | env = GymEnv(env)
36 | elif isinstance(env, GymEnv):
37 | env = env
38 | elif callable(env):
39 | env = env(**env_kwargs) if env_kwargs is not None else env()
40 | else:
41 | print("Unsupported environment format")
42 | raise AttributeError
43 |
44 | if base_seed is not None:
45 | env.set_seed(base_seed)
46 | np.random.seed(base_seed)
47 | else:
48 | np.random.seed()
49 | horizon = min(horizon, env.horizon)
50 | paths = []
51 |
52 | for ep in range(num_traj):
53 | # seeding
54 | if base_seed is not None:
55 | seed = base_seed + ep
56 | env.set_seed(seed)
57 | np.random.seed(seed)
58 |
59 | observations=[]
60 | actions=[]
61 | rewards=[]
62 | agent_infos = []
63 | env_infos = []
64 |
65 | o = env.reset()
66 | done = False
67 | t = 0
68 |
69 | while t < horizon and done != True:
70 | a, agent_info = policy.get_action(o)
71 | if eval_mode:
72 | a = agent_info['evaluation']
73 | env_info_base = env.get_env_infos()
74 | next_o, r, done, env_info_step = env.step(a)
75 | # below is important to ensure correct env_infos for the timestep
76 | env_info = env_info_step if env_info_base == {} else env_info_base
77 | observations.append(o)
78 | actions.append(a)
79 | rewards.append(r)
80 | agent_infos.append(agent_info)
81 | env_infos.append(env_info)
82 | o = next_o
83 | t += 1
84 |
85 | path = dict(
86 | observations=np.array(observations),
87 | actions=np.array(actions),
88 | rewards=np.array(rewards),
89 | agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
90 | env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
91 | terminated=done
92 | )
93 | paths.append(path)
94 |
95 | del(env)
96 | return paths
97 |
98 |
99 | def sample_paths(
100 | num_traj,
101 | env,
102 | policy,
103 | eval_mode = False,
104 | horizon = 1e6,
105 | base_seed = None,
106 | num_cpu = 1,
107 | max_process_time=300,
108 | max_timeouts=4,
109 | suppress_print=False,
110 | env_kwargs=None,
111 | ):
112 |
113 | num_cpu = 1 if num_cpu is None else num_cpu
114 | num_cpu = mp.cpu_count() if num_cpu == 'max' else num_cpu
115 | assert type(num_cpu) == int
116 |
117 | if num_cpu == 1:
118 | input_dict = dict(num_traj=num_traj, env=env, policy=policy,
119 | eval_mode=eval_mode, horizon=horizon, base_seed=base_seed,
120 | env_kwargs=env_kwargs)
121 | # dont invoke multiprocessing if not necessary
122 | return do_rollout(**input_dict)
123 |
124 | # do multiprocessing otherwise
125 | paths_per_cpu = int(np.ceil(num_traj/num_cpu))
126 | input_dict_list= []
127 | for i in range(num_cpu):
128 | input_dict = dict(num_traj=paths_per_cpu, env=env, policy=policy,
129 | eval_mode=eval_mode, horizon=horizon,
130 | base_seed=base_seed + i * paths_per_cpu if base_seed is not None else None,
131 | env_kwargs=env_kwargs)
132 | input_dict_list.append(input_dict)
133 | if suppress_print is False:
134 | start_time = timer.time()
135 | print("####### Gathering Samples #######")
136 |
137 | results = _try_multiprocess(do_rollout, input_dict_list,
138 | num_cpu, max_process_time, max_timeouts)
139 | paths = []
140 | # result is a paths type and results is list of paths
141 | for result in results:
142 | for path in result:
143 | paths.append(path)
144 |
145 | if suppress_print is False:
146 | print("======= Samples Gathered ======= | >>>> Time taken = %f " %(timer.time()-start_time) )
147 |
148 | return paths
149 |
150 |
151 | def sample_data_batch(
152 | num_samples,
153 | env,
154 | policy,
155 | eval_mode = False,
156 | horizon = 1e6,
157 | base_seed = None,
158 | num_cpu = 1,
159 | paths_per_call = 1,
160 | env_kwargs=None,
161 | ):
162 |
163 | num_cpu = 1 if num_cpu is None else num_cpu
164 | num_cpu = mp.cpu_count() if num_cpu == 'max' else num_cpu
165 | assert type(num_cpu) == int
166 |
167 | start_time = timer.time()
168 | print("####### Gathering Samples #######")
169 | sampled_so_far = 0
170 | paths_so_far = 0
171 | paths = []
172 | base_seed = 123 if base_seed is None else base_seed
173 | while sampled_so_far < num_samples:
174 | base_seed = base_seed + 12345
175 | new_paths = sample_paths(paths_per_call * num_cpu, env, policy,
176 | eval_mode, horizon, base_seed, num_cpu,
177 | suppress_print=True, env_kwargs=env_kwargs)
178 | for path in new_paths:
179 | paths.append(path)
180 | paths_so_far += len(new_paths)
181 | new_samples = np.sum([len(p['rewards']) for p in new_paths])
182 | sampled_so_far += new_samples
183 | print("======= Samples Gathered ======= | >>>> Time taken = %f " % (timer.time() - start_time))
184 | print("................................. | >>>> # samples = %i # trajectories = %i " % (
185 | sampled_so_far, paths_so_far))
186 | return paths
187 |
188 |
189 | def _try_multiprocess(func, input_dict_list, num_cpu, max_process_time, max_timeouts):
190 |
191 | # Base case
192 | if max_timeouts == 0:
193 | return None
194 |
195 | pool = mp.Pool(processes=num_cpu, maxtasksperchild=None)
196 | parallel_runs = [pool.apply_async(func, kwds=input_dict) for input_dict in input_dict_list]
197 | try:
198 | results = [p.get(timeout=max_process_time) for p in parallel_runs]
199 | except Exception as e:
200 | print(str(e))
201 | print("Timeout Error raised... Trying again")
202 | pool.close()
203 | pool.terminate()
204 | pool.join()
205 | return _try_multiprocess(func, input_dict_list, num_cpu, max_process_time, max_timeouts-1)
206 |
207 | pool.close()
208 | pool.terminate()
209 | pool.join()
210 | return results
211 |
--------------------------------------------------------------------------------
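A minimal sketch of calling the sampler directly; it assumes a working mujoco-py install and the `mjrl_swimmer-v0` registration from `mjrl.envs`, and the hyperparameters are illustrative:

import mjrl.envs  # registers the mjrl_* gym environments
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.samplers.core import sample_paths

e = GymEnv('mjrl_swimmer-v0')
pi = MLP(e.spec, hidden_sizes=(32, 32), seed=0)

# Single-process rollout of 5 trajectories; each path is a dict with
# observations, actions, rewards, agent_infos, env_infos, and terminated.
paths = sample_paths(num_traj=5, env='mjrl_swimmer-v0', policy=pi, base_seed=0, num_cpu=1)
print(len(paths), paths[0]['rewards'].shape)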
/mjrl/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/utils/__init__.py
--------------------------------------------------------------------------------
/mjrl/utils/cg_solve.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def cg_solve(f_Ax, b, x_0=None, cg_iters=10, residual_tol=1e-10):
4 | x = np.zeros_like(b) #if x_0 is None else x_0
5 | r = b.copy() #if x_0 is None else b-f_Ax(x_0)
6 | p = r.copy()
7 | rdotr = r.dot(r)
8 |
9 | for i in range(cg_iters):
10 | z = f_Ax(p)
11 | v = rdotr / p.dot(z)
12 | x += v * p
13 | r -= v * z
14 | newrdotr = r.dot(r)
15 | mu = newrdotr / rdotr
16 | p = r + mu * p
17 |
18 | rdotr = newrdotr
19 | if rdotr < residual_tol:
20 | break
21 |
22 | return x
23 |
--------------------------------------------------------------------------------
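`cg_solve` only needs a matrix-vector product `f_Ax`, so callers can pass e.g. a Fisher- or Hessian-vector product without ever forming the matrix. A small self-contained check against a dense solve (the test system is made up):

import numpy as np
from mjrl.utils.cg_solve import cg_solve

rng = np.random.RandomState(0)
M = rng.randn(5, 5)
A = M @ M.T + 5.0 * np.eye(5)  # random symmetric positive-definite matrix
b = rng.randn(5)

x_cg = cg_solve(lambda v: A @ v, b, cg_iters=25)
x_ref = np.linalg.solve(A, b)
print(np.max(np.abs(x_cg - x_ref)))  # should be close to numerical zero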
/mjrl/utils/fc_network.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class FCNetwork(nn.Module):
7 | def __init__(self, obs_dim, act_dim,
8 | hidden_sizes=(64,64),
9 | nonlinearity='tanh', # either 'tanh' or 'relu'
10 | in_shift = None,
11 | in_scale = None,
12 | out_shift = None,
13 | out_scale = None):
14 | super(FCNetwork, self).__init__()
15 |
16 | self.obs_dim = obs_dim
17 | self.act_dim = act_dim
18 | assert type(hidden_sizes) == tuple
19 | self.layer_sizes = (obs_dim, ) + hidden_sizes + (act_dim, )
20 | self.set_transformations(in_shift, in_scale, out_shift, out_scale)
21 |
22 | # hidden layers
23 | self.fc_layers = nn.ModuleList([nn.Linear(self.layer_sizes[i], self.layer_sizes[i+1]) \
24 | for i in range(len(self.layer_sizes) -1)])
25 | self.nonlinearity = torch.relu if nonlinearity == 'relu' else torch.tanh
26 |
27 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None):
28 | # store native scales that can be used for resets
29 | self.transformations = dict(in_shift=in_shift,
30 | in_scale=in_scale,
31 | out_shift=out_shift,
32 | out_scale=out_scale
33 | )
34 | self.in_shift = torch.from_numpy(np.float32(in_shift)) if in_shift is not None else torch.zeros(self.obs_dim)
35 | self.in_scale = torch.from_numpy(np.float32(in_scale)) if in_scale is not None else torch.ones(self.obs_dim)
36 | self.out_shift = torch.from_numpy(np.float32(out_shift)) if out_shift is not None else torch.zeros(self.act_dim)
37 | self.out_scale = torch.from_numpy(np.float32(out_scale)) if out_scale is not None else torch.ones(self.act_dim)
38 |
39 | def forward(self, x):
40 | # TODO(Aravind): Remove clamping to CPU
41 | # This is a temp change that should be fixed shortly
42 | if x.is_cuda:
43 | out = x.to('cpu')
44 | else:
45 | out = x
46 | out = (out - self.in_shift)/(self.in_scale + 1e-8)
47 | for i in range(len(self.fc_layers)-1):
48 | out = self.fc_layers[i](out)
49 | out = self.nonlinearity(out)
50 | out = self.fc_layers[-1](out)
51 | out = out * self.out_scale + self.out_shift
52 | return out
53 |
--------------------------------------------------------------------------------
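A short sketch of FCNetwork on its own, including the optional input/output shift-and-scale transformations; all values are illustrative:

import numpy as np
import torch
from mjrl.utils.fc_network import FCNetwork

net = FCNetwork(obs_dim=4, act_dim=2, hidden_sizes=(32, 32), nonlinearity='tanh')

# Inputs are shifted/scaled before the hidden layers, outputs after the last layer.
net.set_transformations(in_shift=np.zeros(4), in_scale=np.ones(4),
                        out_shift=np.zeros(2), out_scale=0.1 * np.ones(2))

x = torch.randn(16, 4)
y = net(x)
print(y.shape)  # torch.Size([16, 2])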
/mjrl/utils/get_environment.py:
--------------------------------------------------------------------------------
1 | """
2 | convenience function to generate env
3 | useful if we want some procedural env generation
4 | """
5 |
6 | import gym
7 | from mjrl.utils.gym_env import GymEnv
8 |
9 | def get_environment(env_name=None, **kwargs):
10 | if env_name is None: print("Need to specify environment name")
11 | e = GymEnv(env_name)
12 | # can make procedural modifications here if needed using kwargs
13 | return e
14 |
--------------------------------------------------------------------------------
/mjrl/utils/gym_env.py:
--------------------------------------------------------------------------------
1 | """
2 | Wrapper around a gym env that provides convenience functions
3 | """
4 |
5 | import gym
6 | import numpy as np
7 |
8 |
9 | class EnvSpec(object):
10 | def __init__(self, obs_dim, act_dim, horizon):
11 | self.observation_dim = obs_dim
12 | self.action_dim = act_dim
13 | self.horizon = horizon
14 |
15 |
16 | class GymEnv(object):
17 | def __init__(self, env, env_kwargs=None,
18 | obs_mask=None, act_repeat=1,
19 | *args, **kwargs):
20 |
21 | # get the correct env behavior
22 | if type(env) == str:
23 | env = gym.make(env)
24 | elif isinstance(env, gym.Env):
25 | env = env
26 | elif callable(env):
27 | env = env(**env_kwargs) if env_kwargs is not None else env()
28 | else:
29 | print("Unsupported environment format")
30 | raise AttributeError
31 |
32 | self.env = env
33 | self.env_id = env.spec.id
34 | self.act_repeat = act_repeat
35 |
36 | try:
37 | self._horizon = env.spec.max_episode_steps
38 | except AttributeError:
39 | self._horizon = env.spec._horizon
40 |
41 | assert self._horizon % act_repeat == 0
42 | self._horizon = self._horizon // self.act_repeat
43 |
44 | try:
45 | self._action_dim = self.env.env.action_dim
46 | except AttributeError:
47 | self._action_dim = self.env.action_space.shape[0]
48 |
49 | try:
50 | self._observation_dim = self.env.env.obs_dim
51 | except AttributeError:
52 | self._observation_dim = self.env.observation_space.shape[0]
53 |
54 | # Specs
55 | self.spec = EnvSpec(self._observation_dim, self._action_dim, self._horizon)
56 |
57 | # obs mask
58 | self.obs_mask = np.ones(self._observation_dim) if obs_mask is None else obs_mask
59 |
60 | @property
61 | def action_dim(self):
62 | return self._action_dim
63 |
64 | @property
65 | def observation_dim(self):
66 | return self._observation_dim
67 |
68 | @property
69 | def observation_space(self):
70 | return self.env.observation_space
71 |
72 | @property
73 | def action_space(self):
74 | return self.env.action_space
75 |
76 | @property
77 | def horizon(self):
78 | return self._horizon
79 |
80 | def reset(self, seed=None):
81 | try:
82 | self.env._elapsed_steps = 0
83 | return self.env.env.reset_model(seed=seed)
84 | except:
85 | if seed is not None:
86 | self.set_seed(seed)
87 | return self.env.reset()
88 |
89 | def reset_model(self, seed=None):
90 | # overloading for legacy code
91 | return self.reset(seed)
92 |
93 | def step(self, action):
94 | action = action.clip(self.action_space.low, self.action_space.high)
95 | if self.act_repeat == 1:
96 | obs, cum_reward, done, ifo = self.env.step(action)
97 | else:
98 | cum_reward = 0.0
99 | for i in range(self.act_repeat):
100 | obs, reward, done, ifo = self.env.step(action)
101 | cum_reward += reward
102 | if done: break
103 | return self.obs_mask * obs, cum_reward, done, ifo
104 |
105 | def render(self):
106 | try:
107 | self.env.env.mujoco_render_frames = True
108 | self.env.env.mj_render()
109 | except:
110 | self.env.render()
111 |
112 | def set_seed(self, seed=123):
113 | try:
114 | self.env.seed(seed)
115 | except AttributeError:
116 | self.env._seed(seed)
117 |
118 | def get_obs(self):
119 | try:
120 | return self.obs_mask * self.env.env.get_obs()
121 | except:
122 | return self.obs_mask * self.env.env._get_obs()
123 |
124 | def get_env_infos(self):
125 | try:
126 | return self.env.env.get_env_infos()
127 | except:
128 | return {}
129 |
130 | # ===========================================
131 | # Trajectory optimization related
132 | # Envs should support these functions in case of trajopt
133 |
134 | def get_env_state(self):
135 | try:
136 | return self.env.env.get_env_state()
137 | except:
138 | raise NotImplementedError
139 |
140 | def set_env_state(self, state_dict):
141 | try:
142 | self.env.env.set_env_state(state_dict)
143 | except:
144 | raise NotImplementedError
145 |
146 | def real_env_step(self, bool_val):
147 | try:
148 | self.env.env.real_step = bool_val
149 | except:
150 | raise NotImplementedError
151 |
152 | # ===========================================
153 |
154 | def visualize_policy(self, policy, horizon=1000, num_episodes=1, mode='exploration'):
155 | try:
156 | self.env.env.visualize_policy(policy, horizon, num_episodes, mode)
157 | except:
158 | for ep in range(num_episodes):
159 | o = self.reset()
160 | d = False
161 | t = 0
162 | score = 0.0
163 | while t < horizon and d is False:
164 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation']
165 | o, r, d, _ = self.step(a)
166 | score = score + r
167 | self.render()
168 | t = t+1
169 | print("Episode score = %f" % score)
170 |
171 | def evaluate_policy(self, policy,
172 | num_episodes=5,
173 | horizon=None,
174 | gamma=1,
175 | visual=False,
176 | percentile=[],
177 | get_full_dist=False,
178 | mean_action=False,
179 | init_env_state=None,
180 | terminate_at_done=True,
181 | seed=123):
182 |
183 | self.set_seed(seed)
184 | horizon = self._horizon if horizon is None else horizon
185 | mean_eval, std, min_eval, max_eval = 0.0, 0.0, -1e8, -1e8
186 | ep_returns = np.zeros(num_episodes)
187 |
188 | for ep in range(num_episodes):
189 | self.reset()
190 | if init_env_state is not None:
191 | self.set_env_state(init_env_state)
192 | t, done = 0, False
193 | while t < horizon and (done == False or terminate_at_done == False):
194 | self.render() if visual is True else None
195 | o = self.get_obs()
196 | a = policy.get_action(o)[1]['evaluation'] if mean_action is True else policy.get_action(o)[0]
197 | o, r, done, _ = self.step(a)
198 | ep_returns[ep] += (gamma ** t) * r
199 | t += 1
200 |
201 | mean_eval, std = np.mean(ep_returns), np.std(ep_returns)
202 | min_eval, max_eval = np.amin(ep_returns), np.amax(ep_returns)
203 | base_stats = [mean_eval, std, min_eval, max_eval]
204 |
205 | percentile_stats = []
206 | for p in percentile:
207 | percentile_stats.append(np.percentile(ep_returns, p))
208 |
209 | full_dist = ep_returns if get_full_dist is True else None
210 |
211 | return [base_stats, percentile_stats, full_dist]
212 |
--------------------------------------------------------------------------------
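A brief sketch of the wrapper's convenience API (assumes mujoco-py and `mjrl.envs` are importable; the policy settings are illustrative):

import mjrl.envs
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP

e = GymEnv('mjrl_swimmer-v0')
print(e.observation_dim, e.action_dim, e.horizon)

pi = MLP(e.spec, hidden_sizes=(32, 32), seed=0)
base_stats, percentile_stats, full_dist = e.evaluate_policy(pi, num_episodes=2, mean_action=True)
print(base_stats)  # [mean, std, min, max] of the episode returns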
/mjrl/utils/logger.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import scipy
6 | import pickle
7 | import os
8 | import csv
9 |
10 | class DataLog:
11 |
12 | def __init__(self):
13 | self.log = {}
14 | self.max_len = 0
15 |
16 | def log_kv(self, key, value):
17 | # logs the (key, value) pair
18 |
19 | # TODO: This implementation is error-prone:
20 | # it would be NOT aligned if some keys are missing during one iteration.
21 | if key not in self.log:
22 | self.log[key] = []
23 | self.log[key].append(value)
24 | if len(self.log[key]) > self.max_len:
25 | self.max_len = self.max_len + 1
26 |
27 | def save_log(self, save_path):
28 | # TODO: Validate all lengths are the same.
29 | pickle.dump(self.log, open(save_path + '/log.pickle', 'wb'))
30 | with open(save_path + '/log.csv', 'w') as csv_file:
31 | fieldnames = list(self.log.keys())
32 | if 'iteration' not in fieldnames:
33 | fieldnames = ['iteration'] + fieldnames
34 |
35 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
36 | writer.writeheader()
37 | for row in range(self.max_len):
38 | row_dict = {'iteration': row}
39 | for key in self.log.keys():
40 | if row < len(self.log[key]):
41 | row_dict[key] = self.log[key][row]
42 | writer.writerow(row_dict)
43 |
44 | def get_current_log(self):
45 | row_dict = {}
46 | for key in self.log.keys():
47 | # TODO: this is very error-prone (alignment is not guaranteed)
48 | row_dict[key] = self.log[key][-1]
49 | return row_dict
50 |
51 | def shrink_to(self, num_entries):
52 | for key in self.log.keys():
53 | self.log[key] = self.log[key][:num_entries]
54 |
55 | self.max_len = num_entries
56 | assert min([len(series) for series in self.log.values()]) == \
57 | max([len(series) for series in self.log.values()])
58 |
59 | def read_log(self, log_path):
60 | assert log_path.endswith('log.csv')
61 |
62 | with open(log_path) as csv_file:
63 | reader = csv.DictReader(csv_file)
64 | listr = list(reader)
65 | keys = reader.fieldnames
66 | data = {}
67 | for key in keys:
68 | data[key] = []
69 | for row, row_dict in enumerate(listr):
70 | for key in keys:
71 | try:
72 | data[key].append(eval(row_dict[key]))
73 | except:
74 | print("ERROR on reading key {}: {}".format(key, row_dict[key]))
75 |
76 | if 'iteration' in data and data['iteration'][-1] != row:
77 | raise RuntimeError("Iteration %d mismatch -- possibly corrupted logfile?" % row)
78 |
79 | self.log = data
80 | self.max_len = max(len(v) for k, v in self.log.items())
81 | print("Log read from {}: had {} entries".format(log_path, self.max_len))
82 |
--------------------------------------------------------------------------------
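A short usage sketch for DataLog; the keys and the save location are illustrative:

from mjrl.utils.logger import DataLog

logger = DataLog()
for it in range(3):
    logger.log_kv('iteration', it)
    logger.log_kv('stoc_pol_mean', 100.0 + it)  # any scalar statistic
logger.save_log('.')                            # writes ./log.pickle and ./log.csv
print(logger.get_current_log())                 # most recent value recorded for each key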
/mjrl/utils/make_train_plots.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import scipy
6 | import csv
7 | from mjrl.utils.logger import DataLog
8 | import argparse
9 |
10 | def make_train_plots(log = None,
11 | log_path = None,
12 | keys = None,
13 | save_loc = None,
14 | sample_key = 'num_samples',
15 | x_scale = 1.0,
16 | y_scale = 1.0):
17 | if log is None and log_path is None:
18 | print("Need to provide either the log or path to a log file")
19 | if log is None:
20 | logger = DataLog()
21 | logger.read_log(log_path)
22 | log = logger.log
23 | # make plots for specified keys
24 | for key in keys:
25 | if key in log.keys():
26 | fig = plt.figure(figsize=(10,6))
27 | ax1 = fig.add_subplot(111)
28 | try:
29 | cum_samples = [np.sum(log[sample_key][:i]) * x_scale for i in range(len(log[sample_key]))]
30 | ax1.plot(cum_samples, [elem * y_scale for elem in log[key]])
31 | ax1.set_xlabel('samples')
32 | # mark iteration on the top axis
33 | ax2 = ax1.twiny()
34 | ax2.set_xlabel('iterations', color=(.7,.7,.7))
35 | ax2.tick_params(axis='x', labelcolor=(.7,.7,.7))
36 | ax2.set_xlim([0, len(log[key])])
37 | except:
38 | ax1.plot(log[key])
39 | ax1.set_xlabel('iterations')
40 | ax1.set_title(key)
41 | plt.savefig(save_loc+'/'+key+'.png', dpi=100)
42 | plt.close()
43 |
44 | # MAIN =========================================================
45 | # Example: python make_train_plots.py --log_path logs/log.csv --keys eval_score rollout_score --save_loc logs
46 | def main():
47 | # Parse arguments
48 | parser = argparse.ArgumentParser()
49 | parser.add_argument(
50 | '-l', '--log_path', type=str, required=True, help='path to the log.csv file')
51 | parser.add_argument(
52 | '-k', '--keys', type=str, action='append', nargs='+', required=True, help='keys to plot')
53 | parser.add_argument(
54 | '-s', '--save_loc', type=str, default='', help='Path for logs')
55 | args = parser.parse_args()
56 |
57 | make_train_plots(log_path=args.log_path, keys=args.keys[0], save_loc=args.save_loc)
58 |
59 | if __name__ == '__main__':
60 | main()
61 |
62 |
--------------------------------------------------------------------------------
/mjrl/utils/optimize_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import copy
3 | import torch
4 | import torch.nn as nn
5 |
6 |
7 | def fit_data(model, x, y, optimizer, loss_func, batch_size, epochs):
8 | """
9 | :param model: pytorch model of form y_hat = f(x) (class)
10 | :param x: inputs to the model (tensor)
11 | :param y: desired outputs or targets (tensor)
12 | :param optimizer: optimizer to be used (class)
13 | :param loss_func: loss criterion (callable)
14 | :param batch_size: mini-batch size for optimization (int)
15 | :param epochs: number of epochs (int)
16 | :return:
17 | """
18 |
19 | num_samples = x.shape[0]
20 | epoch_losses = []
21 | for ep in range(epochs):
22 | rand_idx = torch.LongTensor(np.random.permutation(num_samples))
23 | ep_loss = 0.0
24 | num_steps = int(num_samples / batch_size) - 1
25 | for mb in range(num_steps):
26 | data_idx = rand_idx[mb*batch_size:(mb+1)*batch_size]
27 | batch_x = x[data_idx]
28 | batch_y = y[data_idx]
29 | optimizer.zero_grad()
30 | yhat = model(batch_x)
31 | loss = loss_func(yhat, batch_y)
32 | loss.backward()
33 | optimizer.step()
34 | ep_loss += loss.detach()
35 | epoch_losses.append(ep_loss.to('cpu').data.numpy().ravel() / num_steps)
36 | return epoch_losses
37 |
--------------------------------------------------------------------------------
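A self-contained sketch of fit_data on a toy regression problem; everything outside the imported helper is illustrative:

import torch
import torch.nn as nn
from mjrl.utils.optimize_model import fit_data

x = torch.randn(512, 1)
y = 3.0 * x + 0.1 * torch.randn(512, 1)  # toy 1-D regression target

model = nn.Sequential(nn.Linear(1, 16), nn.Tanh(), nn.Linear(16, 1))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
losses = fit_data(model, x, y, optimizer, loss_func=nn.MSELoss(), batch_size=64, epochs=5)
print(losses[-1])  # average mini-batch loss of the final epoch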
/mjrl/utils/plot_from_logs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pickle
4 | import numpy as np
5 | import matplotlib
6 | matplotlib.use('Agg')
7 | import matplotlib.pyplot as plt
8 | colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
9 |
10 | parser = argparse.ArgumentParser(description='Script to explore the data generated by an experiment.')
11 | parser.add_argument('--data', '-d', type=str, required=True, help='location of the .pickle log data file')
12 | parser.add_argument('--output', '-o', type=str, required=True, help='location to store results as a png')
13 | parser.add_argument('--xkey', '-x', type=str, default=None, help='the key to use for x axis in plots')
14 | parser.add_argument('--xscale', '-s', type=int, default=1, help='scaling for the x axis (optional)')
15 | args = parser.parse_args()
16 |
17 | # get inputs and setup output file
18 | if '.png' in args.output:
19 | OUT_FILE = args.output
20 | else:
21 | OUT_FILE = args.output + '/plot.png'
22 | data = pickle.load(open(args.data, 'rb'))
23 | xscale = 1 if args.xscale is None else args.xscale
24 | if args.xkey == 'num_samples':
25 | xscale = xscale if 'act_repeat' not in data.keys() else data['act_repeat'][-1]
26 |
27 | dict_keys = list(data.keys())
28 | for k in dict_keys:
29 | if len(data[k]) == 1: del(data[k])
30 |
31 | # plot layout
32 | nplt = len(data.keys())
33 | ncol = 4
34 | nrow = int(np.ceil(nplt/ncol))
35 |
36 | # plot data
37 | xkey = args.xkey
38 | start_idx = 2
39 | end_idx = max([len(data[k]) for k in data.keys()])
40 | xdata = np.arange(end_idx) if (xkey is None or xkey == 'None') else \
41 | [np.sum(data[xkey][:i+1]) * xscale for i in range(len(data[xkey]))]
42 |
43 | # make the plot
44 | plt.figure(figsize=(15,15), dpi=60)
45 | for idx, key in enumerate(data.keys()):
46 | plt.subplot(nrow, ncol, idx+1)
47 | plt.tight_layout()
48 | try:
49 | last_idx = min(end_idx, len(data[key]))
50 | plt.plot(xdata[start_idx:last_idx], data[key][start_idx:last_idx], color=colors[idx%7], linewidth=3)
51 | except:
52 | pass
53 | plt.title(key)
54 |
55 | plt.savefig(OUT_FILE, dpi=100, bbox_inches="tight")
56 |
--------------------------------------------------------------------------------
/mjrl/utils/process_samples.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def compute_returns(paths, gamma):
4 | for path in paths:
5 | path["returns"] = discount_sum(path["rewards"], gamma)
6 |
7 | def compute_advantages(paths, baseline, gamma, gae_lambda=None, normalize=False):
8 | # compute and store returns, advantages, and baseline
9 | # standard mode
10 | if gae_lambda == None or gae_lambda < 0.0 or gae_lambda > 1.0:
11 | for path in paths:
12 | path["baseline"] = baseline.predict(path)
13 | path["advantages"] = path["returns"] - path["baseline"]
14 | if normalize:
15 | alladv = np.concatenate([path["advantages"] for path in paths])
16 | mean_adv = alladv.mean()
17 | std_adv = alladv.std()
18 | for path in paths:
19 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8)
20 | # GAE mode
21 | else:
22 | for path in paths:
23 | b = path["baseline"] = baseline.predict(path)
24 | if b.ndim == 1:
25 | b1 = np.append(path["baseline"], 0.0 if path["terminated"] else b[-1])
26 | else:
27 | b1 = np.vstack((b, np.zeros(b.shape[1]) if path["terminated"] else b[-1]))
28 | td_deltas = path["rewards"] + gamma*b1[1:] - b1[:-1]
29 | path["advantages"] = discount_sum(td_deltas, gamma*gae_lambda)
30 | if normalize:
31 | alladv = np.concatenate([path["advantages"] for path in paths])
32 | mean_adv = alladv.mean()
33 | std_adv = alladv.std()
34 | for path in paths:
35 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8)
36 |
37 | def discount_sum(x, gamma, terminal=0.0):
38 | y = []
39 | run_sum = terminal
40 | for t in range( len(x)-1, -1, -1):
41 | run_sum = x[t] + gamma*run_sum
42 | y.append(run_sum)
43 |
44 | return np.array(y[::-1])
--------------------------------------------------------------------------------
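For reference, `discount_sum` computes G_t = x_t + gamma * G_{t+1} backwards from the terminal value, and the GAE branch in `compute_advantages` applies the same recursion to the TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) with discount gamma * lambda. A tiny numeric sketch with made-up rewards:

import numpy as np
from mjrl.utils.process_samples import discount_sum

rewards = np.array([1.0, 1.0, 1.0])
returns = discount_sum(rewards, gamma=0.9)
# returns[2] = 1.0
# returns[1] = 1.0 + 0.9 * 1.0 = 1.9
# returns[0] = 1.0 + 0.9 * 1.9 = 2.71
print(returns)  # [2.71 1.9  1.  ]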
/mjrl/utils/tensor_utils.py:
--------------------------------------------------------------------------------
1 | import operator
2 |
3 | import numpy as np
4 |
5 |
6 | def flatten_tensors(tensors):
7 | if len(tensors) > 0:
8 | return np.concatenate([np.reshape(x, [-1]) for x in tensors])
9 | else:
10 | return np.asarray([])
11 |
12 |
13 | def unflatten_tensors(flattened, tensor_shapes):
14 | tensor_sizes = list(map(np.prod, tensor_shapes))
15 | indices = np.cumsum(tensor_sizes)[:-1]
16 | return [np.reshape(pair[0], pair[1]) for pair in zip(np.split(flattened, indices), tensor_shapes)]
17 |
18 |
19 | def pad_tensor(x, max_len, mode='zero'):
20 | padding = np.zeros_like(x[0])
21 | if mode == 'last':
22 | padding = x[-1]
23 | return np.concatenate([
24 | x,
25 | np.tile(padding, (max_len - len(x),) + (1,) * np.ndim(x[0]))
26 | ])
27 |
28 |
29 | def pad_tensor_n(xs, max_len):
30 | ret = np.zeros((len(xs), max_len) + xs[0].shape[1:], dtype=xs[0].dtype)
31 | for idx, x in enumerate(xs):
32 | ret[idx][:len(x)] = x
33 | return ret
34 |
35 |
36 | def pad_tensor_dict(tensor_dict, max_len, mode='zero'):
37 | keys = list(tensor_dict.keys())
38 | ret = dict()
39 | for k in keys:
40 | if isinstance(tensor_dict[k], dict):
41 | ret[k] = pad_tensor_dict(tensor_dict[k], max_len, mode=mode)
42 | else:
43 | ret[k] = pad_tensor(tensor_dict[k], max_len, mode=mode)
44 | return ret
45 |
46 |
47 | def flatten_first_axis_tensor_dict(tensor_dict):
48 | keys = list(tensor_dict.keys())
49 | ret = dict()
50 | for k in keys:
51 | if isinstance(tensor_dict[k], dict):
52 | ret[k] = flatten_first_axis_tensor_dict(tensor_dict[k])
53 | else:
54 | old_shape = tensor_dict[k].shape
55 | ret[k] = tensor_dict[k].reshape((-1,) + old_shape[2:])
56 | return ret
57 |
58 |
59 | def high_res_normalize(probs):
60 | return [x / sum(map(float, probs)) for x in list(map(float, probs))]
61 |
62 |
63 | def stack_tensor_list(tensor_list):
64 | return np.array(tensor_list)
65 | # tensor_shape = np.array(tensor_list[0]).shape
66 | # if tensor_shape is tuple():
67 | # return np.array(tensor_list)
68 | # return np.vstack(tensor_list)
69 |
70 |
71 | def stack_tensor_dict_list(tensor_dict_list):
72 | """
73 | Stack a list of dictionaries of {tensors or dictionary of tensors}.
74 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}.
75 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors}
76 | """
77 | keys = list(tensor_dict_list[0].keys())
78 | ret = dict()
79 | for k in keys:
80 | example = tensor_dict_list[0][k]
81 | if isinstance(example, dict):
82 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list])
83 | else:
84 | v = stack_tensor_list([x[k] for x in tensor_dict_list])
85 | ret[k] = v
86 | return ret
87 |
88 |
89 | def concat_tensor_list_subsample(tensor_list, f):
90 | return np.concatenate(
91 | [t[np.random.choice(len(t), int(np.ceil(len(t) * f)), replace=False)] for t in tensor_list], axis=0)
92 |
93 |
94 | def concat_tensor_dict_list_subsample(tensor_dict_list, f):
95 | keys = list(tensor_dict_list[0].keys())
96 | ret = dict()
97 | for k in keys:
98 | example = tensor_dict_list[0][k]
99 | if isinstance(example, dict):
100 | v = concat_tensor_dict_list_subsample([x[k] for x in tensor_dict_list], f)
101 | else:
102 | v = concat_tensor_list_subsample([x[k] for x in tensor_dict_list], f)
103 | ret[k] = v
104 | return ret
105 |
106 |
107 | def concat_tensor_list(tensor_list):
108 | return np.concatenate(tensor_list, axis=0)
109 |
110 |
111 | def concat_tensor_dict_list(tensor_dict_list):
112 | keys = list(tensor_dict_list[0].keys())
113 | ret = dict()
114 | for k in keys:
115 | example = tensor_dict_list[0][k]
116 | if isinstance(example, dict):
117 | v = concat_tensor_dict_list([x[k] for x in tensor_dict_list])
118 | else:
119 | v = concat_tensor_list([x[k] for x in tensor_dict_list])
120 | ret[k] = v
121 | return ret
122 |
123 |
124 | def split_tensor_dict_list(tensor_dict):
125 | keys = list(tensor_dict.keys())
126 | ret = None
127 | for k in keys:
128 | vals = tensor_dict[k]
129 | if isinstance(vals, dict):
130 | vals = split_tensor_dict_list(vals)
131 | if ret is None:
132 | ret = [{k: v} for v in vals]
133 | else:
134 | for v, cur_dict in zip(vals, ret):
135 | cur_dict[k] = v
136 | return ret
137 |
138 |
139 | def truncate_tensor_list(tensor_list, truncated_len):
140 | return tensor_list[:truncated_len]
141 |
142 |
143 | def truncate_tensor_dict(tensor_dict, truncated_len):
144 | ret = dict()
145 | for k, v in tensor_dict.items():
146 | if isinstance(v, dict):
147 | ret[k] = truncate_tensor_dict(v, truncated_len)
148 | else:
149 | ret[k] = truncate_tensor_list(v, truncated_len)
150 | return ret
151 |
--------------------------------------------------------------------------------
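A small example of the helper the sampler relies on, `stack_tensor_dict_list`, applied to nested per-timestep dicts like the env_infos collected in `do_rollout` (shapes are illustrative):

import numpy as np
from mjrl.utils.tensor_utils import stack_tensor_dict_list

infos = [dict(state=dict(qp=np.zeros(3), qv=np.zeros(3))) for _ in range(5)]
stacked = stack_tensor_dict_list(infos)
print(stacked['state']['qp'].shape)  # (5, 3): stacked along time, nesting preserved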
/mjrl/utils/train_agent.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.disable(logging.CRITICAL)
3 |
4 | from tabulate import tabulate
5 | from mjrl.utils.make_train_plots import make_train_plots
6 | from mjrl.utils.gym_env import GymEnv
7 | from mjrl.samplers.core import sample_paths
8 | import numpy as np
9 | import pickle
10 | import time as timer
11 | import os
12 | import copy
13 |
14 |
15 | def _load_latest_policy_and_logs(agent, *, policy_dir, logs_dir):
16 | """Loads the latest policy.
17 | Returns the next step number to begin with.
18 | """
19 | assert os.path.isdir(policy_dir), str(policy_dir)
20 | assert os.path.isdir(logs_dir), str(logs_dir)
21 |
22 | log_csv_path = os.path.join(logs_dir, 'log.csv')
23 | if not os.path.exists(log_csv_path):
24 | return 0 # fresh start
25 |
26 | print("Reading: {}".format(log_csv_path))
27 | agent.logger.read_log(log_csv_path)
28 | last_step = agent.logger.max_len - 1
29 | if last_step <= 0:
30 | return 0 # fresh start
31 |
32 |
33 | # find latest policy/baseline
34 | i = last_step
35 | while i >= 0:
36 | policy_path = os.path.join(policy_dir, 'policy_{}.pickle'.format(i))
37 | baseline_path = os.path.join(policy_dir, 'baseline_{}.pickle'.format(i))
38 |
39 | if not os.path.isfile(policy_path):
40 | i = i -1
41 | continue
42 | else:
43 | print("Loaded last saved iteration: {}".format(i))
44 |
45 | with open(policy_path, 'rb') as fp:
46 | agent.policy = pickle.load(fp)
47 | with open(baseline_path, 'rb') as fp:
48 | agent.baseline = pickle.load(fp)
49 |
50 | # additional
51 | # global_status_path = os.path.join(policy_dir, 'global_status.pickle')
52 | # with open(global_status_path, 'rb') as fp:
53 | # agent.load_global_status( pickle.load(fp) )
54 |
55 | agent.logger.shrink_to(i + 1)
56 | assert agent.logger.max_len == i + 1
57 | return agent.logger.max_len
58 |
59 | # cannot find any saved policy
60 | raise RuntimeError("Log file exists, but cannot find any saved policy.")
61 |
62 | def train_agent(job_name, agent,
63 | seed = 0,
64 | niter = 101,
65 | gamma = 0.995,
66 | gae_lambda = None,
67 | num_cpu = 1,
68 | sample_mode = 'trajectories',
69 | num_traj = 50,
70 | num_samples = 50000, # has precedence, used with sample_mode = 'samples'
71 | save_freq = 10,
72 | evaluation_rollouts = None,
73 | plot_keys = ['stoc_pol_mean'],
74 | ):
75 |
76 | np.random.seed(seed)
77 | if os.path.isdir(job_name) == False:
78 | os.mkdir(job_name)
79 | previous_dir = os.getcwd()
80 | os.chdir(job_name) # important! we are now in the directory to save data
81 | if os.path.isdir('iterations') == False: os.mkdir('iterations')
82 | if os.path.isdir('logs') == False and agent.save_logs == True: os.mkdir('logs')
83 | best_policy = copy.deepcopy(agent.policy)
84 | best_perf = -1e8
85 | train_curve = best_perf*np.ones(niter)
86 | mean_pol_perf = 0.0
87 | e = GymEnv(agent.env.env_id)
88 |
89 | # Load from any existing checkpoint, policy, statistics, etc.
90 | # Why no checkpointing.. :(
91 | i_start = _load_latest_policy_and_logs(agent,
92 | policy_dir='iterations',
93 | logs_dir='logs')
94 | if i_start:
95 | print("Resuming from an existing job folder ...")
96 |
97 | for i in range(i_start, niter):
98 | print("......................................................................................")
99 | print("ITERATION : %i " % i)
100 |
101 | if train_curve[i-1] > best_perf:
102 | best_policy = copy.deepcopy(agent.policy)
103 | best_perf = train_curve[i-1]
104 |
105 | N = num_traj if sample_mode == 'trajectories' else num_samples
106 | args = dict(N=N, sample_mode=sample_mode, gamma=gamma, gae_lambda=gae_lambda, num_cpu=num_cpu)
107 | stats = agent.train_step(**args)
108 | train_curve[i] = stats[0]
109 |
110 | if evaluation_rollouts is not None and evaluation_rollouts > 0:
111 | print("Performing evaluation rollouts ........")
112 | eval_paths = sample_paths(num_traj=evaluation_rollouts, policy=agent.policy, num_cpu=num_cpu,
113 | env=e.env_id, eval_mode=True, base_seed=seed)
114 | mean_pol_perf = np.mean([np.sum(path['rewards']) for path in eval_paths])
115 | if agent.save_logs:
116 | agent.logger.log_kv('eval_score', mean_pol_perf)
117 | try:
118 | eval_success = e.env.env.evaluate_success(eval_paths)
119 | agent.logger.log_kv('eval_success', eval_success)
120 | except:
121 | pass
122 |
123 | if i % save_freq == 0 and i > 0:
124 | if agent.save_logs:
125 | agent.logger.save_log('logs/')
126 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
127 | policy_file = 'policy_%i.pickle' % i
128 | baseline_file = 'baseline_%i.pickle' % i
129 | pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb'))
130 | pickle.dump(agent.baseline, open('iterations/' + baseline_file, 'wb'))
131 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
132 | # pickle.dump(agent.global_status, open('iterations/global_status.pickle', 'wb'))
133 |
134 | # print results to console
135 | if i == 0:
136 | result_file = open('results.txt', 'w')
137 | print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n")
138 | result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n")
139 | result_file.close()
140 | print("[ %s ] %4i %5.2f %5.2f %5.2f " % (timer.asctime(timer.localtime(timer.time())),
141 | i, train_curve[i], mean_pol_perf, best_perf))
142 | result_file = open('results.txt', 'a')
143 | result_file.write("%4i %5.2f %5.2f %5.2f \n" % (i, train_curve[i], mean_pol_perf, best_perf))
144 | result_file.close()
145 | if agent.save_logs:
146 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1,
147 | agent.logger.get_current_log().items()))
148 | print(tabulate(print_data))
149 |
150 | # final save
151 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb'))
152 | if agent.save_logs:
153 | agent.logger.save_log('logs/')
154 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/')
155 | os.chdir(previous_dir)
156 |
--------------------------------------------------------------------------------
/mjrl/utils/visualize_policy.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import mjrl.envs
3 | import click
4 | import os
5 | import gym
6 | import numpy as np
7 | import pickle
8 | from mjrl.utils.gym_env import GymEnv
9 | from mjrl.policies.gaussian_mlp import MLP
10 | import trajopt.envs
11 |
12 | DESC = '''
13 | Helper script to visualize policy (in mjrl format).\n
14 | USAGE:\n
15 | Visualizes policy on the env\n
16 | $ python utils/visualize_policy.py --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n
17 | '''
18 |
19 | # MAIN =========================================================
20 | @click.command(help=DESC)
21 | @click.option('--env_name', type=str, help='environment to load', required= True)
22 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None)
23 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation')
24 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123)
25 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10)
26 |
27 | def main(env_name, policy, mode, seed, episodes):
28 | e = GymEnv(env_name)
29 | e.set_seed(seed)
30 | if policy is not None:
31 | pi = pickle.load(open(policy, 'rb'))
32 | else:
33 | pi = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=-1.0)
34 | # render policy
35 | e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode)
36 |
37 | if __name__ == '__main__':
38 | main()
39 |
40 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from setuptools import setup, find_packages
4 |
5 | print("Installing mjrl. \n Package intended for use with provided conda env. See setup instructions here: https://github.com/aravindr93/mjrl/tree/master/setup")
6 |
7 | if sys.version_info.major != 3:
8 | print("This Python is only compatible with Python 3, but you are running "
9 | "Python {}. The installation will likely fail.".format(sys.version_info.major))
10 |
11 | def read(fname):
12 | return open(os.path.join(os.path.dirname(__file__), fname)).read()
13 |
14 | setup(
15 | name='mjrl',
16 | version='1.0.0',
17 | packages=find_packages(),
18 | description='RL algorithms for environments in MuJoCo',
19 | long_description=read('README.md'),
20 | url='https://github.com/aravindr93/mjrl.git',
21 | author='Aravind Rajeswaran',
22 | )
23 |
--------------------------------------------------------------------------------
/setup/README.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | A short guide to installing this package is below. The package relies on `mujoco-py`, which is often the trickiest part of the installation. If you get stuck with the mujoco-py installation, see the `known issues` section below as well as the instructions on the mujoco-py [page](https://github.com/openai/mujoco-py).
4 |
5 | The package works with both `MuJoCo v1.5` and `MuJoCo v2.0`, but the former will not be supported in future updates. We encourage you to use v2.0.
6 |
7 | ## Linux
8 |
9 | - Download MuJoCo v2.0 binaries from the official [website](http://www.mujoco.org/) and also obtain the license key.
10 | - Unzip the downloaded archive into `~/.mujoco/mujoco200`, and place your license key (`mjkey.txt`) at `~/.mujoco/mjkey.txt`. Note that unzipping the MuJoCo binaries produces a directory named `mujoco200_linux`; rename it so that it sits at `~/.mujoco/mujoco200`, for example as shown below.
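(A minimal sketch, assuming the archive was downloaded as `mujoco200_linux.zip` into the current directory; adjust the archive name and the key path to match your download.)
```
$ mkdir -p ~/.mujoco
$ unzip mujoco200_linux.zip -d ~/.mujoco            # extracts to ~/.mujoco/mujoco200_linux
$ mv ~/.mujoco/mujoco200_linux ~/.mujoco/mujoco200
$ cp /path/to/mjkey.txt ~/.mujoco/mjkey.txt
```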
11 | - Install osmesa related dependencies:
12 | ```
13 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev build-essential libglfw3
14 | ```
15 | - Update your `~/.bashrc` by adding the following lines, then source it
16 | ```
17 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH"
18 | export MUJOCO_PY_FORCE_CPU=True
19 | alias MJPL='LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so:/usr/lib/nvidia-384/libGL.so'
20 | ```
21 | - Install this package using
22 | ```
23 | $ conda update conda
24 | $ cd path/to/mjrl
25 | $ conda env create -f setup/env.yml
26 | $ source activate mjrl-env
27 | $ pip install -e .
28 | ```
29 | - *NOTE 1:* If you run into issues installing PyTorch, please follow the instructions on the [pytorch website](https://pytorch.org/) to install it for the specific CUDA version (or CPU-only build) you have.
30 |
31 | - *NOTE 2:* If you encounter a patchelf error during the mujoco-py install, you can fix it by running `conda install -c anaconda patchelf` from inside the conda env. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info.
32 |
33 | ## Mac OS
34 |
35 | - Download MuJoCo binaries from the official [website](http://www.mujoco.org/) and also obtain the license key.
36 | - Unzip the downloaded archive into `~/.mujoco/mujoco200` (rename the unzipped directory to this), and place your license key (`mjkey.txt`) at `~/.mujoco/mjkey.txt`.
37 | - Update your `~/.bashrc` by adding the following line, then source it
38 | ```
39 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH"
40 | ```
41 | - Install this package using
42 | ```
43 | $ conda update conda
44 | $ cd path/to/mjrl
45 | $ conda env create -f setup/env.yml
46 | $ source activate mjrl-env
47 | $ pip install -e .
48 | ```
49 |
50 | - *NOTE 1:* If you run into issues installing PyTorch, please follow the instructions on the [pytorch website](https://pytorch.org/) to install it properly.
51 |
52 | - *NOTE 2:* If you encounter a patchelf error during the mujoco-py install, you can fix it by running `conda install -c anaconda patchelf` from inside the conda env. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info.
53 |
54 |
55 | ## Known Issues
56 |
57 | - Visualization in Linux: If the Linux system has a GPU, mujoco-py does not automatically preload the correct drivers. The `MJPL` alias added to `bashrc` (see instructions above) stands for "mujoco pre-load"; when running any python script that requires rendering, prepend the command with `MJPL`.
58 | ```
59 | $ MJPL python script.py
60 | ```
61 |
62 | - Errors related to osmesa during installation: this is a `mujoco-py` build error and will likely go away if the following command is run before creating the conda environment. If the problem still persists, please contact the developers of mujoco-py.
63 | ```
64 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev
65 | ```
66 |
67 | - If conda environment creation gets interrupted for some reason, you can resume it with the following:
68 | ```
69 | $ conda env update -n mjrl-env -f setup/env.yml
70 | ```
71 |
72 | - GCC error in Mac OS: If you get a GCC error from mujoco-py, you can install the GCC version mujoco-py expects with `brew install gcc --without-multilib`. This may require uninstalling other GCC versions that were previously installed (e.g. with `brew remove gcc@6`). You can see which brew packages are already installed with `brew list`. For example, see the sketch below.
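(The exact GCC version to remove depends on what is already installed; `gcc@6` below is just the example from above.)
```
$ brew list                               # check which brew packages are already installed
$ brew remove gcc@6                       # remove a previously installed GCC version, if needed
$ brew install gcc --without-multilib     # install the GCC version mujoco-py expects
```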
73 |
74 |
--------------------------------------------------------------------------------
/setup/env.yml:
--------------------------------------------------------------------------------
1 | name: mjrl-env
2 | channels:
3 | - pytorch
4 | - defaults
5 | dependencies:
6 | - python=3.7
7 | - pip
8 | - ipython
9 | - mkl-service
10 | - pytorch==1.4
11 | - tabulate
12 | - termcolor
13 | - torchvision
14 | - patchelf
15 | - pip:
16 | - click
17 | - cloudpickle
18 | - gym==0.13
19 | - ipdb
20 | - matplotlib
21 | - mujoco-py<2.1,>=2.0
22 | - pip
23 | - pyyaml
24 | - tqdm
25 | - wheel
26 | - scipy
27 | - transforms3d
28 |
--------------------------------------------------------------------------------
/tests/hydra/config/hydra_npg_config.yaml:
--------------------------------------------------------------------------------
1 | # general outputs
2 | job_name : 'hydra_npg_test'
3 |
4 | # general inputs
5 | env : Hopper-v3
6 | algorithm : NPG
7 | seed : 123
8 | sample_mode : samples
9 | rl_num_samples : 1000
10 | rl_num_traj : 0
11 | rl_num_iter : 2
12 | num_cpu : 4
13 | save_freq : 5
14 | eval_rollouts : 0
15 | exp_notes : 'Example config for training policy with NPG on the OpenAI gym Hopper-v3 task.'
16 |
17 | # RL parameters (all params related to PG, value function etc.)
18 | policy_size : (32, 32)
19 | init_log_std : -0.5
20 | vf_hidden_size : (128, 128)
21 | vf_batch_size : 64
22 | vf_epochs : 2
23 | vf_learn_rate : 1e-3
24 | rl_step_size : 0.05
25 | rl_gamma : 0.995
26 | rl_gae : 0.97
27 |
28 | # Algorithm hyperparameters: if the algorithm requires additional params, they can be specified here (otherwise defaults will be used)
29 |
30 | alg_hyper_params : {}
31 |
32 | hydra:
33 | launcher:
34 | cpus_per_task: 12
35 | gpus_per_node: 0
36 | tasks_per_node: 1
37 | run:
38 | dir: ./outputs/${hydra.job.name}/${now:%Y-%m-%d_%H-%M-%S}
39 | sweep:
40 | dir: /checkpoint/${env:USER}/outputs/${job_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
41 | subdir: ${hydra.job.num}_${hydra.job.override_dirname}
--------------------------------------------------------------------------------
/tests/hydra/hydra_policy_opt_job_script.py:
--------------------------------------------------------------------------------
1 | """
2 | This is a job script for running policy gradient algorithms on gym tasks.
3 | Separate job scripts are provided to run a few other algorithms:
4 | - For DAPG see here: https://github.com/aravindr93/hand_dapg/tree/master/dapg/examples
5 | - For model-based NPG see here: https://github.com/aravindr93/mjrl/tree/master/mjrl/algos/model_accel
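
Example usage (a sketch, assuming the provided config/hydra_npg_config.yaml; any config field can be
overridden from the command line with hydra's key=value syntax, or swept with --multirun):
    $ python hydra_policy_opt_job_script.py
    $ python hydra_policy_opt_job_script.py seed=456 rl_num_iter=10
    $ python hydra_policy_opt_job_script.py --multirun seed=100,200,300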
6 | """
7 |
8 | from mjrl.utils.gym_env import GymEnv
9 | from mjrl.policies.gaussian_mlp import MLP
10 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
11 | from mjrl.baselines.mlp_baseline import MLPBaseline
12 | from mjrl.algos.npg_cg import NPG
13 | from mjrl.algos.batch_reinforce import BatchREINFORCE
14 | from mjrl.algos.ppo_clip import PPO
15 | from mjrl.utils.train_agent import train_agent
16 | import os
17 | import json
18 | import gym
19 | import mjrl.envs
20 | # import mj_envs
21 | import time as timer
22 | import pickle
23 | import hydra
24 | from omegaconf import DictConfig, OmegaConf
25 |
26 | # ===============================================================================
27 | # Process Inputs
28 | # ===============================================================================
29 | def preprocess(job_data):
30 | if not os.path.exists(job_data.job_name):
31 | os.mkdir(job_data.job_name)
32 | assert 'algorithm' in job_data.keys()
33 |     assert job_data.algorithm in ['NPG', 'NVPG', 'VPG', 'PPO']
34 | assert 'sample_mode' in job_data.keys()
35 | job_data.alg_hyper_params = dict() if 'alg_hyper_params' not in job_data.keys() else job_data.alg_hyper_params
36 |
37 | EXP_FILE = job_data.job_name + '/job_config.json'
38 | with open(EXP_FILE, 'w') as fp:
39 | # json.dump(job_data, f, indent=4)
40 | OmegaConf.save(config=job_data, f=fp.name)
41 |
42 | if job_data.sample_mode == 'trajectories':
43 | assert 'rl_num_traj' in job_data.keys()
44 | job_data.rl_num_samples = 0 # will be ignored
45 | elif job_data.sample_mode == 'samples':
46 | assert 'rl_num_samples' in job_data.keys()
47 | job_data.rl_num_traj = 0 # will be ignored
48 | else:
49 | print("Unknown sampling mode. Choose either trajectories or samples")
50 | exit()
51 |
52 | # ===============================================================================
53 | # Train Loop
54 | # ===============================================================================
55 | @hydra.main(config_name="hydra_npg_config", config_path="config")
56 | def train_loop(job_data: DictConfig) -> None:
57 | print("========================================")
58 | print("Job Configuration")
59 | print("========================================")
60 | preprocess(job_data)
61 | print(OmegaConf.to_yaml(job_data))
62 |
63 | e = GymEnv(job_data.env)
64 | policy_size = tuple(eval(job_data.policy_size))
65 | vf_hidden_size = tuple(eval(job_data.vf_hidden_size))
66 |
67 | policy = MLP(e.spec, hidden_sizes=policy_size, seed=job_data.seed, init_log_std=job_data.init_log_std)
68 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=job_data.vf_batch_size, hidden_sizes=vf_hidden_size,
69 | epochs=job_data.vf_epochs, learn_rate=job_data.vf_learn_rate)
70 |
71 | # Construct the algorithm
72 | if job_data.algorithm == 'NPG':
73 | # Other hyperparameters (like number of CG steps) can be specified in config for pass through
74 | # or default hyperparameters will be used
75 | agent = NPG(e, policy, baseline, normalized_step_size=job_data.rl_step_size,
76 | seed=job_data.seed, save_logs=True, **job_data.alg_hyper_params)
77 |
78 | elif job_data.algorithm == 'VPG':
79 | agent = BatchREINFORCE(e, policy, baseline, learn_rate=job_data.rl_step_size,
80 | seed=job_data.seed, save_logs=True, **job_data.alg_hyper_params)
81 |
82 | elif job_data.algorithm == 'NVPG':
83 | agent = BatchREINFORCE(e, policy, baseline, desired_kl=job_data.rl_step_size,
84 | seed=job_data.seed, save_logs=True, **job_data.alg_hyper_params)
85 |
86 | elif job_data.algorithm == 'PPO':
87 | # There are many hyperparameters for PPO. They can be specified in config for pass through
88 | # or defaults in the PPO algorithm will be used
89 | agent = PPO(e, policy, baseline, save_logs=True, **job_data.alg_hyper_params)
90 | else:
91 |         raise NotImplementedError("Algorithm not found")
92 |
93 | print("========================================")
94 | print("Starting policy learning")
95 | print("========================================")
96 |
97 | ts = timer.time()
98 | train_agent(job_name=job_data.job_name,
99 | agent=agent,
100 | seed=job_data.seed,
101 | niter=job_data.rl_num_iter,
102 | gamma=job_data.rl_gamma,
103 | gae_lambda=job_data.rl_gae,
104 | num_cpu=job_data.num_cpu,
105 | sample_mode=job_data.sample_mode,
106 | num_traj=job_data.rl_num_traj,
107 | num_samples=job_data.rl_num_samples,
108 | save_freq=job_data.save_freq,
109 | evaluation_rollouts=job_data.eval_rollouts)
110 | print("========================================")
111 | print("Job Finished. Time taken = %f" % (timer.time()-ts))
112 | print("========================================")
113 |
114 | if __name__ == "__main__":
115 | train_loop()
--------------------------------------------------------------------------------
/tests/point_mass_test.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
4 | from mjrl.baselines.mlp_baseline import MLPBaseline
5 | from mjrl.algos.npg_cg import NPG
6 | from mjrl.utils.train_agent import train_agent
7 | import mjrl.envs
8 | import time as timer
9 | SEED = 500
10 |
11 | e = GymEnv('mjrl_point_mass-v0')
12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
13 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=10, learn_rate=1e-3)
14 | agent = NPG(e, policy, baseline, normalized_step_size=0.05, seed=SEED, save_logs=True)
15 |
16 | ts = timer.time()
17 | train_agent(job_name='point_mass_exp1',
18 | agent=agent,
19 | seed=SEED,
20 | niter=50,
21 | gamma=0.95,
22 | gae_lambda=0.97,
23 | num_cpu=1,
24 | sample_mode='trajectories',
25 | num_traj=40, # samples = 40*25 = 1000
26 | save_freq=5,
27 | evaluation_rollouts=None,
28 | plot_keys=['stoc_pol_mean', 'running_score'])
29 | print("time taken = %f" % (timer.time()-ts))
30 |
--------------------------------------------------------------------------------
/tests/visualizer_test.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
4 | from mjrl.baselines.mlp_baseline import MLPBaseline
5 | from mjrl.algos.npg_cg import NPG
6 | from mjrl.utils.train_agent import train_agent
7 | import mjrl.envs
8 | import time as timer
9 | SEED = 500
10 |
11 | e = GymEnv('mjrl_point_mass-v0')
12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
13 | baseline = QuadraticBaseline(e.spec)
14 | agent = NPG(e, policy, baseline, normalized_step_size=0.5, seed=SEED, save_logs=True)
15 |
16 | ts = timer.time()
17 | train_agent(job_name='vis_exp',
18 | agent=agent,
19 | seed=SEED,
20 | niter=10,
21 | gamma=0.95,
22 | gae_lambda=0.97,
23 | num_cpu=1,
24 | sample_mode='trajectories',
25 | num_traj=100,
26 | save_freq=5,
27 | evaluation_rollouts=None)
28 | print("time taken = %f" % (timer.time()-ts))
29 | e.visualize_policy(policy, num_episodes=5, horizon=e.horizon, mode='exploration')
30 |
--------------------------------------------------------------------------------