├── .gitignore ├── LICENSE ├── README.md ├── examples ├── README.md ├── behavior_clone.py ├── example_configs │ ├── hopper_npg.txt │ ├── swimmer_npg.txt │ └── swimmer_ppo.txt ├── linear_nn_comparison.py └── policy_opt_job_script.py ├── mjrl ├── __init__.py ├── algos │ ├── __init__.py │ ├── batch_reinforce.py │ ├── behavior_cloning.py │ ├── dapg.py │ ├── mbac.py │ ├── model_accel │ │ ├── __init__.py │ │ ├── model_accel_npg.py │ │ ├── model_learning_mpc.py │ │ ├── nn_dynamics.py │ │ ├── run_experiments │ │ │ ├── configs │ │ │ │ ├── point_mass.txt │ │ │ │ └── reacher.txt │ │ │ ├── run_model_accel_npg.py │ │ │ ├── sandbox │ │ │ │ ├── example_config_mpc.txt │ │ │ │ └── run_model_learning_mpc.py │ │ │ └── utils │ │ │ │ ├── reward_functions │ │ │ │ ├── __init__.py │ │ │ │ └── mjrl_point_mass.py │ │ │ │ ├── visualize_policy.py │ │ │ │ └── visualize_trajectories.py │ │ └── sampling.py │ ├── npg_cg.py │ ├── ppo_clip.py │ └── trpo.py ├── baselines │ ├── __init__.py │ ├── linear_baseline.py │ ├── mlp_baseline.py │ ├── quadratic_baseline.py │ └── zero_baseline.py ├── envs │ ├── __init__.py │ ├── assets │ │ ├── peg_insertion.xml │ │ ├── point_mass.xml │ │ ├── sawyer.xml │ │ └── swimmer.xml │ ├── mujoco_env.py │ ├── peg_insertion_sawyer.py │ ├── point_mass.py │ ├── reacher_sawyer.py │ └── swimmer.py ├── policies │ ├── __init__.py │ ├── gaussian_linear.py │ ├── gaussian_mlp.py │ └── mpc_actor.py ├── samplers │ ├── __init__.py │ └── core.py └── utils │ ├── __init__.py │ ├── cg_solve.py │ ├── fc_network.py │ ├── get_environment.py │ ├── gym_env.py │ ├── logger.py │ ├── make_train_plots.py │ ├── optimize_model.py │ ├── plot_from_logs.py │ ├── process_samples.py │ ├── tensor_utils.py │ ├── train_agent.py │ └── visualize_policy.py ├── setup.py ├── setup ├── README.md └── env.yml └── tests ├── hydra ├── config │ └── hydra_npg_config.yaml └── hydra_policy_opt_job_script.py ├── point_mass_test.py └── visualizer_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # idea 104 | *.idea/ 105 | 106 | # Mac OSX files 107 | *.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RL for MuJoCo 2 | 3 | This package contains implementations of various RL algorithms for continuous control tasks simulated with [MuJoCo.](http://www.mujoco.org/) 4 | 5 | # Installation 6 | The main package dependencies are `MuJoCo`, `python=3.7`, `gym>=0.13`, `mujoco-py>=2.0`, and `pytorch>=1.0`. See `setup/README.md` ([link](https://github.com/aravindr93/mjrl/tree/master/setup#installation)) for detailed install instructions. 7 | 8 | # Bibliography 9 | If you find the package useful, please cite the following papers. 10 | ``` 11 | @INPROCEEDINGS{Rajeswaran-NIPS-17, 12 | AUTHOR = {Aravind Rajeswaran and Kendall Lowrey and Emanuel Todorov and Sham Kakade}, 13 | TITLE = "{Towards Generalization and Simplicity in Continuous Control}", 14 | BOOKTITLE = {NIPS}, 15 | YEAR = {2017}, 16 | } 17 | 18 | @INPROCEEDINGS{Rajeswaran-RSS-18, 19 | AUTHOR = {Aravind Rajeswaran AND Vikash Kumar AND Abhishek Gupta AND 20 | Giulia Vezzani AND John Schulman AND Emanuel Todorov AND Sergey Levine}, 21 | TITLE = "{Learning Complex Dexterous Manipulation with Deep Reinforcement Learning and Demonstrations}", 22 | BOOKTITLE = {Proceedings of Robotics: Science and Systems (RSS)}, 23 | YEAR = {2018}, 24 | } 25 | ``` 26 | 27 | # Credits 28 | This package is maintained by [Aravind Rajeswaran](http://homes.cs.washington.edu/~aravraj/) and other members of the [Movement Control Lab,](http://homes.cs.washington.edu/~todorov/) University of Washington Seattle. 29 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | Here we provide a job script to illustrate policy optimization with incremental learning methods like NPG and PPO. To run the experiments, use the commands below. The experiments are run through the job script provided, which takes two arguments: 4 | - `output`: path to directory where all the results will be saved 5 | - `config`: a config `.txt` file with all the experiment parameters (examples are provided) 6 | The script has to be run from this directory, i.e. `mjrl/examples`. 7 | 8 | 1. To train an NPG agent on a task shipped with `mjrl` (e.g.
swimmer) 9 | ``` 10 | $ python policy_opt_job_script.py --output swimmer_npg_exp --config example_configs/swimmer_npg.txt 11 | ``` 12 | 13 | 2. To train an NPG agent on an OpenAI gym benchmark task (e.g. hopper) 14 | ``` 15 | $ python policy_opt_job_script.py --output hopper_npg_exp --config example_configs/hopper_npg.txt 16 | ``` 17 | Note that since the Hopper env has termination conditions, we pick the sampling mode in the config to be `samples` rather than trajectories, so that per update we have 10K samples. 18 | 19 | 3. To train a PPO agent on the swimmer task 20 | ``` 21 | $ python policy_opt_job_script.py --output swimmer_ppo_exp --config example_configs/swimmer_ppo.txt 22 | ``` -------------------------------------------------------------------------------- /examples/behavior_clone.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 4 | from mjrl.baselines.mlp_baseline import MLPBaseline 5 | from mjrl.algos.npg_cg import NPG 6 | from mjrl.algos.behavior_cloning import BC 7 | from mjrl.utils.train_agent import train_agent 8 | from mjrl.samplers.core import sample_paths 9 | import mjrl.envs 10 | import time as timer 11 | import pickle 12 | SEED = 500 13 | 14 | # ------------------------------ 15 | # Train expert policy first 16 | e = GymEnv('mjrl_swimmer-v0') 17 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 18 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=5, learn_rate=1e-3) 19 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 20 | 21 | ts = timer.time() 22 | print("========================================") 23 | print("Training expert policy") 24 | print("========================================") 25 | train_agent(job_name='swimmer_exp1', 26 | agent=agent, 27 | seed=SEED, 28 | niter=50, 29 | gamma=0.995, 30 | gae_lambda=0.97, 31 | num_cpu=1, 32 | sample_mode='trajectories', 33 | num_traj=10, 34 | save_freq=5, 35 | evaluation_rollouts=None) 36 | print("========================================") 37 | print("Expert policy training complete !!!") 38 | print("========================================") 39 | print("time taken = %f" % (timer.time()-ts)) 40 | print("========================================") 41 | 42 | # ------------------------------ 43 | # Get demonstrations 44 | print("========================================") 45 | print("Collecting expert demonstrations") 46 | print("========================================") 47 | expert_pol = pickle.load(open('swimmer_exp1/iterations/best_policy.pickle', 'rb')) 48 | demo_paths = sample_paths(num_traj=5, policy=expert_pol, env=e.env_id) 49 | 50 | # ------------------------------ 51 | # Train BC 52 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 53 | bc_agent = BC(demo_paths, policy=policy, epochs=20, batch_size=64, lr=1e-3) # will use Adam by default 54 | ts = timer.time() 55 | print("========================================") 56 | print("Running BC with expert demonstrations") 57 | print("========================================") 58 | bc_agent.train() 59 | print("========================================") 60 | print("BC training complete !!!") 61 | print("time taken = %f" % (timer.time()-ts)) 62 | print("========================================") 63 | 64 | # ------------------------------ 65 | # Evaluate Policies 66 | bc_pol_score = e.evaluate_policy(policy, 
num_episodes=5, mean_action=True) 67 | expert_score = e.evaluate_policy(expert_pol, num_episodes=5, mean_action=True) 68 | print("Expert policy performance (eval mode) = %f" % expert_score[0][0]) 69 | print("BC policy performance (eval mode) = %f" % bc_pol_score[0][0]) 70 | -------------------------------------------------------------------------------- /examples/example_configs/hopper_npg.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'Hopper-v3', 6 | 'algorithm' : 'NPG', 7 | 'seed' : 123, 8 | 'sample_mode' : 'samples', 9 | 'rl_num_samples' : 10000, 10 | 'rl_num_iter' : 100, 11 | 'num_cpu' : 1, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with NPG on the OpenAI gym Hopper-v3 task.', 15 | 16 | # RL parameters (all params related to PG, value function etc.) 17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.05, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(), 31 | 32 | } 33 | 34 | -------------------------------------------------------------------------------- /examples/example_configs/swimmer_npg.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'mjrl_swimmer-v0', 6 | 'algorithm' : 'NPG', 7 | 'seed' : 123, 8 | 'sample_mode' : 'trajectories', 9 | 'rl_num_traj' : 10, 10 | 'rl_num_iter' : 50, 11 | 'num_cpu' : 2, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with NPG on the mjrl swimmer task.', 15 | 16 | # RL parameters (all params related to PG, value function, DAPG etc.) 17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.1, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(), 31 | 32 | } -------------------------------------------------------------------------------- /examples/example_configs/swimmer_ppo.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'mjrl_swimmer-v0', 6 | 'algorithm' : 'PPO', 7 | 'seed' : 123, 8 | 'sample_mode' : 'trajectories', 9 | 'rl_num_traj' : 10, 10 | 'rl_num_iter' : 50, 11 | 'num_cpu' : 2, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with PPO on the mjrl swimmer task.', 15 | 16 | # RL parameters (all params related to PG, value function, DAPG etc.) 
17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.1, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(clip_coef=0.2, epochs=10, mb_size=64, learn_rate=5e-4), 31 | 32 | } -------------------------------------------------------------------------------- /examples/linear_nn_comparison.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.policies.gaussian_linear import LinearPolicy 4 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 5 | from mjrl.baselines.mlp_baseline import MLPBaseline 6 | from mjrl.algos.npg_cg import NPG 7 | from mjrl.utils.train_agent import train_agent 8 | import mjrl.envs 9 | import time as timer 10 | SEED = 500 11 | 12 | # NN policy 13 | # ================================== 14 | e = GymEnv('mjrl_swimmer-v0') 15 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 16 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) 17 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 18 | 19 | ts = timer.time() 20 | train_agent(job_name='swimmer_nn_exp1', 21 | agent=agent, 22 | seed=SEED, 23 | niter=50, 24 | gamma=0.995, 25 | gae_lambda=0.97, 26 | num_cpu=1, 27 | sample_mode='trajectories', 28 | num_traj=10, 29 | save_freq=5, 30 | evaluation_rollouts=5) 31 | print("time taken for NN policy training = %f" % (timer.time()-ts)) 32 | 33 | 34 | # Linear policy 35 | # ================================== 36 | e = GymEnv('mjrl_swimmer-v0') 37 | policy = LinearPolicy(e.spec, seed=SEED) 38 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) 39 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 40 | 41 | ts = timer.time() 42 | train_agent(job_name='swimmer_linear_exp1', 43 | agent=agent, 44 | seed=SEED, 45 | niter=50, 46 | gamma=0.995, 47 | gae_lambda=0.97, 48 | num_cpu=1, 49 | sample_mode='trajectories', 50 | num_traj=10, 51 | save_freq=5, 52 | evaluation_rollouts=5) 53 | print("time taken for linear policy training = %f" % (timer.time()-ts)) 54 | -------------------------------------------------------------------------------- /examples/policy_opt_job_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a job script for running policy gradient algorithms on gym tasks. 
3 | Separate job scripts are provided to run few other algorithms 4 | - For DAPG see here: https://github.com/aravindr93/hand_dapg/tree/master/dapg/examples 5 | - For model-based NPG see here: https://github.com/aravindr93/mjrl/tree/master/mjrl/algos/model_accel 6 | """ 7 | 8 | from mjrl.utils.gym_env import GymEnv 9 | from mjrl.policies.gaussian_mlp import MLP 10 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 11 | from mjrl.baselines.mlp_baseline import MLPBaseline 12 | from mjrl.algos.npg_cg import NPG 13 | from mjrl.algos.batch_reinforce import BatchREINFORCE 14 | from mjrl.algos.ppo_clip import PPO 15 | from mjrl.utils.train_agent import train_agent 16 | import os 17 | import json 18 | import gym 19 | import mjrl.envs 20 | import time as timer 21 | import pickle 22 | import argparse 23 | 24 | # =============================================================================== 25 | # Get command line arguments 26 | # =============================================================================== 27 | 28 | parser = argparse.ArgumentParser(description='Natural policy gradient from mjrl on mujoco environments') 29 | parser.add_argument('--output', type=str, required=True, help='location to store results') 30 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params') 31 | args = parser.parse_args() 32 | JOB_DIR = args.output 33 | if not os.path.exists(JOB_DIR): 34 | os.mkdir(JOB_DIR) 35 | with open(args.config, 'r') as f: 36 | job_data = eval(f.read()) 37 | assert 'algorithm' in job_data.keys() 38 | assert any([job_data['algorithm'] == a for a in ['NPG', 'NVPG', 'VPG', 'PPO']]) 39 | assert 'sample_mode' in job_data.keys() 40 | job_data['alg_hyper_params'] = dict() if 'alg_hyper_params' not in job_data.keys() else job_data['alg_hyper_params'] 41 | 42 | EXP_FILE = JOB_DIR + '/job_config.json' 43 | with open(EXP_FILE, 'w') as f: 44 | json.dump(job_data, f, indent=4) 45 | 46 | if job_data['sample_mode'] == 'trajectories': 47 | assert 'rl_num_traj' in job_data.keys() 48 | job_data['rl_num_samples'] = 0 # will be ignored 49 | elif job_data['sample_mode'] == 'samples': 50 | assert 'rl_num_samples' in job_data.keys() 51 | job_data['rl_num_traj'] = 0 # will be ignored 52 | else: 53 | print("Unknown sampling mode. 
Choose either trajectories or samples") 54 | exit() 55 | 56 | # =============================================================================== 57 | # Train Loop 58 | # =============================================================================== 59 | 60 | e = GymEnv(job_data['env']) 61 | policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=job_data['seed'], init_log_std=job_data['init_log_std']) 62 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=job_data['vf_batch_size'], hidden_sizes=job_data['vf_hidden_size'], 63 | epochs=job_data['vf_epochs'], learn_rate=job_data['vf_learn_rate']) 64 | 65 | # Construct the algorithm 66 | if job_data['algorithm'] == 'NPG': 67 | # Other hyperparameters (like number of CG steps) can be specified in config for pass through 68 | # or default hyperparameters will be used 69 | agent = NPG(e, policy, baseline, normalized_step_size=job_data['rl_step_size'], 70 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 71 | 72 | elif job_data['algorithm'] == 'VPG': 73 | agent = BatchREINFORCE(e, policy, baseline, learn_rate=job_data['rl_step_size'], 74 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 75 | 76 | elif job_data['algorithm'] == 'NVPG': 77 | agent = BatchREINFORCE(e, policy, baseline, desired_kl=job_data['rl_step_size'], 78 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 79 | 80 | elif job_data['algorithm'] == 'PPO': 81 | # There are many hyperparameters for PPO. They can be specified in config for pass through 82 | # or defaults in the PPO algorithm will be used 83 | agent = PPO(e, policy, baseline, save_logs=True, **job_data['alg_hyper_params']) 84 | 85 | print("========================================") 86 | print("Starting policy learning") 87 | print("========================================") 88 | 89 | ts = timer.time() 90 | train_agent(job_name=JOB_DIR, 91 | agent=agent, 92 | seed=job_data['seed'], 93 | niter=job_data['rl_num_iter'], 94 | gamma=job_data['rl_gamma'], 95 | gae_lambda=job_data['rl_gae'], 96 | num_cpu=job_data['num_cpu'], 97 | sample_mode=job_data['sample_mode'], 98 | num_traj=job_data['rl_num_traj'], 99 | num_samples=job_data['rl_num_samples'], 100 | save_freq=job_data['save_freq'], 101 | evaluation_rollouts=job_data['eval_rollouts']) 102 | print("time taken = %f" % (timer.time()-ts)) 103 | -------------------------------------------------------------------------------- /mjrl/__init__.py: -------------------------------------------------------------------------------- 1 | import mjrl.envs -------------------------------------------------------------------------------- /mjrl/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/algos/__init__.py -------------------------------------------------------------------------------- /mjrl/algos/behavior_cloning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Minimize bc loss (MLE, MSE, RWR etc.) 
with pytorch optimizers 3 | """ 4 | 5 | import logging 6 | logging.disable(logging.CRITICAL) 7 | import numpy as np 8 | import time as timer 9 | import torch 10 | from torch.autograd import Variable 11 | from mjrl.utils.logger import DataLog 12 | from tqdm import tqdm 13 | 14 | 15 | class BC: 16 | def __init__(self, expert_paths, 17 | policy, 18 | epochs = 5, 19 | batch_size = 64, 20 | lr = 1e-3, 21 | optimizer = None, 22 | loss_type = 'MSE', # can be 'MLE' or 'MSE' 23 | save_logs = True, 24 | set_transforms = False, 25 | **kwargs, 26 | ): 27 | 28 | self.policy = policy 29 | self.expert_paths = expert_paths 30 | self.epochs = epochs 31 | self.mb_size = batch_size 32 | self.logger = DataLog() 33 | self.loss_type = loss_type 34 | self.save_logs = save_logs 35 | 36 | if set_transforms: 37 | in_shift, in_scale, out_shift, out_scale = self.compute_transformations() 38 | self.set_transformations(in_shift, in_scale, out_shift, out_scale) 39 | self.set_variance_with_data(out_scale) 40 | 41 | # construct optimizer 42 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=lr) if optimizer is None else optimizer 43 | 44 | # Loss criterion if required 45 | if loss_type == 'MSE': 46 | self.loss_criterion = torch.nn.MSELoss() 47 | 48 | # make logger 49 | if self.save_logs: 50 | self.logger = DataLog() 51 | 52 | def compute_transformations(self): 53 | # get transformations 54 | if self.expert_paths == [] or self.expert_paths is None: 55 | in_shift, in_scale, out_shift, out_scale = None, None, None, None 56 | else: 57 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 58 | actions = np.concatenate([path["actions"] for path in self.expert_paths]) 59 | in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0) 60 | out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0) 61 | return in_shift, in_scale, out_shift, out_scale 62 | 63 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None): 64 | # set scalings in the target policy 65 | self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale) 66 | self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale) 67 | 68 | def set_variance_with_data(self, out_scale): 69 | # set the variance of gaussian policy based on out_scale 70 | params = self.policy.get_param_values() 71 | params[-self.policy.m:] = np.log(out_scale + 1e-12) 72 | self.policy.set_param_values(params) 73 | 74 | def loss(self, data, idx=None): 75 | if self.loss_type == 'MLE': 76 | return self.mle_loss(data, idx) 77 | elif self.loss_type == 'MSE': 78 | return self.mse_loss(data, idx) 79 | else: 80 | print("Please use valid loss type") 81 | return None 82 | 83 | def mle_loss(self, data, idx): 84 | # use indices if provided (e.g. 
for mini-batching) 85 | # otherwise, use all the data 86 | idx = range(data['observations'].shape[0]) if idx is None else idx 87 | if type(data['observations']) == torch.Tensor: 88 | idx = torch.LongTensor(idx) 89 | obs = data['observations'][idx] 90 | act = data['expert_actions'][idx] 91 | LL, mu, log_std = self.policy.new_dist_info(obs, act) 92 | # minimize negative log likelihood 93 | return -torch.mean(LL) 94 | 95 | def mse_loss(self, data, idx=None): 96 | idx = range(data['observations'].shape[0]) if idx is None else idx 97 | if type(data['observations']) is torch.Tensor: 98 | idx = torch.LongTensor(idx) 99 | obs = data['observations'][idx] 100 | act_expert = data['expert_actions'][idx] 101 | if type(data['observations']) is not torch.Tensor: 102 | obs = Variable(torch.from_numpy(obs).float(), requires_grad=False) 103 | act_expert = Variable(torch.from_numpy(act_expert).float(), requires_grad=False) 104 | act_pi = self.policy.model(obs) 105 | return self.loss_criterion(act_pi, act_expert.detach()) 106 | 107 | def fit(self, data, suppress_fit_tqdm=False, **kwargs): 108 | # data is a dict 109 | # keys should have "observations" and "expert_actions" 110 | validate_keys = all([k in data.keys() for k in ["observations", "expert_actions"]]) 111 | assert validate_keys is True 112 | ts = timer.time() 113 | num_samples = data["observations"].shape[0] 114 | 115 | # log stats before 116 | if self.save_logs: 117 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0] 118 | self.logger.log_kv('loss_before', loss_val) 119 | 120 | # train loop 121 | for ep in config_tqdm(range(self.epochs), suppress_fit_tqdm): 122 | for mb in range(int(num_samples / self.mb_size)): 123 | rand_idx = np.random.choice(num_samples, size=self.mb_size) 124 | self.optimizer.zero_grad() 125 | loss = self.loss(data, idx=rand_idx) 126 | loss.backward() 127 | self.optimizer.step() 128 | params_after_opt = self.policy.get_param_values() 129 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True) 130 | 131 | # log stats after 132 | if self.save_logs: 133 | self.logger.log_kv('epoch', self.epochs) 134 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0] 135 | self.logger.log_kv('loss_after', loss_val) 136 | self.logger.log_kv('time', (timer.time()-ts)) 137 | 138 | def train(self, **kwargs): 139 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 140 | expert_actions = np.concatenate([path["actions"] for path in self.expert_paths]) 141 | data = dict(observations=observations, expert_actions=expert_actions) 142 | self.fit(data, **kwargs) 143 | 144 | 145 | def config_tqdm(range_inp, suppress_tqdm=False): 146 | if suppress_tqdm: 147 | return range_inp 148 | else: 149 | return tqdm(range_inp) -------------------------------------------------------------------------------- /mjrl/algos/dapg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | 16 | # utility functions 17 | import mjrl.utils.process_samples as process_samples 18 | from mjrl.utils.logger import DataLog 19 | from mjrl.utils.cg_solve import cg_solve 20 | 21 | # Import Algs 22 | from 
mjrl.algos.npg_cg import NPG 23 | from mjrl.algos.behavior_cloning import BC 24 | 25 | class DAPG(NPG): 26 | def __init__(self, env, policy, baseline, 27 | demo_paths=None, 28 | normalized_step_size=0.01, 29 | FIM_invert_args={'iters': 10, 'damping': 1e-4}, 30 | hvp_sample_frac=1.0, 31 | seed=123, 32 | save_logs=False, 33 | kl_dist=None, 34 | lam_0=1.0, # demo coef 35 | lam_1=0.95, # decay coef 36 | **kwargs, 37 | ): 38 | 39 | self.env = env 40 | self.policy = policy 41 | self.baseline = baseline 42 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size 43 | self.seed = seed 44 | self.save_logs = save_logs 45 | self.FIM_invert_args = FIM_invert_args 46 | self.hvp_subsample = hvp_sample_frac 47 | self.running_score = None 48 | self.demo_paths = demo_paths 49 | self.lam_0 = lam_0 50 | self.lam_1 = lam_1 51 | self.iter_count = 0.0 52 | if save_logs: self.logger = DataLog() 53 | 54 | def train_from_paths(self, paths): 55 | 56 | # Concatenate from all the trajectories 57 | observations = np.concatenate([path["observations"] for path in paths]) 58 | actions = np.concatenate([path["actions"] for path in paths]) 59 | advantages = np.concatenate([path["advantages"] for path in paths]) 60 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 61 | 62 | if self.demo_paths is not None and self.lam_0 > 0.0: 63 | demo_obs = np.concatenate([path["observations"] for path in self.demo_paths]) 64 | demo_act = np.concatenate([path["actions"] for path in self.demo_paths]) 65 | demo_adv = self.lam_0 * (self.lam_1 ** self.iter_count) * np.ones(demo_obs.shape[0]) 66 | self.iter_count += 1 67 | # concatenate all 68 | all_obs = np.concatenate([observations, demo_obs]) 69 | all_act = np.concatenate([actions, demo_act]) 70 | all_adv = 1e-2*np.concatenate([advantages/(np.std(advantages) + 1e-8), demo_adv]) 71 | else: 72 | all_obs = observations 73 | all_act = actions 74 | all_adv = advantages 75 | 76 | # cache return distributions for the paths 77 | path_returns = [sum(p["rewards"]) for p in paths] 78 | mean_return = np.mean(path_returns) 79 | std_return = np.std(path_returns) 80 | min_return = np.amin(path_returns) 81 | max_return = np.amax(path_returns) 82 | base_stats = [mean_return, std_return, min_return, max_return] 83 | self.running_score = mean_return if self.running_score is None else \ 84 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 85 | if self.save_logs: self.log_rollout_statistics(paths) 86 | 87 | # Keep track of times for various computations 88 | t_gLL = 0.0 89 | t_FIM = 0.0 90 | 91 | # Optimization algorithm 92 | # -------------------------- 93 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 94 | 95 | # DAPG 96 | ts = timer.time() 97 | sample_coef = all_adv.shape[0]/advantages.shape[0] 98 | dapg_grad = sample_coef*self.flat_vpg(all_obs, all_act, all_adv) 99 | t_gLL += timer.time() - ts 100 | 101 | # NPG 102 | ts = timer.time() 103 | hvp = self.build_Hvp_eval([observations, actions], 104 | regu_coef=self.FIM_invert_args['damping']) 105 | npg_grad = cg_solve(hvp, dapg_grad, x_0=dapg_grad.copy(), 106 | cg_iters=self.FIM_invert_args['iters']) 107 | t_FIM += timer.time() - ts 108 | 109 | # Step size computation 110 | # -------------------------- 111 | n_step_size = 2.0*self.kl_dist 112 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(dapg_grad.T, npg_grad) + 1e-20))) 113 | 114 | # Policy update 115 | # -------------------------- 116 | curr_params = self.policy.get_param_values() 117 | 
new_params = curr_params + alpha * npg_grad 118 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 119 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 120 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 121 | self.policy.set_param_values(new_params, set_new=True, set_old=True) 122 | 123 | # Log information 124 | if self.save_logs: 125 | self.logger.log_kv('alpha', alpha) 126 | self.logger.log_kv('delta', n_step_size) 127 | self.logger.log_kv('time_vpg', t_gLL) 128 | self.logger.log_kv('time_npg', t_FIM) 129 | self.logger.log_kv('kl_dist', kl_dist) 130 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 131 | self.logger.log_kv('running_score', self.running_score) 132 | try: 133 | self.env.env.env.evaluate_success(paths, self.logger) 134 | except: 135 | # nested logic for backwards compatibility. TODO: clean this up. 136 | try: 137 | success_rate = self.env.env.env.evaluate_success(paths) 138 | self.logger.log_kv('success_rate', success_rate) 139 | except: 140 | pass 141 | return base_stats 142 | -------------------------------------------------------------------------------- /mjrl/algos/mbac.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import time as timer 5 | import torch 6 | import torch.nn as nn 7 | from torch.autograd import Variable 8 | from mjrl.utils.logger import DataLog 9 | from tqdm import tqdm 10 | from mjrl.utils.gym_env import GymEnv 11 | from mjrl.policies.mpc_actor import MPCActor 12 | from mjrl.algos.behavior_cloning import BC 13 | 14 | 15 | class MBAC(BC): 16 | def __init__(self, 17 | env_name, 18 | policy, 19 | expert_paths = None, # for the initial seeding 20 | epochs = 5, 21 | batch_size = 64, 22 | lr = 1e-3, 23 | optimizer = None, 24 | loss_type = 'MSE', # can be 'MLE' or 'MSE' 25 | seed = 123, 26 | buffer_size = 50, # measured in number of trajectories 27 | mpc_params = None, 28 | save_logs = True, 29 | ): 30 | 31 | super().__init__(expert_paths=expert_paths, 32 | policy=policy, 33 | epochs=epochs, 34 | batch_size=batch_size, 35 | lr=lr, 36 | optimizer=optimizer, 37 | loss_type=loss_type, 38 | save_logs=save_logs, 39 | ) 40 | self.expert_paths = [] if self.expert_paths is None else self.expert_paths 41 | self.buffer_size = buffer_size 42 | 43 | # For the MPC policy 44 | self.env = GymEnv(env_name) 45 | self.env.reset(seed=seed) 46 | if mpc_params is None: 47 | mean = np.zeros(self.env.action_dim) 48 | sigma = 1.0 * np.ones(self.env.action_dim) 49 | filter_coefs = [sigma, 0.05, 0.0, 0.0] 50 | mpc_params = dict(env=GymEnv(env_name), H=10, 51 | paths_per_cpu=25, num_cpu=1, 52 | kappa=10.0, gamma=1.0, 53 | mean=mean, filter_coefs=filter_coefs, 54 | seed=seed) 55 | else: 56 | mpc_params['env'] = GymEnv(env_name) 57 | mpc_params['seed'] = seed 58 | 59 | self.mpc_params = mpc_params 60 | self.mpc_policy = MPCActor(**mpc_params) 61 | 62 | def collect_paths(self, num_traj=10, 63 | mode='policy', 64 | horizon=None, 65 | render=False 66 | ): 67 | horizon = self.env.horizon if horizon is None else horizon 68 | paths = [] 69 | for i in tqdm(range(num_traj)): 70 | self.env.reset() 71 | obs, act_pi, act_mpc, rew, states = [], [], [], [], [] 72 | for t in range(horizon): 73 | o = self.env.get_obs() 74 | s = self.env.get_env_state() 75 | a_pi = self.policy.get_action(o)[0] 76 | a_mpc = self.mpc_policy.get_action(s) 77 | a = a_pi if mode == 'policy' else 
a_mpc 78 | next_o, r, done, _ = self.env.step(a) 79 | if render: 80 | self.env.render() 81 | # store data 82 | obs.append(o) 83 | rew.append(r) 84 | states.append(s) 85 | act_pi.append(a_pi) 86 | act_mpc.append(a_mpc) 87 | # kill if done 88 | if done: 89 | break 90 | path = dict(observations=np.array(obs), 91 | actions=np.array(act_pi), 92 | expert_actions=np.array(act_mpc), 93 | rewards=np.array(rew), 94 | states=states, 95 | ) 96 | paths.append(path) 97 | return paths 98 | 99 | def add_paths_to_buffer(self, paths): 100 | for path in paths: 101 | self.expert_paths.append(path) 102 | if len(self.expert_paths) > self.buffer_size: 103 | # keep recent trajectories 104 | # TODO: Also consider keeping best performing trajectories 105 | self.expert_paths = self.expert_paths[-self.buffer_size:] 106 | if self.save_logs: 107 | self.logger.log_kv('buffer_size', len(self.expert_paths)) 108 | 109 | def get_data_from_buffer(self): 110 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 111 | expert_actions = np.concatenate([path["expert_actions"] for path in self.expert_paths]) 112 | observations = torch.Tensor(observations).float() 113 | expert_actions = torch.Tensor(expert_actions).float() 114 | data = dict(observations=observations, expert_actions=expert_actions) 115 | return data 116 | 117 | def train_step(self, num_traj=10, **kwargs): 118 | # collect data using policy actions 119 | # fit policy to expert actions on these states 120 | new_paths = self.collect_paths(num_traj, mode='policy') 121 | self.add_paths_to_buffer(new_paths) 122 | data = self.get_data_from_buffer() 123 | self.fit(data, **kwargs) 124 | stoc_pol_perf = np.mean([np.sum(path['rewards']) for path in new_paths]) 125 | return stoc_pol_perf -------------------------------------------------------------------------------- /mjrl/algos/model_accel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/algos/model_accel/__init__.py -------------------------------------------------------------------------------- /mjrl/algos/model_accel/model_accel_npg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | import pickle 6 | import mjrl.envs 7 | import os 8 | import time as timer 9 | from torch.autograd import Variable 10 | from mjrl.utils.gym_env import GymEnv 11 | from mjrl.algos.model_accel.nn_dynamics import WorldModel 12 | import mjrl.samplers.core as trajectory_sampler 13 | 14 | # utility functions 15 | import mjrl.utils.process_samples as process_samples 16 | from mjrl.utils.logger import DataLog 17 | from mjrl.algos.model_accel.sampling import policy_rollout 18 | 19 | # Import NPG 20 | from mjrl.algos.npg_cg import NPG 21 | 22 | 23 | class ModelAccelNPG(NPG): 24 | def __init__(self, learned_model=None, 25 | refine=False, 26 | kappa=5.0, 27 | plan_horizon=10, 28 | plan_paths=100, 29 | reward_function=None, 30 | termination_function=None, 31 | **kwargs): 32 | super(ModelAccelNPG, self).__init__(**kwargs) 33 | if learned_model is None: 34 | print("Algorithm requires a (list of) learned dynamics model") 35 | quit() 36 | elif isinstance(learned_model, WorldModel): 37 | self.learned_model = [learned_model] 38 | else: 39 | self.learned_model = learned_model 40 | self.refine, self.kappa, self.plan_horizon, self.plan_paths = refine, kappa, plan_horizon, 
plan_paths 41 | self.reward_function, self.termination_function = reward_function, termination_function 42 | 43 | def to(self, device): 44 | # Convert all the networks (except policy network which is clamped to CPU) 45 | # to the specified device 46 | for model in self.learned_model: 47 | model.to(device) 48 | try: self.baseline.model.to(device) 49 | except: pass 50 | 51 | def is_cuda(self): 52 | # Check if any of the networks are on GPU 53 | model_cuda = [model.is_cuda() for model in self.learned_model] 54 | model_cuda = any(model_cuda) 55 | baseline_cuda = next(self.baseline.model.parameters()).is_cuda 56 | return any([model_cuda, baseline_cuda]) 57 | 58 | def train_step(self, N, 59 | env=None, 60 | sample_mode='trajectories', 61 | horizon=1e6, 62 | gamma=0.995, 63 | gae_lambda=0.97, 64 | num_cpu='max', 65 | env_kwargs=None, 66 | init_states=None, 67 | reward_function=None, 68 | termination_function=None, 69 | truncate_lim=None, 70 | truncate_reward=0.0, 71 | **kwargs, 72 | ): 73 | 74 | ts = timer.time() 75 | 76 | # get the correct env behavior 77 | if env is None: 78 | env = self.env 79 | elif type(env) == str: 80 | env = GymEnv(env) 81 | elif isinstance(env, GymEnv): 82 | env = env 83 | elif callable(env): 84 | env = env(**env_kwargs) 85 | else: 86 | print("Unsupported environment format") 87 | raise AttributeError 88 | 89 | # get correct behavior for reward and termination 90 | reward_function = self.reward_function if reward_function is None else reward_function 91 | termination_function = self.termination_function if termination_function is None else termination_function 92 | if reward_function: assert callable(reward_function) 93 | if termination_function: assert callable(termination_function) 94 | 95 | # simulate trajectories with the learned model(s) 96 | # we want to use the same task instances (e.g. goal locations) for each model in ensemble 97 | paths = [] 98 | 99 | # NOTE: We can optionally specify a set of initial states to perform the rollouts from 100 | # This is useful for starting rollouts from the states in the replay buffer 101 | init_states = np.array([env.reset() for _ in range(N)]) if init_states is None else init_states 102 | assert type(init_states) == list 103 | assert len(init_states) == N 104 | 105 | for model in self.learned_model: 106 | # dont set seed explicitly -- this will make rollouts follow tne global seed 107 | rollouts = policy_rollout(num_traj=N, env=env, policy=self.policy, 108 | learned_model=model, eval_mode=False, horizon=horizon, 109 | init_state=init_states, seed=None) 110 | # use learned reward function if available 111 | if model.learn_reward: 112 | model.compute_path_rewards(rollouts) 113 | else: 114 | rollouts = reward_function(rollouts) 115 | num_traj, horizon, state_dim = rollouts['observations'].shape 116 | for i in range(num_traj): 117 | path = dict() 118 | obs = rollouts['observations'][i, :, :] 119 | act = rollouts['actions'][i, :, :] 120 | rew = rollouts['rewards'][i, :] 121 | path['observations'] = obs 122 | path['actions'] = act 123 | path['rewards'] = rew 124 | path['terminated'] = False 125 | paths.append(path) 126 | 127 | # NOTE: If tasks have termination condition, we will assume that the env has 128 | # a function that can terminate paths appropriately. 129 | # Otherwise, termination is not considered. 
130 | 131 | if callable(termination_function): paths = termination_function(paths) 132 | 133 | # remove paths that are too short 134 | paths = [path for path in paths if path['observations'].shape[0] >= 5] 135 | 136 | # additional truncation based on error in the ensembles 137 | if truncate_lim is not None and len(self.learned_model) > 1: 138 | for path in paths: 139 | pred_err = np.zeros(path['observations'].shape[0] - 1) 140 | for model in self.learned_model: 141 | s = path['observations'][:-1] 142 | a = path['actions'][:-1] 143 | s_next = path['observations'][1:] 144 | pred = model.predict(s, a) 145 | model_err = np.mean((s_next - pred)**2, axis=-1) 146 | pred_err = np.maximum(pred_err, model_err) 147 | violations = np.where(pred_err > truncate_lim)[0] 148 | truncated = (not len(violations) == 0) 149 | T = violations[0] + 1 if truncated else obs.shape[0] 150 | T = max(4, T) # we don't want corner cases of very short truncation 151 | path["observations"] = path["observations"][:T] 152 | path["actions"] = path["actions"][:T] 153 | path["rewards"] = path["rewards"][:T] 154 | if truncated: path["rewards"][-1] += truncate_reward 155 | path["terminated"] = False if T == obs.shape[0] else True 156 | 157 | if self.save_logs: 158 | self.logger.log_kv('time_sampling', timer.time() - ts) 159 | 160 | self.seed = self.seed + N if self.seed is not None else self.seed 161 | 162 | # compute returns 163 | process_samples.compute_returns(paths, gamma) 164 | # compute advantages 165 | process_samples.compute_advantages(paths, self.baseline, gamma, gae_lambda) 166 | # train from paths 167 | eval_statistics = self.train_from_paths(paths) 168 | eval_statistics.append(N) 169 | # log number of samples 170 | if self.save_logs: 171 | num_samples = np.sum([p["rewards"].shape[0] for p in paths]) 172 | self.logger.log_kv('num_samples', num_samples) 173 | # fit baseline 174 | if self.save_logs: 175 | ts = timer.time() 176 | error_before, error_after = self.baseline.fit(paths, return_errors=True) 177 | self.logger.log_kv('time_VF', timer.time()-ts) 178 | self.logger.log_kv('VF_error_before', error_before) 179 | self.logger.log_kv('VF_error_after', error_after) 180 | else: 181 | self.baseline.fit(paths) 182 | 183 | return eval_statistics 184 | 185 | def get_action(self, observation): 186 | if self.refine is False: 187 | return self.policy.get_action(observation) 188 | else: 189 | return self.get_refined_action(observation) 190 | 191 | def get_refined_action(self, observation): 192 | # TODO(Aravind): Implemenet this 193 | # This function should rollout many trajectories according to the learned 194 | # dynamics model and the policy, and should refine around the policy by 195 | # incorporating reward based refinement 196 | raise NotImplementedError 197 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/model_learning_mpc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mjrl.algos.model_accel.sampling import generate_paths, generate_perturbed_actions, trajectory_rollout 3 | 4 | 5 | class MPCPolicy(object): 6 | def __init__(self, env, 7 | plan_horizon, 8 | plan_paths=10, 9 | kappa=1.0, 10 | gamma=1.0, 11 | mean=None, 12 | filter_coefs=None, 13 | seed=123, 14 | warmstart=True, 15 | fitted_model=None, 16 | omega=5.0, 17 | **kwargs, 18 | ): 19 | 20 | # initialize 21 | self.env, self.seed = env, seed 22 | self.n, self.m = env.observation_dim, env.action_dim 23 | self.plan_horizon, self.num_traj = 
plan_horizon, plan_paths 24 | 25 | if fitted_model is None: 26 | print("Policy requires a fitted dynamics model") 27 | quit() 28 | else: 29 | self.fitted_model = fitted_model 30 | 31 | # initialize other params 32 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma 33 | if mean is None: 34 | self.mean = np.zeros(self.m) 35 | if filter_coefs is None: 36 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0] 37 | self.act_sequence = np.ones((self.plan_horizon, self.m)) * self.mean 38 | self.init_act_sequence = self.act_sequence.copy() 39 | self.warmstart = warmstart 40 | self.omega = omega 41 | 42 | def get_action(self, obs): 43 | # generate paths 44 | if type(self.fitted_model) == list: 45 | 46 | # Ensemble case 47 | # Collect trajectories from different models with same action sequences 48 | base_act = self.act_sequence 49 | act_list = [generate_perturbed_actions(base_act, self.filter_coefs) 50 | for _ in range(self.num_traj)] 51 | actions = np.array(act_list) 52 | paths_list = [] 53 | for model in self.fitted_model: 54 | paths = trajectory_rollout(actions, model, obs) 55 | self.env.env.env.compute_path_rewards(paths) 56 | paths_list.append(paths) 57 | # consolidate paths 58 | paths = dict() 59 | for k in paths_list[0].keys(): 60 | v = np.vstack([p[k] for p in paths_list]) 61 | paths[k] = v 62 | R = self.score_trajectory_ensemble(paths, paths_list) 63 | 64 | else: 65 | paths = generate_paths(num_traj=self.num_traj, fitted_model=self.fitted_model, 66 | start_state=obs, base_act=self.act_sequence, filter_coefs=self.filter_coefs) 67 | self.env.env.env.compute_path_rewards(paths) # will populate path['rewards'] 68 | R = self.score_trajectory(paths) 69 | 70 | S = np.exp(self.kappa * (R - np.max(R))) 71 | act = paths["actions"] 72 | 73 | weighted_seq = S * act.T 74 | act_sequence = np.sum(weighted_seq.T, axis=0) / (np.sum(S) + 1e-6) 75 | action = act_sequence[0].copy() 76 | 77 | # get updated action sequence 78 | if self.warmstart: 79 | self.act_sequence[:-1] = act_sequence[1:] 80 | self.act_sequence[-1] = self.mean.copy() 81 | else: 82 | self.act_sequence = self.init_act_sequence.copy() 83 | return action 84 | 85 | def score_trajectory_ensemble(self, paths, paths_list): 86 | num_traj = self.num_traj 87 | num_models = len(paths_list) 88 | total_traj = paths['rewards'].shape[0] 89 | horizon = paths['rewards'].shape[1] 90 | predictions = [p['observations'] for p in paths_list] 91 | disagreement = np.std(predictions, axis=0) # (num_traj, horizon, state_dim) 92 | disagreement = np.sum(disagreement, axis=(1,2)) # (num_traj,) 93 | scores = np.zeros(total_traj) 94 | for i in range(total_traj): 95 | disagreement_score = disagreement[i // self.num_traj] 96 | scores[i] = self.omega * disagreement_score 97 | for t in range(horizon): 98 | scores[i] += (self.gamma ** t) * paths["rewards"][i][t] 99 | return scores 100 | 101 | def score_trajectory(self, paths): 102 | # rewards shape: (num_traj, horizon) 103 | num_traj = paths["rewards"].shape[0] 104 | horizon = paths["rewards"].shape[1] 105 | scores = np.zeros(num_traj) 106 | for i in range(num_traj): 107 | scores[i] = 0.0 108 | for t in range(horizon): 109 | scores[i] += (self.gamma**t)*paths["rewards"][i][t] 110 | return scores 111 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/configs/point_mass.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 
'mjrl_point_mass-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 5, 9 | 'iter_samples' : 100, 10 | 'eval_rollouts' : 25, 11 | 'num_models' : 3, 12 | 'exp_notes' : 'Toy experiment for initial trial.', 13 | 'save_freq' : 1, 14 | 'device' : 'cpu', 15 | 'learn_reward' : False, 16 | 'reward_file' : 'utils/reward_functions/mjrl_point_mass.py', 17 | 18 | # dynamics learning 19 | 20 | 'hidden_size' : (256, 256), 21 | 'activation' : 'relu', 22 | 'fit_lr' : 1e-3, 23 | 'fit_wd' : 1e-5, 24 | 'buffer_size' : 10000, 25 | 'fit_mb_size' : 16, 26 | 'fit_epochs' : 25, 27 | 'refresh_fit' : False, 28 | 29 | # initial data 30 | 31 | 'init_log_std' : -0.5, 32 | 'min_log_std' : -2.0, 33 | 'init_samples' : 1000, 34 | 35 | # NPG params 36 | 37 | 'policy_size' : (32, 32), 38 | 'inner_steps' : 10, 39 | 'step_size' : 0.05, 40 | 'update_paths' : 250, 41 | 'start_state' : 'init', 42 | 'horizon' : 25, 43 | 44 | } 45 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/configs/reacher.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_reacher_7dof-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 25, 9 | 'iter_samples' : 500, 10 | 'eval_rollouts' : 10, 11 | 'num_models' : 4, 12 | 'save_freq' : 1, 13 | 'device' : 'cpu', 14 | 15 | # dynamics learning 16 | 17 | 'hidden_size' : (256, 256), 18 | 'activation' : 'relu', 19 | 'fit_lr' : 1e-3, 20 | 'fit_wd' : 0.0, 21 | 'buffer_size' : 20000, 22 | 'fit_mb_size' : 64, 23 | 'fit_epochs' : 20, 24 | 'refresh_fit' : False, 25 | 26 | # initial data 27 | 28 | 'init_log_std' : -0.5, 29 | 'min_log_std' : -2.5, 30 | 'init_samples' : 2500, 31 | 'init_policy' : None, 32 | 33 | 34 | # NPG params 35 | 36 | 'policy_size' : (64, 64), 37 | 'inner_steps' : 5, 38 | 'step_size' : 0.05, 39 | 'update_paths' : 250, 40 | 'start_state' : 'init', 41 | 'horizon' : 50, 42 | 43 | } -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/sandbox/example_config_mpc.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_point_mass-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 5, 9 | 'paths_per_iter': 5, 10 | 'eval_rollouts' : 10, 11 | 'num_models' : 3, 12 | 'exp_notes' : 'Toy experiment for initial trial.', 13 | 'save_freq' : 5, 14 | 'device' : 'cpu', 15 | 16 | # dynamics learning 17 | 18 | 'hidden_size' : (64, 64), 19 | 'activation' : 'relu', 20 | 'fit_lr' : 1e-3, 21 | 'fit_wd' : 1e-5, 22 | 'max_paths' : 1000, 23 | 'fit_mb_size' : 16, 24 | 'fit_epochs' : 25, 25 | 'refresh_fit' : True, 26 | 27 | # initial data 28 | 29 | 'init_log_std' : -0.5, 30 | 'n_init_paths' : 25, 31 | 'use_demos' : False, 32 | 'demo_file' : None, 33 | 34 | # model predictive control 35 | 36 | 'noisy_mpc' : True, # when collecting data for exploration 37 | 'noise_level' : 0.1, 38 | 'filter_coefs' : {'f1': 0.5, 'f2': 1.0, 'f3': 0.0, 'f4': 0.0}, 39 | 'plan_paths' : 200, 40 | 'plan_horizon' : 10, 41 | 'kappa' : 2.0, 42 | 'omega' : 0.0, 43 | 44 | } 45 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/sandbox/run_model_learning_mpc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Job script to optimize trajectories with fitted model 3 | """ 4 | 5 | import numpy as 
np 6 | import copy 7 | import torch 8 | import torch.nn as nn 9 | import pickle 10 | import mjrl.envs 11 | import time as timer 12 | import argparse 13 | import os 14 | import json 15 | import mjrl.samplers.core as trajectory_sampler 16 | import mjrl.utils.tensor_utils as tensor_utils 17 | from tqdm import tqdm 18 | from tabulate import tabulate 19 | from mjrl.policies.gaussian_mlp import MLP 20 | from mjrl.baselines.mlp_baseline import MLPBaseline 21 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 22 | from mjrl.utils.gym_env import GymEnv 23 | from mjrl.utils.logger import DataLog 24 | from mjrl.utils.make_train_plots import make_train_plots 25 | from mjrl.algos.model_accel.nn_dynamics import DynamicsModel 26 | from mjrl.algos.model_accel.model_learning_mpc import MPCPolicy 27 | from mjrl.algos.model_accel.sampling import sample_paths, evaluate_policy 28 | 29 | 30 | # =============================================================================== 31 | # Get command line arguments 32 | # =============================================================================== 33 | 34 | parser = argparse.ArgumentParser(description='Trajectory Optimization with fitted models.') 35 | parser.add_argument('--output', type=str, required=True, help='location to store results') 36 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params') 37 | args = parser.parse_args() 38 | OUT_DIR = args.output 39 | if not os.path.exists(OUT_DIR): 40 | os.mkdir(OUT_DIR) 41 | with open(args.config, 'r') as f: 42 | job_data = eval(f.read()) 43 | 44 | # Unpack args and make files for easy access 45 | logger = DataLog() 46 | ENV_NAME = job_data['env_name'] 47 | PICKLE_FILE = OUT_DIR + '/exp_results.pickle' 48 | EXP_FILE = OUT_DIR + '/job_data.json' 49 | SEED = job_data['seed'] 50 | job_data['filter_coefs'] = [job_data['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']] 51 | 52 | # base cases 53 | if 'num_models' not in job_data.keys(): 54 | job_data['num_models'] = 1 55 | if job_data['num_models'] == 1 or 'omega' not in job_data.keys(): 56 | job_data['omega'] = 0.0 57 | if 'eval_rollouts' not in job_data.keys(): 58 | job_data['eval_rollouts'] = 0 59 | if 'save_freq' not in job_data.keys(): 60 | job_data['save_freq'] = 10 61 | if 'device' not in job_data.keys(): 62 | job_data['device'] = 'cpu' 63 | if 'debug_mode' in job_data.keys(): 64 | DEBUG = job_data['debug_mode'] 65 | else: 66 | DEBUG =False 67 | if 'device_path' not in job_data.keys(): 68 | job_data['device_path'] = None 69 | with open(EXP_FILE, 'w') as f: 70 | json.dump(job_data, f, indent=4) 71 | 72 | del(job_data['seed']) 73 | job_data['base_seed'] = SEED 74 | 75 | # =============================================================================== 76 | # Train loop 77 | # =============================================================================== 78 | 79 | np.random.seed(SEED) 80 | torch.random.manual_seed(SEED) 81 | 82 | # TODO(Aravind): Map to hardware if device_path is specified 83 | 84 | e = GymEnv(ENV_NAME) 85 | e.set_seed(SEED) 86 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+i, **job_data) 87 | for i in range(job_data['num_models'])] 88 | exploratory_policy = MLP(e.spec, seed=SEED, init_log_std=job_data['init_log_std']) 89 | paths = [] 90 | 91 | for outer_iter in range(job_data['num_iter']): 92 | 93 | ts = timer.time() 94 | print("================> ITERATION : %i " % outer_iter) 95 | print("Getting interaction data from real dynamics ...") 96 | 97 | if 
outer_iter == 0: 98 | iter_paths = trajectory_sampler.sample_paths(job_data['n_init_paths'], e, 99 | exploratory_policy, 100 | eval_mode=False, base_seed=SEED) 101 | else: 102 | iter_paths = sample_paths(job_data['paths_per_iter'], 103 | mpc_policy.env, mpc_policy, 104 | eval_mode=(not job_data['noisy_mpc']), 105 | noise_level=job_data['noise_level'], 106 | base_seed=SEED + outer_iter) 107 | 108 | # reset the environment (good for hardware) 109 | e.reset() 110 | 111 | for p in iter_paths: 112 | paths.append(p) 113 | 114 | if len(paths) > job_data['max_paths']: 115 | diff = len(paths) - job_data['max_paths'] 116 | paths[:diff] = [] 117 | 118 | s = np.concatenate([p['observations'][:-1] for p in paths]) 119 | a = np.concatenate([p['actions'][:-1] for p in paths]) 120 | sp = np.concatenate([p['observations'][1:] for p in paths]) 121 | r = np.array([np.sum(p['rewards']) for p in iter_paths]) 122 | rollout_score = np.mean(r) 123 | 124 | logger.log_kv('fit_epochs', job_data['fit_epochs']) 125 | logger.log_kv('rollout_score', rollout_score) 126 | try: 127 | rollout_metric = e.env.env.evaluate_success(iter_paths) 128 | logger.log_kv('rollout_metric', rollout_metric) 129 | except: 130 | pass 131 | 132 | print("Data gathered, fitting model ...") 133 | if job_data['refresh_fit']: 134 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+123*outer_iter, 135 | **job_data) for i in range(job_data['num_models'])] 136 | 137 | for i, model in enumerate(models): 138 | epoch_loss = model.fit(s, a, sp, job_data['fit_mb_size'], job_data['fit_epochs']) 139 | logger.log_kv('loss_before_' + str(i), epoch_loss[0]) 140 | logger.log_kv('loss_after_' + str(i), epoch_loss[-1]) 141 | 142 | mpc_policy = MPCPolicy(env=e, fitted_model=models, seed=SEED+12345*outer_iter, **job_data) 143 | 144 | if job_data['eval_rollouts'] > 0: 145 | print("Performing validation rollouts ... 
") 146 | eval_paths = evaluate_policy(mpc_policy.env, mpc_policy, mpc_policy.fitted_model[0], noise_level=0.0, 147 | real_step=True, num_episodes=job_data['eval_rollouts'], visualize=False) 148 | eval_score = np.mean([np.sum(p['rewards']) for p in eval_paths]) 149 | logger.log_kv('eval_score', eval_score) 150 | try: 151 | eval_metric = e.env.env.evaluate_success(eval_paths) 152 | logger.log_kv('eval_metric', eval_metric) 153 | except: 154 | pass 155 | else: 156 | eval_paths = [] 157 | 158 | exp_data = dict(policy=mpc_policy, fitted_model=mpc_policy.fitted_model, 159 | log=logger.log, rollout_paths=iter_paths, eval_paths=eval_paths) 160 | if outer_iter > 0 and outer_iter % job_data['save_freq'] == 0: 161 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb')) 162 | pickle.dump(exp_data, open(OUT_DIR + '/iteration_' + str(outer_iter) + '.pickle', 'wb')) 163 | 164 | tf = timer.time() 165 | logger.log_kv('iter_time', tf-ts) 166 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1, 167 | logger.get_current_log().items())) 168 | print(tabulate(print_data)) 169 | logger.save_log(OUT_DIR+'/') 170 | make_train_plots(log=logger.log, keys=['rollout_score', 'eval_score', 'rollout_metric', 'eval_metric'], 171 | save_loc=OUT_DIR+'/') 172 | 173 | if job_data['debug_mode']: 174 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], False, 5, True) 175 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], True, 5, True) 176 | 177 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb')) # final save -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/utils/reward_functions/mjrl_point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def reward_function(paths): 4 | # path has two keys: observations and actions 5 | # path["observations"] : (num_traj, horizon, obs_dim) 6 | # return paths that contain rewards in path["rewards"] 7 | # path["rewards"] should have shape (num_traj, horizon) 8 | obs = paths["observations"] 9 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 10 | agent_pos = obs[:, :, :2] 11 | target_pos = obs[:, :, -2:] 12 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1) 13 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1) 14 | rewards = -1.0 * l1_dist - 0.5 * l2_dist 15 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s') 16 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 17 | return paths 18 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/utils/visualize_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import mjrl.envs 3 | import trajopt.envs 4 | import mj_envs 5 | import click 6 | import os 7 | import gym 8 | import numpy as np 9 | import pickle 10 | import torch 11 | from mjrl.utils.gym_env import GymEnv 12 | from mjrl.policies.gaussian_mlp import MLP 13 | import trajopt.envs 14 
| 15 | DESC = ''' 16 | Helper script to visualize policy (in mjrl format).\n 17 | USAGE:\n 18 | Visualizes policy on the env\n 19 | $ python utils/visualize_policy --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n 20 | ''' 21 | 22 | # MAIN ========================================================= 23 | @click.command(help=DESC) 24 | @click.option('--env_name', type=str, help='environment to load', required= True) 25 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None) 26 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation') 27 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123) 28 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10) 29 | @click.option('--log_std', type=float, default=-0.5) 30 | @click.option('--terminate', type=bool, default=True) 31 | @click.option('--device_path', type=str, default=None) 32 | def main(env_name, policy, mode, seed, episodes, log_std, terminate, device_path): 33 | render = True 34 | 35 | # TODO(Aravind): Map to hardware if device_path is specified 36 | 37 | e = GymEnv(env_name) 38 | e.set_seed(seed) 39 | np.random.seed(seed) 40 | torch.manual_seed(seed) 41 | if policy is not None: 42 | policy = pickle.load(open(policy, 'rb')) 43 | else: 44 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=log_std) 45 | 46 | for ep in range(episodes): 47 | o = e.reset() 48 | rew = 0.0 49 | t = 0 50 | done = False 51 | while t < e.horizon and done is False: 52 | o = e.get_obs() 53 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 54 | next_o, r, done, ifo = e.step(a) 55 | if terminate is False: 56 | done = False 57 | rew = rew + r 58 | t = t + 1 59 | if render: 60 | e.render() 61 | if done and t < e.horizon - 1: 62 | print("Episode terminated early") 63 | print("episode score = %f " % rew) 64 | 65 | e.reset() 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/utils/visualize_trajectories.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import click 3 | import json 4 | import numpy as np 5 | import torch 6 | import mjrl.envs 7 | import trajopt.envs 8 | import mj_envs 9 | import mjrl.utils.tensor_utils as tensor_utils 10 | 11 | from mjrl.utils.gym_env import GymEnv 12 | from mjrl.algos.model_accel.sampling import evaluate_policy 13 | 14 | DESC = ''' 15 | Helper script to visualize optimized trajectories (list of trajectories in trajopt format).\n 16 | USAGE:\n 17 | $ python viz_trajectories.py --file path_to_file.pickle\n 18 | ''' 19 | @click.command(help=DESC) 20 | @click.option('--file', type=str, help='pickle file with trajectories', required= True) 21 | @click.option('--seed', type=int, default=123) 22 | @click.option('--noise_level', type=float, default=0.0) 23 | @click.option('--num_episodes', type=int, help='number of times to play trajectories', default=5) 24 | @click.option('--config', type=str, help='if provided MPC params from here will be used.', default=None) 25 | @click.option('--device_path', type=str, default=None) 26 | def main(file, seed, noise_level, num_episodes, config, device_path): 27 | exp_data = pickle.load(open(file, 'rb')) 28 | policy = exp_data['policy'] 29 | model = 
exp_data['fitted_model'] 30 | model = model[-1] if type(model) == list else model 31 | env_id = policy.env.env_id 32 | render = True 33 | 34 | # TODO(Aravind): Map to hardware if device_path is specified 35 | 36 | env = GymEnv(env_id) 37 | policy.env = env 38 | 39 | env.set_seed(seed) 40 | np.random.seed(seed) 41 | torch.manual_seed(seed) 42 | 43 | if config is not None: 44 | try: 45 | with open(config, 'r') as f: 46 | config = eval(f.read()) 47 | except: 48 | with open(config, 'r') as f: 49 | config = json.load(f) 50 | policy.plan_horizon = config['plan_horizon'] 51 | policy.num_traj = config['plan_paths'] 52 | policy.kappa = config['kappa'] 53 | policy.filter_coefs = [config['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']] 54 | policy.omega = config['omega'] if 'omega' in config.keys() else 0.0 55 | 56 | # TODO(Aravind): Implement capability to set predicted state for rendering purposes 57 | # evaluate_policy(env, policy, model, noise_level, real_step=False, num_episodes=num_episodes, visualize=render) 58 | evaluate_policy(env, policy, model, noise_level, real_step=True, num_episodes=num_episodes, visualize=render) 59 | 60 | # final close out 61 | env.reset() 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /mjrl/algos/npg_cg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | 16 | # utility functions 17 | import mjrl.utils.process_samples as process_samples 18 | from mjrl.utils.logger import DataLog 19 | from mjrl.utils.cg_solve import cg_solve 20 | from mjrl.algos.batch_reinforce import BatchREINFORCE 21 | 22 | 23 | class NPG(BatchREINFORCE): 24 | def __init__(self, env, policy, baseline, 25 | normalized_step_size=0.01, 26 | const_learn_rate=None, 27 | FIM_invert_args={'iters': 10, 'damping': 1e-4}, 28 | hvp_sample_frac=1.0, 29 | seed=123, 30 | save_logs=False, 31 | kl_dist=None, 32 | input_normalization=None, 33 | **kwargs 34 | ): 35 | """ 36 | All inputs are expected in mjrl's format unless specified 37 | :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance 38 | :param kl_dist: desired KL distance between steps. Overrides normalized_step_size. 
39 | :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well) 40 | :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG 41 | :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow) 42 | :param seed: random seed 43 | """ 44 | 45 | self.env = env 46 | self.policy = policy 47 | self.baseline = baseline 48 | self.alpha = const_learn_rate 49 | self.n_step_size = normalized_step_size if kl_dist is None else 2.0 * kl_dist 50 | self.seed = seed 51 | self.save_logs = save_logs 52 | self.FIM_invert_args = FIM_invert_args 53 | self.hvp_subsample = hvp_sample_frac 54 | self.running_score = None 55 | if save_logs: self.logger = DataLog() 56 | # input normalization (running average) 57 | self.input_normalization = input_normalization 58 | if self.input_normalization is not None: 59 | if self.input_normalization > 1 or self.input_normalization <= 0: 60 | self.input_normalization = None 61 | 62 | def HVP(self, observations, actions, vector, regu_coef=None): 63 | regu_coef = self.FIM_invert_args['damping'] if regu_coef is None else regu_coef 64 | vec = Variable(torch.from_numpy(vector).float(), requires_grad=False) 65 | if self.hvp_subsample is not None and self.hvp_subsample < 0.99: 66 | num_samples = observations.shape[0] 67 | rand_idx = np.random.choice(num_samples, size=int(self.hvp_subsample*num_samples)) 68 | obs = observations[rand_idx] 69 | act = actions[rand_idx] 70 | else: 71 | obs = observations 72 | act = actions 73 | old_dist_info = self.policy.old_dist_info(obs, act) 74 | new_dist_info = self.policy.new_dist_info(obs, act) 75 | mean_kl = self.policy.mean_kl(new_dist_info, old_dist_info) 76 | grad_fo = torch.autograd.grad(mean_kl, self.policy.trainable_params, create_graph=True) 77 | flat_grad = torch.cat([g.contiguous().view(-1) for g in grad_fo]) 78 | h = torch.sum(flat_grad*vec) 79 | hvp = torch.autograd.grad(h, self.policy.trainable_params) 80 | hvp_flat = np.concatenate([g.contiguous().view(-1).data.numpy() for g in hvp]) 81 | return hvp_flat + regu_coef*vector 82 | 83 | def build_Hvp_eval(self, inputs, regu_coef=None): 84 | def eval(v): 85 | full_inp = inputs + [v] + [regu_coef] 86 | Hvp = self.HVP(*full_inp) 87 | return Hvp 88 | return eval 89 | 90 | # ---------------------------------------------------------- 91 | def train_from_paths(self, paths): 92 | 93 | observations, actions, advantages, base_stats, self.running_score = self.process_paths(paths) 94 | if self.save_logs: self.log_rollout_statistics(paths) 95 | 96 | # Keep track of times for various computations 97 | t_gLL = 0.0 98 | t_FIM = 0.0 99 | 100 | # normalize inputs if necessary 101 | if self.input_normalization: 102 | data_in_shift, data_in_scale = np.mean(observations, axis=0), np.std(observations, axis=0) 103 | pi_in_shift, pi_in_scale = self.policy.model.in_shift.data.numpy(), self.policy.model.in_scale.data.numpy() 104 | pi_out_shift, pi_out_scale = self.policy.model.out_shift.data.numpy(), self.policy.model.out_scale.data.numpy() 105 | pi_in_shift = self.input_normalization * pi_in_shift + (1-self.input_normalization) * data_in_shift 106 | pi_in_scale = self.input_normalization * pi_in_scale + (1-self.input_normalization) * data_in_scale 107 | self.policy.model.set_transformations(pi_in_shift, pi_in_scale, pi_out_shift, pi_out_scale) 108 | 109 | # Optimization algorithm 110 | # -------------------------- 111 | surr_before = self.CPI_surrogate(observations, actions, 
advantages).data.numpy().ravel()[0] 112 | 113 | # VPG 114 | ts = timer.time() 115 | vpg_grad = self.flat_vpg(observations, actions, advantages) 116 | t_gLL += timer.time() - ts 117 | 118 | # NPG 119 | ts = timer.time() 120 | hvp = self.build_Hvp_eval([observations, actions], 121 | regu_coef=self.FIM_invert_args['damping']) 122 | npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(), 123 | cg_iters=self.FIM_invert_args['iters']) 124 | t_FIM += timer.time() - ts 125 | 126 | # Step size computation 127 | # -------------------------- 128 | if self.alpha is not None: 129 | alpha = self.alpha 130 | n_step_size = (alpha ** 2) * np.dot(vpg_grad.T, npg_grad) 131 | else: 132 | n_step_size = self.n_step_size 133 | alpha = np.sqrt(np.abs(self.n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20))) 134 | 135 | # Policy update 136 | # -------------------------- 137 | curr_params = self.policy.get_param_values() 138 | new_params = curr_params + alpha * npg_grad 139 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 140 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 141 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 142 | self.policy.set_param_values(new_params, set_new=True, set_old=True) 143 | 144 | # Log information 145 | if self.save_logs: 146 | self.logger.log_kv('alpha', alpha) 147 | self.logger.log_kv('delta', n_step_size) 148 | self.logger.log_kv('time_vpg', t_gLL) 149 | self.logger.log_kv('time_npg', t_FIM) 150 | self.logger.log_kv('kl_dist', kl_dist) 151 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 152 | self.logger.log_kv('running_score', self.running_score) 153 | try: 154 | self.env.env.env.evaluate_success(paths, self.logger) 155 | except: 156 | # nested logic for backwards compatibility. TODO: clean this up. 
157 | try: 158 | success_rate = self.env.env.env.evaluate_success(paths) 159 | self.logger.log_kv('success_rate', success_rate) 160 | except: 161 | pass 162 | 163 | return base_stats 164 | -------------------------------------------------------------------------------- /mjrl/algos/ppo_clip.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | 16 | # utility functions 17 | import mjrl.utils.process_samples as process_samples 18 | from mjrl.utils.logger import DataLog 19 | from mjrl.utils.cg_solve import cg_solve 20 | from mjrl.algos.batch_reinforce import BatchREINFORCE 21 | 22 | 23 | class PPO(BatchREINFORCE): 24 | def __init__(self, env, policy, baseline, 25 | clip_coef = 0.2, 26 | epochs = 10, 27 | mb_size = 64, 28 | learn_rate = 3e-4, 29 | seed = 123, 30 | save_logs = False, 31 | **kwargs 32 | ): 33 | 34 | self.env = env 35 | self.policy = policy 36 | self.baseline = baseline 37 | self.learn_rate = learn_rate 38 | self.seed = seed 39 | self.save_logs = save_logs 40 | self.clip_coef = clip_coef 41 | self.epochs = epochs 42 | self.mb_size = mb_size 43 | self.running_score = None 44 | if save_logs: self.logger = DataLog() 45 | 46 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=learn_rate) 47 | 48 | def PPO_surrogate(self, observations, actions, advantages): 49 | adv_var = Variable(torch.from_numpy(advantages).float(), requires_grad=False) 50 | old_dist_info = self.policy.old_dist_info(observations, actions) 51 | new_dist_info = self.policy.new_dist_info(observations, actions) 52 | LR = self.policy.likelihood_ratio(new_dist_info, old_dist_info) 53 | LR_clip = torch.clamp(LR, min=1-self.clip_coef, max=1+self.clip_coef) 54 | ppo_surr = torch.mean(torch.min(LR*adv_var,LR_clip*adv_var)) 55 | return ppo_surr 56 | 57 | # ---------------------------------------------------------- 58 | def train_from_paths(self, paths): 59 | 60 | # Concatenate from all the trajectories 61 | observations = np.concatenate([path["observations"] for path in paths]) 62 | actions = np.concatenate([path["actions"] for path in paths]) 63 | advantages = np.concatenate([path["advantages"] for path in paths]) 64 | # Advantage whitening 65 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 66 | # NOTE : advantage should be zero mean in expectation 67 | # normalized step size invariant to advantage scaling, 68 | # but scaling can help with least squares 69 | 70 | # cache return distributions for the paths 71 | path_returns = [sum(p["rewards"]) for p in paths] 72 | mean_return = np.mean(path_returns) 73 | std_return = np.std(path_returns) 74 | min_return = np.amin(path_returns) 75 | max_return = np.amax(path_returns) 76 | base_stats = [mean_return, std_return, min_return, max_return] 77 | self.running_score = mean_return if self.running_score is None else \ 78 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 79 | if self.save_logs: self.log_rollout_statistics(paths) 80 | 81 | # Optimization algorithm 82 | # -------------------------- 83 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 84 | params_before_opt = 
self.policy.get_param_values() 85 | 86 | ts = timer.time() 87 | num_samples = observations.shape[0] 88 | for ep in range(self.epochs): 89 | for mb in range(int(num_samples / self.mb_size)): 90 | rand_idx = np.random.choice(num_samples, size=self.mb_size) 91 | obs = observations[rand_idx] 92 | act = actions[rand_idx] 93 | adv = advantages[rand_idx] 94 | self.optimizer.zero_grad() 95 | loss = - self.PPO_surrogate(obs, act, adv) 96 | loss.backward() 97 | self.optimizer.step() 98 | 99 | params_after_opt = self.policy.get_param_values() 100 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 101 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 102 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True) 103 | t_opt = timer.time() - ts 104 | 105 | # Log information 106 | if self.save_logs: 107 | self.logger.log_kv('t_opt', t_opt) 108 | self.logger.log_kv('kl_dist', kl_dist) 109 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 110 | self.logger.log_kv('running_score', self.running_score) 111 | try: 112 | self.env.env.env.evaluate_success(paths, self.logger) 113 | except: 114 | # nested logic for backwards compatibility. TODO: clean this up. 115 | try: 116 | success_rate = self.env.env.env.evaluate_success(paths) 117 | self.logger.log_kv('success_rate', success_rate) 118 | except: 119 | pass 120 | 121 | return base_stats 122 | -------------------------------------------------------------------------------- /mjrl/algos/trpo.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | import mjrl.samplers.batch_sampler as batch_sampler 16 | 17 | # utility functions 18 | import mjrl.utils.process_samples as process_samples 19 | from mjrl.utils.logger import DataLog 20 | from mjrl.utils.cg_solve import cg_solve 21 | 22 | # Import NPG 23 | from mjrl.algos.npg_cg import NPG 24 | 25 | class TRPO(NPG): 26 | def __init__(self, env, policy, baseline, 27 | kl_dist=0.01, 28 | FIM_invert_args={'iters': 10, 'damping': 1e-4}, 29 | hvp_sample_frac=1.0, 30 | seed=123, 31 | save_logs=False, 32 | normalized_step_size=0.01, 33 | **kwargs 34 | ): 35 | """ 36 | All inputs are expected in mjrl's format unless specified 37 | :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance 38 | :param kl_dist: desired KL distance between steps. Overrides normalized_step_size. 
39 | :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well) 40 | :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG 41 | :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow) 42 | :param seed: random seed 43 | """ 44 | 45 | self.env = env 46 | self.policy = policy 47 | self.baseline = baseline 48 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size 49 | self.seed = seed 50 | self.save_logs = save_logs 51 | self.FIM_invert_args = FIM_invert_args 52 | self.hvp_subsample = hvp_sample_frac 53 | self.running_score = None 54 | if save_logs: self.logger = DataLog() 55 | 56 | def train_from_paths(self, paths): 57 | 58 | # Concatenate from all the trajectories 59 | observations = np.concatenate([path["observations"] for path in paths]) 60 | actions = np.concatenate([path["actions"] for path in paths]) 61 | advantages = np.concatenate([path["advantages"] for path in paths]) 62 | # Advantage whitening 63 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 64 | # NOTE : advantage should be zero mean in expectation 65 | # normalized step size invariant to advantage scaling, 66 | # but scaling can help with least squares 67 | 68 | # cache return distributions for the paths 69 | path_returns = [sum(p["rewards"]) for p in paths] 70 | mean_return = np.mean(path_returns) 71 | std_return = np.std(path_returns) 72 | min_return = np.amin(path_returns) 73 | max_return = np.amax(path_returns) 74 | base_stats = [mean_return, std_return, min_return, max_return] 75 | self.running_score = mean_return if self.running_score is None else \ 76 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 77 | if self.save_logs: self.log_rollout_statistics(paths) 78 | 79 | # Keep track of times for various computations 80 | t_gLL = 0.0 81 | t_FIM = 0.0 82 | 83 | # Optimization algorithm 84 | # -------------------------- 85 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 86 | 87 | # VPG 88 | ts = timer.time() 89 | vpg_grad = self.flat_vpg(observations, actions, advantages) 90 | t_gLL += timer.time() - ts 91 | 92 | # NPG 93 | ts = timer.time() 94 | hvp = self.build_Hvp_eval([observations, actions], 95 | regu_coef=self.FIM_invert_args['damping']) 96 | npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(), 97 | cg_iters=self.FIM_invert_args['iters']) 98 | t_FIM += timer.time() - ts 99 | 100 | # Step size computation 101 | # -------------------------- 102 | n_step_size = 2.0*self.kl_dist 103 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20))) 104 | 105 | # Policy update 106 | # -------------------------- 107 | curr_params = self.policy.get_param_values() 108 | for k in range(100): 109 | new_params = curr_params + alpha * npg_grad 110 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 111 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 112 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 113 | if kl_dist < self.kl_dist: 114 | break 115 | else: 116 | alpha = 0.9*alpha # backtrack 117 | print("Step size too high. Backtracking. 
| kl = %f | surr diff = %f" % \ 118 | (kl_dist, surr_after-surr_before) ) 119 | if k == 99: 120 | alpha = 0.0 121 | 122 | new_params = curr_params + alpha * npg_grad 123 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 124 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 125 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 126 | self.policy.set_param_values(new_params, set_new=True, set_old=True) 127 | 128 | # Log information 129 | if self.save_logs: 130 | self.logger.log_kv('alpha', alpha) 131 | self.logger.log_kv('delta', n_step_size) 132 | self.logger.log_kv('time_vpg', t_gLL) 133 | self.logger.log_kv('time_npg', t_FIM) 134 | self.logger.log_kv('kl_dist', kl_dist) 135 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 136 | self.logger.log_kv('running_score', self.running_score) 137 | try: 138 | self.env.env.env.evaluate_success(paths, self.logger) 139 | except: 140 | # nested logic for backwards compatibility. TODO: clean this up. 141 | try: 142 | success_rate = self.env.env.env.evaluate_success(paths) 143 | self.logger.log_kv('success_rate', success_rate) 144 | except: 145 | pass 146 | 147 | return base_stats -------------------------------------------------------------------------------- /mjrl/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/baselines/__init__.py -------------------------------------------------------------------------------- /mjrl/baselines/linear_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | 5 | class LinearBaseline: 6 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-5): 7 | self.inp = inp 8 | self._reg_coeff = reg_coeff 9 | self._coeffs = None 10 | 11 | def _features(self, paths): 12 | if self.inp == 'env_features': 13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 14 | else: 15 | o = np.concatenate([path["observations"] for path in paths]) 16 | o = np.clip(o, -10, 10)/10.0 17 | if o.ndim > 2: 18 | o = o.reshape(o.shape[0], -1) 19 | N, n = o.shape 20 | num_feat = int( n + 1 + 4 ) # linear + bias (1.0) + time till pow 4 21 | feat_mat = np.ones((N, num_feat)) 22 | 23 | # linear features 24 | feat_mat[:,:n] = o 25 | 26 | k = 0 # start from this row 27 | for i in range(len(paths)): 28 | l = len(paths[i]["rewards"]) 29 | al = np.arange(l)/1000.0 30 | for j in range(4): 31 | feat_mat[k:k+l, -4+j] = al**(j+1) 32 | k += l 33 | 34 | return feat_mat 35 | 36 | def fit(self, paths, return_errors=False): 37 | 38 | featmat = self._features(paths) 39 | returns = np.concatenate([path["returns"] for path in paths]) 40 | 41 | if return_errors: 42 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape) 43 | errors = returns - predictions 44 | error_before = np.sum(errors**2)/np.sum(returns**2) 45 | 46 | reg_coeff = copy.deepcopy(self._reg_coeff) 47 | for _ in range(10): 48 | self._coeffs = np.linalg.lstsq( 49 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 50 | featmat.T.dot(returns) 51 | )[0] 52 | if not np.any(np.isnan(self._coeffs)): 53 | break 54 | reg_coeff *= 10 55 | 56 | if return_errors: 57 | predictions = featmat.dot(self._coeffs) 58 | errors = returns - predictions 59 | error_after = 
np.sum(errors**2)/np.sum(returns**2) 60 | return error_before, error_after 61 | 62 | def predict(self, path): 63 | if self._coeffs is None: 64 | return np.zeros(len(path["rewards"])) 65 | return self._features([path]).dot(self._coeffs) 66 | -------------------------------------------------------------------------------- /mjrl/baselines/mlp_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from mjrl.utils.optimize_model import fit_data 7 | 8 | import pickle 9 | 10 | class MLPBaseline: 11 | def __init__(self, env_spec, inp_dim=None, inp='obs', learn_rate=1e-3, reg_coef=0.0, 12 | batch_size=64, epochs=1, use_gpu=False, hidden_sizes=(128, 128)): 13 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim 14 | self.batch_size = batch_size 15 | self.epochs = epochs 16 | self.reg_coef = reg_coef 17 | self.use_gpu = use_gpu 18 | self.inp = inp 19 | self.hidden_sizes = hidden_sizes 20 | 21 | self.model = nn.Sequential() 22 | layer_sizes = (self.n + 4, ) + hidden_sizes + (1, ) 23 | for i in range(len(layer_sizes) - 1): 24 | layer_id = 'fc_' + str(i) 25 | relu_id = 'relu_' + str(i) 26 | self.model.add_module(layer_id, nn.Linear(layer_sizes[i], layer_sizes[i+1])) 27 | if i != len(layer_sizes) - 2: 28 | self.model.add_module(relu_id, nn.ReLU()) 29 | 30 | if self.use_gpu: 31 | self.model.cuda() 32 | 33 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learn_rate, weight_decay=reg_coef) 34 | self.loss_function = torch.nn.MSELoss() 35 | 36 | def _features(self, paths): 37 | if self.inp == 'env_features': 38 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 39 | else: 40 | o = np.concatenate([path["observations"] for path in paths]) 41 | o = np.clip(o, -10, 10)/10.0 42 | if o.ndim > 2: 43 | o = o.reshape(o.shape[0], -1) 44 | N, n = o.shape 45 | num_feat = int( n + 4 ) # linear + time till pow 4 46 | feat_mat = np.ones((N, num_feat)) # memory allocation 47 | 48 | # linear features 49 | feat_mat[:,:n] = o 50 | 51 | k = 0 # start from this row 52 | for i in range(len(paths)): 53 | l = len(paths[i]["rewards"]) 54 | al = np.arange(l)/1000.0 55 | for j in range(4): 56 | feat_mat[k:k+l, -4+j] = al**(j+1) 57 | k += l 58 | return feat_mat 59 | 60 | 61 | def fit(self, paths, return_errors=False): 62 | 63 | featmat = self._features(paths) 64 | returns = np.concatenate([path["returns"] for path in paths]).reshape(-1, 1) 65 | featmat = featmat.astype('float32') 66 | returns = returns.astype('float32') 67 | num_samples = returns.shape[0] 68 | 69 | # Make variables with the above data 70 | if self.use_gpu: 71 | featmat_var = Variable(torch.from_numpy(featmat).cuda(), requires_grad=False) 72 | returns_var = Variable(torch.from_numpy(returns).cuda(), requires_grad=False) 73 | else: 74 | featmat_var = Variable(torch.from_numpy(featmat), requires_grad=False) 75 | returns_var = Variable(torch.from_numpy(returns), requires_grad=False) 76 | 77 | if return_errors: 78 | if self.use_gpu: 79 | predictions = self.model(featmat_var).cpu().data.numpy().ravel() 80 | else: 81 | predictions = self.model(featmat_var).data.numpy().ravel() 82 | errors = returns.ravel() - predictions 83 | error_before = np.sum(errors**2)/(np.sum(returns**2) + 1e-8) 84 | 85 | epoch_losses = fit_data(self.model, featmat_var, returns_var, self.optimizer, 86 | self.loss_function, self.batch_size, self.epochs) 87 | 88 | if return_errors: 
89 | if self.use_gpu: 90 | predictions = self.model(featmat_var).cpu().data.numpy().ravel() 91 | else: 92 | predictions = self.model(featmat_var).data.numpy().ravel() 93 | errors = returns.ravel() - predictions 94 | error_after = np.sum(errors**2)/(np.sum(returns**2) + 1e-8) 95 | return error_before, error_after 96 | 97 | def predict(self, path): 98 | featmat = self._features([path]).astype('float32') 99 | if self.use_gpu: 100 | feat_var = Variable(torch.from_numpy(featmat).float().cuda(), requires_grad=False) 101 | prediction = self.model(feat_var).cpu().data.numpy().ravel() 102 | else: 103 | feat_var = Variable(torch.from_numpy(featmat).float(), requires_grad=False) 104 | prediction = self.model(feat_var).data.numpy().ravel() 105 | return prediction 106 | -------------------------------------------------------------------------------- /mjrl/baselines/quadratic_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class QuadraticBaseline: 5 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-3): 6 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim 7 | self.inp = inp 8 | self._reg_coeff = reg_coeff 9 | self._coeffs = None 10 | 11 | def _features(self, paths): 12 | if self.inp == 'env_features': 13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 14 | else: 15 | o = np.concatenate([path["observations"] for path in paths]) 16 | o = np.clip(o, -10, 10)/10.0 17 | if o.ndim > 2: 18 | o = o.reshape(o.shape[0], -1) 19 | N, n = o.shape 20 | num_feat = int( n + n*(n+1)/2 + 1 + 4 ) # linear + full quad (symmetric matrix) + bias (1.0) + time till pow 4 21 | feat_mat = np.ones((N, num_feat)) # memory allocation 22 | 23 | # linear features 24 | feat_mat[:,:n] = o 25 | 26 | # quadratic features 27 | k = n # starting from this column in feat_mat 28 | for i in range(n): 29 | for j in range(i, n): 30 | feat_mat[:,k] = o[:,i]*o[:,j] # element-wise product 31 | k += 1 32 | 33 | k = 0 # start from this row 34 | for i in range(len(paths)): 35 | l = len(paths[i]["rewards"]) 36 | al = np.arange(l)/1000.0 37 | for j in range(4): 38 | feat_mat[k:k+l, -4+j] = al**(j+1) 39 | k += l 40 | 41 | return feat_mat 42 | 43 | 44 | def fit(self, paths, return_errors=False): 45 | 46 | #featmat = np.concatenate([self._features(path) for path in paths]) 47 | featmat = self._features(paths) 48 | returns = np.concatenate([path["returns"] for path in paths]) 49 | 50 | if return_errors: 51 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape) 52 | errors = returns - predictions 53 | error_before = np.sum(errors**2)/np.sum(returns**2) 54 | 55 | reg_coeff = copy.deepcopy(self._reg_coeff) 56 | for _ in range(10): 57 | self._coeffs = np.linalg.lstsq( 58 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 59 | featmat.T.dot(returns) 60 | )[0] 61 | if not np.any(np.isnan(self._coeffs)): 62 | break 63 | reg_coeff *= 10 64 | 65 | if return_errors: 66 | predictions = featmat.dot(self._coeffs) 67 | errors = returns - predictions 68 | error_after = np.sum(errors**2)/np.sum(returns**2) 69 | return error_before, error_after 70 | 71 | def predict(self, path): 72 | if self._coeffs is None: 73 | return np.zeros(len(path["rewards"])) 74 | return self._features([path]).dot(self._coeffs) 75 | -------------------------------------------------------------------------------- /mjrl/baselines/zero_baseline.py: 
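# A minimal fit/predict sketch for the baseline classes above, assuming the
# package is importable; the path contents (random observations, rewards, and
# undiscounted returns-to-go) and all sizes are illustrative assumptions.
# LinearBaseline does not use env_spec in its constructor, so None is passed.
import numpy as np
from mjrl.baselines.linear_baseline import LinearBaseline

T, obs_dim = 50, 8
paths = []
for _ in range(4):
    rewards = np.random.randn(T)
    paths.append(dict(observations=np.random.randn(T, obs_dim),
                      rewards=rewards,
                      returns=np.cumsum(rewards[::-1])[::-1]))

baseline = LinearBaseline(env_spec=None)
error_before, error_after = baseline.fit(paths, return_errors=True)
values = baseline.predict(paths[0])            # per-timestep value estimates, shape (T,)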
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class ZeroBaseline: 5 | def __init__(self, env_spec, **kwargs): 6 | n = env_spec.observation_dim # number of states 7 | self._coeffs = None 8 | 9 | def fit(self, paths, return_errors=False): 10 | if return_errors: 11 | return 1.0, 1.0 12 | 13 | def predict(self, path): 14 | return np.zeros(len(path["rewards"])) 15 | -------------------------------------------------------------------------------- /mjrl/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # ---------------------------------------- 4 | # mjrl environments 5 | # ---------------------------------------- 6 | 7 | register( 8 | id='mjrl_point_mass-v0', 9 | entry_point='mjrl.envs:PointMassEnv', 10 | max_episode_steps=25, 11 | ) 12 | 13 | register( 14 | id='mjrl_swimmer-v0', 15 | entry_point='mjrl.envs:SwimmerEnv', 16 | max_episode_steps=500, 17 | ) 18 | 19 | register( 20 | id='mjrl_reacher_7dof-v0', 21 | entry_point='mjrl.envs:Reacher7DOFEnv', 22 | max_episode_steps=50, 23 | ) 24 | 25 | register( 26 | id='mjrl_peg_insertion-v0', 27 | entry_point='mjrl.envs:PegEnv', 28 | max_episode_steps=50, 29 | ) 30 | 31 | from mjrl.envs.mujoco_env import MujocoEnv 32 | # ^^^^^ so that user gets the correct error 33 | # message if mujoco is not installed correctly 34 | from mjrl.envs.point_mass import PointMassEnv 35 | from mjrl.envs.swimmer import SwimmerEnv 36 | from mjrl.envs.reacher_sawyer import Reacher7DOFEnv 37 | from mjrl.envs.peg_insertion_sawyer import PegEnv 38 | -------------------------------------------------------------------------------- /mjrl/envs/assets/peg_insertion.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /mjrl/envs/assets/point_mass.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 44 | -------------------------------------------------------------------------------- /mjrl/envs/assets/sawyer.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /mjrl/envs/assets/swimmer.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 67 | -------------------------------------------------------------------------------- /mjrl/envs/mujoco_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from gym import error, spaces 4 | from gym.utils import seeding 5 | import numpy as np 6 | from os import path 7 | import gym 8 | import six 9 | import time as timer 10 | 11 | try: 12 | import mujoco_py 13 | from mujoco_py import load_model_from_path, MjSim, MjViewer 14 | except ImportError as e: 15 | raise error.DependencyNotInstalled("{}. 
(HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) 16 | 17 | def get_sim(model_path): 18 | if model_path.startswith("/"): 19 | fullpath = model_path 20 | else: 21 | fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path) 22 | if not path.exists(fullpath): 23 | raise IOError("File %s does not exist" % fullpath) 24 | model = load_model_from_path(fullpath) 25 | return MjSim(model) 26 | 27 | class MujocoEnv(gym.Env): 28 | """Superclass for all MuJoCo environments. 29 | """ 30 | 31 | def __init__(self, model_path=None, frame_skip=1, sim=None): 32 | 33 | if sim is None: 34 | self.sim = get_sim(model_path) 35 | else: 36 | self.sim = sim 37 | self.data = self.sim.data 38 | self.model = self.sim.model 39 | 40 | self.frame_skip = frame_skip 41 | self.metadata = { 42 | 'render.modes': ['human', 'rgb_array'], 43 | 'video.frames_per_second': int(np.round(1.0 / self.dt)) 44 | } 45 | self.mujoco_render_frames = False 46 | 47 | self.init_qpos = self.data.qpos.ravel().copy() 48 | self.init_qvel = self.data.qvel.ravel().copy() 49 | try: 50 | observation, _reward, done, _info = self.step(np.zeros(self.model.nu)) 51 | except NotImplementedError: 52 | observation, _reward, done, _info = self._step(np.zeros(self.model.nu)) 53 | assert not done 54 | self.obs_dim = np.sum([o.size for o in observation]) if type(observation) is tuple else observation.size 55 | 56 | bounds = self.model.actuator_ctrlrange.copy() 57 | low = bounds[:, 0] 58 | high = bounds[:, 1] 59 | self.action_space = spaces.Box(low, high, dtype=np.float32) 60 | 61 | high = np.inf*np.ones(self.obs_dim) 62 | low = -high 63 | self.observation_space = spaces.Box(low, high, dtype=np.float32) 64 | 65 | self.seed() 66 | 67 | def seed(self, seed=None): 68 | self.np_random, seed = seeding.np_random(seed) 69 | return [seed] 70 | 71 | # methods to override: 72 | # ---------------------------- 73 | 74 | def reset_model(self): 75 | """ 76 | Reset the robot degrees of freedom (qpos and qvel). 77 | Implement this in each subclass. 78 | """ 79 | raise NotImplementedError 80 | 81 | def mj_viewer_setup(self): 82 | """ 83 | Due to specifics of new mujoco rendering, the standard viewer cannot be used 84 | with this set-up. Instead we use this mujoco specific function. 85 | """ 86 | pass 87 | 88 | def viewer_setup(self): 89 | """ 90 | Does not work. 
Use mj_viewer_setup() instead 91 | """ 92 | pass 93 | 94 | def evaluate_success(self, paths, logger=None): 95 | """ 96 | Log various success metrics calculated based on input paths into the logger 97 | """ 98 | pass 99 | 100 | # ----------------------------- 101 | 102 | def reset(self): 103 | self.sim.reset() 104 | self.sim.forward() 105 | ob = self.reset_model() 106 | return ob 107 | 108 | def set_state(self, qpos, qvel): 109 | assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,) 110 | old_state = self.sim.get_state() 111 | new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel, 112 | old_state.act, old_state.udd_state) 113 | self.sim.set_state(new_state) 114 | self.sim.forward() 115 | 116 | @property 117 | def dt(self): 118 | return self.model.opt.timestep * self.frame_skip 119 | 120 | def do_simulation(self, ctrl, n_frames): 121 | for i in range(self.model.nu): 122 | self.sim.data.ctrl[i] = ctrl[i] 123 | for _ in range(n_frames): 124 | self.sim.step() 125 | if self.mujoco_render_frames is True: 126 | self.mj_render() 127 | 128 | def mj_render(self): 129 | try: 130 | self.viewer.render() 131 | except: 132 | self.mj_viewer_setup() 133 | self.viewer._run_speed = 0.5 134 | #self.viewer._run_speed /= self.frame_skip 135 | self.viewer.render() 136 | 137 | def render(self, *args, **kwargs): 138 | pass 139 | #return self.mj_render() 140 | 141 | def _get_viewer(self): 142 | pass 143 | #return None 144 | 145 | def state_vector(self): 146 | state = self.sim.get_state() 147 | return np.concatenate([ 148 | state.qpos.flat, state.qvel.flat]) 149 | 150 | # ----------------------------- 151 | 152 | def visualize_policy(self, policy, horizon=1000, num_episodes=1, mode='exploration'): 153 | self.mujoco_render_frames = True 154 | for ep in range(num_episodes): 155 | o = self.reset() 156 | d = False 157 | t = 0 158 | score = 0.0 159 | while t < horizon and d is False: 160 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 161 | o, r, d, _ = self.step(a) 162 | t = t+1 163 | score = score + r 164 | print("Episode score = %f" % score) 165 | self.mujoco_render_frames = False 166 | 167 | def visualize_policy_offscreen(self, policy, horizon=1000, 168 | num_episodes=1, 169 | frame_size=(640,480), 170 | mode='exploration', 171 | save_loc='/tmp/', 172 | filename='newvid', 173 | camera_name=None): 174 | import skvideo.io 175 | for ep in range(num_episodes): 176 | print("Episode %d: rendering offline " % ep, end='', flush=True) 177 | o = self.reset() 178 | d = False 179 | t = 0 180 | arrs = [] 181 | t0 = timer.time() 182 | while t < horizon and d is False: 183 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 184 | o, r, d, _ = self.step(a) 185 | t = t+1 186 | curr_frame = self.sim.render(width=frame_size[0], height=frame_size[1], 187 | mode='offscreen', camera_name=camera_name, device_id=0) 188 | arrs.append(curr_frame[::-1,:,:]) 189 | print(t, end=', ', flush=True) 190 | file_name = save_loc + filename + str(ep) + ".mp4" 191 | skvideo.io.vwrite( file_name, np.asarray(arrs)) 192 | print("saved", file_name) 193 | t1 = timer.time() 194 | print("time taken = %f"% (t1-t0)) 195 | -------------------------------------------------------------------------------- /mjrl/envs/peg_insertion_sawyer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 
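# A minimal sketch of the subclass contract defined by MujocoEnv above,
# assuming a hypothetical asset 'my_task.xml' under mjrl/envs/assets and a
# placeholder reward; PegEnv below is a complete example of the same pattern.
import numpy as np
from gym import utils
from mjrl.envs import mujoco_env

class MyTaskEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    def __init__(self):
        # MujocoEnv.__init__ calls step() once, so step()/get_obs() must not
        # depend on attributes that are only set after this call returns.
        mujoco_env.MujocoEnv.__init__(self, 'my_task.xml', 5)
        utils.EzPickle.__init__(self)

    def step(self, a):
        self.do_simulation(a, self.frame_skip)
        obs = self.get_obs()
        reward = -np.linalg.norm(obs[:2])      # placeholder objective
        return obs, reward, False, {}

    def get_obs(self):
        return np.concatenate([self.data.qpos.ravel(), self.data.qvel.ravel()])

    def reset_model(self):
        self.set_state(self.init_qpos, self.init_qvel)
        return self.get_obs()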
| class PegEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.peg_sid = -2 10 | self.target_sid = -1 11 | mujoco_env.MujocoEnv.__init__(self, 'peg_insertion.xml', 4) 12 | utils.EzPickle.__init__(self) 13 | self.peg_sid = self.model.site_name2id("peg_bottom") 14 | self.target_sid = self.model.site_name2id("target") 15 | self.init_body_pos = self.model.body_pos.copy() 16 | 17 | def step(self, a): 18 | self.do_simulation(a, self.frame_skip) 19 | obs = self.get_obs() 20 | reward = self.get_reward(obs, a) 21 | return obs, reward, False, self.get_env_infos() 22 | 23 | def get_obs(self): 24 | return np.concatenate([ 25 | self.data.qpos.flat, 26 | self.data.qvel.flat, 27 | self.data.site_xpos[self.peg_sid], 28 | self.data.site_xpos[self.target_sid], 29 | ]) 30 | 31 | def get_reward(self, obs, act=None): 32 | obs = np.clip(obs, -10.0, 10.0) 33 | if len(obs.shape) == 1: 34 | # vector obs, called when stepping the env 35 | hand_pos = obs[-6:-3] 36 | target_pos = obs[-3:] 37 | l1_dist = np.sum(np.abs(hand_pos - target_pos)) 38 | l2_dist = np.linalg.norm(hand_pos - target_pos) 39 | else: 40 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 41 | hand_pos = obs[:, :, -6:-3] 42 | target_pos = obs[:, :, -3:] 43 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1) 44 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1) 45 | bonus = 5.0 * (l2_dist < 0.06) 46 | reward = - l1_dist - 5.0 * l2_dist + bonus 47 | return reward 48 | 49 | def compute_path_rewards(self, paths): 50 | # path has two keys: observations and actions 51 | # path["observations"] : (num_traj, horizon, obs_dim) 52 | # path["rewards"] should have shape (num_traj, horizon) 53 | obs = paths["observations"] 54 | rewards = self.get_reward(obs) 55 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 56 | 57 | # -------------------------------- 58 | # resets and randomization 59 | # -------------------------------- 60 | 61 | def robot_reset(self): 62 | self.set_state(self.init_qpos, self.init_qvel) 63 | 64 | def target_reset(self): 65 | # Randomize goal position 66 | goal_y = self.np_random.uniform(low=0.1, high=0.5) 67 | try: 68 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29) 69 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29) 70 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29) 71 | self.sim.forward() 72 | except: 73 | pass 74 | 75 | def reset_model(self, seed=None): 76 | if seed is not None: 77 | self.seeding = True 78 | self.seed(seed) 79 | self.robot_reset() 80 | self.target_reset() 81 | return self.get_obs() 82 | 83 | # -------------------------------- 84 | # get and set states 85 | # -------------------------------- 86 | 87 | def get_env_state(self): 88 | target_pos = self.model.body_pos[-1].copy() 89 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 90 | target_pos=target_pos) 91 | 92 | def set_env_state(self, state): 93 | self.sim.reset() 94 | qp = state['qp'].copy() 95 | qv = state['qv'].copy() 96 | target_pos = state['target_pos'] 97 | self.model.body_pos[-1] = target_pos 98 | goal_y = target_pos[1] 99 | self.data.qpos[:] = qp 100 | self.data.qvel[:] = qv 101 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29) 102 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29) 103 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29) 104 | self.sim.forward() 105 | 106 | # -------------------------------- 107 | # utility functions 108 | # 
-------------------------------- 109 | 110 | def get_env_infos(self): 111 | return dict(state=self.get_env_state()) 112 | 113 | def mj_viewer_setup(self): 114 | self.viewer = MjViewer(self.sim) 115 | self.viewer.cam.azimuth += 200 116 | self.sim.forward() 117 | self.viewer.cam.distance = self.model.stat.extent*2.0 118 | -------------------------------------------------------------------------------- /mjrl/envs/point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 | class PointMassEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.agent_bid = 0 10 | self.target_sid = 0 11 | utils.EzPickle.__init__(self) 12 | mujoco_env.MujocoEnv.__init__(self, 'point_mass.xml', 5) 13 | self.agent_bid = self.sim.model.body_name2id('agent') 14 | self.target_sid = self.sim.model.site_name2id('target') 15 | 16 | def step(self, a): 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self.get_obs() 19 | reward = self.get_reward(obs) 20 | return obs, reward, False, dict(solved=(reward > -0.1), state=self.get_env_state()) 21 | 22 | def get_obs(self): 23 | agent_pos = self.data.body_xpos[self.agent_bid].ravel() 24 | target_pos = self.data.site_xpos[self.target_sid].ravel() 25 | return np.concatenate([agent_pos[:2], self.data.qvel.ravel(), target_pos[:2]]) 26 | 27 | def get_reward(self, obs, act=None): 28 | if len(obs.shape) == 1: 29 | # vector obs, called when stepping the env 30 | agent_pos = obs[:2] 31 | target_pos = obs[-2:] 32 | l1_dist = np.sum(np.abs(agent_pos - target_pos)) 33 | l2_dist = np.linalg.norm(agent_pos - target_pos) 34 | else: 35 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 36 | agent_pos = obs[:, :, :2] 37 | target_pos = obs[:, :, -2:] 38 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1) 39 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1) 40 | reward = -1.0 * l1_dist - 0.5 * l2_dist 41 | return reward 42 | 43 | def compute_path_rewards(self, paths): 44 | # path has two keys: observations and actions 45 | # path["observations"] : (num_traj, horizon, obs_dim) 46 | # path["rewards"] should have shape (num_traj, horizon) 47 | obs = paths["observations"] 48 | rewards = self.get_reward(obs) 49 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s') 50 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 51 | return paths 52 | 53 | def reset_model(self): 54 | # randomize the agent and goal 55 | agent_x = self.np_random.uniform(low=-1.0, high=1.0) 56 | agent_y = self.np_random.uniform(low=-1.0, high=1.0) 57 | goal_x = self.np_random.uniform(low=-1.0, high=1.0) 58 | goal_y = self.np_random.uniform(low=-1.0, high=1.0) 59 | qp = np.array([agent_x, agent_y]) 60 | qv = self.init_qvel.copy() 61 | self.set_state(qp, qv) 62 | self.model.site_pos[self.target_sid][0] = goal_x 63 | self.model.site_pos[self.target_sid][1] = goal_y 64 | self.sim.forward() 65 | return self.get_obs() 66 | 67 | def evaluate_success(self, paths, logger=None): 68 | success = 0.0 69 | for p in paths: 70 | if np.mean(p['env_infos']['solved'][-4:]) > 0.0: 71 | success += 1.0 72 | success_rate = 100.0*success/len(paths) 73 | if logger is None: 74 | # nowhere to log so return the value 75 | return success_rate 76 | else: 77 | # log the success 78 | # can log multiple statistics here if needed 79 | logger.log_kv('success_rate', success_rate) 80 | return None 81 
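# A short rollout sketch for the environment above, assuming MuJoCo and this
# package are installed; random actions are used, so reaching the target (and
# hence the evaluate_success criterion on the last few steps) is unlikely.
import gym
import numpy as np
import mjrl.envs                        # registers mjrl_point_mass-v0

env = gym.make('mjrl_point_mass-v0')    # horizon of 25 per the registration
obs = env.reset()
path_return, solved = 0.0, []
for _ in range(25):
    obs, rew, done, info = env.step(env.action_space.sample())
    path_return += rew
    solved.append(info['solved'])
print("return = %.2f | solved near the end = %s" % (path_return, np.mean(solved[-4:]) > 0.0))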
| 82 | # -------------------------------- 83 | # get and set states 84 | # -------------------------------- 85 | 86 | def get_env_state(self): 87 | target_pos = self.model.site_pos[self.target_sid].copy() 88 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 89 | target_pos=target_pos) 90 | 91 | def set_env_state(self, state): 92 | self.sim.reset() 93 | qp = state['qp'].copy() 94 | qv = state['qv'].copy() 95 | target_pos = state['target_pos'] 96 | self.set_state(qp, qv) 97 | self.model.site_pos[self.target_sid] = target_pos 98 | self.sim.forward() 99 | 100 | # -------------------------------- 101 | # utility functions 102 | # -------------------------------- 103 | 104 | def get_env_infos(self): 105 | return dict(state=self.get_env_state()) 106 | 107 | def mj_viewer_setup(self): 108 | self.viewer = MjViewer(self.sim) 109 | self.sim.forward() 110 | -------------------------------------------------------------------------------- /mjrl/envs/reacher_sawyer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.hand_sid = -2 10 | self.target_sid = -1 11 | mujoco_env.MujocoEnv.__init__(self, 'sawyer.xml', 4) 12 | utils.EzPickle.__init__(self) 13 | self.hand_sid = self.model.site_name2id("finger") 14 | self.target_sid = self.model.site_name2id("target") 15 | 16 | def step(self, a): 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self.get_obs() 19 | reward = self.get_reward(obs, a) 20 | return obs, reward, False, self.get_env_infos() 21 | 22 | def get_obs(self): 23 | return np.concatenate([ 24 | self.data.qpos.flat, 25 | self.data.qvel.ravel() * self.dt, # delta_x instead of velocity 26 | self.data.site_xpos[self.hand_sid], 27 | self.data.site_xpos[self.target_sid], 28 | ]) 29 | 30 | def get_reward(self, obs, act=None): 31 | obs = np.clip(obs, -10.0, 10.0) 32 | if len(obs.shape) == 1: 33 | # vector obs, called when stepping the env 34 | hand_pos = obs[-6:-3] 35 | target_pos = obs[-3:] 36 | l1_dist = np.sum(np.abs(hand_pos - target_pos)) 37 | l2_dist = np.linalg.norm(hand_pos - target_pos) 38 | else: 39 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 40 | hand_pos = obs[:, :, -6:-3] 41 | target_pos = obs[:, :, -3:] 42 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1) 43 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1) 44 | reward = - l1_dist - 5.0 * l2_dist 45 | return reward 46 | 47 | def compute_path_rewards(self, paths): 48 | # path has two keys: observations and actions 49 | # path["observations"] : (num_traj, horizon, obs_dim) 50 | # path["rewards"] should have shape (num_traj, horizon) 51 | obs = paths["observations"] 52 | rewards = self.get_reward(obs) 53 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 54 | 55 | # -------------------------------- 56 | # resets and randomization 57 | # -------------------------------- 58 | 59 | def robot_reset(self): 60 | self.set_state(self.init_qpos, self.init_qvel) 61 | 62 | def target_reset(self): 63 | target_pos = np.array([0.1, 0.1, 0.1]) 64 | target_pos[0] = self.np_random.uniform(low=-0.3, high=0.3) 65 | target_pos[1] = self.np_random.uniform(low=-0.2, high=0.2) 66 | target_pos[2] = self.np_random.uniform(low=-0.25, high=0.25) 67 | self.model.site_pos[self.target_sid] = target_pos 68 | self.sim.forward() 69 | 70 | 
def reset_model(self, seed=None): 71 | if seed is not None: 72 | self.seeding = True 73 | self.seed(seed) 74 | self.robot_reset() 75 | self.target_reset() 76 | return self.get_obs() 77 | 78 | # -------------------------------- 79 | # get and set states 80 | # -------------------------------- 81 | 82 | def get_env_state(self): 83 | target_pos = self.model.site_pos[self.target_sid].copy() 84 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 85 | target_pos=target_pos) 86 | 87 | def set_env_state(self, state): 88 | self.sim.reset() 89 | qp = state['qp'].copy() 90 | qv = state['qv'].copy() 91 | target_pos = state['target_pos'] 92 | self.model.site_pos[self.target_sid] = target_pos 93 | self.data.qpos[:] = qp 94 | self.data.qvel[:] = qv 95 | self.sim.forward() 96 | 97 | # -------------------------------- 98 | # utility functions 99 | # -------------------------------- 100 | 101 | def get_env_infos(self): 102 | return dict(state=self.get_env_state()) 103 | 104 | def mj_viewer_setup(self): 105 | self.viewer = MjViewer(self.sim) 106 | self.viewer.cam.trackbodyid = 1 107 | self.viewer.cam.type = 1 108 | self.sim.forward() 109 | self.viewer.cam.distance = self.model.stat.extent * 2.0 110 | -------------------------------------------------------------------------------- /mjrl/envs/swimmer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'swimmer.xml', 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, a): 12 | xposbefore = self.data.qpos[0] 13 | self.do_simulation(a, self.frame_skip) 14 | xposafter = self.data.qpos[0] 15 | 16 | delta = (xposafter - xposbefore) 17 | # make agent move in the negative x direction 18 | reward = -10.0 * delta 19 | done = False 20 | 21 | ob = self.get_obs() 22 | return ob, reward, done, self.get_env_infos() 23 | 24 | def get_obs(self): 25 | return np.concatenate([ 26 | self.data.qpos.flat[2:], 27 | self.data.qvel.flat, 28 | ]) 29 | 30 | def reset_model(self): 31 | qpos_init = self.init_qpos.copy() 32 | qpos_init[2] = self.np_random.uniform(low=-np.pi, high=np.pi) 33 | self.set_state(qpos_init, self.init_qvel) 34 | self.sim.forward() 35 | return self.get_obs() 36 | 37 | # -------------------------------- 38 | # get and set states 39 | # -------------------------------- 40 | 41 | def get_env_state(self): 42 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy()) 43 | 44 | def set_env_state(self, state): 45 | self.sim.reset() 46 | qp = state['qp'].copy() 47 | qv = state['qv'].copy() 48 | self.set_state(qp, qv) 49 | self.sim.forward() 50 | 51 | # -------------------------------- 52 | # utility functions 53 | # -------------------------------- 54 | 55 | def get_env_infos(self): 56 | return dict(state=self.get_env_state()) 57 | 58 | def mj_viewer_setup(self): 59 | self.viewer = MjViewer(self.sim) 60 | self.viewer.cam.trackbodyid = 1 61 | self.viewer.cam.type = 1 62 | self.sim.forward() 63 | self.viewer.cam.distance = self.model.stat.extent*1.2 -------------------------------------------------------------------------------- /mjrl/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/policies/__init__.py 
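# A short sketch of the get_env_state / set_env_state pattern shared by the
# environments above, using the swimmer; assumes MuJoCo and this package are
# installed. The TimeLimit wrapper added by gym.make is unwrapped to reach
# the methods defined on SwimmerEnv.
import gym
import numpy as np
import mjrl.envs                              # registers mjrl_swimmer-v0

env = gym.make('mjrl_swimmer-v0').unwrapped
env.reset()
snapshot = env.get_env_state()                # dict with copies of qpos / qvel
for _ in range(10):
    env.step(env.action_space.sample())
env.set_env_state(snapshot)                   # rewind the simulation
assert np.allclose(env.get_env_state()['qp'], snapshot['qp'])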
-------------------------------------------------------------------------------- /mjrl/policies/gaussian_linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from mjrl.utils.fc_network import FCNetwork 7 | 8 | 9 | class LinearPolicy: 10 | def __init__(self, env_spec, 11 | min_log_std=-3, 12 | init_log_std=0, 13 | seed=None): 14 | """ 15 | :param env_spec: specifications of the env (see utils/gym_env.py) 16 | :param min_log_std: log_std is clamped at this value and can't go below 17 | :param init_log_std: initial log standard deviation 18 | :param seed: random seed 19 | """ 20 | self.n = env_spec.observation_dim # number of states 21 | self.m = env_spec.action_dim # number of actions 22 | self.min_log_std = min_log_std 23 | 24 | # Set seed 25 | # ------------------------ 26 | if seed is not None: 27 | torch.manual_seed(seed) 28 | np.random.seed(seed) 29 | 30 | # Policy network 31 | # ------------------------ 32 | self.model = FCNetwork(self.n, self.m, hidden_sizes=()) 33 | # make weights small 34 | for param in list(self.model.parameters())[-2:]: # only last layer 35 | param.data = 1e-2 * param.data 36 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True) 37 | self.trainable_params = list(self.model.parameters()) + [self.log_std] 38 | 39 | # Old Policy network 40 | # ------------------------ 41 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes=()) 42 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std) 43 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std] 44 | for idx, param in enumerate(self.old_params): 45 | param.data = self.trainable_params[idx].data.clone() 46 | 47 | # Easy access variables 48 | # ------------------------- 49 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 50 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params] 51 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params] 52 | self.d = np.sum(self.param_sizes) # total number of params 53 | 54 | # Placeholders 55 | # ------------------------ 56 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False) 57 | 58 | # Utility functions 59 | # ============================================ 60 | def get_param_values(self): 61 | params = np.concatenate([p.contiguous().view(-1).data.numpy() 62 | for p in self.trainable_params]) 63 | return params.copy() 64 | 65 | def set_param_values(self, new_params, set_new=True, set_old=True): 66 | if set_new: 67 | current_idx = 0 68 | for idx, param in enumerate(self.trainable_params): 69 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 70 | vals = vals.reshape(self.param_shapes[idx]) 71 | param.data = torch.from_numpy(vals).float() 72 | current_idx += self.param_sizes[idx] 73 | # clip std at minimum value 74 | self.trainable_params[-1].data = \ 75 | torch.clamp(self.trainable_params[-1], self.min_log_std).data 76 | # update log_std_val for sampling 77 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 78 | if set_old: 79 | current_idx = 0 80 | for idx, param in enumerate(self.old_params): 81 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 82 | vals = vals.reshape(self.param_shapes[idx]) 83 | param.data = torch.from_numpy(vals).float() 84 | current_idx += self.param_sizes[idx] 85 | # clip std at minimum value 86 | 
self.old_params[-1].data = \ 87 | torch.clamp(self.old_params[-1], self.min_log_std).data 88 | 89 | # Main functions 90 | # ============================================ 91 | def get_action(self, observation): 92 | o = np.float32(observation.reshape(1, -1)) 93 | self.obs_var.data = torch.from_numpy(o) 94 | mean = self.model(self.obs_var).data.numpy().ravel() 95 | noise = np.exp(self.log_std_val) * np.random.randn(self.m) 96 | action = mean + noise 97 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}] 98 | 99 | def mean_LL(self, observations, actions, model=None, log_std=None): 100 | model = self.model if model is None else model 101 | log_std = self.log_std if log_std is None else log_std 102 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False) 103 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False) 104 | mean = model(obs_var) 105 | zs = (act_var - mean) / torch.exp(log_std) 106 | LL = - 0.5 * torch.sum(zs ** 2, dim=1) + \ 107 | - torch.sum(log_std) + \ 108 | - 0.5 * self.m * np.log(2 * np.pi) 109 | return mean, LL 110 | 111 | def log_likelihood(self, observations, actions, model=None, log_std=None): 112 | mean, LL = self.mean_LL(observations, actions, model, log_std) 113 | return LL.data.numpy() 114 | 115 | def old_dist_info(self, observations, actions): 116 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std) 117 | return [LL, mean, self.old_log_std] 118 | 119 | def new_dist_info(self, observations, actions): 120 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std) 121 | return [LL, mean, self.log_std] 122 | 123 | def likelihood_ratio(self, new_dist_info, old_dist_info): 124 | LL_old = old_dist_info[0] 125 | LL_new = new_dist_info[0] 126 | LR = torch.exp(LL_new - LL_old) 127 | return LR 128 | 129 | def mean_kl(self, new_dist_info, old_dist_info): 130 | old_log_std = old_dist_info[2] 131 | new_log_std = new_dist_info[2] 132 | old_std = torch.exp(old_log_std) 133 | new_std = torch.exp(new_log_std) 134 | old_mean = old_dist_info[1] 135 | new_mean = new_dist_info[1] 136 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2 137 | Dr = 2 * new_std ** 2 + 1e-8 138 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1) 139 | return torch.mean(sample_kl) 140 | -------------------------------------------------------------------------------- /mjrl/policies/gaussian_mlp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mjrl.utils.fc_network import FCNetwork 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class MLP: 8 | def __init__(self, env_spec, 9 | hidden_sizes=(64,64), 10 | min_log_std=-3, 11 | init_log_std=0, 12 | seed=None): 13 | """ 14 | :param env_spec: specifications of the env (see utils/gym_env.py) 15 | :param hidden_sizes: network hidden layer sizes (currently 2 layers only) 16 | :param min_log_std: log_std is clamped at this value and can't go below 17 | :param init_log_std: initial log standard deviation 18 | :param seed: random seed 19 | """ 20 | self.n = env_spec.observation_dim # number of states 21 | self.m = env_spec.action_dim # number of actions 22 | self.min_log_std = min_log_std 23 | 24 | # Set seed 25 | # ------------------------ 26 | if seed is not None: 27 | torch.manual_seed(seed) 28 | np.random.seed(seed) 29 | 30 | # Policy network 31 | # ------------------------ 32 | self.model = FCNetwork(self.n, self.m, hidden_sizes) 33 | # 
make weights small 34 | for param in list(self.model.parameters())[-2:]: # only last layer 35 | param.data = 1e-2 * param.data 36 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True) 37 | self.trainable_params = list(self.model.parameters()) + [self.log_std] 38 | 39 | # Old Policy network 40 | # ------------------------ 41 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes) 42 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std) 43 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std] 44 | for idx, param in enumerate(self.old_params): 45 | param.data = self.trainable_params[idx].data.clone() 46 | 47 | # Easy access variables 48 | # ------------------------- 49 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 50 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params] 51 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params] 52 | self.d = np.sum(self.param_sizes) # total number of params 53 | 54 | # Placeholders 55 | # ------------------------ 56 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False) 57 | 58 | # Utility functions 59 | # ============================================ 60 | def get_param_values(self): 61 | params = np.concatenate([p.contiguous().view(-1).data.numpy() 62 | for p in self.trainable_params]) 63 | return params.copy() 64 | 65 | def set_param_values(self, new_params, set_new=True, set_old=True): 66 | if set_new: 67 | current_idx = 0 68 | for idx, param in enumerate(self.trainable_params): 69 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 70 | vals = vals.reshape(self.param_shapes[idx]) 71 | param.data = torch.from_numpy(vals).float() 72 | current_idx += self.param_sizes[idx] 73 | # clip std at minimum value 74 | self.trainable_params[-1].data = \ 75 | torch.clamp(self.trainable_params[-1], self.min_log_std).data 76 | # update log_std_val for sampling 77 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 78 | if set_old: 79 | current_idx = 0 80 | for idx, param in enumerate(self.old_params): 81 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 82 | vals = vals.reshape(self.param_shapes[idx]) 83 | param.data = torch.from_numpy(vals).float() 84 | current_idx += self.param_sizes[idx] 85 | # clip std at minimum value 86 | self.old_params[-1].data = \ 87 | torch.clamp(self.old_params[-1], self.min_log_std).data 88 | 89 | # Main functions 90 | # ============================================ 91 | def get_action(self, observation): 92 | o = np.float32(observation.reshape(1, -1)) 93 | self.obs_var.data = torch.from_numpy(o) 94 | mean = self.model(self.obs_var).data.numpy().ravel() 95 | noise = np.exp(self.log_std_val) * np.random.randn(self.m) 96 | action = mean + noise 97 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}] 98 | 99 | def mean_LL(self, observations, actions, model=None, log_std=None): 100 | model = self.model if model is None else model 101 | log_std = self.log_std if log_std is None else log_std 102 | if type(observations) is not torch.Tensor: 103 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False) 104 | else: 105 | obs_var = observations 106 | if type(actions) is not torch.Tensor: 107 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False) 108 | else: 109 | act_var = actions 110 | mean = model(obs_var) 111 | zs = (act_var - mean) / torch.exp(log_std) 112 | LL = - 0.5 * torch.sum(zs ** 2, 
dim=1) + \ 113 | - torch.sum(log_std) + \ 114 | - 0.5 * self.m * np.log(2 * np.pi) 115 | return mean, LL 116 | 117 | def log_likelihood(self, observations, actions, model=None, log_std=None): 118 | mean, LL = self.mean_LL(observations, actions, model, log_std) 119 | return LL.data.numpy() 120 | 121 | def old_dist_info(self, observations, actions): 122 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std) 123 | return [LL, mean, self.old_log_std] 124 | 125 | def new_dist_info(self, observations, actions): 126 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std) 127 | return [LL, mean, self.log_std] 128 | 129 | def likelihood_ratio(self, new_dist_info, old_dist_info): 130 | LL_old = old_dist_info[0] 131 | LL_new = new_dist_info[0] 132 | LR = torch.exp(LL_new - LL_old) 133 | return LR 134 | 135 | def mean_kl(self, new_dist_info, old_dist_info): 136 | old_log_std = old_dist_info[2] 137 | new_log_std = new_dist_info[2] 138 | old_std = torch.exp(old_log_std) 139 | new_std = torch.exp(new_log_std) 140 | old_mean = old_dist_info[1] 141 | new_mean = new_dist_info[1] 142 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2 143 | Dr = 2 * new_std ** 2 + 1e-8 144 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1) 145 | return torch.mean(sample_kl) 146 | -------------------------------------------------------------------------------- /mjrl/policies/mpc_actor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from trajopt.utils import gather_paths_parallel 3 | 4 | 5 | class MPCActor(object): 6 | def __init__(self, env, H, paths_per_cpu, 7 | num_cpu=1, 8 | kappa=1.0, 9 | gamma=1.0, 10 | mean=None, 11 | filter_coefs=None, 12 | seed=123, 13 | ): 14 | 15 | self.env, self.seed = env, seed 16 | self.n, self.m = env.observation_dim, env.action_dim 17 | self.H, self.paths_per_cpu, self.num_cpu = H, paths_per_cpu, num_cpu 18 | 19 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma 20 | if mean is None: 21 | self.mean = np.zeros(self.m) 22 | if filter_coefs is None: 23 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0] 24 | 25 | self.env.reset() 26 | self.env.set_seed(seed) 27 | self.env.reset(seed=seed) 28 | self.act_sequence = np.ones((self.H, self.m)) * self.mean 29 | self.ctr = 1 30 | 31 | def score_trajectory(self, paths): 32 | scores = np.zeros(len(paths)) 33 | for i in range(len(paths)): 34 | scores[i] = 0.0 35 | for t in range(paths[i]["rewards"].shape[0]): 36 | scores[i] += (self.gamma**t)*paths[i]["rewards"][t] 37 | return scores 38 | 39 | def get_action(self, env_state): 40 | # Set to env_state 41 | # Shoot trajectories 42 | # Return optimal action 43 | seed = self.seed + self.ctr * 1000 44 | paths = gather_paths_parallel(self.env.env_id, 45 | env_state, 46 | self.act_sequence, 47 | self.filter_coefs, 48 | seed, 49 | self.paths_per_cpu, 50 | self.num_cpu, 51 | ) 52 | 53 | num_traj = len(paths) 54 | R = self.score_trajectory(paths) 55 | S = np.exp(self.kappa*(R-np.max(R))) 56 | act = np.sum([paths[i]["actions"][0] * S[i] for i in range(num_traj)], axis=0) 57 | act = act / (np.sum(S) + 1e-6) 58 | return act -------------------------------------------------------------------------------- /mjrl/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/samplers/__init__.py 
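Both Gaussian policies above expose the same small interface (`get_action`, `log_likelihood`, the `*_dist_info` helpers, and flat parameter getters/setters) that the algorithms in `mjrl/algos` build on. A minimal sketch of that interface, not tied to any simulator (the dimensions below are arbitrary placeholders):
```
import numpy as np
from mjrl.utils.gym_env import EnvSpec
from mjrl.policies.gaussian_mlp import MLP

spec = EnvSpec(obs_dim=10, act_dim=3, horizon=100)   # stand-in spec, no env needed
policy = MLP(spec, hidden_sizes=(32, 32), init_log_std=-0.5, seed=123)

obs = np.random.randn(10)
action, info = policy.get_action(obs)                # stochastic sample
mean_action = info['evaluation']                     # deterministic (mean) action

# batched log-likelihood of actions under the current parameters
obs_batch = np.random.randn(5, 10)
act_batch = np.random.randn(5, 3)
print(policy.log_likelihood(obs_batch, act_batch).shape)   # -> (5,)

# flat parameter vector round-trip, as used by the natural gradient updates
theta = policy.get_param_values()
policy.set_param_values(theta)
```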
-------------------------------------------------------------------------------- /mjrl/samplers/core.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from mjrl.utils.gym_env import GymEnv 4 | from mjrl.utils import tensor_utils 5 | logging.disable(logging.CRITICAL) 6 | import multiprocessing as mp 7 | import time as timer 8 | logging.disable(logging.CRITICAL) 9 | 10 | 11 | # Single core rollout to sample trajectories 12 | # ======================================================= 13 | def do_rollout( 14 | num_traj, 15 | env, 16 | policy, 17 | eval_mode = False, 18 | horizon = 1e6, 19 | base_seed = None, 20 | env_kwargs=None, 21 | ): 22 | """ 23 | :param num_traj: number of trajectories (int) 24 | :param env: environment (env class, str with env_name, or factory function) 25 | :param policy: policy to use for action selection 26 | :param eval_mode: use evaluation mode for action computation (bool) 27 | :param horizon: max horizon length for rollout (<= env.horizon) 28 | :param base_seed: base seed for rollouts (int) 29 | :param env_kwargs: dictionary with parameters, will be passed to env generator 30 | :return: 31 | """ 32 | 33 | # get the correct env behavior 34 | if type(env) == str: 35 | env = GymEnv(env) 36 | elif isinstance(env, GymEnv): 37 | env = env 38 | elif callable(env): 39 | env = env(**env_kwargs) 40 | else: 41 | print("Unsupported environment format") 42 | raise AttributeError 43 | 44 | if base_seed is not None: 45 | env.set_seed(base_seed) 46 | np.random.seed(base_seed) 47 | else: 48 | np.random.seed() 49 | horizon = min(horizon, env.horizon) 50 | paths = [] 51 | 52 | for ep in range(num_traj): 53 | # seeding 54 | if base_seed is not None: 55 | seed = base_seed + ep 56 | env.set_seed(seed) 57 | np.random.seed(seed) 58 | 59 | observations=[] 60 | actions=[] 61 | rewards=[] 62 | agent_infos = [] 63 | env_infos = [] 64 | 65 | o = env.reset() 66 | done = False 67 | t = 0 68 | 69 | while t < horizon and done != True: 70 | a, agent_info = policy.get_action(o) 71 | if eval_mode: 72 | a = agent_info['evaluation'] 73 | env_info_base = env.get_env_infos() 74 | next_o, r, done, env_info_step = env.step(a) 75 | # below is important to ensure correct env_infos for the timestep 76 | env_info = env_info_step if env_info_base == {} else env_info_base 77 | observations.append(o) 78 | actions.append(a) 79 | rewards.append(r) 80 | agent_infos.append(agent_info) 81 | env_infos.append(env_info) 82 | o = next_o 83 | t += 1 84 | 85 | path = dict( 86 | observations=np.array(observations), 87 | actions=np.array(actions), 88 | rewards=np.array(rewards), 89 | agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos), 90 | env_infos=tensor_utils.stack_tensor_dict_list(env_infos), 91 | terminated=done 92 | ) 93 | paths.append(path) 94 | 95 | del(env) 96 | return paths 97 | 98 | 99 | def sample_paths( 100 | num_traj, 101 | env, 102 | policy, 103 | eval_mode = False, 104 | horizon = 1e6, 105 | base_seed = None, 106 | num_cpu = 1, 107 | max_process_time=300, 108 | max_timeouts=4, 109 | suppress_print=False, 110 | env_kwargs=None, 111 | ): 112 | 113 | num_cpu = 1 if num_cpu is None else num_cpu 114 | num_cpu = mp.cpu_count() if num_cpu == 'max' else num_cpu 115 | assert type(num_cpu) == int 116 | 117 | if num_cpu == 1: 118 | input_dict = dict(num_traj=num_traj, env=env, policy=policy, 119 | eval_mode=eval_mode, horizon=horizon, base_seed=base_seed, 120 | env_kwargs=env_kwargs) 121 | # dont invoke multiprocessing if not 
necessary 122 | return do_rollout(**input_dict) 123 | 124 | # do multiprocessing otherwise 125 | paths_per_cpu = int(np.ceil(num_traj/num_cpu)) 126 | input_dict_list= [] 127 | for i in range(num_cpu): 128 | input_dict = dict(num_traj=paths_per_cpu, env=env, policy=policy, 129 | eval_mode=eval_mode, horizon=horizon, 130 | base_seed=base_seed + i * paths_per_cpu, 131 | env_kwargs=env_kwargs) 132 | input_dict_list.append(input_dict) 133 | if suppress_print is False: 134 | start_time = timer.time() 135 | print("####### Gathering Samples #######") 136 | 137 | results = _try_multiprocess(do_rollout, input_dict_list, 138 | num_cpu, max_process_time, max_timeouts) 139 | paths = [] 140 | # result is a paths type and results is list of paths 141 | for result in results: 142 | for path in result: 143 | paths.append(path) 144 | 145 | if suppress_print is False: 146 | print("======= Samples Gathered ======= | >>>> Time taken = %f " %(timer.time()-start_time) ) 147 | 148 | return paths 149 | 150 | 151 | def sample_data_batch( 152 | num_samples, 153 | env, 154 | policy, 155 | eval_mode = False, 156 | horizon = 1e6, 157 | base_seed = None, 158 | num_cpu = 1, 159 | paths_per_call = 1, 160 | env_kwargs=None, 161 | ): 162 | 163 | num_cpu = 1 if num_cpu is None else num_cpu 164 | num_cpu = mp.cpu_count() if num_cpu == 'max' else num_cpu 165 | assert type(num_cpu) == int 166 | 167 | start_time = timer.time() 168 | print("####### Gathering Samples #######") 169 | sampled_so_far = 0 170 | paths_so_far = 0 171 | paths = [] 172 | base_seed = 123 if base_seed is None else base_seed 173 | while sampled_so_far < num_samples: 174 | base_seed = base_seed + 12345 175 | new_paths = sample_paths(paths_per_call * num_cpu, env, policy, 176 | eval_mode, horizon, base_seed, num_cpu, 177 | suppress_print=True, env_kwargs=env_kwargs) 178 | for path in new_paths: 179 | paths.append(path) 180 | paths_so_far += len(new_paths) 181 | new_samples = np.sum([len(p['rewards']) for p in new_paths]) 182 | sampled_so_far += new_samples 183 | print("======= Samples Gathered ======= | >>>> Time taken = %f " % (timer.time() - start_time)) 184 | print("................................. | >>>> # samples = %i # trajectories = %i " % ( 185 | sampled_so_far, paths_so_far)) 186 | return paths 187 | 188 | 189 | def _try_multiprocess(func, input_dict_list, num_cpu, max_process_time, max_timeouts): 190 | 191 | # Base case 192 | if max_timeouts == 0: 193 | return None 194 | 195 | pool = mp.Pool(processes=num_cpu, maxtasksperchild=None) 196 | parallel_runs = [pool.apply_async(func, kwds=input_dict) for input_dict in input_dict_list] 197 | try: 198 | results = [p.get(timeout=max_process_time) for p in parallel_runs] 199 | except Exception as e: 200 | print(str(e)) 201 | print("Timeout Error raised... 
Trying again") 202 | pool.close() 203 | pool.terminate() 204 | pool.join() 205 | return _try_multiprocess(func, input_dict_list, num_cpu, max_process_time, max_timeouts-1) 206 | 207 | pool.close() 208 | pool.terminate() 209 | pool.join() 210 | return results 211 | -------------------------------------------------------------------------------- /mjrl/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/utils/__init__.py -------------------------------------------------------------------------------- /mjrl/utils/cg_solve.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def cg_solve(f_Ax, b, x_0=None, cg_iters=10, residual_tol=1e-10): 4 | x = np.zeros_like(b) #if x_0 is None else x_0 5 | r = b.copy() #if x_0 is None else b-f_Ax(x_0) 6 | p = r.copy() 7 | rdotr = r.dot(r) 8 | 9 | for i in range(cg_iters): 10 | z = f_Ax(p) 11 | v = rdotr / p.dot(z) 12 | x += v * p 13 | r -= v * z 14 | newrdotr = r.dot(r) 15 | mu = newrdotr / rdotr 16 | p = r + mu * p 17 | 18 | rdotr = newrdotr 19 | if rdotr < residual_tol: 20 | break 21 | 22 | return x 23 | -------------------------------------------------------------------------------- /mjrl/utils/fc_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class FCNetwork(nn.Module): 7 | def __init__(self, obs_dim, act_dim, 8 | hidden_sizes=(64,64), 9 | nonlinearity='tanh', # either 'tanh' or 'relu' 10 | in_shift = None, 11 | in_scale = None, 12 | out_shift = None, 13 | out_scale = None): 14 | super(FCNetwork, self).__init__() 15 | 16 | self.obs_dim = obs_dim 17 | self.act_dim = act_dim 18 | assert type(hidden_sizes) == tuple 19 | self.layer_sizes = (obs_dim, ) + hidden_sizes + (act_dim, ) 20 | self.set_transformations(in_shift, in_scale, out_shift, out_scale) 21 | 22 | # hidden layers 23 | self.fc_layers = nn.ModuleList([nn.Linear(self.layer_sizes[i], self.layer_sizes[i+1]) \ 24 | for i in range(len(self.layer_sizes) -1)]) 25 | self.nonlinearity = torch.relu if nonlinearity == 'relu' else torch.tanh 26 | 27 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None): 28 | # store native scales that can be used for resets 29 | self.transformations = dict(in_shift=in_shift, 30 | in_scale=in_scale, 31 | out_shift=out_shift, 32 | out_scale=out_scale 33 | ) 34 | self.in_shift = torch.from_numpy(np.float32(in_shift)) if in_shift is not None else torch.zeros(self.obs_dim) 35 | self.in_scale = torch.from_numpy(np.float32(in_scale)) if in_scale is not None else torch.ones(self.obs_dim) 36 | self.out_shift = torch.from_numpy(np.float32(out_shift)) if out_shift is not None else torch.zeros(self.act_dim) 37 | self.out_scale = torch.from_numpy(np.float32(out_scale)) if out_scale is not None else torch.ones(self.act_dim) 38 | 39 | def forward(self, x): 40 | # TODO(Aravind): Remove clamping to CPU 41 | # This is a temp change that should be fixed shortly 42 | if x.is_cuda: 43 | out = x.to('cpu') 44 | else: 45 | out = x 46 | out = (out - self.in_shift)/(self.in_scale + 1e-8) 47 | for i in range(len(self.fc_layers)-1): 48 | out = self.fc_layers[i](out) 49 | out = self.nonlinearity(out) 50 | out = self.fc_layers[-1](out) 51 | out = out * self.out_scale + self.out_shift 52 | return out 53 | 
-------------------------------------------------------------------------------- /mjrl/utils/get_environment.py: -------------------------------------------------------------------------------- 1 | """ 2 | convenience function to generate env 3 | useful if we want some procedural env generation 4 | """ 5 | 6 | import gym 7 | from mjrl.utils.gym_env import GymEnv 8 | 9 | def get_environment(env_name=None, **kwargs): 10 | if env_name is None: print("Need to specify environment name") 11 | e = GymEnv(env_name) 12 | # can make procedural modifications here if needed using kwargs 13 | return e 14 | -------------------------------------------------------------------------------- /mjrl/utils/gym_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper around a gym env that provides convenience functions 3 | """ 4 | 5 | import gym 6 | import numpy as np 7 | 8 | 9 | class EnvSpec(object): 10 | def __init__(self, obs_dim, act_dim, horizon): 11 | self.observation_dim = obs_dim 12 | self.action_dim = act_dim 13 | self.horizon = horizon 14 | 15 | 16 | class GymEnv(object): 17 | def __init__(self, env, env_kwargs=None, 18 | obs_mask=None, act_repeat=1, 19 | *args, **kwargs): 20 | 21 | # get the correct env behavior 22 | if type(env) == str: 23 | env = gym.make(env) 24 | elif isinstance(env, gym.Env): 25 | env = env 26 | elif callable(env): 27 | env = env(**env_kwargs) 28 | else: 29 | print("Unsupported environment format") 30 | raise AttributeError 31 | 32 | self.env = env 33 | self.env_id = env.spec.id 34 | self.act_repeat = act_repeat 35 | 36 | try: 37 | self._horizon = env.spec.max_episode_steps 38 | except AttributeError: 39 | self._horizon = env.spec._horizon 40 | 41 | assert self._horizon % act_repeat == 0 42 | self._horizon = self._horizon // self.act_repeat 43 | 44 | try: 45 | self._action_dim = self.env.env.action_dim 46 | except AttributeError: 47 | self._action_dim = self.env.action_space.shape[0] 48 | 49 | try: 50 | self._observation_dim = self.env.env.obs_dim 51 | except AttributeError: 52 | self._observation_dim = self.env.observation_space.shape[0] 53 | 54 | # Specs 55 | self.spec = EnvSpec(self._observation_dim, self._action_dim, self._horizon) 56 | 57 | # obs mask 58 | self.obs_mask = np.ones(self._observation_dim) if obs_mask is None else obs_mask 59 | 60 | @property 61 | def action_dim(self): 62 | return self._action_dim 63 | 64 | @property 65 | def observation_dim(self): 66 | return self._observation_dim 67 | 68 | @property 69 | def observation_space(self): 70 | return self.env.observation_space 71 | 72 | @property 73 | def action_space(self): 74 | return self.env.action_space 75 | 76 | @property 77 | def horizon(self): 78 | return self._horizon 79 | 80 | def reset(self, seed=None): 81 | try: 82 | self.env._elapsed_steps = 0 83 | return self.env.env.reset_model(seed=seed) 84 | except: 85 | if seed is not None: 86 | self.set_seed(seed) 87 | return self.env.reset() 88 | 89 | def reset_model(self, seed=None): 90 | # overloading for legacy code 91 | return self.reset(seed) 92 | 93 | def step(self, action): 94 | action = action.clip(self.action_space.low, self.action_space.high) 95 | if self.act_repeat == 1: 96 | obs, cum_reward, done, ifo = self.env.step(action) 97 | else: 98 | cum_reward = 0.0 99 | for i in range(self.act_repeat): 100 | obs, reward, done, ifo = self.env.step(action) 101 | cum_reward += reward 102 | if done: break 103 | return self.obs_mask * obs, cum_reward, done, ifo 104 | 105 | def render(self): 106 | try: 107 | 
self.env.env.mujoco_render_frames = True 108 | self.env.env.mj_render() 109 | except: 110 | self.env.render() 111 | 112 | def set_seed(self, seed=123): 113 | try: 114 | self.env.seed(seed) 115 | except AttributeError: 116 | self.env._seed(seed) 117 | 118 | def get_obs(self): 119 | try: 120 | return self.obs_mask * self.env.env.get_obs() 121 | except: 122 | return self.obs_mask * self.env.env._get_obs() 123 | 124 | def get_env_infos(self): 125 | try: 126 | return self.env.env.get_env_infos() 127 | except: 128 | return {} 129 | 130 | # =========================================== 131 | # Trajectory optimization related 132 | # Envs should support these functions in case of trajopt 133 | 134 | def get_env_state(self): 135 | try: 136 | return self.env.env.get_env_state() 137 | except: 138 | raise NotImplementedError 139 | 140 | def set_env_state(self, state_dict): 141 | try: 142 | self.env.env.set_env_state(state_dict) 143 | except: 144 | raise NotImplementedError 145 | 146 | def real_env_step(self, bool_val): 147 | try: 148 | self.env.env.real_step = bool_val 149 | except: 150 | raise NotImplementedError 151 | 152 | # =========================================== 153 | 154 | def visualize_policy(self, policy, horizon=1000, num_episodes=1, mode='exploration'): 155 | try: 156 | self.env.env.visualize_policy(policy, horizon, num_episodes, mode) 157 | except: 158 | for ep in range(num_episodes): 159 | o = self.reset() 160 | d = False 161 | t = 0 162 | score = 0.0 163 | while t < horizon and d is False: 164 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 165 | o, r, d, _ = self.step(a) 166 | score = score + r 167 | self.render() 168 | t = t+1 169 | print("Episode score = %f" % score) 170 | 171 | def evaluate_policy(self, policy, 172 | num_episodes=5, 173 | horizon=None, 174 | gamma=1, 175 | visual=False, 176 | percentile=[], 177 | get_full_dist=False, 178 | mean_action=False, 179 | init_env_state=None, 180 | terminate_at_done=True, 181 | seed=123): 182 | 183 | self.set_seed(seed) 184 | horizon = self._horizon if horizon is None else horizon 185 | mean_eval, std, min_eval, max_eval = 0.0, 0.0, -1e8, -1e8 186 | ep_returns = np.zeros(num_episodes) 187 | 188 | for ep in range(num_episodes): 189 | self.reset() 190 | if init_env_state is not None: 191 | self.set_env_state(init_env_state) 192 | t, done = 0, False 193 | while t < horizon and (done == False or terminate_at_done == False): 194 | self.render() if visual is True else None 195 | o = self.get_obs() 196 | a = policy.get_action(o)[1]['evaluation'] if mean_action is True else policy.get_action(o)[0] 197 | o, r, done, _ = self.step(a) 198 | ep_returns[ep] += (gamma ** t) * r 199 | t += 1 200 | 201 | mean_eval, std = np.mean(ep_returns), np.std(ep_returns) 202 | min_eval, max_eval = np.amin(ep_returns), np.amax(ep_returns) 203 | base_stats = [mean_eval, std, min_eval, max_eval] 204 | 205 | percentile_stats = [] 206 | for p in percentile: 207 | percentile_stats.append(np.percentile(ep_returns, p)) 208 | 209 | full_dist = ep_returns if get_full_dist is True else None 210 | 211 | return [base_stats, percentile_stats, full_dist] 212 | -------------------------------------------------------------------------------- /mjrl/utils/logger.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import scipy 6 | import pickle 7 | import os 8 | import csv 9 | 10 | class DataLog: 
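    # DataLog keeps one Python list per key (self.log[key]) and tracks the length of
    # the longest series (self.max_len). log_kv appends a value for the current
    # iteration, save_log writes both log.pickle and an 'iteration'-indexed log.csv,
    # get_current_log returns the most recent value of every key, and read_log /
    # shrink_to allow train_agent to resume from an existing log directory.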
11 | 12 | def __init__(self): 13 | self.log = {} 14 | self.max_len = 0 15 | 16 | def log_kv(self, key, value): 17 | # logs the (key, value) pair 18 | 19 | # TODO: This implementation is error-prone: 20 | # it would be NOT aligned if some keys are missing during one iteration. 21 | if key not in self.log: 22 | self.log[key] = [] 23 | self.log[key].append(value) 24 | if len(self.log[key]) > self.max_len: 25 | self.max_len = self.max_len + 1 26 | 27 | def save_log(self, save_path): 28 | # TODO: Validate all lengths are the same. 29 | pickle.dump(self.log, open(save_path + '/log.pickle', 'wb')) 30 | with open(save_path + '/log.csv', 'w') as csv_file: 31 | fieldnames = list(self.log.keys()) 32 | if 'iteration' not in fieldnames: 33 | fieldnames = ['iteration'] + fieldnames 34 | 35 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames) 36 | writer.writeheader() 37 | for row in range(self.max_len): 38 | row_dict = {'iteration': row} 39 | for key in self.log.keys(): 40 | if row < len(self.log[key]): 41 | row_dict[key] = self.log[key][row] 42 | writer.writerow(row_dict) 43 | 44 | def get_current_log(self): 45 | row_dict = {} 46 | for key in self.log.keys(): 47 | # TODO: this is very error-prone (alignment is not guaranteed) 48 | row_dict[key] = self.log[key][-1] 49 | return row_dict 50 | 51 | def shrink_to(self, num_entries): 52 | for key in self.log.keys(): 53 | self.log[key] = self.log[key][:num_entries] 54 | 55 | self.max_len = num_entries 56 | assert min([len(series) for series in self.log.values()]) == \ 57 | max([len(series) for series in self.log.values()]) 58 | 59 | def read_log(self, log_path): 60 | assert log_path.endswith('log.csv') 61 | 62 | with open(log_path) as csv_file: 63 | reader = csv.DictReader(csv_file) 64 | listr = list(reader) 65 | keys = reader.fieldnames 66 | data = {} 67 | for key in keys: 68 | data[key] = [] 69 | for row, row_dict in enumerate(listr): 70 | for key in keys: 71 | try: 72 | data[key].append(eval(row_dict[key])) 73 | except: 74 | print("ERROR on reading key {}: {}".format(key, row_dict[key])) 75 | 76 | if 'iteration' in data and data['iteration'][-1] != row: 77 | raise RuntimeError("Iteration %d mismatch -- possibly corrupted logfile?" 
% row) 78 | 79 | self.log = data 80 | self.max_len = max(len(v) for k, v in self.log.items()) 81 | print("Log read from {}: had {} entries".format(log_path, self.max_len)) 82 | -------------------------------------------------------------------------------- /mjrl/utils/make_train_plots.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import scipy 6 | import csv 7 | from mjrl.utils.logger import DataLog 8 | import argparse 9 | 10 | def make_train_plots(log = None, 11 | log_path = None, 12 | keys = None, 13 | save_loc = None, 14 | sample_key = 'num_samples', 15 | x_scale = 1.0, 16 | y_scale = 1.0): 17 | if log is None and log_path is None: 18 | print("Need to provide either the log or path to a log file") 19 | if log is None: 20 | logger = DataLog() 21 | logger.read_log(log_path) 22 | log = logger.log 23 | # make plots for specified keys 24 | for key in keys: 25 | if key in log.keys(): 26 | fig = plt.figure(figsize=(10,6)) 27 | ax1 = fig.add_subplot(111) 28 | try: 29 | cum_samples = [np.sum(log[sample_key][:i]) * x_scale for i in range(len(log[sample_key]))] 30 | ax1.plot(cum_samples, [elem * y_scale for elem in log[key]]) 31 | ax1.set_xlabel('samples') 32 | # mark iteration on the top axis 33 | ax2 = ax1.twiny() 34 | ax2.set_xlabel('iterations', color=(.7,.7,.7)) 35 | ax2.tick_params(axis='x', labelcolor=(.7,.7,.7)) 36 | ax2.set_xlim([0, len(log[key])]) 37 | except: 38 | ax1.plot(log[key]) 39 | ax1.set_xlabel('iterations') 40 | ax1.set_title(key) 41 | plt.savefig(save_loc+'/'+key+'.png', dpi=100) 42 | plt.close() 43 | 44 | # MAIN ========================================================= 45 | # Example: python make_train_plots.py --log_path logs/log.csv --keys eval_score rollout_score save_loc logs 46 | def main(): 47 | # Parse arguments 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument( 50 | '-l', '--log_path', type=str, required=True, help='path file to log.csv') 51 | parser.add_argument( 52 | '-k', '--keys', type=str, action='append', nargs='+', required=True, help='keys to plot') 53 | parser.add_argument( 54 | '-s', '--save_loc', type=str, default='', help='Path for logs') 55 | args = parser.parse_args() 56 | 57 | make_train_plots(log_path=args.log_path, keys=args.keys[0], save_loc=args.save_loc) 58 | 59 | if __name__ == '__main__': 60 | main() 61 | 62 | -------------------------------------------------------------------------------- /mjrl/utils/optimize_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | def fit_data(model, x, y, optimizer, loss_func, batch_size, epochs): 8 | """ 9 | :param model: pytorch model of form y_hat = f(x) (class) 10 | :param x: inputs to the model (tensor) 11 | :param y: desired outputs or targets (tensor) 12 | :param optimizer: optimizer to be used (class) 13 | :param loss_func: loss criterion (callable) 14 | :param batch_size: mini-batch size for optimization (int) 15 | :param epochs: number of epochs (int) 16 | :return: 17 | """ 18 | 19 | num_samples = x.shape[0] 20 | epoch_losses = [] 21 | for ep in range(epochs): 22 | rand_idx = torch.LongTensor(np.random.permutation(num_samples)) 23 | ep_loss = 0.0 24 | num_steps = int(num_samples / batch_size) - 1 25 | for mb in range(num_steps): 26 | data_idx = rand_idx[mb*batch_size:(mb+1)*batch_size] 27 | batch_x = x[data_idx] 
28 | batch_y = y[data_idx] 29 | optimizer.zero_grad() 30 | yhat = model(batch_x) 31 | loss = loss_func(yhat, batch_y) 32 | loss.backward() 33 | optimizer.step() 34 | ep_loss += loss.detach() 35 | epoch_losses.append(ep_loss.to('cpu').data.numpy().ravel() / num_steps) 36 | return epoch_losses 37 | -------------------------------------------------------------------------------- /mjrl/utils/plot_from_logs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pickle 4 | import numpy as np 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import matplotlib.pyplot as plt 8 | colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] 9 | 10 | parser = argparse.ArgumentParser(description='Script to explore the data generated by an experiment.') 11 | parser.add_argument('--data', '-d', type=str, required=True, help='location of the .pickle log data file') 12 | parser.add_argument('--output', '-o', type=str, required=True, help='location to store results as a png') 13 | parser.add_argument('--xkey', '-x', type=str, default=None, help='the key to use for x axis in plots') 14 | parser.add_argument('--xscale', '-s', type=int, default=1, help='scaling for the x axis (optional)') 15 | args = parser.parse_args() 16 | 17 | # get inputs and setup output file 18 | if '.png' in args.output: 19 | OUT_FILE = args.output 20 | else: 21 | OUT_FILE = args.output + '/plot.png' 22 | data = pickle.load(open(args.data, 'rb')) 23 | xscale = 1 if args.xscale is None else args.xscale 24 | if args.xkey == 'num_samples': 25 | xscale = xscale if 'act_repeat' not in data.keys() else data['act_repeat'][-1] 26 | 27 | dict_keys = list(data.keys()) 28 | for k in dict_keys: 29 | if len(data[k]) == 1: del(data[k]) 30 | 31 | # plot layout 32 | nplt = len(data.keys()) 33 | ncol = 4 34 | nrow = int(np.ceil(nplt/ncol)) 35 | 36 | # plot data 37 | xkey = args.xkey 38 | start_idx = 2 39 | end_idx = max([len(data[k]) for k in data.keys()]) 40 | xdata = np.arange(end_idx) if (xkey is None or xkey == 'None') else \ 41 | [np.sum(data[xkey][:i+1]) * xscale for i in range(len(data[xkey]))] 42 | 43 | # make the plot 44 | plt.figure(figsize=(15,15), dpi=60) 45 | for idx, key in enumerate(data.keys()): 46 | plt.subplot(nrow, ncol, idx+1) 47 | plt.tight_layout() 48 | try: 49 | last_idx = min(end_idx, len(data[key])) 50 | plt.plot(xdata[start_idx:last_idx], data[key][start_idx:last_idx], color=colors[idx%7], linewidth=3) 51 | except: 52 | pass 53 | plt.title(key) 54 | 55 | plt.savefig(OUT_FILE, dpi=100, bbox_inches="tight") 56 | -------------------------------------------------------------------------------- /mjrl/utils/process_samples.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def compute_returns(paths, gamma): 4 | for path in paths: 5 | path["returns"] = discount_sum(path["rewards"], gamma) 6 | 7 | def compute_advantages(paths, baseline, gamma, gae_lambda=None, normalize=False): 8 | # compute and store returns, advantages, and baseline 9 | # standard mode 10 | if gae_lambda == None or gae_lambda < 0.0 or gae_lambda > 1.0: 11 | for path in paths: 12 | path["baseline"] = baseline.predict(path) 13 | path["advantages"] = path["returns"] - path["baseline"] 14 | if normalize: 15 | alladv = np.concatenate([path["advantages"] for path in paths]) 16 | mean_adv = alladv.mean() 17 | std_adv = alladv.std() 18 | for path in paths: 19 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8) 20 | # GAE mode 21 | 
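    # Generalized Advantage Estimation: b1 is the baseline sequence padded with a
    # bootstrap value (zero if the path terminated, else the last prediction), so
    #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    #   A_t     = sum_l (gamma * gae_lambda)^l * delta_{t+l}
    # which is the discount_sum over td_deltas computed below.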
else: 22 | for path in paths: 23 | b = path["baseline"] = baseline.predict(path) 24 | if b.ndim == 1: 25 | b1 = np.append(path["baseline"], 0.0 if path["terminated"] else b[-1]) 26 | else: 27 | b1 = np.vstack((b, np.zeros(b.shape[1]) if path["terminated"] else b[-1])) 28 | td_deltas = path["rewards"] + gamma*b1[1:] - b1[:-1] 29 | path["advantages"] = discount_sum(td_deltas, gamma*gae_lambda) 30 | if normalize: 31 | alladv = np.concatenate([path["advantages"] for path in paths]) 32 | mean_adv = alladv.mean() 33 | std_adv = alladv.std() 34 | for path in paths: 35 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8) 36 | 37 | def discount_sum(x, gamma, terminal=0.0): 38 | y = [] 39 | run_sum = terminal 40 | for t in range( len(x)-1, -1, -1): 41 | run_sum = x[t] + gamma*run_sum 42 | y.append(run_sum) 43 | 44 | return np.array(y[::-1]) -------------------------------------------------------------------------------- /mjrl/utils/tensor_utils.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | import numpy as np 4 | 5 | 6 | def flatten_tensors(tensors): 7 | if len(tensors) > 0: 8 | return np.concatenate([np.reshape(x, [-1]) for x in tensors]) 9 | else: 10 | return np.asarray([]) 11 | 12 | 13 | def unflatten_tensors(flattened, tensor_shapes): 14 | tensor_sizes = list(map(np.prod, tensor_shapes)) 15 | indices = np.cumsum(tensor_sizes)[:-1] 16 | return [np.reshape(pair[0], pair[1]) for pair in zip(np.split(flattened, indices), tensor_shapes)] 17 | 18 | 19 | def pad_tensor(x, max_len, mode='zero'): 20 | padding = np.zeros_like(x[0]) 21 | if mode == 'last': 22 | padding = x[-1] 23 | return np.concatenate([ 24 | x, 25 | np.tile(padding, (max_len - len(x),) + (1,) * np.ndim(x[0])) 26 | ]) 27 | 28 | 29 | def pad_tensor_n(xs, max_len): 30 | ret = np.zeros((len(xs), max_len) + xs[0].shape[1:], dtype=xs[0].dtype) 31 | for idx, x in enumerate(xs): 32 | ret[idx][:len(x)] = x 33 | return ret 34 | 35 | 36 | def pad_tensor_dict(tensor_dict, max_len, mode='zero'): 37 | keys = list(tensor_dict.keys()) 38 | ret = dict() 39 | for k in keys: 40 | if isinstance(tensor_dict[k], dict): 41 | ret[k] = pad_tensor_dict(tensor_dict[k], max_len, mode=mode) 42 | else: 43 | ret[k] = pad_tensor(tensor_dict[k], max_len, mode=mode) 44 | return ret 45 | 46 | 47 | def flatten_first_axis_tensor_dict(tensor_dict): 48 | keys = list(tensor_dict.keys()) 49 | ret = dict() 50 | for k in keys: 51 | if isinstance(tensor_dict[k], dict): 52 | ret[k] = flatten_first_axis_tensor_dict(tensor_dict[k]) 53 | else: 54 | old_shape = tensor_dict[k].shape 55 | ret[k] = tensor_dict[k].reshape((-1,) + old_shape[2:]) 56 | return ret 57 | 58 | 59 | def high_res_normalize(probs): 60 | return [x / sum(map(float, probs)) for x in list(map(float, probs))] 61 | 62 | 63 | def stack_tensor_list(tensor_list): 64 | return np.array(tensor_list) 65 | # tensor_shape = np.array(tensor_list[0]).shape 66 | # if tensor_shape is tuple(): 67 | # return np.array(tensor_list) 68 | # return np.vstack(tensor_list) 69 | 70 | 71 | def stack_tensor_dict_list(tensor_dict_list): 72 | """ 73 | Stack a list of dictionaries of {tensors or dictionary of tensors}. 74 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}. 
75 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors} 76 | """ 77 | keys = list(tensor_dict_list[0].keys()) 78 | ret = dict() 79 | for k in keys: 80 | example = tensor_dict_list[0][k] 81 | if isinstance(example, dict): 82 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list]) 83 | else: 84 | v = stack_tensor_list([x[k] for x in tensor_dict_list]) 85 | ret[k] = v 86 | return ret 87 | 88 | 89 | def concat_tensor_list_subsample(tensor_list, f): 90 | return np.concatenate( 91 | [t[np.random.choice(len(t), int(np.ceil(len(t) * f)), replace=False)] for t in tensor_list], axis=0) 92 | 93 | 94 | def concat_tensor_dict_list_subsample(tensor_dict_list, f): 95 | keys = list(tensor_dict_list[0].keys()) 96 | ret = dict() 97 | for k in keys: 98 | example = tensor_dict_list[0][k] 99 | if isinstance(example, dict): 100 | v = concat_tensor_dict_list_subsample([x[k] for x in tensor_dict_list], f) 101 | else: 102 | v = concat_tensor_list_subsample([x[k] for x in tensor_dict_list], f) 103 | ret[k] = v 104 | return ret 105 | 106 | 107 | def concat_tensor_list(tensor_list): 108 | return np.concatenate(tensor_list, axis=0) 109 | 110 | 111 | def concat_tensor_dict_list(tensor_dict_list): 112 | keys = list(tensor_dict_list[0].keys()) 113 | ret = dict() 114 | for k in keys: 115 | example = tensor_dict_list[0][k] 116 | if isinstance(example, dict): 117 | v = concat_tensor_dict_list([x[k] for x in tensor_dict_list]) 118 | else: 119 | v = concat_tensor_list([x[k] for x in tensor_dict_list]) 120 | ret[k] = v 121 | return ret 122 | 123 | 124 | def split_tensor_dict_list(tensor_dict): 125 | keys = list(tensor_dict.keys()) 126 | ret = None 127 | for k in keys: 128 | vals = tensor_dict[k] 129 | if isinstance(vals, dict): 130 | vals = split_tensor_dict_list(vals) 131 | if ret is None: 132 | ret = [{k: v} for v in vals] 133 | else: 134 | for v, cur_dict in zip(vals, ret): 135 | cur_dict[k] = v 136 | return ret 137 | 138 | 139 | def truncate_tensor_list(tensor_list, truncated_len): 140 | return tensor_list[:truncated_len] 141 | 142 | 143 | def truncate_tensor_dict(tensor_dict, truncated_len): 144 | ret = dict() 145 | for k, v in tensor_dict.items(): 146 | if isinstance(v, dict): 147 | ret[k] = truncate_tensor_dict(v, truncated_len) 148 | else: 149 | ret[k] = truncate_tensor_list(v, truncated_len) 150 | return ret 151 | -------------------------------------------------------------------------------- /mjrl/utils/train_agent.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | 4 | from tabulate import tabulate 5 | from mjrl.utils.make_train_plots import make_train_plots 6 | from mjrl.utils.gym_env import GymEnv 7 | from mjrl.samplers.core import sample_paths 8 | import numpy as np 9 | import pickle 10 | import time as timer 11 | import os 12 | import copy 13 | 14 | 15 | def _load_latest_policy_and_logs(agent, *, policy_dir, logs_dir): 16 | """Loads the latest policy. 17 | Returns the next step number to begin with. 
18 | """ 19 | assert os.path.isdir(policy_dir), str(policy_dir) 20 | assert os.path.isdir(logs_dir), str(logs_dir) 21 | 22 | log_csv_path = os.path.join(logs_dir, 'log.csv') 23 | if not os.path.exists(log_csv_path): 24 | return 0 # fresh start 25 | 26 | print("Reading: {}".format(log_csv_path)) 27 | agent.logger.read_log(log_csv_path) 28 | last_step = agent.logger.max_len - 1 29 | if last_step <= 0: 30 | return 0 # fresh start 31 | 32 | 33 | # find latest policy/baseline 34 | i = last_step 35 | while i >= 0: 36 | policy_path = os.path.join(policy_dir, 'policy_{}.pickle'.format(i)) 37 | baseline_path = os.path.join(policy_dir, 'baseline_{}.pickle'.format(i)) 38 | 39 | if not os.path.isfile(policy_path): 40 | i = i -1 41 | continue 42 | else: 43 | print("Loaded last saved iteration: {}".format(i)) 44 | 45 | with open(policy_path, 'rb') as fp: 46 | agent.policy = pickle.load(fp) 47 | with open(baseline_path, 'rb') as fp: 48 | agent.baseline = pickle.load(fp) 49 | 50 | # additional 51 | # global_status_path = os.path.join(policy_dir, 'global_status.pickle') 52 | # with open(global_status_path, 'rb') as fp: 53 | # agent.load_global_status( pickle.load(fp) ) 54 | 55 | agent.logger.shrink_to(i + 1) 56 | assert agent.logger.max_len == i + 1 57 | return agent.logger.max_len 58 | 59 | # cannot find any saved policy 60 | raise RuntimeError("Log file exists, but cannot find any saved policy.") 61 | 62 | def train_agent(job_name, agent, 63 | seed = 0, 64 | niter = 101, 65 | gamma = 0.995, 66 | gae_lambda = None, 67 | num_cpu = 1, 68 | sample_mode = 'trajectories', 69 | num_traj = 50, 70 | num_samples = 50000, # has precedence, used with sample_mode = 'samples' 71 | save_freq = 10, 72 | evaluation_rollouts = None, 73 | plot_keys = ['stoc_pol_mean'], 74 | ): 75 | 76 | np.random.seed(seed) 77 | if os.path.isdir(job_name) == False: 78 | os.mkdir(job_name) 79 | previous_dir = os.getcwd() 80 | os.chdir(job_name) # important! we are now in the directory to save data 81 | if os.path.isdir('iterations') == False: os.mkdir('iterations') 82 | if os.path.isdir('logs') == False and agent.save_logs == True: os.mkdir('logs') 83 | best_policy = copy.deepcopy(agent.policy) 84 | best_perf = -1e8 85 | train_curve = best_perf*np.ones(niter) 86 | mean_pol_perf = 0.0 87 | e = GymEnv(agent.env.env_id) 88 | 89 | # Load from any existing checkpoint, policy, statistics, etc. 90 | # Why no checkpointing.. 
:( 91 | i_start = _load_latest_policy_and_logs(agent, 92 | policy_dir='iterations', 93 | logs_dir='logs') 94 | if i_start: 95 | print("Resuming from an existing job folder ...") 96 | 97 | for i in range(i_start, niter): 98 | print("......................................................................................") 99 | print("ITERATION : %i " % i) 100 | 101 | if train_curve[i-1] > best_perf: 102 | best_policy = copy.deepcopy(agent.policy) 103 | best_perf = train_curve[i-1] 104 | 105 | N = num_traj if sample_mode == 'trajectories' else num_samples 106 | args = dict(N=N, sample_mode=sample_mode, gamma=gamma, gae_lambda=gae_lambda, num_cpu=num_cpu) 107 | stats = agent.train_step(**args) 108 | train_curve[i] = stats[0] 109 | 110 | if evaluation_rollouts is not None and evaluation_rollouts > 0: 111 | print("Performing evaluation rollouts ........") 112 | eval_paths = sample_paths(num_traj=evaluation_rollouts, policy=agent.policy, num_cpu=num_cpu, 113 | env=e.env_id, eval_mode=True, base_seed=seed) 114 | mean_pol_perf = np.mean([np.sum(path['rewards']) for path in eval_paths]) 115 | if agent.save_logs: 116 | agent.logger.log_kv('eval_score', mean_pol_perf) 117 | try: 118 | eval_success = e.env.env.evaluate_success(eval_paths) 119 | agent.logger.log_kv('eval_success', eval_success) 120 | except: 121 | pass 122 | 123 | if i % save_freq == 0 and i > 0: 124 | if agent.save_logs: 125 | agent.logger.save_log('logs/') 126 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/') 127 | policy_file = 'policy_%i.pickle' % i 128 | baseline_file = 'baseline_%i.pickle' % i 129 | pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb')) 130 | pickle.dump(agent.baseline, open('iterations/' + baseline_file, 'wb')) 131 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb')) 132 | # pickle.dump(agent.global_status, open('iterations/global_status.pickle', 'wb')) 133 | 134 | # print results to console 135 | if i == 0: 136 | result_file = open('results.txt', 'w') 137 | print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n") 138 | result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n") 139 | result_file.close() 140 | print("[ %s ] %4i %5.2f %5.2f %5.2f " % (timer.asctime(timer.localtime(timer.time())), 141 | i, train_curve[i], mean_pol_perf, best_perf)) 142 | result_file = open('results.txt', 'a') 143 | result_file.write("%4i %5.2f %5.2f %5.2f \n" % (i, train_curve[i], mean_pol_perf, best_perf)) 144 | result_file.close() 145 | if agent.save_logs: 146 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1, 147 | agent.logger.get_current_log().items())) 148 | print(tabulate(print_data)) 149 | 150 | # final save 151 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb')) 152 | if agent.save_logs: 153 | agent.logger.save_log('logs/') 154 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/') 155 | os.chdir(previous_dir) 156 | -------------------------------------------------------------------------------- /mjrl/utils/visualize_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import mjrl.envs 3 | import click 4 | import os 5 | import gym 6 | import numpy as np 7 | import pickle 8 | from mjrl.utils.gym_env import GymEnv 9 | from mjrl.policies.gaussian_mlp import MLP 10 | import trajopt.envs 11 | 12 | DESC = ''' 13 | Helper script to visualize policy (in mjrl format).\n 14 | USAGE:\n 15 | Visualizes policy on the env\n 16 | $ python 
utils/visualize_policy --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n 17 | ''' 18 | 19 | # MAIN ========================================================= 20 | @click.command(help=DESC) 21 | @click.option('--env_name', type=str, help='environment to load', required= True) 22 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None) 23 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation') 24 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123) 25 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10) 26 | 27 | def main(env_name, policy, mode, seed, episodes): 28 | e = GymEnv(env_name) 29 | e.set_seed(seed) 30 | if policy is not None: 31 | pi = pickle.load(open(policy, 'rb')) 32 | else: 33 | pi = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=-1.0) 34 | # render policy 35 | e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode) 36 | 37 | if __name__ == '__main__': 38 | main() 39 | 40 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from setuptools import setup, find_packages 4 | 5 | print("Installing mjrl. \n Package intended for use with provided conda env. See setup instructions here: https://github.com/aravindr93/mjrl/tree/master/setup") 6 | 7 | if sys.version_info.major != 3: 8 | print("This Python is only compatible with Python 3, but you are running " 9 | "Python {}. The installation will likely fail.".format(sys.version_info.major)) 10 | 11 | def read(fname): 12 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 13 | 14 | setup( 15 | name='mjrl', 16 | version='1.0.0', 17 | packages=find_packages(), 18 | description='RL algorithms for environments in MuJoCo', 19 | long_description=read('README.md'), 20 | url='https://github.com/aravindr93/mjrl.git', 21 | author='Aravind Rajeswaran', 22 | ) 23 | -------------------------------------------------------------------------------- /setup/README.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | A short guide to install this package is below. The package relies on `mujoco-py` which might be the trickiest part of the installation. See `known issues` below and also instructions from the mujoco-py [page](https://github.com/openai/mujoco-py) if you are stuck with mujoco-py installation. 4 | 5 | The package can handle both `MuJoCo v1.5` as well as `MuJoCo v2.0`, but the former is not supported for future updates. We encourage you to use v2.0. 6 | 7 | ## Linux 8 | 9 | - Download MuJoCo v2.0 binaries from the official [website](http://www.mujoco.org/) and also obtain the license key. 10 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200`, and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`. Note that unzip of the MuJoCo binaries will generate `mujoco200_linux`. You need to rename the directory and place it at `~/.mujoco/mujoco200`. 
11 | - Install osmesa-related dependencies: 12 | ``` 13 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev build-essential libglfw3 14 | ``` 15 | - Update `bashrc` by adding the following lines and source it 16 | ``` 17 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH" 18 | export MUJOCO_PY_FORCE_CPU=True 19 | alias MJPL='LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so:/usr/lib/nvidia-384/libGL.so' 20 | ``` 21 | - Install this package using 22 | ``` 23 | $ conda update conda 24 | $ cd path/to/mjrl 25 | $ conda env create -f setup/env.yml 26 | $ source activate mjrl-env 27 | $ pip install -e . 28 | ``` 29 | - *NOTE 1:* If there are issues with install of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly based on the specific version of CUDA (or CPU-only) you have. 30 | 31 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info. 32 | 33 | ## Mac OS 34 | 35 | - Download MuJoCo binaries from the official [website](http://www.mujoco.org/) and also obtain the license key. 36 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200` (rename unzipped directory to this), and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`. 37 | - Update `bashrc` by adding the following lines and source it 38 | ``` 39 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH" 40 | ``` 41 | - Install this package using 42 | ``` 43 | $ conda update conda 44 | $ cd path/to/mjrl 45 | $ conda env create -f setup/env.yml 46 | $ source activate mjrl-env 47 | $ pip install -e . 48 | ``` 49 | 50 | - *NOTE 1:* If there are issues with install of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly. 51 | 52 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info. 53 | 54 | 55 | ## Known Issues 56 | 57 | - Visualization in Linux: If the Linux system has a GPU, then mujoco-py does not automatically preload the correct drivers. We added an alias `MJPL` in bashrc (see instructions) which stands for MuJoCo pre-load. When running any Python script that requires rendering, prepend the execution with MJPL. 58 | ``` 59 | $ MJPL python script.py 60 | ``` 61 | 62 | - Errors related to osmesa during installation. This is a `mujoco-py` build error and would likely go away if the following command is used before creating the conda environment. If the problem still persists, please contact the developers of mujoco-py. 63 | ``` 64 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev 65 | ``` 66 | 67 | - If conda environment creation gets interrupted for some reason, you can resume it with the following: 68 | ``` 69 | $ conda env update -n mjrl-env -f setup/env.yml 70 | ``` 71 | 72 | - GCC error in Mac OS: If you get a GCC error from mujoco-py, you can get the correct version mujoco-py expects with `brew install gcc --without-multilib`. This may require uninstalling other versions of GCC that may have been previously installed with `brew remove gcc@6` for example. You can see which brew packages were already installed with `brew list`. 
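Once the steps above are done, a quick way to confirm that `mujoco-py`, `gym`, and `mjrl` work together is a short smoke test like the one below (a suggested check, not a script shipped with the package; run it inside the `mjrl-env` environment):
```
import mjrl.envs                      # registers the bundled mjrl_* environments
from mjrl.utils.gym_env import GymEnv

e = GymEnv('mjrl_swimmer-v0')
e.reset()
obs, reward, done, info = e.step(e.action_space.sample())
print("obs dim:", e.observation_dim, "| act dim:", e.action_dim, "| reward:", reward)
```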
73 | 
74 | 
--------------------------------------------------------------------------------
/setup/env.yml:
--------------------------------------------------------------------------------
1 | name: mjrl-env
2 | channels:
3 | - pytorch
4 | - defaults
5 | dependencies:
6 | - python=3.7
7 | - pip
8 | - ipython
9 | - mkl-service
10 | - pytorch==1.4
11 | - tabulate
12 | - termcolor
13 | - torchvision
14 | - patchelf
15 | - pip:
16 |   - click
17 |   - cloudpickle
18 |   - gym==0.13
19 |   - ipdb
20 |   - matplotlib
21 |   - mujoco-py<2.1,>=2.0
22 |   - pip
23 |   - pyyaml
24 |   - tqdm
25 |   - wheel
26 |   - scipy
27 |   - transforms3d
28 | 
--------------------------------------------------------------------------------
/tests/hydra/config/hydra_npg_config.yaml:
--------------------------------------------------------------------------------
1 | # general outputs
2 | job_name : 'hydra_npg_test'
3 | 
4 | # general inputs
5 | env : Hopper-v3
6 | algorithm : NPG
7 | seed : 123
8 | sample_mode : samples
9 | rl_num_samples : 1000
10 | rl_num_traj : 0
11 | rl_num_iter : 2
12 | num_cpu : 4
13 | save_freq : 5
14 | eval_rollouts : 0
15 | exp_notes : 'Example config for training policy with NPG on the OpenAI gym Hopper-v3 task.'
16 | 
17 | # RL parameters (all params related to PG, value function etc.)
18 | policy_size : (32, 32)
19 | init_log_std : -0.5
20 | vf_hidden_size : (128, 128)
21 | vf_batch_size : 64
22 | vf_epochs : 2
23 | vf_learn_rate : 1e-3
24 | rl_step_size : 0.05
25 | rl_gamma : 0.995
26 | rl_gae : 0.97
27 | 
28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used)
29 | 
30 | alg_hyper_params : {}
31 | 
32 | hydra:
33 |   launcher:
34 |     cpus_per_task: 12
35 |     gpus_per_node: 0
36 |     tasks_per_node: 1
37 |   run:
38 |     dir: ./outputs/${hydra.job.name}/${now:%Y-%m-%d_%H-%M-%S}
39 |   sweep:
40 |     dir: /checkpoint/${env:USER}/outputs/${job_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
41 |     subdir: ${hydra.job.num}_${hydra.job.override_dirname}
--------------------------------------------------------------------------------
/tests/hydra/hydra_policy_opt_job_script.py:
--------------------------------------------------------------------------------
1 | """
2 | This is a job script for running policy gradient algorithms on gym tasks.
3 | Separate job scripts are provided to run a few other algorithms:
4 | - For DAPG see here: https://github.com/aravindr93/hand_dapg/tree/master/dapg/examples
5 | - For model-based NPG see here: https://github.com/aravindr93/mjrl/tree/master/mjrl/algos/model_accel
6 | """
7 | 
8 | from mjrl.utils.gym_env import GymEnv
9 | from mjrl.policies.gaussian_mlp import MLP
10 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
11 | from mjrl.baselines.mlp_baseline import MLPBaseline
12 | from mjrl.algos.npg_cg import NPG
13 | from mjrl.algos.batch_reinforce import BatchREINFORCE
14 | from mjrl.algos.ppo_clip import PPO
15 | from mjrl.utils.train_agent import train_agent
16 | import os
17 | import json
18 | import gym
19 | import mjrl.envs
20 | # import mj_envs
21 | import time as timer
22 | import pickle
23 | import hydra
24 | from omegaconf import DictConfig, OmegaConf
25 | 
26 | # ===============================================================================
27 | # Process Inputs
28 | # ===============================================================================
29 | def preprocess(job_data):
30 |     if not os.path.exists(job_data.job_name):
31 |         os.mkdir(job_data.job_name)
32 |     assert 'algorithm' in job_data.keys()
33 |     assert any([job_data.algorithm == a for a in ['NPG', 'NVPG', 'VPG', 'PPO']])
34 |     assert 'sample_mode' in job_data.keys()
35 |     job_data.alg_hyper_params = dict() if 'alg_hyper_params' not in job_data.keys() else job_data.alg_hyper_params
36 | 
37 |     EXP_FILE = job_data.job_name + '/job_config.json'
38 |     with open(EXP_FILE, 'w') as fp:
39 |         # json.dump(job_data, fp, indent=4)
40 |         OmegaConf.save(config=job_data, f=fp.name)
41 | 
42 |     if job_data.sample_mode == 'trajectories':
43 |         assert 'rl_num_traj' in job_data.keys()
44 |         job_data.rl_num_samples = 0  # will be ignored
45 |     elif job_data.sample_mode == 'samples':
46 |         assert 'rl_num_samples' in job_data.keys()
47 |         job_data.rl_num_traj = 0  # will be ignored
48 |     else:
49 |         print("Unknown sampling mode. Choose either trajectories or samples")
50 |         exit()
51 | 
52 | # ===============================================================================
53 | # Train Loop
54 | # ===============================================================================
55 | @hydra.main(config_name="hydra_npg_config", config_path="config")
56 | def train_loop(job_data: DictConfig) -> None:
57 |     print("========================================")
58 |     print("Job Configuration")
59 |     print("========================================")
60 |     preprocess(job_data)
61 |     print(OmegaConf.to_yaml(job_data))
62 | 
63 |     e = GymEnv(job_data.env)
64 |     policy_size = tuple(eval(job_data.policy_size))
65 |     vf_hidden_size = tuple(eval(job_data.vf_hidden_size))
66 | 
67 |     policy = MLP(e.spec, hidden_sizes=policy_size, seed=job_data.seed, init_log_std=job_data.init_log_std)
68 |     baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=job_data.vf_batch_size, hidden_sizes=vf_hidden_size,
69 |                            epochs=job_data.vf_epochs, learn_rate=job_data.vf_learn_rate)
70 | 
71 |     # Construct the algorithm
72 |     if job_data.algorithm == 'NPG':
73 |         # Other hyperparameters (like the number of CG steps) can be specified in the config for pass-through,
74 |         # or default hyperparameters will be used
75 |         agent = NPG(e, policy, baseline, normalized_step_size=job_data.rl_step_size,
76 |                     seed=job_data.seed, save_logs=True, **job_data.alg_hyper_params)
77 | 
78 |     elif job_data.algorithm == 'VPG':
79 |         agent = BatchREINFORCE(e, policy, baseline, learn_rate=job_data.rl_step_size,
80 |                                seed=job_data.seed, save_logs=True, **job_data.alg_hyper_params)
81 | 
82 |     elif job_data.algorithm == 'NVPG':
83 |         agent = BatchREINFORCE(e, policy, baseline, desired_kl=job_data.rl_step_size,
84 |                                seed=job_data.seed, save_logs=True, **job_data.alg_hyper_params)
85 | 
86 |     elif job_data.algorithm == 'PPO':
87 |         # There are many hyperparameters for PPO. They can be specified in the config for pass-through,
88 |         # or defaults in the PPO algorithm will be used
89 |         agent = PPO(e, policy, baseline, save_logs=True, **job_data.alg_hyper_params)
90 |     else:
91 |         raise NotImplementedError("Algorithm not found")
92 | 
93 |     print("========================================")
94 |     print("Starting policy learning")
95 |     print("========================================")
96 | 
97 |     ts = timer.time()
98 |     train_agent(job_name=job_data.job_name,
99 |                 agent=agent,
100 |                 seed=job_data.seed,
101 |                 niter=job_data.rl_num_iter,
102 |                 gamma=job_data.rl_gamma,
103 |                 gae_lambda=job_data.rl_gae,
104 |                 num_cpu=job_data.num_cpu,
105 |                 sample_mode=job_data.sample_mode,
106 |                 num_traj=job_data.rl_num_traj,
107 |                 num_samples=job_data.rl_num_samples,
108 |                 save_freq=job_data.save_freq,
109 |                 evaluation_rollouts=job_data.eval_rollouts)
110 |     print("========================================")
111 |     print("Job Finished. Time taken = %f" % (timer.time()-ts))
112 |     print("========================================")
113 | 
114 | if __name__ == "__main__":
115 |     train_loop()
--------------------------------------------------------------------------------
/tests/point_mass_test.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
4 | from mjrl.baselines.mlp_baseline import MLPBaseline
5 | from mjrl.algos.npg_cg import NPG
6 | from mjrl.utils.train_agent import train_agent
7 | import mjrl.envs
8 | import time as timer
9 | SEED = 500
10 | 
11 | e = GymEnv('mjrl_point_mass-v0')
12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
13 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=10, learn_rate=1e-3)
14 | agent = NPG(e, policy, baseline, normalized_step_size=0.05, seed=SEED, save_logs=True)
15 | 
16 | ts = timer.time()
17 | train_agent(job_name='point_mass_exp1',
18 |             agent=agent,
19 |             seed=SEED,
20 |             niter=50,
21 |             gamma=0.95,
22 |             gae_lambda=0.97,
23 |             num_cpu=1,
24 |             sample_mode='trajectories',
25 |             num_traj=40,  # samples = 40*25 = 1000
26 |             save_freq=5,
27 |             evaluation_rollouts=None,
28 |             plot_keys=['stoc_pol_mean', 'running_score'])
29 | print("time taken = %f" % (timer.time()-ts))
30 | 
--------------------------------------------------------------------------------
/tests/visualizer_test.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
4 | from mjrl.baselines.mlp_baseline import MLPBaseline
5 | from mjrl.algos.npg_cg import NPG
6 | from mjrl.utils.train_agent import train_agent
7 | import mjrl.envs
8 | import time as timer
9 | SEED = 500
10 | 
11 | e = GymEnv('mjrl_point_mass-v0')
12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
13 | baseline = QuadraticBaseline(e.spec)
14 | agent = NPG(e, policy, baseline, normalized_step_size=0.5, seed=SEED, save_logs=True)
15 | 
16 | ts = timer.time()
17 | train_agent(job_name='vis_exp',
18 |             agent=agent,
19 |             seed=SEED,
20 |             niter=10,
21 |             gamma=0.95,
22 |             gae_lambda=0.97,
23 |             num_cpu=1,
24 |             sample_mode='trajectories',
25 |             num_traj=100,
26 |             save_freq=5,
27 |             evaluation_rollouts=None)
28 | print("time taken = %f" % (timer.time()-ts))
29 | e.visualize_policy(policy, num_episodes=5, horizon=e.horizon, mode='exploration')
30 | 
--------------------------------------------------------------------------------
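Usage note: the test scripts above double as minimal end-to-end examples. The commands below are a sketch of how they might be launched, not documented invocations; the working directories and the Hydra override syntax are assumptions based on the relative `config_path="config"` setting in the job script and standard Hydra behavior.
```
# plain NPG training on the point-mass task (run from the tests/ directory)
$ python point_mass_test.py
$ MJPL python visualizer_test.py   # trains briefly, then renders the policy (MJPL needed on Linux + GPU)

# Hydra-configured job: defaults come from config/hydra_npg_config.yaml,
# individual keys can be overridden on the command line (run from tests/hydra/)
$ python hydra_policy_opt_job_script.py env=Hopper-v3 seed=456 rl_num_iter=10
```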