├── .gitignore ├── LICENSE ├── README.md ├── examples ├── README.md ├── behavior_clone.py ├── example_configs │ ├── hopper_npg.txt │ ├── swimmer_npg.txt │ └── swimmer_ppo.txt ├── linear_nn_comparison.py └── policy_opt_job_script.py ├── mjrl ├── __init__.py ├── algos │ ├── __init__.py │ ├── batch_reinforce.py │ ├── behavior_cloning.py │ ├── dapg.py │ ├── mbac.py │ ├── model_accel │ │ ├── __init__.py │ │ ├── model_accel_npg.py │ │ ├── model_learning_mpc.py │ │ ├── nn_dynamics.py │ │ ├── run_experiments │ │ │ ├── configs │ │ │ │ ├── point_mass.txt │ │ │ │ └── reacher.txt │ │ │ ├── run_model_accel_npg.py │ │ │ ├── sandbox │ │ │ │ ├── example_config_mpc.txt │ │ │ │ └── run_model_learning_mpc.py │ │ │ └── utils │ │ │ │ ├── reward_functions │ │ │ │ ├── __init__.py │ │ │ │ └── mjrl_point_mass.py │ │ │ │ ├── visualize_policy.py │ │ │ │ └── visualize_trajectories.py │ │ └── sampling.py │ ├── npg_cg.py │ ├── ppo_clip.py │ └── trpo.py ├── baselines │ ├── __init__.py │ ├── linear_baseline.py │ ├── mlp_baseline.py │ ├── quadratic_baseline.py │ └── zero_baseline.py ├── envs │ ├── __init__.py │ ├── assets │ │ ├── peg_insertion.xml │ │ ├── point_mass.xml │ │ ├── sawyer.xml │ │ └── swimmer.xml │ ├── mujoco_env.py │ ├── peg_insertion_sawyer.py │ ├── point_mass.py │ ├── reacher_sawyer.py │ └── swimmer.py ├── policies │ ├── __init__.py │ ├── gaussian_linear.py │ ├── gaussian_mlp.py │ └── mpc_actor.py ├── samplers │ ├── __init__.py │ └── core.py └── utils │ ├── __init__.py │ ├── cg_solve.py │ ├── fc_network.py │ ├── get_environment.py │ ├── gym_env.py │ ├── logger.py │ ├── make_train_plots.py │ ├── optimize_model.py │ ├── plot_from_logs.py │ ├── process_samples.py │ ├── tensor_utils.py │ ├── train_agent.py │ └── visualize_policy.py ├── setup.py ├── setup ├── README.md └── env.yml └── tests ├── hydra ├── config │ └── hydra_npg_config.yaml └── hydra_policy_opt_job_script.py ├── point_mass_test.py └── visualizer_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # idea 104 | *.idea/ 105 | 106 | # Mac OSX files 107 | *.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RL for MuJoCo 2 | 3 | This package contains implementations of various RL algorithms for continuous control tasks simulated with [MuJoCo.](http://www.mujoco.org/) 4 | 5 | # Installation 6 | The main package dependencies are `MuJoCo`, `python=3.7`, `gym>=0.13`, `mujoco-py>=2.0`, and `pytorch>=1.0`. See `setup/README.md` ([link](https://github.com/aravindr93/mjrl/tree/master/setup#installation)) for detailed install instructions. 7 | 8 | # Bibliography 9 | If you find the package useful, please cite the following papers. 10 | ``` 11 | @INPROCEEDINGS{Rajeswaran-NIPS-17, 12 | AUTHOR = {Aravind Rajeswaran and Kendall Lowrey and Emanuel Todorov and Sham Kakade}, 13 | TITLE = "{Towards Generalization and Simplicity in Continuous Control}", 14 | BOOKTITLE = {NIPS}, 15 | YEAR = {2017}, 16 | } 17 | 18 | @INPROCEEDINGS{Rajeswaran-RSS-18, 19 | AUTHOR = {Aravind Rajeswaran AND Vikash Kumar AND Abhishek Gupta AND 20 | Giulia Vezzani AND John Schulman AND Emanuel Todorov AND Sergey Levine}, 21 | TITLE = "{Learning Complex Dexterous Manipulation with Deep Reinforcement Learning and Demonstrations}", 22 | BOOKTITLE = {Proceedings of Robotics: Science and Systems (RSS)}, 23 | YEAR = {2018}, 24 | } 25 | ``` 26 | 27 | # Credits 28 | This package is maintained by [Aravind Rajeswaran](http://homes.cs.washington.edu/~aravraj/) and other members of the [Movement Control Lab,](http://homes.cs.washington.edu/~todorov/) University of Washington Seattle. 29 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | Here we provide a job script to illustrate policy optimization with incremental learning methods like NPG and PPO. To run the experiments, use the commands below. The experiments are run through the job script provided, which takes two arguments: 4 | - `output`: path to directory where all the results will be saved 5 | - `config`: a config `.txt` file with all the experiment parameters (examples are provided) 6 | The script has to be run from this directory, i.e. `mjrl/examples`. 7 | 8 | 1. To train an NPG agent on a task shipped with `mjrl` (e.g.
swimmer) 9 | ``` 10 | $ python policy_opt_job_script.py --output swimmer_npg_exp --config example_configs/swimmer_npg.txt 11 | ``` 12 | 13 | 2. To train an NPG agent on an OpenAI gym benchmark task (e.g. hopper) 14 | ``` 15 | $ python policy_opt_job_script.py --output hopper_npg_exp --config example_configs/hopper_npg.txt 16 | ``` 17 | Note that since the Hopper env has termination conditions, we pick the sampling mode in the config to be `samples` rather than trajectories, so that per update we have 10K samples. 18 | 19 | 3. To train a PPO agent on the swimmer task 20 | ``` 21 | $ python policy_opt_job_script.py --output swimmer_ppo_exp --config example_configs/swimmer_ppo.txt 22 | ``` -------------------------------------------------------------------------------- /examples/behavior_clone.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 4 | from mjrl.baselines.mlp_baseline import MLPBaseline 5 | from mjrl.algos.npg_cg import NPG 6 | from mjrl.algos.behavior_cloning import BC 7 | from mjrl.utils.train_agent import train_agent 8 | from mjrl.samplers.core import sample_paths 9 | import mjrl.envs 10 | import time as timer 11 | import pickle 12 | SEED = 500 13 | 14 | # ------------------------------ 15 | # Train expert policy first 16 | e = GymEnv('mjrl_swimmer-v0') 17 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 18 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=5, learn_rate=1e-3) 19 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 20 | 21 | ts = timer.time() 22 | print("========================================") 23 | print("Training expert policy") 24 | print("========================================") 25 | train_agent(job_name='swimmer_exp1', 26 | agent=agent, 27 | seed=SEED, 28 | niter=50, 29 | gamma=0.995, 30 | gae_lambda=0.97, 31 | num_cpu=1, 32 | sample_mode='trajectories', 33 | num_traj=10, 34 | save_freq=5, 35 | evaluation_rollouts=None) 36 | print("========================================") 37 | print("Expert policy training complete !!!") 38 | print("========================================") 39 | print("time taken = %f" % (timer.time()-ts)) 40 | print("========================================") 41 | 42 | # ------------------------------ 43 | # Get demonstrations 44 | print("========================================") 45 | print("Collecting expert demonstrations") 46 | print("========================================") 47 | expert_pol = pickle.load(open('swimmer_exp1/iterations/best_policy.pickle', 'rb')) 48 | demo_paths = sample_paths(num_traj=5, policy=expert_pol, env=e.env_id) 49 | 50 | # ------------------------------ 51 | # Train BC 52 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 53 | bc_agent = BC(demo_paths, policy=policy, epochs=20, batch_size=64, lr=1e-3) # will use Adam by default 54 | ts = timer.time() 55 | print("========================================") 56 | print("Running BC with expert demonstrations") 57 | print("========================================") 58 | bc_agent.train() 59 | print("========================================") 60 | print("BC training complete !!!") 61 | print("time taken = %f" % (timer.time()-ts)) 62 | print("========================================") 63 | 64 | # ------------------------------ 65 | # Evaluate Policies 66 | bc_pol_score = e.evaluate_policy(policy, 
num_episodes=5, mean_action=True) 67 | expert_score = e.evaluate_policy(expert_pol, num_episodes=5, mean_action=True) 68 | print("Expert policy performance (eval mode) = %f" % expert_score[0][0]) 69 | print("BC policy performance (eval mode) = %f" % bc_pol_score[0][0]) 70 | -------------------------------------------------------------------------------- /examples/example_configs/hopper_npg.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'Hopper-v3', 6 | 'algorithm' : 'NPG', 7 | 'seed' : 123, 8 | 'sample_mode' : 'samples', 9 | 'rl_num_samples' : 10000, 10 | 'rl_num_iter' : 100, 11 | 'num_cpu' : 1, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with NPG on the OpenAI gym Hopper-v3 task.', 15 | 16 | # RL parameters (all params related to PG, value function etc.) 17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.05, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(), 31 | 32 | } 33 | 34 | -------------------------------------------------------------------------------- /examples/example_configs/swimmer_npg.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'mjrl_swimmer-v0', 6 | 'algorithm' : 'NPG', 7 | 'seed' : 123, 8 | 'sample_mode' : 'trajectories', 9 | 'rl_num_traj' : 10, 10 | 'rl_num_iter' : 50, 11 | 'num_cpu' : 2, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with NPG on the mjrl swimmer task.', 15 | 16 | # RL parameters (all params related to PG, value function, DAPG etc.) 17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.1, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(), 31 | 32 | } -------------------------------------------------------------------------------- /examples/example_configs/swimmer_ppo.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env' : 'mjrl_swimmer-v0', 6 | 'algorithm' : 'PPO', 7 | 'seed' : 123, 8 | 'sample_mode' : 'trajectories', 9 | 'rl_num_traj' : 10, 10 | 'rl_num_iter' : 50, 11 | 'num_cpu' : 2, 12 | 'save_freq' : 25, 13 | 'eval_rollouts' : None, 14 | 'exp_notes' : 'Example config for training policy with PPO on the mjrl swimmer task.', 15 | 16 | # RL parameters (all params related to PG, value function, DAPG etc.) 
17 | 18 | 'policy_size' : (32, 32), 19 | 'init_log_std' : -0.5, 20 | 'vf_hidden_size' : (128, 128), 21 | 'vf_batch_size' : 64, 22 | 'vf_epochs' : 2, 23 | 'vf_learn_rate' : 1e-3, 24 | 'rl_step_size' : 0.1, 25 | 'rl_gamma' : 0.995, 26 | 'rl_gae' : 0.97, 27 | 28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used) 29 | 30 | 'alg_hyper_params' : dict(clip_coef=0.2, epochs=10, mb_size=64, learn_rate=5e-4), 31 | 32 | } -------------------------------------------------------------------------------- /examples/linear_nn_comparison.py: -------------------------------------------------------------------------------- 1 | from mjrl.utils.gym_env import GymEnv 2 | from mjrl.policies.gaussian_mlp import MLP 3 | from mjrl.policies.gaussian_linear import LinearPolicy 4 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 5 | from mjrl.baselines.mlp_baseline import MLPBaseline 6 | from mjrl.algos.npg_cg import NPG 7 | from mjrl.utils.train_agent import train_agent 8 | import mjrl.envs 9 | import time as timer 10 | SEED = 500 11 | 12 | # NN policy 13 | # ================================== 14 | e = GymEnv('mjrl_swimmer-v0') 15 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED) 16 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) 17 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 18 | 19 | ts = timer.time() 20 | train_agent(job_name='swimmer_nn_exp1', 21 | agent=agent, 22 | seed=SEED, 23 | niter=50, 24 | gamma=0.995, 25 | gae_lambda=0.97, 26 | num_cpu=1, 27 | sample_mode='trajectories', 28 | num_traj=10, 29 | save_freq=5, 30 | evaluation_rollouts=5) 31 | print("time taken for NN policy training = %f" % (timer.time()-ts)) 32 | 33 | 34 | # Linear policy 35 | # ================================== 36 | e = GymEnv('mjrl_swimmer-v0') 37 | policy = LinearPolicy(e.spec, seed=SEED) 38 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=2, learn_rate=1e-3) 39 | agent = NPG(e, policy, baseline, normalized_step_size=0.1, seed=SEED, save_logs=True) 40 | 41 | ts = timer.time() 42 | train_agent(job_name='swimmer_linear_exp1', 43 | agent=agent, 44 | seed=SEED, 45 | niter=50, 46 | gamma=0.995, 47 | gae_lambda=0.97, 48 | num_cpu=1, 49 | sample_mode='trajectories', 50 | num_traj=10, 51 | save_freq=5, 52 | evaluation_rollouts=5) 53 | print("time taken for linear policy training = %f" % (timer.time()-ts)) 54 | -------------------------------------------------------------------------------- /examples/policy_opt_job_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a job script for running policy gradient algorithms on gym tasks. 
3 | Separate job scripts are provided to run few other algorithms 4 | - For DAPG see here: https://github.com/aravindr93/hand_dapg/tree/master/dapg/examples 5 | - For model-based NPG see here: https://github.com/aravindr93/mjrl/tree/master/mjrl/algos/model_accel 6 | """ 7 | 8 | from mjrl.utils.gym_env import GymEnv 9 | from mjrl.policies.gaussian_mlp import MLP 10 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 11 | from mjrl.baselines.mlp_baseline import MLPBaseline 12 | from mjrl.algos.npg_cg import NPG 13 | from mjrl.algos.batch_reinforce import BatchREINFORCE 14 | from mjrl.algos.ppo_clip import PPO 15 | from mjrl.utils.train_agent import train_agent 16 | import os 17 | import json 18 | import gym 19 | import mjrl.envs 20 | import time as timer 21 | import pickle 22 | import argparse 23 | 24 | # =============================================================================== 25 | # Get command line arguments 26 | # =============================================================================== 27 | 28 | parser = argparse.ArgumentParser(description='Natural policy gradient from mjrl on mujoco environments') 29 | parser.add_argument('--output', type=str, required=True, help='location to store results') 30 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params') 31 | args = parser.parse_args() 32 | JOB_DIR = args.output 33 | if not os.path.exists(JOB_DIR): 34 | os.mkdir(JOB_DIR) 35 | with open(args.config, 'r') as f: 36 | job_data = eval(f.read()) 37 | assert 'algorithm' in job_data.keys() 38 | assert any([job_data['algorithm'] == a for a in ['NPG', 'NVPG', 'VPG', 'PPO']]) 39 | assert 'sample_mode' in job_data.keys() 40 | job_data['alg_hyper_params'] = dict() if 'alg_hyper_params' not in job_data.keys() else job_data['alg_hyper_params'] 41 | 42 | EXP_FILE = JOB_DIR + '/job_config.json' 43 | with open(EXP_FILE, 'w') as f: 44 | json.dump(job_data, f, indent=4) 45 | 46 | if job_data['sample_mode'] == 'trajectories': 47 | assert 'rl_num_traj' in job_data.keys() 48 | job_data['rl_num_samples'] = 0 # will be ignored 49 | elif job_data['sample_mode'] == 'samples': 50 | assert 'rl_num_samples' in job_data.keys() 51 | job_data['rl_num_traj'] = 0 # will be ignored 52 | else: 53 | print("Unknown sampling mode. 
Choose either trajectories or samples") 54 | exit() 55 | 56 | # =============================================================================== 57 | # Train Loop 58 | # =============================================================================== 59 | 60 | e = GymEnv(job_data['env']) 61 | policy = MLP(e.spec, hidden_sizes=job_data['policy_size'], seed=job_data['seed'], init_log_std=job_data['init_log_std']) 62 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=job_data['vf_batch_size'], hidden_sizes=job_data['vf_hidden_size'], 63 | epochs=job_data['vf_epochs'], learn_rate=job_data['vf_learn_rate']) 64 | 65 | # Construct the algorithm 66 | if job_data['algorithm'] == 'NPG': 67 | # Other hyperparameters (like number of CG steps) can be specified in config for pass through 68 | # or default hyperparameters will be used 69 | agent = NPG(e, policy, baseline, normalized_step_size=job_data['rl_step_size'], 70 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 71 | 72 | elif job_data['algorithm'] == 'VPG': 73 | agent = BatchREINFORCE(e, policy, baseline, learn_rate=job_data['rl_step_size'], 74 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 75 | 76 | elif job_data['algorithm'] == 'NVPG': 77 | agent = BatchREINFORCE(e, policy, baseline, desired_kl=job_data['rl_step_size'], 78 | seed=job_data['seed'], save_logs=True, **job_data['alg_hyper_params']) 79 | 80 | elif job_data['algorithm'] == 'PPO': 81 | # There are many hyperparameters for PPO. They can be specified in config for pass through 82 | # or defaults in the PPO algorithm will be used 83 | agent = PPO(e, policy, baseline, save_logs=True, **job_data['alg_hyper_params']) 84 | 85 | print("========================================") 86 | print("Starting policy learning") 87 | print("========================================") 88 | 89 | ts = timer.time() 90 | train_agent(job_name=JOB_DIR, 91 | agent=agent, 92 | seed=job_data['seed'], 93 | niter=job_data['rl_num_iter'], 94 | gamma=job_data['rl_gamma'], 95 | gae_lambda=job_data['rl_gae'], 96 | num_cpu=job_data['num_cpu'], 97 | sample_mode=job_data['sample_mode'], 98 | num_traj=job_data['rl_num_traj'], 99 | num_samples=job_data['rl_num_samples'], 100 | save_freq=job_data['save_freq'], 101 | evaluation_rollouts=job_data['eval_rollouts']) 102 | print("time taken = %f" % (timer.time()-ts)) 103 | -------------------------------------------------------------------------------- /mjrl/__init__.py: -------------------------------------------------------------------------------- 1 | import mjrl.envs -------------------------------------------------------------------------------- /mjrl/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/algos/__init__.py -------------------------------------------------------------------------------- /mjrl/algos/behavior_cloning.py: -------------------------------------------------------------------------------- 1 | """ 2 | Minimize bc loss (MLE, MSE, RWR etc.) 
with pytorch optimizers 3 | """ 4 | 5 | import logging 6 | logging.disable(logging.CRITICAL) 7 | import numpy as np 8 | import time as timer 9 | import torch 10 | from torch.autograd import Variable 11 | from mjrl.utils.logger import DataLog 12 | from tqdm import tqdm 13 | 14 | 15 | class BC: 16 | def __init__(self, expert_paths, 17 | policy, 18 | epochs = 5, 19 | batch_size = 64, 20 | lr = 1e-3, 21 | optimizer = None, 22 | loss_type = 'MSE', # can be 'MLE' or 'MSE' 23 | save_logs = True, 24 | set_transforms = False, 25 | **kwargs, 26 | ): 27 | 28 | self.policy = policy 29 | self.expert_paths = expert_paths 30 | self.epochs = epochs 31 | self.mb_size = batch_size 32 | self.logger = DataLog() 33 | self.loss_type = loss_type 34 | self.save_logs = save_logs 35 | 36 | if set_transforms: 37 | in_shift, in_scale, out_shift, out_scale = self.compute_transformations() 38 | self.set_transformations(in_shift, in_scale, out_shift, out_scale) 39 | self.set_variance_with_data(out_scale) 40 | 41 | # construct optimizer 42 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=lr) if optimizer is None else optimizer 43 | 44 | # Loss criterion if required 45 | if loss_type == 'MSE': 46 | self.loss_criterion = torch.nn.MSELoss() 47 | 48 | # make logger 49 | if self.save_logs: 50 | self.logger = DataLog() 51 | 52 | def compute_transformations(self): 53 | # get transformations 54 | if self.expert_paths == [] or self.expert_paths is None: 55 | in_shift, in_scale, out_shift, out_scale = None, None, None, None 56 | else: 57 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 58 | actions = np.concatenate([path["actions"] for path in self.expert_paths]) 59 | in_shift, in_scale = np.mean(observations, axis=0), np.std(observations, axis=0) 60 | out_shift, out_scale = np.mean(actions, axis=0), np.std(actions, axis=0) 61 | return in_shift, in_scale, out_shift, out_scale 62 | 63 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None): 64 | # set scalings in the target policy 65 | self.policy.model.set_transformations(in_shift, in_scale, out_shift, out_scale) 66 | self.policy.old_model.set_transformations(in_shift, in_scale, out_shift, out_scale) 67 | 68 | def set_variance_with_data(self, out_scale): 69 | # set the variance of gaussian policy based on out_scale 70 | params = self.policy.get_param_values() 71 | params[-self.policy.m:] = np.log(out_scale + 1e-12) 72 | self.policy.set_param_values(params) 73 | 74 | def loss(self, data, idx=None): 75 | if self.loss_type == 'MLE': 76 | return self.mle_loss(data, idx) 77 | elif self.loss_type == 'MSE': 78 | return self.mse_loss(data, idx) 79 | else: 80 | print("Please use valid loss type") 81 | return None 82 | 83 | def mle_loss(self, data, idx): 84 | # use indices if provided (e.g. 
for mini-batching) 85 | # otherwise, use all the data 86 | idx = range(data['observations'].shape[0]) if idx is None else idx 87 | if type(data['observations']) == torch.Tensor: 88 | idx = torch.LongTensor(idx) 89 | obs = data['observations'][idx] 90 | act = data['expert_actions'][idx] 91 | LL, mu, log_std = self.policy.new_dist_info(obs, act) 92 | # minimize negative log likelihood 93 | return -torch.mean(LL) 94 | 95 | def mse_loss(self, data, idx=None): 96 | idx = range(data['observations'].shape[0]) if idx is None else idx 97 | if type(data['observations']) is torch.Tensor: 98 | idx = torch.LongTensor(idx) 99 | obs = data['observations'][idx] 100 | act_expert = data['expert_actions'][idx] 101 | if type(data['observations']) is not torch.Tensor: 102 | obs = Variable(torch.from_numpy(obs).float(), requires_grad=False) 103 | act_expert = Variable(torch.from_numpy(act_expert).float(), requires_grad=False) 104 | act_pi = self.policy.model(obs) 105 | return self.loss_criterion(act_pi, act_expert.detach()) 106 | 107 | def fit(self, data, suppress_fit_tqdm=False, **kwargs): 108 | # data is a dict 109 | # keys should have "observations" and "expert_actions" 110 | validate_keys = all([k in data.keys() for k in ["observations", "expert_actions"]]) 111 | assert validate_keys is True 112 | ts = timer.time() 113 | num_samples = data["observations"].shape[0] 114 | 115 | # log stats before 116 | if self.save_logs: 117 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0] 118 | self.logger.log_kv('loss_before', loss_val) 119 | 120 | # train loop 121 | for ep in config_tqdm(range(self.epochs), suppress_fit_tqdm): 122 | for mb in range(int(num_samples / self.mb_size)): 123 | rand_idx = np.random.choice(num_samples, size=self.mb_size) 124 | self.optimizer.zero_grad() 125 | loss = self.loss(data, idx=rand_idx) 126 | loss.backward() 127 | self.optimizer.step() 128 | params_after_opt = self.policy.get_param_values() 129 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True) 130 | 131 | # log stats after 132 | if self.save_logs: 133 | self.logger.log_kv('epoch', self.epochs) 134 | loss_val = self.loss(data, idx=range(num_samples)).data.numpy().ravel()[0] 135 | self.logger.log_kv('loss_after', loss_val) 136 | self.logger.log_kv('time', (timer.time()-ts)) 137 | 138 | def train(self, **kwargs): 139 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 140 | expert_actions = np.concatenate([path["actions"] for path in self.expert_paths]) 141 | data = dict(observations=observations, expert_actions=expert_actions) 142 | self.fit(data, **kwargs) 143 | 144 | 145 | def config_tqdm(range_inp, suppress_tqdm=False): 146 | if suppress_tqdm: 147 | return range_inp 148 | else: 149 | return tqdm(range_inp) -------------------------------------------------------------------------------- /mjrl/algos/dapg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | 16 | # utility functions 17 | import mjrl.utils.process_samples as process_samples 18 | from mjrl.utils.logger import DataLog 19 | from mjrl.utils.cg_solve import cg_solve 20 | 21 | # Import Algs 22 | from 
mjrl.algos.npg_cg import NPG 23 | from mjrl.algos.behavior_cloning import BC 24 | 25 | class DAPG(NPG): 26 | def __init__(self, env, policy, baseline, 27 | demo_paths=None, 28 | normalized_step_size=0.01, 29 | FIM_invert_args={'iters': 10, 'damping': 1e-4}, 30 | hvp_sample_frac=1.0, 31 | seed=123, 32 | save_logs=False, 33 | kl_dist=None, 34 | lam_0=1.0, # demo coef 35 | lam_1=0.95, # decay coef 36 | **kwargs, 37 | ): 38 | 39 | self.env = env 40 | self.policy = policy 41 | self.baseline = baseline 42 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size 43 | self.seed = seed 44 | self.save_logs = save_logs 45 | self.FIM_invert_args = FIM_invert_args 46 | self.hvp_subsample = hvp_sample_frac 47 | self.running_score = None 48 | self.demo_paths = demo_paths 49 | self.lam_0 = lam_0 50 | self.lam_1 = lam_1 51 | self.iter_count = 0.0 52 | if save_logs: self.logger = DataLog() 53 | 54 | def train_from_paths(self, paths): 55 | 56 | # Concatenate from all the trajectories 57 | observations = np.concatenate([path["observations"] for path in paths]) 58 | actions = np.concatenate([path["actions"] for path in paths]) 59 | advantages = np.concatenate([path["advantages"] for path in paths]) 60 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 61 | 62 | if self.demo_paths is not None and self.lam_0 > 0.0: 63 | demo_obs = np.concatenate([path["observations"] for path in self.demo_paths]) 64 | demo_act = np.concatenate([path["actions"] for path in self.demo_paths]) 65 | demo_adv = self.lam_0 * (self.lam_1 ** self.iter_count) * np.ones(demo_obs.shape[0]) 66 | self.iter_count += 1 67 | # concatenate all 68 | all_obs = np.concatenate([observations, demo_obs]) 69 | all_act = np.concatenate([actions, demo_act]) 70 | all_adv = 1e-2*np.concatenate([advantages/(np.std(advantages) + 1e-8), demo_adv]) 71 | else: 72 | all_obs = observations 73 | all_act = actions 74 | all_adv = advantages 75 | 76 | # cache return distributions for the paths 77 | path_returns = [sum(p["rewards"]) for p in paths] 78 | mean_return = np.mean(path_returns) 79 | std_return = np.std(path_returns) 80 | min_return = np.amin(path_returns) 81 | max_return = np.amax(path_returns) 82 | base_stats = [mean_return, std_return, min_return, max_return] 83 | self.running_score = mean_return if self.running_score is None else \ 84 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 85 | if self.save_logs: self.log_rollout_statistics(paths) 86 | 87 | # Keep track of times for various computations 88 | t_gLL = 0.0 89 | t_FIM = 0.0 90 | 91 | # Optimization algorithm 92 | # -------------------------- 93 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 94 | 95 | # DAPG 96 | ts = timer.time() 97 | sample_coef = all_adv.shape[0]/advantages.shape[0] 98 | dapg_grad = sample_coef*self.flat_vpg(all_obs, all_act, all_adv) 99 | t_gLL += timer.time() - ts 100 | 101 | # NPG 102 | ts = timer.time() 103 | hvp = self.build_Hvp_eval([observations, actions], 104 | regu_coef=self.FIM_invert_args['damping']) 105 | npg_grad = cg_solve(hvp, dapg_grad, x_0=dapg_grad.copy(), 106 | cg_iters=self.FIM_invert_args['iters']) 107 | t_FIM += timer.time() - ts 108 | 109 | # Step size computation 110 | # -------------------------- 111 | n_step_size = 2.0*self.kl_dist 112 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(dapg_grad.T, npg_grad) + 1e-20))) 113 | 114 | # Policy update 115 | # -------------------------- 116 | curr_params = self.policy.get_param_values() 117 | 
new_params = curr_params + alpha * npg_grad 118 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 119 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 120 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 121 | self.policy.set_param_values(new_params, set_new=True, set_old=True) 122 | 123 | # Log information 124 | if self.save_logs: 125 | self.logger.log_kv('alpha', alpha) 126 | self.logger.log_kv('delta', n_step_size) 127 | self.logger.log_kv('time_vpg', t_gLL) 128 | self.logger.log_kv('time_npg', t_FIM) 129 | self.logger.log_kv('kl_dist', kl_dist) 130 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 131 | self.logger.log_kv('running_score', self.running_score) 132 | try: 133 | self.env.env.env.evaluate_success(paths, self.logger) 134 | except: 135 | # nested logic for backwards compatibility. TODO: clean this up. 136 | try: 137 | success_rate = self.env.env.env.evaluate_success(paths) 138 | self.logger.log_kv('success_rate', success_rate) 139 | except: 140 | pass 141 | return base_stats 142 | -------------------------------------------------------------------------------- /mjrl/algos/mbac.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import time as timer 5 | import torch 6 | import torch.nn as nn 7 | from torch.autograd import Variable 8 | from mjrl.utils.logger import DataLog 9 | from tqdm import tqdm 10 | from mjrl.utils.gym_env import GymEnv 11 | from mjrl.policies.mpc_actor import MPCActor 12 | from mjrl.algos.behavior_cloning import BC 13 | 14 | 15 | class MBAC(BC): 16 | def __init__(self, 17 | env_name, 18 | policy, 19 | expert_paths = None, # for the initial seeding 20 | epochs = 5, 21 | batch_size = 64, 22 | lr = 1e-3, 23 | optimizer = None, 24 | loss_type = 'MSE', # can be 'MLE' or 'MSE' 25 | seed = 123, 26 | buffer_size = 50, # measured in number of trajectories 27 | mpc_params = None, 28 | save_logs = True, 29 | ): 30 | 31 | super().__init__(expert_paths=expert_paths, 32 | policy=policy, 33 | epochs=epochs, 34 | batch_size=batch_size, 35 | lr=lr, 36 | optimizer=optimizer, 37 | loss_type=loss_type, 38 | save_logs=save_logs, 39 | ) 40 | self.expert_paths = [] if self.expert_paths is None else self.expert_paths 41 | self.buffer_size = buffer_size 42 | 43 | # For the MPC policy 44 | self.env = GymEnv(env_name) 45 | self.env.reset(seed=seed) 46 | if mpc_params is None: 47 | mean = np.zeros(self.env.action_dim) 48 | sigma = 1.0 * np.ones(self.env.action_dim) 49 | filter_coefs = [sigma, 0.05, 0.0, 0.0] 50 | mpc_params = dict(env=GymEnv(env_name), H=10, 51 | paths_per_cpu=25, num_cpu=1, 52 | kappa=10.0, gamma=1.0, 53 | mean=mean, filter_coefs=filter_coefs, 54 | seed=seed) 55 | else: 56 | mpc_params['env'] = GymEnv(env_name) 57 | mpc_params['seed'] = seed 58 | 59 | self.mpc_params = mpc_params 60 | self.mpc_policy = MPCActor(**mpc_params) 61 | 62 | def collect_paths(self, num_traj=10, 63 | mode='policy', 64 | horizon=None, 65 | render=False 66 | ): 67 | horizon = self.env.horizon if horizon is None else horizon 68 | paths = [] 69 | for i in tqdm(range(num_traj)): 70 | self.env.reset() 71 | obs, act_pi, act_mpc, rew, states = [], [], [], [], [] 72 | for t in range(horizon): 73 | o = self.env.get_obs() 74 | s = self.env.get_env_state() 75 | a_pi = self.policy.get_action(o)[0] 76 | a_mpc = self.mpc_policy.get_action(s) 77 | a = a_pi if mode == 'policy' else 
a_mpc 78 | next_o, r, done, _ = self.env.step(a) 79 | if render: 80 | self.env.render() 81 | # store data 82 | obs.append(o) 83 | rew.append(r) 84 | states.append(s) 85 | act_pi.append(a_pi) 86 | act_mpc.append(a_mpc) 87 | # kill if done 88 | if done: 89 | break 90 | path = dict(observations=np.array(obs), 91 | actions=np.array(act_pi), 92 | expert_actions=np.array(act_mpc), 93 | rewards=np.array(rew), 94 | states=states, 95 | ) 96 | paths.append(path) 97 | return paths 98 | 99 | def add_paths_to_buffer(self, paths): 100 | for path in paths: 101 | self.expert_paths.append(path) 102 | if len(self.expert_paths) > self.buffer_size: 103 | # keep recent trajectories 104 | # TODO: Also consider keeping best performing trajectories 105 | self.expert_paths = self.expert_paths[-self.buffer_size:] 106 | if self.save_logs: 107 | self.logger.log_kv('buffer_size', len(self.expert_paths)) 108 | 109 | def get_data_from_buffer(self): 110 | observations = np.concatenate([path["observations"] for path in self.expert_paths]) 111 | expert_actions = np.concatenate([path["expert_actions"] for path in self.expert_paths]) 112 | observations = torch.Tensor(observations).float() 113 | expert_actions = torch.Tensor(expert_actions).float() 114 | data = dict(observations=observations, expert_actions=expert_actions) 115 | return data 116 | 117 | def train_step(self, num_traj=10, **kwargs): 118 | # collect data using policy actions 119 | # fit policy to expert actions on these states 120 | new_paths = self.collect_paths(num_traj, mode='policy') 121 | self.add_paths_to_buffer(new_paths) 122 | data = self.get_data_from_buffer() 123 | self.fit(data, **kwargs) 124 | stoc_pol_perf = np.mean([np.sum(path['rewards']) for path in new_paths]) 125 | return stoc_pol_perf -------------------------------------------------------------------------------- /mjrl/algos/model_accel/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/algos/model_accel/__init__.py -------------------------------------------------------------------------------- /mjrl/algos/model_accel/model_accel_npg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | import pickle 6 | import mjrl.envs 7 | import os 8 | import time as timer 9 | from torch.autograd import Variable 10 | from mjrl.utils.gym_env import GymEnv 11 | from mjrl.algos.model_accel.nn_dynamics import WorldModel 12 | import mjrl.samplers.core as trajectory_sampler 13 | 14 | # utility functions 15 | import mjrl.utils.process_samples as process_samples 16 | from mjrl.utils.logger import DataLog 17 | from mjrl.algos.model_accel.sampling import policy_rollout 18 | 19 | # Import NPG 20 | from mjrl.algos.npg_cg import NPG 21 | 22 | 23 | class ModelAccelNPG(NPG): 24 | def __init__(self, learned_model=None, 25 | refine=False, 26 | kappa=5.0, 27 | plan_horizon=10, 28 | plan_paths=100, 29 | reward_function=None, 30 | termination_function=None, 31 | **kwargs): 32 | super(ModelAccelNPG, self).__init__(**kwargs) 33 | if learned_model is None: 34 | print("Algorithm requires a (list of) learned dynamics model") 35 | quit() 36 | elif isinstance(learned_model, WorldModel): 37 | self.learned_model = [learned_model] 38 | else: 39 | self.learned_model = learned_model 40 | self.refine, self.kappa, self.plan_horizon, self.plan_paths = refine, kappa, plan_horizon, 
plan_paths 41 | self.reward_function, self.termination_function = reward_function, termination_function 42 | 43 | def to(self, device): 44 | # Convert all the networks (except policy network which is clamped to CPU) 45 | # to the specified device 46 | for model in self.learned_model: 47 | model.to(device) 48 | try: self.baseline.model.to(device) 49 | except: pass 50 | 51 | def is_cuda(self): 52 | # Check if any of the networks are on GPU 53 | model_cuda = [model.is_cuda() for model in self.learned_model] 54 | model_cuda = any(model_cuda) 55 | baseline_cuda = next(self.baseline.model.parameters()).is_cuda 56 | return any([model_cuda, baseline_cuda]) 57 | 58 | def train_step(self, N, 59 | env=None, 60 | sample_mode='trajectories', 61 | horizon=1e6, 62 | gamma=0.995, 63 | gae_lambda=0.97, 64 | num_cpu='max', 65 | env_kwargs=None, 66 | init_states=None, 67 | reward_function=None, 68 | termination_function=None, 69 | truncate_lim=None, 70 | truncate_reward=0.0, 71 | **kwargs, 72 | ): 73 | 74 | ts = timer.time() 75 | 76 | # get the correct env behavior 77 | if env is None: 78 | env = self.env 79 | elif type(env) == str: 80 | env = GymEnv(env) 81 | elif isinstance(env, GymEnv): 82 | env = env 83 | elif callable(env): 84 | env = env(**env_kwargs) 85 | else: 86 | print("Unsupported environment format") 87 | raise AttributeError 88 | 89 | # get correct behavior for reward and termination 90 | reward_function = self.reward_function if reward_function is None else reward_function 91 | termination_function = self.termination_function if termination_function is None else termination_function 92 | if reward_function: assert callable(reward_function) 93 | if termination_function: assert callable(termination_function) 94 | 95 | # simulate trajectories with the learned model(s) 96 | # we want to use the same task instances (e.g. goal locations) for each model in ensemble 97 | paths = [] 98 | 99 | # NOTE: We can optionally specify a set of initial states to perform the rollouts from 100 | # This is useful for starting rollouts from the states in the replay buffer 101 | init_states = np.array([env.reset() for _ in range(N)]) if init_states is None else init_states 102 | assert type(init_states) == list 103 | assert len(init_states) == N 104 | 105 | for model in self.learned_model: 106 | # dont set seed explicitly -- this will make rollouts follow tne global seed 107 | rollouts = policy_rollout(num_traj=N, env=env, policy=self.policy, 108 | learned_model=model, eval_mode=False, horizon=horizon, 109 | init_state=init_states, seed=None) 110 | # use learned reward function if available 111 | if model.learn_reward: 112 | model.compute_path_rewards(rollouts) 113 | else: 114 | rollouts = reward_function(rollouts) 115 | num_traj, horizon, state_dim = rollouts['observations'].shape 116 | for i in range(num_traj): 117 | path = dict() 118 | obs = rollouts['observations'][i, :, :] 119 | act = rollouts['actions'][i, :, :] 120 | rew = rollouts['rewards'][i, :] 121 | path['observations'] = obs 122 | path['actions'] = act 123 | path['rewards'] = rew 124 | path['terminated'] = False 125 | paths.append(path) 126 | 127 | # NOTE: If tasks have termination condition, we will assume that the env has 128 | # a function that can terminate paths appropriately. 129 | # Otherwise, termination is not considered. 
130 | 131 | if callable(termination_function): paths = termination_function(paths) 132 | 133 | # remove paths that are too short 134 | paths = [path for path in paths if path['observations'].shape[0] >= 5] 135 | 136 | # additional truncation based on error in the ensembles 137 | if truncate_lim is not None and len(self.learned_model) > 1: 138 | for path in paths: 139 | pred_err = np.zeros(path['observations'].shape[0] - 1) 140 | for model in self.learned_model: 141 | s = path['observations'][:-1] 142 | a = path['actions'][:-1] 143 | s_next = path['observations'][1:] 144 | pred = model.predict(s, a) 145 | model_err = np.mean((s_next - pred)**2, axis=-1) 146 | pred_err = np.maximum(pred_err, model_err) 147 | violations = np.where(pred_err > truncate_lim)[0] 148 | truncated = (not len(violations) == 0) 149 | T = violations[0] + 1 if truncated else obs.shape[0] 150 | T = max(4, T) # we don't want corner cases of very short truncation 151 | path["observations"] = path["observations"][:T] 152 | path["actions"] = path["actions"][:T] 153 | path["rewards"] = path["rewards"][:T] 154 | if truncated: path["rewards"][-1] += truncate_reward 155 | path["terminated"] = False if T == obs.shape[0] else True 156 | 157 | if self.save_logs: 158 | self.logger.log_kv('time_sampling', timer.time() - ts) 159 | 160 | self.seed = self.seed + N if self.seed is not None else self.seed 161 | 162 | # compute returns 163 | process_samples.compute_returns(paths, gamma) 164 | # compute advantages 165 | process_samples.compute_advantages(paths, self.baseline, gamma, gae_lambda) 166 | # train from paths 167 | eval_statistics = self.train_from_paths(paths) 168 | eval_statistics.append(N) 169 | # log number of samples 170 | if self.save_logs: 171 | num_samples = np.sum([p["rewards"].shape[0] for p in paths]) 172 | self.logger.log_kv('num_samples', num_samples) 173 | # fit baseline 174 | if self.save_logs: 175 | ts = timer.time() 176 | error_before, error_after = self.baseline.fit(paths, return_errors=True) 177 | self.logger.log_kv('time_VF', timer.time()-ts) 178 | self.logger.log_kv('VF_error_before', error_before) 179 | self.logger.log_kv('VF_error_after', error_after) 180 | else: 181 | self.baseline.fit(paths) 182 | 183 | return eval_statistics 184 | 185 | def get_action(self, observation): 186 | if self.refine is False: 187 | return self.policy.get_action(observation) 188 | else: 189 | return self.get_refined_action(observation) 190 | 191 | def get_refined_action(self, observation): 192 | # TODO(Aravind): Implemenet this 193 | # This function should rollout many trajectories according to the learned 194 | # dynamics model and the policy, and should refine around the policy by 195 | # incorporating reward based refinement 196 | raise NotImplementedError 197 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/model_learning_mpc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mjrl.algos.model_accel.sampling import generate_paths, generate_perturbed_actions, trajectory_rollout 3 | 4 | 5 | class MPCPolicy(object): 6 | def __init__(self, env, 7 | plan_horizon, 8 | plan_paths=10, 9 | kappa=1.0, 10 | gamma=1.0, 11 | mean=None, 12 | filter_coefs=None, 13 | seed=123, 14 | warmstart=True, 15 | fitted_model=None, 16 | omega=5.0, 17 | **kwargs, 18 | ): 19 | 20 | # initialize 21 | self.env, self.seed = env, seed 22 | self.n, self.m = env.observation_dim, env.action_dim 23 | self.plan_horizon, self.num_traj = 
plan_horizon, plan_paths 24 | 25 | if fitted_model is None: 26 | print("Policy requires a fitted dynamics model") 27 | quit() 28 | else: 29 | self.fitted_model = fitted_model 30 | 31 | # initialize other params 32 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma 33 | if mean is None: 34 | self.mean = np.zeros(self.m) 35 | if filter_coefs is None: 36 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0] 37 | self.act_sequence = np.ones((self.plan_horizon, self.m)) * self.mean 38 | self.init_act_sequence = self.act_sequence.copy() 39 | self.warmstart = warmstart 40 | self.omega = omega 41 | 42 | def get_action(self, obs): 43 | # generate paths 44 | if type(self.fitted_model) == list: 45 | 46 | # Ensemble case 47 | # Collect trajectories from different models with same action sequences 48 | base_act = self.act_sequence 49 | act_list = [generate_perturbed_actions(base_act, self.filter_coefs) 50 | for _ in range(self.num_traj)] 51 | actions = np.array(act_list) 52 | paths_list = [] 53 | for model in self.fitted_model: 54 | paths = trajectory_rollout(actions, model, obs) 55 | self.env.env.env.compute_path_rewards(paths) 56 | paths_list.append(paths) 57 | # consolidate paths 58 | paths = dict() 59 | for k in paths_list[0].keys(): 60 | v = np.vstack([p[k] for p in paths_list]) 61 | paths[k] = v 62 | R = self.score_trajectory_ensemble(paths, paths_list) 63 | 64 | else: 65 | paths = generate_paths(num_traj=self.num_traj, fitted_model=self.fitted_model, 66 | start_state=obs, base_act=self.act_sequence, filter_coefs=self.filter_coefs) 67 | self.env.env.env.compute_path_rewards(paths) # will populate path['rewards'] 68 | R = self.score_trajectory(paths) 69 | 70 | S = np.exp(self.kappa * (R - np.max(R))) 71 | act = paths["actions"] 72 | 73 | weighted_seq = S * act.T 74 | act_sequence = np.sum(weighted_seq.T, axis=0) / (np.sum(S) + 1e-6) 75 | action = act_sequence[0].copy() 76 | 77 | # get updated action sequence 78 | if self.warmstart: 79 | self.act_sequence[:-1] = act_sequence[1:] 80 | self.act_sequence[-1] = self.mean.copy() 81 | else: 82 | self.act_sequence = self.init_act_sequence.copy() 83 | return action 84 | 85 | def score_trajectory_ensemble(self, paths, paths_list): 86 | num_traj = self.num_traj 87 | num_models = len(paths_list) 88 | total_traj = paths['rewards'].shape[0] 89 | horizon = paths['rewards'].shape[1] 90 | predictions = [p['observations'] for p in paths_list] 91 | disagreement = np.std(predictions, axis=0) # (num_traj, horizon, state_dim) 92 | disagreement = np.sum(disagreement, axis=(1,2)) # (num_traj,) 93 | scores = np.zeros(total_traj) 94 | for i in range(total_traj): 95 | disagreement_score = disagreement[i // self.num_traj] 96 | scores[i] = self.omega * disagreement_score 97 | for t in range(horizon): 98 | scores[i] += (self.gamma ** t) * paths["rewards"][i][t] 99 | return scores 100 | 101 | def score_trajectory(self, paths): 102 | # rewards shape: (num_traj, horizon) 103 | num_traj = paths["rewards"].shape[0] 104 | horizon = paths["rewards"].shape[1] 105 | scores = np.zeros(num_traj) 106 | for i in range(num_traj): 107 | scores[i] = 0.0 108 | for t in range(horizon): 109 | scores[i] += (self.gamma**t)*paths["rewards"][i][t] 110 | return scores 111 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/configs/point_mass.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 
'mjrl_point_mass-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 5, 9 | 'iter_samples' : 100, 10 | 'eval_rollouts' : 25, 11 | 'num_models' : 3, 12 | 'exp_notes' : 'Toy experiment for initial trial.', 13 | 'save_freq' : 1, 14 | 'device' : 'cpu', 15 | 'learn_reward' : False, 16 | 'reward_file' : 'utils/reward_functions/mjrl_point_mass.py', 17 | 18 | # dynamics learning 19 | 20 | 'hidden_size' : (256, 256), 21 | 'activation' : 'relu', 22 | 'fit_lr' : 1e-3, 23 | 'fit_wd' : 1e-5, 24 | 'buffer_size' : 10000, 25 | 'fit_mb_size' : 16, 26 | 'fit_epochs' : 25, 27 | 'refresh_fit' : False, 28 | 29 | # initial data 30 | 31 | 'init_log_std' : -0.5, 32 | 'min_log_std' : -2.0, 33 | 'init_samples' : 1000, 34 | 35 | # NPG params 36 | 37 | 'policy_size' : (32, 32), 38 | 'inner_steps' : 10, 39 | 'step_size' : 0.05, 40 | 'update_paths' : 250, 41 | 'start_state' : 'init', 42 | 'horizon' : 25, 43 | 44 | } 45 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/configs/reacher.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_reacher_7dof-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 25, 9 | 'iter_samples' : 500, 10 | 'eval_rollouts' : 10, 11 | 'num_models' : 4, 12 | 'save_freq' : 1, 13 | 'device' : 'cpu', 14 | 15 | # dynamics learning 16 | 17 | 'hidden_size' : (256, 256), 18 | 'activation' : 'relu', 19 | 'fit_lr' : 1e-3, 20 | 'fit_wd' : 0.0, 21 | 'buffer_size' : 20000, 22 | 'fit_mb_size' : 64, 23 | 'fit_epochs' : 20, 24 | 'refresh_fit' : False, 25 | 26 | # initial data 27 | 28 | 'init_log_std' : -0.5, 29 | 'min_log_std' : -2.5, 30 | 'init_samples' : 2500, 31 | 'init_policy' : None, 32 | 33 | 34 | # NPG params 35 | 36 | 'policy_size' : (64, 64), 37 | 'inner_steps' : 5, 38 | 'step_size' : 0.05, 39 | 'update_paths' : 250, 40 | 'start_state' : 'init', 41 | 'horizon' : 50, 42 | 43 | } -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/sandbox/example_config_mpc.txt: -------------------------------------------------------------------------------- 1 | { 2 | 3 | # general inputs 4 | 5 | 'env_name' : 'mjrl_point_mass-v0', 6 | 'seed' : 123, 7 | 'debug_mode' : False, 8 | 'num_iter' : 5, 9 | 'paths_per_iter': 5, 10 | 'eval_rollouts' : 10, 11 | 'num_models' : 3, 12 | 'exp_notes' : 'Toy experiment for initial trial.', 13 | 'save_freq' : 5, 14 | 'device' : 'cpu', 15 | 16 | # dynamics learning 17 | 18 | 'hidden_size' : (64, 64), 19 | 'activation' : 'relu', 20 | 'fit_lr' : 1e-3, 21 | 'fit_wd' : 1e-5, 22 | 'max_paths' : 1000, 23 | 'fit_mb_size' : 16, 24 | 'fit_epochs' : 25, 25 | 'refresh_fit' : True, 26 | 27 | # initial data 28 | 29 | 'init_log_std' : -0.5, 30 | 'n_init_paths' : 25, 31 | 'use_demos' : False, 32 | 'demo_file' : None, 33 | 34 | # model predictive control 35 | 36 | 'noisy_mpc' : True, # when collecting data for exploration 37 | 'noise_level' : 0.1, 38 | 'filter_coefs' : {'f1': 0.5, 'f2': 1.0, 'f3': 0.0, 'f4': 0.0}, 39 | 'plan_paths' : 200, 40 | 'plan_horizon' : 10, 41 | 'kappa' : 2.0, 42 | 'omega' : 0.0, 43 | 44 | } 45 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/sandbox/run_model_learning_mpc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Job script to optimize trajectories with fitted model 3 | """ 4 | 5 | import numpy as 
np 6 | import copy 7 | import torch 8 | import torch.nn as nn 9 | import pickle 10 | import mjrl.envs 11 | import time as timer 12 | import argparse 13 | import os 14 | import json 15 | import mjrl.samplers.core as trajectory_sampler 16 | import mjrl.utils.tensor_utils as tensor_utils 17 | from tqdm import tqdm 18 | from tabulate import tabulate 19 | from mjrl.policies.gaussian_mlp import MLP 20 | from mjrl.baselines.mlp_baseline import MLPBaseline 21 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline 22 | from mjrl.utils.gym_env import GymEnv 23 | from mjrl.utils.logger import DataLog 24 | from mjrl.utils.make_train_plots import make_train_plots 25 | from mjrl.algos.model_accel.nn_dynamics import DynamicsModel 26 | from mjrl.algos.model_accel.model_learning_mpc import MPCPolicy 27 | from mjrl.algos.model_accel.sampling import sample_paths, evaluate_policy 28 | 29 | 30 | # =============================================================================== 31 | # Get command line arguments 32 | # =============================================================================== 33 | 34 | parser = argparse.ArgumentParser(description='Trajectory Optimization with fitted models.') 35 | parser.add_argument('--output', type=str, required=True, help='location to store results') 36 | parser.add_argument('--config', type=str, required=True, help='path to config file with exp params') 37 | args = parser.parse_args() 38 | OUT_DIR = args.output 39 | if not os.path.exists(OUT_DIR): 40 | os.mkdir(OUT_DIR) 41 | with open(args.config, 'r') as f: 42 | job_data = eval(f.read()) 43 | 44 | # Unpack args and make files for easy access 45 | logger = DataLog() 46 | ENV_NAME = job_data['env_name'] 47 | PICKLE_FILE = OUT_DIR + '/exp_results.pickle' 48 | EXP_FILE = OUT_DIR + '/job_data.json' 49 | SEED = job_data['seed'] 50 | job_data['filter_coefs'] = [job_data['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']] 51 | 52 | # base cases 53 | if 'num_models' not in job_data.keys(): 54 | job_data['num_models'] = 1 55 | if job_data['num_models'] == 1 or 'omega' not in job_data.keys(): 56 | job_data['omega'] = 0.0 57 | if 'eval_rollouts' not in job_data.keys(): 58 | job_data['eval_rollouts'] = 0 59 | if 'save_freq' not in job_data.keys(): 60 | job_data['save_freq'] = 10 61 | if 'device' not in job_data.keys(): 62 | job_data['device'] = 'cpu' 63 | if 'debug_mode' in job_data.keys(): 64 | DEBUG = job_data['debug_mode'] 65 | else: 66 | DEBUG =False 67 | if 'device_path' not in job_data.keys(): 68 | job_data['device_path'] = None 69 | with open(EXP_FILE, 'w') as f: 70 | json.dump(job_data, f, indent=4) 71 | 72 | del(job_data['seed']) 73 | job_data['base_seed'] = SEED 74 | 75 | # =============================================================================== 76 | # Train loop 77 | # =============================================================================== 78 | 79 | np.random.seed(SEED) 80 | torch.random.manual_seed(SEED) 81 | 82 | # TODO(Aravind): Map to hardware if device_path is specified 83 | 84 | e = GymEnv(ENV_NAME) 85 | e.set_seed(SEED) 86 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+i, **job_data) 87 | for i in range(job_data['num_models'])] 88 | exploratory_policy = MLP(e.spec, seed=SEED, init_log_std=job_data['init_log_std']) 89 | paths = [] 90 | 91 | for outer_iter in range(job_data['num_iter']): 92 | 93 | ts = timer.time() 94 | print("================> ITERATION : %i " % outer_iter) 95 | print("Getting interaction data from real dynamics ...") 96 | 97 | if 
outer_iter == 0: 98 | iter_paths = trajectory_sampler.sample_paths(job_data['n_init_paths'], e, 99 | exploratory_policy, 100 | eval_mode=False, base_seed=SEED) 101 | else: 102 | iter_paths = sample_paths(job_data['paths_per_iter'], 103 | mpc_policy.env, mpc_policy, 104 | eval_mode=(not job_data['noisy_mpc']), 105 | noise_level=job_data['noise_level'], 106 | base_seed=SEED + outer_iter) 107 | 108 | # reset the environment (good for hardware) 109 | e.reset() 110 | 111 | for p in iter_paths: 112 | paths.append(p) 113 | 114 | if len(paths) > job_data['max_paths']: 115 | diff = len(paths) - job_data['max_paths'] 116 | paths[:diff] = [] 117 | 118 | s = np.concatenate([p['observations'][:-1] for p in paths]) 119 | a = np.concatenate([p['actions'][:-1] for p in paths]) 120 | sp = np.concatenate([p['observations'][1:] for p in paths]) 121 | r = np.array([np.sum(p['rewards']) for p in iter_paths]) 122 | rollout_score = np.mean(r) 123 | 124 | logger.log_kv('fit_epochs', job_data['fit_epochs']) 125 | logger.log_kv('rollout_score', rollout_score) 126 | try: 127 | rollout_metric = e.env.env.evaluate_success(iter_paths) 128 | logger.log_kv('rollout_metric', rollout_metric) 129 | except: 130 | pass 131 | 132 | print("Data gathered, fitting model ...") 133 | if job_data['refresh_fit']: 134 | models = [DynamicsModel(state_dim=e.observation_dim, act_dim=e.action_dim, seed=SEED+123*outer_iter, 135 | **job_data) for i in range(job_data['num_models'])] 136 | 137 | for i, model in enumerate(models): 138 | epoch_loss = model.fit(s, a, sp, job_data['fit_mb_size'], job_data['fit_epochs']) 139 | logger.log_kv('loss_before_' + str(i), epoch_loss[0]) 140 | logger.log_kv('loss_after_' + str(i), epoch_loss[-1]) 141 | 142 | mpc_policy = MPCPolicy(env=e, fitted_model=models, seed=SEED+12345*outer_iter, **job_data) 143 | 144 | if job_data['eval_rollouts'] > 0: 145 | print("Performing validation rollouts ... 
") 146 | eval_paths = evaluate_policy(mpc_policy.env, mpc_policy, mpc_policy.fitted_model[0], noise_level=0.0, 147 | real_step=True, num_episodes=job_data['eval_rollouts'], visualize=False) 148 | eval_score = np.mean([np.sum(p['rewards']) for p in eval_paths]) 149 | logger.log_kv('eval_score', eval_score) 150 | try: 151 | eval_metric = e.env.env.evaluate_success(eval_paths) 152 | logger.log_kv('eval_metric', eval_metric) 153 | except: 154 | pass 155 | else: 156 | eval_paths = [] 157 | 158 | exp_data = dict(policy=mpc_policy, fitted_model=mpc_policy.fitted_model, 159 | log=logger.log, rollout_paths=iter_paths, eval_paths=eval_paths) 160 | if outer_iter > 0 and outer_iter % job_data['save_freq'] == 0: 161 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb')) 162 | pickle.dump(exp_data, open(OUT_DIR + '/iteration_' + str(outer_iter) + '.pickle', 'wb')) 163 | 164 | tf = timer.time() 165 | logger.log_kv('iter_time', tf-ts) 166 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1, 167 | logger.get_current_log().items())) 168 | print(tabulate(print_data)) 169 | logger.save_log(OUT_DIR+'/') 170 | make_train_plots(log=logger.log, keys=['rollout_score', 'eval_score', 'rollout_metric', 'eval_metric'], 171 | save_loc=OUT_DIR+'/') 172 | 173 | if job_data['debug_mode']: 174 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], False, 5, True) 175 | evaluate_policy(e, mpc_policy, mpc_policy.fitted_model[0], job_data['noise_level'], True, 5, True) 176 | 177 | pickle.dump(exp_data, open(PICKLE_FILE, 'wb')) # final save -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/algos/model_accel/run_experiments/utils/reward_functions/__init__.py -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/utils/reward_functions/mjrl_point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def reward_function(paths): 4 | # path has two keys: observations and actions 5 | # path["observations"] : (num_traj, horizon, obs_dim) 6 | # return paths that contain rewards in path["rewards"] 7 | # path["rewards"] should have shape (num_traj, horizon) 8 | obs = paths["observations"] 9 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 10 | agent_pos = obs[:, :, :2] 11 | target_pos = obs[:, :, -2:] 12 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1) 13 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1) 14 | rewards = -1.0 * l1_dist - 0.5 * l2_dist 15 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s') 16 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 17 | return paths 18 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/utils/visualize_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import mjrl.envs 3 | import trajopt.envs 4 | import mj_envs 5 | import click 6 | import os 7 | import gym 8 | import numpy as np 9 | import pickle 10 | import torch 11 | from mjrl.utils.gym_env import GymEnv 12 | from mjrl.policies.gaussian_mlp import MLP 13 | import trajopt.envs 14 
| 15 | DESC = ''' 16 | Helper script to visualize policy (in mjrl format).\n 17 | USAGE:\n 18 | Visualizes policy on the env\n 19 | $ python utils/visualize_policy --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n 20 | ''' 21 | 22 | # MAIN ========================================================= 23 | @click.command(help=DESC) 24 | @click.option('--env_name', type=str, help='environment to load', required= True) 25 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None) 26 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation') 27 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123) 28 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10) 29 | @click.option('--log_std', type=float, default=-0.5) 30 | @click.option('--terminate', type=bool, default=True) 31 | @click.option('--device_path', type=str, default=None) 32 | def main(env_name, policy, mode, seed, episodes, log_std, terminate, device_path): 33 | render = True 34 | 35 | # TODO(Aravind): Map to hardware if device_path is specified 36 | 37 | e = GymEnv(env_name) 38 | e.set_seed(seed) 39 | np.random.seed(seed) 40 | torch.manual_seed(seed) 41 | if policy is not None: 42 | policy = pickle.load(open(policy, 'rb')) 43 | else: 44 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=log_std) 45 | 46 | for ep in range(episodes): 47 | o = e.reset() 48 | rew = 0.0 49 | t = 0 50 | done = False 51 | while t < e.horizon and done is False: 52 | o = e.get_obs() 53 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 54 | next_o, r, done, ifo = e.step(a) 55 | if terminate is False: 56 | done = False 57 | rew = rew + r 58 | t = t + 1 59 | if render: 60 | e.render() 61 | if done and t < e.horizon - 1: 62 | print("Episode terminated early") 63 | print("episode score = %f " % rew) 64 | 65 | e.reset() 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /mjrl/algos/model_accel/run_experiments/utils/visualize_trajectories.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import click 3 | import json 4 | import numpy as np 5 | import torch 6 | import mjrl.envs 7 | import trajopt.envs 8 | import mj_envs 9 | import mjrl.utils.tensor_utils as tensor_utils 10 | 11 | from mjrl.utils.gym_env import GymEnv 12 | from mjrl.algos.model_accel.sampling import evaluate_policy 13 | 14 | DESC = ''' 15 | Helper script to visualize optimized trajectories (list of trajectories in trajopt format).\n 16 | USAGE:\n 17 | $ python viz_trajectories.py --file path_to_file.pickle\n 18 | ''' 19 | @click.command(help=DESC) 20 | @click.option('--file', type=str, help='pickle file with trajectories', required= True) 21 | @click.option('--seed', type=int, default=123) 22 | @click.option('--noise_level', type=float, default=0.0) 23 | @click.option('--num_episodes', type=int, help='number of times to play trajectories', default=5) 24 | @click.option('--config', type=str, help='if provided MPC params from here will be used.', default=None) 25 | @click.option('--device_path', type=str, default=None) 26 | def main(file, seed, noise_level, num_episodes, config, device_path): 27 | exp_data = pickle.load(open(file, 'rb')) 28 | policy = exp_data['policy'] 29 | model = 
exp_data['fitted_model'] 30 | model = model[-1] if type(model) == list else model 31 | env_id = policy.env.env_id 32 | render = True 33 | 34 | # TODO(Aravind): Map to hardware if device_path is specified 35 | 36 | env = GymEnv(env_id) 37 | policy.env = env 38 | 39 | env.set_seed(seed) 40 | np.random.seed(seed) 41 | torch.manual_seed(seed) 42 | 43 | if config is not None: 44 | try: 45 | with open(config, 'r') as f: 46 | config = eval(f.read()) 47 | except: 48 | with open(config, 'r') as f: 49 | config = json.load(f) 50 | policy.plan_horizon = config['plan_horizon'] 51 | policy.num_traj = config['plan_paths'] 52 | policy.kappa = config['kappa'] 53 | policy.filter_coefs = [config['filter_coefs'][k] for k in ['f1', 'f2', 'f3', 'f4']] 54 | policy.omega = config['omega'] if 'omega' in config.keys() else 0.0 55 | 56 | # TODO(Aravind): Implement capability to set predicted state for rendering purposes 57 | # evaluate_policy(env, policy, model, noise_level, real_step=False, num_episodes=num_episodes, visualize=render) 58 | evaluate_policy(env, policy, model, noise_level, real_step=True, num_episodes=num_episodes, visualize=render) 59 | 60 | # final close out 61 | env.reset() 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /mjrl/algos/npg_cg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | 16 | # utility functions 17 | import mjrl.utils.process_samples as process_samples 18 | from mjrl.utils.logger import DataLog 19 | from mjrl.utils.cg_solve import cg_solve 20 | from mjrl.algos.batch_reinforce import BatchREINFORCE 21 | 22 | 23 | class NPG(BatchREINFORCE): 24 | def __init__(self, env, policy, baseline, 25 | normalized_step_size=0.01, 26 | const_learn_rate=None, 27 | FIM_invert_args={'iters': 10, 'damping': 1e-4}, 28 | hvp_sample_frac=1.0, 29 | seed=123, 30 | save_logs=False, 31 | kl_dist=None, 32 | input_normalization=None, 33 | **kwargs 34 | ): 35 | """ 36 | All inputs are expected in mjrl's format unless specified 37 | :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance 38 | :param kl_dist: desired KL distance between steps. Overrides normalized_step_size. 
39 | :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well) 40 | :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG 41 | :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow) 42 | :param seed: random seed 43 | """ 44 | 45 | self.env = env 46 | self.policy = policy 47 | self.baseline = baseline 48 | self.alpha = const_learn_rate 49 | self.n_step_size = normalized_step_size if kl_dist is None else 2.0 * kl_dist 50 | self.seed = seed 51 | self.save_logs = save_logs 52 | self.FIM_invert_args = FIM_invert_args 53 | self.hvp_subsample = hvp_sample_frac 54 | self.running_score = None 55 | if save_logs: self.logger = DataLog() 56 | # input normalization (running average) 57 | self.input_normalization = input_normalization 58 | if self.input_normalization is not None: 59 | if self.input_normalization > 1 or self.input_normalization <= 0: 60 | self.input_normalization = None 61 | 62 | def HVP(self, observations, actions, vector, regu_coef=None): 63 | regu_coef = self.FIM_invert_args['damping'] if regu_coef is None else regu_coef 64 | vec = Variable(torch.from_numpy(vector).float(), requires_grad=False) 65 | if self.hvp_subsample is not None and self.hvp_subsample < 0.99: 66 | num_samples = observations.shape[0] 67 | rand_idx = np.random.choice(num_samples, size=int(self.hvp_subsample*num_samples)) 68 | obs = observations[rand_idx] 69 | act = actions[rand_idx] 70 | else: 71 | obs = observations 72 | act = actions 73 | old_dist_info = self.policy.old_dist_info(obs, act) 74 | new_dist_info = self.policy.new_dist_info(obs, act) 75 | mean_kl = self.policy.mean_kl(new_dist_info, old_dist_info) 76 | grad_fo = torch.autograd.grad(mean_kl, self.policy.trainable_params, create_graph=True) 77 | flat_grad = torch.cat([g.contiguous().view(-1) for g in grad_fo]) 78 | h = torch.sum(flat_grad*vec) 79 | hvp = torch.autograd.grad(h, self.policy.trainable_params) 80 | hvp_flat = np.concatenate([g.contiguous().view(-1).data.numpy() for g in hvp]) 81 | return hvp_flat + regu_coef*vector 82 | 83 | def build_Hvp_eval(self, inputs, regu_coef=None): 84 | def eval(v): 85 | full_inp = inputs + [v] + [regu_coef] 86 | Hvp = self.HVP(*full_inp) 87 | return Hvp 88 | return eval 89 | 90 | # ---------------------------------------------------------- 91 | def train_from_paths(self, paths): 92 | 93 | observations, actions, advantages, base_stats, self.running_score = self.process_paths(paths) 94 | if self.save_logs: self.log_rollout_statistics(paths) 95 | 96 | # Keep track of times for various computations 97 | t_gLL = 0.0 98 | t_FIM = 0.0 99 | 100 | # normalize inputs if necessary 101 | if self.input_normalization: 102 | data_in_shift, data_in_scale = np.mean(observations, axis=0), np.std(observations, axis=0) 103 | pi_in_shift, pi_in_scale = self.policy.model.in_shift.data.numpy(), self.policy.model.in_scale.data.numpy() 104 | pi_out_shift, pi_out_scale = self.policy.model.out_shift.data.numpy(), self.policy.model.out_scale.data.numpy() 105 | pi_in_shift = self.input_normalization * pi_in_shift + (1-self.input_normalization) * data_in_shift 106 | pi_in_scale = self.input_normalization * pi_in_scale + (1-self.input_normalization) * data_in_scale 107 | self.policy.model.set_transformations(pi_in_shift, pi_in_scale, pi_out_shift, pi_out_scale) 108 | 109 | # Optimization algorithm 110 | # -------------------------- 111 | surr_before = self.CPI_surrogate(observations, actions, 
advantages).data.numpy().ravel()[0] 112 | 113 | # VPG 114 | ts = timer.time() 115 | vpg_grad = self.flat_vpg(observations, actions, advantages) 116 | t_gLL += timer.time() - ts 117 | 118 | # NPG 119 | ts = timer.time() 120 | hvp = self.build_Hvp_eval([observations, actions], 121 | regu_coef=self.FIM_invert_args['damping']) 122 | npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(), 123 | cg_iters=self.FIM_invert_args['iters']) 124 | t_FIM += timer.time() - ts 125 | 126 | # Step size computation 127 | # -------------------------- 128 | if self.alpha is not None: 129 | alpha = self.alpha 130 | n_step_size = (alpha ** 2) * np.dot(vpg_grad.T, npg_grad) 131 | else: 132 | n_step_size = self.n_step_size 133 | alpha = np.sqrt(np.abs(self.n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20))) 134 | 135 | # Policy update 136 | # -------------------------- 137 | curr_params = self.policy.get_param_values() 138 | new_params = curr_params + alpha * npg_grad 139 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 140 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 141 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 142 | self.policy.set_param_values(new_params, set_new=True, set_old=True) 143 | 144 | # Log information 145 | if self.save_logs: 146 | self.logger.log_kv('alpha', alpha) 147 | self.logger.log_kv('delta', n_step_size) 148 | self.logger.log_kv('time_vpg', t_gLL) 149 | self.logger.log_kv('time_npg', t_FIM) 150 | self.logger.log_kv('kl_dist', kl_dist) 151 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 152 | self.logger.log_kv('running_score', self.running_score) 153 | try: 154 | self.env.env.env.evaluate_success(paths, self.logger) 155 | except: 156 | # nested logic for backwards compatibility. TODO: clean this up. 
157 | try: 158 | success_rate = self.env.env.env.evaluate_success(paths) 159 | self.logger.log_kv('success_rate', success_rate) 160 | except: 161 | pass 162 | 163 | return base_stats 164 | -------------------------------------------------------------------------------- /mjrl/algos/ppo_clip.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | 16 | # utility functions 17 | import mjrl.utils.process_samples as process_samples 18 | from mjrl.utils.logger import DataLog 19 | from mjrl.utils.cg_solve import cg_solve 20 | from mjrl.algos.batch_reinforce import BatchREINFORCE 21 | 22 | 23 | class PPO(BatchREINFORCE): 24 | def __init__(self, env, policy, baseline, 25 | clip_coef = 0.2, 26 | epochs = 10, 27 | mb_size = 64, 28 | learn_rate = 3e-4, 29 | seed = 123, 30 | save_logs = False, 31 | **kwargs 32 | ): 33 | 34 | self.env = env 35 | self.policy = policy 36 | self.baseline = baseline 37 | self.learn_rate = learn_rate 38 | self.seed = seed 39 | self.save_logs = save_logs 40 | self.clip_coef = clip_coef 41 | self.epochs = epochs 42 | self.mb_size = mb_size 43 | self.running_score = None 44 | if save_logs: self.logger = DataLog() 45 | 46 | self.optimizer = torch.optim.Adam(self.policy.trainable_params, lr=learn_rate) 47 | 48 | def PPO_surrogate(self, observations, actions, advantages): 49 | adv_var = Variable(torch.from_numpy(advantages).float(), requires_grad=False) 50 | old_dist_info = self.policy.old_dist_info(observations, actions) 51 | new_dist_info = self.policy.new_dist_info(observations, actions) 52 | LR = self.policy.likelihood_ratio(new_dist_info, old_dist_info) 53 | LR_clip = torch.clamp(LR, min=1-self.clip_coef, max=1+self.clip_coef) 54 | ppo_surr = torch.mean(torch.min(LR*adv_var,LR_clip*adv_var)) 55 | return ppo_surr 56 | 57 | # ---------------------------------------------------------- 58 | def train_from_paths(self, paths): 59 | 60 | # Concatenate from all the trajectories 61 | observations = np.concatenate([path["observations"] for path in paths]) 62 | actions = np.concatenate([path["actions"] for path in paths]) 63 | advantages = np.concatenate([path["advantages"] for path in paths]) 64 | # Advantage whitening 65 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 66 | # NOTE : advantage should be zero mean in expectation 67 | # normalized step size invariant to advantage scaling, 68 | # but scaling can help with least squares 69 | 70 | # cache return distributions for the paths 71 | path_returns = [sum(p["rewards"]) for p in paths] 72 | mean_return = np.mean(path_returns) 73 | std_return = np.std(path_returns) 74 | min_return = np.amin(path_returns) 75 | max_return = np.amax(path_returns) 76 | base_stats = [mean_return, std_return, min_return, max_return] 77 | self.running_score = mean_return if self.running_score is None else \ 78 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 79 | if self.save_logs: self.log_rollout_statistics(paths) 80 | 81 | # Optimization algorithm 82 | # -------------------------- 83 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 84 | params_before_opt = 
self.policy.get_param_values() 85 | 86 | ts = timer.time() 87 | num_samples = observations.shape[0] 88 | for ep in range(self.epochs): 89 | for mb in range(int(num_samples / self.mb_size)): 90 | rand_idx = np.random.choice(num_samples, size=self.mb_size) 91 | obs = observations[rand_idx] 92 | act = actions[rand_idx] 93 | adv = advantages[rand_idx] 94 | self.optimizer.zero_grad() 95 | loss = - self.PPO_surrogate(obs, act, adv) 96 | loss.backward() 97 | self.optimizer.step() 98 | 99 | params_after_opt = self.policy.get_param_values() 100 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 101 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 102 | self.policy.set_param_values(params_after_opt, set_new=True, set_old=True) 103 | t_opt = timer.time() - ts 104 | 105 | # Log information 106 | if self.save_logs: 107 | self.logger.log_kv('t_opt', t_opt) 108 | self.logger.log_kv('kl_dist', kl_dist) 109 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 110 | self.logger.log_kv('running_score', self.running_score) 111 | try: 112 | self.env.env.env.evaluate_success(paths, self.logger) 113 | except: 114 | # nested logic for backwards compatibility. TODO: clean this up. 115 | try: 116 | success_rate = self.env.env.env.evaluate_success(paths) 117 | self.logger.log_kv('success_rate', success_rate) 118 | except: 119 | pass 120 | 121 | return base_stats 122 | -------------------------------------------------------------------------------- /mjrl/algos/trpo.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.sparse.linalg as spLA 6 | import copy 7 | import time as timer 8 | import torch 9 | import torch.nn as nn 10 | from torch.autograd import Variable 11 | import copy 12 | 13 | # samplers 14 | import mjrl.samplers.core as trajectory_sampler 15 | import mjrl.samplers.batch_sampler as batch_sampler 16 | 17 | # utility functions 18 | import mjrl.utils.process_samples as process_samples 19 | from mjrl.utils.logger import DataLog 20 | from mjrl.utils.cg_solve import cg_solve 21 | 22 | # Import NPG 23 | from mjrl.algos.npg_cg import NPG 24 | 25 | class TRPO(NPG): 26 | def __init__(self, env, policy, baseline, 27 | kl_dist=0.01, 28 | FIM_invert_args={'iters': 10, 'damping': 1e-4}, 29 | hvp_sample_frac=1.0, 30 | seed=123, 31 | save_logs=False, 32 | normalized_step_size=0.01, 33 | **kwargs 34 | ): 35 | """ 36 | All inputs are expected in mjrl's format unless specified 37 | :param normalized_step_size: Normalized step size (under the KL metric). Twice the desired KL distance 38 | :param kl_dist: desired KL distance between steps. Overrides normalized_step_size. 
39 | :param const_learn_rate: A constant learn rate under the L2 metric (won't work very well) 40 | :param FIM_invert_args: {'iters': # cg iters, 'damping': regularization amount when solving with CG 41 | :param hvp_sample_frac: fraction of samples (>0 and <=1) to use for the Fisher metric (start with 1 and reduce if code too slow) 42 | :param seed: random seed 43 | """ 44 | 45 | self.env = env 46 | self.policy = policy 47 | self.baseline = baseline 48 | self.kl_dist = kl_dist if kl_dist is not None else 0.5*normalized_step_size 49 | self.seed = seed 50 | self.save_logs = save_logs 51 | self.FIM_invert_args = FIM_invert_args 52 | self.hvp_subsample = hvp_sample_frac 53 | self.running_score = None 54 | if save_logs: self.logger = DataLog() 55 | 56 | def train_from_paths(self, paths): 57 | 58 | # Concatenate from all the trajectories 59 | observations = np.concatenate([path["observations"] for path in paths]) 60 | actions = np.concatenate([path["actions"] for path in paths]) 61 | advantages = np.concatenate([path["advantages"] for path in paths]) 62 | # Advantage whitening 63 | advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-6) 64 | # NOTE : advantage should be zero mean in expectation 65 | # normalized step size invariant to advantage scaling, 66 | # but scaling can help with least squares 67 | 68 | # cache return distributions for the paths 69 | path_returns = [sum(p["rewards"]) for p in paths] 70 | mean_return = np.mean(path_returns) 71 | std_return = np.std(path_returns) 72 | min_return = np.amin(path_returns) 73 | max_return = np.amax(path_returns) 74 | base_stats = [mean_return, std_return, min_return, max_return] 75 | self.running_score = mean_return if self.running_score is None else \ 76 | 0.9*self.running_score + 0.1*mean_return # approx avg of last 10 iters 77 | if self.save_logs: self.log_rollout_statistics(paths) 78 | 79 | # Keep track of times for various computations 80 | t_gLL = 0.0 81 | t_FIM = 0.0 82 | 83 | # Optimization algorithm 84 | # -------------------------- 85 | surr_before = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 86 | 87 | # VPG 88 | ts = timer.time() 89 | vpg_grad = self.flat_vpg(observations, actions, advantages) 90 | t_gLL += timer.time() - ts 91 | 92 | # NPG 93 | ts = timer.time() 94 | hvp = self.build_Hvp_eval([observations, actions], 95 | regu_coef=self.FIM_invert_args['damping']) 96 | npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(), 97 | cg_iters=self.FIM_invert_args['iters']) 98 | t_FIM += timer.time() - ts 99 | 100 | # Step size computation 101 | # -------------------------- 102 | n_step_size = 2.0*self.kl_dist 103 | alpha = np.sqrt(np.abs(n_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20))) 104 | 105 | # Policy update 106 | # -------------------------- 107 | curr_params = self.policy.get_param_values() 108 | for k in range(100): 109 | new_params = curr_params + alpha * npg_grad 110 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 111 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 112 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 113 | if kl_dist < self.kl_dist: 114 | break 115 | else: 116 | alpha = 0.9*alpha # backtrack 117 | print("Step size too high. Backtracking. 
| kl = %f | surr diff = %f" % \ 118 | (kl_dist, surr_after-surr_before) ) 119 | if k == 99: 120 | alpha = 0.0 121 | 122 | new_params = curr_params + alpha * npg_grad 123 | self.policy.set_param_values(new_params, set_new=True, set_old=False) 124 | kl_dist = self.kl_old_new(observations, actions).data.numpy().ravel()[0] 125 | surr_after = self.CPI_surrogate(observations, actions, advantages).data.numpy().ravel()[0] 126 | self.policy.set_param_values(new_params, set_new=True, set_old=True) 127 | 128 | # Log information 129 | if self.save_logs: 130 | self.logger.log_kv('alpha', alpha) 131 | self.logger.log_kv('delta', n_step_size) 132 | self.logger.log_kv('time_vpg', t_gLL) 133 | self.logger.log_kv('time_npg', t_FIM) 134 | self.logger.log_kv('kl_dist', kl_dist) 135 | self.logger.log_kv('surr_improvement', surr_after - surr_before) 136 | self.logger.log_kv('running_score', self.running_score) 137 | try: 138 | self.env.env.env.evaluate_success(paths, self.logger) 139 | except: 140 | # nested logic for backwards compatibility. TODO: clean this up. 141 | try: 142 | success_rate = self.env.env.env.evaluate_success(paths) 143 | self.logger.log_kv('success_rate', success_rate) 144 | except: 145 | pass 146 | 147 | return base_stats -------------------------------------------------------------------------------- /mjrl/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/baselines/__init__.py -------------------------------------------------------------------------------- /mjrl/baselines/linear_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | 5 | class LinearBaseline: 6 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-5): 7 | self.inp = inp 8 | self._reg_coeff = reg_coeff 9 | self._coeffs = None 10 | 11 | def _features(self, paths): 12 | if self.inp == 'env_features': 13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 14 | else: 15 | o = np.concatenate([path["observations"] for path in paths]) 16 | o = np.clip(o, -10, 10)/10.0 17 | if o.ndim > 2: 18 | o = o.reshape(o.shape[0], -1) 19 | N, n = o.shape 20 | num_feat = int( n + 1 + 4 ) # linear + bias (1.0) + time till pow 4 21 | feat_mat = np.ones((N, num_feat)) 22 | 23 | # linear features 24 | feat_mat[:,:n] = o 25 | 26 | k = 0 # start from this row 27 | for i in range(len(paths)): 28 | l = len(paths[i]["rewards"]) 29 | al = np.arange(l)/1000.0 30 | for j in range(4): 31 | feat_mat[k:k+l, -4+j] = al**(j+1) 32 | k += l 33 | 34 | return feat_mat 35 | 36 | def fit(self, paths, return_errors=False): 37 | 38 | featmat = self._features(paths) 39 | returns = np.concatenate([path["returns"] for path in paths]) 40 | 41 | if return_errors: 42 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape) 43 | errors = returns - predictions 44 | error_before = np.sum(errors**2)/np.sum(returns**2) 45 | 46 | reg_coeff = copy.deepcopy(self._reg_coeff) 47 | for _ in range(10): 48 | self._coeffs = np.linalg.lstsq( 49 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 50 | featmat.T.dot(returns) 51 | )[0] 52 | if not np.any(np.isnan(self._coeffs)): 53 | break 54 | reg_coeff *= 10 55 | 56 | if return_errors: 57 | predictions = featmat.dot(self._coeffs) 58 | errors = returns - predictions 59 | error_after = 
np.sum(errors**2)/np.sum(returns**2) 60 | return error_before, error_after 61 | 62 | def predict(self, path): 63 | if self._coeffs is None: 64 | return np.zeros(len(path["rewards"])) 65 | return self._features([path]).dot(self._coeffs) 66 | -------------------------------------------------------------------------------- /mjrl/baselines/mlp_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | from mjrl.utils.optimize_model import fit_data 7 | 8 | import pickle 9 | 10 | class MLPBaseline: 11 | def __init__(self, env_spec, inp_dim=None, inp='obs', learn_rate=1e-3, reg_coef=0.0, 12 | batch_size=64, epochs=1, use_gpu=False, hidden_sizes=(128, 128)): 13 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim 14 | self.batch_size = batch_size 15 | self.epochs = epochs 16 | self.reg_coef = reg_coef 17 | self.use_gpu = use_gpu 18 | self.inp = inp 19 | self.hidden_sizes = hidden_sizes 20 | 21 | self.model = nn.Sequential() 22 | layer_sizes = (self.n + 4, ) + hidden_sizes + (1, ) 23 | for i in range(len(layer_sizes) - 1): 24 | layer_id = 'fc_' + str(i) 25 | relu_id = 'relu_' + str(i) 26 | self.model.add_module(layer_id, nn.Linear(layer_sizes[i], layer_sizes[i+1])) 27 | if i != len(layer_sizes) - 2: 28 | self.model.add_module(relu_id, nn.ReLU()) 29 | 30 | if self.use_gpu: 31 | self.model.cuda() 32 | 33 | self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learn_rate, weight_decay=reg_coef) 34 | self.loss_function = torch.nn.MSELoss() 35 | 36 | def _features(self, paths): 37 | if self.inp == 'env_features': 38 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 39 | else: 40 | o = np.concatenate([path["observations"] for path in paths]) 41 | o = np.clip(o, -10, 10)/10.0 42 | if o.ndim > 2: 43 | o = o.reshape(o.shape[0], -1) 44 | N, n = o.shape 45 | num_feat = int( n + 4 ) # linear + time till pow 4 46 | feat_mat = np.ones((N, num_feat)) # memory allocation 47 | 48 | # linear features 49 | feat_mat[:,:n] = o 50 | 51 | k = 0 # start from this row 52 | for i in range(len(paths)): 53 | l = len(paths[i]["rewards"]) 54 | al = np.arange(l)/1000.0 55 | for j in range(4): 56 | feat_mat[k:k+l, -4+j] = al**(j+1) 57 | k += l 58 | return feat_mat 59 | 60 | 61 | def fit(self, paths, return_errors=False): 62 | 63 | featmat = self._features(paths) 64 | returns = np.concatenate([path["returns"] for path in paths]).reshape(-1, 1) 65 | featmat = featmat.astype('float32') 66 | returns = returns.astype('float32') 67 | num_samples = returns.shape[0] 68 | 69 | # Make variables with the above data 70 | if self.use_gpu: 71 | featmat_var = Variable(torch.from_numpy(featmat).cuda(), requires_grad=False) 72 | returns_var = Variable(torch.from_numpy(returns).cuda(), requires_grad=False) 73 | else: 74 | featmat_var = Variable(torch.from_numpy(featmat), requires_grad=False) 75 | returns_var = Variable(torch.from_numpy(returns), requires_grad=False) 76 | 77 | if return_errors: 78 | if self.use_gpu: 79 | predictions = self.model(featmat_var).cpu().data.numpy().ravel() 80 | else: 81 | predictions = self.model(featmat_var).data.numpy().ravel() 82 | errors = returns.ravel() - predictions 83 | error_before = np.sum(errors**2)/(np.sum(returns**2) + 1e-8) 84 | 85 | epoch_losses = fit_data(self.model, featmat_var, returns_var, self.optimizer, 86 | self.loss_function, self.batch_size, self.epochs) 87 | 88 | if return_errors: 
89 | if self.use_gpu: 90 | predictions = self.model(featmat_var).cpu().data.numpy().ravel() 91 | else: 92 | predictions = self.model(featmat_var).data.numpy().ravel() 93 | errors = returns.ravel() - predictions 94 | error_after = np.sum(errors**2)/(np.sum(returns**2) + 1e-8) 95 | return error_before, error_after 96 | 97 | def predict(self, path): 98 | featmat = self._features([path]).astype('float32') 99 | if self.use_gpu: 100 | feat_var = Variable(torch.from_numpy(featmat).float().cuda(), requires_grad=False) 101 | prediction = self.model(feat_var).cpu().data.numpy().ravel() 102 | else: 103 | feat_var = Variable(torch.from_numpy(featmat).float(), requires_grad=False) 104 | prediction = self.model(feat_var).data.numpy().ravel() 105 | return prediction 106 | -------------------------------------------------------------------------------- /mjrl/baselines/quadratic_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class QuadraticBaseline: 5 | def __init__(self, env_spec, inp_dim=None, inp='obs', reg_coeff=1e-3): 6 | self.n = inp_dim if inp_dim is not None else env_spec.observation_dim 7 | self.inp = inp 8 | self._reg_coeff = reg_coeff 9 | self._coeffs = None 10 | 11 | def _features(self, paths): 12 | if self.inp == 'env_features': 13 | o = np.concatenate([path["env_infos"]["env_features"][0] for path in paths]) 14 | else: 15 | o = np.concatenate([path["observations"] for path in paths]) 16 | o = np.clip(o, -10, 10)/10.0 17 | if o.ndim > 2: 18 | o = o.reshape(o.shape[0], -1) 19 | N, n = o.shape 20 | num_feat = int( n + n*(n+1)/2 + 1 + 4 ) # linear + full quad (symmetric matrix) + bias (1.0) + time till pow 4 21 | feat_mat = np.ones((N, num_feat)) # memory allocation 22 | 23 | # linear features 24 | feat_mat[:,:n] = o 25 | 26 | # quadratic features 27 | k = n # starting from this column in feat_mat 28 | for i in range(n): 29 | for j in range(i, n): 30 | feat_mat[:,k] = o[:,i]*o[:,j] # element-wise product 31 | k += 1 32 | 33 | k = 0 # start from this row 34 | for i in range(len(paths)): 35 | l = len(paths[i]["rewards"]) 36 | al = np.arange(l)/1000.0 37 | for j in range(4): 38 | feat_mat[k:k+l, -4+j] = al**(j+1) 39 | k += l 40 | 41 | return feat_mat 42 | 43 | 44 | def fit(self, paths, return_errors=False): 45 | 46 | #featmat = np.concatenate([self._features(path) for path in paths]) 47 | featmat = self._features(paths) 48 | returns = np.concatenate([path["returns"] for path in paths]) 49 | 50 | if return_errors: 51 | predictions = featmat.dot(self._coeffs) if self._coeffs is not None else np.zeros(returns.shape) 52 | errors = returns - predictions 53 | error_before = np.sum(errors**2)/np.sum(returns**2) 54 | 55 | reg_coeff = copy.deepcopy(self._reg_coeff) 56 | for _ in range(10): 57 | self._coeffs = np.linalg.lstsq( 58 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 59 | featmat.T.dot(returns) 60 | )[0] 61 | if not np.any(np.isnan(self._coeffs)): 62 | break 63 | reg_coeff *= 10 64 | 65 | if return_errors: 66 | predictions = featmat.dot(self._coeffs) 67 | errors = returns - predictions 68 | error_after = np.sum(errors**2)/np.sum(returns**2) 69 | return error_before, error_after 70 | 71 | def predict(self, path): 72 | if self._coeffs is None: 73 | return np.zeros(len(path["rewards"])) 74 | return self._features([path]).dot(self._coeffs) 75 | -------------------------------------------------------------------------------- /mjrl/baselines/zero_baseline.py: 
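# A minimal fit/predict sketch for the baseline classes above, assuming the
# package is importable; the path contents (random observations, rewards, and
# undiscounted returns-to-go) and all sizes are illustrative assumptions.
# LinearBaseline does not use env_spec in its constructor, so None is passed.
import numpy as np
from mjrl.baselines.linear_baseline import LinearBaseline

T, obs_dim = 50, 8
paths = []
for _ in range(4):
    rewards = np.random.randn(T)
    paths.append(dict(observations=np.random.randn(T, obs_dim),
                      rewards=rewards,
                      returns=np.cumsum(rewards[::-1])[::-1]))

baseline = LinearBaseline(env_spec=None)
error_before, error_after = baseline.fit(paths, return_errors=True)
values = baseline.predict(paths[0])            # per-timestep value estimates, shape (T,)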
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | 4 | class ZeroBaseline: 5 | def __init__(self, env_spec, **kwargs): 6 | n = env_spec.observation_dim # number of states 7 | self._coeffs = None 8 | 9 | def fit(self, paths, return_errors=False): 10 | if return_errors: 11 | return 1.0, 1.0 12 | 13 | def predict(self, path): 14 | return np.zeros(len(path["rewards"])) 15 | -------------------------------------------------------------------------------- /mjrl/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | # ---------------------------------------- 4 | # mjrl environments 5 | # ---------------------------------------- 6 | 7 | register( 8 | id='mjrl_point_mass-v0', 9 | entry_point='mjrl.envs:PointMassEnv', 10 | max_episode_steps=25, 11 | ) 12 | 13 | register( 14 | id='mjrl_swimmer-v0', 15 | entry_point='mjrl.envs:SwimmerEnv', 16 | max_episode_steps=500, 17 | ) 18 | 19 | register( 20 | id='mjrl_reacher_7dof-v0', 21 | entry_point='mjrl.envs:Reacher7DOFEnv', 22 | max_episode_steps=50, 23 | ) 24 | 25 | register( 26 | id='mjrl_peg_insertion-v0', 27 | entry_point='mjrl.envs:PegEnv', 28 | max_episode_steps=50, 29 | ) 30 | 31 | from mjrl.envs.mujoco_env import MujocoEnv 32 | # ^^^^^ so that user gets the correct error 33 | # message if mujoco is not installed correctly 34 | from mjrl.envs.point_mass import PointMassEnv 35 | from mjrl.envs.swimmer import SwimmerEnv 36 | from mjrl.envs.reacher_sawyer import Reacher7DOFEnv 37 | from mjrl.envs.peg_insertion_sawyer import PegEnv 38 | -------------------------------------------------------------------------------- /mjrl/envs/assets/peg_insertion.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /mjrl/envs/assets/point_mass.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 44 | -------------------------------------------------------------------------------- /mjrl/envs/assets/sawyer.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /mjrl/envs/assets/swimmer.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 67 | -------------------------------------------------------------------------------- /mjrl/envs/mujoco_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from gym import error, spaces 4 | from gym.utils import seeding 5 | import numpy as np 6 | from os import path 7 | import gym 8 | import six 9 | import time as timer 10 | 11 | try: 12 | import mujoco_py 13 | from mujoco_py import load_model_from_path, MjSim, MjViewer 14 | except ImportError as e: 15 | raise error.DependencyNotInstalled("{}. 
(HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) 16 | 17 | def get_sim(model_path): 18 | if model_path.startswith("/"): 19 | fullpath = model_path 20 | else: 21 | fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path) 22 | if not path.exists(fullpath): 23 | raise IOError("File %s does not exist" % fullpath) 24 | model = load_model_from_path(fullpath) 25 | return MjSim(model) 26 | 27 | class MujocoEnv(gym.Env): 28 | """Superclass for all MuJoCo environments. 29 | """ 30 | 31 | def __init__(self, model_path=None, frame_skip=1, sim=None): 32 | 33 | if sim is None: 34 | self.sim = get_sim(model_path) 35 | else: 36 | self.sim = sim 37 | self.data = self.sim.data 38 | self.model = self.sim.model 39 | 40 | self.frame_skip = frame_skip 41 | self.metadata = { 42 | 'render.modes': ['human', 'rgb_array'], 43 | 'video.frames_per_second': int(np.round(1.0 / self.dt)) 44 | } 45 | self.mujoco_render_frames = False 46 | 47 | self.init_qpos = self.data.qpos.ravel().copy() 48 | self.init_qvel = self.data.qvel.ravel().copy() 49 | try: 50 | observation, _reward, done, _info = self.step(np.zeros(self.model.nu)) 51 | except NotImplementedError: 52 | observation, _reward, done, _info = self._step(np.zeros(self.model.nu)) 53 | assert not done 54 | self.obs_dim = np.sum([o.size for o in observation]) if type(observation) is tuple else observation.size 55 | 56 | bounds = self.model.actuator_ctrlrange.copy() 57 | low = bounds[:, 0] 58 | high = bounds[:, 1] 59 | self.action_space = spaces.Box(low, high, dtype=np.float32) 60 | 61 | high = np.inf*np.ones(self.obs_dim) 62 | low = -high 63 | self.observation_space = spaces.Box(low, high, dtype=np.float32) 64 | 65 | self.seed() 66 | 67 | def seed(self, seed=None): 68 | self.np_random, seed = seeding.np_random(seed) 69 | return [seed] 70 | 71 | # methods to override: 72 | # ---------------------------- 73 | 74 | def reset_model(self): 75 | """ 76 | Reset the robot degrees of freedom (qpos and qvel). 77 | Implement this in each subclass. 78 | """ 79 | raise NotImplementedError 80 | 81 | def mj_viewer_setup(self): 82 | """ 83 | Due to specifics of new mujoco rendering, the standard viewer cannot be used 84 | with this set-up. Instead we use this mujoco specific function. 85 | """ 86 | pass 87 | 88 | def viewer_setup(self): 89 | """ 90 | Does not work. 
Use mj_viewer_setup() instead 91 | """ 92 | pass 93 | 94 | def evaluate_success(self, paths, logger=None): 95 | """ 96 | Log various success metrics calculated based on input paths into the logger 97 | """ 98 | pass 99 | 100 | # ----------------------------- 101 | 102 | def reset(self): 103 | self.sim.reset() 104 | self.sim.forward() 105 | ob = self.reset_model() 106 | return ob 107 | 108 | def set_state(self, qpos, qvel): 109 | assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,) 110 | old_state = self.sim.get_state() 111 | new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel, 112 | old_state.act, old_state.udd_state) 113 | self.sim.set_state(new_state) 114 | self.sim.forward() 115 | 116 | @property 117 | def dt(self): 118 | return self.model.opt.timestep * self.frame_skip 119 | 120 | def do_simulation(self, ctrl, n_frames): 121 | for i in range(self.model.nu): 122 | self.sim.data.ctrl[i] = ctrl[i] 123 | for _ in range(n_frames): 124 | self.sim.step() 125 | if self.mujoco_render_frames is True: 126 | self.mj_render() 127 | 128 | def mj_render(self): 129 | try: 130 | self.viewer.render() 131 | except: 132 | self.mj_viewer_setup() 133 | self.viewer._run_speed = 0.5 134 | #self.viewer._run_speed /= self.frame_skip 135 | self.viewer.render() 136 | 137 | def render(self, *args, **kwargs): 138 | pass 139 | #return self.mj_render() 140 | 141 | def _get_viewer(self): 142 | pass 143 | #return None 144 | 145 | def state_vector(self): 146 | state = self.sim.get_state() 147 | return np.concatenate([ 148 | state.qpos.flat, state.qvel.flat]) 149 | 150 | # ----------------------------- 151 | 152 | def visualize_policy(self, policy, horizon=1000, num_episodes=1, mode='exploration'): 153 | self.mujoco_render_frames = True 154 | for ep in range(num_episodes): 155 | o = self.reset() 156 | d = False 157 | t = 0 158 | score = 0.0 159 | while t < horizon and d is False: 160 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 161 | o, r, d, _ = self.step(a) 162 | t = t+1 163 | score = score + r 164 | print("Episode score = %f" % score) 165 | self.mujoco_render_frames = False 166 | 167 | def visualize_policy_offscreen(self, policy, horizon=1000, 168 | num_episodes=1, 169 | frame_size=(640,480), 170 | mode='exploration', 171 | save_loc='/tmp/', 172 | filename='newvid', 173 | camera_name=None): 174 | import skvideo.io 175 | for ep in range(num_episodes): 176 | print("Episode %d: rendering offline " % ep, end='', flush=True) 177 | o = self.reset() 178 | d = False 179 | t = 0 180 | arrs = [] 181 | t0 = timer.time() 182 | while t < horizon and d is False: 183 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 184 | o, r, d, _ = self.step(a) 185 | t = t+1 186 | curr_frame = self.sim.render(width=frame_size[0], height=frame_size[1], 187 | mode='offscreen', camera_name=camera_name, device_id=0) 188 | arrs.append(curr_frame[::-1,:,:]) 189 | print(t, end=', ', flush=True) 190 | file_name = save_loc + filename + str(ep) + ".mp4" 191 | skvideo.io.vwrite( file_name, np.asarray(arrs)) 192 | print("saved", file_name) 193 | t1 = timer.time() 194 | print("time taken = %f"% (t1-t0)) 195 | -------------------------------------------------------------------------------- /mjrl/envs/peg_insertion_sawyer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 
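# A minimal sketch of the subclass contract defined by MujocoEnv above,
# assuming a hypothetical asset 'my_task.xml' under mjrl/envs/assets and a
# placeholder reward; PegEnv below is a complete example of the same pattern.
import numpy as np
from gym import utils
from mjrl.envs import mujoco_env

class MyTaskEnv(mujoco_env.MujocoEnv, utils.EzPickle):
    def __init__(self):
        # MujocoEnv.__init__ calls step() once, so step()/get_obs() must not
        # depend on attributes that are only set after this call returns.
        mujoco_env.MujocoEnv.__init__(self, 'my_task.xml', 5)
        utils.EzPickle.__init__(self)

    def step(self, a):
        self.do_simulation(a, self.frame_skip)
        obs = self.get_obs()
        reward = -np.linalg.norm(obs[:2])      # placeholder objective
        return obs, reward, False, {}

    def get_obs(self):
        return np.concatenate([self.data.qpos.ravel(), self.data.qvel.ravel()])

    def reset_model(self):
        self.set_state(self.init_qpos, self.init_qvel)
        return self.get_obs()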
| class PegEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.peg_sid = -2 10 | self.target_sid = -1 11 | mujoco_env.MujocoEnv.__init__(self, 'peg_insertion.xml', 4) 12 | utils.EzPickle.__init__(self) 13 | self.peg_sid = self.model.site_name2id("peg_bottom") 14 | self.target_sid = self.model.site_name2id("target") 15 | self.init_body_pos = self.model.body_pos.copy() 16 | 17 | def step(self, a): 18 | self.do_simulation(a, self.frame_skip) 19 | obs = self.get_obs() 20 | reward = self.get_reward(obs, a) 21 | return obs, reward, False, self.get_env_infos() 22 | 23 | def get_obs(self): 24 | return np.concatenate([ 25 | self.data.qpos.flat, 26 | self.data.qvel.flat, 27 | self.data.site_xpos[self.peg_sid], 28 | self.data.site_xpos[self.target_sid], 29 | ]) 30 | 31 | def get_reward(self, obs, act=None): 32 | obs = np.clip(obs, -10.0, 10.0) 33 | if len(obs.shape) == 1: 34 | # vector obs, called when stepping the env 35 | hand_pos = obs[-6:-3] 36 | target_pos = obs[-3:] 37 | l1_dist = np.sum(np.abs(hand_pos - target_pos)) 38 | l2_dist = np.linalg.norm(hand_pos - target_pos) 39 | else: 40 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 41 | hand_pos = obs[:, :, -6:-3] 42 | target_pos = obs[:, :, -3:] 43 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1) 44 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1) 45 | bonus = 5.0 * (l2_dist < 0.06) 46 | reward = - l1_dist - 5.0 * l2_dist + bonus 47 | return reward 48 | 49 | def compute_path_rewards(self, paths): 50 | # path has two keys: observations and actions 51 | # path["observations"] : (num_traj, horizon, obs_dim) 52 | # path["rewards"] should have shape (num_traj, horizon) 53 | obs = paths["observations"] 54 | rewards = self.get_reward(obs) 55 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 56 | 57 | # -------------------------------- 58 | # resets and randomization 59 | # -------------------------------- 60 | 61 | def robot_reset(self): 62 | self.set_state(self.init_qpos, self.init_qvel) 63 | 64 | def target_reset(self): 65 | # Randomize goal position 66 | goal_y = self.np_random.uniform(low=0.1, high=0.5) 67 | try: 68 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29) 69 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29) 70 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29) 71 | self.sim.forward() 72 | except: 73 | pass 74 | 75 | def reset_model(self, seed=None): 76 | if seed is not None: 77 | self.seeding = True 78 | self.seed(seed) 79 | self.robot_reset() 80 | self.target_reset() 81 | return self.get_obs() 82 | 83 | # -------------------------------- 84 | # get and set states 85 | # -------------------------------- 86 | 87 | def get_env_state(self): 88 | target_pos = self.model.body_pos[-1].copy() 89 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 90 | target_pos=target_pos) 91 | 92 | def set_env_state(self, state): 93 | self.sim.reset() 94 | qp = state['qp'].copy() 95 | qv = state['qv'].copy() 96 | target_pos = state['target_pos'] 97 | self.model.body_pos[-1] = target_pos 98 | goal_y = target_pos[1] 99 | self.data.qpos[:] = qp 100 | self.data.qvel[:] = qv 101 | self.model.body_pos[-1,1] = self.init_body_pos[-1,1] + (goal_y-0.29) 102 | self.model.body_pos[-2,1] = self.init_body_pos[-2,1] + (goal_y-0.29) 103 | self.model.body_pos[-3,1] = self.init_body_pos[-3,1] + (goal_y-0.29) 104 | self.sim.forward() 105 | 106 | # -------------------------------- 107 | # utility functions 108 | # 
-------------------------------- 109 | 110 | def get_env_infos(self): 111 | return dict(state=self.get_env_state()) 112 | 113 | def mj_viewer_setup(self): 114 | self.viewer = MjViewer(self.sim) 115 | self.viewer.cam.azimuth += 200 116 | self.sim.forward() 117 | self.viewer.cam.distance = self.model.stat.extent*2.0 118 | -------------------------------------------------------------------------------- /mjrl/envs/point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 | class PointMassEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.agent_bid = 0 10 | self.target_sid = 0 11 | utils.EzPickle.__init__(self) 12 | mujoco_env.MujocoEnv.__init__(self, 'point_mass.xml', 5) 13 | self.agent_bid = self.sim.model.body_name2id('agent') 14 | self.target_sid = self.sim.model.site_name2id('target') 15 | 16 | def step(self, a): 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self.get_obs() 19 | reward = self.get_reward(obs) 20 | return obs, reward, False, dict(solved=(reward > -0.1), state=self.get_env_state()) 21 | 22 | def get_obs(self): 23 | agent_pos = self.data.body_xpos[self.agent_bid].ravel() 24 | target_pos = self.data.site_xpos[self.target_sid].ravel() 25 | return np.concatenate([agent_pos[:2], self.data.qvel.ravel(), target_pos[:2]]) 26 | 27 | def get_reward(self, obs, act=None): 28 | if len(obs.shape) == 1: 29 | # vector obs, called when stepping the env 30 | agent_pos = obs[:2] 31 | target_pos = obs[-2:] 32 | l1_dist = np.sum(np.abs(agent_pos - target_pos)) 33 | l2_dist = np.linalg.norm(agent_pos - target_pos) 34 | else: 35 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 36 | agent_pos = obs[:, :, :2] 37 | target_pos = obs[:, :, -2:] 38 | l1_dist = np.sum(np.abs(agent_pos - target_pos), axis=-1) 39 | l2_dist = np.linalg.norm(agent_pos - target_pos, axis=-1) 40 | reward = -1.0 * l1_dist - 0.5 * l2_dist 41 | return reward 42 | 43 | def compute_path_rewards(self, paths): 44 | # path has two keys: observations and actions 45 | # path["observations"] : (num_traj, horizon, obs_dim) 46 | # path["rewards"] should have shape (num_traj, horizon) 47 | obs = paths["observations"] 48 | rewards = self.get_reward(obs) 49 | rewards[..., :-1] = rewards[..., 1:] # shift index by 1 to have r(s,a)=r(s') 50 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 51 | return paths 52 | 53 | def reset_model(self): 54 | # randomize the agent and goal 55 | agent_x = self.np_random.uniform(low=-1.0, high=1.0) 56 | agent_y = self.np_random.uniform(low=-1.0, high=1.0) 57 | goal_x = self.np_random.uniform(low=-1.0, high=1.0) 58 | goal_y = self.np_random.uniform(low=-1.0, high=1.0) 59 | qp = np.array([agent_x, agent_y]) 60 | qv = self.init_qvel.copy() 61 | self.set_state(qp, qv) 62 | self.model.site_pos[self.target_sid][0] = goal_x 63 | self.model.site_pos[self.target_sid][1] = goal_y 64 | self.sim.forward() 65 | return self.get_obs() 66 | 67 | def evaluate_success(self, paths, logger=None): 68 | success = 0.0 69 | for p in paths: 70 | if np.mean(p['env_infos']['solved'][-4:]) > 0.0: 71 | success += 1.0 72 | success_rate = 100.0*success/len(paths) 73 | if logger is None: 74 | # nowhere to log so return the value 75 | return success_rate 76 | else: 77 | # log the success 78 | # can log multiple statistics here if needed 79 | logger.log_kv('success_rate', success_rate) 80 | return None 81 
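# A short rollout sketch for the environment above, assuming MuJoCo and this
# package are installed; random actions are used, so reaching the target (and
# hence the evaluate_success criterion on the last few steps) is unlikely.
import gym
import numpy as np
import mjrl.envs                        # registers mjrl_point_mass-v0

env = gym.make('mjrl_point_mass-v0')    # horizon of 25 per the registration
obs = env.reset()
path_return, solved = 0.0, []
for _ in range(25):
    obs, rew, done, info = env.step(env.action_space.sample())
    path_return += rew
    solved.append(info['solved'])
print("return = %.2f | solved near the end = %s" % (path_return, np.mean(solved[-4:]) > 0.0))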
| 82 | # -------------------------------- 83 | # get and set states 84 | # -------------------------------- 85 | 86 | def get_env_state(self): 87 | target_pos = self.model.site_pos[self.target_sid].copy() 88 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 89 | target_pos=target_pos) 90 | 91 | def set_env_state(self, state): 92 | self.sim.reset() 93 | qp = state['qp'].copy() 94 | qv = state['qv'].copy() 95 | target_pos = state['target_pos'] 96 | self.set_state(qp, qv) 97 | self.model.site_pos[self.target_sid] = target_pos 98 | self.sim.forward() 99 | 100 | # -------------------------------- 101 | # utility functions 102 | # -------------------------------- 103 | 104 | def get_env_infos(self): 105 | return dict(state=self.get_env_state()) 106 | 107 | def mj_viewer_setup(self): 108 | self.viewer = MjViewer(self.sim) 109 | self.sim.forward() 110 | -------------------------------------------------------------------------------- /mjrl/envs/reacher_sawyer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | 7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | self.hand_sid = -2 10 | self.target_sid = -1 11 | mujoco_env.MujocoEnv.__init__(self, 'sawyer.xml', 4) 12 | utils.EzPickle.__init__(self) 13 | self.hand_sid = self.model.site_name2id("finger") 14 | self.target_sid = self.model.site_name2id("target") 15 | 16 | def step(self, a): 17 | self.do_simulation(a, self.frame_skip) 18 | obs = self.get_obs() 19 | reward = self.get_reward(obs, a) 20 | return obs, reward, False, self.get_env_infos() 21 | 22 | def get_obs(self): 23 | return np.concatenate([ 24 | self.data.qpos.flat, 25 | self.data.qvel.ravel() * self.dt, # delta_x instead of velocity 26 | self.data.site_xpos[self.hand_sid], 27 | self.data.site_xpos[self.target_sid], 28 | ]) 29 | 30 | def get_reward(self, obs, act=None): 31 | obs = np.clip(obs, -10.0, 10.0) 32 | if len(obs.shape) == 1: 33 | # vector obs, called when stepping the env 34 | hand_pos = obs[-6:-3] 35 | target_pos = obs[-3:] 36 | l1_dist = np.sum(np.abs(hand_pos - target_pos)) 37 | l2_dist = np.linalg.norm(hand_pos - target_pos) 38 | else: 39 | obs = np.expand_dims(obs, axis=0) if len(obs.shape) == 2 else obs 40 | hand_pos = obs[:, :, -6:-3] 41 | target_pos = obs[:, :, -3:] 42 | l1_dist = np.sum(np.abs(hand_pos - target_pos), axis=-1) 43 | l2_dist = np.linalg.norm(hand_pos - target_pos, axis=-1) 44 | reward = - l1_dist - 5.0 * l2_dist 45 | return reward 46 | 47 | def compute_path_rewards(self, paths): 48 | # path has two keys: observations and actions 49 | # path["observations"] : (num_traj, horizon, obs_dim) 50 | # path["rewards"] should have shape (num_traj, horizon) 51 | obs = paths["observations"] 52 | rewards = self.get_reward(obs) 53 | paths["rewards"] = rewards if rewards.shape[0] > 1 else rewards.ravel() 54 | 55 | # -------------------------------- 56 | # resets and randomization 57 | # -------------------------------- 58 | 59 | def robot_reset(self): 60 | self.set_state(self.init_qpos, self.init_qvel) 61 | 62 | def target_reset(self): 63 | target_pos = np.array([0.1, 0.1, 0.1]) 64 | target_pos[0] = self.np_random.uniform(low=-0.3, high=0.3) 65 | target_pos[1] = self.np_random.uniform(low=-0.2, high=0.2) 66 | target_pos[2] = self.np_random.uniform(low=-0.25, high=0.25) 67 | self.model.site_pos[self.target_sid] = target_pos 68 | self.sim.forward() 69 | 70 | 
def reset_model(self, seed=None): 71 | if seed is not None: 72 | self.seeding = True 73 | self.seed(seed) 74 | self.robot_reset() 75 | self.target_reset() 76 | return self.get_obs() 77 | 78 | # -------------------------------- 79 | # get and set states 80 | # -------------------------------- 81 | 82 | def get_env_state(self): 83 | target_pos = self.model.site_pos[self.target_sid].copy() 84 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy(), 85 | target_pos=target_pos) 86 | 87 | def set_env_state(self, state): 88 | self.sim.reset() 89 | qp = state['qp'].copy() 90 | qv = state['qv'].copy() 91 | target_pos = state['target_pos'] 92 | self.model.site_pos[self.target_sid] = target_pos 93 | self.data.qpos[:] = qp 94 | self.data.qvel[:] = qv 95 | self.sim.forward() 96 | 97 | # -------------------------------- 98 | # utility functions 99 | # -------------------------------- 100 | 101 | def get_env_infos(self): 102 | return dict(state=self.get_env_state()) 103 | 104 | def mj_viewer_setup(self): 105 | self.viewer = MjViewer(self.sim) 106 | self.viewer.cam.trackbodyid = 1 107 | self.viewer.cam.type = 1 108 | self.sim.forward() 109 | self.viewer.cam.distance = self.model.stat.extent * 2.0 110 | -------------------------------------------------------------------------------- /mjrl/envs/swimmer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from mjrl.envs import mujoco_env 4 | from mujoco_py import MjViewer 5 | 6 | class SwimmerEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'swimmer.xml', 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, a): 12 | xposbefore = self.data.qpos[0] 13 | self.do_simulation(a, self.frame_skip) 14 | xposafter = self.data.qpos[0] 15 | 16 | delta = (xposafter - xposbefore) 17 | # make agent move in the negative x direction 18 | reward = -10.0 * delta 19 | done = False 20 | 21 | ob = self.get_obs() 22 | return ob, reward, done, self.get_env_infos() 23 | 24 | def get_obs(self): 25 | return np.concatenate([ 26 | self.data.qpos.flat[2:], 27 | self.data.qvel.flat, 28 | ]) 29 | 30 | def reset_model(self): 31 | qpos_init = self.init_qpos.copy() 32 | qpos_init[2] = self.np_random.uniform(low=-np.pi, high=np.pi) 33 | self.set_state(qpos_init, self.init_qvel) 34 | self.sim.forward() 35 | return self.get_obs() 36 | 37 | # -------------------------------- 38 | # get and set states 39 | # -------------------------------- 40 | 41 | def get_env_state(self): 42 | return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy()) 43 | 44 | def set_env_state(self, state): 45 | self.sim.reset() 46 | qp = state['qp'].copy() 47 | qv = state['qv'].copy() 48 | self.set_state(qp, qv) 49 | self.sim.forward() 50 | 51 | # -------------------------------- 52 | # utility functions 53 | # -------------------------------- 54 | 55 | def get_env_infos(self): 56 | return dict(state=self.get_env_state()) 57 | 58 | def mj_viewer_setup(self): 59 | self.viewer = MjViewer(self.sim) 60 | self.viewer.cam.trackbodyid = 1 61 | self.viewer.cam.type = 1 62 | self.sim.forward() 63 | self.viewer.cam.distance = self.model.stat.extent*1.2 -------------------------------------------------------------------------------- /mjrl/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/policies/__init__.py 
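# A short sketch of the get_env_state / set_env_state pattern shared by the
# environments above, using the swimmer; assumes MuJoCo and this package are
# installed. The TimeLimit wrapper added by gym.make is unwrapped to reach
# the methods defined on SwimmerEnv.
import gym
import numpy as np
import mjrl.envs                              # registers mjrl_swimmer-v0

env = gym.make('mjrl_swimmer-v0').unwrapped
env.reset()
snapshot = env.get_env_state()                # dict with copies of qpos / qvel
for _ in range(10):
    env.step(env.action_space.sample())
env.set_env_state(snapshot)                   # rewind the simulation
assert np.allclose(env.get_env_state()['qp'], snapshot['qp'])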
-------------------------------------------------------------------------------- /mjrl/policies/gaussian_linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from mjrl.utils.fc_network import FCNetwork 7 | 8 | 9 | class LinearPolicy: 10 | def __init__(self, env_spec, 11 | min_log_std=-3, 12 | init_log_std=0, 13 | seed=None): 14 | """ 15 | :param env_spec: specifications of the env (see utils/gym_env.py) 16 | :param min_log_std: log_std is clamped at this value and can't go below 17 | :param init_log_std: initial log standard deviation 18 | :param seed: random seed 19 | """ 20 | self.n = env_spec.observation_dim # number of states 21 | self.m = env_spec.action_dim # number of actions 22 | self.min_log_std = min_log_std 23 | 24 | # Set seed 25 | # ------------------------ 26 | if seed is not None: 27 | torch.manual_seed(seed) 28 | np.random.seed(seed) 29 | 30 | # Policy network 31 | # ------------------------ 32 | self.model = FCNetwork(self.n, self.m, hidden_sizes=()) 33 | # make weights small 34 | for param in list(self.model.parameters())[-2:]: # only last layer 35 | param.data = 1e-2 * param.data 36 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True) 37 | self.trainable_params = list(self.model.parameters()) + [self.log_std] 38 | 39 | # Old Policy network 40 | # ------------------------ 41 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes=()) 42 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std) 43 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std] 44 | for idx, param in enumerate(self.old_params): 45 | param.data = self.trainable_params[idx].data.clone() 46 | 47 | # Easy access variables 48 | # ------------------------- 49 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 50 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params] 51 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params] 52 | self.d = np.sum(self.param_sizes) # total number of params 53 | 54 | # Placeholders 55 | # ------------------------ 56 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False) 57 | 58 | # Utility functions 59 | # ============================================ 60 | def get_param_values(self): 61 | params = np.concatenate([p.contiguous().view(-1).data.numpy() 62 | for p in self.trainable_params]) 63 | return params.copy() 64 | 65 | def set_param_values(self, new_params, set_new=True, set_old=True): 66 | if set_new: 67 | current_idx = 0 68 | for idx, param in enumerate(self.trainable_params): 69 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 70 | vals = vals.reshape(self.param_shapes[idx]) 71 | param.data = torch.from_numpy(vals).float() 72 | current_idx += self.param_sizes[idx] 73 | # clip std at minimum value 74 | self.trainable_params[-1].data = \ 75 | torch.clamp(self.trainable_params[-1], self.min_log_std).data 76 | # update log_std_val for sampling 77 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 78 | if set_old: 79 | current_idx = 0 80 | for idx, param in enumerate(self.old_params): 81 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 82 | vals = vals.reshape(self.param_shapes[idx]) 83 | param.data = torch.from_numpy(vals).float() 84 | current_idx += self.param_sizes[idx] 85 | # clip std at minimum value 86 | 
self.old_params[-1].data = \ 87 | torch.clamp(self.old_params[-1], self.min_log_std).data 88 | 89 | # Main functions 90 | # ============================================ 91 | def get_action(self, observation): 92 | o = np.float32(observation.reshape(1, -1)) 93 | self.obs_var.data = torch.from_numpy(o) 94 | mean = self.model(self.obs_var).data.numpy().ravel() 95 | noise = np.exp(self.log_std_val) * np.random.randn(self.m) 96 | action = mean + noise 97 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}] 98 | 99 | def mean_LL(self, observations, actions, model=None, log_std=None): 100 | model = self.model if model is None else model 101 | log_std = self.log_std if log_std is None else log_std 102 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False) 103 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False) 104 | mean = model(obs_var) 105 | zs = (act_var - mean) / torch.exp(log_std) 106 | LL = - 0.5 * torch.sum(zs ** 2, dim=1) + \ 107 | - torch.sum(log_std) + \ 108 | - 0.5 * self.m * np.log(2 * np.pi) 109 | return mean, LL 110 | 111 | def log_likelihood(self, observations, actions, model=None, log_std=None): 112 | mean, LL = self.mean_LL(observations, actions, model, log_std) 113 | return LL.data.numpy() 114 | 115 | def old_dist_info(self, observations, actions): 116 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std) 117 | return [LL, mean, self.old_log_std] 118 | 119 | def new_dist_info(self, observations, actions): 120 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std) 121 | return [LL, mean, self.log_std] 122 | 123 | def likelihood_ratio(self, new_dist_info, old_dist_info): 124 | LL_old = old_dist_info[0] 125 | LL_new = new_dist_info[0] 126 | LR = torch.exp(LL_new - LL_old) 127 | return LR 128 | 129 | def mean_kl(self, new_dist_info, old_dist_info): 130 | old_log_std = old_dist_info[2] 131 | new_log_std = new_dist_info[2] 132 | old_std = torch.exp(old_log_std) 133 | new_std = torch.exp(new_log_std) 134 | old_mean = old_dist_info[1] 135 | new_mean = new_dist_info[1] 136 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2 137 | Dr = 2 * new_std ** 2 + 1e-8 138 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1) 139 | return torch.mean(sample_kl) 140 | -------------------------------------------------------------------------------- /mjrl/policies/gaussian_mlp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mjrl.utils.fc_network import FCNetwork 3 | import torch 4 | from torch.autograd import Variable 5 | 6 | 7 | class MLP: 8 | def __init__(self, env_spec, 9 | hidden_sizes=(64,64), 10 | min_log_std=-3, 11 | init_log_std=0, 12 | seed=None): 13 | """ 14 | :param env_spec: specifications of the env (see utils/gym_env.py) 15 | :param hidden_sizes: network hidden layer sizes (currently 2 layers only) 16 | :param min_log_std: log_std is clamped at this value and can't go below 17 | :param init_log_std: initial log standard deviation 18 | :param seed: random seed 19 | """ 20 | self.n = env_spec.observation_dim # number of states 21 | self.m = env_spec.action_dim # number of actions 22 | self.min_log_std = min_log_std 23 | 24 | # Set seed 25 | # ------------------------ 26 | if seed is not None: 27 | torch.manual_seed(seed) 28 | np.random.seed(seed) 29 | 30 | # Policy network 31 | # ------------------------ 32 | self.model = FCNetwork(self.n, self.m, hidden_sizes) 33 | # 
make weights small 34 | for param in list(self.model.parameters())[-2:]: # only last layer 35 | param.data = 1e-2 * param.data 36 | self.log_std = Variable(torch.ones(self.m) * init_log_std, requires_grad=True) 37 | self.trainable_params = list(self.model.parameters()) + [self.log_std] 38 | 39 | # Old Policy network 40 | # ------------------------ 41 | self.old_model = FCNetwork(self.n, self.m, hidden_sizes) 42 | self.old_log_std = Variable(torch.ones(self.m) * init_log_std) 43 | self.old_params = list(self.old_model.parameters()) + [self.old_log_std] 44 | for idx, param in enumerate(self.old_params): 45 | param.data = self.trainable_params[idx].data.clone() 46 | 47 | # Easy access variables 48 | # ------------------------- 49 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 50 | self.param_shapes = [p.data.numpy().shape for p in self.trainable_params] 51 | self.param_sizes = [p.data.numpy().size for p in self.trainable_params] 52 | self.d = np.sum(self.param_sizes) # total number of params 53 | 54 | # Placeholders 55 | # ------------------------ 56 | self.obs_var = Variable(torch.randn(self.n), requires_grad=False) 57 | 58 | # Utility functions 59 | # ============================================ 60 | def get_param_values(self): 61 | params = np.concatenate([p.contiguous().view(-1).data.numpy() 62 | for p in self.trainable_params]) 63 | return params.copy() 64 | 65 | def set_param_values(self, new_params, set_new=True, set_old=True): 66 | if set_new: 67 | current_idx = 0 68 | for idx, param in enumerate(self.trainable_params): 69 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 70 | vals = vals.reshape(self.param_shapes[idx]) 71 | param.data = torch.from_numpy(vals).float() 72 | current_idx += self.param_sizes[idx] 73 | # clip std at minimum value 74 | self.trainable_params[-1].data = \ 75 | torch.clamp(self.trainable_params[-1], self.min_log_std).data 76 | # update log_std_val for sampling 77 | self.log_std_val = np.float64(self.log_std.data.numpy().ravel()) 78 | if set_old: 79 | current_idx = 0 80 | for idx, param in enumerate(self.old_params): 81 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 82 | vals = vals.reshape(self.param_shapes[idx]) 83 | param.data = torch.from_numpy(vals).float() 84 | current_idx += self.param_sizes[idx] 85 | # clip std at minimum value 86 | self.old_params[-1].data = \ 87 | torch.clamp(self.old_params[-1], self.min_log_std).data 88 | 89 | # Main functions 90 | # ============================================ 91 | def get_action(self, observation): 92 | o = np.float32(observation.reshape(1, -1)) 93 | self.obs_var.data = torch.from_numpy(o) 94 | mean = self.model(self.obs_var).data.numpy().ravel() 95 | noise = np.exp(self.log_std_val) * np.random.randn(self.m) 96 | action = mean + noise 97 | return [action, {'mean': mean, 'log_std': self.log_std_val, 'evaluation': mean}] 98 | 99 | def mean_LL(self, observations, actions, model=None, log_std=None): 100 | model = self.model if model is None else model 101 | log_std = self.log_std if log_std is None else log_std 102 | if type(observations) is not torch.Tensor: 103 | obs_var = Variable(torch.from_numpy(observations).float(), requires_grad=False) 104 | else: 105 | obs_var = observations 106 | if type(actions) is not torch.Tensor: 107 | act_var = Variable(torch.from_numpy(actions).float(), requires_grad=False) 108 | else: 109 | act_var = actions 110 | mean = model(obs_var) 111 | zs = (act_var - mean) / torch.exp(log_std) 112 | LL = - 0.5 * torch.sum(zs ** 2, 
dim=1) + \ 113 | - torch.sum(log_std) + \ 114 | - 0.5 * self.m * np.log(2 * np.pi) 115 | return mean, LL 116 | 117 | def log_likelihood(self, observations, actions, model=None, log_std=None): 118 | mean, LL = self.mean_LL(observations, actions, model, log_std) 119 | return LL.data.numpy() 120 | 121 | def old_dist_info(self, observations, actions): 122 | mean, LL = self.mean_LL(observations, actions, self.old_model, self.old_log_std) 123 | return [LL, mean, self.old_log_std] 124 | 125 | def new_dist_info(self, observations, actions): 126 | mean, LL = self.mean_LL(observations, actions, self.model, self.log_std) 127 | return [LL, mean, self.log_std] 128 | 129 | def likelihood_ratio(self, new_dist_info, old_dist_info): 130 | LL_old = old_dist_info[0] 131 | LL_new = new_dist_info[0] 132 | LR = torch.exp(LL_new - LL_old) 133 | return LR 134 | 135 | def mean_kl(self, new_dist_info, old_dist_info): 136 | old_log_std = old_dist_info[2] 137 | new_log_std = new_dist_info[2] 138 | old_std = torch.exp(old_log_std) 139 | new_std = torch.exp(new_log_std) 140 | old_mean = old_dist_info[1] 141 | new_mean = new_dist_info[1] 142 | Nr = (old_mean - new_mean) ** 2 + old_std ** 2 - new_std ** 2 143 | Dr = 2 * new_std ** 2 + 1e-8 144 | sample_kl = torch.sum(Nr / Dr + new_log_std - old_log_std, dim=1) 145 | return torch.mean(sample_kl) 146 | -------------------------------------------------------------------------------- /mjrl/policies/mpc_actor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from trajopt.utils import gather_paths_parallel 3 | 4 | 5 | class MPCActor(object): 6 | def __init__(self, env, H, paths_per_cpu, 7 | num_cpu=1, 8 | kappa=1.0, 9 | gamma=1.0, 10 | mean=None, 11 | filter_coefs=None, 12 | seed=123, 13 | ): 14 | 15 | self.env, self.seed = env, seed 16 | self.n, self.m = env.observation_dim, env.action_dim 17 | self.H, self.paths_per_cpu, self.num_cpu = H, paths_per_cpu, num_cpu 18 | 19 | self.mean, self.filter_coefs, self.kappa, self.gamma = mean, filter_coefs, kappa, gamma 20 | if mean is None: 21 | self.mean = np.zeros(self.m) 22 | if filter_coefs is None: 23 | self.filter_coefs = [np.ones(self.m), 1.0, 0.0, 0.0] 24 | 25 | self.env.reset() 26 | self.env.set_seed(seed) 27 | self.env.reset(seed=seed) 28 | self.act_sequence = np.ones((self.H, self.m)) * self.mean 29 | self.ctr = 1 30 | 31 | def score_trajectory(self, paths): 32 | scores = np.zeros(len(paths)) 33 | for i in range(len(paths)): 34 | scores[i] = 0.0 35 | for t in range(paths[i]["rewards"].shape[0]): 36 | scores[i] += (self.gamma**t)*paths[i]["rewards"][t] 37 | return scores 38 | 39 | def get_action(self, env_state): 40 | # Set to env_state 41 | # Shoot trajectories 42 | # Return optimal action 43 | seed = self.seed + self.ctr * 1000 44 | paths = gather_paths_parallel(self.env.env_id, 45 | env_state, 46 | self.act_sequence, 47 | self.filter_coefs, 48 | seed, 49 | self.paths_per_cpu, 50 | self.num_cpu, 51 | ) 52 | 53 | num_traj = len(paths) 54 | R = self.score_trajectory(paths) 55 | S = np.exp(self.kappa*(R-np.max(R))) 56 | act = np.sum([paths[i]["actions"][0] * S[i] for i in range(num_traj)], axis=0) 57 | act = act / (np.sum(S) + 1e-6) 58 | return act -------------------------------------------------------------------------------- /mjrl/samplers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/samplers/__init__.py 
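Both Gaussian policies above expose the same small interface (`get_action`, `log_likelihood`, the `*_dist_info` helpers, and flat parameter getters/setters) that the algorithms in `mjrl/algos` build on. A minimal sketch of that interface, not tied to any simulator (the dimensions below are arbitrary placeholders):
```
import numpy as np
from mjrl.utils.gym_env import EnvSpec
from mjrl.policies.gaussian_mlp import MLP

spec = EnvSpec(obs_dim=10, act_dim=3, horizon=100)   # stand-in spec, no env needed
policy = MLP(spec, hidden_sizes=(32, 32), init_log_std=-0.5, seed=123)

obs = np.random.randn(10)
action, info = policy.get_action(obs)                # stochastic sample
mean_action = info['evaluation']                     # deterministic (mean) action

# batched log-likelihood of actions under the current parameters
obs_batch = np.random.randn(5, 10)
act_batch = np.random.randn(5, 3)
print(policy.log_likelihood(obs_batch, act_batch).shape)   # -> (5,)

# flat parameter vector round-trip, as used by the natural gradient updates
theta = policy.get_param_values()
policy.set_param_values(theta)
```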
-------------------------------------------------------------------------------- /mjrl/samplers/core.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | from mjrl.utils.gym_env import GymEnv 4 | from mjrl.utils import tensor_utils 5 | logging.disable(logging.CRITICAL) 6 | import multiprocessing as mp 7 | import time as timer 8 | logging.disable(logging.CRITICAL) 9 | 10 | 11 | # Single core rollout to sample trajectories 12 | # ======================================================= 13 | def do_rollout( 14 | num_traj, 15 | env, 16 | policy, 17 | eval_mode = False, 18 | horizon = 1e6, 19 | base_seed = None, 20 | env_kwargs=None, 21 | ): 22 | """ 23 | :param num_traj: number of trajectories (int) 24 | :param env: environment (env class, str with env_name, or factory function) 25 | :param policy: policy to use for action selection 26 | :param eval_mode: use evaluation mode for action computation (bool) 27 | :param horizon: max horizon length for rollout (<= env.horizon) 28 | :param base_seed: base seed for rollouts (int) 29 | :param env_kwargs: dictionary with parameters, will be passed to env generator 30 | :return: 31 | """ 32 | 33 | # get the correct env behavior 34 | if type(env) == str: 35 | env = GymEnv(env) 36 | elif isinstance(env, GymEnv): 37 | env = env 38 | elif callable(env): 39 | env = env(**env_kwargs) 40 | else: 41 | print("Unsupported environment format") 42 | raise AttributeError 43 | 44 | if base_seed is not None: 45 | env.set_seed(base_seed) 46 | np.random.seed(base_seed) 47 | else: 48 | np.random.seed() 49 | horizon = min(horizon, env.horizon) 50 | paths = [] 51 | 52 | for ep in range(num_traj): 53 | # seeding 54 | if base_seed is not None: 55 | seed = base_seed + ep 56 | env.set_seed(seed) 57 | np.random.seed(seed) 58 | 59 | observations=[] 60 | actions=[] 61 | rewards=[] 62 | agent_infos = [] 63 | env_infos = [] 64 | 65 | o = env.reset() 66 | done = False 67 | t = 0 68 | 69 | while t < horizon and done != True: 70 | a, agent_info = policy.get_action(o) 71 | if eval_mode: 72 | a = agent_info['evaluation'] 73 | env_info_base = env.get_env_infos() 74 | next_o, r, done, env_info_step = env.step(a) 75 | # below is important to ensure correct env_infos for the timestep 76 | env_info = env_info_step if env_info_base == {} else env_info_base 77 | observations.append(o) 78 | actions.append(a) 79 | rewards.append(r) 80 | agent_infos.append(agent_info) 81 | env_infos.append(env_info) 82 | o = next_o 83 | t += 1 84 | 85 | path = dict( 86 | observations=np.array(observations), 87 | actions=np.array(actions), 88 | rewards=np.array(rewards), 89 | agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos), 90 | env_infos=tensor_utils.stack_tensor_dict_list(env_infos), 91 | terminated=done 92 | ) 93 | paths.append(path) 94 | 95 | del(env) 96 | return paths 97 | 98 | 99 | def sample_paths( 100 | num_traj, 101 | env, 102 | policy, 103 | eval_mode = False, 104 | horizon = 1e6, 105 | base_seed = None, 106 | num_cpu = 1, 107 | max_process_time=300, 108 | max_timeouts=4, 109 | suppress_print=False, 110 | env_kwargs=None, 111 | ): 112 | 113 | num_cpu = 1 if num_cpu is None else num_cpu 114 | num_cpu = mp.cpu_count() if num_cpu == 'max' else num_cpu 115 | assert type(num_cpu) == int 116 | 117 | if num_cpu == 1: 118 | input_dict = dict(num_traj=num_traj, env=env, policy=policy, 119 | eval_mode=eval_mode, horizon=horizon, base_seed=base_seed, 120 | env_kwargs=env_kwargs) 121 | # dont invoke multiprocessing if not 
necessary 122 | return do_rollout(**input_dict) 123 | 124 | # do multiprocessing otherwise 125 | paths_per_cpu = int(np.ceil(num_traj/num_cpu)) 126 | input_dict_list= [] 127 | for i in range(num_cpu): 128 | input_dict = dict(num_traj=paths_per_cpu, env=env, policy=policy, 129 | eval_mode=eval_mode, horizon=horizon, 130 | base_seed=base_seed + i * paths_per_cpu, 131 | env_kwargs=env_kwargs) 132 | input_dict_list.append(input_dict) 133 | if suppress_print is False: 134 | start_time = timer.time() 135 | print("####### Gathering Samples #######") 136 | 137 | results = _try_multiprocess(do_rollout, input_dict_list, 138 | num_cpu, max_process_time, max_timeouts) 139 | paths = [] 140 | # result is a paths type and results is list of paths 141 | for result in results: 142 | for path in result: 143 | paths.append(path) 144 | 145 | if suppress_print is False: 146 | print("======= Samples Gathered ======= | >>>> Time taken = %f " %(timer.time()-start_time) ) 147 | 148 | return paths 149 | 150 | 151 | def sample_data_batch( 152 | num_samples, 153 | env, 154 | policy, 155 | eval_mode = False, 156 | horizon = 1e6, 157 | base_seed = None, 158 | num_cpu = 1, 159 | paths_per_call = 1, 160 | env_kwargs=None, 161 | ): 162 | 163 | num_cpu = 1 if num_cpu is None else num_cpu 164 | num_cpu = mp.cpu_count() if num_cpu == 'max' else num_cpu 165 | assert type(num_cpu) == int 166 | 167 | start_time = timer.time() 168 | print("####### Gathering Samples #######") 169 | sampled_so_far = 0 170 | paths_so_far = 0 171 | paths = [] 172 | base_seed = 123 if base_seed is None else base_seed 173 | while sampled_so_far < num_samples: 174 | base_seed = base_seed + 12345 175 | new_paths = sample_paths(paths_per_call * num_cpu, env, policy, 176 | eval_mode, horizon, base_seed, num_cpu, 177 | suppress_print=True, env_kwargs=env_kwargs) 178 | for path in new_paths: 179 | paths.append(path) 180 | paths_so_far += len(new_paths) 181 | new_samples = np.sum([len(p['rewards']) for p in new_paths]) 182 | sampled_so_far += new_samples 183 | print("======= Samples Gathered ======= | >>>> Time taken = %f " % (timer.time() - start_time)) 184 | print("................................. | >>>> # samples = %i # trajectories = %i " % ( 185 | sampled_so_far, paths_so_far)) 186 | return paths 187 | 188 | 189 | def _try_multiprocess(func, input_dict_list, num_cpu, max_process_time, max_timeouts): 190 | 191 | # Base case 192 | if max_timeouts == 0: 193 | return None 194 | 195 | pool = mp.Pool(processes=num_cpu, maxtasksperchild=None) 196 | parallel_runs = [pool.apply_async(func, kwds=input_dict) for input_dict in input_dict_list] 197 | try: 198 | results = [p.get(timeout=max_process_time) for p in parallel_runs] 199 | except Exception as e: 200 | print(str(e)) 201 | print("Timeout Error raised... 
Trying again") 202 | pool.close() 203 | pool.terminate() 204 | pool.join() 205 | return _try_multiprocess(func, input_dict_list, num_cpu, max_process_time, max_timeouts-1) 206 | 207 | pool.close() 208 | pool.terminate() 209 | pool.join() 210 | return results 211 | -------------------------------------------------------------------------------- /mjrl/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aravindr93/mjrl/3871d93763d3b49c4741e6daeaebbc605fe140dc/mjrl/utils/__init__.py -------------------------------------------------------------------------------- /mjrl/utils/cg_solve.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def cg_solve(f_Ax, b, x_0=None, cg_iters=10, residual_tol=1e-10): 4 | x = np.zeros_like(b) #if x_0 is None else x_0 5 | r = b.copy() #if x_0 is None else b-f_Ax(x_0) 6 | p = r.copy() 7 | rdotr = r.dot(r) 8 | 9 | for i in range(cg_iters): 10 | z = f_Ax(p) 11 | v = rdotr / p.dot(z) 12 | x += v * p 13 | r -= v * z 14 | newrdotr = r.dot(r) 15 | mu = newrdotr / rdotr 16 | p = r + mu * p 17 | 18 | rdotr = newrdotr 19 | if rdotr < residual_tol: 20 | break 21 | 22 | return x 23 | -------------------------------------------------------------------------------- /mjrl/utils/fc_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class FCNetwork(nn.Module): 7 | def __init__(self, obs_dim, act_dim, 8 | hidden_sizes=(64,64), 9 | nonlinearity='tanh', # either 'tanh' or 'relu' 10 | in_shift = None, 11 | in_scale = None, 12 | out_shift = None, 13 | out_scale = None): 14 | super(FCNetwork, self).__init__() 15 | 16 | self.obs_dim = obs_dim 17 | self.act_dim = act_dim 18 | assert type(hidden_sizes) == tuple 19 | self.layer_sizes = (obs_dim, ) + hidden_sizes + (act_dim, ) 20 | self.set_transformations(in_shift, in_scale, out_shift, out_scale) 21 | 22 | # hidden layers 23 | self.fc_layers = nn.ModuleList([nn.Linear(self.layer_sizes[i], self.layer_sizes[i+1]) \ 24 | for i in range(len(self.layer_sizes) -1)]) 25 | self.nonlinearity = torch.relu if nonlinearity == 'relu' else torch.tanh 26 | 27 | def set_transformations(self, in_shift=None, in_scale=None, out_shift=None, out_scale=None): 28 | # store native scales that can be used for resets 29 | self.transformations = dict(in_shift=in_shift, 30 | in_scale=in_scale, 31 | out_shift=out_shift, 32 | out_scale=out_scale 33 | ) 34 | self.in_shift = torch.from_numpy(np.float32(in_shift)) if in_shift is not None else torch.zeros(self.obs_dim) 35 | self.in_scale = torch.from_numpy(np.float32(in_scale)) if in_scale is not None else torch.ones(self.obs_dim) 36 | self.out_shift = torch.from_numpy(np.float32(out_shift)) if out_shift is not None else torch.zeros(self.act_dim) 37 | self.out_scale = torch.from_numpy(np.float32(out_scale)) if out_scale is not None else torch.ones(self.act_dim) 38 | 39 | def forward(self, x): 40 | # TODO(Aravind): Remove clamping to CPU 41 | # This is a temp change that should be fixed shortly 42 | if x.is_cuda: 43 | out = x.to('cpu') 44 | else: 45 | out = x 46 | out = (out - self.in_shift)/(self.in_scale + 1e-8) 47 | for i in range(len(self.fc_layers)-1): 48 | out = self.fc_layers[i](out) 49 | out = self.nonlinearity(out) 50 | out = self.fc_layers[-1](out) 51 | out = out * self.out_scale + self.out_shift 52 | return out 53 | 
-------------------------------------------------------------------------------- /mjrl/utils/get_environment.py: -------------------------------------------------------------------------------- 1 | """ 2 | convenience function to generate env 3 | useful if we want some procedural env generation 4 | """ 5 | 6 | import gym 7 | from mjrl.utils.gym_env import GymEnv 8 | 9 | def get_environment(env_name=None, **kwargs): 10 | if env_name is None: print("Need to specify environment name") 11 | e = GymEnv(env_name) 12 | # can make procedural modifications here if needed using kwargs 13 | return e 14 | -------------------------------------------------------------------------------- /mjrl/utils/gym_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper around a gym env that provides convenience functions 3 | """ 4 | 5 | import gym 6 | import numpy as np 7 | 8 | 9 | class EnvSpec(object): 10 | def __init__(self, obs_dim, act_dim, horizon): 11 | self.observation_dim = obs_dim 12 | self.action_dim = act_dim 13 | self.horizon = horizon 14 | 15 | 16 | class GymEnv(object): 17 | def __init__(self, env, env_kwargs=None, 18 | obs_mask=None, act_repeat=1, 19 | *args, **kwargs): 20 | 21 | # get the correct env behavior 22 | if type(env) == str: 23 | env = gym.make(env) 24 | elif isinstance(env, gym.Env): 25 | env = env 26 | elif callable(env): 27 | env = env(**env_kwargs) 28 | else: 29 | print("Unsupported environment format") 30 | raise AttributeError 31 | 32 | self.env = env 33 | self.env_id = env.spec.id 34 | self.act_repeat = act_repeat 35 | 36 | try: 37 | self._horizon = env.spec.max_episode_steps 38 | except AttributeError: 39 | self._horizon = env.spec._horizon 40 | 41 | assert self._horizon % act_repeat == 0 42 | self._horizon = self._horizon // self.act_repeat 43 | 44 | try: 45 | self._action_dim = self.env.env.action_dim 46 | except AttributeError: 47 | self._action_dim = self.env.action_space.shape[0] 48 | 49 | try: 50 | self._observation_dim = self.env.env.obs_dim 51 | except AttributeError: 52 | self._observation_dim = self.env.observation_space.shape[0] 53 | 54 | # Specs 55 | self.spec = EnvSpec(self._observation_dim, self._action_dim, self._horizon) 56 | 57 | # obs mask 58 | self.obs_mask = np.ones(self._observation_dim) if obs_mask is None else obs_mask 59 | 60 | @property 61 | def action_dim(self): 62 | return self._action_dim 63 | 64 | @property 65 | def observation_dim(self): 66 | return self._observation_dim 67 | 68 | @property 69 | def observation_space(self): 70 | return self.env.observation_space 71 | 72 | @property 73 | def action_space(self): 74 | return self.env.action_space 75 | 76 | @property 77 | def horizon(self): 78 | return self._horizon 79 | 80 | def reset(self, seed=None): 81 | try: 82 | self.env._elapsed_steps = 0 83 | return self.env.env.reset_model(seed=seed) 84 | except: 85 | if seed is not None: 86 | self.set_seed(seed) 87 | return self.env.reset() 88 | 89 | def reset_model(self, seed=None): 90 | # overloading for legacy code 91 | return self.reset(seed) 92 | 93 | def step(self, action): 94 | action = action.clip(self.action_space.low, self.action_space.high) 95 | if self.act_repeat == 1: 96 | obs, cum_reward, done, ifo = self.env.step(action) 97 | else: 98 | cum_reward = 0.0 99 | for i in range(self.act_repeat): 100 | obs, reward, done, ifo = self.env.step(action) 101 | cum_reward += reward 102 | if done: break 103 | return self.obs_mask * obs, cum_reward, done, ifo 104 | 105 | def render(self): 106 | try: 107 | 
self.env.env.mujoco_render_frames = True 108 | self.env.env.mj_render() 109 | except: 110 | self.env.render() 111 | 112 | def set_seed(self, seed=123): 113 | try: 114 | self.env.seed(seed) 115 | except AttributeError: 116 | self.env._seed(seed) 117 | 118 | def get_obs(self): 119 | try: 120 | return self.obs_mask * self.env.env.get_obs() 121 | except: 122 | return self.obs_mask * self.env.env._get_obs() 123 | 124 | def get_env_infos(self): 125 | try: 126 | return self.env.env.get_env_infos() 127 | except: 128 | return {} 129 | 130 | # =========================================== 131 | # Trajectory optimization related 132 | # Envs should support these functions in case of trajopt 133 | 134 | def get_env_state(self): 135 | try: 136 | return self.env.env.get_env_state() 137 | except: 138 | raise NotImplementedError 139 | 140 | def set_env_state(self, state_dict): 141 | try: 142 | self.env.env.set_env_state(state_dict) 143 | except: 144 | raise NotImplementedError 145 | 146 | def real_env_step(self, bool_val): 147 | try: 148 | self.env.env.real_step = bool_val 149 | except: 150 | raise NotImplementedError 151 | 152 | # =========================================== 153 | 154 | def visualize_policy(self, policy, horizon=1000, num_episodes=1, mode='exploration'): 155 | try: 156 | self.env.env.visualize_policy(policy, horizon, num_episodes, mode) 157 | except: 158 | for ep in range(num_episodes): 159 | o = self.reset() 160 | d = False 161 | t = 0 162 | score = 0.0 163 | while t < horizon and d is False: 164 | a = policy.get_action(o)[0] if mode == 'exploration' else policy.get_action(o)[1]['evaluation'] 165 | o, r, d, _ = self.step(a) 166 | score = score + r 167 | self.render() 168 | t = t+1 169 | print("Episode score = %f" % score) 170 | 171 | def evaluate_policy(self, policy, 172 | num_episodes=5, 173 | horizon=None, 174 | gamma=1, 175 | visual=False, 176 | percentile=[], 177 | get_full_dist=False, 178 | mean_action=False, 179 | init_env_state=None, 180 | terminate_at_done=True, 181 | seed=123): 182 | 183 | self.set_seed(seed) 184 | horizon = self._horizon if horizon is None else horizon 185 | mean_eval, std, min_eval, max_eval = 0.0, 0.0, -1e8, -1e8 186 | ep_returns = np.zeros(num_episodes) 187 | 188 | for ep in range(num_episodes): 189 | self.reset() 190 | if init_env_state is not None: 191 | self.set_env_state(init_env_state) 192 | t, done = 0, False 193 | while t < horizon and (done == False or terminate_at_done == False): 194 | self.render() if visual is True else None 195 | o = self.get_obs() 196 | a = policy.get_action(o)[1]['evaluation'] if mean_action is True else policy.get_action(o)[0] 197 | o, r, done, _ = self.step(a) 198 | ep_returns[ep] += (gamma ** t) * r 199 | t += 1 200 | 201 | mean_eval, std = np.mean(ep_returns), np.std(ep_returns) 202 | min_eval, max_eval = np.amin(ep_returns), np.amax(ep_returns) 203 | base_stats = [mean_eval, std, min_eval, max_eval] 204 | 205 | percentile_stats = [] 206 | for p in percentile: 207 | percentile_stats.append(np.percentile(ep_returns, p)) 208 | 209 | full_dist = ep_returns if get_full_dist is True else None 210 | 211 | return [base_stats, percentile_stats, full_dist] 212 | -------------------------------------------------------------------------------- /mjrl/utils/logger.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import scipy 6 | import pickle 7 | import os 8 | import csv 9 | 10 | class DataLog: 
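    # DataLog keeps one Python list per key (self.log[key]) and tracks the length of
    # the longest series (self.max_len). log_kv appends a value for the current
    # iteration, save_log writes both log.pickle and an 'iteration'-indexed log.csv,
    # get_current_log returns the most recent value of every key, and read_log /
    # shrink_to allow train_agent to resume from an existing log directory.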
11 | 12 | def __init__(self): 13 | self.log = {} 14 | self.max_len = 0 15 | 16 | def log_kv(self, key, value): 17 | # logs the (key, value) pair 18 | 19 | # TODO: This implementation is error-prone: 20 | # it would be NOT aligned if some keys are missing during one iteration. 21 | if key not in self.log: 22 | self.log[key] = [] 23 | self.log[key].append(value) 24 | if len(self.log[key]) > self.max_len: 25 | self.max_len = self.max_len + 1 26 | 27 | def save_log(self, save_path): 28 | # TODO: Validate all lengths are the same. 29 | pickle.dump(self.log, open(save_path + '/log.pickle', 'wb')) 30 | with open(save_path + '/log.csv', 'w') as csv_file: 31 | fieldnames = list(self.log.keys()) 32 | if 'iteration' not in fieldnames: 33 | fieldnames = ['iteration'] + fieldnames 34 | 35 | writer = csv.DictWriter(csv_file, fieldnames=fieldnames) 36 | writer.writeheader() 37 | for row in range(self.max_len): 38 | row_dict = {'iteration': row} 39 | for key in self.log.keys(): 40 | if row < len(self.log[key]): 41 | row_dict[key] = self.log[key][row] 42 | writer.writerow(row_dict) 43 | 44 | def get_current_log(self): 45 | row_dict = {} 46 | for key in self.log.keys(): 47 | # TODO: this is very error-prone (alignment is not guaranteed) 48 | row_dict[key] = self.log[key][-1] 49 | return row_dict 50 | 51 | def shrink_to(self, num_entries): 52 | for key in self.log.keys(): 53 | self.log[key] = self.log[key][:num_entries] 54 | 55 | self.max_len = num_entries 56 | assert min([len(series) for series in self.log.values()]) == \ 57 | max([len(series) for series in self.log.values()]) 58 | 59 | def read_log(self, log_path): 60 | assert log_path.endswith('log.csv') 61 | 62 | with open(log_path) as csv_file: 63 | reader = csv.DictReader(csv_file) 64 | listr = list(reader) 65 | keys = reader.fieldnames 66 | data = {} 67 | for key in keys: 68 | data[key] = [] 69 | for row, row_dict in enumerate(listr): 70 | for key in keys: 71 | try: 72 | data[key].append(eval(row_dict[key])) 73 | except: 74 | print("ERROR on reading key {}: {}".format(key, row_dict[key])) 75 | 76 | if 'iteration' in data and data['iteration'][-1] != row: 77 | raise RuntimeError("Iteration %d mismatch -- possibly corrupted logfile?" 
% row) 78 | 79 | self.log = data 80 | self.max_len = max(len(v) for k, v in self.log.items()) 81 | print("Log read from {}: had {} entries".format(log_path, self.max_len)) 82 | -------------------------------------------------------------------------------- /mjrl/utils/make_train_plots.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import scipy 6 | import csv 7 | from mjrl.utils.logger import DataLog 8 | import argparse 9 | 10 | def make_train_plots(log = None, 11 | log_path = None, 12 | keys = None, 13 | save_loc = None, 14 | sample_key = 'num_samples', 15 | x_scale = 1.0, 16 | y_scale = 1.0): 17 | if log is None and log_path is None: 18 | print("Need to provide either the log or path to a log file") 19 | if log is None: 20 | logger = DataLog() 21 | logger.read_log(log_path) 22 | log = logger.log 23 | # make plots for specified keys 24 | for key in keys: 25 | if key in log.keys(): 26 | fig = plt.figure(figsize=(10,6)) 27 | ax1 = fig.add_subplot(111) 28 | try: 29 | cum_samples = [np.sum(log[sample_key][:i]) * x_scale for i in range(len(log[sample_key]))] 30 | ax1.plot(cum_samples, [elem * y_scale for elem in log[key]]) 31 | ax1.set_xlabel('samples') 32 | # mark iteration on the top axis 33 | ax2 = ax1.twiny() 34 | ax2.set_xlabel('iterations', color=(.7,.7,.7)) 35 | ax2.tick_params(axis='x', labelcolor=(.7,.7,.7)) 36 | ax2.set_xlim([0, len(log[key])]) 37 | except: 38 | ax1.plot(log[key]) 39 | ax1.set_xlabel('iterations') 40 | ax1.set_title(key) 41 | plt.savefig(save_loc+'/'+key+'.png', dpi=100) 42 | plt.close() 43 | 44 | # MAIN ========================================================= 45 | # Example: python make_train_plots.py --log_path logs/log.csv --keys eval_score rollout_score save_loc logs 46 | def main(): 47 | # Parse arguments 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument( 50 | '-l', '--log_path', type=str, required=True, help='path file to log.csv') 51 | parser.add_argument( 52 | '-k', '--keys', type=str, action='append', nargs='+', required=True, help='keys to plot') 53 | parser.add_argument( 54 | '-s', '--save_loc', type=str, default='', help='Path for logs') 55 | args = parser.parse_args() 56 | 57 | make_train_plots(log_path=args.log_path, keys=args.keys[0], save_loc=args.save_loc) 58 | 59 | if __name__ == '__main__': 60 | main() 61 | 62 | -------------------------------------------------------------------------------- /mjrl/utils/optimize_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | def fit_data(model, x, y, optimizer, loss_func, batch_size, epochs): 8 | """ 9 | :param model: pytorch model of form y_hat = f(x) (class) 10 | :param x: inputs to the model (tensor) 11 | :param y: desired outputs or targets (tensor) 12 | :param optimizer: optimizer to be used (class) 13 | :param loss_func: loss criterion (callable) 14 | :param batch_size: mini-batch size for optimization (int) 15 | :param epochs: number of epochs (int) 16 | :return: 17 | """ 18 | 19 | num_samples = x.shape[0] 20 | epoch_losses = [] 21 | for ep in range(epochs): 22 | rand_idx = torch.LongTensor(np.random.permutation(num_samples)) 23 | ep_loss = 0.0 24 | num_steps = int(num_samples / batch_size) - 1 25 | for mb in range(num_steps): 26 | data_idx = rand_idx[mb*batch_size:(mb+1)*batch_size] 27 | batch_x = x[data_idx] 
28 | batch_y = y[data_idx] 29 | optimizer.zero_grad() 30 | yhat = model(batch_x) 31 | loss = loss_func(yhat, batch_y) 32 | loss.backward() 33 | optimizer.step() 34 | ep_loss += loss.detach() 35 | epoch_losses.append(ep_loss.to('cpu').data.numpy().ravel() / num_steps) 36 | return epoch_losses 37 | -------------------------------------------------------------------------------- /mjrl/utils/plot_from_logs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pickle 4 | import numpy as np 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import matplotlib.pyplot as plt 8 | colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] 9 | 10 | parser = argparse.ArgumentParser(description='Script to explore the data generated by an experiment.') 11 | parser.add_argument('--data', '-d', type=str, required=True, help='location of the .pickle log data file') 12 | parser.add_argument('--output', '-o', type=str, required=True, help='location to store results as a png') 13 | parser.add_argument('--xkey', '-x', type=str, default=None, help='the key to use for x axis in plots') 14 | parser.add_argument('--xscale', '-s', type=int, default=1, help='scaling for the x axis (optional)') 15 | args = parser.parse_args() 16 | 17 | # get inputs and setup output file 18 | if '.png' in args.output: 19 | OUT_FILE = args.output 20 | else: 21 | OUT_FILE = args.output + '/plot.png' 22 | data = pickle.load(open(args.data, 'rb')) 23 | xscale = 1 if args.xscale is None else args.xscale 24 | if args.xkey == 'num_samples': 25 | xscale = xscale if 'act_repeat' not in data.keys() else data['act_repeat'][-1] 26 | 27 | dict_keys = list(data.keys()) 28 | for k in dict_keys: 29 | if len(data[k]) == 1: del(data[k]) 30 | 31 | # plot layout 32 | nplt = len(data.keys()) 33 | ncol = 4 34 | nrow = int(np.ceil(nplt/ncol)) 35 | 36 | # plot data 37 | xkey = args.xkey 38 | start_idx = 2 39 | end_idx = max([len(data[k]) for k in data.keys()]) 40 | xdata = np.arange(end_idx) if (xkey is None or xkey == 'None') else \ 41 | [np.sum(data[xkey][:i+1]) * xscale for i in range(len(data[xkey]))] 42 | 43 | # make the plot 44 | plt.figure(figsize=(15,15), dpi=60) 45 | for idx, key in enumerate(data.keys()): 46 | plt.subplot(nrow, ncol, idx+1) 47 | plt.tight_layout() 48 | try: 49 | last_idx = min(end_idx, len(data[key])) 50 | plt.plot(xdata[start_idx:last_idx], data[key][start_idx:last_idx], color=colors[idx%7], linewidth=3) 51 | except: 52 | pass 53 | plt.title(key) 54 | 55 | plt.savefig(OUT_FILE, dpi=100, bbox_inches="tight") 56 | -------------------------------------------------------------------------------- /mjrl/utils/process_samples.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def compute_returns(paths, gamma): 4 | for path in paths: 5 | path["returns"] = discount_sum(path["rewards"], gamma) 6 | 7 | def compute_advantages(paths, baseline, gamma, gae_lambda=None, normalize=False): 8 | # compute and store returns, advantages, and baseline 9 | # standard mode 10 | if gae_lambda == None or gae_lambda < 0.0 or gae_lambda > 1.0: 11 | for path in paths: 12 | path["baseline"] = baseline.predict(path) 13 | path["advantages"] = path["returns"] - path["baseline"] 14 | if normalize: 15 | alladv = np.concatenate([path["advantages"] for path in paths]) 16 | mean_adv = alladv.mean() 17 | std_adv = alladv.std() 18 | for path in paths: 19 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8) 20 | # GAE mode 21 | 
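    # Generalized Advantage Estimation: b1 is the baseline sequence padded with a
    # bootstrap value (zero if the path terminated, else the last prediction), so
    #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    #   A_t     = sum_l (gamma * gae_lambda)^l * delta_{t+l}
    # which is the discount_sum over td_deltas computed below.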
else: 22 | for path in paths: 23 | b = path["baseline"] = baseline.predict(path) 24 | if b.ndim == 1: 25 | b1 = np.append(path["baseline"], 0.0 if path["terminated"] else b[-1]) 26 | else: 27 | b1 = np.vstack((b, np.zeros(b.shape[1]) if path["terminated"] else b[-1])) 28 | td_deltas = path["rewards"] + gamma*b1[1:] - b1[:-1] 29 | path["advantages"] = discount_sum(td_deltas, gamma*gae_lambda) 30 | if normalize: 31 | alladv = np.concatenate([path["advantages"] for path in paths]) 32 | mean_adv = alladv.mean() 33 | std_adv = alladv.std() 34 | for path in paths: 35 | path["advantages"] = (path["advantages"]-mean_adv)/(std_adv+1e-8) 36 | 37 | def discount_sum(x, gamma, terminal=0.0): 38 | y = [] 39 | run_sum = terminal 40 | for t in range( len(x)-1, -1, -1): 41 | run_sum = x[t] + gamma*run_sum 42 | y.append(run_sum) 43 | 44 | return np.array(y[::-1]) -------------------------------------------------------------------------------- /mjrl/utils/tensor_utils.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | import numpy as np 4 | 5 | 6 | def flatten_tensors(tensors): 7 | if len(tensors) > 0: 8 | return np.concatenate([np.reshape(x, [-1]) for x in tensors]) 9 | else: 10 | return np.asarray([]) 11 | 12 | 13 | def unflatten_tensors(flattened, tensor_shapes): 14 | tensor_sizes = list(map(np.prod, tensor_shapes)) 15 | indices = np.cumsum(tensor_sizes)[:-1] 16 | return [np.reshape(pair[0], pair[1]) for pair in zip(np.split(flattened, indices), tensor_shapes)] 17 | 18 | 19 | def pad_tensor(x, max_len, mode='zero'): 20 | padding = np.zeros_like(x[0]) 21 | if mode == 'last': 22 | padding = x[-1] 23 | return np.concatenate([ 24 | x, 25 | np.tile(padding, (max_len - len(x),) + (1,) * np.ndim(x[0])) 26 | ]) 27 | 28 | 29 | def pad_tensor_n(xs, max_len): 30 | ret = np.zeros((len(xs), max_len) + xs[0].shape[1:], dtype=xs[0].dtype) 31 | for idx, x in enumerate(xs): 32 | ret[idx][:len(x)] = x 33 | return ret 34 | 35 | 36 | def pad_tensor_dict(tensor_dict, max_len, mode='zero'): 37 | keys = list(tensor_dict.keys()) 38 | ret = dict() 39 | for k in keys: 40 | if isinstance(tensor_dict[k], dict): 41 | ret[k] = pad_tensor_dict(tensor_dict[k], max_len, mode=mode) 42 | else: 43 | ret[k] = pad_tensor(tensor_dict[k], max_len, mode=mode) 44 | return ret 45 | 46 | 47 | def flatten_first_axis_tensor_dict(tensor_dict): 48 | keys = list(tensor_dict.keys()) 49 | ret = dict() 50 | for k in keys: 51 | if isinstance(tensor_dict[k], dict): 52 | ret[k] = flatten_first_axis_tensor_dict(tensor_dict[k]) 53 | else: 54 | old_shape = tensor_dict[k].shape 55 | ret[k] = tensor_dict[k].reshape((-1,) + old_shape[2:]) 56 | return ret 57 | 58 | 59 | def high_res_normalize(probs): 60 | return [x / sum(map(float, probs)) for x in list(map(float, probs))] 61 | 62 | 63 | def stack_tensor_list(tensor_list): 64 | return np.array(tensor_list) 65 | # tensor_shape = np.array(tensor_list[0]).shape 66 | # if tensor_shape is tuple(): 67 | # return np.array(tensor_list) 68 | # return np.vstack(tensor_list) 69 | 70 | 71 | def stack_tensor_dict_list(tensor_dict_list): 72 | """ 73 | Stack a list of dictionaries of {tensors or dictionary of tensors}. 74 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}. 
75 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors} 76 | """ 77 | keys = list(tensor_dict_list[0].keys()) 78 | ret = dict() 79 | for k in keys: 80 | example = tensor_dict_list[0][k] 81 | if isinstance(example, dict): 82 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list]) 83 | else: 84 | v = stack_tensor_list([x[k] for x in tensor_dict_list]) 85 | ret[k] = v 86 | return ret 87 | 88 | 89 | def concat_tensor_list_subsample(tensor_list, f): 90 | return np.concatenate( 91 | [t[np.random.choice(len(t), int(np.ceil(len(t) * f)), replace=False)] for t in tensor_list], axis=0) 92 | 93 | 94 | def concat_tensor_dict_list_subsample(tensor_dict_list, f): 95 | keys = list(tensor_dict_list[0].keys()) 96 | ret = dict() 97 | for k in keys: 98 | example = tensor_dict_list[0][k] 99 | if isinstance(example, dict): 100 | v = concat_tensor_dict_list_subsample([x[k] for x in tensor_dict_list], f) 101 | else: 102 | v = concat_tensor_list_subsample([x[k] for x in tensor_dict_list], f) 103 | ret[k] = v 104 | return ret 105 | 106 | 107 | def concat_tensor_list(tensor_list): 108 | return np.concatenate(tensor_list, axis=0) 109 | 110 | 111 | def concat_tensor_dict_list(tensor_dict_list): 112 | keys = list(tensor_dict_list[0].keys()) 113 | ret = dict() 114 | for k in keys: 115 | example = tensor_dict_list[0][k] 116 | if isinstance(example, dict): 117 | v = concat_tensor_dict_list([x[k] for x in tensor_dict_list]) 118 | else: 119 | v = concat_tensor_list([x[k] for x in tensor_dict_list]) 120 | ret[k] = v 121 | return ret 122 | 123 | 124 | def split_tensor_dict_list(tensor_dict): 125 | keys = list(tensor_dict.keys()) 126 | ret = None 127 | for k in keys: 128 | vals = tensor_dict[k] 129 | if isinstance(vals, dict): 130 | vals = split_tensor_dict_list(vals) 131 | if ret is None: 132 | ret = [{k: v} for v in vals] 133 | else: 134 | for v, cur_dict in zip(vals, ret): 135 | cur_dict[k] = v 136 | return ret 137 | 138 | 139 | def truncate_tensor_list(tensor_list, truncated_len): 140 | return tensor_list[:truncated_len] 141 | 142 | 143 | def truncate_tensor_dict(tensor_dict, truncated_len): 144 | ret = dict() 145 | for k, v in tensor_dict.items(): 146 | if isinstance(v, dict): 147 | ret[k] = truncate_tensor_dict(v, truncated_len) 148 | else: 149 | ret[k] = truncate_tensor_list(v, truncated_len) 150 | return ret 151 | -------------------------------------------------------------------------------- /mjrl/utils/train_agent.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.disable(logging.CRITICAL) 3 | 4 | from tabulate import tabulate 5 | from mjrl.utils.make_train_plots import make_train_plots 6 | from mjrl.utils.gym_env import GymEnv 7 | from mjrl.samplers.core import sample_paths 8 | import numpy as np 9 | import pickle 10 | import time as timer 11 | import os 12 | import copy 13 | 14 | 15 | def _load_latest_policy_and_logs(agent, *, policy_dir, logs_dir): 16 | """Loads the latest policy. 17 | Returns the next step number to begin with. 
18 | """ 19 | assert os.path.isdir(policy_dir), str(policy_dir) 20 | assert os.path.isdir(logs_dir), str(logs_dir) 21 | 22 | log_csv_path = os.path.join(logs_dir, 'log.csv') 23 | if not os.path.exists(log_csv_path): 24 | return 0 # fresh start 25 | 26 | print("Reading: {}".format(log_csv_path)) 27 | agent.logger.read_log(log_csv_path) 28 | last_step = agent.logger.max_len - 1 29 | if last_step <= 0: 30 | return 0 # fresh start 31 | 32 | 33 | # find latest policy/baseline 34 | i = last_step 35 | while i >= 0: 36 | policy_path = os.path.join(policy_dir, 'policy_{}.pickle'.format(i)) 37 | baseline_path = os.path.join(policy_dir, 'baseline_{}.pickle'.format(i)) 38 | 39 | if not os.path.isfile(policy_path): 40 | i = i -1 41 | continue 42 | else: 43 | print("Loaded last saved iteration: {}".format(i)) 44 | 45 | with open(policy_path, 'rb') as fp: 46 | agent.policy = pickle.load(fp) 47 | with open(baseline_path, 'rb') as fp: 48 | agent.baseline = pickle.load(fp) 49 | 50 | # additional 51 | # global_status_path = os.path.join(policy_dir, 'global_status.pickle') 52 | # with open(global_status_path, 'rb') as fp: 53 | # agent.load_global_status( pickle.load(fp) ) 54 | 55 | agent.logger.shrink_to(i + 1) 56 | assert agent.logger.max_len == i + 1 57 | return agent.logger.max_len 58 | 59 | # cannot find any saved policy 60 | raise RuntimeError("Log file exists, but cannot find any saved policy.") 61 | 62 | def train_agent(job_name, agent, 63 | seed = 0, 64 | niter = 101, 65 | gamma = 0.995, 66 | gae_lambda = None, 67 | num_cpu = 1, 68 | sample_mode = 'trajectories', 69 | num_traj = 50, 70 | num_samples = 50000, # has precedence, used with sample_mode = 'samples' 71 | save_freq = 10, 72 | evaluation_rollouts = None, 73 | plot_keys = ['stoc_pol_mean'], 74 | ): 75 | 76 | np.random.seed(seed) 77 | if os.path.isdir(job_name) == False: 78 | os.mkdir(job_name) 79 | previous_dir = os.getcwd() 80 | os.chdir(job_name) # important! we are now in the directory to save data 81 | if os.path.isdir('iterations') == False: os.mkdir('iterations') 82 | if os.path.isdir('logs') == False and agent.save_logs == True: os.mkdir('logs') 83 | best_policy = copy.deepcopy(agent.policy) 84 | best_perf = -1e8 85 | train_curve = best_perf*np.ones(niter) 86 | mean_pol_perf = 0.0 87 | e = GymEnv(agent.env.env_id) 88 | 89 | # Load from any existing checkpoint, policy, statistics, etc. 90 | # Why no checkpointing.. 
:( 91 | i_start = _load_latest_policy_and_logs(agent, 92 | policy_dir='iterations', 93 | logs_dir='logs') 94 | if i_start: 95 | print("Resuming from an existing job folder ...") 96 | 97 | for i in range(i_start, niter): 98 | print("......................................................................................") 99 | print("ITERATION : %i " % i) 100 | 101 | if train_curve[i-1] > best_perf: 102 | best_policy = copy.deepcopy(agent.policy) 103 | best_perf = train_curve[i-1] 104 | 105 | N = num_traj if sample_mode == 'trajectories' else num_samples 106 | args = dict(N=N, sample_mode=sample_mode, gamma=gamma, gae_lambda=gae_lambda, num_cpu=num_cpu) 107 | stats = agent.train_step(**args) 108 | train_curve[i] = stats[0] 109 | 110 | if evaluation_rollouts is not None and evaluation_rollouts > 0: 111 | print("Performing evaluation rollouts ........") 112 | eval_paths = sample_paths(num_traj=evaluation_rollouts, policy=agent.policy, num_cpu=num_cpu, 113 | env=e.env_id, eval_mode=True, base_seed=seed) 114 | mean_pol_perf = np.mean([np.sum(path['rewards']) for path in eval_paths]) 115 | if agent.save_logs: 116 | agent.logger.log_kv('eval_score', mean_pol_perf) 117 | try: 118 | eval_success = e.env.env.evaluate_success(eval_paths) 119 | agent.logger.log_kv('eval_success', eval_success) 120 | except: 121 | pass 122 | 123 | if i % save_freq == 0 and i > 0: 124 | if agent.save_logs: 125 | agent.logger.save_log('logs/') 126 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/') 127 | policy_file = 'policy_%i.pickle' % i 128 | baseline_file = 'baseline_%i.pickle' % i 129 | pickle.dump(agent.policy, open('iterations/' + policy_file, 'wb')) 130 | pickle.dump(agent.baseline, open('iterations/' + baseline_file, 'wb')) 131 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb')) 132 | # pickle.dump(agent.global_status, open('iterations/global_status.pickle', 'wb')) 133 | 134 | # print results to console 135 | if i == 0: 136 | result_file = open('results.txt', 'w') 137 | print("Iter | Stoc Pol | Mean Pol | Best (Stoc) \n") 138 | result_file.write("Iter | Sampling Pol | Evaluation Pol | Best (Sampled) \n") 139 | result_file.close() 140 | print("[ %s ] %4i %5.2f %5.2f %5.2f " % (timer.asctime(timer.localtime(timer.time())), 141 | i, train_curve[i], mean_pol_perf, best_perf)) 142 | result_file = open('results.txt', 'a') 143 | result_file.write("%4i %5.2f %5.2f %5.2f \n" % (i, train_curve[i], mean_pol_perf, best_perf)) 144 | result_file.close() 145 | if agent.save_logs: 146 | print_data = sorted(filter(lambda v: np.asarray(v[1]).size == 1, 147 | agent.logger.get_current_log().items())) 148 | print(tabulate(print_data)) 149 | 150 | # final save 151 | pickle.dump(best_policy, open('iterations/best_policy.pickle', 'wb')) 152 | if agent.save_logs: 153 | agent.logger.save_log('logs/') 154 | make_train_plots(log=agent.logger.log, keys=plot_keys, save_loc='logs/') 155 | os.chdir(previous_dir) 156 | -------------------------------------------------------------------------------- /mjrl/utils/visualize_policy.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import mjrl.envs 3 | import click 4 | import os 5 | import gym 6 | import numpy as np 7 | import pickle 8 | from mjrl.utils.gym_env import GymEnv 9 | from mjrl.policies.gaussian_mlp import MLP 10 | import trajopt.envs 11 | 12 | DESC = ''' 13 | Helper script to visualize policy (in mjrl format).\n 14 | USAGE:\n 15 | Visualizes policy on the env\n 16 | $ python 
utils/visualize_policy --env_name mjrl_swimmer-v0 --policy my_policy.pickle --mode evaluation --episodes 10 \n 17 | ''' 18 | 19 | # MAIN ========================================================= 20 | @click.command(help=DESC) 21 | @click.option('--env_name', type=str, help='environment to load', required= True) 22 | @click.option('--policy', type=str, help='absolute path of the policy file', default=None) 23 | @click.option('--mode', type=str, help='exploration or evaluation mode for policy', default='evaluation') 24 | @click.option('--seed', type=int, help='seed for generating environment instances', default=123) 25 | @click.option('--episodes', type=int, help='number of episodes to visualize', default=10) 26 | 27 | def main(env_name, policy, mode, seed, episodes): 28 | e = GymEnv(env_name) 29 | e.set_seed(seed) 30 | if policy is not None: 31 | pi = pickle.load(open(policy, 'rb')) 32 | else: 33 | pi = MLP(e.spec, hidden_sizes=(32,32), seed=seed, init_log_std=-1.0) 34 | # render policy 35 | e.visualize_policy(pi, num_episodes=episodes, horizon=e.horizon, mode=mode) 36 | 37 | if __name__ == '__main__': 38 | main() 39 | 40 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from setuptools import setup, find_packages 4 | 5 | print("Installing mjrl. \n Package intended for use with provided conda env. See setup instructions here: https://github.com/aravindr93/mjrl/tree/master/setup") 6 | 7 | if sys.version_info.major != 3: 8 | print("This Python is only compatible with Python 3, but you are running " 9 | "Python {}. The installation will likely fail.".format(sys.version_info.major)) 10 | 11 | def read(fname): 12 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 13 | 14 | setup( 15 | name='mjrl', 16 | version='1.0.0', 17 | packages=find_packages(), 18 | description='RL algorithms for environments in MuJoCo', 19 | long_description=read('README.md'), 20 | url='https://github.com/aravindr93/mjrl.git', 21 | author='Aravind Rajeswaran', 22 | ) 23 | -------------------------------------------------------------------------------- /setup/README.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | A short guide to install this package is below. The package relies on `mujoco-py` which might be the trickiest part of the installation. See `known issues` below and also instructions from the mujoco-py [page](https://github.com/openai/mujoco-py) if you are stuck with mujoco-py installation. 4 | 5 | The package can handle both `MuJoCo v1.5` as well as `MuJoCo v2.0`, but the former is not supported for future updates. We encourage you to use v2.0. 6 | 7 | ## Linux 8 | 9 | - Download MuJoCo v2.0 binaries from the official [website](http://www.mujoco.org/) and also obtain the license key. 10 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200`, and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`. Note that unzip of the MuJoCo binaries will generate `mujoco200_linux`. You need to rename the directory and place it at `~/.mujoco/mujoco200`. 
11 | - Install osmesa-related dependencies: 12 | ``` 13 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev build-essential libglfw3 14 | ``` 15 | - Update `bashrc` by adding the following lines and source it 16 | ``` 17 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH" 18 | export MUJOCO_PY_FORCE_CPU=True 19 | alias MJPL='LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libGLEW.so:/usr/lib/nvidia-384/libGL.so' 20 | ``` 21 | - Install this package using 22 | ``` 23 | $ conda update conda 24 | $ cd path/to/mjrl 25 | $ conda env create -f setup/env.yml 26 | $ source activate mjrl-env 27 | $ pip install -e . 28 | ``` 29 | - *NOTE 1:* If there are issues with install of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly based on the specific version of CUDA (or CPU-only) you have. 30 | 31 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info. 32 | 33 | ## Mac OS 34 | 35 | - Download MuJoCo binaries from the official [website](http://www.mujoco.org/) and also obtain the license key. 36 | - Unzip the downloaded `mujoco200` directory into `~/.mujoco/mujoco200` (rename unzipped directory to this), and place your license key (mjkey.txt) at `~/.mujoco/mjkey.txt`. 37 | - Update `bashrc` by adding the following lines and source it 38 | ``` 39 | export LD_LIBRARY_PATH="$HOME/.mujoco/mujoco200/bin:$LD_LIBRARY_PATH" 40 | ``` 41 | - Install this package using 42 | ``` 43 | $ conda update conda 44 | $ cd path/to/mjrl 45 | $ conda env create -f setup/env.yml 46 | $ source activate mjrl-env 47 | $ pip install -e . 48 | ``` 49 | 50 | - *NOTE 1:* If there are issues with install of pytorch, please follow instructions from the [pytorch website](https://pytorch.org/) to install it properly. 51 | 52 | - *NOTE 2:* If you encounter a patchelf error in mujoco_py install, you can fix this with the following command when inside the anaconda env: `conda install -c anaconda patchelf`. See this [page](https://github.com/openai/mujoco-py/issues/147) for additional info. 53 | 54 | 55 | ## Known Issues 56 | 57 | - Visualization in Linux: If the Linux system has a GPU, then mujoco-py does not automatically preload the correct drivers. We added an alias `MJPL` in bashrc (see instructions) which stands for MuJoCo pre-load. When running any Python script that requires rendering, prepend the execution with MJPL. 58 | ``` 59 | $ MJPL python script.py 60 | ``` 61 | 62 | - Errors related to osmesa during installation. This is a `mujoco-py` build error and would likely go away if the following command is used before creating the conda environment. If the problem still persists, please contact the developers of mujoco-py. 63 | ``` 64 | $ sudo apt-get install libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev 65 | ``` 66 | 67 | - If conda environment creation gets interrupted for some reason, you can resume it with the following: 68 | ``` 69 | $ conda env update -n mjrl-env -f setup/env.yml 70 | ``` 71 | 72 | - GCC error in Mac OS: If you get a GCC error from mujoco-py, you can get the correct version mujoco-py expects with `brew install gcc --without-multilib`. This may require uninstalling other versions of GCC that may have been previously installed with `brew remove gcc@6` for example. You can see which brew packages were already installed with `brew list`. 
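Once the steps above are done, a quick way to confirm that `mujoco-py`, `gym`, and `mjrl` work together is a short smoke test like the one below (a suggested check, not a script shipped with the package; run it inside the `mjrl-env` environment):
```
import mjrl.envs                      # registers the bundled mjrl_* environments
from mjrl.utils.gym_env import GymEnv

e = GymEnv('mjrl_swimmer-v0')
e.reset()
obs, reward, done, info = e.step(e.action_space.sample())
print("obs dim:", e.observation_dim, "| act dim:", e.action_dim, "| reward:", reward)
```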
73 | 
74 | 
--------------------------------------------------------------------------------
/setup/env.yml:
--------------------------------------------------------------------------------
1 | name: mjrl-env
2 | channels:
3 | - pytorch
4 | - defaults
5 | dependencies:
6 | - python=3.7
7 | - pip
8 | - ipython
9 | - mkl-service
10 | - pytorch==1.4
11 | - tabulate
12 | - termcolor
13 | - torchvision
14 | - patchelf
15 | - pip:
16 |   - click
17 |   - cloudpickle
18 |   - gym==0.13
19 |   - ipdb
20 |   - matplotlib
21 |   - mujoco-py<2.1,>=2.0
22 |   - pip
23 |   - pyyaml
24 |   - tqdm
25 |   - wheel
26 |   - scipy
27 |   - transforms3d
28 | 
--------------------------------------------------------------------------------
/tests/hydra/config/hydra_npg_config.yaml:
--------------------------------------------------------------------------------
1 | # general outputs
2 | job_name : 'hydra_npg_test'
3 | 
4 | # general inputs
5 | env : Hopper-v3
6 | algorithm : NPG
7 | seed : 123
8 | sample_mode : samples
9 | rl_num_samples : 1000
10 | rl_num_traj : 0
11 | rl_num_iter : 2
12 | num_cpu : 4
13 | save_freq : 5
14 | eval_rollouts : 0
15 | exp_notes : 'Example config for training policy with NPG on the OpenAI gym Hopper-v3 task.'
16 | 
17 | # RL parameters (all params related to PG, value function etc.)
18 | policy_size : (32, 32)
19 | init_log_std : -0.5
20 | vf_hidden_size : (128, 128)
21 | vf_batch_size : 64
22 | vf_epochs : 2
23 | vf_learn_rate : 1e-3
24 | rl_step_size : 0.05
25 | rl_gamma : 0.995
26 | rl_gae : 0.97
27 | 
28 | # Algorithm hyperparameters : if alg requires additional params, can be specified here (or defaults will be used)
29 | 
30 | alg_hyper_params : {}
31 | 
32 | hydra:
33 |   launcher:
34 |     cpus_per_task: 12
35 |     gpus_per_node: 0
36 |     tasks_per_node: 1
37 |   run:
38 |     dir: ./outputs/${hydra.job.name}/${now:%Y-%m-%d_%H-%M-%S}
39 |   sweep:
40 |     dir: /checkpoint/${env:USER}/outputs/${job_name}/${now:%Y-%m-%d}_${now:%H-%M-%S}
41 |     subdir: ${hydra.job.num}_${hydra.job.override_dirname}
--------------------------------------------------------------------------------
/tests/hydra/hydra_policy_opt_job_script.py:
--------------------------------------------------------------------------------
1 | """
2 | This is a job script for running policy gradient algorithms on gym tasks.
3 | Separate job scripts are provided to run a few other algorithms:
4 | - For DAPG see here: https://github.com/aravindr93/hand_dapg/tree/master/dapg/examples
5 | - For model-based NPG see here: https://github.com/aravindr93/mjrl/tree/master/mjrl/algos/model_accel
6 | """
7 | 
8 | from mjrl.utils.gym_env import GymEnv
9 | from mjrl.policies.gaussian_mlp import MLP
10 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
11 | from mjrl.baselines.mlp_baseline import MLPBaseline
12 | from mjrl.algos.npg_cg import NPG
13 | from mjrl.algos.batch_reinforce import BatchREINFORCE
14 | from mjrl.algos.ppo_clip import PPO
15 | from mjrl.utils.train_agent import train_agent
16 | import os
17 | import json
18 | import gym
19 | import mjrl.envs
20 | # import mj_envs
21 | import time as timer
22 | import pickle
23 | import hydra
24 | from omegaconf import DictConfig, OmegaConf
25 | 
26 | # ===============================================================================
27 | # Process Inputs
28 | # ===============================================================================
29 | def preprocess(job_data):
30 |     if not os.path.exists(job_data.job_name):
31 |         os.mkdir(job_data.job_name)
32 |     assert 'algorithm' in job_data.keys()
33 |     assert any([job_data.algorithm == a for a in ['NPG', 'NVPG', 'VPG', 'PPO']])
34 |     assert 'sample_mode' in job_data.keys()
35 |     job_data.alg_hyper_params = dict() if 'alg_hyper_params' not in job_data.keys() else job_data.alg_hyper_params
36 | 
37 |     EXP_FILE = job_data.job_name + '/job_config.json'
38 |     with open(EXP_FILE, 'w') as fp:
39 |         # json.dump(job_data, fp, indent=4)
40 |         OmegaConf.save(config=job_data, f=fp.name)
41 | 
42 |     if job_data.sample_mode == 'trajectories':
43 |         assert 'rl_num_traj' in job_data.keys()
44 |         job_data.rl_num_samples = 0  # will be ignored
45 |     elif job_data.sample_mode == 'samples':
46 |         assert 'rl_num_samples' in job_data.keys()
47 |         job_data.rl_num_traj = 0  # will be ignored
48 |     else:
49 |         print("Unknown sampling mode. Choose either trajectories or samples")
50 |         exit()
51 | 
52 | # ===============================================================================
53 | # Train Loop
54 | # ===============================================================================
55 | @hydra.main(config_name="hydra_npg_config", config_path="config")
56 | def train_loop(job_data: DictConfig) -> None:
57 |     print("========================================")
58 |     print("Job Configuration")
59 |     print("========================================")
60 |     preprocess(job_data)
61 |     print(OmegaConf.to_yaml(job_data))
62 | 
63 |     e = GymEnv(job_data.env)
64 |     policy_size = tuple(eval(job_data.policy_size))
65 |     vf_hidden_size = tuple(eval(job_data.vf_hidden_size))
66 | 
67 |     policy = MLP(e.spec, hidden_sizes=policy_size, seed=job_data.seed, init_log_std=job_data.init_log_std)
68 |     baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=job_data.vf_batch_size, hidden_sizes=vf_hidden_size,
69 |                            epochs=job_data.vf_epochs, learn_rate=job_data.vf_learn_rate)
70 | 
71 |     # Construct the algorithm
72 |     if job_data.algorithm == 'NPG':
73 |         # Other hyperparameters (like the number of CG steps) can be specified in the config for pass-through,
74 |         # or default hyperparameters will be used
75 |         agent = NPG(e, policy, baseline, normalized_step_size=job_data.rl_step_size,
76 |                     seed=job_data.seed, save_logs=True, **job_data.alg_hyper_params)
77 | 
78 |     elif job_data.algorithm == 'VPG':
79 |         agent = BatchREINFORCE(e, policy, baseline, learn_rate=job_data.rl_step_size,
80 |                                seed=job_data.seed, save_logs=True, **job_data.alg_hyper_params)
81 | 
82 |     elif job_data.algorithm == 'NVPG':
83 |         agent = BatchREINFORCE(e, policy, baseline, desired_kl=job_data.rl_step_size,
84 |                                seed=job_data.seed, save_logs=True, **job_data.alg_hyper_params)
85 | 
86 |     elif job_data.algorithm == 'PPO':
87 |         # There are many hyperparameters for PPO. They can be specified in the config for pass-through,
88 |         # or defaults in the PPO algorithm will be used
89 |         agent = PPO(e, policy, baseline, save_logs=True, **job_data.alg_hyper_params)
90 |     else:
91 |         raise NotImplementedError("Algorithm not found")
92 | 
93 |     print("========================================")
94 |     print("Starting policy learning")
95 |     print("========================================")
96 | 
97 |     ts = timer.time()
98 |     train_agent(job_name=job_data.job_name,
99 |                 agent=agent,
100 |                 seed=job_data.seed,
101 |                 niter=job_data.rl_num_iter,
102 |                 gamma=job_data.rl_gamma,
103 |                 gae_lambda=job_data.rl_gae,
104 |                 num_cpu=job_data.num_cpu,
105 |                 sample_mode=job_data.sample_mode,
106 |                 num_traj=job_data.rl_num_traj,
107 |                 num_samples=job_data.rl_num_samples,
108 |                 save_freq=job_data.save_freq,
109 |                 evaluation_rollouts=job_data.eval_rollouts)
110 |     print("========================================")
111 |     print("Job Finished. Time taken = %f" % (timer.time()-ts))
112 |     print("========================================")
113 | 
114 | if __name__ == "__main__":
115 |     train_loop()
--------------------------------------------------------------------------------
/tests/point_mass_test.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
4 | from mjrl.baselines.mlp_baseline import MLPBaseline
5 | from mjrl.algos.npg_cg import NPG
6 | from mjrl.utils.train_agent import train_agent
7 | import mjrl.envs
8 | import time as timer
9 | SEED = 500
10 | 
11 | e = GymEnv('mjrl_point_mass-v0')
12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
13 | baseline = MLPBaseline(e.spec, reg_coef=1e-3, batch_size=64, epochs=10, learn_rate=1e-3)
14 | agent = NPG(e, policy, baseline, normalized_step_size=0.05, seed=SEED, save_logs=True)
15 | 
16 | ts = timer.time()
17 | train_agent(job_name='point_mass_exp1',
18 |             agent=agent,
19 |             seed=SEED,
20 |             niter=50,
21 |             gamma=0.95,
22 |             gae_lambda=0.97,
23 |             num_cpu=1,
24 |             sample_mode='trajectories',
25 |             num_traj=40,  # samples = 40*25 = 1000
26 |             save_freq=5,
27 |             evaluation_rollouts=None,
28 |             plot_keys=['stoc_pol_mean', 'running_score'])
29 | print("time taken = %f" % (timer.time()-ts))
30 | 
--------------------------------------------------------------------------------
/tests/visualizer_test.py:
--------------------------------------------------------------------------------
1 | from mjrl.utils.gym_env import GymEnv
2 | from mjrl.policies.gaussian_mlp import MLP
3 | from mjrl.baselines.quadratic_baseline import QuadraticBaseline
4 | from mjrl.baselines.mlp_baseline import MLPBaseline
5 | from mjrl.algos.npg_cg import NPG
6 | from mjrl.utils.train_agent import train_agent
7 | import mjrl.envs
8 | import time as timer
9 | SEED = 500
10 | 
11 | e = GymEnv('mjrl_point_mass-v0')
12 | policy = MLP(e.spec, hidden_sizes=(32,32), seed=SEED)
13 | baseline = QuadraticBaseline(e.spec)
14 | agent = NPG(e, policy, baseline, normalized_step_size=0.5, seed=SEED, save_logs=True)
15 | 
16 | ts = timer.time()
17 | train_agent(job_name='vis_exp',
18 |             agent=agent,
19 |             seed=SEED,
20 |             niter=10,
21 |             gamma=0.95,
22 |             gae_lambda=0.97,
23 |             num_cpu=1,
24 |             sample_mode='trajectories',
25 |             num_traj=100,
26 |             save_freq=5,
27 |             evaluation_rollouts=None)
28 | print("time taken = %f" % (timer.time()-ts))
29 | e.visualize_policy(policy, num_episodes=5, horizon=e.horizon, mode='exploration')
30 | 
--------------------------------------------------------------------------------
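Usage note: the test scripts above double as minimal end-to-end examples. The commands below are a sketch of how they might be launched, not documented invocations; the working directories and the Hydra override syntax are assumptions based on the relative `config_path="config"` setting in the job script and standard Hydra behavior.
```
# plain NPG training on the point-mass task (run from the tests/ directory)
$ python point_mass_test.py
$ MJPL python visualizer_test.py   # trains briefly, then renders the policy (MJPL needed on Linux + GPU)

# Hydra-configured job: defaults come from config/hydra_npg_config.yaml,
# individual keys can be overridden on the command line (run from tests/hydra/)
$ python hydra_policy_opt_job_script.py env=Hopper-v3 seed=456 rl_num_iter=10
```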