├── algo ├── __init__.py ├── passthrough.py ├── a2c_acktr.py ├── ppo.py └── dqn.py ├── options ├── visualization │ ├── eval_default.yaml │ ├── supervised_default.yaml │ ├── simple_reward.yaml │ ├── reward_forward.yaml │ ├── goal_distance.yaml │ ├── interp_reward_simple.yaml │ ├── interp_reward_surviveinterp.yaml │ ├── arbitrary_theta_vis.yaml │ ├── default.yaml │ ├── reward.yaml │ ├── interp_reward.yaml │ ├── plot_state_action_cycles.yaml │ ├── state_reward.yaml │ ├── state_xy_reward.yaml │ ├── all_episode.yaml │ ├── state_reward_alg.yaml │ └── mujoco_verbose.yaml ├── baseline_lowlevel │ ├── baseline_simplemlp_pretrain_any.yaml │ └── baseline_simplemlp_skip_hs16_pretrain_any_20M.yaml ├── phase_lowlevel │ ├── phase_mlp_pretrain_any.yaml │ └── phase_mlp_skip_hs16_pretrain_any_20M.yaml ├── hierarchical_final │ ├── hierarchical_many_phase.yaml │ ├── hierarchical_many_4M_phase.yaml │ ├── hierarchical_many_phase_antlowgear.yaml │ ├── hierarchical_many_phase_a2c.yaml │ ├── hierarchical_many_phase_16ll_proprioceptivehumanoid.yaml │ ├── hierarchical_many_baseline.yaml │ ├── hierarchical_many_4M_baseline.yaml │ ├── hierarchical_many_baseline_antlowgear.yaml │ └── hierarchical_many_phase_dqn.yaml ├── maze_baseline │ ├── maze_baseline.yaml │ ├── maze_baseline_wmove_r1000.yaml │ ├── maze_baseline_finetune.yaml │ └── maze_baseline_wmove_r1000_finetune.yaml └── maze_baseline_wphase │ ├── maze_baseline_phase.yaml │ ├── maze_baseline_phase_humanoid.yaml │ ├── maze_baseline_phase_wmove_r1000.yaml │ ├── maze_baseline_phase_wmove_r1000_humanoid.yaml │ ├── maze_baseline_phase_finetune.yaml │ ├── maze_baseline_phase_wmove_r1000_finetune.yaml │ ├── maze_baseline_phase_finetune_proprioceptivehumanoid.yaml │ └── maze_baseline_phase_wmove_r1000_finetune_proprioceptivehumanoid.yaml ├── environments ├── explorer_humanoid.py ├── explorer_ant.py ├── geom_utils.py ├── __init__.py ├── RewardCyclicEnv.py ├── assets │ ├── ant_custom_gear.xml │ ├── my_ant.xml │ ├── skull_maze_ant.xml │ └── cross_maze_ant.xml ├── mujoco_env.py ├── simple_humanoid_env.py └── proprioceptive_humanoid_env.py ├── LICENSE ├── README.md ├── .gitignore ├── distributions.py ├── hier_utils.py ├── utils.py ├── wrappers.py └── summarize_results.py /algo/__init__.py: -------------------------------------------------------------------------------- 1 | from .a2c_acktr import A2C_ACKTR 2 | from .ppo import PPO 3 | from .dqn import DQN 4 | from .passthrough import Passthrough 5 | -------------------------------------------------------------------------------- /options/visualization/eval_default.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | 4 | -------------------------------------------------------------------------------- /options/visualization/supervised_default.yaml: -------------------------------------------------------------------------------- 1 | alg_monitor_str: Alg.Monitor.csv 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | supervised_loss: 4 | log_name: 5 | train_loss: True 6 | val_loss: True 7 | data_src: alg_monitor 8 | data_type: multiscalar 9 | bin_size: 1 10 | smooth: 0 11 | -------------------------------------------------------------------------------- /options/visualization/simple_reward.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # 
Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | rewards_ep: 4 | log_name: 5 | reward_env: True 6 | data_src: episode_monitor 7 | data_type: multiscalar 8 | bin_size: 100 9 | smooth: 1 10 | -------------------------------------------------------------------------------- /options/visualization/reward_forward.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | rewards_ep: 4 | log_name: 5 | reward_forward: True 6 | data_src: episode_monitor 7 | data_type: multiscalar 8 | bin_size: 100 9 | smooth: 1 10 | -------------------------------------------------------------------------------- /options/visualization/goal_distance.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | rewards_ep: 4 | log_name: 5 | goal_distance_radius: True 6 | data_src: episode_monitor 7 | data_type: multiscalar 8 | bin_size: 100 9 | smooth: 1 10 | -------------------------------------------------------------------------------- /options/visualization/interp_reward_simple.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | rewards_ep: 4 | log_name: 5 | reward_interpolate: True 6 | data_src: episode_monitor 7 | data_type: multiscalar 8 | bin_size: 100 9 | smooth: 1 10 | -------------------------------------------------------------------------------- /options/visualization/interp_reward_surviveinterp.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | rewards_ep: 4 | log_name: 5 | reward_survive: True 6 | reward_interpolate: True 7 | data_src: episode_monitor 8 | data_type: multiscalar 9 | bin_size: 100 10 | smooth: 1 11 | -------------------------------------------------------------------------------- /options/visualization/arbitrary_theta_vis.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | step_monitor_str: Last.Step.Monitor.csv # Name of step 3 | plot_keys: # Values we want to plot (and necessary info to plot them) 4 | theta_xy_plot: 5 | log_name: 6 | state: True 7 | obs: True 8 | data_src: step_monitor 9 | update_delay: 0 10 | data_type: special 11 | window_once: True 12 | -------------------------------------------------------------------------------- /environments/explorer_humanoid.py: -------------------------------------------------------------------------------- 1 | from . 
import proprioceptive_humanoid_env 2 | import numpy as np 3 | 4 | # All obs but xy but yaw and z use integrals 5 | class LowlevelProprioceptiveHumanoidEnv(proprioceptive_humanoid_env.BaseProprioceptiveHumanoidEnv): 6 | # Initialize environment 7 | def __init__(self): 8 | super(LowlevelProprioceptiveHumanoidEnv, self).__init__() 9 | 10 | def _get_obs(self): 11 | s_internal, _ = self.get_intern_extern_state() 12 | return s_internal 13 | -------------------------------------------------------------------------------- /options/visualization/default.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | rewards_ep: 4 | log_name: 5 | reward_env: True 6 | reward_exp: False 7 | reward_run: False 8 | reward_ctrl: False 9 | reward_contact: False 10 | reward_survive: False 11 | reward_forward: False 12 | reward_move: False 13 | reward_cycle: False 14 | reward_cycle_s: False 15 | reward_cycle_a: False 16 | data_src: episode_monitor 17 | data_type: multiscalar 18 | bin_size: 100 19 | smooth: 1 20 | -------------------------------------------------------------------------------- /options/visualization/reward.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | rewards_ep: 4 | log_name: 5 | reward_env: True 6 | reward_exp: False 7 | reward_run: False 8 | reward_ctrl: False 9 | reward_contact: False 10 | reward_survive: False 11 | reward_forward: False 12 | reward_move: False 13 | reward_cycle: False 14 | reward_cycle_s: False 15 | reward_cycle_a: False 16 | reward_thresh: False 17 | data_src: episode_monitor 18 | data_type: multiscalar 19 | bin_size: 100 20 | smooth: 1 21 | -------------------------------------------------------------------------------- /options/visualization/interp_reward.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | rewards_ep: 4 | log_name: 5 | reward_interpolate: True 6 | reward_env: True 7 | reward_exp: False 8 | reward_run: False 9 | reward_ctrl: False 10 | reward_contact: False 11 | reward_survive: False 12 | reward_forward: False 13 | reward_move: False 14 | reward_cycle: False 15 | reward_cycle_s: False 16 | reward_cycle_a: False 17 | data_src: episode_monitor 18 | data_type: multiscalar 19 | bin_size: 100 20 | smooth: 1 21 | -------------------------------------------------------------------------------- /environments/explorer_ant.py: -------------------------------------------------------------------------------- 1 | from . 
import ant_env 2 | import numpy as np 3 | 4 | # Only contains the internal state of the ant in the observation 5 | class LowlevelAntEnv(ant_env.BaseAntEnv): 6 | # Initialize environment 7 | def __init__(self): 8 | super(LowlevelAntEnv, self).__init__() 9 | 10 | def _get_obs(self): 11 | s_internal, _ = self.get_intern_extern_state() 12 | return s_internal 13 | 14 | # Only contains the internal state of the ant in the observation 15 | class LowlevelAntLowGearEnv(ant_env.BaseAntLowGearEnv): 16 | # Initialize environment 17 | def __init__(self): 18 | super(LowlevelAntLowGearEnv, self).__init__(xml_file='ant_custom_gear.xml') 19 | 20 | def _get_obs(self): 21 | s_internal, _ = self.get_intern_extern_state() 22 | return s_internal 23 | 24 | -------------------------------------------------------------------------------- /algo/passthrough.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | 5 | # Does no updates, just does forward passes 6 | class Passthrough(object): 7 | def __init__(self, actor_critic): 8 | self.actor_critic = actor_critic 9 | 10 | # Generate a state_dict object 11 | def state_dict(self): 12 | ckpt = {} 13 | ckpt['model'] = self.actor_critic.state_dict() 14 | return ckpt 15 | 16 | # Load from a state dict 17 | def load_state_dict(self, ckpt): 18 | self.actor_critic.load_state_dict(ckpt['model']) 19 | 20 | # Load from pretrained (ModularPolicy) 21 | def load_pretrained_policies(self, ckpts): 22 | self.actor_critic.load_pretrained_policies(ckpts) 23 | 24 | # Update our policy network 25 | def update(self, rollouts): 26 | return 0, 0, 0 27 | 28 | -------------------------------------------------------------------------------- /options/visualization/plot_state_action_cycles.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | step_monitor_str: Last.Step.Monitor.csv # Name of step 3 | plot_keys: # Values we want to plot (and necessary info to plot them) 4 | obs_step: 5 | log_name: obs 6 | data_src: step_monitor 7 | update_delay: 0 8 | bin_size: 1 9 | smooth: 0 10 | data_type: array 11 | display_type: elementwise_subset 12 | start_ind: 0 13 | end_ind: 20 14 | time_start: 500 15 | time_end: 550 16 | action_step: 17 | log_name: action 18 | data_src: step_monitor 19 | update_delay: 0 20 | bin_size: 1 21 | smooth: 0 22 | data_type: array 23 | display_type: elementwise 24 | time_start: 500 25 | time_end: 550 26 | -------------------------------------------------------------------------------- /options/visualization/state_reward.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | delta_state_ep: 4 | log_name: delta_state 5 | data_src: episode_monitor 6 | data_type: array 7 | display_type: elementwise_subset 8 | start_ind: 0 9 | end_ind: 1 10 | bin_size: 100 11 | smooth: 1 12 | rewards_ep: 13 | log_name: 14 | reward_env: True 15 | reward_exp: False 16 | reward_run: False 17 | reward_ctrl: False 18 | reward_contact: False 19 | reward_survive: False 20 | reward_forward: False 21 | reward_move: False 22 | reward_cycle: False 23 | reward_cycle_s: False 24 | reward_cycle_a: False 25 | data_src: episode_monitor 26 | data_type: multiscalar 27 | bin_size: 100 28 | smooth: 1 29 | 
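The visualization configs in options/visualization/ all share the shape seen above: a monitor-file basename plus a plot_keys map whose entries name a data source, a data type, and binning/smoothing settings for one plot. As a rough illustration of how such a config could be consumed, here is a minimal sketch. It is not the repository's plotting code; the function name, the log-directory layout, the CSV column availability, and the smoothing window are assumptions made only for illustration.

import os
import yaml
import pandas as pd
import matplotlib.pyplot as plt

def plot_from_config(config_path, log_dir):
    # Load one of the options/visualization/*.yaml files
    with open(config_path) as f:
        opt = yaml.safe_load(f)

    # Episode monitor CSV is assumed to sit inside the run's log directory
    df = pd.read_csv(os.path.join(log_dir, opt['episode_monitor_str']))

    for key, spec in (opt.get('plot_keys') or {}).items():
        if spec.get('data_src') != 'episode_monitor':
            continue  # this sketch only reads the episode monitor
        if spec.get('data_type') != 'multiscalar':
            continue  # and only handles the multiscalar reward entries
        bin_size = spec.get('bin_size', 1)
        # log_name is a mapping of column name -> enabled flag in these configs
        columns = [c for c, on in (spec.get('log_name') or {}).items() if on]
        for col in columns:
            if col not in df.columns:
                continue
            # Average episodes into bins of bin_size, then optionally smooth
            binned = df[col].groupby(df.index // bin_size).mean()
            if spec.get('smooth'):
                binned = binned.rolling(window=10, min_periods=1).mean()  # window chosen arbitrarily
            plt.plot(binned.values, label='%s/%s' % (key, col))
    plt.legend()
    plt.xlabel('episode bin')
    plt.show()

# e.g. plot_from_config('options/visualization/simple_reward.yaml', 'some/run/log/dir')  # paths are placeholders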
-------------------------------------------------------------------------------- /options/visualization/state_xy_reward.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | delta_state_ep: 4 | log_name: delta_state 5 | data_src: episode_monitor 6 | data_type: array 7 | display_type: elementwise_subset 8 | start_ind: 0 9 | end_ind: 2 10 | bin_size: 100 11 | smooth: 1 12 | rewards_ep: 13 | log_name: 14 | reward_env: True 15 | reward_exp: False 16 | reward_run: False 17 | reward_ctrl: False 18 | reward_contact: False 19 | reward_survive: False 20 | reward_forward: False 21 | reward_move: False 22 | reward_cycle: False 23 | reward_cycle_s: False 24 | reward_cycle_a: False 25 | data_src: episode_monitor 26 | data_type: multiscalar 27 | bin_size: 100 28 | smooth: 1 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Kenneth Marino 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hrl-ep3 2 | Code for our paper: Hierarchical RL Using an Ensemble of Proprioceptive Periodic Policies 3 | 4 | ## Citation 5 | If you find this work helpful, please cite our work, as well as Ilya Kostrikov's pytoch baseline repo. 6 | 7 | @article{marino2019ep3, 8 | title={Hierarchical RL Using an Ensemble of Proprioceptive Periodic Policies}, 9 | author={Marino, Kenneth and Gupta, Abhinav and Fergus, Rob and Szlam, Arthur}, 10 | journal={ICLR}, 11 | year={2019} 12 | } 13 | 14 | ## Requirements 15 | * Python 3 16 | * [PyTorch](http://pytorch.org/) 17 | * [Visdom](https://github.com/facebookresearch/visdom) 18 | * [OpenAI baselines](https://github.com/openai/baselines) 19 | 20 | ## Acknowledgements 21 | This repo was initially copied from Ilya Kostrikov's pytorch baseline repo https://github.com/ikostrikov/pytorch-a2c-ppo-acktr . We will modify many of the original files and add new ones as is useful. We also will continue to use this as our baselines. See their github for more documentation on the baselines. 
Please also cite this repository in your publications if you make use of this code: 22 | 23 | @misc{pytorchrl, 24 | author = {Kostrikov, Ilya}, 25 | title = {PyTorch Implementations of Reinforcement Learning Algorithms}, 26 | year = {2018}, 27 | publisher = {GitHub}, 28 | journal = {GitHub repository}, 29 | howpublished = {\url{https://github.com/ikostrikov/pytorch-a2c-ppo-acktr}}, 30 | } 31 | 32 | 33 | -------------------------------------------------------------------------------- /options/visualization/all_episode.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | plot_keys: # Values we want to plot (and necessary info to plot them) 3 | delta_state_ep: 4 | log_name: delta_state 5 | data_src: episode_monitor 6 | data_type: array 7 | display_type: elementwise 8 | bin_size: 100 9 | smooth: 0 10 | delta_obs_ep: 11 | log_name: delta_obs 12 | data_src: episode_monitor 13 | data_type: array 14 | display_type: elementwise 15 | bin_size: 100 16 | smooth: 0 17 | mean_action_ep: 18 | log_name: mean_action 19 | data_src: episode_monitor 20 | data_type: array 21 | display_type: elementwise 22 | bin_size: 100 23 | smooth: 0 24 | episode_len_ep: 25 | log_name: episode_len 26 | data_src: episode_monitor 27 | data_type: scalar 28 | bin_size: 100 29 | smooth: 0 30 | rewards_ep: 31 | log_name: 32 | reward_env: True 33 | reward_exp: False 34 | reward_run: False 35 | reward_ctrl: False 36 | reward_contact: False 37 | reward_survive: False 38 | reward_forward: False 39 | reward_move: False 40 | reward_cycle: False 41 | reward_cycle_s: False 42 | reward_cycle_a: False 43 | data_src: episode_monitor 44 | data_type: multiscalar 45 | bin_size: 100 46 | smooth: 0 47 | -------------------------------------------------------------------------------- /options/visualization/state_reward_alg.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | alg_monitor_str: Alg.Monitor.csv # Basename of Alg monitor 3 | plot_keys: # Values we want to plot (and necessary info to plot them) 4 | delta_state_ep: 5 | log_name: delta_state 6 | data_src: episode_monitor 7 | data_type: array 8 | display_type: elementwise_subset 9 | start_ind: 0 10 | end_ind: 1 11 | bin_size: 100 12 | smooth: 1 13 | rewards_ep: 14 | log_name: 15 | reward_env: True 16 | reward_exp: False 17 | reward_run: False 18 | reward_ctrl: False 19 | reward_contact: False 20 | reward_survive: False 21 | reward_forward: False 22 | reward_move: False 23 | reward_cycle: False 24 | reward_cycle_s: False 25 | reward_cycle_a: False 26 | data_src: episode_monitor 27 | data_type: multiscalar 28 | bin_size: 100 29 | smooth: 1 30 | value_loss: 31 | log_name: value_loss 32 | data_src: alg_monitor 33 | data_type: scalar 34 | bin_size: 1 35 | smooth: 0 36 | action_loss: 37 | log_name: action_loss 38 | data_src: alg_monitor 39 | data_type: scalar 40 | bin_size: 1 41 | smooth: 0 42 | dist_entropy: 43 | log_name: dist_entropy 44 | data_src: alg_monitor 45 | data_type: scalar 46 | bin_size: 1 47 | smooth: 0 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Swap files 2 | *.sw* 3 | *.sv* 4 | *.su* 5 | *.st* 6 | 7 | # Temp ckpt folders 8 | ckpt*/ 9 | slurm_logs/ 10 | results/ 11 | logs/ 12 | sac-hierarchy/ 13 | eval_runs/ 14 | 15 | 
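# (.DS_Store below is macOS Finder metadata)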
# Whatever this is 16 | .DS_Store 17 | 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | *$py.class 22 | 23 | # C extensions 24 | *.so 25 | 26 | # Distribution / packaging 27 | .Python 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .coverage 59 | .coverage.* 60 | .cache 61 | nosetests.xml 62 | coverage.xml 63 | *.cover 64 | .hypothesis/ 65 | .pytest_cache/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | local_settings.py 74 | db.sqlite3 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # pyenv 93 | .python-version 94 | 95 | # celery beat schedule file 96 | celerybeat-schedule 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | trained_models/ 121 | .fuse_hidden* 122 | -------------------------------------------------------------------------------- /distributions.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from utils import init, init_normc_, AddBias 7 | 8 | """ 9 | Modify standard PyTorch distributions so they are compatible with this code. 
10 | """ 11 | 12 | FixedCategorical = torch.distributions.Categorical 13 | 14 | old_sample = FixedCategorical.sample 15 | FixedCategorical.sample = lambda self: old_sample(self).unsqueeze(-1) 16 | 17 | log_prob_cat = FixedCategorical.log_prob 18 | FixedCategorical.log_probs = lambda self, actions: log_prob_cat(self, actions.squeeze(-1)).unsqueeze(-1) 19 | 20 | FixedCategorical.mode = lambda self: self.probs.argmax(dim=1, keepdim=True) 21 | 22 | FixedNormal = torch.distributions.Normal 23 | log_prob_normal = FixedNormal.log_prob 24 | FixedNormal.log_probs = lambda self, actions: log_prob_normal(self, actions).sum(-1, keepdim=True) 25 | 26 | entropy = FixedNormal.entropy 27 | FixedNormal.entropy = lambda self: entropy(self).sum(-1) 28 | 29 | FixedNormal.mode = lambda self: self.mean 30 | 31 | 32 | class Categorical(nn.Module): 33 | def __init__(self, num_inputs, num_outputs): 34 | super(Categorical, self).__init__() 35 | 36 | init_ = lambda m: init(m, 37 | nn.init.orthogonal_, 38 | lambda x: nn.init.constant_(x, 0), 39 | gain=0.01) 40 | 41 | self.linear = init_(nn.Linear(num_inputs, num_outputs)) 42 | 43 | def forward(self, x): 44 | x = self.linear(x) 45 | return FixedCategorical(logits=x) 46 | 47 | 48 | class DiagGaussian(nn.Module): 49 | def __init__(self, num_inputs, num_outputs): 50 | super(DiagGaussian, self).__init__() 51 | 52 | init_ = lambda m: init(m, 53 | init_normc_, 54 | lambda x: nn.init.constant_(x, 0)) 55 | 56 | self.fc_mean = init_(nn.Linear(num_inputs, num_outputs)) 57 | self.logstd = AddBias(torch.zeros(num_outputs)) 58 | 59 | def forward(self, x): 60 | action_mean = self.fc_mean(x) 61 | 62 | # An ugly hack for my KFAC implementation. 63 | zeros = torch.zeros(action_mean.size()) 64 | if x.is_cuda: 65 | zeros = zeros.cuda() 66 | 67 | action_logstd = self.logstd(zeros) 68 | return FixedNormal(action_mean, action_logstd.exp()) -------------------------------------------------------------------------------- /environments/geom_utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import pdb 4 | 5 | # Convert to/from quaternion 6 | def quaternion_to_euler_angle(quart): 7 | w = quart[0] 8 | x = quart[1] 9 | y = quart[2] 10 | z = quart[3] 11 | 12 | ysqr = y * y 13 | 14 | t0 = +2.0 * (w * x + y * z) 15 | t1 = +1.0 - 2.0 * (x * x + ysqr) 16 | roll = math.atan2(t0, t1) 17 | 18 | t2 = +2.0 * (w * y - z * x) 19 | t2 = +1.0 if t2 > +1.0 else t2 20 | t2 = -1.0 if t2 < -1.0 else t2 21 | pitch = math.asin(t2) 22 | 23 | t3 = +2.0 * (w * z + x * y) 24 | t4 = +1.0 - 2.0 * (ysqr + z * z) 25 | yaw = math.atan2(t3, t4) 26 | 27 | return roll, pitch, yaw 28 | 29 | def euler_angle_to_quaternion(roll, pitch, yaw): 30 | cy = math.cos(yaw * 0.5); 31 | sy = math.sin(yaw * 0.5); 32 | cr = math.cos(roll * 0.5); 33 | sr = math.sin(roll * 0.5); 34 | cp = math.cos(pitch * 0.5); 35 | sp = math.sin(pitch * 0.5); 36 | 37 | w = cy * cr * cp + sy * sr * sp 38 | x = cy * sr * cp - sy * cr * sp 39 | y = cy * cr * sp + sy * sr * cp 40 | z = sy * cr * cp - cy * sr * sp 41 | 42 | return w, x, y, z 43 | 44 | # Angle to unit vector 45 | def angle_to_unit(angle): 46 | x = math.cos(angle) 47 | y = math.sin(angle) 48 | return np.array([x, y]) 49 | 50 | # Unit vector to angle 51 | def unit_to_angle(v): 52 | x = v[0] 53 | y = v[1] 54 | angle = math.atan2(y, x) 55 | 56 | # Average angles (do this by averaging unit vectors) 57 | def average_angles(angles): 58 | # Convert to unit vectors and average 59 | unit_vecs = [angle_to_unit(a) for a in 
angles] 60 | avg_dir = np.mean(unit_vecs, 0) 61 | 62 | # Return direction of the average unit vector 63 | avg_angle = math.atan2(avg_dir[1], avg_dir[0]) 64 | return avg_angle 65 | 66 | # Convert angle to an egocentric coordinate 67 | def convert_to_egocentric(ego_to_global_angle, global_angle): 68 | # ego_to_global_angle - the angle of the agent in the global coordinate system 69 | # global_angle - the angle (rad) in global coordinates we want to be egocentric 70 | ego_angle = global_angle - ego_to_global_angle 71 | if ego_angle > math.pi: 72 | ego_angle -= 2*math.pi 73 | elif ego_angle < -math.pi: 74 | ego_angle += 2*math.pi 75 | 76 | return ego_angle 77 | 78 | # Convert vector to an egocentric coordinate 79 | def convert_vector_to_egocentric(ego_to_global_angle, vector): 80 | #pdb.set_trace() 81 | # Get magnitude and direction 82 | xy_mag = np.linalg.norm(vector) 83 | xy_angle = math.atan2(vector[1], vector[0]) 84 | 85 | # Change direction to egocentric 86 | xy_angle = convert_to_egocentric(ego_to_global_angle, xy_angle) 87 | 88 | # Reform the vector 89 | x = xy_mag * math.cos(xy_angle) 90 | y = xy_mag * math.sin(xy_angle) 91 | ego_vec = np.array([x, y]) 92 | 93 | return ego_vec 94 | -------------------------------------------------------------------------------- /options/visualization/mujoco_verbose.yaml: -------------------------------------------------------------------------------- 1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor 2 | step_monitor_str: Last.Step.Monitor.csv # Name of step monitor (for last step) 3 | plot_keys: # Values we want to plot (and necessary info to plot them) 4 | delta_state_ep: 5 | log_name: delta_state 6 | data_src: episode_monitor 7 | data_type: array 8 | display_type: elementwise 9 | bin_size: 100 10 | smooth: 0 11 | delta_obs_ep: 12 | log_name: delta_obs 13 | data_src: episode_monitor 14 | data_type: array 15 | display_type: elementwise 16 | bin_size: 100 17 | smooth: 0 18 | mean_action_ep: 19 | log_name: mean_action 20 | data_src: episode_monitor 21 | data_type: array 22 | display_type: elementwise 23 | bin_size: 100 24 | smooth: 0 25 | episode_len_ep: 26 | log_name: episode_len 27 | data_src: episode_monitor 28 | data_type: scalar 29 | bin_size: 100 30 | smooth: 0 31 | rewards_ep: 32 | log_name: 33 | reward_env: True 34 | reward_exp: False 35 | reward_run: False 36 | reward_ctrl: False 37 | reward_contact: False 38 | reward_survive: False 39 | reward_forward: False 40 | reward_move: False 41 | reward_cycle: False 42 | reward_cycle_s: False 43 | reward_cycle_a: False 44 | data_src: episode_monitor 45 | data_type: multiscalar 46 | bin_size: 100 47 | smooth: 0 48 | obs_step: 49 | log_name: obs 50 | data_src: step_monitor 51 | update_delay: 10 52 | data_type: array 53 | display_type: elementwise 54 | bin_size: 1 55 | smooth: 0 56 | action_step: 57 | log_name: action 58 | data_src: step_monitor 59 | update_delay: 10 60 | data_type: array 61 | display_type: elementwise 62 | bin_size: 1 63 | smooth: 0 64 | env_count_step: 65 | log_name: env_count 66 | data_src: step_monitor 67 | update_delay: 10 68 | data_type: scalar 69 | bin_size: 1 70 | smooth: 0 71 | episode_count_step: 72 | log_name: episode_count 73 | data_src: step_monitor 74 | update_delay: 10 75 | data_type: single_value 76 | reward_step: 77 | log_name: 78 | reward_env: True 79 | reward_exp: False 80 | reward_run: False 81 | reward_ctrl: False 82 | reward_contact: False 83 | reward_survive: False 84 | reward_forward: False 85 | reward_move: False 86 | reward_cycle: False 87 
| reward_cycle_s: False 88 | reward_cycle_a: False 89 | data_src: step_monitor 90 | update_delay: 10 91 | data_type: multiscalar 92 | bin_size: 1 93 | smooth: 0 94 | -------------------------------------------------------------------------------- /environments/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='ExplorerAnt-v2', 5 | entry_point='environments.explorer_ant:LowlevelAntEnv', 6 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 7 | ) 8 | 9 | register( 10 | id='ExplorerAntLowGear-v2', 11 | entry_point='environments.explorer_ant:LowlevelAntLowGearEnv', 12 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 13 | ) 14 | 15 | register( 16 | id='ExplorerProprioceptiveHumanoid-v2', 17 | entry_point='environments.explorer_humanoid:LowlevelProprioceptiveHumanoidEnv', 18 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 19 | ) 20 | 21 | register( 22 | id='HierarchicalAnt-v2', 23 | entry_point='environments.explorer_ant:HierarchyAntEnv', 24 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 25 | ) 26 | 27 | register( 28 | id='MyMazeDebugEnv-v2', 29 | entry_point='environments.maze_ant:ShowMazeEnv', 30 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 31 | ) 32 | 33 | register( 34 | id='AntNavigateEnv-v2', 35 | entry_point='environments.maze_ant:AntNavigateEnv', 36 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 37 | ) 38 | 39 | register( 40 | id='AntNavigateEnv10k-v2', 41 | entry_point='environments.maze_ant:AntNavigateEnv', 42 | tags={'wrapper_config.TimeLimit.max_episode_steps': 10000}, 43 | ) 44 | 45 | register( 46 | id='AntNavigateLowGearEnv-v2', 47 | entry_point='environments.maze_ant:AntNavigateLowGearEnv', 48 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 49 | ) 50 | 51 | register( 52 | id='AntCrossMazeEnv-v2', 53 | entry_point='environments.maze_ant:AntCrossMazeEnv', 54 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 55 | ) 56 | 57 | register( 58 | id='AntTMazeEnv-v2', 59 | entry_point='environments.maze_ant:AntTMazeEnv', 60 | tags={'wrapper_config.TimeLimit.max_episode_steps': 2000}, 61 | ) 62 | 63 | register( 64 | id='AntTMazeEnv10k-v2', 65 | entry_point='environments.maze_ant:AntTMazeEnv', 66 | tags={'wrapper_config.TimeLimit.max_episode_steps': 10000}, 67 | ) 68 | 69 | register( 70 | id='AntSkullMazeEnv-v2', 71 | entry_point='environments.maze_ant:AntSkullMazeEnv', 72 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 73 | ) 74 | 75 | register( 76 | id='AntDebugMazeEnv-v2', 77 | entry_point='environments.maze_ant:DebugAntMazeEnv', 78 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 79 | ) 80 | 81 | register( 82 | id='AntDebugMazeLeftEnv-v2', 83 | entry_point='environments.maze_ant:DebugAntMazeLeftEnv', 84 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 85 | ) 86 | 87 | register( 88 | id='AntDebugMazeRightEnv-v2', 89 | entry_point='environments.maze_ant:DebugAntMazeRightEnv', 90 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000}, 91 | ) 92 | 93 | register( 94 | id='ProprioceptiveHumanoidSmallCrossMazeEnv10k-v2', 95 | entry_point='environments.maze_humanoid:ProprioceptiveHumanoidSmallCrossMazeEnv', 96 | tags={'wrapper_config.TimeLimit.max_episode_steps': 10000}, 97 | ) 98 | 99 | -------------------------------------------------------------------------------- /algo/a2c_acktr.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | 5 | from .kfac import KFACOptimizer 6 | 7 | class A2C_ACKTR(object): 8 | def __init__(self, 9 | actor_critic, 10 | value_loss_coef, 11 | entropy_coef, 12 | lr=None, 13 | eps=None, 14 | alpha=None, 15 | max_grad_norm=None, 16 | acktr=False): 17 | 18 | self.actor_critic = actor_critic 19 | self.acktr = acktr 20 | 21 | self.value_loss_coef = value_loss_coef 22 | self.entropy_coef = entropy_coef 23 | 24 | self.max_grad_norm = max_grad_norm 25 | 26 | if acktr: 27 | self.optimizer = KFACOptimizer(actor_critic) 28 | else: 29 | self.optimizer = optim.RMSprop( 30 | actor_critic.parameters(), lr, eps=eps, alpha=alpha) 31 | 32 | # Generate a state_dict object 33 | def state_dict(self): 34 | ckpt = {} 35 | ckpt['model'] = self.actor_critic.state_dict() 36 | ckpt['optim'] = self.optimizer.state_dict() 37 | return ckpt 38 | 39 | # Load from a state dict 40 | def load_state_dict(self, ckpt): 41 | self.actor_critic.load_state_dict(ckpt['model']) 42 | self.optimizer.load_state_dict(ckpt['optim']) 43 | 44 | # Update policy 45 | def update(self, rollouts): 46 | obs_shape = rollouts.observations.size()[2:] 47 | action_shape = rollouts.actions.size()[-1] 48 | num_steps, num_processes, _ = rollouts.rewards.size() 49 | 50 | values, action_log_probs, dist_entropy, states = self.actor_critic.evaluate_actions( 51 | rollouts.observations[:-1].view(-1, *obs_shape), 52 | rollouts.states[0].view(-1, self.actor_critic.state_size), 53 | rollouts.masks[:-1].view(-1, 1), 54 | rollouts.actions.view(-1, action_shape)) 55 | 56 | values = values.view(num_steps, num_processes, 1) 57 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 58 | 59 | advantages = rollouts.returns[:-1] - values 60 | value_loss = advantages.pow(2).mean() 61 | 62 | action_loss = -(advantages.detach() * action_log_probs).mean() 63 | 64 | if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0: 65 | # Sampled fisher, see Martens 2014 66 | self.actor_critic.zero_grad() 67 | pg_fisher_loss = -action_log_probs.mean() 68 | 69 | value_noise = torch.randn(values.size()) 70 | if values.is_cuda: 71 | value_noise = value_noise.cuda() 72 | 73 | sample_values = values + value_noise 74 | vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean() 75 | 76 | fisher_loss = pg_fisher_loss + vf_fisher_loss 77 | self.optimizer.acc_stats = True 78 | fisher_loss.backward(retain_graph=True) 79 | self.optimizer.acc_stats = False 80 | 81 | self.optimizer.zero_grad() 82 | (value_loss * self.value_loss_coef + action_loss - 83 | dist_entropy * self.entropy_coef).backward() 84 | 85 | if self.acktr == False: 86 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 87 | self.max_grad_norm) 88 | 89 | self.optimizer.step() 90 | 91 | return value_loss.item(), action_loss.item(), dist_entropy.item() 92 | -------------------------------------------------------------------------------- /options/baseline_lowlevel/baseline_simplemlp_pretrain_any.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 10 # Number of ppo epochs 9 | num_mini_batch: 32 # 
Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: baseline_lowlevel # Mode is baseline with theta 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | env: 29 | gamma: 0.99 # Discount factor for rewards 30 | num_stack: 1 # Number of frames to stack 31 | add_timestep: False # Add timestep to observations 32 | known_reset: False # Reset to known position 33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 34 | time_scale: 0.001 # What to multiply timestep by for AC input 35 | theta_space_mode: pretrain_any # What theta mode we're in 36 | theta_reset_mode: never # When to change theta 37 | theta_reward_mode: lax # How to punish perpendicular movement 38 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 39 | theta_memory_lookback: 10 # How far to look back for reference global theta 40 | time_limit: 1000 # When to end an episode 41 | reward_shape_type: instant 42 | logs: 43 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 44 | exp_name: baseline_simplemlp_pretrain_any # Unique experiment name 45 | log_interval: 1 # Log interval, one log per n updates 46 | save_interval: 100000000000 # Save interval, one per n updates 47 | vis_interval: 1 # Vis interval, one log per n updates 48 | optim_ppo: 49 | lr: 0.0003 # Learning rate 50 | eps: 0.00001 # RMSprop optimizer epsiolon 51 | alpha: 0.99 # RMSprop optimizer alpha 52 | max_grad_norm: 0.5 # Max norm of gradients 53 | num_frames: 2000000 # Number of frames to train 54 | optim_a2c: 55 | lr: 0.0007 # Learning rate 56 | eps: 0.00001 # RMSprop optimizer epsiolon 57 | alpha: 0.99 # RMSprop optimizer alpha 58 | max_grad_norm: 0.5 # Max norm of gradients 59 | num_frames: 2000000 # Number of frames to train 60 | -------------------------------------------------------------------------------- /hier_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import collections 4 | import numpy as np 5 | import math 6 | import pdb 7 | 8 | # Class that handles all the messy hierarchical observation stuff 9 | class HierarchyUtils(object): 10 | def __init__(self, ll_obs_sz, hl_obs_sz, hl_action_space, theta_sz, add_count): 11 | self.ll_obs_sz = ll_obs_sz 12 | if add_count: 13 | self.ll_raw_obs_sz = [self.ll_obs_sz[0] - theta_sz - 1] 14 | else: 15 | self.ll_raw_obs_sz = [self.ll_obs_sz[0] - theta_sz] 16 | self.hl_obs_sz = hl_obs_sz 17 | self.theta_sz = theta_sz 18 | self.hl_action_space = hl_action_space 19 | self.add_count = add_count 20 | 21 | # Seperate out highlevel, lowlevel and counts 22 | def seperate_obs(self, obs): 23 | ll_raw_obs = obs[:, :self.ll_raw_obs_sz[0]] 
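        # The combined observation handled here is laid out as
        # [ low-level raw obs | high-level obs | count ], matching the sizes
        # set up in __init__; this slice and the ones just below peel those
        # pieces off in that order.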
24 | assert(ll_raw_obs.shape[-1] == self.ll_raw_obs_sz[0]) 25 | hl_obs = obs[:, self.ll_raw_obs_sz[0]:-1] 26 | assert(hl_obs.shape[-1] == self.hl_obs_sz[0]) 27 | count = obs[:, -1] 28 | return hl_obs, ll_raw_obs, count 29 | 30 | # Append theta and count to ll obs 31 | def append_theta(self, ll_raw_obs, hl_action, counts): 32 | # Get theta 33 | if self.hl_action_space.__class__.__name__ == 'Discrete': 34 | assert(self.theta_sz == self.hl_action_space.n) 35 | thetas = np.zeros([len(hl_action), self.theta_sz]) 36 | for e, act in enumerate(hl_action): 37 | thetas[e, act] = 1 38 | else: 39 | thetas = hl_action 40 | 41 | # Concanetate 42 | if self.add_count: 43 | if len(counts.shape) != len(ll_raw_obs.shape): 44 | counts = np.expand_dims(counts, axis=1) 45 | ll_obs = np.concatenate([ll_raw_obs, thetas, counts], 1) 46 | else: 47 | ll_obs = np.concatenate([ll_raw_obs, thetas], 1) 48 | assert(ll_obs.shape[-1] == self.ll_obs_sz[0]) 49 | 50 | return ll_obs 51 | 52 | # Append placeholder theta and count to ll obs 53 | def placeholder_theta(self, ll_raw_obs, counts): 54 | thetas = float('inf') * np.ones([len(ll_raw_obs), self.theta_sz]) 55 | 56 | # Concanetate 57 | if self.add_count: 58 | if len(counts.shape) != len(ll_raw_obs.shape): 59 | counts = np.expand_dims(counts, axis=1) 60 | ll_obs = np.concatenate([ll_raw_obs, thetas, counts], 1) 61 | else: 62 | ll_obs = np.concatenate([ll_raw_obs, thetas], 1) 63 | assert(ll_obs.shape[-1] == self.ll_obs_sz[0]) 64 | 65 | return ll_obs 66 | 67 | # Update ll_obs to remove placeholders 68 | def update_theta(self, ll_obs, hl_action): 69 | # Take in single obs and high level action and update away the placehodler 70 | assert(self.has_placeholder(ll_obs)) 71 | assert(ll_obs.shape == self.ll_obs_sz) 72 | 73 | # Get theta 74 | if self.hl_action_space.__class__.__name__ == 'Discrete': 75 | assert(self.theta_sz == self.hl_action_space.n) 76 | theta = torch.zeros(self.theta_sz) 77 | theta[hl_action] = 1 78 | else: 79 | theta = torch.from_numpy(hl_action) 80 | 81 | # Update observation with theta 82 | if self.add_count: 83 | ll_obs[self.ll_raw_obs_sz[0]:-1] = theta 84 | else: 85 | ll_obs[self.ll_raw_obs_sz[0]:] = theta 86 | assert(not self.has_placeholder(ll_obs)) 87 | return ll_obs 88 | 89 | # Check if ll_obs has a placeholder 90 | def has_placeholder(self, ll_obs): 91 | if float('inf') in ll_obs: 92 | return True 93 | else: 94 | return False 95 | 96 | -------------------------------------------------------------------------------- /options/baseline_lowlevel/baseline_simplemlp_skip_hs16_pretrain_any_20M.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 10 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How 
much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: baseline_lowlevel # Mode is lowlevel phase 24 | hid_sz: 16 # MLP hidden size 25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | env: 29 | gamma: 0.99 # Discount factor for rewards 30 | num_stack: 1 # Number of frames to stack 31 | add_timestep: False # Add timestep to observations 32 | known_reset: False # Reset to known position 33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 34 | time_scale: 0.001 # What to multiply timestep by for AC input 35 | theta_space_mode: pretrain_any # What theta mode we're in 36 | theta_reset_mode: never # When to change theta 37 | theta_reward_mode: lax # How to punish perpendicular movement 38 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 39 | theta_memory_lookback: 10 # How far to look back for reference global theta 40 | time_limit: 1000 # When to end an episode 41 | reward_shape_type: instant 42 | logs: 43 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 44 | exp_name: baseline_simplemlp_skip_hs16_pretrain_any_20M # Unique experiment name 45 | log_interval: 1 # Log interval, one log per n updates 46 | save_interval: 100000000000 # Save interval, one per n updates 47 | vis_interval: 1 # Vis interval, one log per n updates 48 | optim_ppo: 49 | lr: 0.0003 # Learning rate 50 | eps: 0.00001 # RMSprop optimizer epsiolon 51 | alpha: 0.99 # RMSprop optimizer alpha 52 | max_grad_norm: 0.5 # Max norm of gradients 53 | num_frames: 20000000 # Number of frames to train 54 | optim_a2c: 55 | lr: 0.0007 # Learning rate 56 | eps: 0.00001 # RMSprop optimizer epsiolon 57 | alpha: 0.99 # RMSprop optimizer alpha 58 | max_grad_norm: 0.5 # Max norm of gradients 59 | num_frames: 20000000 # Number of frames to train 60 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import collections 4 | import numpy as np 5 | import math 6 | import pdb 7 | 8 | # Necessary for my KFAC implementation. 
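# AddBias adds a learnable bias vector to its input; DiagGaussian in
# distributions.py uses it to hold the state-independent log-std of the
# Gaussian policy distribution.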
9 | class AddBias(nn.Module): 10 | def __init__(self, bias): 11 | super(AddBias, self).__init__() 12 | self._bias = nn.Parameter(bias.unsqueeze(1)) 13 | 14 | def forward(self, x): 15 | if x.dim() == 2: 16 | bias = self._bias.t().view(1, -1) 17 | else: 18 | bias = self._bias.t().view(1, -1, 1, 1) 19 | 20 | return x + bias 21 | 22 | # Rolling average 23 | class RollingAverage(object): 24 | def __init__(self, window_sz): 25 | self.window_sz = window_sz 26 | self.data = collections.deque() 27 | self.sum = None 28 | 29 | # Append item and update sum and data struct 30 | def append(self, item): 31 | assert(type(item) is np.ndarray) 32 | # If full, pop left and remove remove from average 33 | if len(self.data) == self.window_sz: 34 | removed = self.data.popleft() 35 | self.sum -= removed 36 | 37 | # Update sum with new item and add to data 38 | if len(self.data) == 0: 39 | self.sum = item 40 | else: 41 | self.sum += item 42 | self.data.append(item) 43 | assert(len(self.data) <= self.window_sz) 44 | 45 | # Return the average value 46 | def average(self): 47 | # Exception if list is empty 48 | if len(self.data) == 0: 49 | raise Exception("Can't compute rolling average on empty list") 50 | 51 | # Return average 52 | return self.sum / len(self.data) 53 | 54 | # Convert to/from quaternion 55 | def quaternion_to_euler_angle(w, x, y, z): 56 | ysqr = y * y 57 | 58 | t0 = +2.0 * (w * x + y * z) 59 | t1 = +1.0 - 2.0 * (x * x + ysqr) 60 | roll = math.atan2(t0, t1) 61 | 62 | t2 = +2.0 * (w * y - z * x) 63 | t2 = +1.0 if t2 > +1.0 else t2 64 | t2 = -1.0 if t2 < -1.0 else t2 65 | pitch = math.asin(t2) 66 | 67 | t3 = +2.0 * (w * z + x * y) 68 | t4 = +1.0 - 2.0 * (ysqr + z * z) 69 | yaw = math.atan2(t3, t4) 70 | 71 | return roll, pitch, yaw 72 | 73 | def euler_angle_to_quaternion(roll, pitch, yaw): 74 | cy = math.cos(yaw * 0.5); 75 | sy = math.sin(yaw * 0.5); 76 | cr = math.cos(roll * 0.5); 77 | sr = math.sin(roll * 0.5); 78 | cp = math.cos(pitch * 0.5); 79 | sp = math.sin(pitch * 0.5); 80 | 81 | w = cy * cr * cp + sy * sr * sp 82 | x = cy * sr * cp - sy * cr * sp 83 | y = cy * cr * sp + sy * sr * cp 84 | z = sy * cr * cp - cy * sr * sp 85 | 86 | return w, x, y, z 87 | 88 | # Angle to unit vector 89 | def angle_to_unit(angle): 90 | x = math.cos(angle) 91 | y = math.sin(angle) 92 | return np.array([x, y]) 93 | 94 | # Unit vector to angle 95 | def unit_to_angle(v): 96 | x = v[0] 97 | y = v[1] 98 | angle = math.atan2(y, x) 99 | 100 | # Convert angle to an egocentric coordinate (All in unit vectors) 101 | def convert_to_egocentric(ego_to_global_angle, global_angle): 102 | # ego_to_global_angle - the angle of the agent in the global coordinate system 103 | # global_angle - the angle (rad) in global coordinates we want to be egocentric 104 | ego_angle = global_angle - ego_to_global_angle 105 | if ego_angle > math.pi: 106 | ego_angle -= 2*math.pi 107 | elif ego_angle < -math.pi: 108 | ego_angle += 2*math.pi 109 | 110 | return ego_angle 111 | 112 | def init(module, weight_init, bias_init, gain=1): 113 | weight_init(module.weight.data, gain=gain) 114 | bias_init(module.bias.data) 115 | return module 116 | 117 | # https://github.com/openai/baselines/blob/master/baselines/common/tf_util.py#L87 118 | def init_normc_(weight, gain=1): 119 | weight.normal_(0, 1) 120 | weight *= gain / torch.sqrt(weight.pow(2).sum(1, keepdim=True)) 121 | -------------------------------------------------------------------------------- /algo/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 
2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import pdb 5 | 6 | class PPO(object): 7 | def __init__(self, 8 | actor_critic, 9 | clip_param, 10 | ppo_epoch, 11 | num_mini_batch, 12 | value_loss_coef, 13 | entropy_coef, 14 | lr=None, 15 | eps=None, 16 | max_grad_norm=None): 17 | 18 | self.actor_critic = actor_critic 19 | 20 | self.clip_param = clip_param 21 | self.ppo_epoch = ppo_epoch 22 | self.num_mini_batch = num_mini_batch 23 | 24 | self.value_loss_coef = value_loss_coef 25 | self.entropy_coef = entropy_coef 26 | 27 | self.max_grad_norm = max_grad_norm 28 | 29 | self.optimizer = optim.Adam(actor_critic.parameters(), lr=lr, eps=eps) 30 | 31 | # Generate a state_dict object 32 | def state_dict(self): 33 | ckpt = {} 34 | ckpt['model'] = self.actor_critic.state_dict() 35 | ckpt['optim'] = self.optimizer.state_dict() 36 | return ckpt 37 | 38 | # Load from a state dict 39 | def load_state_dict(self, ckpt): 40 | self.actor_critic.load_state_dict(ckpt['model']) 41 | self.optimizer.load_state_dict(ckpt['optim']) 42 | 43 | # Load from pretrained (ModularPolicy) 44 | def load_pretrained_policies(self, ckpts): 45 | self.actor_critic.load_pretrained_policies(ckpts) 46 | 47 | # Update our policy network 48 | def update(self, rollouts): 49 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] 50 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) 51 | 52 | value_loss_epoch = 0 53 | action_loss_epoch = 0 54 | dist_entropy_epoch = 0 55 | 56 | for e in range(self.ppo_epoch): 57 | if self.actor_critic.is_recurrent: #hasattr(self.actor_critic.base, 'gru'): 58 | data_generator = rollouts.recurrent_generator( 59 | advantages, self.num_mini_batch) 60 | else: 61 | data_generator = rollouts.feed_forward_generator( 62 | advantages, self.num_mini_batch) 63 | 64 | for sample in data_generator: 65 | observations_batch, states_batch, actions_batch, \ 66 | return_batch, masks_batch, old_action_log_probs_batch, \ 67 | adv_targ = sample 68 | 69 | # Reshape to do in a single forward pass for all steps 70 | values, action_log_probs, dist_entropy, states = self.actor_critic.evaluate_actions( 71 | observations_batch, states_batch, 72 | masks_batch, actions_batch) 73 | 74 | ratio = torch.exp(action_log_probs - old_action_log_probs_batch) 75 | surr1 = ratio * adv_targ 76 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 77 | 1.0 + self.clip_param) * adv_targ 78 | action_loss = -torch.min(surr1, surr2).mean() 79 | 80 | value_loss = (return_batch - values).pow(2).mean() 81 | 82 | self.optimizer.zero_grad() 83 | (value_loss * self.value_loss_coef + action_loss - 84 | dist_entropy * self.entropy_coef).backward() 85 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 86 | self.max_grad_norm) 87 | self.optimizer.step() 88 | 89 | value_loss_epoch += value_loss.item() 90 | action_loss_epoch += action_loss.item() 91 | dist_entropy_epoch += dist_entropy.item() 92 | 93 | num_updates = self.ppo_epoch * self.num_mini_batch 94 | 95 | value_loss_epoch /= num_updates 96 | action_loss_epoch /= num_updates 97 | dist_entropy_epoch /= num_updates 98 | 99 | return value_loss_epoch, action_loss_epoch, dist_entropy_epoch 100 | -------------------------------------------------------------------------------- /options/phase_lowlevel/phase_mlp_pretrain_any.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 
| value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 10 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: phase_lowlevel # Mode is lowlevel phase 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: False 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: True # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | logs: 51 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 52 | exp_name: phase_mlp_pretrain_any_20M # Unique experiment name 53 | log_interval: 1 # Log interval, one log per n updates 54 | save_interval: 100000000000 # Save interval, one per n updates 55 | vis_interval: 1 # Vis interval, one log per n updates 56 | optim_ppo: 57 | lr: 0.0003 # Learning rate 58 | eps: 0.00001 # RMSprop optimizer epsiolon 59 | alpha: 0.99 # RMSprop optimizer alpha 60 | max_grad_norm: 0.5 # Max norm of gradients 61 | num_frames: 2000000 # Number of frames to train 62 | optim_a2c: 63 | lr: 0.0007 # Learning rate 64 | eps: 0.00001 # RMSprop optimizer epsiolon 65 | alpha: 0.99 # RMSprop optimizer alpha 66 | max_grad_norm: 0.5 # Max norm of gradients 67 | num_frames: 2000000 # Number of frames to train 68 | -------------------------------------------------------------------------------- /options/phase_lowlevel/phase_mlp_skip_hs16_pretrain_any_20M.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term 
coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 10 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: phase_lowlevel # Mode is lowlevel phase 24 | hid_sz: 16 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: True 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: True # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | logs: 51 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 52 | exp_name: phase_mlp_skip_hs16_pretrain_any_20M # Unique experiment name 53 | log_interval: 1 # Log interval, one log per n updates 54 | save_interval: 100000000000 # Save interval, one per n updates 55 | vis_interval: 1 # Vis interval, one log per n updates 56 | optim_ppo: 57 | lr: 0.0003 # Learning rate 58 | eps: 0.00001 # RMSprop optimizer epsiolon 59 | alpha: 0.99 # RMSprop optimizer alpha 60 | max_grad_norm: 0.5 # Max norm of gradients 61 | num_frames: 20000000 # Number of frames to train 62 | optim_a2c: 63 | lr: 0.0007 # Learning rate 64 | eps: 0.00001 # RMSprop optimizer epsiolon 65 | alpha: 0.99 # RMSprop optimizer alpha 66 | max_grad_norm: 0.5 # Max norm of gradients 67 | num_frames: 20000000 # Number of frames to train 68 | -------------------------------------------------------------------------------- /wrappers.py: -------------------------------------------------------------------------------- 1 | # Modified by Kenneth Marino 2 | # VecNormalize originally copied from https://github.com/openai/baselines/ 3 | from vec_env import VecEnvWrapper 4 | from 
baselines.common.running_mean_std import RunningMeanStd 5 | import numpy as np 6 | import pdb 7 | 8 | # From openai baselines originally 9 | # My version of this saves the unclipped/unnormalized values for logging and other purposes 10 | class ObservationFilter(VecEnvWrapper): 11 | """ 12 | Vectorized environment wrapper that normalizes observations (and optionally returns) while keeping the raw values for logging 13 | """ 14 | def __init__(self, venv, ob=True, ret=True, train=True, noclip=False, has_timestep=False, ignore_mask=None, freeze_mask=None, time_scale=1e-3, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): 15 | VecEnvWrapper.__init__(self, venv) 16 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None 17 | self.ret_rms = RunningMeanStd(shape=()) if ret else None 18 | self.clipob = clipob 19 | self.cliprew = cliprew 20 | self.ret = np.zeros(self.num_envs) 21 | self.train = train 22 | self.gamma = gamma 23 | self.epsilon = epsilon 24 | self.noclip = noclip 25 | self.ignore_mask = ignore_mask 26 | self.freeze_mask = freeze_mask 27 | self.has_timestep = has_timestep 28 | self.time_scale = time_scale 29 | 30 | def step_wait(self): 31 | """ 32 | Apply sequence of actions to sequence of environments 33 | actions -> (observations, rewards, news) 34 | where 'news' is a boolean vector indicating whether each element is new. 35 | """ 36 | obs, rews, news, infos = self.venv.step_wait() 37 | self.ret = self.ret * self.gamma + rews 38 | self.raw_obs = obs 39 | self.raw_rews = rews 40 | # Do filtering (but not for step_mask = 0 values; step_mask is expected to be set on this wrapper externally) 41 | for proc, obs_proc in enumerate(obs): 42 | obs_proc = np.array([obs_proc]) 43 | if self.step_mask[proc] > 0: 44 | obs_proc = self._obfilt(obs_proc) 45 | obs[proc] = obs_proc[0] 46 | if self.ret_rms: 47 | # Only update ret_rms if in training mode 48 | if self.train: 49 | self.ret_rms.update(self.ret) 50 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) 51 | return obs, rews, news, infos 52 | 53 | def _obfilt(self, obs): 54 | if self.ob_rms: 55 | # Only update ob_rms if in training mode 56 | if self.train: 57 | # Use freeze mask to only update part of the ob_rms 58 | if self.freeze_mask is not None: 59 | old_obs_rms_mean = np.array(self.ob_rms.mean) 60 | old_obs_rms_var = np.array(self.ob_rms.var) 61 | self.ob_rms.update(obs) 62 | self.ob_rms.mean = old_obs_rms_mean * self.freeze_mask + self.ob_rms.mean * (1 - self.freeze_mask) 63 | self.ob_rms.var = old_obs_rms_var * self.freeze_mask + self.ob_rms.var * (1 - self.freeze_mask) 64 | else: 65 | self.ob_rms.update(obs) 66 | 67 | # Copy original obs 68 | obs_orig = np.copy(obs) 69 | 70 | # Use code from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 71 | if self.noclip: 72 | obs = (obs - self.ob_rms.mean) / (3*(np.sqrt(self.ob_rms.var) + 0.1)) 73 | else: 74 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 75 | 76 | # Use ignore_mask (if one was given) to restore parts of obs we want to leave alone 77 | if self.ignore_mask is not None: obs = (1 - self.ignore_mask) * obs + self.ignore_mask * obs_orig 78 | 79 | # Scale timestep 80 | if self.has_timestep: 81 | obs[:, -1] *= self.time_scale 82 | 83 | return obs 84 | else: 85 | return obs 86 | 87 | 88 | 89 | def reset(self): 90 | """ 91 | Reset all environments 92 | """ 93 | obs = self.venv.reset() 94 | self.raw_obs = obs 95 | return self._obfilt(obs) 96 | 97 | -------------------------------------------------------------------------------- /environments/RewardCyclicEnv.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import collections 3 | import pdb 4 | import gym 5 | import time 6 | import csv 7 | import json 8 | import shutil 9 | import numpy as np 10 | import random 11 | from . import ant_env 12 | from . import explorer_ant 13 | from . import geom_utils 14 | from gym.spaces.box import Box 15 | import math 16 | import sys 17 | sys.path.append('../') 18 | from utils import RollingAverage 19 | 20 | # Wrapper that defines our environment for our low level ant policy 21 | # Everything is egocentric to the ant 22 | class RewardCyclicEnv(gym.Wrapper): 23 | def __init__(self, env=None, opt=None): 24 | super(RewardCyclicEnv, self).__init__(env) 25 | 26 | # Should be in the correct mode (this wrapper only supports the cyclic mode) 27 | self.mode = opt['model']['mode'] 28 | assert(self.mode in ['cyclic']) 29 | 30 | # Make sure we're using the right environment (our ant for now) 31 | assert isinstance(env.unwrapped, ant_env.BaseAntEnv) 32 | 33 | # Keep memory 34 | # Buffers for external (ex) and proprioceptive (pro) states and for actions 35 | self.ex_states = [] 36 | self.pro_states = [] 37 | self.actions = [] 38 | 39 | # Phase period 40 | self.phase_k = opt['model']['phase_period'] 41 | 42 | # Params of reward 43 | self.min_movement = opt['env']['min_movement'] 44 | self.survive_reward = opt['env']['survive_reward'] 45 | 46 | # Step function 47 | # Does step, updates our stored values, and calculates the cyclic reward 48 | def step(self, action): 49 | # Do the original step and get the environment reward (will throw some of this out) 50 | obs, true_reward, done, info = self.env.step(action) 51 | 52 | # Get the new state and step 53 | new_state_pro, new_state_ex = self.unwrapped.get_intern_extern_state() 54 | 55 | # Update the states and actions in memory 56 | self.ex_states.append(np.array(new_state_ex)) 57 | self.pro_states.append(np.array(new_state_pro)) 58 | self.actions.append(np.array(action)) 59 | new_count = self._elapsed_steps 60 | assert(len(self.ex_states) == new_count + 1) 61 | 62 | # Determine if there was enough movement 63 | min_movement_mult = float(np.linalg.norm(new_state_pro, 2) > self.min_movement) 64 | 65 | # Get cyclic penalty 66 | if len(self.pro_states) > self.phase_k + 1: 67 | # Get last/current cycle state and actions (actions are kept for a possible action-cycle term, which is not used here) 68 | new_s = self.pro_states[-1] 69 | old_s = self.pro_states[-(self.phase_k+1)] 70 | new_a = self.actions[-1] 71 | old_a = self.actions[-(self.phase_k+1)] 72 | 73 | # Get cyclic reward for state 74 | state_diff = np.linalg.norm(new_s - old_s, 2) 75 | state_cycle_reward = -state_diff 76 | else: 77 | state_cycle_reward = 0 78 | 79 | # Update survive 80 | info['reward_survive'] = self.survive_reward 81 | info['reward_thresh'] = min_movement_mult 82 | info['reward_cycle'] = state_cycle_reward 83 | reward = info['reward_thresh'] * (info['reward_survive'] + info['reward_cycle']) 84 | info['reward_env'] = reward 85 | #info['reward_env'] = info['reward_forward'] + info['reward_ctrl'] + info['reward_contact'] + info['reward_survive'] 86 | 87 | # Return 88 | return obs, reward, done, info 89 | 90 | # Reset 91 | # Pass through and reset data 92 | def reset(self): 93 | obs = self.env.reset() 94 | 95 | # Reset our storage structures 96 | self.ex_states = [] 97 | self.pro_states = [] 98 | self.actions = [] 99 | 100 | # Update the states and actions in memory 101 | new_state_pro, new_state_ex = self.unwrapped.get_intern_extern_state() 102 | self.ex_states.append(np.array(new_state_ex)) 103 | 
self.pro_states.append(np.array(new_state_pro)) 104 | self.actions.append(np.zeros(self.action_space.shape)) 105 | assert(self._elapsed_steps == 0) 106 | 107 | return obs 108 | 109 | # Pass through _elapsed_steps 110 | @property 111 | def _elapsed_steps(self): 112 | return self.env._elapsed_steps 113 | 114 | -------------------------------------------------------------------------------- /options/hierarchical_final/hierarchical_many_phase.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: hierarchical_many # Mode is hierarchical many 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | env: 28 | gamma: 0.99 # Discount factor for rewards 29 | num_stack: 1 # Number of frames to stack 30 | add_timestep: False # Add timestep to observations 31 | known_reset: False # Reset to known position 32 | time_scale: 0.001 # What to multiply timestep by for AC input 33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 34 | maze: 35 | goal_radius: 2 # Distance to goal in order to reach it 36 | goal_reward: 5 # How much reward to give for getting to goal 37 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 38 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 39 | use_contact_cost: 0 # Whether to use contact cost in final reward 40 | use_survive_reward: 0 # Whether to use survive reward in final reward 41 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 42 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 43 | logs: 44 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 45 | exp_name: hierarchical_many_phase_final # Unique experiment name 46 | log_interval: 1 # Log interval, one log per n updates 47 | save_interval: 100 # Save interval, one per n updates 48 | vis_interval: 1 # Vis interval, one log per n updates 49 | optim_ppo: 50 | lr: 0.0003 # Learning rate 51 | eps: 0.00001 # RMSprop optimizer epsiolon 52 | alpha: 0.99 # RMSprop optimizer alpha 53 | max_grad_norm: 0.5 # Max norm of gradients 54 | num_frames: 20000000 # Number of frames to train 55 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both) 56 | num_ll_steps: 10 # How 
many low level steps to do in a row before update high level step 57 | optim_a2c: 58 | lr: 0.0007 # Learning rate 59 | eps: 0.00001 # RMSprop optimizer epsiolon 60 | alpha: 0.99 # RMSprop optimizer alpha 61 | max_grad_norm: 0.5 # Max norm of gradients 62 | num_frames: 20000000 # Number of frames to train 63 | lowlevel: 64 | optfile: options/phase_lowlevel/phase_mlp_pretrain_any.yaml # Opt file location of low level policy 65 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/ 66 | deterministic: False # Whether low level policies are deterministic 67 | num_load: 16 # Number of low level policies 68 | -------------------------------------------------------------------------------- /options/hierarchical_final/hierarchical_many_4M_phase.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: hierarchical_many # Mode is hierarchical many 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | env: 28 | gamma: 0.99 # Discount factor for rewards 29 | num_stack: 1 # Number of frames to stack 30 | add_timestep: False # Add timestep to observations 31 | known_reset: False # Reset to known position 32 | time_scale: 0.001 # What to multiply timestep by for AC input 33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 34 | maze: 35 | goal_radius: 2 # Distance to goal in order to reach it 36 | goal_reward: 5 # How much reward to give for getting to goal 37 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 38 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 39 | use_contact_cost: 0 # Whether to use contact cost in final reward 40 | use_survive_reward: 0 # Whether to use survive reward in final reward 41 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 42 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 43 | logs: 44 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 45 | exp_name: hierarchical_many_4M_phase_final # Unique experiment name 46 | log_interval: 1 # Log interval, one log per n updates 47 | save_interval: 100 # Save interval, one per n updates 48 | vis_interval: 1 # Vis interval, one log per n updates 49 | optim_ppo: 50 | lr: 
0.0003 # Learning rate 51 | eps: 0.00001 # RMSprop optimizer epsiolon 52 | alpha: 0.99 # RMSprop optimizer alpha 53 | max_grad_norm: 0.5 # Max norm of gradients 54 | num_frames: 4000000 # Number of frames to train 55 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both) 56 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step 57 | optim_a2c: 58 | lr: 0.0007 # Learning rate 59 | eps: 0.00001 # RMSprop optimizer epsiolon 60 | alpha: 0.99 # RMSprop optimizer alpha 61 | max_grad_norm: 0.5 # Max norm of gradients 62 | num_frames: 4000000 # Number of frames to train 63 | lowlevel: 64 | optfile: options/phase_lowlevel/phase_mlp_pretrain_any.yaml # Opt file location of low level policy 65 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/ 66 | deterministic: False # Whether low level policies are deterministic 67 | num_load: 16 # Number of low level policies 68 | -------------------------------------------------------------------------------- /options/hierarchical_final/hierarchical_many_phase_antlowgear.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: hierarchical_many # Mode is hierarchical many 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | env: 28 | gamma: 0.99 # Discount factor for rewards 29 | num_stack: 1 # Number of frames to stack 30 | add_timestep: False # Add timestep to observations 31 | known_reset: False # Reset to known position 32 | time_scale: 0.001 # What to multiply timestep by for AC input 33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 34 | maze: 35 | goal_radius: 2 # Distance to goal in order to reach it 36 | goal_reward: 5 # How much reward to give for getting to goal 37 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 38 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 39 | use_contact_cost: 0 # Whether to use contact cost in final reward 40 | use_survive_reward: 0 # Whether to use survive reward in final reward 41 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 42 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 43 | logs: 44 | log_base: 
/checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 45 | exp_name: hierarchical_many_phase_antlowgear_final # Unique experiment name 46 | log_interval: 1 # Log interval, one log per n updates 47 | save_interval: 100 # Save interval, one per n updates 48 | vis_interval: 1 # Vis interval, one log per n updates 49 | optim_ppo: 50 | lr: 0.0003 # Learning rate 51 | eps: 0.00001 # RMSprop optimizer epsiolon 52 | alpha: 0.99 # RMSprop optimizer alpha 53 | max_grad_norm: 0.5 # Max norm of gradients 54 | num_frames: 20000000 # Number of frames to train 55 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both) 56 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step 57 | optim_a2c: 58 | lr: 0.0007 # Learning rate 59 | eps: 0.00001 # RMSprop optimizer epsiolon 60 | alpha: 0.99 # RMSprop optimizer alpha 61 | max_grad_norm: 0.5 # Max norm of gradients 62 | num_frames: 20000000 # Number of frames to train 63 | lowlevel: 64 | optfile: options/phase_lowlevel/phase_mlp_pretrain_any.yaml # Opt file location of low level policy 65 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAntLowGear-v2/ 66 | deterministic: False # Whether low level policies are deterministic 67 | num_load: 16 # Number of low level policies 68 | -------------------------------------------------------------------------------- /options/hierarchical_final/hierarchical_many_phase_a2c.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | norm_ret: False 22 | model: 23 | recurrent_policy: False # Use a recurrent policy 24 | mode: hierarchical_many # Mode is hierarchical many 25 | hid_sz: 32 # MLP hidden size 26 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module) 27 | num_layer: 2 # Number of layers in MLP (minus input layer) 28 | env: 29 | gamma: 0.99 # Discount factor for rewards 30 | num_stack: 1 # Number of frames to stack 31 | add_timestep: False # Add timestep to observations 32 | known_reset: False # Reset to known position 33 | time_scale: 0.001 # What to multiply timestep by for AC input 34 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 35 | maze: 36 | goal_radius: 2 # Distance to goal in order to reach it 37 | goal_reward: 5 # How much reward to give for getting to goal 38 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 39 | use_ctrl_cost: 0 # 
Whether to use ctrl cost in final reward 40 | use_contact_cost: 0 # Whether to use contact cost in final reward 41 | use_survive_reward: 0 # Whether to use survive reward in final reward 42 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 43 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 44 | logs: 45 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 46 | exp_name: hierarchical_many_phase_a2c_final # Unique experiment name 47 | log_interval: 1 # Log interval, one log per n updates 48 | save_interval: 100 # Save interval, one per n updates 49 | vis_interval: 1 # Vis interval, one log per n updates 50 | optim_ppo: 51 | lr: 0.0003 # Learning rate 52 | eps: 0.00001 # RMSprop optimizer epsiolon 53 | alpha: 0.99 # RMSprop optimizer alpha 54 | max_grad_norm: 0.5 # Max norm of gradients 55 | num_frames: 20000000 # Number of frames to train 56 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both) 57 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step 58 | optim_a2c: 59 | lr: 0.0007 # Learning rate 60 | eps: 0.00001 # RMSprop optimizer epsiolon 61 | alpha: 0.99 # RMSprop optimizer alpha 62 | max_grad_norm: 0.5 # Max norm of gradients 63 | num_frames: 20000000 # Number of frames to train 64 | hierarchical_mode: train_highlevel 65 | num_ll_steps: 10 66 | lowlevel: 67 | optfile: options/phase_lowlevel/phase_mlp_pretrain_any.yaml # Opt file location of low level policy 68 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/ 69 | deterministic: False # Whether low level policies are deterministic 70 | num_load: 16 # Number of low level policies 71 | -------------------------------------------------------------------------------- /options/hierarchical_final/hierarchical_many_phase_16ll_proprioceptivehumanoid.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: hierarchical_many # Mode is hierarchical many 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | env: 28 | gamma: 0.99 # Discount factor for rewards 29 | num_stack: 1 # Number of frames to stack 30 | add_timestep: False # Add timestep to observations 31 | known_reset: False # Reset to known position 32 | time_scale: 0.001 # 
What to multiply timestep by for AC input 33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 34 | maze: 35 | goal_radius: 2 # Distance to goal in order to reach it 36 | goal_reward: 5 # How much reward to give for getting to goal 37 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 38 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 39 | use_contact_cost: 0 # Whether to use contact cost in final reward 40 | use_survive_reward: 0 # Whether to use survive reward in final reward 41 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 42 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 43 | logs: 44 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 45 | exp_name: hierarchical_many_phase_16ll_proprioceptivehumanoid_final_stopat175 # Unique experiment name 46 | log_interval: 1 # Log interval, one log per n updates 47 | save_interval: 100 # Save interval, one per n updates 48 | vis_interval: 1 # Vis interval, one log per n updates 49 | optim_ppo: 50 | lr: 0.0003 # Learning rate 51 | eps: 0.00001 # RMSprop optimizer epsiolon 52 | alpha: 0.99 # RMSprop optimizer alpha 53 | max_grad_norm: 0.5 # Max norm of gradients 54 | num_frames: 20000000 # Number of frames to train 55 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both) 56 | num_ll_steps: 100 # How many low level steps to do in a row before update high level step 57 | optim_a2c: 58 | lr: 0.0007 # Learning rate 59 | eps: 0.00001 # RMSprop optimizer epsiolon 60 | alpha: 0.99 # RMSprop optimizer alpha 61 | max_grad_norm: 0.5 # Max norm of gradients 62 | num_frames: 20000000 # Number of frames to train 63 | lowlevel: 64 | optfile: options/phase_lowlevel/phase_mlp_skip_hs16_pretrain_any_20M.yaml # Opt file location of low level policy 65 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_skip_hs16_pretrain_any_20M/ppo/ExplorerProprioceptiveHumanoid-v2/ 66 | deterministic: False # Whether low level policies are deterministic 67 | num_load: 16 # Number of low level policies 68 | -------------------------------------------------------------------------------- /options/hierarchical_final/hierarchical_many_baseline.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | norm_ret: False 22 | model: 23 | recurrent_policy: False # Use a recurrent policy 24 | mode: 
hierarchical_many # Mode is hierarchical many 25 | hid_sz: 32 # MLP hidden size 26 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module) 27 | num_layer: 2 # Number of layers in MLP (minus input layer) 28 | env: 29 | gamma: 0.99 # Discount factor for rewards 30 | num_stack: 1 # Number of frames to stack 31 | add_timestep: False # Add timestep to observations 32 | known_reset: False # Reset to known position 33 | time_scale: 0.001 # What to multiply timestep by for AC input 34 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 35 | maze: 36 | goal_radius: 2 # Distance to goal in order to reach it 37 | goal_reward: 5 # How much reward to give for getting to goal 38 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 39 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 40 | use_contact_cost: 0 # Whether to use contact cost in final reward 41 | use_survive_reward: 0 # Whether to use survive reward in final reward 42 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 43 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 44 | logs: 45 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 46 | exp_name: hierarchical_many_baseline_final # Unique experiment name 47 | log_interval: 1 # Log interval, one log per n updates 48 | save_interval: 100 # Save interval, one per n updates 49 | vis_interval: 1 # Vis interval, one log per n updates 50 | optim_ppo: 51 | lr: 0.0003 # Learning rate 52 | eps: 0.00001 # RMSprop optimizer epsiolon 53 | alpha: 0.99 # RMSprop optimizer alpha 54 | max_grad_norm: 0.5 # Max norm of gradients 55 | num_frames: 20000000 # Number of frames to train 56 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both) 57 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step 58 | optim_a2c: 59 | lr: 0.0007 # Learning rate 60 | eps: 0.00001 # RMSprop optimizer epsiolon 61 | alpha: 0.99 # RMSprop optimizer alpha 62 | max_grad_norm: 0.5 # Max norm of gradients 63 | num_frames: 20000000 # Number of frames to train 64 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both) 65 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step 66 | lowlevel: 67 | optfile: options/baseline_lowlevel/baseline_simplemlp_pretrain_any.yaml # Opt file location of low level policy 68 | ckpt: /checkpoint/kdmarino/phasefunlogs/baseline_lowlevel/baseline_simplemlp_pretrain_any/ppo/ExplorerAnt-v2/ 69 | deterministic: False # Whether low level policies are deterministic 70 | num_load: 16 # Number of low level policies 71 | -------------------------------------------------------------------------------- /options/hierarchical_final/hierarchical_many_4M_baseline.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip 
parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | norm_ret: False 22 | model: 23 | recurrent_policy: False # Use a recurrent policy 24 | mode: hierarchical_many # Mode is hierarchical many 25 | hid_sz: 32 # MLP hidden size 26 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module) 27 | num_layer: 2 # Number of layers in MLP (minus input layer) 28 | env: 29 | gamma: 0.99 # Discount factor for rewards 30 | num_stack: 1 # Number of frames to stack 31 | add_timestep: False # Add timestep to observations 32 | known_reset: False # Reset to known position 33 | time_scale: 0.001 # What to multiply timestep by for AC input 34 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 35 | maze: 36 | goal_radius: 2 # Distance to goal in order to reach it 37 | goal_reward: 5 # How much reward to give for getting to goal 38 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 39 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 40 | use_contact_cost: 0 # Whether to use contact cost in final reward 41 | use_survive_reward: 0 # Whether to use survive reward in final reward 42 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 43 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 44 | logs: 45 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 46 | exp_name: hierarchical_many_4M_baseline_final # Unique experiment name 47 | log_interval: 1 # Log interval, one log per n updates 48 | save_interval: 100 # Save interval, one per n updates 49 | vis_interval: 1 # Vis interval, one log per n updates 50 | optim_ppo: 51 | lr: 0.0003 # Learning rate 52 | eps: 0.00001 # RMSprop optimizer epsiolon 53 | alpha: 0.99 # RMSprop optimizer alpha 54 | max_grad_norm: 0.5 # Max norm of gradients 55 | num_frames: 4000000 # Number of frames to train 56 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both) 57 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step 58 | optim_a2c: 59 | lr: 0.0007 # Learning rate 60 | eps: 0.00001 # RMSprop optimizer epsiolon 61 | alpha: 0.99 # RMSprop optimizer alpha 62 | max_grad_norm: 0.5 # Max norm of gradients 63 | num_frames: 4000000 # Number of frames to train 64 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both) 65 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step 66 | lowlevel: 67 | optfile: options/baseline_lowlevel/baseline_simplemlp_pretrain_any.yaml # Opt file location of low level policy 68 | ckpt: /checkpoint/kdmarino/phasefunlogs/baseline_lowlevel/baseline_simplemlp_pretrain_any/ppo/ExplorerAnt-v2/ 69 | deterministic: False # Whether low level policies are deterministic 70 | num_load: 16 # Number of low level policies 71 | 
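The hierarchical_final configs above all share the same top-level schema (alg_ppo / alg_a2c, model, env, maze, logs, optim_ppo / optim_a2c, lowlevel). The repository's own option-parsing code is not part of this listing, so the snippet below is only a hypothetical sketch of how such a file could be read with PyYAML: the function name load_options and the example path/keys printed at the end are illustrative, while the key names themselves are taken from the YAML shown above.

import yaml

def load_options(path, alg='ppo'):
    # Load one option file and pull out the sub-dicts for the chosen algorithm.
    with open(path) as f:
        # Top-level keys visible above: alg_ppo, alg_a2c, model, env, maze, logs, optim_ppo, optim_a2c, lowlevel
        opt = yaml.safe_load(f)
    alg_opt = opt['alg_' + alg]      # e.g. num_steps, ppo_epoch, clip_param for PPO
    optim_opt = opt['optim_' + alg]  # e.g. lr, eps, max_grad_norm, num_frames (and num_ll_steps for the hierarchical configs)
    return opt, alg_opt, optim_opt

# Example: high-level training settings from hierarchical_many_phase.yaml
opt, alg_opt, optim_opt = load_options('options/hierarchical_final/hierarchical_many_phase.yaml')
print(opt['logs']['exp_name'], opt['lowlevel']['num_load'], optim_opt['num_ll_steps'])

Per the comments in these files, the lowlevel section points at num_load pretrained low-level policies under ckpt, and num_ll_steps low-level steps are taken per high-level decision when hierarchical_mode is train_highlevel.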
-------------------------------------------------------------------------------- /options/hierarchical_final/hierarchical_many_baseline_antlowgear.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | norm_ret: False 22 | model: 23 | recurrent_policy: False # Use a recurrent policy 24 | mode: hierarchical_many # Mode is hierarchical many 25 | hid_sz: 32 # MLP hidden size 26 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module) 27 | num_layer: 2 # Number of layers in MLP (minus input layer) 28 | env: 29 | gamma: 0.99 # Discount factor for rewards 30 | num_stack: 1 # Number of frames to stack 31 | add_timestep: False # Add timestep to observations 32 | known_reset: False # Reset to known position 33 | time_scale: 0.001 # What to multiply timestep by for AC input 34 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 35 | maze: 36 | goal_radius: 2 # Distance to goal in order to reach it 37 | goal_reward: 5 # How much reward to give for getting to goal 38 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 39 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 40 | use_contact_cost: 0 # Whether to use contact cost in final reward 41 | use_survive_reward: 0 # Whether to use survive reward in final reward 42 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 43 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 44 | logs: 45 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 46 | exp_name: hierarchical_many_baseline_antlowgear_final # Unique experiment name 47 | log_interval: 1 # Log interval, one log per n updates 48 | save_interval: 100 # Save interval, one per n updates 49 | vis_interval: 1 # Vis interval, one log per n updates 50 | optim_ppo: 51 | lr: 0.0003 # Learning rate 52 | eps: 0.00001 # RMSprop optimizer epsiolon 53 | alpha: 0.99 # RMSprop optimizer alpha 54 | max_grad_norm: 0.5 # Max norm of gradients 55 | num_frames: 20000000 # Number of frames to train 56 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both) 57 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step 58 | optim_a2c: 59 | lr: 0.0007 # Learning rate 60 | eps: 0.00001 # RMSprop optimizer epsiolon 61 | alpha: 0.99 # RMSprop optimizer alpha 62 | max_grad_norm: 0.5 # Max norm of gradients 63 | 
num_frames: 20000000 # Number of frames to train 64 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both) 65 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step 66 | lowlevel: 67 | optfile: options/baseline_lowlevel/baseline_simplemlp_pretrain_any.yaml # Opt file location of low level policy 68 | ckpt: /checkpoint/kdmarino/phasefunlogs/baseline_lowlevel/baseline_simplemlp_pretrain_any/ppo/ExplorerAntLowGear-v2/ 69 | deterministic: False # Whether low level policies are deterministic 70 | num_load: 16 # Number of low level policies 71 | -------------------------------------------------------------------------------- /options/maze_baseline/maze_baseline.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: maze_baseline # Mode is lowlevel phase 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: False 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: False # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | maze: 51 | goal_radius: 2 # Distance to goal in order to reach it 52 | goal_reward: 5 # How much reward to give for getting to goal 53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 54 | 
use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 55 | use_contact_cost: 0 # Whether to use contact cost in final reward 56 | use_survive_reward: 0 # Whether to use survive reward in final reward 57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 | eps: 0.00001 # RMSprop optimizer epsiolon 74 | alpha: 0.99 # RMSprop optimizer alpha 75 | max_grad_norm: 0.5 # Max norm of gradients 76 | num_frames: 20000000 # Number of frames to train 77 | -------------------------------------------------------------------------------- /options/maze_baseline/maze_baseline_wmove_r1000.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: maze_baseline # Mode is lowlevel phase 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: False 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: False # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference 
global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | maze: 51 | goal_radius: 2 # Distance to goal in order to reach it 52 | goal_reward: 1000 # How much reward to give for getting to goal 53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction) 54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 55 | use_contact_cost: 0 # Whether to use contact cost in final reward 56 | use_survive_reward: 0 # Whether to use survive reward in final reward 57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline_wmove_r1000 # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 | eps: 0.00001 # RMSprop optimizer epsiolon 74 | alpha: 0.99 # RMSprop optimizer alpha 75 | max_grad_norm: 0.5 # Max norm of gradients 76 | num_frames: 20000000 # Number of frames to train 77 | -------------------------------------------------------------------------------- /options/maze_baseline_wphase/maze_baseline_phase.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: maze_baseline_wphase # Mode is lowlevel phase 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: False 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: True # Add timestep to 
observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | maze: 51 | goal_radius: 2 # Distance to goal in order to reach it 52 | goal_reward: 5 # How much reward to give for getting to goal 53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 55 | use_contact_cost: 0 # Whether to use contact cost in final reward 56 | use_survive_reward: 0 # Whether to use survive reward in final reward 57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline_phase # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 | eps: 0.00001 # RMSprop optimizer epsiolon 74 | alpha: 0.99 # RMSprop optimizer alpha 75 | max_grad_norm: 0.5 # Max norm of gradients 76 | num_frames: 20000000 # Number of frames to train 77 | -------------------------------------------------------------------------------- /options/maze_baseline_wphase/maze_baseline_phase_humanoid.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 
| log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: maze_baseline_wphase # Mode is lowlevel phase 24 | hid_sz: 16 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: True 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: True # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | maze: 51 | goal_radius: 2 # Distance to goal in order to reach it 52 | goal_reward: 5 # How much reward to give for getting to goal 53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 55 | use_contact_cost: 0 # Whether to use contact cost in final reward 56 | use_survive_reward: 0 # Whether to use survive reward in final reward 57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline_phase_humanoid # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 | eps: 0.00001 # RMSprop optimizer epsiolon 74 | alpha: 0.99 # RMSprop optimizer alpha 75 | max_grad_norm: 0.5 # Max norm of gradients 76 | num_frames: 20000000 # Number of frames to train 77 | -------------------------------------------------------------------------------- /options/maze_baseline_wphase/maze_baseline_phase_wmove_r1000.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | 
num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: maze_baseline_wphase # Mode is lowlevel phase 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: False 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: True # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | maze: 51 | goal_radius: 2 # Distance to goal in order to reach it 52 | goal_reward: 1000 # How much reward to give for getting to goal 53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction) 54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 55 | use_contact_cost: 0 # Whether to use contact cost in final reward 56 | use_survive_reward: 0 # Whether to use survive reward in final reward 57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline_phase_wmove_r1000 # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 | eps: 0.00001 # RMSprop optimizer 
epsiolon 74 | alpha: 0.99 # RMSprop optimizer alpha 75 | max_grad_norm: 0.5 # Max norm of gradients 76 | num_frames: 20000000 # Number of frames to train 77 | -------------------------------------------------------------------------------- /options/maze_baseline_wphase/maze_baseline_phase_wmove_r1000_humanoid.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: maze_baseline_wphase # Mode is lowlevel phase 24 | hid_sz: 16 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: True 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: True # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | maze: 51 | goal_radius: 2 # Distance to goal in order to reach it 52 | goal_reward: 1000 # How much reward to give for getting to goal 53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction) 54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 55 | use_contact_cost: 0 # Whether to use contact cost in final reward 56 | use_survive_reward: 0 # Whether to use survive reward in final reward 57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: 
/checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline_phase_wmove_r1000_humanoid # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 | eps: 0.00001 # RMSprop optimizer epsiolon 74 | alpha: 0.99 # RMSprop optimizer alpha 75 | max_grad_norm: 0.5 # Max norm of gradients 76 | num_frames: 20000000 # Number of frames to train 77 | -------------------------------------------------------------------------------- /options/maze_baseline/maze_baseline_finetune.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: maze_baseline # Mode is lowlevel phase 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: False 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: False # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | maze: 51 
| goal_radius: 2 # Distance to goal in order to reach it 52 | goal_reward: 5 # How much reward to give for getting to goal 53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 55 | use_contact_cost: 0 # Whether to use contact cost in final reward 56 | use_survive_reward: 0 # Whether to use survive reward in final reward 57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline_finetune # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 | eps: 0.00001 # RMSprop optimizer epsiolon 74 | alpha: 0.99 # RMSprop optimizer alpha 75 | max_grad_norm: 0.5 # Max norm of gradients 76 | num_frames: 20000000 # Number of frames to train 77 | lowlevel: 78 | ckpt: /checkpoint/kdmarino/phasefunlogs/baseline_lowlevel/baseline_simplemlp_pretrain_any/ppo/ExplorerAnt-v2/ 79 | -------------------------------------------------------------------------------- /options/maze_baseline_wphase/maze_baseline_phase_finetune.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: maze_baseline_wphase # Mode is lowlevel phase 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: False 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: True # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to 
multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | maze: 51 | goal_radius: 2 # Distance to goal in order to reach it 52 | goal_reward: 5 # How much reward to give for getting to goal 53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 55 | use_contact_cost: 0 # Whether to use contact cost in final reward 56 | use_survive_reward: 0 # Whether to use survive reward in final reward 57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline_phase_finetune # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 | eps: 0.00001 # RMSprop optimizer epsiolon 74 | alpha: 0.99 # RMSprop optimizer alpha 75 | max_grad_norm: 0.5 # Max norm of gradients 76 | num_frames: 20000000 # Number of frames to train 77 | lowlevel: 78 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/ 79 | -------------------------------------------------------------------------------- /options/maze_baseline/maze_baseline_wmove_r1000_finetune.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent 
policy 23 | mode: maze_baseline # Mode is lowlevel phase 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: False 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: False # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | maze: 51 | goal_radius: 2 # Distance to goal in order to reach it 52 | goal_reward: 1000 # How much reward to give for getting to goal 53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction) 54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 55 | use_contact_cost: 0 # Whether to use contact cost in final reward 56 | use_survive_reward: 0 # Whether to use survive reward in final reward 57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline_wmove_r1000_finetune # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 | eps: 0.00001 # RMSprop optimizer epsiolon 74 | alpha: 0.99 # RMSprop optimizer alpha 75 | max_grad_norm: 0.5 # Max norm of gradients 76 | num_frames: 20000000 # Number of frames to train 77 | lowlevel: 78 | ckpt: /checkpoint/kdmarino/phasefunlogs/baseline_lowlevel/baseline_simplemlp_pretrain_any/ppo/ExplorerAnt-v2/ 79 | -------------------------------------------------------------------------------- /options/maze_baseline_wphase/maze_baseline_phase_wmove_r1000_finetune.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # 
Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: maze_baseline_wphase # Mode is lowlevel phase 24 | hid_sz: 32 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: False 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: True # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | maze: 51 | goal_radius: 2 # Distance to goal in order to reach it 52 | goal_reward: 1000 # How much reward to give for getting to goal 53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction) 54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 55 | use_contact_cost: 0 # Whether to use contact cost in final reward 56 | use_survive_reward: 0 # Whether to use survive reward in final reward 57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline_phase_wmove_r1000_finetune # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 
| eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 | lowlevel:
78 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/
79 |
--------------------------------------------------------------------------------
/options/maze_baseline_wphase/maze_baseline_phase_finetune_proprioceptivehumanoid.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline_wphase # Mode is lowlevel phase
24 | hid_sz: 16 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: True
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: True # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 5 # How much reward to give for getting to goal
53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals 
(and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline_phase_finetune_proprioceptivehumanoid # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 | eps: 0.00001 # RMSprop optimizer epsiolon 74 | alpha: 0.99 # RMSprop optimizer alpha 75 | max_grad_norm: 0.5 # Max norm of gradients 76 | num_frames: 20000000 # Number of frames to train 77 | lowlevel: 78 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_skip_hs16_pretrain_any_20M/ppo/ExplorerProprioceptiveHumanoid-v2/ 79 | -------------------------------------------------------------------------------- /options/maze_baseline_wphase/maze_baseline_phase_wmove_r1000_finetune_proprioceptivehumanoid.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | model: 22 | recurrent_policy: False # Use a recurrent policy 23 | mode: maze_baseline_wphase # Mode is lowlevel phase 24 | hid_sz: 16 # MLP hidden size 25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module) 26 | num_layer: 2 # Number of layers in MLP (minus input layer) 27 | phase_period: 10 28 | phase_hid_sz: 16 29 | skip_layer: True 30 | use_timestep: True 31 | time_scale: 0.001 32 | env: 33 | gamma: 0.99 # Discount factor for rewards 34 | num_stack: 1 # Number of frames to stack 35 | add_timestep: True # Add timestep to observations 36 | known_reset: False # Reset to known position 37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 38 | time_scale: 0.001 # What to multiply timestep by for AC input 39 | theta_space_mode: pretrain_any # What theta mode we're in 40 | theta_reset_mode: never # When to change theta 41 | theta_reward_mode: lax # How to punish perpendicular movement 42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector) 43 | theta_memory_lookback: 10 # How far to look back for reference global theta 44 | time_limit: 1000 # 
When to end an episode 45 | reward_shape_type: instant 46 | state_cycle_weight: 0.05 # How to weight state cycle differences 47 | action_cycle_weight: 0.01 # How to weight action cycle differences 48 | phase_period: 10 # How long the phase cycle is 49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up) 50 | maze: 51 | goal_radius: 2 # Distance to goal in order to reach it 52 | goal_reward: 1000 # How much reward to give for getting to goal 53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction) 54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward 55 | use_contact_cost: 0 # Whether to use contact cost in final reward 56 | use_survive_reward: 0 # Whether to use survive reward in final reward 57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward) 58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal 59 | logs: 60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files) 61 | exp_name: maze_baseline_phase_wmove_r1000_finetune_proprioceptivehumanoid # Unique experiment name 62 | log_interval: 1 # Log interval, one log per n updates 63 | save_interval: 100000000000 # Save interval, one per n updates 64 | vis_interval: 1 # Vis interval, one log per n updates 65 | optim_ppo: 66 | lr: 0.0003 # Learning rate 67 | eps: 0.00001 # RMSprop optimizer epsiolon 68 | alpha: 0.99 # RMSprop optimizer alpha 69 | max_grad_norm: 0.5 # Max norm of gradients 70 | num_frames: 20000000 # Number of frames to train 71 | optim_a2c: 72 | lr: 0.0007 # Learning rate 73 | eps: 0.00001 # RMSprop optimizer epsiolon 74 | alpha: 0.99 # RMSprop optimizer alpha 75 | max_grad_norm: 0.5 # Max norm of gradients 76 | num_frames: 20000000 # Number of frames to train 77 | lowlevel: 78 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_skip_hs16_pretrain_any_20M/ppo/ExplorerProprioceptiveHumanoid-v2/ 79 | -------------------------------------------------------------------------------- /environments/assets/ant_custom_gear.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 82 | -------------------------------------------------------------------------------- /environments/assets/my_ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 82 | -------------------------------------------------------------------------------- /environments/mujoco_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from gym import error, spaces 4 | from gym.utils import seeding 5 | import numpy as np 6 | from os import path 7 | import gym 8 | import six 9 | 10 | try: 11 | import mujoco_py 12 | except ImportError as e: 13 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e)) 14 | 15 | DEFAULT_SIZE = 500 16 | 17 | class MujocoEnv(gym.Env): 18 | """Superclass for all MuJoCo environments. 
19 | """ 20 | 21 | def __init__(self, model_path, frame_skip): 22 | if model_path.startswith("/"): 23 | fullpath = model_path 24 | else: 25 | fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path) 26 | if not path.exists(fullpath): 27 | raise IOError("File %s does not exist" % fullpath) 28 | self.frame_skip = frame_skip 29 | self.model = mujoco_py.load_model_from_path(fullpath) 30 | self.sim = mujoco_py.MjSim(self.model) 31 | self.data = self.sim.data 32 | self.viewer = None 33 | self._viewers = {} 34 | 35 | self.metadata = { 36 | 'render.modes': ['human', 'rgb_array'], 37 | 'video.frames_per_second': int(np.round(1.0 / self.dt)) 38 | } 39 | 40 | self.init_qpos = self.sim.data.qpos.ravel().copy() 41 | self.init_qvel = self.sim.data.qvel.ravel().copy() 42 | observation, _reward, done, _info = self.step(np.zeros(self.model.nu)) 43 | assert not done 44 | self.obs_dim = observation.size 45 | 46 | bounds = self.model.actuator_ctrlrange.copy() 47 | low = bounds[:, 0] 48 | high = bounds[:, 1] 49 | self.action_space = spaces.Box(low=low, high=high) 50 | 51 | high = np.inf*np.ones(self.obs_dim) 52 | low = -high 53 | self.observation_space = spaces.Box(low, high) 54 | 55 | self.seed() 56 | 57 | def seed(self, seed=None): 58 | self.np_random, seed = seeding.np_random(seed) 59 | return [seed] 60 | 61 | # methods to override: 62 | # ---------------------------- 63 | 64 | def reset_model(self): 65 | """ 66 | Reset the robot degrees of freedom (qpos and qvel). 67 | Implement this in each subclass. 68 | """ 69 | raise NotImplementedError 70 | 71 | def viewer_setup(self): 72 | """ 73 | This method is called when the viewer is initialized and after every reset 74 | Optionally implement this method, if you need to tinker with camera position 75 | and so forth. 
76 | """ 77 | pass 78 | 79 | # ----------------------------- 80 | 81 | def reset(self): 82 | self.sim.reset() 83 | ob = self.reset_model() 84 | old_viewer = self.viewer 85 | for v in self._viewers.values(): 86 | self.viewer = v 87 | self.viewer_setup() 88 | self.viewer = old_viewer 89 | return ob 90 | 91 | def set_state(self, qpos, qvel): 92 | assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,) 93 | old_state = self.sim.get_state() 94 | new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel, 95 | old_state.act, old_state.udd_state) 96 | self.sim.set_state(new_state) 97 | self.sim.forward() 98 | 99 | @property 100 | def dt(self): 101 | return self.model.opt.timestep * self.frame_skip 102 | 103 | def do_simulation(self, ctrl, n_frames): 104 | self.sim.data.ctrl[:] = ctrl 105 | for _ in range(n_frames): 106 | self.sim.step() 107 | 108 | def render(self, mode='human', width=DEFAULT_SIZE, height=DEFAULT_SIZE): 109 | if mode == 'rgb_array': 110 | self._get_viewer(mode).render(width, height) 111 | # window size used for old mujoco-py: 112 | data = self._get_viewer(mode).read_pixels(width, height, depth=False) 113 | # original image is upside-down, so flip it 114 | return data[::-1, :, :] 115 | elif mode == 'human': 116 | self._get_viewer(mode).render() 117 | 118 | def close(self): 119 | if self.viewer is not None: 120 | # self.viewer.finish() 121 | self.viewer = None 122 | self._viewers = {} 123 | 124 | def _get_viewer(self, mode): 125 | self.viewer = self._viewers.get(mode) 126 | if self.viewer is None: 127 | if mode == 'human': 128 | self.viewer = mujoco_py.MjViewer(self.sim) 129 | elif mode == 'rgb_array': 130 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, 0) 131 | self.viewer_setup() 132 | self._viewers[mode] = self.viewer 133 | return self.viewer 134 | 135 | def get_body_com(self, body_name): 136 | return self.data.get_body_xpos(body_name) 137 | 138 | def state_vector(self): 139 | return np.concatenate([ 140 | self.sim.data.qpos.flat, 141 | self.sim.data.qvel.flat 142 | ]) 143 | -------------------------------------------------------------------------------- /environments/simple_humanoid_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy as np 3 | from gym import utils 4 | import pdb 5 | import math 6 | from . import mujoco_env 7 | from . 
import geom_utils 8 | 9 | def mass_center(model, sim): 10 | mass = np.expand_dims(model.body_mass, 1) 11 | xpos = sim.data.xipos 12 | return (np.sum(mass * xpos, 0) / np.sum(mass)) 13 | 14 | class BaseSimpleHumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): 15 | # Initialize Mujoco environment 16 | def __init__(self, xml_file='simple_humanoid.xml'): 17 | mujoco_env.MujocoEnv.__init__(self, xml_file, 1) 18 | utils.EzPickle.__init__(self) 19 | 20 | # Forward step 21 | def step(self, action): 22 | pos_before = mass_center(self.model, self.sim) 23 | self.do_simulation(action, self.frame_skip) 24 | pos_after = mass_center(self.model, self.sim) 25 | alive_bonus = 0.2 26 | data = self.sim.data 27 | lin_vel_cost = 0.25 * (pos_after - pos_before) / self.model.opt.timestep 28 | lb = -100 29 | ub = 100 30 | scaling = (ub - lb) * 0.5 31 | quad_ctrl_cost = .5 * 1e-3 * np.sum( 32 | np.square(action / scaling)) 33 | quad_impact_cost = .5 * 1e-5 * np.sum( 34 | np.square(np.clip(data.cfrc_ext, -1, 1))) 35 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 36 | qpos = self.sim.data.qpos 37 | done = bool((qpos[2] < 0.8) or (qpos[2] > 2.0)) 38 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 39 | 40 | # Get states by name 41 | def get_state_by_name(self, name, s=None): 42 | # Get state (if not passed in) 43 | if s is None: 44 | s = self.state_vector() 45 | 46 | # Switch on name 47 | if name == 'xyz': 48 | val = s[0:3] 49 | elif name == 'x': 50 | val = s[0] 51 | elif name == 'y': 52 | val = s[1] 53 | elif name == 'z': 54 | val = s[2] 55 | elif name == 'quart': 56 | val = s[3:7] 57 | elif name in ['rpy', 'roll', 'pitch', 'yaw']: 58 | quart = s[3:7] 59 | roll, pitch, yaw = geom_utils.quaternion_to_euler_angle(quart) 60 | if name == 'roll': 61 | val = roll 62 | elif name == 'pitch': 63 | val = pitch 64 | elif name == 'yaw': 65 | val = yaw 66 | elif name == 'rpy': 67 | return np.array([roll, pitch, yaw]) 68 | elif name == 'joint_angles': 69 | val = s[7:17] 70 | elif name == 'xyz_vel': 71 | val = s[17:20] 72 | elif name == 'x_vel': 73 | val = s[17] 74 | elif name == 'y_vel': 75 | val = s[18] 76 | elif name == 'z_vel': 77 | val = s[19] 78 | elif name == 'rpy_vel': 79 | val = s[20:23] 80 | elif name == 'roll_vel': 81 | val = s[20] 82 | elif name == 'pitch_vel': 83 | val = s[21] 84 | elif name == 'yaw_vel': 85 | val = s[22] 86 | elif name == 'joint_angle_vel': 87 | val = s[23:] 88 | return val 89 | 90 | # We remove the first 5 values from state which should correspond to global orientation and position 91 | # https://github.com/openai/gym/wiki/Humanoid-V1 92 | def get_intern_extern_state(self): 93 | # Extract different states 94 | s = self.state_vector() 95 | 96 | xyz = self.get_state_by_name('xyz', s) 97 | rpy = self.get_state_by_name('rpy', s) 98 | joint_angles = self.get_state_by_name('joint_angles', s) 99 | d_xyz = self.get_state_by_name('xyz_vel', s) 100 | d_rpy = self.get_state_by_name('rpy_vel', s) 101 | d_joint = self.get_state_by_name('joint_angle_vel', s) 102 | 103 | # Seperate out yaw 104 | roll = rpy[0] 105 | pitch = rpy[1] 106 | yaw = rpy[2] 107 | d_roll = d_rpy[0] 108 | d_pitch = d_rpy[1] 109 | d_yaw = d_rpy[2] 110 | 111 | # Set internal/external states 112 | s_internal = np.concatenate([[roll, pitch], joint_angles, [d_roll, d_pitch]]) 113 | s_external = np.concatenate([xyz, [yaw], d_xyz, [d_yaw]]) 114 | #s_internal = np.concatenate([s, np.clip(self.sim.data.cfrc_ext, 
-1, 1).flat, self.get_body_com("torso").flat] ) 115 | 116 | #assert(s_internal.shape[0] == 20) 117 | assert(s_external.shape[0] == 8) 118 | 119 | return s_internal, s_external 120 | 121 | def _get_obs(self): 122 | raise NotImplementedError 123 | 124 | def get_body_com(self, body_name): 125 | idx = self.model.body_names.index(body_name) 126 | return self.sim.data.subtree_com[idx] 127 | 128 | def reset_model(self): 129 | c = 0.01 130 | self.set_state( 131 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 132 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 133 | ) 134 | return self._get_obs() 135 | 136 | def viewer_setup(self): 137 | self.viewer.cam.trackbodyid = 1 138 | self.viewer.cam.distance = self.model.stat.extent * 1.0 139 | self.viewer.cam.lookat[2] += .8 140 | self.viewer.cam.elevation = -20 141 | -------------------------------------------------------------------------------- /options/hierarchical_final/hierarchical_many_phase_dqn.yaml: -------------------------------------------------------------------------------- 1 | alg_ppo: 2 | use_gae: True # Use generalized advantage estimation 3 | gae_tau: 0.95 # GAE parameter 4 | entropy_coef: 0 # Entropy term coefficient 5 | value_loss_coef: 1 # Value loss coefficient 6 | num_steps: 2048 # Number of forward steps 7 | num_processes: 1 # Number of parallel processes to run 8 | ppo_epoch: 100 # Number of ppo epochs 9 | num_mini_batch: 32 # Number of batcxhes for ppo 10 | clip_param: 0.2 # ppo clip parameter 11 | log_mult: 1 # How much less often to log for this alg 12 | norm_ret: False # Whether to add normalization to returns 13 | alg_a2c: 14 | use_gae: False # Use generalized advantage estimation 15 | gae_tau: 0.95 # GAE parameter 16 | entropy_coef: 0.01 # Entropy term coefficient 17 | value_loss_coef: 0.5 # Value loss coefficient 18 | num_steps: 5 # Number of forward steps 19 | num_processes: 16 # Number of parallel processes to run 20 | log_mult: 10 # How much less often to log for this alg 21 | alg_dqn: 22 | batch_size: 128 # DQN batch size 23 | target_update: 10000 # After how many steps to update DQN target 24 | mem_capacity: 10000000 # How many frames to store in replay memory 25 | num_steps: 1 # How many high level actions to take before doing DQN updates 26 | updates_per_step: 10 # How many DQN updates to do every loop 27 | num_processes: 1 # Number of parallel processes to run 28 | norm_ret: False # Whether to add normalization to returns 29 | log_mult: 100 30 | save_interval: 100000 31 | model: 32 | recurrent_policy: False # Use a recurrent policy 33 | mode: hierarchical_many # Mode is hierarchical many 34 | hid_sz: 32 # MLP hidden size 35 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module) 36 | num_layer: 2 # Number of layers in MLP (minus input layer) 37 | env: 38 | gamma: 0.99 # Discount factor for rewards 39 | num_stack: 1 # Number of frames to stack 40 | add_timestep: False # Add timestep to observations 41 | known_reset: False # Reset to known position 42 | time_scale: 0.001 # What to multiply timestep by for AC input 43 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92 44 | maze: 45 | goal_radius: 2 # Distance to goal in order to reach it 46 | goal_reward: 5 # How much reward to give for getting to goal 47 | velocity_reward_weight: 0 # How much weight to give to moving (any direction) 48 | use_ctrl_cost: 0 # Whether to use ctrl cost in 
final reward
49 | use_contact_cost: 0 # Whether to use contact cost in final reward
50 | use_survive_reward: 0 # Whether to use survive reward in final reward
51 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
52 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
53 | logs:
54 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
55 | exp_name: hierarchical_many_phase_dqn_final # Unique experiment name
56 | log_interval: 1 # Log interval, one log per n updates
57 | save_interval: 100 # Save interval, one per n updates
58 | vis_interval: 1 # Vis interval, one log per n updates
59 | optim_ppo:
60 | lr: 0.0003 # Learning rate
61 | eps: 0.00001 # RMSprop optimizer epsilon
62 | alpha: 0.99 # RMSprop optimizer alpha
63 | max_grad_norm: 0.5 # Max norm of gradients
64 | num_frames: 20000000 # Number of frames to train
65 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
66 | num_ll_steps: 10 # How many low level steps to do in a row before updating the high level step
67 | optim_a2c:
68 | lr: 0.0007 # Learning rate
69 | eps: 0.00001 # RMSprop optimizer epsilon
70 | alpha: 0.99 # RMSprop optimizer alpha
71 | max_grad_norm: 0.5 # Max norm of gradients
72 | num_frames: 20000000 # Number of frames to train
73 | hierarchical_mode: train_highlevel
74 | num_ll_steps: 10
75 | optim_dqn:
76 | lr: 0.0003
77 | eps: 0.00001
78 | max_grad_norm: 1
79 | eps_start: 0.9
80 | eps_end: 0
81 | eps_decay: 100000 # TODO - no idea if this value makes sense
82 | num_frames: 20000000 # Number of frames to train
83 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
84 | num_ll_steps: 10
85 | lowlevel:
86 | optfile: options/phase_lowlevel/phase_mlp_pretrain_any.yaml # Opt file location of low level policy
87 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/
88 | deterministic: False # Whether low level policies are deterministic
89 | num_load: 16 # Number of low level policies
90 |
--------------------------------------------------------------------------------
/environments/proprioceptive_humanoid_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numpy as np
3 | from gym import utils
4 | from . import mujoco_env
5 | from . 
import geom_utils 6 | 7 | def mass_center(model, sim): 8 | mass = np.expand_dims(model.body_mass, 1) 9 | xpos = sim.data.xipos 10 | return (np.sum(mass * xpos, 0) / np.sum(mass)) 11 | 12 | class BaseProprioceptiveHumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): 13 | # Initialize Mujoco environment 14 | def __init__(self, xml_file='humanoid.xml'): 15 | # Set start values for registration 16 | self.start_yaw = float('inf') 17 | self.start_z = float('inf') 18 | mujoco_env.MujocoEnv.__init__(self, xml_file, 5) 19 | utils.EzPickle.__init__(self) 20 | 21 | # Forward step 22 | def step(self, a): 23 | pos_before = mass_center(self.model, self.sim) 24 | self.do_simulation(a, self.frame_skip) 25 | pos_after = mass_center(self.model, self.sim) 26 | alive_bonus = 5.0 27 | data = self.sim.data 28 | lin_vel_cost = 0.25 * (pos_after - pos_before) / self.model.opt.timestep 29 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 30 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 31 | quad_impact_cost = min(quad_impact_cost, 10) 32 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 33 | qpos = self.sim.data.qpos 34 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 35 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 36 | 37 | # Get states by name 38 | def get_state_by_name(self, name, s=None): 39 | # Get state (if not passed in) 40 | if s is None: 41 | s = self.state_vector() 42 | 43 | # Replace with mass center 44 | s[0:3] = mass_center(self.model, self.sim) 45 | 46 | # Switch on name 47 | if name == 'xyz': 48 | val = s[0:3] 49 | elif name == 'x': 50 | val = s[0] 51 | elif name == 'y': 52 | val = s[1] 53 | elif name == 'z': 54 | val = s[2] 55 | elif name == 'quart': 56 | val = s[3:7] 57 | elif name in ['rpy', 'roll', 'pitch', 'yaw']: 58 | quart = s[3:7] 59 | roll, pitch, yaw = geom_utils.quaternion_to_euler_angle(quart) 60 | if name == 'roll': 61 | val = roll 62 | elif name == 'pitch': 63 | val = pitch 64 | elif name == 'yaw': 65 | val = yaw 66 | elif name == 'rpy': 67 | return np.array([roll, pitch, yaw]) 68 | elif name == 'joint_angles': 69 | val = s[7:24] 70 | elif name == 'xyz_vel': 71 | val = s[24:27] 72 | elif name == 'x_vel': 73 | val = s[24] 74 | elif name == 'y_vel': 75 | val = s[25] 76 | elif name == 'z_vel': 77 | val = s[26] 78 | elif name == 'rpy_vel': 79 | val = s[27:30] 80 | elif name == 'roll_vel': 81 | val = s[27] 82 | elif name == 'pitch_vel': 83 | val = s[28] 84 | elif name == 'yaw_vel': 85 | val = s[29] 86 | elif name == 'joint_angle_vel': 87 | val = s[30:] 88 | return val 89 | 90 | # We remove the first 5 values from state which should correspond to global orientation and position 91 | # https://github.com/openai/gym/wiki/Humanoid-V1 92 | def get_intern_extern_state(self): 93 | # Extract different states 94 | s = self.state_vector() 95 | z = self.get_state_by_name('z', s) 96 | xyz = self.get_state_by_name('xyz', s) 97 | rpy = self.get_state_by_name('rpy', s) 98 | joint_angles = self.get_state_by_name('joint_angles', s) 99 | d_xyz = self.get_state_by_name('xyz_vel', s) 100 | d_rpy = self.get_state_by_name('rpy_vel', s) 101 | d_joint = self.get_state_by_name('joint_angle_vel', s) 102 | 103 | # Seperate out yaw 104 | roll = rpy[0] 105 | pitch = rpy[1] 106 | yaw = rpy[2] 107 | d_roll = d_rpy[0] 108 | d_pitch = d_rpy[1] 109 | d_yaw = d_rpy[2] 110 | 111 | # Set internal/external states 112 | # Internal keeps track of integral z and yaw 
(subtract out the initial value) 113 | pro_yaw = geom_utils.convert_to_egocentric(self.start_yaw, yaw) 114 | pro_z = z - self.start_z 115 | s_internal = np.concatenate([[pro_z, roll, pitch, pro_yaw], joint_angles, d_xyz, [d_roll, d_pitch, d_yaw], d_joint]) 116 | s_external = np.concatenate([xyz, [yaw]]) 117 | 118 | return s_internal, s_external 119 | 120 | def _get_obs(self): 121 | raise NotImplementedError 122 | 123 | def reset_model(self): 124 | c = 0.01 125 | self.set_state( 126 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 127 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 128 | ) 129 | 130 | # Get the initial z and yaw, and keep track of to get integral values 131 | self.start_yaw = self.get_state_by_name('yaw') 132 | self.start_z = self.get_state_by_name('z') 133 | 134 | return self._get_obs() 135 | 136 | def viewer_setup(self): 137 | self.viewer.cam.trackbodyid = 1 138 | self.viewer.cam.distance = self.model.stat.extent * 1.0 139 | self.viewer.cam.lookat[2] += .8 140 | self.viewer.cam.elevation = -20 141 | -------------------------------------------------------------------------------- /algo/dqn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import random 7 | import pdb 8 | from storage import ReplayMemory, Transition 9 | 10 | # Code copied and adapted from pytorch Q learning tutorial https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html 11 | class DQN(object): 12 | def __init__(self, 13 | dqn, 14 | gamma, 15 | batch_size=128, 16 | target_update=100, 17 | mem_capacity=10000000, 18 | lr=None, 19 | eps=None, 20 | max_grad_norm=1): 21 | 22 | self.gamma = gamma 23 | self.dqn = dqn 24 | self.batch_size = batch_size 25 | self.target_update = target_update 26 | self.max_grad_norm = max_grad_norm 27 | self.optimizer = optim.Adam(self.dqn.policy_net.parameters(), lr=lr, eps=eps) 28 | self.num_updates = 0 29 | self.replay_memory = ReplayMemory(mem_capacity) 30 | 31 | # Generate a state_dict object 32 | def state_dict(self): 33 | ckpt = {} 34 | ckpt['model'] = [self.dqn.policy_net.state_dict(), self.dqn.target_net.state_dict()] 35 | ckpt['optim'] = self.optimizer.state_dict() 36 | ckpt['steps_done'] = self.dqn.steps_done 37 | ckpt['num_updates'] = self.num_updates 38 | ckpt['memory'] = self.replay_memory 39 | return ckpt 40 | 41 | # Load from a state dict 42 | def load_state_dict(self, ckpt): 43 | self.dqn.policy_net.load_state_dict(ckpt['model'][0]) 44 | self.dqn.target_net.load_state_dict(ckpt['model'][1]) 45 | self.optimizer.load_state_dict(ckpt['optim']) 46 | self.dqn.steps_done = ckpt['steps_done'] 47 | self.num_updates = ckpt['num_updates'] 48 | self.replay_memory = ckpt['memory'] 49 | 50 | # Update the replay memory 51 | def update_memory(self, states, actions, next_states, rewards, done_mask, step_masks): 52 | # Go through each index (corresponding to different environment steps) 53 | for state, action, next_state, reward, done, step_mask in zip(states, actions, next_states, rewards, done_mask, step_masks): 54 | # If in zombie step mask state, do nothing 55 | if step_mask > 0: 56 | # Make deep copies, convert to numpy and append to replay memory 57 | state = np.array(state.cpu().numpy()) 58 | action = np.array(action.cpu().numpy()) 59 | reward = np.array(reward) 60 | if done: 61 | next_state = None 62 | else: 63 | next_state = 
np.array(next_state.cpu().numpy()) 64 | 65 | # Push into replay memory 66 | self.replay_memory.push(state, action, next_state, reward) 67 | 68 | # Update our policy network 69 | def update(self, num_updates): 70 | # Replay memory needs to at least be the batch size 71 | if len(self.replay_memory) < self.batch_size: 72 | return 0, 0, 0 73 | assert(len(self.replay_memory) >= self.batch_size) 74 | 75 | # Do updates 76 | dqn_loss = 0 77 | for update in range(num_updates): 78 | # Get batch values 79 | transitions = self.replay_memory.sample(self.batch_size) 80 | batch = Transition(*zip(*transitions)) 81 | non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, 82 | batch.next_state)), dtype=torch.uint8) 83 | non_final_mask = non_final_mask.unsqueeze(1) 84 | non_final_next_states = torch.cat([torch.from_numpy(s).unsqueeze(0) for s in batch.next_state 85 | if s is not None]) 86 | state_batch = torch.cat([torch.from_numpy(s).unsqueeze(0) for s in batch.state]) 87 | action_batch = torch.cat([torch.from_numpy(a).unsqueeze(0) for a in batch.action]) 88 | reward_batch = torch.cat([torch.from_numpy(r).unsqueeze(0) for r in batch.reward]) 89 | next_state_values = torch.zeros(self.batch_size, 1) 90 | 91 | # Convert to cuda 92 | if self.dqn.target_net.in_fc.weight.is_cuda: 93 | non_final_mask = non_final_mask.cuda() 94 | non_final_next_states = non_final_next_states.cuda() 95 | state_batch = state_batch.cuda() 96 | action_batch = action_batch.cuda() 97 | reward_batch = reward_batch.cuda() 98 | next_state_values = next_state_values.cuda() 99 | 100 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 101 | # columns of actions taken 102 | state_action_values = self.dqn.policy_net(state_batch).gather(1, action_batch) 103 | 104 | # Compute V(s_{t+1}) for all next states. 105 | next_state_values[non_final_mask] = self.dqn.target_net(non_final_next_states).max(1)[0].detach() 106 | # Compute the expected Q values 107 | expected_state_action_values = (next_state_values * self.gamma) + reward_batch 108 | 109 | # Compute Huber loss 110 | loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) 111 | dqn_loss += loss 112 | 113 | # Optimize the model 114 | self.optimizer.zero_grad() 115 | loss.backward() 116 | for param in self.dqn.policy_net.parameters(): 117 | param.grad.data.clamp_(-self.max_grad_norm, self.max_grad_norm) 118 | self.optimizer.step() 119 | 120 | self.num_updates += 1 121 | 122 | # Update target network 123 | if self.num_updates % self.target_update == 0: 124 | self.dqn.target_net.load_state_dict(self.dqn.policy_net.state_dict()) 125 | 126 | dqn_loss /= num_updates 127 | return dqn_loss, 0, 0 128 | -------------------------------------------------------------------------------- /environments/assets/skull_maze_ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 99 | -------------------------------------------------------------------------------- /environments/assets/cross_maze_ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 103 | -------------------------------------------------------------------------------- /summarize_results.py: -------------------------------------------------------------------------------- 1 | # This file takes the completed slurm logs and summarizes the results 2 | # Right now, just displays the average +- var of the final rewards (or whatever value we set in the ymal) 3 | # Eventually should be able to plot variance? 
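# Example invocation (an illustrative sketch, not part of the original file; flag names are the
# argparse options defined below, and the yaml/algo/env values come from configs elsewhere in this repo):
#   python summarize_results.py --batch_path_opt options/phase_lowlevel/phase_mlp_pretrain_any.yaml \
#       --vis_path_opt options/visualization/reward.yaml --algo ppo --env-name ExplorerAnt-v2 \
#       --eval-key reward_env --bin-size 100 --num-trials 5
# For a single (non-batch) yaml this averages --eval-key over the last --bin-size episodes of each
# trial and prints one "name: mean+=std" line per experiment, optionally also writing it to --outfile.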
4 | import argparse 5 | import yaml 6 | import json 7 | import csv 8 | from pprint import pprint 9 | import click 10 | import shutil 11 | import copy 12 | import glob 13 | import os 14 | import time 15 | import itertools 16 | import pdb 17 | import torch 18 | import numpy as np 19 | from visualize import Dashboard 20 | 21 | # Get Input Arguments 22 | parser = argparse.ArgumentParser(description='RL') 23 | 24 | ################################################## 25 | # yaml options file contains all default choices # 26 | parser.add_argument('--batch_path_opt', default='options/batch/default.yaml', type=str, 27 | help='path to a yaml options file') 28 | # yaml option file containing the visdom plotting options 29 | parser.add_argument('--vis_path_opt', default='options/visualization/reward.yaml', type=str, 30 | help='path to a yaml visualization options file') 31 | ################################################## 32 | parser.add_argument('--eval-key', type=str, default='reward_env', 33 | help='name of key in the Episode log that we actually want to evaluate on') 34 | parser.add_argument('--outfile', type=str, default='', 35 | help='where to dump these results (optional)') 36 | parser.add_argument('--bin-size', type=int, default=100, 37 | help='over how many episode to average final result value') 38 | # These options only matter if we're batch_path_opt is actually just a single yaml (not a batch) 39 | parser.add_argument('--num-trials', type=int, default=5) 40 | parser.add_argument('--trial-offset', type=int, default=0) 41 | parser.add_argument('--algo', default='a2c', 42 | help='algorithm to use: a2c | ppo | acktr') 43 | parser.add_argument('--env-name', default='Hopper-v2', 44 | help='environment to train on (default: Hopper-v2)') 45 | def main(): 46 | global args 47 | args = parser.parse_args() 48 | 49 | # Set options 50 | if args.batch_path_opt is not None: 51 | with open(args.batch_path_opt, 'r') as handle: 52 | batch_options = yaml.load(handle) 53 | if args.vis_path_opt is not None: 54 | with open(args.vis_path_opt, 'r') as handle: 55 | vis_options = yaml.load(handle) 56 | print('## args'); pprint(vars(args)) 57 | 58 | # Either use the slurm batch file or the single yaml file to get the values 59 | val_dict = {} 60 | if 'base_yaml' in batch_options: 61 | # Slurm version 62 | algo = batch_options['algo'] 63 | env_name = batch_options['env_name'] 64 | num_trials = batch_options['num_trials'] 65 | trial_offset = batch_options['trial_offset'] 66 | base_yaml_file = batch_options['base_yaml'] 67 | 68 | # Get the list of yaml files 69 | # Copies logic from clusterrun to make these 70 | grid = batch_options['params'] 71 | individual_options = [[{key: value} for value in values] for key, values in grid.items()] 72 | product_options = list(itertools.product(*individual_options)) 73 | jobs = [{k: v for d in option_set for k, v in d.items()} for option_set in product_options] 74 | basenames = [] 75 | yaml_files = [] 76 | with open(base_yaml_file) as f: 77 | base_options = yaml.load(f) 78 | for job in jobs: 79 | new_unique_name = base_options['logs']['exp_name'] 80 | for k, v in job.items(): 81 | new_unique_name += "_" + str(k) + "_" + str(v) 82 | assert(len(base_yaml_file.split('.')) == 2) 83 | new_yaml_filename = base_yaml_file.split('.')[0] 84 | new_yaml_filename = os.path.join(new_yaml_filename, new_unique_name) + '.yaml' 85 | basenames.append(new_unique_name) 86 | yaml_files.append(new_yaml_filename) 87 | assert(len(yaml_files) == len(jobs)) 88 | assert(len(basenames) == len(jobs)) 89 | 90 | 
# Get the eval vals for each param set 91 | val_dict = {} 92 | for yaml_file, name in zip(yaml_files, basenames): 93 | with open(yaml_file, 'r') as handle: 94 | opt = yaml.load(handle) 95 | eval_vals = get_last_eval_vals(opt, vis_options, args.eval_key, algo, env_name, num_trials, trial_offset, args.bin_size) 96 | if eval_vals is not None: 97 | val_dict[name] = eval_vals 98 | else: 99 | # Single yaml version 100 | algo = args.algo 101 | env_name = args.env_name 102 | opt = batch_options 103 | num_trials = args.num_trials 104 | trial_offset = args.trial_offset 105 | 106 | # Get the eval vals for this yaml 107 | eval_vals = get_last_eval_vals(opt, vis_options, args.eval_key, algo, env_name, num_trials, trial_offset, args.bin_size) 108 | 109 | # Save to dict 110 | name = opt['logs']['exp_name'] 111 | val_dict[name] = eval_vals 112 | 113 | # Get the average values and std for each value in dict 114 | # Sort by average 115 | # Display / print each by decreasing average value 116 | avg_dict = {k: np.mean(v) for k, v in val_dict.items()} 117 | sorted_avg_dict = sorted(avg_dict.items(), reverse=True, key=lambda x: x[1]) 118 | sorted_names = [x[0] for x in sorted_avg_dict] 119 | lines = [] 120 | lines.append("Results for run of {yaml_name} on variable {var}".format(yaml_name=args.batch_path_opt.split('/')[-1], var=args.eval_key)) 121 | for name in sorted_names: 122 | lines.append("{name}: {avg}+={std}".format(name=name, avg=np.mean(val_dict[name]), std=np.std(val_dict[name]))) 123 | 124 | # Print results 125 | for line in lines: 126 | print(line) 127 | 128 | # Optionally print to file 129 | if len(args.outfile) > 0: 130 | with open(args.outfile, 'w') as f: 131 | for line in lines: 132 | f.write(line + '\n') 133 | 134 | # Get the last bucket values for the eval_key for each trial and return 135 | def get_last_eval_vals(opt, vis_opt, eval_key, algo, env_name, num_trials, trial_offset, bin_size): 136 | # For each trial 137 | eval_vals = [] 138 | for trial in range(trial_offset, trial_offset+num_trials): 139 | # Get the logpath 140 | logpath = os.path.join(opt['logs']['log_base'], opt['model']['mode'], opt['logs']['exp_name'], algo, env_name, 'trial%d' % trial) 141 | if not os.path.isdir(logpath): 142 | return None 143 | 144 | 145 | # Create the dashboard object 146 | opt['env']['env-name'] = env_name 147 | opt['alg'] = opt['alg_%s' % algo] 148 | opt['optim'] = opt['optim_%s' % algo] 149 | opt['alg']['algo'] = algo 150 | opt['trial'] = trial 151 | dash = Dashboard(opt, vis_opt, logpath, vis=False) 152 | 153 | # Get data 154 | try: 155 | dash.preload_data() 156 | raw_x, raw_y = dash.load_data('episode_monitor', 'scalar', eval_key) 157 | except Exception: 158 | return None 159 | 160 | # Get data from last bin 161 | if not (len(raw_y) > bin_size): 162 | return None 163 | raw_vals = raw_y[-bin_size:] 164 | assert(len(raw_vals) == bin_size) 165 | raw_vals = [float(v) for v in raw_vals] 166 | raw_val = np.mean(raw_vals) 167 | eval_vals.append(raw_val) 168 | 169 | # Return 170 | return eval_vals 171 | 172 | if __name__ == "__main__": 173 | main() 174 | 175 | --------------------------------------------------------------------------------
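A minimal sketch (not a file in this repo) of how summarize_results.py assembles an option dict and log path from one of the option yamls above. The helper name load_opt and the example yaml/algo/env/trial values are assumptions for illustration; the key lookups mirror get_last_eval_vals in summarize_results.py.

import os
import yaml

def load_opt(opt_path, algo, env_name, trial):
    # Load one of the option yamls and pick out the per-algorithm blocks,
    # mirroring what summarize_results.py does before constructing a Dashboard.
    with open(opt_path, 'r') as handle:
        # The repo calls yaml.load(handle); safe_load is used here so the sketch runs on current PyYAML.
        opt = yaml.safe_load(handle)
    opt['env']['env-name'] = env_name
    opt['alg'] = opt['alg_%s' % algo]      # e.g. the alg_ppo or alg_a2c block
    opt['optim'] = opt['optim_%s' % algo]  # e.g. the optim_ppo or optim_a2c block
    opt['alg']['algo'] = algo
    opt['trial'] = trial
    # Log directory layout used by get_last_eval_vals: log_base/mode/exp_name/algo/env_name/trialN
    logpath = os.path.join(opt['logs']['log_base'], opt['model']['mode'],
                           opt['logs']['exp_name'], algo, env_name, 'trial%d' % trial)
    return opt, logpath

# Illustrative call (values appear in the configs above):
#   opt, logpath = load_opt('options/phase_lowlevel/phase_mlp_pretrain_any.yaml', 'ppo', 'ExplorerAnt-v2', 0)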