├── algo
│   ├── __init__.py
│   ├── passthrough.py
│   ├── a2c_acktr.py
│   ├── ppo.py
│   └── dqn.py
├── options
│   ├── visualization
│   │   ├── eval_default.yaml
│   │   ├── supervised_default.yaml
│   │   ├── simple_reward.yaml
│   │   ├── reward_forward.yaml
│   │   ├── goal_distance.yaml
│   │   ├── interp_reward_simple.yaml
│   │   ├── interp_reward_surviveinterp.yaml
│   │   ├── arbitrary_theta_vis.yaml
│   │   ├── default.yaml
│   │   ├── reward.yaml
│   │   ├── interp_reward.yaml
│   │   ├── plot_state_action_cycles.yaml
│   │   ├── state_reward.yaml
│   │   ├── state_xy_reward.yaml
│   │   ├── all_episode.yaml
│   │   ├── state_reward_alg.yaml
│   │   └── mujoco_verbose.yaml
│   ├── baseline_lowlevel
│   │   ├── baseline_simplemlp_pretrain_any.yaml
│   │   └── baseline_simplemlp_skip_hs16_pretrain_any_20M.yaml
│   ├── phase_lowlevel
│   │   ├── phase_mlp_pretrain_any.yaml
│   │   └── phase_mlp_skip_hs16_pretrain_any_20M.yaml
│   ├── hierarchical_final
│   │   ├── hierarchical_many_phase.yaml
│   │   ├── hierarchical_many_4M_phase.yaml
│   │   ├── hierarchical_many_phase_antlowgear.yaml
│   │   ├── hierarchical_many_phase_a2c.yaml
│   │   ├── hierarchical_many_phase_16ll_proprioceptivehumanoid.yaml
│   │   ├── hierarchical_many_baseline.yaml
│   │   ├── hierarchical_many_4M_baseline.yaml
│   │   ├── hierarchical_many_baseline_antlowgear.yaml
│   │   └── hierarchical_many_phase_dqn.yaml
│   ├── maze_baseline
│   │   ├── maze_baseline.yaml
│   │   ├── maze_baseline_wmove_r1000.yaml
│   │   ├── maze_baseline_finetune.yaml
│   │   └── maze_baseline_wmove_r1000_finetune.yaml
│   └── maze_baseline_wphase
│       ├── maze_baseline_phase.yaml
│       ├── maze_baseline_phase_humanoid.yaml
│       ├── maze_baseline_phase_wmove_r1000.yaml
│       ├── maze_baseline_phase_wmove_r1000_humanoid.yaml
│       ├── maze_baseline_phase_finetune.yaml
│       ├── maze_baseline_phase_wmove_r1000_finetune.yaml
│       ├── maze_baseline_phase_finetune_proprioceptivehumanoid.yaml
│       └── maze_baseline_phase_wmove_r1000_finetune_proprioceptivehumanoid.yaml
├── environments
│   ├── explorer_humanoid.py
│   ├── explorer_ant.py
│   ├── geom_utils.py
│   ├── __init__.py
│   ├── RewardCyclicEnv.py
│   ├── assets
│   │   ├── ant_custom_gear.xml
│   │   ├── my_ant.xml
│   │   ├── skull_maze_ant.xml
│   │   └── cross_maze_ant.xml
│   ├── mujoco_env.py
│   ├── simple_humanoid_env.py
│   └── proprioceptive_humanoid_env.py
├── LICENSE
├── README.md
├── .gitignore
├── distributions.py
├── hier_utils.py
├── utils.py
├── wrappers.py
└── summarize_results.py
/algo/__init__.py:
--------------------------------------------------------------------------------
1 | from .a2c_acktr import A2C_ACKTR
2 | from .ppo import PPO
3 | from .dqn import DQN
4 | from .passthrough import Passthrough
5 |
--------------------------------------------------------------------------------
/options/visualization/eval_default.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 |
4 |
--------------------------------------------------------------------------------
/options/visualization/supervised_default.yaml:
--------------------------------------------------------------------------------
1 | alg_monitor_str: Alg.Monitor.csv
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | supervised_loss:
4 | log_name:
5 | train_loss: True
6 | val_loss: True
7 | data_src: alg_monitor
8 | data_type: multiscalar
9 | bin_size: 1
10 | smooth: 0
11 |
--------------------------------------------------------------------------------
/options/visualization/simple_reward.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | rewards_ep:
4 | log_name:
5 | reward_env: True
6 | data_src: episode_monitor
7 | data_type: multiscalar
8 | bin_size: 100
9 | smooth: 1
10 |
--------------------------------------------------------------------------------
/options/visualization/reward_forward.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | rewards_ep:
4 | log_name:
5 | reward_forward: True
6 | data_src: episode_monitor
7 | data_type: multiscalar
8 | bin_size: 100
9 | smooth: 1
10 |
--------------------------------------------------------------------------------
/options/visualization/goal_distance.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | rewards_ep:
4 | log_name:
5 | goal_distance_radius: True
6 | data_src: episode_monitor
7 | data_type: multiscalar
8 | bin_size: 100
9 | smooth: 1
10 |
--------------------------------------------------------------------------------
/options/visualization/interp_reward_simple.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | rewards_ep:
4 | log_name:
5 | reward_interpolate: True
6 | data_src: episode_monitor
7 | data_type: multiscalar
8 | bin_size: 100
9 | smooth: 1
10 |
--------------------------------------------------------------------------------
/options/visualization/interp_reward_surviveinterp.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | rewards_ep:
4 | log_name:
5 | reward_survive: True
6 | reward_interpolate: True
7 | data_src: episode_monitor
8 | data_type: multiscalar
9 | bin_size: 100
10 | smooth: 1
11 |
--------------------------------------------------------------------------------
/options/visualization/arbitrary_theta_vis.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | step_monitor_str: Last.Step.Monitor.csv # Name of step monitor (for last step)
3 | plot_keys: # Values we want to plot (and necessary info to plot them)
4 | theta_xy_plot:
5 | log_name:
6 | state: True
7 | obs: True
8 | data_src: step_monitor
9 | update_delay: 0
10 | data_type: special
11 | window_once: True
12 |
--------------------------------------------------------------------------------
/environments/explorer_humanoid.py:
--------------------------------------------------------------------------------
1 | from . import proprioceptive_humanoid_env
2 | import numpy as np
3 |
4 | # All obs but xy but yaw and z use integrals
5 | class LowlevelProprioceptiveHumanoidEnv(proprioceptive_humanoid_env.BaseProprioceptiveHumanoidEnv):
6 | # Initialize environment
7 | def __init__(self):
8 | super(LowlevelProprioceptiveHumanoidEnv, self).__init__()
9 |
10 | def _get_obs(self):
11 | s_internal, _ = self.get_intern_extern_state()
12 | return s_internal
13 |
--------------------------------------------------------------------------------
/options/visualization/default.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | rewards_ep:
4 | log_name:
5 | reward_env: True
6 | reward_exp: False
7 | reward_run: False
8 | reward_ctrl: False
9 | reward_contact: False
10 | reward_survive: False
11 | reward_forward: False
12 | reward_move: False
13 | reward_cycle: False
14 | reward_cycle_s: False
15 | reward_cycle_a: False
16 | data_src: episode_monitor
17 | data_type: multiscalar
18 | bin_size: 100
19 | smooth: 1
20 |
--------------------------------------------------------------------------------
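The visualization configs above all share the same shape: a monitor-file basename plus a `plot_keys` mapping whose entries declare a data source, a data type, and binning/smoothing. A minimal inspection sketch (assuming PyYAML is installed and the repo root is the working directory; the plotting code that actually consumes these files is not shown in this excerpt):

```python
import yaml

# Load a visualization config and list what it asks to plot.
with open('options/visualization/default.yaml') as f:
    vis_opt = yaml.safe_load(f)

print(vis_opt['episode_monitor_str'])  # Episode.Monitor.csv
for name, spec in vis_opt['plot_keys'].items():
    # Each plot key declares where its data comes from and how to bin/smooth it.
    print(name, spec['data_src'], spec['data_type'], spec['bin_size'], spec['smooth'])
```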
/options/visualization/reward.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | rewards_ep:
4 | log_name:
5 | reward_env: True
6 | reward_exp: False
7 | reward_run: False
8 | reward_ctrl: False
9 | reward_contact: False
10 | reward_survive: False
11 | reward_forward: False
12 | reward_move: False
13 | reward_cycle: False
14 | reward_cycle_s: False
15 | reward_cycle_a: False
16 | reward_thresh: False
17 | data_src: episode_monitor
18 | data_type: multiscalar
19 | bin_size: 100
20 | smooth: 1
21 |
--------------------------------------------------------------------------------
/options/visualization/interp_reward.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | rewards_ep:
4 | log_name:
5 | reward_interpolate: True
6 | reward_env: True
7 | reward_exp: False
8 | reward_run: False
9 | reward_ctrl: False
10 | reward_contact: False
11 | reward_survive: False
12 | reward_forward: False
13 | reward_move: False
14 | reward_cycle: False
15 | reward_cycle_s: False
16 | reward_cycle_a: False
17 | data_src: episode_monitor
18 | data_type: multiscalar
19 | bin_size: 100
20 | smooth: 1
21 |
--------------------------------------------------------------------------------
/environments/explorer_ant.py:
--------------------------------------------------------------------------------
1 | from . import ant_env
2 | import numpy as np
3 |
4 | # Only contains the internal state of the ant in the observation
5 | class LowlevelAntEnv(ant_env.BaseAntEnv):
6 | # Initialize environment
7 | def __init__(self):
8 | super(LowlevelAntEnv, self).__init__()
9 |
10 | def _get_obs(self):
11 | s_internal, _ = self.get_intern_extern_state()
12 | return s_internal
13 |
14 | # Only contains the internal state of the ant in the observation
15 | class LowlevelAntLowGearEnv(ant_env.BaseAntLowGearEnv):
16 | # Initialize environment
17 | def __init__(self):
18 | super(LowlevelAntLowGearEnv, self).__init__(xml_file='ant_custom_gear.xml')
19 |
20 | def _get_obs(self):
21 | s_internal, _ = self.get_intern_extern_state()
22 | return s_internal
23 |
24 |
--------------------------------------------------------------------------------
/algo/passthrough.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.optim as optim
4 |
5 | # Does no updates, just does forward passes
6 | class Passthrough(object):
7 | def __init__(self, actor_critic):
8 | self.actor_critic = actor_critic
9 |
10 | # Generate a state_dict object
11 | def state_dict(self):
12 | ckpt = {}
13 | ckpt['model'] = self.actor_critic.state_dict()
14 | return ckpt
15 |
16 | # Load from a state dict
17 | def load_state_dict(self, ckpt):
18 | self.actor_critic.load_state_dict(ckpt['model'])
19 |
20 | # Load from pretrained (ModularPolicy)
21 | def load_pretrained_policies(self, ckpts):
22 | self.actor_critic.load_pretrained_policies(ckpts)
23 |
24 | # Update our policy network
25 | def update(self, rollouts):
26 | return 0, 0, 0
27 |
28 |
--------------------------------------------------------------------------------
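A minimal usage sketch for `Passthrough` (not from the repo): the `nn.Linear` stand-in is a hypothetical replacement for the real actor-critic network, and importing the `algo` package assumes the full repo is on the Python path (including `algo/kfac.py` and `algo/dqn.py`, which this excerpt does not show).

```python
import torch.nn as nn
from algo.passthrough import Passthrough

actor_critic = nn.Linear(4, 2)        # hypothetical stand-in for the real policy network
agent = Passthrough(actor_critic)

ckpt = agent.state_dict()             # {'model': <actor_critic state dict>}
agent.load_state_dict(ckpt)
losses = agent.update(rollouts=None)  # no-op: always returns (0, 0, 0)
```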
/options/visualization/plot_state_action_cycles.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | step_monitor_str: Last.Step.Monitor.csv # Name of step monitor (for last step)
3 | plot_keys: # Values we want to plot (and necessary info to plot them)
4 | obs_step:
5 | log_name: obs
6 | data_src: step_monitor
7 | update_delay: 0
8 | bin_size: 1
9 | smooth: 0
10 | data_type: array
11 | display_type: elementwise_subset
12 | start_ind: 0
13 | end_ind: 20
14 | time_start: 500
15 | time_end: 550
16 | action_step:
17 | log_name: action
18 | data_src: step_monitor
19 | update_delay: 0
20 | bin_size: 1
21 | smooth: 0
22 | data_type: array
23 | display_type: elementwise
24 | time_start: 500
25 | time_end: 550
26 |
--------------------------------------------------------------------------------
/options/visualization/state_reward.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | delta_state_ep:
4 | log_name: delta_state
5 | data_src: episode_monitor
6 | data_type: array
7 | display_type: elementwise_subset
8 | start_ind: 0
9 | end_ind: 1
10 | bin_size: 100
11 | smooth: 1
12 | rewards_ep:
13 | log_name:
14 | reward_env: True
15 | reward_exp: False
16 | reward_run: False
17 | reward_ctrl: False
18 | reward_contact: False
19 | reward_survive: False
20 | reward_forward: False
21 | reward_move: False
22 | reward_cycle: False
23 | reward_cycle_s: False
24 | reward_cycle_a: False
25 | data_src: episode_monitor
26 | data_type: multiscalar
27 | bin_size: 100
28 | smooth: 1
29 |
--------------------------------------------------------------------------------
/options/visualization/state_xy_reward.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | delta_state_ep:
4 | log_name: delta_state
5 | data_src: episode_monitor
6 | data_type: array
7 | display_type: elementwise_subset
8 | start_ind: 0
9 | end_ind: 2
10 | bin_size: 100
11 | smooth: 1
12 | rewards_ep:
13 | log_name:
14 | reward_env: True
15 | reward_exp: False
16 | reward_run: False
17 | reward_ctrl: False
18 | reward_contact: False
19 | reward_survive: False
20 | reward_forward: False
21 | reward_move: False
22 | reward_cycle: False
23 | reward_cycle_s: False
24 | reward_cycle_a: False
25 | data_src: episode_monitor
26 | data_type: multiscalar
27 | bin_size: 100
28 | smooth: 1
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Kenneth Marino
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # hrl-ep3
2 | Code for our paper: Hierarchical RL Using an Ensemble of Proprioceptive Periodic Policies
3 |
4 | ## Citation
5 | If you find this work helpful, please cite our paper, as well as Ilya Kostrikov's PyTorch baselines repo.
6 |
7 | @article{marino2019ep3,
8 | title={Hierarchical RL Using an Ensemble of Proprioceptive Periodic Policies},
9 | author={Marino, Kenneth and Gupta, Abhinav and Fergus, Rob and Szlam, Arthur},
10 | journal={ICLR},
11 | year={2019}
12 | }
13 |
14 | ## Requirements
15 | * Python 3
16 | * [PyTorch](http://pytorch.org/)
17 | * [Visdom](https://github.com/facebookresearch/visdom)
18 | * [OpenAI baselines](https://github.com/openai/baselines)
19 |
20 | ## Acknowledgements
21 | This repo was initially copied from Ilya Kostrikov's PyTorch baselines repo, https://github.com/ikostrikov/pytorch-a2c-ppo-acktr. We modify many of the original files and add new ones as needed, and we continue to use it for our baselines; see that repository for more documentation on them. Please also cite this repository in your publications if you make use of this code:
22 |
23 | @misc{pytorchrl,
24 | author = {Kostrikov, Ilya},
25 | title = {PyTorch Implementations of Reinforcement Learning Algorithms},
26 | year = {2018},
27 | publisher = {GitHub},
28 | journal = {GitHub repository},
29 | howpublished = {\url{https://github.com/ikostrikov/pytorch-a2c-ppo-acktr}},
30 | }
31 |
32 |
33 |
--------------------------------------------------------------------------------
/options/visualization/all_episode.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | plot_keys: # Values we want to plot (and necessary info to plot them)
3 | delta_state_ep:
4 | log_name: delta_state
5 | data_src: episode_monitor
6 | data_type: array
7 | display_type: elementwise
8 | bin_size: 100
9 | smooth: 0
10 | delta_obs_ep:
11 | log_name: delta_obs
12 | data_src: episode_monitor
13 | data_type: array
14 | display_type: elementwise
15 | bin_size: 100
16 | smooth: 0
17 | mean_action_ep:
18 | log_name: mean_action
19 | data_src: episode_monitor
20 | data_type: array
21 | display_type: elementwise
22 | bin_size: 100
23 | smooth: 0
24 | episode_len_ep:
25 | log_name: episode_len
26 | data_src: episode_monitor
27 | data_type: scalar
28 | bin_size: 100
29 | smooth: 0
30 | rewards_ep:
31 | log_name:
32 | reward_env: True
33 | reward_exp: False
34 | reward_run: False
35 | reward_ctrl: False
36 | reward_contact: False
37 | reward_survive: False
38 | reward_forward: False
39 | reward_move: False
40 | reward_cycle: False
41 | reward_cycle_s: False
42 | reward_cycle_a: False
43 | data_src: episode_monitor
44 | data_type: multiscalar
45 | bin_size: 100
46 | smooth: 0
47 |
--------------------------------------------------------------------------------
/options/visualization/state_reward_alg.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | alg_monitor_str: Alg.Monitor.csv # Basename of Alg monitor
3 | plot_keys: # Values we want to plot (and necessary info to plot them)
4 | delta_state_ep:
5 | log_name: delta_state
6 | data_src: episode_monitor
7 | data_type: array
8 | display_type: elementwise_subset
9 | start_ind: 0
10 | end_ind: 1
11 | bin_size: 100
12 | smooth: 1
13 | rewards_ep:
14 | log_name:
15 | reward_env: True
16 | reward_exp: False
17 | reward_run: False
18 | reward_ctrl: False
19 | reward_contact: False
20 | reward_survive: False
21 | reward_forward: False
22 | reward_move: False
23 | reward_cycle: False
24 | reward_cycle_s: False
25 | reward_cycle_a: False
26 | data_src: episode_monitor
27 | data_type: multiscalar
28 | bin_size: 100
29 | smooth: 1
30 | value_loss:
31 | log_name: value_loss
32 | data_src: alg_monitor
33 | data_type: scalar
34 | bin_size: 1
35 | smooth: 0
36 | action_loss:
37 | log_name: action_loss
38 | data_src: alg_monitor
39 | data_type: scalar
40 | bin_size: 1
41 | smooth: 0
42 | dist_entropy:
43 | log_name: dist_entropy
44 | data_src: alg_monitor
45 | data_type: scalar
46 | bin_size: 1
47 | smooth: 0
48 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Swap files
2 | *.sw*
3 | *.sv*
4 | *.su*
5 | *.st*
6 |
7 | # Temp ckpt folders
8 | ckpt*/
9 | slurm_logs/
10 | results/
11 | logs/
12 | sac-hierarchy/
13 | eval_runs/
14 |
15 | # macOS Finder metadata
16 | .DS_Store
17 |
18 | # Byte-compiled / optimized / DLL files
19 | __pycache__/
20 | *.py[cod]
21 | *$py.class
22 |
23 | # C extensions
24 | *.so
25 |
26 | # Distribution / packaging
27 | .Python
28 | build/
29 | develop-eggs/
30 | dist/
31 | downloads/
32 | eggs/
33 | .eggs/
34 | lib/
35 | lib64/
36 | parts/
37 | sdist/
38 | var/
39 | wheels/
40 | *.egg-info/
41 | .installed.cfg
42 | *.egg
43 | MANIFEST
44 |
45 | # PyInstaller
46 | # Usually these files are written by a python script from a template
47 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
48 | *.manifest
49 | *.spec
50 |
51 | # Installer logs
52 | pip-log.txt
53 | pip-delete-this-directory.txt
54 |
55 | # Unit test / coverage reports
56 | htmlcov/
57 | .tox/
58 | .coverage
59 | .coverage.*
60 | .cache
61 | nosetests.xml
62 | coverage.xml
63 | *.cover
64 | .hypothesis/
65 | .pytest_cache/
66 |
67 | # Translations
68 | *.mo
69 | *.pot
70 |
71 | # Django stuff:
72 | *.log
73 | local_settings.py
74 | db.sqlite3
75 |
76 | # Flask stuff:
77 | instance/
78 | .webassets-cache
79 |
80 | # Scrapy stuff:
81 | .scrapy
82 |
83 | # Sphinx documentation
84 | docs/_build/
85 |
86 | # PyBuilder
87 | target/
88 |
89 | # Jupyter Notebook
90 | .ipynb_checkpoints
91 |
92 | # pyenv
93 | .python-version
94 |
95 | # celery beat schedule file
96 | celerybeat-schedule
97 |
98 | # SageMath parsed files
99 | *.sage.py
100 |
101 | # Environments
102 | .env
103 | .venv
104 | env/
105 | venv/
106 | ENV/
107 | env.bak/
108 | venv.bak/
109 |
110 | # Spyder project settings
111 | .spyderproject
112 | .spyproject
113 |
114 | # Rope project settings
115 | .ropeproject
116 |
117 | # mkdocs documentation
118 | /site
119 |
120 | trained_models/
121 | .fuse_hidden*
122 |
--------------------------------------------------------------------------------
/distributions.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from utils import init, init_normc_, AddBias
7 |
8 | """
9 | Modify standard PyTorch distributions so they are compatible with this code.
10 | """
11 |
12 | FixedCategorical = torch.distributions.Categorical
13 |
14 | old_sample = FixedCategorical.sample
15 | FixedCategorical.sample = lambda self: old_sample(self).unsqueeze(-1)
16 |
17 | log_prob_cat = FixedCategorical.log_prob
18 | FixedCategorical.log_probs = lambda self, actions: log_prob_cat(self, actions.squeeze(-1)).unsqueeze(-1)
19 |
20 | FixedCategorical.mode = lambda self: self.probs.argmax(dim=1, keepdim=True)
21 |
22 | FixedNormal = torch.distributions.Normal
23 | log_prob_normal = FixedNormal.log_prob
24 | FixedNormal.log_probs = lambda self, actions: log_prob_normal(self, actions).sum(-1, keepdim=True)
25 |
26 | entropy = FixedNormal.entropy
27 | FixedNormal.entropy = lambda self: entropy(self).sum(-1)
28 |
29 | FixedNormal.mode = lambda self: self.mean
30 |
31 |
32 | class Categorical(nn.Module):
33 | def __init__(self, num_inputs, num_outputs):
34 | super(Categorical, self).__init__()
35 |
36 | init_ = lambda m: init(m,
37 | nn.init.orthogonal_,
38 | lambda x: nn.init.constant_(x, 0),
39 | gain=0.01)
40 |
41 | self.linear = init_(nn.Linear(num_inputs, num_outputs))
42 |
43 | def forward(self, x):
44 | x = self.linear(x)
45 | return FixedCategorical(logits=x)
46 |
47 |
48 | class DiagGaussian(nn.Module):
49 | def __init__(self, num_inputs, num_outputs):
50 | super(DiagGaussian, self).__init__()
51 |
52 | init_ = lambda m: init(m,
53 | init_normc_,
54 | lambda x: nn.init.constant_(x, 0))
55 |
56 | self.fc_mean = init_(nn.Linear(num_inputs, num_outputs))
57 | self.logstd = AddBias(torch.zeros(num_outputs))
58 |
59 | def forward(self, x):
60 | action_mean = self.fc_mean(x)
61 |
62 | # An ugly hack for my KFAC implementation.
63 | zeros = torch.zeros(action_mean.size())
64 | if x.is_cuda:
65 | zeros = zeros.cuda()
66 |
67 | action_logstd = self.logstd(zeros)
68 | return FixedNormal(action_mean, action_logstd.exp())
--------------------------------------------------------------------------------
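A shape-checking sketch for the distribution heads above (not from the repo; it assumes the repo root is on the Python path so `utils.py` resolves, and the feature/action sizes are arbitrary):

```python
import torch
from distributions import Categorical, DiagGaussian

feats = torch.randn(8, 64)                 # a batch of 8 actor features

cont_head = DiagGaussian(num_inputs=64, num_outputs=6)
dist = cont_head(feats)                    # FixedNormal with a learned log-std bias
actions = dist.sample()                    # (8, 6)
log_probs = dist.log_probs(actions)        # summed over action dims -> (8, 1)
entropy = dist.entropy()                   # summed over action dims -> (8,)

disc_head = Categorical(num_inputs=64, num_outputs=4)
disc_actions = disc_head(feats).sample()   # (8, 1) thanks to the patched sample()
```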
/environments/geom_utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import pdb
4 |
5 | # Convert to/from quaternion
6 | def quaternion_to_euler_angle(quart):
7 | w = quart[0]
8 | x = quart[1]
9 | y = quart[2]
10 | z = quart[3]
11 |
12 | ysqr = y * y
13 |
14 | t0 = +2.0 * (w * x + y * z)
15 | t1 = +1.0 - 2.0 * (x * x + ysqr)
16 | roll = math.atan2(t0, t1)
17 |
18 | t2 = +2.0 * (w * y - z * x)
19 | t2 = +1.0 if t2 > +1.0 else t2
20 | t2 = -1.0 if t2 < -1.0 else t2
21 | pitch = math.asin(t2)
22 |
23 | t3 = +2.0 * (w * z + x * y)
24 | t4 = +1.0 - 2.0 * (ysqr + z * z)
25 | yaw = math.atan2(t3, t4)
26 |
27 | return roll, pitch, yaw
28 |
29 | def euler_angle_to_quaternion(roll, pitch, yaw):
30 |     cy = math.cos(yaw * 0.5)
31 |     sy = math.sin(yaw * 0.5)
32 |     cr = math.cos(roll * 0.5)
33 |     sr = math.sin(roll * 0.5)
34 |     cp = math.cos(pitch * 0.5)
35 |     sp = math.sin(pitch * 0.5)
36 |
37 | w = cy * cr * cp + sy * sr * sp
38 | x = cy * sr * cp - sy * cr * sp
39 | y = cy * cr * sp + sy * sr * cp
40 | z = sy * cr * cp - cy * sr * sp
41 |
42 | return w, x, y, z
43 |
44 | # Angle to unit vector
45 | def angle_to_unit(angle):
46 | x = math.cos(angle)
47 | y = math.sin(angle)
48 | return np.array([x, y])
49 |
50 | # Unit vector to angle
51 | def unit_to_angle(v):
52 | x = v[0]
53 | y = v[1]
54 |     angle = math.atan2(y, x)
55 |     return angle
56 | # Average angles (do this by averaging unit vectors)
57 | def average_angles(angles):
58 | # Convert to unit vectors and average
59 | unit_vecs = [angle_to_unit(a) for a in angles]
60 | avg_dir = np.mean(unit_vecs, 0)
61 |
62 | # Return direction of the average unit vector
63 | avg_angle = math.atan2(avg_dir[1], avg_dir[0])
64 | return avg_angle
65 |
66 | # Convert angle to an egocentric coordinate
67 | def convert_to_egocentric(ego_to_global_angle, global_angle):
68 | # ego_to_global_angle - the angle of the agent in the global coordinate system
69 | # global_angle - the angle (rad) in global coordinates we want to be egocentric
70 | ego_angle = global_angle - ego_to_global_angle
71 | if ego_angle > math.pi:
72 | ego_angle -= 2*math.pi
73 | elif ego_angle < -math.pi:
74 | ego_angle += 2*math.pi
75 |
76 | return ego_angle
77 |
78 | # Convert vector to an egocentric coordinate
79 | def convert_vector_to_egocentric(ego_to_global_angle, vector):
80 | #pdb.set_trace()
81 | # Get magnitude and direction
82 | xy_mag = np.linalg.norm(vector)
83 | xy_angle = math.atan2(vector[1], vector[0])
84 |
85 | # Change direction to egocentric
86 | xy_angle = convert_to_egocentric(ego_to_global_angle, xy_angle)
87 |
88 | # Reform the vector
89 | x = xy_mag * math.cos(xy_angle)
90 | y = xy_mag * math.sin(xy_angle)
91 | ego_vec = np.array([x, y])
92 |
93 | return ego_vec
94 |
--------------------------------------------------------------------------------
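A quick round-trip check of the conversions above (values are illustrative; importing the `environments` package assumes gym is installed, since `environments/__init__.py` registers environments on import):

```python
import math
from environments import geom_utils

# Euler -> quaternion -> Euler round trip (angles in radians)
quat = geom_utils.euler_angle_to_quaternion(roll=0.1, pitch=0.2, yaw=0.3)
roll, pitch, yaw = geom_utils.quaternion_to_euler_angle(quat)

# A global heading of pi/2 seen by an agent facing pi/4 is pi/4 egocentrically
ego = geom_utils.convert_to_egocentric(math.pi / 4, math.pi / 2)

# Averaging two headings that straddle the +/-pi wrap-around gives ~pi, not 0
avg = geom_utils.average_angles([math.pi - 0.1, -math.pi + 0.1])
```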
/options/visualization/mujoco_verbose.yaml:
--------------------------------------------------------------------------------
1 | episode_monitor_str: Episode.Monitor.csv # Basename of Episode monitor
2 | step_monitor_str: Last.Step.Monitor.csv # Name of step monitor (for last step)
3 | plot_keys: # Values we want to plot (and necessary info to plot them)
4 | delta_state_ep:
5 | log_name: delta_state
6 | data_src: episode_monitor
7 | data_type: array
8 | display_type: elementwise
9 | bin_size: 100
10 | smooth: 0
11 | delta_obs_ep:
12 | log_name: delta_obs
13 | data_src: episode_monitor
14 | data_type: array
15 | display_type: elementwise
16 | bin_size: 100
17 | smooth: 0
18 | mean_action_ep:
19 | log_name: mean_action
20 | data_src: episode_monitor
21 | data_type: array
22 | display_type: elementwise
23 | bin_size: 100
24 | smooth: 0
25 | episode_len_ep:
26 | log_name: episode_len
27 | data_src: episode_monitor
28 | data_type: scalar
29 | bin_size: 100
30 | smooth: 0
31 | rewards_ep:
32 | log_name:
33 | reward_env: True
34 | reward_exp: False
35 | reward_run: False
36 | reward_ctrl: False
37 | reward_contact: False
38 | reward_survive: False
39 | reward_forward: False
40 | reward_move: False
41 | reward_cycle: False
42 | reward_cycle_s: False
43 | reward_cycle_a: False
44 | data_src: episode_monitor
45 | data_type: multiscalar
46 | bin_size: 100
47 | smooth: 0
48 | obs_step:
49 | log_name: obs
50 | data_src: step_monitor
51 | update_delay: 10
52 | data_type: array
53 | display_type: elementwise
54 | bin_size: 1
55 | smooth: 0
56 | action_step:
57 | log_name: action
58 | data_src: step_monitor
59 | update_delay: 10
60 | data_type: array
61 | display_type: elementwise
62 | bin_size: 1
63 | smooth: 0
64 | env_count_step:
65 | log_name: env_count
66 | data_src: step_monitor
67 | update_delay: 10
68 | data_type: scalar
69 | bin_size: 1
70 | smooth: 0
71 | episode_count_step:
72 | log_name: episode_count
73 | data_src: step_monitor
74 | update_delay: 10
75 | data_type: single_value
76 | reward_step:
77 | log_name:
78 | reward_env: True
79 | reward_exp: False
80 | reward_run: False
81 | reward_ctrl: False
82 | reward_contact: False
83 | reward_survive: False
84 | reward_forward: False
85 | reward_move: False
86 | reward_cycle: False
87 | reward_cycle_s: False
88 | reward_cycle_a: False
89 | data_src: step_monitor
90 | update_delay: 10
91 | data_type: multiscalar
92 | bin_size: 1
93 | smooth: 0
94 |
--------------------------------------------------------------------------------
/environments/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 |
3 | register(
4 | id='ExplorerAnt-v2',
5 | entry_point='environments.explorer_ant:LowlevelAntEnv',
6 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
7 | )
8 |
9 | register(
10 | id='ExplorerAntLowGear-v2',
11 | entry_point='environments.explorer_ant:LowlevelAntLowGearEnv',
12 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
13 | )
14 |
15 | register(
16 | id='ExplorerProprioceptiveHumanoid-v2',
17 | entry_point='environments.explorer_humanoid:LowlevelProprioceptiveHumanoidEnv',
18 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
19 | )
20 |
21 | register(
22 | id='HierarchicalAnt-v2',
23 | entry_point='environments.explorer_ant:HierarchyAntEnv',
24 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
25 | )
26 |
27 | register(
28 | id='MyMazeDebugEnv-v2',
29 | entry_point='environments.maze_ant:ShowMazeEnv',
30 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
31 | )
32 |
33 | register(
34 | id='AntNavigateEnv-v2',
35 | entry_point='environments.maze_ant:AntNavigateEnv',
36 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
37 | )
38 |
39 | register(
40 | id='AntNavigateEnv10k-v2',
41 | entry_point='environments.maze_ant:AntNavigateEnv',
42 | tags={'wrapper_config.TimeLimit.max_episode_steps': 10000},
43 | )
44 |
45 | register(
46 | id='AntNavigateLowGearEnv-v2',
47 | entry_point='environments.maze_ant:AntNavigateLowGearEnv',
48 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
49 | )
50 |
51 | register(
52 | id='AntCrossMazeEnv-v2',
53 | entry_point='environments.maze_ant:AntCrossMazeEnv',
54 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
55 | )
56 |
57 | register(
58 | id='AntTMazeEnv-v2',
59 | entry_point='environments.maze_ant:AntTMazeEnv',
60 | tags={'wrapper_config.TimeLimit.max_episode_steps': 2000},
61 | )
62 |
63 | register(
64 | id='AntTMazeEnv10k-v2',
65 | entry_point='environments.maze_ant:AntTMazeEnv',
66 | tags={'wrapper_config.TimeLimit.max_episode_steps': 10000},
67 | )
68 |
69 | register(
70 | id='AntSkullMazeEnv-v2',
71 | entry_point='environments.maze_ant:AntSkullMazeEnv',
72 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
73 | )
74 |
75 | register(
76 | id='AntDebugMazeEnv-v2',
77 | entry_point='environments.maze_ant:DebugAntMazeEnv',
78 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
79 | )
80 |
81 | register(
82 | id='AntDebugMazeLeftEnv-v2',
83 | entry_point='environments.maze_ant:DebugAntMazeLeftEnv',
84 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
85 | )
86 |
87 | register(
88 | id='AntDebugMazeRightEnv-v2',
89 | entry_point='environments.maze_ant:DebugAntMazeRightEnv',
90 | tags={'wrapper_config.TimeLimit.max_episode_steps': 1000},
91 | )
92 |
93 | register(
94 | id='ProprioceptiveHumanoidSmallCrossMazeEnv10k-v2',
95 | entry_point='environments.maze_humanoid:ProprioceptiveHumanoidSmallCrossMazeEnv',
96 | tags={'wrapper_config.TimeLimit.max_episode_steps': 10000},
97 | )
98 |
99 |
--------------------------------------------------------------------------------
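The registrations above are exercised through the standard gym API; a minimal sketch (assumes the full repo, which includes environment modules not shown in this excerpt, plus gym and a working MuJoCo / mujoco-py install, and uses the older gym reset/step conventions this codebase was written against):

```python
import gym
import environments  # noqa: F401  (importing runs the register() calls above)

env = gym.make('ExplorerAnt-v2')
obs = env.reset()
for _ in range(10):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()
```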
/algo/a2c_acktr.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.optim as optim
4 |
5 | from .kfac import KFACOptimizer
6 |
7 | class A2C_ACKTR(object):
8 | def __init__(self,
9 | actor_critic,
10 | value_loss_coef,
11 | entropy_coef,
12 | lr=None,
13 | eps=None,
14 | alpha=None,
15 | max_grad_norm=None,
16 | acktr=False):
17 |
18 | self.actor_critic = actor_critic
19 | self.acktr = acktr
20 |
21 | self.value_loss_coef = value_loss_coef
22 | self.entropy_coef = entropy_coef
23 |
24 | self.max_grad_norm = max_grad_norm
25 |
26 | if acktr:
27 | self.optimizer = KFACOptimizer(actor_critic)
28 | else:
29 | self.optimizer = optim.RMSprop(
30 | actor_critic.parameters(), lr, eps=eps, alpha=alpha)
31 |
32 | # Generate a state_dict object
33 | def state_dict(self):
34 | ckpt = {}
35 | ckpt['model'] = self.actor_critic.state_dict()
36 | ckpt['optim'] = self.optimizer.state_dict()
37 | return ckpt
38 |
39 | # Load from a state dict
40 | def load_state_dict(self, ckpt):
41 | self.actor_critic.load_state_dict(ckpt['model'])
42 | self.optimizer.load_state_dict(ckpt['optim'])
43 |
44 | # Update policy
45 | def update(self, rollouts):
46 | obs_shape = rollouts.observations.size()[2:]
47 | action_shape = rollouts.actions.size()[-1]
48 | num_steps, num_processes, _ = rollouts.rewards.size()
49 |
50 | values, action_log_probs, dist_entropy, states = self.actor_critic.evaluate_actions(
51 | rollouts.observations[:-1].view(-1, *obs_shape),
52 | rollouts.states[0].view(-1, self.actor_critic.state_size),
53 | rollouts.masks[:-1].view(-1, 1),
54 | rollouts.actions.view(-1, action_shape))
55 |
56 | values = values.view(num_steps, num_processes, 1)
57 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1)
58 |
59 | advantages = rollouts.returns[:-1] - values
60 | value_loss = advantages.pow(2).mean()
61 |
62 | action_loss = -(advantages.detach() * action_log_probs).mean()
63 |
64 | if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
65 | # Sampled fisher, see Martens 2014
66 | self.actor_critic.zero_grad()
67 | pg_fisher_loss = -action_log_probs.mean()
68 |
69 | value_noise = torch.randn(values.size())
70 | if values.is_cuda:
71 | value_noise = value_noise.cuda()
72 |
73 | sample_values = values + value_noise
74 | vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()
75 |
76 | fisher_loss = pg_fisher_loss + vf_fisher_loss
77 | self.optimizer.acc_stats = True
78 | fisher_loss.backward(retain_graph=True)
79 | self.optimizer.acc_stats = False
80 |
81 | self.optimizer.zero_grad()
82 | (value_loss * self.value_loss_coef + action_loss -
83 | dist_entropy * self.entropy_coef).backward()
84 |
85 |         if not self.acktr:
86 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
87 | self.max_grad_norm)
88 |
89 | self.optimizer.step()
90 |
91 | return value_loss.item(), action_loss.item(), dist_entropy.item()
92 |
--------------------------------------------------------------------------------
/options/baseline_lowlevel/baseline_simplemlp_pretrain_any.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 10 # Number of ppo epochs
9 |   num_mini_batch: 32 # Number of mini-batches for PPO
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: baseline_lowlevel # Mode is baseline with theta
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | env:
29 | gamma: 0.99 # Discount factor for rewards
30 | num_stack: 1 # Number of frames to stack
31 | add_timestep: False # Add timestep to observations
32 | known_reset: False # Reset to known position
33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
34 | time_scale: 0.001 # What to multiply timestep by for AC input
35 | theta_space_mode: pretrain_any # What theta mode we're in
36 | theta_reset_mode: never # When to change theta
37 | theta_reward_mode: lax # How to punish perpendicular movement
38 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
39 | theta_memory_lookback: 10 # How far to look back for reference global theta
40 | time_limit: 1000 # When to end an episode
41 | reward_shape_type: instant
42 | logs:
43 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
44 | exp_name: baseline_simplemlp_pretrain_any # Unique experiment name
45 | log_interval: 1 # Log interval, one log per n updates
46 | save_interval: 100000000000 # Save interval, one per n updates
47 | vis_interval: 1 # Vis interval, one log per n updates
48 | optim_ppo:
49 | lr: 0.0003 # Learning rate
50 |   eps: 0.00001 # RMSprop optimizer epsilon
51 | alpha: 0.99 # RMSprop optimizer alpha
52 | max_grad_norm: 0.5 # Max norm of gradients
53 | num_frames: 2000000 # Number of frames to train
54 | optim_a2c:
55 | lr: 0.0007 # Learning rate
56 |   eps: 0.00001 # RMSprop optimizer epsilon
57 | alpha: 0.99 # RMSprop optimizer alpha
58 | max_grad_norm: 0.5 # Max norm of gradients
59 | num_frames: 2000000 # Number of frames to train
60 |
--------------------------------------------------------------------------------
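The `alg_ppo` and `optim_ppo` blocks above line up with the constructor of `algo/ppo.py`. A hypothetical wiring sketch (the `nn.Linear` stand-in replaces the real actor-critic, `algo/kfac.py` and `algo/dqn.py` from the full repo must be importable, and the repo's actual entry-point script, not shown here, may differ):

```python
import yaml
import torch.nn as nn
from algo.ppo import PPO

with open('options/baseline_lowlevel/baseline_simplemlp_pretrain_any.yaml') as f:
    opt = yaml.safe_load(f)
ppo_cfg, optim_cfg = opt['alg_ppo'], opt['optim_ppo']

actor_critic = nn.Linear(4, 2)  # hypothetical stand-in with parameters to optimize
agent = PPO(actor_critic,
            clip_param=ppo_cfg['clip_param'],
            ppo_epoch=ppo_cfg['ppo_epoch'],
            num_mini_batch=ppo_cfg['num_mini_batch'],
            value_loss_coef=ppo_cfg['value_loss_coef'],
            entropy_coef=ppo_cfg['entropy_coef'],
            lr=optim_cfg['lr'],
            eps=optim_cfg['eps'],
            max_grad_norm=optim_cfg['max_grad_norm'])
```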
/hier_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import collections
4 | import numpy as np
5 | import math
6 | import pdb
7 |
8 | # Class that handles all the messy hierarchical observation stuff
9 | class HierarchyUtils(object):
10 | def __init__(self, ll_obs_sz, hl_obs_sz, hl_action_space, theta_sz, add_count):
11 | self.ll_obs_sz = ll_obs_sz
12 | if add_count:
13 | self.ll_raw_obs_sz = [self.ll_obs_sz[0] - theta_sz - 1]
14 | else:
15 | self.ll_raw_obs_sz = [self.ll_obs_sz[0] - theta_sz]
16 | self.hl_obs_sz = hl_obs_sz
17 | self.theta_sz = theta_sz
18 | self.hl_action_space = hl_action_space
19 | self.add_count = add_count
20 |
21 |     # Separate out the high-level obs, low-level obs, and counts
22 | def seperate_obs(self, obs):
23 | ll_raw_obs = obs[:, :self.ll_raw_obs_sz[0]]
24 | assert(ll_raw_obs.shape[-1] == self.ll_raw_obs_sz[0])
25 | hl_obs = obs[:, self.ll_raw_obs_sz[0]:-1]
26 | assert(hl_obs.shape[-1] == self.hl_obs_sz[0])
27 | count = obs[:, -1]
28 | return hl_obs, ll_raw_obs, count
29 |
30 | # Append theta and count to ll obs
31 | def append_theta(self, ll_raw_obs, hl_action, counts):
32 | # Get theta
33 | if self.hl_action_space.__class__.__name__ == 'Discrete':
34 | assert(self.theta_sz == self.hl_action_space.n)
35 | thetas = np.zeros([len(hl_action), self.theta_sz])
36 | for e, act in enumerate(hl_action):
37 | thetas[e, act] = 1
38 | else:
39 | thetas = hl_action
40 |
41 |         # Concatenate
42 | if self.add_count:
43 | if len(counts.shape) != len(ll_raw_obs.shape):
44 | counts = np.expand_dims(counts, axis=1)
45 | ll_obs = np.concatenate([ll_raw_obs, thetas, counts], 1)
46 | else:
47 | ll_obs = np.concatenate([ll_raw_obs, thetas], 1)
48 | assert(ll_obs.shape[-1] == self.ll_obs_sz[0])
49 |
50 | return ll_obs
51 |
52 | # Append placeholder theta and count to ll obs
53 | def placeholder_theta(self, ll_raw_obs, counts):
54 | thetas = float('inf') * np.ones([len(ll_raw_obs), self.theta_sz])
55 |
56 |         # Concatenate
57 | if self.add_count:
58 | if len(counts.shape) != len(ll_raw_obs.shape):
59 | counts = np.expand_dims(counts, axis=1)
60 | ll_obs = np.concatenate([ll_raw_obs, thetas, counts], 1)
61 | else:
62 | ll_obs = np.concatenate([ll_raw_obs, thetas], 1)
63 | assert(ll_obs.shape[-1] == self.ll_obs_sz[0])
64 |
65 | return ll_obs
66 |
67 | # Update ll_obs to remove placeholders
68 | def update_theta(self, ll_obs, hl_action):
69 |         # Take in a single obs and high-level action and replace the placeholder
70 | assert(self.has_placeholder(ll_obs))
71 | assert(ll_obs.shape == self.ll_obs_sz)
72 |
73 | # Get theta
74 | if self.hl_action_space.__class__.__name__ == 'Discrete':
75 | assert(self.theta_sz == self.hl_action_space.n)
76 | theta = torch.zeros(self.theta_sz)
77 | theta[hl_action] = 1
78 | else:
79 | theta = torch.from_numpy(hl_action)
80 |
81 | # Update observation with theta
82 | if self.add_count:
83 | ll_obs[self.ll_raw_obs_sz[0]:-1] = theta
84 | else:
85 | ll_obs[self.ll_raw_obs_sz[0]:] = theta
86 | assert(not self.has_placeholder(ll_obs))
87 | return ll_obs
88 |
89 | # Check if ll_obs has a placeholder
90 | def has_placeholder(self, ll_obs):
91 | if float('inf') in ll_obs:
92 | return True
93 | else:
94 | return False
95 |
96 |
--------------------------------------------------------------------------------
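A small sketch of `HierarchyUtils` with a discrete high-level action space (not from the repo; the sizes 27/4/10 and the batch contents are made up for illustration, and gym plus torch must be installed):

```python
import numpy as np
from gym.spaces import Discrete
from hier_utils import HierarchyUtils

# Low-level obs = 27 raw dims + 4-dim one-hot theta + 1 step count = 32
hier = HierarchyUtils(ll_obs_sz=[32], hl_obs_sz=[10],
                      hl_action_space=Discrete(4), theta_sz=4, add_count=True)

raw = np.zeros((2, 27))              # batch of 2 raw low-level observations
counts = np.array([5.0, 6.0])

ll_obs = hier.append_theta(raw, hl_action=[1, 3], counts=counts)  # -> (2, 32)
ph_obs = hier.placeholder_theta(raw, counts)                      # theta slots = inf
assert hier.has_placeholder(ph_obs[0]) and not hier.has_placeholder(ll_obs[0])
```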
/options/baseline_lowlevel/baseline_simplemlp_skip_hs16_pretrain_any_20M.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 10 # Number of ppo epochs
9 |   num_mini_batch: 32 # Number of mini-batches for PPO
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 |   mode: baseline_lowlevel # Mode is baseline with theta
24 | hid_sz: 16 # MLP hidden size
25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | env:
29 | gamma: 0.99 # Discount factor for rewards
30 | num_stack: 1 # Number of frames to stack
31 | add_timestep: False # Add timestep to observations
32 | known_reset: False # Reset to known position
33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
34 | time_scale: 0.001 # What to multiply timestep by for AC input
35 | theta_space_mode: pretrain_any # What theta mode we're in
36 | theta_reset_mode: never # When to change theta
37 | theta_reward_mode: lax # How to punish perpendicular movement
38 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
39 | theta_memory_lookback: 10 # How far to look back for reference global theta
40 | time_limit: 1000 # When to end an episode
41 | reward_shape_type: instant
42 | logs:
43 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
44 | exp_name: baseline_simplemlp_skip_hs16_pretrain_any_20M # Unique experiment name
45 | log_interval: 1 # Log interval, one log per n updates
46 | save_interval: 100000000000 # Save interval, one per n updates
47 | vis_interval: 1 # Vis interval, one log per n updates
48 | optim_ppo:
49 | lr: 0.0003 # Learning rate
50 |   eps: 0.00001 # RMSprop optimizer epsilon
51 | alpha: 0.99 # RMSprop optimizer alpha
52 | max_grad_norm: 0.5 # Max norm of gradients
53 | num_frames: 20000000 # Number of frames to train
54 | optim_a2c:
55 | lr: 0.0007 # Learning rate
56 |   eps: 0.00001 # RMSprop optimizer epsilon
57 | alpha: 0.99 # RMSprop optimizer alpha
58 | max_grad_norm: 0.5 # Max norm of gradients
59 | num_frames: 20000000 # Number of frames to train
60 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import collections
4 | import numpy as np
5 | import math
6 | import pdb
7 |
8 | # Necessary for my KFAC implementation.
9 | class AddBias(nn.Module):
10 | def __init__(self, bias):
11 | super(AddBias, self).__init__()
12 | self._bias = nn.Parameter(bias.unsqueeze(1))
13 |
14 | def forward(self, x):
15 | if x.dim() == 2:
16 | bias = self._bias.t().view(1, -1)
17 | else:
18 | bias = self._bias.t().view(1, -1, 1, 1)
19 |
20 | return x + bias
21 |
22 | # Rolling average
23 | class RollingAverage(object):
24 | def __init__(self, window_sz):
25 | self.window_sz = window_sz
26 | self.data = collections.deque()
27 | self.sum = None
28 |
29 | # Append item and update sum and data struct
30 | def append(self, item):
31 | assert(type(item) is np.ndarray)
32 | # If full, pop left and remove remove from average
33 | if len(self.data) == self.window_sz:
34 | removed = self.data.popleft()
35 | self.sum -= removed
36 |
37 | # Update sum with new item and add to data
38 | if len(self.data) == 0:
39 |             self.sum = item.copy()  # copy so in-place updates below don't mutate the stored item
40 | else:
41 | self.sum += item
42 | self.data.append(item)
43 | assert(len(self.data) <= self.window_sz)
44 |
45 | # Return the average value
46 | def average(self):
47 | # Exception if list is empty
48 | if len(self.data) == 0:
49 | raise Exception("Can't compute rolling average on empty list")
50 |
51 | # Return average
52 | return self.sum / len(self.data)
53 |
54 | # Convert to/from quaternion
55 | def quaternion_to_euler_angle(w, x, y, z):
56 | ysqr = y * y
57 |
58 | t0 = +2.0 * (w * x + y * z)
59 | t1 = +1.0 - 2.0 * (x * x + ysqr)
60 | roll = math.atan2(t0, t1)
61 |
62 | t2 = +2.0 * (w * y - z * x)
63 | t2 = +1.0 if t2 > +1.0 else t2
64 | t2 = -1.0 if t2 < -1.0 else t2
65 | pitch = math.asin(t2)
66 |
67 | t3 = +2.0 * (w * z + x * y)
68 | t4 = +1.0 - 2.0 * (ysqr + z * z)
69 | yaw = math.atan2(t3, t4)
70 |
71 | return roll, pitch, yaw
72 |
73 | def euler_angle_to_quaternion(roll, pitch, yaw):
74 |     cy = math.cos(yaw * 0.5)
75 |     sy = math.sin(yaw * 0.5)
76 |     cr = math.cos(roll * 0.5)
77 |     sr = math.sin(roll * 0.5)
78 |     cp = math.cos(pitch * 0.5)
79 |     sp = math.sin(pitch * 0.5)
80 |
81 | w = cy * cr * cp + sy * sr * sp
82 | x = cy * sr * cp - sy * cr * sp
83 | y = cy * cr * sp + sy * sr * cp
84 | z = sy * cr * cp - cy * sr * sp
85 |
86 | return w, x, y, z
87 |
88 | # Angle to unit vector
89 | def angle_to_unit(angle):
90 | x = math.cos(angle)
91 | y = math.sin(angle)
92 | return np.array([x, y])
93 |
94 | # Unit vector to angle
95 | def unit_to_angle(v):
96 | x = v[0]
97 | y = v[1]
98 |     angle = math.atan2(y, x)
99 |     return angle
100 | # Convert angle to an egocentric coordinate (angles in radians)
101 | def convert_to_egocentric(ego_to_global_angle, global_angle):
102 | # ego_to_global_angle - the angle of the agent in the global coordinate system
103 | # global_angle - the angle (rad) in global coordinates we want to be egocentric
104 | ego_angle = global_angle - ego_to_global_angle
105 | if ego_angle > math.pi:
106 | ego_angle -= 2*math.pi
107 | elif ego_angle < -math.pi:
108 | ego_angle += 2*math.pi
109 |
110 | return ego_angle
111 |
112 | def init(module, weight_init, bias_init, gain=1):
113 | weight_init(module.weight.data, gain=gain)
114 | bias_init(module.bias.data)
115 | return module
116 |
117 | # https://github.com/openai/baselines/blob/master/baselines/common/tf_util.py#L87
118 | def init_normc_(weight, gain=1):
119 | weight.normal_(0, 1)
120 | weight *= gain / torch.sqrt(weight.pow(2).sum(1, keepdim=True))
121 |
--------------------------------------------------------------------------------
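A short usage sketch for `RollingAverage` (not from the repo; the window size and values are arbitrary):

```python
import numpy as np
from utils import RollingAverage

avg = RollingAverage(window_sz=3)
for step in range(5):
    avg.append(np.array([float(step), 2.0 * step]))

# Average over the last 3 appended vectors (steps 2, 3, 4)
print(avg.average())  # -> [3. 6.]
```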
/algo/ppo.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.optim as optim
4 | import pdb
5 |
6 | class PPO(object):
7 | def __init__(self,
8 | actor_critic,
9 | clip_param,
10 | ppo_epoch,
11 | num_mini_batch,
12 | value_loss_coef,
13 | entropy_coef,
14 | lr=None,
15 | eps=None,
16 | max_grad_norm=None):
17 |
18 | self.actor_critic = actor_critic
19 |
20 | self.clip_param = clip_param
21 | self.ppo_epoch = ppo_epoch
22 | self.num_mini_batch = num_mini_batch
23 |
24 | self.value_loss_coef = value_loss_coef
25 | self.entropy_coef = entropy_coef
26 |
27 | self.max_grad_norm = max_grad_norm
28 |
29 | self.optimizer = optim.Adam(actor_critic.parameters(), lr=lr, eps=eps)
30 |
31 | # Generate a state_dict object
32 | def state_dict(self):
33 | ckpt = {}
34 | ckpt['model'] = self.actor_critic.state_dict()
35 | ckpt['optim'] = self.optimizer.state_dict()
36 | return ckpt
37 |
38 | # Load from a state dict
39 | def load_state_dict(self, ckpt):
40 | self.actor_critic.load_state_dict(ckpt['model'])
41 | self.optimizer.load_state_dict(ckpt['optim'])
42 |
43 | # Load from pretrained (ModularPolicy)
44 | def load_pretrained_policies(self, ckpts):
45 | self.actor_critic.load_pretrained_policies(ckpts)
46 |
47 | # Update our policy network
48 | def update(self, rollouts):
49 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
50 | advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
51 |
52 | value_loss_epoch = 0
53 | action_loss_epoch = 0
54 | dist_entropy_epoch = 0
55 |
56 | for e in range(self.ppo_epoch):
57 | if self.actor_critic.is_recurrent: #hasattr(self.actor_critic.base, 'gru'):
58 | data_generator = rollouts.recurrent_generator(
59 | advantages, self.num_mini_batch)
60 | else:
61 | data_generator = rollouts.feed_forward_generator(
62 | advantages, self.num_mini_batch)
63 |
64 | for sample in data_generator:
65 | observations_batch, states_batch, actions_batch, \
66 | return_batch, masks_batch, old_action_log_probs_batch, \
67 | adv_targ = sample
68 |
69 | # Reshape to do in a single forward pass for all steps
70 | values, action_log_probs, dist_entropy, states = self.actor_critic.evaluate_actions(
71 | observations_batch, states_batch,
72 | masks_batch, actions_batch)
73 |
74 | ratio = torch.exp(action_log_probs - old_action_log_probs_batch)
75 | surr1 = ratio * adv_targ
76 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
77 | 1.0 + self.clip_param) * adv_targ
78 | action_loss = -torch.min(surr1, surr2).mean()
79 |
80 | value_loss = (return_batch - values).pow(2).mean()
81 |
82 | self.optimizer.zero_grad()
83 | (value_loss * self.value_loss_coef + action_loss -
84 | dist_entropy * self.entropy_coef).backward()
85 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
86 | self.max_grad_norm)
87 | self.optimizer.step()
88 |
89 | value_loss_epoch += value_loss.item()
90 | action_loss_epoch += action_loss.item()
91 | dist_entropy_epoch += dist_entropy.item()
92 |
93 | num_updates = self.ppo_epoch * self.num_mini_batch
94 |
95 | value_loss_epoch /= num_updates
96 | action_loss_epoch /= num_updates
97 | dist_entropy_epoch /= num_updates
98 |
99 | return value_loss_epoch, action_loss_epoch, dist_entropy_epoch
100 |
--------------------------------------------------------------------------------
/options/phase_lowlevel/phase_mlp_pretrain_any.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 10 # Number of ppo epochs
9 |   num_mini_batch: 32 # Number of mini-batches for PPO
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: phase_lowlevel # Mode is lowlevel phase
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: False
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: True # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | logs:
51 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
52 | exp_name: phase_mlp_pretrain_any_20M # Unique experiment name
53 | log_interval: 1 # Log interval, one log per n updates
54 | save_interval: 100000000000 # Save interval, one per n updates
55 | vis_interval: 1 # Vis interval, one log per n updates
56 | optim_ppo:
57 | lr: 0.0003 # Learning rate
58 |   eps: 0.00001 # RMSprop optimizer epsilon
59 | alpha: 0.99 # RMSprop optimizer alpha
60 | max_grad_norm: 0.5 # Max norm of gradients
61 | num_frames: 2000000 # Number of frames to train
62 | optim_a2c:
63 | lr: 0.0007 # Learning rate
64 |   eps: 0.00001 # RMSprop optimizer epsilon
65 | alpha: 0.99 # RMSprop optimizer alpha
66 | max_grad_norm: 0.5 # Max norm of gradients
67 | num_frames: 2000000 # Number of frames to train
68 |
--------------------------------------------------------------------------------
/options/phase_lowlevel/phase_mlp_skip_hs16_pretrain_any_20M.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 10 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: phase_lowlevel # Mode is lowlevel phase
24 | hid_sz: 16 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: True
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: True # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | logs:
51 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
52 | exp_name: phase_mlp_skip_hs16_pretrain_any_20M # Unique experiment name
53 | log_interval: 1 # Log interval, one log per n updates
54 | save_interval: 100000000000 # Save interval, one per n updates
55 | vis_interval: 1 # Vis interval, one log per n updates
56 | optim_ppo:
57 | lr: 0.0003 # Learning rate
58 | eps: 0.00001 # RMSprop optimizer epsilon
59 | alpha: 0.99 # RMSprop optimizer alpha
60 | max_grad_norm: 0.5 # Max norm of gradients
61 | num_frames: 20000000 # Number of frames to train
62 | optim_a2c:
63 | lr: 0.0007 # Learning rate
64 | eps: 0.00001 # RMSprop optimizer epsilon
65 | alpha: 0.99 # RMSprop optimizer alpha
66 | max_grad_norm: 0.5 # Max norm of gradients
67 | num_frames: 20000000 # Number of frames to train
68 |
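This config differs from the previous one mainly in hid_sz: 16, skip_layer: True, and the 20M-frame budget. The repo's MLPPretrain class is not included in this dump, so the sketch below is only one plausible reading of the phase_period / phase_hid_sz / skip_layer knobs (a phase-indexed embedding feeding a small MLP with an optional skip connection); the class and argument names are illustrative assumptions, not the project's API.

# Illustrative only: one plausible phase-conditioned MLP matching the knobs
# above (hid_sz, num_layer, phase_period, phase_hid_sz, skip_layer). The
# repo's real MLPPretrain is not shown in this dump.
import torch
import torch.nn as nn

class PhaseMLPSketch(nn.Module):
    def __init__(self, obs_dim, act_dim, hid_sz=16, num_layer=2,
                 phase_period=10, phase_hid_sz=16, skip_layer=True):
        super().__init__()
        self.phase_period = phase_period
        # One learned embedding per phase index (timestep mod phase_period)
        self.phase_embed = nn.Embedding(phase_period, phase_hid_sz)
        layers, in_sz = [], obs_dim + phase_hid_sz
        for _ in range(num_layer):
            layers += [nn.Linear(in_sz, hid_sz), nn.Tanh()]
            in_sz = hid_sz
        self.body = nn.Sequential(*layers)
        self.skip = nn.Linear(obs_dim, hid_sz) if skip_layer else None
        self.head = nn.Linear(hid_sz, act_dim)

    def forward(self, obs, timestep):
        # obs: (B, obs_dim) float tensor, timestep: (B,) long tensor
        phase = timestep.long() % self.phase_period
        h = self.body(torch.cat([obs, self.phase_embed(phase)], dim=-1))
        if self.skip is not None:
            h = h + self.skip(obs)  # skip_layer: True in the config above
        return self.head(h)

net = PhaseMLPSketch(obs_dim=27, act_dim=8)
actions = net(torch.randn(4, 27), torch.arange(4))  # shapes chosen for illustration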
--------------------------------------------------------------------------------
/wrappers.py:
--------------------------------------------------------------------------------
1 | # Modified by Kenneth Marino
2 | # VecNormalize originally copied from https://github.com/openai/baselines/
3 | from vec_env import VecEnvWrapper
4 | from baselines.common.running_mean_std import RunningMeanStd
5 | import numpy as np
6 | import pdb
7 |
8 | # From openai baselines originally
9 | # My version of this saves the unclipped/unnormalized values for logging and other purposes
10 | class ObservationFilter(VecEnvWrapper):
11 | """
12 | Vectorized environment base class
13 | """
14 | def __init__(self, venv, ob=True, ret=True, train=True, noclip=False, has_timestep=False, ignore_mask=None, freeze_mask=None, time_scale=1e-3, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
15 | VecEnvWrapper.__init__(self, venv)
16 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
17 | self.ret_rms = RunningMeanStd(shape=()) if ret else None
18 | self.clipob = clipob
19 | self.cliprew = cliprew
20 | self.ret = np.zeros(self.num_envs)
21 | self.train = train
22 | self.gamma = gamma
23 | self.epsilon = epsilon
24 | self.noclip = noclip
25 | self.ignore_mask = ignore_mask
26 | self.freeze_mask = freeze_mask
27 | self.has_timestep = has_timestep
28 | self.time_scale = time_scale
29 |
30 | def step_wait(self):
31 | """
32 | Apply sequence of actions to sequence of environments
33 | actions -> (observations, rewards, news)
34 | where 'news' is a boolean vector indicating whether each element is new.
35 | """
36 | obs, rews, news, infos = self.venv.step_wait()
37 | self.ret = self.ret * self.gamma + rews
38 | self.raw_obs = obs
39 | self.raw_rews = rews
40 | # Do filtering (but not for step_mask = 0 values)
41 | for proc, obs_proc in enumerate(obs):
42 | obs_proc = np.array([obs_proc])
43 | if self.step_mask[proc] > 0:
44 | obs_proc = self._obfilt(obs_proc)
45 | obs[proc] = obs_proc[0]
46 | if self.ret_rms:
47 | # Only update ret_rms if in training mode
48 | if self.train:
49 | self.ret_rms.update(self.ret)
50 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
51 | return obs, rews, news, infos
52 |
53 | def _obfilt(self, obs):
54 | if self.ob_rms:
55 | # Only update ob_rms if in training mode
56 | if self.train:
57 | # Use freeze mask to only update part of the ob_rms
58 | if self.freeze_mask is not None:
59 | old_obs_rms_mean = np.array(self.ob_rms.mean)
60 | old_obs_rms_var = np.array(self.ob_rms.var)
61 | self.ob_rms.update(obs)
62 | self.ob_rms.mean = old_obs_rms_mean * self.freeze_mask + self.ob_rms.mean * (1 - self.freeze_mask)
63 | self.ob_rms.var = old_obs_rms_var * self.freeze_mask + self.ob_rms.var * (1 - self.freeze_mask)
64 | else:
65 | self.ob_rms.update(obs)
66 |
67 | # Copy original obs
68 | obs_orig = np.copy(obs)
69 |
70 | # Use code from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
71 | if self.noclip:
72 | obs = (obs - self.ob_rms.mean) / (3*(np.sqrt(self.ob_rms.var) + 0.1))
73 | else:
74 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
75 |
76 | # Use ignore_mask to restore parts of obs we want to leave alone
77 | if self.ignore_mask is not None: obs = (1 - self.ignore_mask) * obs + self.ignore_mask * obs_orig  # ignore_mask defaults to None
78 |
79 | # Scale timestep
80 | if self.has_timestep:
81 | obs[:, -1] *= self.time_scale
82 |
83 | return obs
84 | else:
85 | return obs
86 |
87 |
88 |
89 | def reset(self):
90 | """
91 | Reset all environments
92 | """
93 | obs = self.venv.reset()
94 | self.raw_obs = obs
95 | return self._obfilt(obs)
96 |
97 |
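A minimal usage sketch for ObservationFilter follows, assuming a vectorized environment factory (make_my_vec_env is a hypothetical placeholder; the repo's env construction code is not in this dump). Note that step_wait reads self.step_mask, which is expected to be set on the wrapper before stepping.

# Usage sketch for ObservationFilter; make_my_vec_env is a hypothetical
# placeholder for however the vectorized env is actually built in this repo.
import numpy as np
from wrappers import ObservationFilter

venv = make_my_vec_env(num_processes=16)   # hypothetical VecEnv factory
obs_dim = venv.observation_space.shape[0]

ignore_mask = np.zeros(obs_dim)   # 1 = leave that obs dimension unnormalized
freeze_mask = np.zeros(obs_dim)   # 1 = stop updating running stats for that dimension
venv = ObservationFilter(venv, ob=True, ret=True, train=True,
                         noclip=True,              # the "/3 no clip" variant (step_plus_noclip)
                         has_timestep=True, time_scale=1e-3,
                         ignore_mask=ignore_mask, freeze_mask=freeze_mask)
venv.step_mask = np.ones(venv.num_envs)   # step_wait only filters envs with step_mask > 0
obs = venv.reset()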
--------------------------------------------------------------------------------
/environments/RewardCyclicEnv.py:
--------------------------------------------------------------------------------
1 | import os
2 | import collections
3 | import pdb
4 | import gym
5 | import time
6 | import csv
7 | import json
8 | import shutil
9 | import numpy as np
10 | import random
11 | from . import ant_env
12 | from . import explorer_ant
13 | from . import geom_utils
14 | from gym.spaces.box import Box
15 | import math
16 | import sys
17 | sys.path.append('../')
18 | from utils import RollingAverage
19 |
20 | # Wrapper that defines our environment for our low level ant policy
21 | # Everything is egocentric to the ant
22 | class RewardCyclicEnv(gym.Wrapper):
23 | def __init__(self, env=None, opt=None):
24 | super(RewardCyclicEnv, self).__init__(env)
25 |
26 | # Should be in correct mode (can be baseline or phase, but should be theta version)
27 | self.mode = opt['model']['mode']
28 | assert(self.mode in ['cyclic'])
29 |
30 | # Make sure we're using the right environment (our ant for now)
31 | assert isinstance(env.unwrapped, ant_env.BaseAntEnv)
32 |
33 | # Keep memory
34 | # Figure out sizes of external and internal (proprioceptive) state
35 | self.ex_states = []
36 | self.pro_states = []
37 | self.actions = []
38 |
39 | # Phase period
40 | self.phase_k = opt['model']['phase_period']
41 |
42 | # Params of reward
43 | self.min_movement = opt['env']['min_movement']
44 | self.survive_reward = opt['env']['survive_reward']
45 |
46 | # Step function
47 | # Does step and updates our stored values and also calculates our exploration reward
48 | def step(self, action):
49 | # Do the original step and get the environment reward (will throw some of this out)
50 | obs, true_reward, done, info = self.env.step(action)
51 |
52 | # Get the new state and step
53 | new_state_pro, new_state_ex = self.unwrapped.get_intern_extern_state()
54 |
55 | # Update the states and actions in memory
56 | self.ex_states.append(np.array(new_state_ex))
57 | self.pro_states.append(np.array(new_state_pro))
58 | self.actions.append(np.array(action))
59 | new_count = self._elapsed_steps
60 | assert(len(self.ex_states) == new_count + 1)
61 |
62 | # Determine if there was enough movement
63 | min_movement_mult = float(np.linalg.norm(new_state_pro, 2) > self.min_movement)
64 |
65 | # Get cyclic penalty
66 | if len(self.pro_states) > self.phase_k + 1:
67 | # Get last/current cycle state and actions
68 | new_s = self.pro_states[-1]
69 | old_s = self.pro_states[-(self.phase_k+1)]
70 | new_a = self.actions[-1]
71 | old_a = self.actions[-(self.phase_k+1)]
72 |
73 | # Get cyclic reward for state
74 | state_diff = np.linalg.norm(new_s - old_s, 2)
75 | state_cycle_reward = -state_diff
76 | else:
77 | state_cycle_reward = 0
78 |
79 | # Update survive
80 | info['reward_survive'] = self.survive_reward
81 | info['reward_thresh'] = min_movement_mult
82 | info['reward_cycle'] = state_cycle_reward
83 | reward = info['reward_thresh'] * (info['reward_survive'] + info['reward_cycle'])
84 | info['reward_env'] = reward
85 | #info['reward_env'] = info['reward_forward'] + info['reward_ctrl'] + info['reward_contact'] + info['reward_survive']
86 |
87 | # Return
88 | return obs, reward, done, info
89 |
90 | # Reset
91 | # Pass through and reset data
92 | def reset(self):
93 | obs = self.env.reset()
94 |
95 | # Reset our storage structures
96 | self.ex_states = []
97 | self.pro_states = []
98 | self.actions = []
99 |
100 | # Update the states and actions in memory
101 | new_state_pro, new_state_ex = self.unwrapped.get_intern_extern_state()
102 | self.ex_states.append(np.array(new_state_ex))
103 | self.pro_states.append(np.array(new_state_pro))
104 | self.actions.append(np.zeros(self.action_space.shape))
105 | assert(self._elapsed_steps == 0)
106 |
107 | return obs
108 |
109 | # Pass through _elapsed_steps
110 | @property
111 | def _elapsed_steps(self):
112 | return self.env._elapsed_steps
113 |
114 |
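For concreteness, the reward assembled in step above reduces to the standalone sketch below: the cyclic term is the negative L2 distance between the current proprioceptive state and the one phase_k steps earlier (zero until enough steps are stored), and the whole thing is gated by the minimum-movement threshold. The min_movement and survive_reward values are placeholders chosen for illustration.

# Standalone numeric sketch of the reward assembled in RewardCyclicEnv.step.
import numpy as np

phase_k = 10            # opt['model']['phase_period']
min_movement = 0.1      # opt['env']['min_movement'] (placeholder value)
survive_reward = 0.05   # opt['env']['survive_reward'] (placeholder value)

pro_states = [np.random.randn(8) for _ in range(12)]   # stored proprioceptive states

if len(pro_states) > phase_k + 1:
    # Negative L2 distance between the current state and the one phase_k steps ago
    state_cycle_reward = -np.linalg.norm(pro_states[-1] - pro_states[-(phase_k + 1)], 2)
else:
    state_cycle_reward = 0.0

min_movement_mult = float(np.linalg.norm(pro_states[-1], 2) > min_movement)
reward = min_movement_mult * (survive_reward + state_cycle_reward)
print(reward)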
--------------------------------------------------------------------------------
/options/hierarchical_final/hierarchical_many_phase.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: hierarchical_many # Mode is hierarchical many
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | env:
28 | gamma: 0.99 # Discount factor for rewards
29 | num_stack: 1 # Number of frames to stack
30 | add_timestep: False # Add timestep to observations
31 | known_reset: False # Reset to known position
32 | time_scale: 0.001 # What to multiply timestep by for AC input
33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
34 | maze:
35 | goal_radius: 2 # Distance to goal in order to reach it
36 | goal_reward: 5 # How much reward to give for getting to goal
37 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
38 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
39 | use_contact_cost: 0 # Whether to use contact cost in final reward
40 | use_survive_reward: 0 # Whether to use survive reward in final reward
41 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
42 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
43 | logs:
44 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
45 | exp_name: hierarchical_many_phase_final # Unique experiment name
46 | log_interval: 1 # Log interval, one log per n updates
47 | save_interval: 100 # Save interval, one per n updates
48 | vis_interval: 1 # Vis interval, one log per n updates
49 | optim_ppo:
50 | lr: 0.0003 # Learning rate
51 | eps: 0.00001 # RMSprop optimizer epsilon
52 | alpha: 0.99 # RMSprop optimizer alpha
53 | max_grad_norm: 0.5 # Max norm of gradients
54 | num_frames: 20000000 # Number of frames to train
55 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
56 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step
57 | optim_a2c:
58 | lr: 0.0007 # Learning rate
59 | eps: 0.00001 # RMSprop optimizer epsilon
60 | alpha: 0.99 # RMSprop optimizer alpha
61 | max_grad_norm: 0.5 # Max norm of gradients
62 | num_frames: 20000000 # Number of frames to train
63 | lowlevel:
64 | optfile: options/phase_lowlevel/phase_mlp_pretrain_any.yaml # Opt file location of low level policy
65 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/
66 | deterministic: False # Whether low level policies are deterministic
67 | num_load: 16 # Number of low level policies
68 |
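A rough sketch of the control loop these hierarchical settings imply: the high-level policy picks one of the num_load pretrained low-level policies, commits to it for num_ll_steps environment steps, and in train_highlevel mode only the high level is updated. The hierarchical training code itself is not part of this dump, so the policy and environment interfaces below (high_level.select, low_levels[i].act) are assumptions.

# Illustrative control-loop sketch only; high_level.select and low_levels[i].act
# are assumed interfaces, not the repo's classes.
NUM_LL_STEPS = 10    # num_ll_steps above
NUM_LOW_LEVEL = 16   # lowlevel num_load above

def run_episode(env, high_level, low_levels, time_limit=1000):
    obs = env.reset()
    total_reward, t, done = 0.0, 0, False
    while t < time_limit and not done:
        ll_index = high_level.select(obs)           # pick one of the 16 pretrained skills
        for _ in range(NUM_LL_STEPS):               # commit to it for num_ll_steps steps
            action = low_levels[ll_index].act(obs)  # frozen when hierarchical_mode is train_highlevel
            obs, reward, done, info = env.step(action)
            total_reward += reward
            t += 1
            if done or t >= time_limit:
                break
    return total_reward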
--------------------------------------------------------------------------------
/options/hierarchical_final/hierarchical_many_4M_phase.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: hierarchical_many # Mode is hierarchical many
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | env:
28 | gamma: 0.99 # Discount factor for rewards
29 | num_stack: 1 # Number of frames to stack
30 | add_timestep: False # Add timestep to observations
31 | known_reset: False # Reset to known position
32 | time_scale: 0.001 # What to multiply timestep by for AC input
33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
34 | maze:
35 | goal_radius: 2 # Distance to goal in order to reach it
36 | goal_reward: 5 # How much reward to give for getting to goal
37 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
38 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
39 | use_contact_cost: 0 # Whether to use contact cost in final reward
40 | use_survive_reward: 0 # Whether to use survive reward in final reward
41 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
42 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
43 | logs:
44 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
45 | exp_name: hierarchical_many_4M_phase_final # Unique experiment name
46 | log_interval: 1 # Log interval, one log per n updates
47 | save_interval: 100 # Save interval, one per n updates
48 | vis_interval: 1 # Vis interval, one log per n updates
49 | optim_ppo:
50 | lr: 0.0003 # Learning rate
51 | eps: 0.00001 # RMSprop optimizer epsilon
52 | alpha: 0.99 # RMSprop optimizer alpha
53 | max_grad_norm: 0.5 # Max norm of gradients
54 | num_frames: 4000000 # Number of frames to train
55 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
56 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step
57 | optim_a2c:
58 | lr: 0.0007 # Learning rate
59 | eps: 0.00001 # RMSprop optimizer epsilon
60 | alpha: 0.99 # RMSprop optimizer alpha
61 | max_grad_norm: 0.5 # Max norm of gradients
62 | num_frames: 4000000 # Number of frames to train
63 | lowlevel:
64 | optfile: options/phase_lowlevel/phase_mlp_pretrain_any.yaml # Opt file location of low level policy
65 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/
66 | deterministic: False # Whether low level policies are deterministic
67 | num_load: 16 # Number of low level policies
68 |
--------------------------------------------------------------------------------
/options/hierarchical_final/hierarchical_many_phase_antlowgear.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: hierarchical_many # Mode is hierarchical many
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | env:
28 | gamma: 0.99 # Discount factor for rewards
29 | num_stack: 1 # Number of frames to stack
30 | add_timestep: False # Add timestep to observations
31 | known_reset: False # Reset to known position
32 | time_scale: 0.001 # What to multiply timestep by for AC input
33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
34 | maze:
35 | goal_radius: 2 # Distance to goal in order to reach it
36 | goal_reward: 5 # How much reward to give for getting to goal
37 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
38 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
39 | use_contact_cost: 0 # Whether to use contact cost in final reward
40 | use_survive_reward: 0 # Whether to use survive reward in final reward
41 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
42 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
43 | logs:
44 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
45 | exp_name: hierarchical_many_phase_antlowgear_final # Unique experiment name
46 | log_interval: 1 # Log interval, one log per n updates
47 | save_interval: 100 # Save interval, one per n updates
48 | vis_interval: 1 # Vis interval, one log per n updates
49 | optim_ppo:
50 | lr: 0.0003 # Learning rate
51 | eps: 0.00001 # RMSprop optimizer epsilon
52 | alpha: 0.99 # RMSprop optimizer alpha
53 | max_grad_norm: 0.5 # Max norm of gradients
54 | num_frames: 20000000 # Number of frames to train
55 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
56 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step
57 | optim_a2c:
58 | lr: 0.0007 # Learning rate
59 | eps: 0.00001 # RMSprop optimizer epsilon
60 | alpha: 0.99 # RMSprop optimizer alpha
61 | max_grad_norm: 0.5 # Max norm of gradients
62 | num_frames: 20000000 # Number of frames to train
63 | lowlevel:
64 | optfile: options/phase_lowlevel/phase_mlp_pretrain_any.yaml # Opt file location of low level policy
65 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAntLowGear-v2/
66 | deterministic: False # Whether low level policies are deterministic
67 | num_load: 16 # Number of low level policies
68 |
--------------------------------------------------------------------------------
/options/hierarchical_final/hierarchical_many_phase_a2c.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | norm_ret: False
22 | model:
23 | recurrent_policy: False # Use a recurrent policy
24 | mode: hierarchical_many # Mode is hierarchical many
25 | hid_sz: 32 # MLP hidden size
26 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module)
27 | num_layer: 2 # Number of layers in MLP (minus input layer)
28 | env:
29 | gamma: 0.99 # Discount factor for rewards
30 | num_stack: 1 # Number of frames to stack
31 | add_timestep: False # Add timestep to observations
32 | known_reset: False # Reset to known position
33 | time_scale: 0.001 # What to multiply timestep by for AC input
34 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
35 | maze:
36 | goal_radius: 2 # Distance to goal in order to reach it
37 | goal_reward: 5 # How much reward to give for getting to goal
38 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
39 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
40 | use_contact_cost: 0 # Whether to use contact cost in final reward
41 | use_survive_reward: 0 # Whether to use survive reward in final reward
42 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
43 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
44 | logs:
45 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
46 | exp_name: hierarchical_many_phase_a2c_final # Unique experiment name
47 | log_interval: 1 # Log interval, one log per n updates
48 | save_interval: 100 # Save interval, one per n updates
49 | vis_interval: 1 # Vis interval, one log per n updates
50 | optim_ppo:
51 | lr: 0.0003 # Learning rate
52 | eps: 0.00001 # RMSprop optimizer epsilon
53 | alpha: 0.99 # RMSprop optimizer alpha
54 | max_grad_norm: 0.5 # Max norm of gradients
55 | num_frames: 20000000 # Number of frames to train
56 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
57 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step
58 | optim_a2c:
59 | lr: 0.0007 # Learning rate
60 | eps: 0.00001 # RMSprop optimizer epsilon
61 | alpha: 0.99 # RMSprop optimizer alpha
62 | max_grad_norm: 0.5 # Max norm of gradients
63 | num_frames: 20000000 # Number of frames to train
64 | hierarchical_mode: train_highlevel
65 | num_ll_steps: 10
66 | lowlevel:
67 | optfile: options/phase_lowlevel/phase_mlp_pretrain_any.yaml # Opt file location of low level policy
68 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/
69 | deterministic: False # Whether low level policies are deterministic
70 | num_load: 16 # Number of low level policies
71 |
--------------------------------------------------------------------------------
/options/hierarchical_final/hierarchical_many_phase_16ll_proprioceptivehumanoid.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: hierarchical_many # Mode is hierarchical many
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | env:
28 | gamma: 0.99 # Discount factor for rewards
29 | num_stack: 1 # Number of frames to stack
30 | add_timestep: False # Add timestep to observations
31 | known_reset: False # Reset to known position
32 | time_scale: 0.001 # What to multiply timestep by for AC input
33 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
34 | maze:
35 | goal_radius: 2 # Distance to goal in order to reach it
36 | goal_reward: 5 # How much reward to give for getting to goal
37 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
38 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
39 | use_contact_cost: 0 # Whether to use contact cost in final reward
40 | use_survive_reward: 0 # Whether to use survive reward in final reward
41 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
42 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
43 | logs:
44 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
45 | exp_name: hierarchical_many_phase_16ll_proprioceptivehumanoid_final_stopat175 # Unique experiment name
46 | log_interval: 1 # Log interval, one log per n updates
47 | save_interval: 100 # Save interval, one per n updates
48 | vis_interval: 1 # Vis interval, one log per n updates
49 | optim_ppo:
50 | lr: 0.0003 # Learning rate
51 | eps: 0.00001 # RMSprop optimizer epsilon
52 | alpha: 0.99 # RMSprop optimizer alpha
53 | max_grad_norm: 0.5 # Max norm of gradients
54 | num_frames: 20000000 # Number of frames to train
55 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
56 | num_ll_steps: 100 # How many low level steps to do in a row before update high level step
57 | optim_a2c:
58 | lr: 0.0007 # Learning rate
59 | eps: 0.00001 # RMSprop optimizer epsilon
60 | alpha: 0.99 # RMSprop optimizer alpha
61 | max_grad_norm: 0.5 # Max norm of gradients
62 | num_frames: 20000000 # Number of frames to train
63 | lowlevel:
64 | optfile: options/phase_lowlevel/phase_mlp_skip_hs16_pretrain_any_20M.yaml # Opt file location of low level policy
65 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_skip_hs16_pretrain_any_20M/ppo/ExplorerProprioceptiveHumanoid-v2/
66 | deterministic: False # Whether low level policies are deterministic
67 | num_load: 16 # Number of low level policies
68 |
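The lowlevel block above points at a directory of pretrained checkpoints and asks for num_load: 16 of them. The checkpoint naming scheme is not visible in this dump, so the '*.pt' glob pattern below is an assumption; the sketch only shows one way the 16 policies could be gathered with torch.load.

# Sketch of gathering the 16 low-level checkpoints referenced above. The
# checkpoint filenames under ckpt are not visible in this dump, so the
# '*.pt' glob pattern is an assumption.
import glob, os
import torch

ckpt_dir = ('/checkpoint/kdmarino/phasefunlogs/phase_lowlevel/'
            'phase_mlp_skip_hs16_pretrain_any_20M/ppo/ExplorerProprioceptiveHumanoid-v2/')
num_load = 16

ckpt_files = sorted(glob.glob(os.path.join(ckpt_dir, '*.pt')))[:num_load]
low_levels = [torch.load(f, map_location='cpu') for f in ckpt_files]
assert len(low_levels) == num_load, 'expected one checkpoint per low-level policy'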
--------------------------------------------------------------------------------
/options/hierarchical_final/hierarchical_many_baseline.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | norm_ret: False
22 | model:
23 | recurrent_policy: False # Use a recurrent policy
24 | mode: hierarchical_many # Mode is hierarchical many
25 | hid_sz: 32 # MLP hidden size
26 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module)
27 | num_layer: 2 # Number of layers in MLP (minus input layer)
28 | env:
29 | gamma: 0.99 # Discount factor for rewards
30 | num_stack: 1 # Number of frames to stack
31 | add_timestep: False # Add timestep to observations
32 | known_reset: False # Reset to known position
33 | time_scale: 0.001 # What to multiply timestep by for AC input
34 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
35 | maze:
36 | goal_radius: 2 # Distance to goal in order to reach it
37 | goal_reward: 5 # How much reward to give for getting to goal
38 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
39 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
40 | use_contact_cost: 0 # Whether to use contact cost in final reward
41 | use_survive_reward: 0 # Whether to use survive reward in final reward
42 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
43 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
44 | logs:
45 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
46 | exp_name: hierarchical_many_baseline_final # Unique experiment name
47 | log_interval: 1 # Log interval, one log per n updates
48 | save_interval: 100 # Save interval, one per n updates
49 | vis_interval: 1 # Vis interval, one log per n updates
50 | optim_ppo:
51 | lr: 0.0003 # Learning rate
52 | eps: 0.00001 # RMSprop optimizer epsilon
53 | alpha: 0.99 # RMSprop optimizer alpha
54 | max_grad_norm: 0.5 # Max norm of gradients
55 | num_frames: 20000000 # Number of frames to train
56 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
57 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step
58 | optim_a2c:
59 | lr: 0.0007 # Learning rate
60 | eps: 0.00001 # RMSprop optimizer epsilon
61 | alpha: 0.99 # RMSprop optimizer alpha
62 | max_grad_norm: 0.5 # Max norm of gradients
63 | num_frames: 20000000 # Number of frames to train
64 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
65 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step
66 | lowlevel:
67 | optfile: options/baseline_lowlevel/baseline_simplemlp_pretrain_any.yaml # Opt file location of low level policy
68 | ckpt: /checkpoint/kdmarino/phasefunlogs/baseline_lowlevel/baseline_simplemlp_pretrain_any/ppo/ExplorerAnt-v2/
69 | deterministic: False # Whether low level policies are deterministic
70 | num_load: 16 # Number of low level policies
71 |
--------------------------------------------------------------------------------
/options/hierarchical_final/hierarchical_many_4M_baseline.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | norm_ret: False
22 | model:
23 | recurrent_policy: False # Use a recurrent policy
24 | mode: hierarchical_many # Mode is hierarchical many
25 | hid_sz: 32 # MLP hidden size
26 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module)
27 | num_layer: 2 # Number of layers in MLP (minus input layer)
28 | env:
29 | gamma: 0.99 # Discount factor for rewards
30 | num_stack: 1 # Number of frames to stack
31 | add_timestep: False # Add timestep to observations
32 | known_reset: False # Reset to known position
33 | time_scale: 0.001 # What to multiply timestep by for AC input
34 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
35 | maze:
36 | goal_radius: 2 # Distance to goal in order to reach it
37 | goal_reward: 5 # How much reward to give for getting to goal
38 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
39 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
40 | use_contact_cost: 0 # Whether to use contact cost in final reward
41 | use_survive_reward: 0 # Whether to use survive reward in final reward
42 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
43 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
44 | logs:
45 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
46 | exp_name: hierarchical_many_4M_baseline_final # Unique experiment name
47 | log_interval: 1 # Log interval, one log per n updates
48 | save_interval: 100 # Save interval, one per n updates
49 | vis_interval: 1 # Vis interval, one log per n updates
50 | optim_ppo:
51 | lr: 0.0003 # Learning rate
52 | eps: 0.00001 # RMSprop optimizer epsilon
53 | alpha: 0.99 # RMSprop optimizer alpha
54 | max_grad_norm: 0.5 # Max norm of gradients
55 | num_frames: 4000000 # Number of frames to train
56 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
57 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step
58 | optim_a2c:
59 | lr: 0.0007 # Learning rate
60 | eps: 0.00001 # RMSprop optimizer epsilon
61 | alpha: 0.99 # RMSprop optimizer alpha
62 | max_grad_norm: 0.5 # Max norm of gradients
63 | num_frames: 4000000 # Number of frames to train
64 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
65 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step
66 | lowlevel:
67 | optfile: options/baseline_lowlevel/baseline_simplemlp_pretrain_any.yaml # Opt file location of low level policy
68 | ckpt: /checkpoint/kdmarino/phasefunlogs/baseline_lowlevel/baseline_simplemlp_pretrain_any/ppo/ExplorerAnt-v2/
69 | deterministic: False # Whether low level policies are deterministic
70 | num_load: 16 # Number of low level policies
71 |
--------------------------------------------------------------------------------
/options/hierarchical_final/hierarchical_many_baseline_antlowgear.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | norm_ret: False
22 | model:
23 | recurrent_policy: False # Use a recurrent policy
24 | mode: hierarchical_many # Mode is hierarchical many
25 | hid_sz: 32 # MLP hidden size
26 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module)
27 | num_layer: 2 # Number of layers in MLP (minus input layer)
28 | env:
29 | gamma: 0.99 # Discount factor for rewards
30 | num_stack: 1 # Number of frames to stack
31 | add_timestep: False # Add timestep to observations
32 | known_reset: False # Reset to known position
33 | time_scale: 0.001 # What to multiply timestep by for AC input
34 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
35 | maze:
36 | goal_radius: 2 # Distance to goal in order to reach it
37 | goal_reward: 5 # How much reward to give for getting to goal
38 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
39 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
40 | use_contact_cost: 0 # Whether to use contact cost in final reward
41 | use_survive_reward: 0 # Whether to use survive reward in final reward
42 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
43 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
44 | logs:
45 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
46 | exp_name: hierarchical_many_baseline_antlowgear_final # Unique experiment name
47 | log_interval: 1 # Log interval, one log per n updates
48 | save_interval: 100 # Save interval, one per n updates
49 | vis_interval: 1 # Vis interval, one log per n updates
50 | optim_ppo:
51 | lr: 0.0003 # Learning rate
52 | eps: 0.00001 # RMSprop optimizer epsilon
53 | alpha: 0.99 # RMSprop optimizer alpha
54 | max_grad_norm: 0.5 # Max norm of gradients
55 | num_frames: 20000000 # Number of frames to train
56 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
57 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step
58 | optim_a2c:
59 | lr: 0.0007 # Learning rate
60 | eps: 0.00001 # RMSprop optimizer epsilon
61 | alpha: 0.99 # RMSprop optimizer alpha
62 | max_grad_norm: 0.5 # Max norm of gradients
63 | num_frames: 20000000 # Number of frames to train
64 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
65 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step
66 | lowlevel:
67 | optfile: options/baseline_lowlevel/baseline_simplemlp_pretrain_any.yaml # Opt file location of low level policy
68 | ckpt: /checkpoint/kdmarino/phasefunlogs/baseline_lowlevel/baseline_simplemlp_pretrain_any/ppo/ExplorerAntLowGear-v2/
69 | deterministic: False # Whether low level policies are deterministic
70 | num_load: 16 # Number of low level policies
71 |
--------------------------------------------------------------------------------
/options/maze_baseline/maze_baseline.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline # Mode is maze baseline
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: False
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: False # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 5 # How much reward to give for getting to goal
53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 |
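The maze block's knobs suggest a reward of roughly the following shape; the maze environment code itself is not included in this dump, so the function below is an assumed reading (names and signs are illustrative), not the repo's implementation.

# Assumed reading of the maze reward knobs above; not the repo's maze env.
import numpy as np

def maze_reward_sketch(pos, goal, wrong_goals, speed,
                       ctrl_cost, contact_cost, survive_reward, maze_opt):
    reward = maze_opt['velocity_reward_weight'] * speed
    reward += maze_opt['use_ctrl_cost'] * ctrl_cost
    reward += maze_opt['use_contact_cost'] * contact_cost
    reward += maze_opt['use_survive_reward'] * survive_reward
    done = False
    if np.linalg.norm(pos - goal) < maze_opt['goal_radius']:
        reward += maze_opt['goal_reward']           # 5 here, 1000 in the wmove_r1000 variant
        done = True
    elif maze_opt['use_negative_goals'] and any(
            np.linalg.norm(pos - g) < maze_opt['goal_radius'] for g in wrong_goals):
        reward += maze_opt['negative_goal_reward']  # absorb at a wrong goal
        done = True
    return reward, done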
--------------------------------------------------------------------------------
/options/maze_baseline/maze_baseline_wmove_r1000.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline # Mode is maze baseline
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: False
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: False # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 1000 # How much reward to give for getting to goal
53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline_wmove_r1000 # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 |
--------------------------------------------------------------------------------
/options/maze_baseline_wphase/maze_baseline_phase.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline_wphase # Mode is maze baseline with phase
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: False
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: True # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 5 # How much reward to give for getting to goal
53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline_phase # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 |
--------------------------------------------------------------------------------
/options/maze_baseline_wphase/maze_baseline_phase_humanoid.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline_wphase # Mode is maze baseline with phase
24 | hid_sz: 16 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: True
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: True # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 5 # How much reward to give for getting to goal
53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline_phase_humanoid # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 |
--------------------------------------------------------------------------------
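Each option file is plain YAML with the same top-level sections (alg_ppo, alg_a2c, model, env, maze, logs, optim_ppo, optim_a2c), so a quick way to inspect one is to load it with PyYAML. This is only an illustration, not the repo's own option loader:

    import yaml

    with open('options/maze_baseline_wphase/maze_baseline_phase_humanoid.yaml') as f:
        opt = yaml.safe_load(f)

    print(opt['model']['hid_sz'])      # 16 for this humanoid variant
    print(opt['maze']['goal_reward'])  # 5 (1000 in the *_wmove_r1000 configs)
    print(opt['optim_ppo']['lr'])      # 0.0003
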
/options/maze_baseline_wphase/maze_baseline_phase_wmove_r1000.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of mini-batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline_wphase # Mode is maze baseline with phase
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: False
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: True # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 1000 # How much reward to give for getting to goal
53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline_phase_wmove_r1000 # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 |
--------------------------------------------------------------------------------
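The *_wmove_r1000 variants raise maze.goal_reward from 5 to 1000 and switch velocity_reward_weight on; the remaining maze and logs fields match the plain phase config apart from exp_name. A small recursive diff over two loaded configs makes such deltas explicit; a sketch assuming PyYAML, not a utility from this repo:

    import yaml

    def diff(a, b, prefix=''):
        # Recursively report keys whose values differ between two nested dicts.
        for k in sorted(set(a) | set(b)):
            if isinstance(a.get(k), dict) and isinstance(b.get(k), dict):
                diff(a[k], b[k], prefix + k + '.')
            elif a.get(k) != b.get(k):
                print(f'{prefix}{k}: {a.get(k)} -> {b.get(k)}')

    base = yaml.safe_load(open('options/maze_baseline_wphase/maze_baseline_phase.yaml'))
    moved = yaml.safe_load(open('options/maze_baseline_wphase/maze_baseline_phase_wmove_r1000.yaml'))
    diff(base, moved)  # expect maze.goal_reward, maze.velocity_reward_weight, logs.exp_name, ...
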
/options/maze_baseline_wphase/maze_baseline_phase_wmove_r1000_humanoid.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of mini-batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline_wphase # Mode is maze baseline with phase
24 | hid_sz: 16 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: True
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: True # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 1000 # How much reward to give for getting to goal
53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline_phase_wmove_r1000_humanoid # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 |
--------------------------------------------------------------------------------
/options/maze_baseline/maze_baseline_finetune.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of mini-batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline # Mode is maze baseline
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: False
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: False # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 5 # How much reward to give for getting to goal
53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline_finetune # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 | lowlevel:
78 | ckpt: /checkpoint/kdmarino/phasefunlogs/baseline_lowlevel/baseline_simplemlp_pretrain_any/ppo/ExplorerAnt-v2/
79 |
--------------------------------------------------------------------------------
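The finetune configs add a lowlevel.ckpt entry pointing at the directory of a pretrained low-level policy. The code that actually restores it lives elsewhere in the repo; the sketch below only illustrates the general pattern, with a hypothetical checkpoint file name and a stand-in network:

    import os
    import torch
    import torch.nn as nn

    ckpt_dir = '/checkpoint/kdmarino/phasefunlogs/baseline_lowlevel/baseline_simplemlp_pretrain_any/ppo/ExplorerAnt-v2/'
    ckpt_path = os.path.join(ckpt_dir, 'ckpt.pt')  # hypothetical file name inside that directory

    policy = nn.Sequential(nn.Linear(64, 32), nn.Tanh(), nn.Linear(32, 8))  # stand-in low-level network
    if os.path.isfile(ckpt_path):
        policy.load_state_dict(torch.load(ckpt_path, map_location='cpu'))  # then finetune on the maze task
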
/options/maze_baseline_wphase/maze_baseline_phase_finetune.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of mini-batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline_wphase # Mode is maze baseline with phase
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: False
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: True # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 5 # How much reward to give for getting to goal
53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline_phase_finetune # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 | lowlevel:
78 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/
79 |
--------------------------------------------------------------------------------
/options/maze_baseline/maze_baseline_wmove_r1000_finetune.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of mini-batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline # Mode is maze baseline
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: False
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: False # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 1000 # How much reward to give for getting to goal
53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline_wmove_r1000_finetune # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 | lowlevel:
78 | ckpt: /checkpoint/kdmarino/phasefunlogs/baseline_lowlevel/baseline_simplemlp_pretrain_any/ppo/ExplorerAnt-v2/
79 |
--------------------------------------------------------------------------------
/options/maze_baseline_wphase/maze_baseline_phase_wmove_r1000_finetune.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of mini-batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline_wphase # Mode is maze baseline with phase
24 | hid_sz: 32 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: False
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: True # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 1000 # How much reward to give for getting to goal
53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline_phase_wmove_r1000_finetune # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 | lowlevel:
78 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/
79 |
--------------------------------------------------------------------------------
/options/maze_baseline_wphase/maze_baseline_phase_finetune_proprioceptivehumanoid.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of mini-batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline_wphase # Mode is maze baseline with phase
24 | hid_sz: 16 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: True
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: True # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 5 # How much reward to give for getting to goal
53 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline_phase_finetune_proprioceptivehumanoid # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 | lowlevel:
78 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_skip_hs16_pretrain_any_20M/ppo/ExplorerProprioceptiveHumanoid-v2/
79 |
--------------------------------------------------------------------------------
/options/maze_baseline_wphase/maze_baseline_phase_wmove_r1000_finetune_proprioceptivehumanoid.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of mini-batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | model:
22 | recurrent_policy: False # Use a recurrent policy
23 | mode: maze_baseline_wphase # Mode is maze baseline with phase
24 | hid_sz: 16 # MLP hidden size
25 | model_type: MLPPretrain # What kind of network model (MLP | Mult | Module)
26 | num_layer: 2 # Number of layers in MLP (minus input layer)
27 | phase_period: 10
28 | phase_hid_sz: 16
29 | skip_layer: True
30 | use_timestep: True
31 | time_scale: 0.001
32 | env:
33 | gamma: 0.99 # Discount factor for rewards
34 | num_stack: 1 # Number of frames to stack
35 | add_timestep: True # Add timestep to observations
36 | known_reset: False # Reset to known position
37 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
38 | time_scale: 0.001 # What to multiply timestep by for AC input
39 | theta_space_mode: pretrain_any # What theta mode we're in
40 | theta_reset_mode: never # When to change theta
41 | theta_reward_mode: lax # How to punish perpendicular movement
42 | theta_obs_mode: pretrain # How to parameterize theta in network (ind | vector)
43 | theta_memory_lookback: 10 # How far to look back for reference global theta
44 | time_limit: 1000 # When to end an episode
45 | reward_shape_type: instant
46 | state_cycle_weight: 0.05 # How to weight state cycle differences
47 | action_cycle_weight: 0.01 # How to weight action cycle differences
48 | phase_period: 10 # How long the phase cycle is
49 | cycle_startup: 0 # Whether we should skip the first cycle penalty (let it start up)
50 | maze:
51 | goal_radius: 2 # Distance to goal in order to reach it
52 | goal_reward: 1000 # How much reward to give for getting to goal
53 | velocity_reward_weight: 1 # How much weight to give to moving (any direction)
54 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
55 | use_contact_cost: 0 # Whether to use contact cost in final reward
56 | use_survive_reward: 0 # Whether to use survive reward in final reward
57 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
58 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
59 | logs:
60 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
61 | exp_name: maze_baseline_phase_wmove_r1000_finetune_proprioceptivehumanoid # Unique experiment name
62 | log_interval: 1 # Log interval, one log per n updates
63 | save_interval: 100000000000 # Save interval, one per n updates
64 | vis_interval: 1 # Vis interval, one log per n updates
65 | optim_ppo:
66 | lr: 0.0003 # Learning rate
67 | eps: 0.00001 # RMSprop optimizer epsilon
68 | alpha: 0.99 # RMSprop optimizer alpha
69 | max_grad_norm: 0.5 # Max norm of gradients
70 | num_frames: 20000000 # Number of frames to train
71 | optim_a2c:
72 | lr: 0.0007 # Learning rate
73 | eps: 0.00001 # RMSprop optimizer epsilon
74 | alpha: 0.99 # RMSprop optimizer alpha
75 | max_grad_norm: 0.5 # Max norm of gradients
76 | num_frames: 20000000 # Number of frames to train
77 | lowlevel:
78 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_skip_hs16_pretrain_any_20M/ppo/ExplorerProprioceptiveHumanoid-v2/
79 |
--------------------------------------------------------------------------------
/environments/assets/ant_custom_gear.xml:
--------------------------------------------------------------------------------
[MuJoCo XML for the custom-gear ant model; the markup was not preserved in this dump.]
--------------------------------------------------------------------------------
/environments/assets/my_ant.xml:
--------------------------------------------------------------------------------
[MuJoCo XML for the ant model; the markup was not preserved in this dump.]
--------------------------------------------------------------------------------
/environments/mujoco_env.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from gym import error, spaces
4 | from gym.utils import seeding
5 | import numpy as np
6 | from os import path
7 | import gym
8 | import six
9 |
10 | try:
11 | import mujoco_py
12 | except ImportError as e:
13 | raise error.DependencyNotInstalled("{}. (HINT: you need to install mujoco_py, and also perform the setup instructions here: https://github.com/openai/mujoco-py/.)".format(e))
14 |
15 | DEFAULT_SIZE = 500
16 |
17 | class MujocoEnv(gym.Env):
18 | """Superclass for all MuJoCo environments.
19 | """
20 |
21 | def __init__(self, model_path, frame_skip):
22 | if model_path.startswith("/"):
23 | fullpath = model_path
24 | else:
25 | fullpath = os.path.join(os.path.dirname(__file__), "assets", model_path)
26 | if not path.exists(fullpath):
27 | raise IOError("File %s does not exist" % fullpath)
28 | self.frame_skip = frame_skip
29 | self.model = mujoco_py.load_model_from_path(fullpath)
30 | self.sim = mujoco_py.MjSim(self.model)
31 | self.data = self.sim.data
32 | self.viewer = None
33 | self._viewers = {}
34 |
35 | self.metadata = {
36 | 'render.modes': ['human', 'rgb_array'],
37 | 'video.frames_per_second': int(np.round(1.0 / self.dt))
38 | }
39 |
40 | self.init_qpos = self.sim.data.qpos.ravel().copy()
41 | self.init_qvel = self.sim.data.qvel.ravel().copy()
42 | observation, _reward, done, _info = self.step(np.zeros(self.model.nu))
43 | assert not done
44 | self.obs_dim = observation.size
45 |
46 | bounds = self.model.actuator_ctrlrange.copy()
47 | low = bounds[:, 0]
48 | high = bounds[:, 1]
49 | self.action_space = spaces.Box(low=low, high=high)
50 |
51 | high = np.inf*np.ones(self.obs_dim)
52 | low = -high
53 | self.observation_space = spaces.Box(low, high)
54 |
55 | self.seed()
56 |
57 | def seed(self, seed=None):
58 | self.np_random, seed = seeding.np_random(seed)
59 | return [seed]
60 |
61 | # methods to override:
62 | # ----------------------------
63 |
64 | def reset_model(self):
65 | """
66 | Reset the robot degrees of freedom (qpos and qvel).
67 | Implement this in each subclass.
68 | """
69 | raise NotImplementedError
70 |
71 | def viewer_setup(self):
72 | """
73 | This method is called when the viewer is initialized and after every reset
74 | Optionally implement this method, if you need to tinker with camera position
75 | and so forth.
76 | """
77 | pass
78 |
79 | # -----------------------------
80 |
81 | def reset(self):
82 | self.sim.reset()
83 | ob = self.reset_model()
84 | old_viewer = self.viewer
85 | for v in self._viewers.values():
86 | self.viewer = v
87 | self.viewer_setup()
88 | self.viewer = old_viewer
89 | return ob
90 |
91 | def set_state(self, qpos, qvel):
92 | assert qpos.shape == (self.model.nq,) and qvel.shape == (self.model.nv,)
93 | old_state = self.sim.get_state()
94 | new_state = mujoco_py.MjSimState(old_state.time, qpos, qvel,
95 | old_state.act, old_state.udd_state)
96 | self.sim.set_state(new_state)
97 | self.sim.forward()
98 |
99 | @property
100 | def dt(self):
101 | return self.model.opt.timestep * self.frame_skip
102 |
103 | def do_simulation(self, ctrl, n_frames):
104 | self.sim.data.ctrl[:] = ctrl
105 | for _ in range(n_frames):
106 | self.sim.step()
107 |
108 | def render(self, mode='human', width=DEFAULT_SIZE, height=DEFAULT_SIZE):
109 | if mode == 'rgb_array':
110 | self._get_viewer(mode).render(width, height)
111 | # window size used for old mujoco-py:
112 | data = self._get_viewer(mode).read_pixels(width, height, depth=False)
113 | # original image is upside-down, so flip it
114 | return data[::-1, :, :]
115 | elif mode == 'human':
116 | self._get_viewer(mode).render()
117 |
118 | def close(self):
119 | if self.viewer is not None:
120 | # self.viewer.finish()
121 | self.viewer = None
122 | self._viewers = {}
123 |
124 | def _get_viewer(self, mode):
125 | self.viewer = self._viewers.get(mode)
126 | if self.viewer is None:
127 | if mode == 'human':
128 | self.viewer = mujoco_py.MjViewer(self.sim)
129 | elif mode == 'rgb_array':
130 | self.viewer = mujoco_py.MjRenderContextOffscreen(self.sim, 0)
131 | self.viewer_setup()
132 | self._viewers[mode] = self.viewer
133 | return self.viewer
134 |
135 | def get_body_com(self, body_name):
136 | return self.data.get_body_xpos(body_name)
137 |
138 | def state_vector(self):
139 | return np.concatenate([
140 | self.sim.data.qpos.flat,
141 | self.sim.data.qvel.flat
142 | ])
143 |
--------------------------------------------------------------------------------
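MujocoEnv marks reset_model as the required override and viewer_setup as optional; the concrete environments in this directory additionally define step and _get_obs. A minimal subclass sketch in that style, meant to live inside the environments package; the observation layout and the zero reward are placeholders, not one of the repo's actual envs:

    import numpy as np
    from . import mujoco_env

    class TinyAntEnv(mujoco_env.MujocoEnv):
        def __init__(self):
            mujoco_env.MujocoEnv.__init__(self, 'my_ant.xml', frame_skip=5)

        def step(self, action):
            self.do_simulation(action, self.frame_skip)
            return self._get_obs(), 0.0, False, {}

        def _get_obs(self):
            # Joint positions (dropping the root x/y) plus joint velocities.
            return np.concatenate([self.sim.data.qpos.flat[2:], self.sim.data.qvel.flat])

        def reset_model(self):
            c = 0.1
            self.set_state(
                self.init_qpos + self.np_random.uniform(-c, c, size=self.model.nq),
                self.init_qvel + self.np_random.uniform(-c, c, size=self.model.nv))
            return self._get_obs()
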
/environments/simple_humanoid_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numpy as np
3 | from gym import utils
4 | import pdb
5 | import math
6 | from . import mujoco_env
7 | from . import geom_utils
8 |
9 | def mass_center(model, sim):
10 | mass = np.expand_dims(model.body_mass, 1)
11 | xpos = sim.data.xipos
12 | return (np.sum(mass * xpos, 0) / np.sum(mass))
13 |
14 | class BaseSimpleHumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
15 | # Initialize Mujoco environment
16 | def __init__(self, xml_file='simple_humanoid.xml'):
17 | mujoco_env.MujocoEnv.__init__(self, xml_file, 1)
18 | utils.EzPickle.__init__(self)
19 |
20 | # Forward step
21 | def step(self, action):
22 | pos_before = mass_center(self.model, self.sim)
23 | self.do_simulation(action, self.frame_skip)
24 | pos_after = mass_center(self.model, self.sim)
25 | alive_bonus = 0.2
26 | data = self.sim.data
27 | lin_vel_cost = 0.25 * (pos_after - pos_before) / self.model.opt.timestep
28 | lb = -100
29 | ub = 100
30 | scaling = (ub - lb) * 0.5
31 | quad_ctrl_cost = .5 * 1e-3 * np.sum(
32 | np.square(action / scaling))
33 | quad_impact_cost = .5 * 1e-5 * np.sum(
34 | np.square(np.clip(data.cfrc_ext, -1, 1)))
35 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus
36 | qpos = self.sim.data.qpos
37 | done = bool((qpos[2] < 0.8) or (qpos[2] > 2.0))
38 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost)
39 |
40 | # Get states by name
41 | def get_state_by_name(self, name, s=None):
42 | # Get state (if not passed in)
43 | if s is None:
44 | s = self.state_vector()
45 |
46 | # Switch on name
47 | if name == 'xyz':
48 | val = s[0:3]
49 | elif name == 'x':
50 | val = s[0]
51 | elif name == 'y':
52 | val = s[1]
53 | elif name == 'z':
54 | val = s[2]
55 | elif name == 'quart':
56 | val = s[3:7]
57 | elif name in ['rpy', 'roll', 'pitch', 'yaw']:
58 | quart = s[3:7]
59 | roll, pitch, yaw = geom_utils.quaternion_to_euler_angle(quart)
60 | if name == 'roll':
61 | val = roll
62 | elif name == 'pitch':
63 | val = pitch
64 | elif name == 'yaw':
65 | val = yaw
66 | elif name == 'rpy':
67 | return np.array([roll, pitch, yaw])
68 | elif name == 'joint_angles':
69 | val = s[7:17]
70 | elif name == 'xyz_vel':
71 | val = s[17:20]
72 | elif name == 'x_vel':
73 | val = s[17]
74 | elif name == 'y_vel':
75 | val = s[18]
76 | elif name == 'z_vel':
77 | val = s[19]
78 | elif name == 'rpy_vel':
79 | val = s[20:23]
80 | elif name == 'roll_vel':
81 | val = s[20]
82 | elif name == 'pitch_vel':
83 | val = s[21]
84 | elif name == 'yaw_vel':
85 | val = s[22]
86 | elif name == 'joint_angle_vel':
87 | val = s[23:]
88 | return val
89 |
90 | # We remove the first 5 values from state which should correspond to global orientation and position
91 | # https://github.com/openai/gym/wiki/Humanoid-V1
92 | def get_intern_extern_state(self):
93 | # Extract different states
94 | s = self.state_vector()
95 |
96 | xyz = self.get_state_by_name('xyz', s)
97 | rpy = self.get_state_by_name('rpy', s)
98 | joint_angles = self.get_state_by_name('joint_angles', s)
99 | d_xyz = self.get_state_by_name('xyz_vel', s)
100 | d_rpy = self.get_state_by_name('rpy_vel', s)
101 | d_joint = self.get_state_by_name('joint_angle_vel', s)
102 |
103 | # Separate out yaw
104 | roll = rpy[0]
105 | pitch = rpy[1]
106 | yaw = rpy[2]
107 | d_roll = d_rpy[0]
108 | d_pitch = d_rpy[1]
109 | d_yaw = d_rpy[2]
110 |
111 | # Set internal/external states
112 | s_internal = np.concatenate([[roll, pitch], joint_angles, [d_roll, d_pitch]])
113 | s_external = np.concatenate([xyz, [yaw], d_xyz, [d_yaw]])
114 | #s_internal = np.concatenate([s, np.clip(self.sim.data.cfrc_ext, -1, 1).flat, self.get_body_com("torso").flat] )
115 |
116 | #assert(s_internal.shape[0] == 20)
117 | assert(s_external.shape[0] == 8)
118 |
119 | return s_internal, s_external
120 |
121 | def _get_obs(self):
122 | raise NotImplementedError
123 |
124 | def get_body_com(self, body_name):
125 | idx = self.model.body_names.index(body_name)
126 | return self.sim.data.subtree_com[idx]
127 |
128 | def reset_model(self):
129 | c = 0.01
130 | self.set_state(
131 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
132 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,)
133 | )
134 | return self._get_obs()
135 |
136 | def viewer_setup(self):
137 | self.viewer.cam.trackbodyid = 1
138 | self.viewer.cam.distance = self.model.stat.extent * 1.0
139 | self.viewer.cam.lookat[2] += .8
140 | self.viewer.cam.elevation = -20
141 |
--------------------------------------------------------------------------------
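BaseSimpleHumanoidEnv is abstract (its _get_obs raises NotImplementedError), so exercising the named-state accessors requires a concrete subclass. A debug-style sketch, assuming it sits inside the environments package and that a simple_humanoid.xml model and mujoco_py are available:

    from . import simple_humanoid_env

    class DebugSimpleHumanoid(simple_humanoid_env.BaseSimpleHumanoidEnv):
        def _get_obs(self):
            # Observe only the internal (proprioceptive) half of the state split.
            s_internal, _ = self.get_intern_extern_state()
            return s_internal

    env = DebugSimpleHumanoid()
    env.reset()
    yaw = env.get_state_by_name('yaw')            # derived from the root quaternion
    s_int, s_ext = env.get_intern_extern_state()  # external part: xyz, yaw, d_xyz, d_yaw (8 values)
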
/options/hierarchical_final/hierarchical_many_phase_dqn.yaml:
--------------------------------------------------------------------------------
1 | alg_ppo:
2 | use_gae: True # Use generalized advantage estimation
3 | gae_tau: 0.95 # GAE parameter
4 | entropy_coef: 0 # Entropy term coefficient
5 | value_loss_coef: 1 # Value loss coefficient
6 | num_steps: 2048 # Number of forward steps
7 | num_processes: 1 # Number of parallel processes to run
8 | ppo_epoch: 100 # Number of ppo epochs
9 | num_mini_batch: 32 # Number of mini-batches for ppo
10 | clip_param: 0.2 # ppo clip parameter
11 | log_mult: 1 # How much less often to log for this alg
12 | norm_ret: False # Whether to add normalization to returns
13 | alg_a2c:
14 | use_gae: False # Use generalized advantage estimation
15 | gae_tau: 0.95 # GAE parameter
16 | entropy_coef: 0.01 # Entropy term coefficient
17 | value_loss_coef: 0.5 # Value loss coefficient
18 | num_steps: 5 # Number of forward steps
19 | num_processes: 16 # Number of parallel processes to run
20 | log_mult: 10 # How much less often to log for this alg
21 | alg_dqn:
22 | batch_size: 128 # DQN batch size
23 | target_update: 10000 # After how many steps to update DQN target
24 | mem_capacity: 10000000 # How many frames to store in replay memory
25 | num_steps: 1 # How many high level actions to take before doing DQN updates
26 | updates_per_step: 10 # How many DQN updates to do every loop
27 | num_processes: 1 # Number of parallel processes to run
28 | norm_ret: False # Whether to add normalization to returns
29 | log_mult: 100
30 | save_interval: 100000
31 | model:
32 | recurrent_policy: False # Use a recurrent policy
33 | mode: hierarchical_many # Mode is hierarchical many
34 | hid_sz: 32 # MLP hidden size
35 | model_type: MLPSimpleDebug # What kind of network model (MLP | Mult | Module)
36 | num_layer: 2 # Number of layers in MLP (minus input layer)
37 | env:
38 | gamma: 0.99 # Discount factor for rewards
39 | num_stack: 1 # Number of frames to stack
40 | add_timestep: False # Add timestep to observations
41 | known_reset: False # Reset to known position
42 | time_scale: 0.001 # What to multiply timestep by for AC input
43 | step_plus_noclip: True # Do the /3 no clip thing from https://github.com/pat-coady/trpo/blob/5ac6b2e8476d0f1639a88128f59e8a51f1f8bce1/src/train.py#L92
44 | maze:
45 | goal_radius: 2 # Distance to goal in order to reach it
46 | goal_reward: 5 # How much reward to give for getting to goal
47 | velocity_reward_weight: 0 # How much weight to give to moving (any direction)
48 | use_ctrl_cost: 0 # Whether to use ctrl cost in final reward
49 | use_contact_cost: 0 # Whether to use contact cost in final reward
50 | use_survive_reward: 0 # Whether to use survive reward in final reward
51 | use_negative_goals: False # Whether to absorb at other goals (and give 0 or negative reward)
52 | negative_goal_reward: 0 # What reward to give for reaching the wrong goal
53 | logs:
54 | log_base: /checkpoint/kdmarino/phasefunlogs/ # Base dir log (should be same for all config files)
55 | exp_name: hierarchical_many_phase_dqn_final # Unique experiment name
56 | log_interval: 1 # Log interval, one log per n updates
57 | save_interval: 100 # Save interval, one per n updates
58 | vis_interval: 1 # Vis interval, one log per n updates
59 | optim_ppo:
60 | lr: 0.0003 # Learning rate
61 | eps: 0.00001 # RMSprop optimizer epsilon
62 | alpha: 0.99 # RMSprop optimizer alpha
63 | max_grad_norm: 0.5 # Max norm of gradients
64 | num_frames: 20000000 # Number of frames to train
65 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
66 | num_ll_steps: 10 # How many low level steps to do in a row before update high level step
67 | optim_a2c:
68 | lr: 0.0007 # Learning rate
69 | eps: 0.00001 # RMSprop optimizer epsilon
70 | alpha: 0.99 # RMSprop optimizer alpha
71 | max_grad_norm: 0.5 # Max norm of gradients
72 | num_frames: 20000000 # Number of frames to train
73 | hierarchical_mode: train_highlevel
74 | num_ll_steps: 10
75 | optim_dqn:
76 | lr: 0.0003
77 | eps: 0.00001
78 | max_grad_norm: 1
79 | eps_start: 0.9
80 | eps_end: 0
81 | eps_decay: 100000 # TODO - no idea if this value makes sense
82 | num_frames: 20000000 # Number of frames to train
83 | hierarchical_mode: train_highlevel # How to train the hierarchical policies (train_highlevel | train_both)
84 | num_ll_steps: 10
85 | lowlevel:
86 | optfile: options/phase_lowlevel/phase_mlp_pretrain_any.yaml # Opt file location of low level policy
87 | ckpt: /checkpoint/kdmarino/phasefunlogs/phase_lowlevel/phase_mlp_pretrain_any_20M/ppo/ExplorerAnt-v2/
88 | deterministic: False # Whether low level policies are deterministic
89 | num_load: 16 # Number of low level policies
90 |
--------------------------------------------------------------------------------
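The alg_dqn / optim_dqn sections above introduce eps_start, eps_end, and eps_decay for epsilon-greedy exploration. The exact schedule is implemented elsewhere, but the PyTorch Q-learning tutorial that algo/dqn.py credits uses an exponential decay of this form, so a reasonable reading of these numbers is:

    import math

    def epsilon(steps_done, eps_start=0.9, eps_end=0.0, eps_decay=100000):
        # Exploration probability after steps_done environment steps.
        return eps_end + (eps_start - eps_end) * math.exp(-steps_done / eps_decay)

    for t in (0, 100000, 500000):
        print(t, round(epsilon(t), 3))  # 0.9, then ~0.331, then ~0.006
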
/environments/proprioceptive_humanoid_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numpy as np
3 | from gym import utils
4 | from . import mujoco_env
5 | from . import geom_utils
6 |
7 | def mass_center(model, sim):
8 | mass = np.expand_dims(model.body_mass, 1)
9 | xpos = sim.data.xipos
10 | return (np.sum(mass * xpos, 0) / np.sum(mass))
11 |
12 | class BaseProprioceptiveHumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle):
13 | # Initialize Mujoco environment
14 | def __init__(self, xml_file='humanoid.xml'):
15 | # Set start values for registration
16 | self.start_yaw = float('inf')
17 | self.start_z = float('inf')
18 | mujoco_env.MujocoEnv.__init__(self, xml_file, 5)
19 | utils.EzPickle.__init__(self)
20 |
21 | # Forward step
22 | def step(self, a):
23 | pos_before = mass_center(self.model, self.sim)
24 | self.do_simulation(a, self.frame_skip)
25 | pos_after = mass_center(self.model, self.sim)
26 | alive_bonus = 5.0
27 | data = self.sim.data
28 | lin_vel_cost = 0.25 * (pos_after - pos_before) / self.model.opt.timestep
29 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
30 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum()
31 | quad_impact_cost = min(quad_impact_cost, 10)
32 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus
33 | qpos = self.sim.data.qpos
34 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0))
35 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost)
36 |
37 | # Get states by name
38 | def get_state_by_name(self, name, s=None):
39 | # Get state (if not passed in)
40 | if s is None:
41 | s = self.state_vector()
42 |
43 | # Replace with mass center
44 | s[0:3] = mass_center(self.model, self.sim)
45 |
46 | # Switch on name
47 | if name == 'xyz':
48 | val = s[0:3]
49 | elif name == 'x':
50 | val = s[0]
51 | elif name == 'y':
52 | val = s[1]
53 | elif name == 'z':
54 | val = s[2]
55 | elif name == 'quart':
56 | val = s[3:7]
57 | elif name in ['rpy', 'roll', 'pitch', 'yaw']:
58 | quart = s[3:7]
59 | roll, pitch, yaw = geom_utils.quaternion_to_euler_angle(quart)
60 | if name == 'roll':
61 | val = roll
62 | elif name == 'pitch':
63 | val = pitch
64 | elif name == 'yaw':
65 | val = yaw
66 | elif name == 'rpy':
67 | return np.array([roll, pitch, yaw])
68 | elif name == 'joint_angles':
69 | val = s[7:24]
70 | elif name == 'xyz_vel':
71 | val = s[24:27]
72 | elif name == 'x_vel':
73 | val = s[24]
74 | elif name == 'y_vel':
75 | val = s[25]
76 | elif name == 'z_vel':
77 | val = s[26]
78 | elif name == 'rpy_vel':
79 | val = s[27:30]
80 | elif name == 'roll_vel':
81 | val = s[27]
82 | elif name == 'pitch_vel':
83 | val = s[28]
84 | elif name == 'yaw_vel':
85 | val = s[29]
86 | elif name == 'joint_angle_vel':
87 | val = s[30:]
88 | return val
89 |
90 | # We remove the first 5 values from state which should correspond to global orientation and position
91 | # https://github.com/openai/gym/wiki/Humanoid-V1
92 | def get_intern_extern_state(self):
93 | # Extract different states
94 | s = self.state_vector()
95 | z = self.get_state_by_name('z', s)
96 | xyz = self.get_state_by_name('xyz', s)
97 | rpy = self.get_state_by_name('rpy', s)
98 | joint_angles = self.get_state_by_name('joint_angles', s)
99 | d_xyz = self.get_state_by_name('xyz_vel', s)
100 | d_rpy = self.get_state_by_name('rpy_vel', s)
101 | d_joint = self.get_state_by_name('joint_angle_vel', s)
102 |
103 | # Separate out yaw
104 | roll = rpy[0]
105 | pitch = rpy[1]
106 | yaw = rpy[2]
107 | d_roll = d_rpy[0]
108 | d_pitch = d_rpy[1]
109 | d_yaw = d_rpy[2]
110 |
111 | # Set internal/external states
112 | # Internal keeps track of integral z and yaw (subtract out the initial value)
113 | pro_yaw = geom_utils.convert_to_egocentric(self.start_yaw, yaw)
114 | pro_z = z - self.start_z
115 | s_internal = np.concatenate([[pro_z, roll, pitch, pro_yaw], joint_angles, d_xyz, [d_roll, d_pitch, d_yaw], d_joint])
116 | s_external = np.concatenate([xyz, [yaw]])
117 |
118 | return s_internal, s_external
119 |
120 | def _get_obs(self):
121 | raise NotImplementedError
122 |
123 | def reset_model(self):
124 | c = 0.01
125 | self.set_state(
126 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq),
127 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,)
128 | )
129 |
130 | # Record the initial z and yaw so later observations can be expressed relative to them
131 | self.start_yaw = self.get_state_by_name('yaw')
132 | self.start_z = self.get_state_by_name('z')
133 |
134 | return self._get_obs()
135 |
136 | def viewer_setup(self):
137 | self.viewer.cam.trackbodyid = 1
138 | self.viewer.cam.distance = self.model.stat.extent * 1.0
139 | self.viewer.cam.lookat[2] += .8
140 | self.viewer.cam.elevation = -20
141 |
--------------------------------------------------------------------------------
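The 'rpy' branch of get_state_by_name above depends on geom_utils.quaternion_to_euler_angle, which is not reproduced in this excerpt. As a reference point only, here is a minimal sketch of the standard conversion such a helper typically performs, assuming the (w, x, y, z) quaternion ordering that MuJoCo stores in qpos[3:7]; the repository's actual implementation may differ.

import numpy as np

def quaternion_to_euler_angle(quat):
    # Assumes quat = (w, x, y, z), the ordering MuJoCo uses for free-joint orientations
    w, x, y, z = quat
    # Roll: rotation about the x axis
    roll = np.arctan2(2.0 * (w * x + y * z), 1.0 - 2.0 * (x * x + y * y))
    # Pitch: rotation about the y axis; the clip guards against numerical drift outside [-1, 1]
    pitch = np.arcsin(np.clip(2.0 * (w * y - z * x), -1.0, 1.0))
    # Yaw: rotation about the z axis
    yaw = np.arctan2(2.0 * (w * z + x * y), 1.0 - 2.0 * (y * y + z * z))
    return roll, pitch, yaw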
/algo/dqn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.optim as optim
4 | import torch.nn.functional as F
5 | import numpy as np
6 | import random
7 | import pdb
8 | from storage import ReplayMemory, Transition
9 |
10 | # Code copied and adapted from pytorch Q learning tutorial https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
11 | class DQN(object):
12 | def __init__(self,
13 | dqn,
14 | gamma,
15 | batch_size=128,
16 | target_update=100,
17 | mem_capacity=10000000,
18 | lr=None,
19 | eps=None,
20 | max_grad_norm=1):
21 |
22 | self.gamma = gamma
23 | self.dqn = dqn
24 | self.batch_size = batch_size
25 | self.target_update = target_update
26 | self.max_grad_norm = max_grad_norm
27 | self.optimizer = optim.Adam(self.dqn.policy_net.parameters(), lr=lr, eps=eps)
28 | self.num_updates = 0
29 | self.replay_memory = ReplayMemory(mem_capacity)
30 |
31 | # Generate a state_dict object
32 | def state_dict(self):
33 | ckpt = {}
34 | ckpt['model'] = [self.dqn.policy_net.state_dict(), self.dqn.target_net.state_dict()]
35 | ckpt['optim'] = self.optimizer.state_dict()
36 | ckpt['steps_done'] = self.dqn.steps_done
37 | ckpt['num_updates'] = self.num_updates
38 | ckpt['memory'] = self.replay_memory
39 | return ckpt
40 |
41 | # Load from a state dict
42 | def load_state_dict(self, ckpt):
43 | self.dqn.policy_net.load_state_dict(ckpt['model'][0])
44 | self.dqn.target_net.load_state_dict(ckpt['model'][1])
45 | self.optimizer.load_state_dict(ckpt['optim'])
46 | self.dqn.steps_done = ckpt['steps_done']
47 | self.num_updates = ckpt['num_updates']
48 | self.replay_memory = ckpt['memory']
49 |
50 | # Update the replay memory
51 | def update_memory(self, states, actions, next_states, rewards, done_mask, step_masks):
52 | # Go through each index (corresponding to different environment steps)
53 | for state, action, next_state, reward, done, step_mask in zip(states, actions, next_states, rewards, done_mask, step_masks):
54 | # Skip zombie steps (step_mask == 0); only real transitions are stored
55 | if step_mask > 0:
56 | # Make deep copies, convert to numpy and append to replay memory
57 | state = np.array(state.cpu().numpy())
58 | action = np.array(action.cpu().numpy())
59 | reward = np.array(reward)
60 | if done:
61 | next_state = None
62 | else:
63 | next_state = np.array(next_state.cpu().numpy())
64 |
65 | # Push into replay memory
66 | self.replay_memory.push(state, action, next_state, reward)
67 |
68 | # Update our policy network
69 | def update(self, num_updates):
70 | # Replay memory needs to at least be the batch size
71 | if len(self.replay_memory) < self.batch_size:
72 | return 0, 0, 0
73 | assert(len(self.replay_memory) >= self.batch_size)
74 |
75 | # Do updates
76 | dqn_loss = 0
77 | for update in range(num_updates):
78 | # Get batch values
79 | transitions = self.replay_memory.sample(self.batch_size)
80 | batch = Transition(*zip(*transitions))
81 | non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
82 | batch.next_state)), dtype=torch.uint8)
83 | non_final_mask = non_final_mask.unsqueeze(1)
84 | non_final_next_states = torch.cat([torch.from_numpy(s).unsqueeze(0) for s in batch.next_state
85 | if s is not None])
86 | state_batch = torch.cat([torch.from_numpy(s).unsqueeze(0) for s in batch.state])
87 | action_batch = torch.cat([torch.from_numpy(a).unsqueeze(0) for a in batch.action])
88 | reward_batch = torch.cat([torch.from_numpy(r).unsqueeze(0) for r in batch.reward])
89 | next_state_values = torch.zeros(self.batch_size, 1)
90 |
91 | # Convert to cuda
92 | if self.dqn.target_net.in_fc.weight.is_cuda:
93 | non_final_mask = non_final_mask.cuda()
94 | non_final_next_states = non_final_next_states.cuda()
95 | state_batch = state_batch.cuda()
96 | action_batch = action_batch.cuda()
97 | reward_batch = reward_batch.cuda()
98 | next_state_values = next_state_values.cuda()
99 |
100 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
101 | # columns of actions taken
102 | state_action_values = self.dqn.policy_net(state_batch).gather(1, action_batch)
103 |
104 | # Compute V(s_{t+1}) for all next states.
105 | next_state_values[non_final_mask] = self.dqn.target_net(non_final_next_states).max(1)[0].detach()
106 | # Compute the expected Q values
107 | expected_state_action_values = (next_state_values * self.gamma) + reward_batch
108 |
109 | # Compute Huber loss
110 | loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
111 | dqn_loss += loss
112 |
113 | # Optimize the model
114 | self.optimizer.zero_grad()
115 | loss.backward()
116 | for param in self.dqn.policy_net.parameters():
117 | param.grad.data.clamp_(-self.max_grad_norm, self.max_grad_norm)
118 | self.optimizer.step()
119 |
120 | self.num_updates += 1
121 |
122 | # Update target network
123 | if self.num_updates % self.target_update == 0:
124 | self.dqn.target_net.load_state_dict(self.dqn.policy_net.state_dict())
125 |
126 | dqn_loss /= num_updates
127 | return dqn_loss, 0, 0
128 |
--------------------------------------------------------------------------------
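dqn.py imports ReplayMemory and Transition from storage, which is not included in this excerpt. Judging from the interface used above (push, sample, __len__, and a Transition namedtuple with state, action, next_state, and reward fields) and from the PyTorch Q-learning tutorial the file credits, a minimal stand-in could look like the sketch below; the repository's actual storage.py may differ.

import random
from collections import namedtuple

# Field names match how dqn.py unpacks batches via Transition(*zip(*transitions))
Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # Append until capacity is reached, then overwrite the oldest transition
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)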
/environments/assets/skull_maze_ant.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/environments/assets/cross_maze_ant.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/summarize_results.py:
--------------------------------------------------------------------------------
1 | # This file takes the completed slurm logs and summarizes the results
2 | # Right now, it just displays the average +- std of the final rewards (or whatever value is set via the yaml / --eval-key)
3 | # Eventually this could also plot the variance across trials
4 | import argparse
5 | import yaml
6 | import json
7 | import csv
8 | from pprint import pprint
9 | import click
10 | import shutil
11 | import copy
12 | import glob
13 | import os
14 | import time
15 | import itertools
16 | import pdb
17 | import torch
18 | import numpy as np
19 | from visualize import Dashboard
20 |
21 | # Get Input Arguments
22 | parser = argparse.ArgumentParser(description='RL')
23 |
24 | ##################################################
25 | # yaml options file contains all default choices #
26 | parser.add_argument('--batch_path_opt', default='options/batch/default.yaml', type=str,
27 | help='path to a yaml options file')
28 | # yaml option file containing the visdom plotting options
29 | parser.add_argument('--vis_path_opt', default='options/visualization/reward.yaml', type=str,
30 | help='path to a yaml visualization options file')
31 | ##################################################
32 | parser.add_argument('--eval-key', type=str, default='reward_env',
33 | help='name of key in the Episode log that we actually want to evaluate on')
34 | parser.add_argument('--outfile', type=str, default='',
35 | help='where to dump these results (optional)')
36 | parser.add_argument('--bin-size', type=int, default=100,
37 | help='over how many episodes to average the final result value')
38 | # These options only matter if batch_path_opt is a single yaml (not a batch)
39 | parser.add_argument('--num-trials', type=int, default=5)
40 | parser.add_argument('--trial-offset', type=int, default=0)
41 | parser.add_argument('--algo', default='a2c',
42 | help='algorithm to use: a2c | ppo | acktr')
43 | parser.add_argument('--env-name', default='Hopper-v2',
44 | help='environment to train on (default: Hopper-v2)')
45 | def main():
46 | global args
47 | args = parser.parse_args()
48 |
49 | # Set options
50 | if args.batch_path_opt is not None:
51 | with open(args.batch_path_opt, 'r') as handle:
52 | batch_options = yaml.load(handle)
53 | if args.vis_path_opt is not None:
54 | with open(args.vis_path_opt, 'r') as handle:
55 | vis_options = yaml.load(handle)
56 | print('## args'); pprint(vars(args))
57 |
58 | # Either use the slurm batch file or the single yaml file to get the values
59 | val_dict = {}
60 | if 'base_yaml' in batch_options:
61 | # Slurm version
62 | algo = batch_options['algo']
63 | env_name = batch_options['env_name']
64 | num_trials = batch_options['num_trials']
65 | trial_offset = batch_options['trial_offset']
66 | base_yaml_file = batch_options['base_yaml']
67 |
68 | # Get the list of yaml files
69 | # Copies logic from clusterrun to make these
70 | grid = batch_options['params']
71 | individual_options = [[{key: value} for value in values] for key, values in grid.items()]
72 | product_options = list(itertools.product(*individual_options))
73 | jobs = [{k: v for d in option_set for k, v in d.items()} for option_set in product_options]
74 | basenames = []
75 | yaml_files = []
76 | with open(base_yaml_file) as f:
77 | base_options = yaml.load(f)
78 | for job in jobs:
79 | new_unique_name = base_options['logs']['exp_name']
80 | for k, v in job.items():
81 | new_unique_name += "_" + str(k) + "_" + str(v)
82 | assert(len(base_yaml_file.split('.')) == 2)
83 | new_yaml_filename = base_yaml_file.split('.')[0]
84 | new_yaml_filename = os.path.join(new_yaml_filename, new_unique_name) + '.yaml'
85 | basenames.append(new_unique_name)
86 | yaml_files.append(new_yaml_filename)
87 | assert(len(yaml_files) == len(jobs))
88 | assert(len(basenames) == len(jobs))
89 |
90 | # Get the eval vals for each param set
91 | val_dict = {}
92 | for yaml_file, name in zip(yaml_files, basenames):
93 | with open(yaml_file, 'r') as handle:
94 | opt = yaml.load(handle)
95 | eval_vals = get_last_eval_vals(opt, vis_options, args.eval_key, algo, env_name, num_trials, trial_offset, args.bin_size)
96 | if eval_vals is not None:
97 | val_dict[name] = eval_vals
98 | else:
99 | # Single yaml version
100 | algo = args.algo
101 | env_name = args.env_name
102 | opt = batch_options
103 | num_trials = args.num_trials
104 | trial_offset = args.trial_offset
105 |
106 | # Get the eval vals for this yaml
107 | eval_vals = get_last_eval_vals(opt, vis_options, args.eval_key, algo, env_name, num_trials, trial_offset, args.bin_size)
108 |
109 | # Save to dict
110 | name = opt['logs']['exp_name']
111 | val_dict[name] = eval_vals
112 |
113 | # Get the average values and std for each value in dict
114 | # Sort by average
115 | # Display / print each by decreasing average value
116 | avg_dict = {k: np.mean(v) for k, v in val_dict.items()}
117 | sorted_avg_dict = sorted(avg_dict.items(), reverse=True, key=lambda x: x[1])
118 | sorted_names = [x[0] for x in sorted_avg_dict]
119 | lines = []
120 | lines.append("Results for run of {yaml_name} on variable {var}".format(yaml_name=args.batch_path_opt.split('/')[-1], var=args.eval_key))
121 | for name in sorted_names:
122 | lines.append("{name}: {avg}+={std}".format(name=name, avg=np.mean(val_dict[name]), std=np.std(val_dict[name])))
123 |
124 | # Print results
125 | for line in lines:
126 | print(line)
127 |
128 | # Optionally print to file
129 | if len(args.outfile) > 0:
130 | with open(args.outfile, 'w') as f:
131 | for line in lines:
132 | f.write(line + '\n')
133 |
134 | # Get the last bucket values for the eval_key for each trial and return
135 | def get_last_eval_vals(opt, vis_opt, eval_key, algo, env_name, num_trials, trial_offset, bin_size):
136 | # For each trial
137 | eval_vals = []
138 | for trial in range(trial_offset, trial_offset+num_trials):
139 | # Get the logpath
140 | logpath = os.path.join(opt['logs']['log_base'], opt['model']['mode'], opt['logs']['exp_name'], algo, env_name, 'trial%d' % trial)
141 | if not os.path.isdir(logpath):
142 | return None
143 |
144 |
145 | # Create the dashboard object
146 | opt['env']['env-name'] = env_name
147 | opt['alg'] = opt['alg_%s' % algo]
148 | opt['optim'] = opt['optim_%s' % algo]
149 | opt['alg']['algo'] = algo
150 | opt['trial'] = trial
151 | dash = Dashboard(opt, vis_opt, logpath, vis=False)
152 |
153 | # Get data
154 | try:
155 | dash.preload_data()
156 | raw_x, raw_y = dash.load_data('episode_monitor', 'scalar', eval_key)
157 | except Exception:
158 | return None
159 |
160 | # Get data from last bin
161 | if not (len(raw_y) > bin_size):
162 | return None
163 | raw_vals = raw_y[-bin_size:]
164 | assert(len(raw_vals) == bin_size)
165 | raw_vals = [float(v) for v in raw_vals]
166 | raw_val = np.mean(raw_vals)
167 | eval_vals.append(raw_val)
168 |
169 | # Return
170 | return eval_vals
171 |
172 | if __name__ == "__main__":
173 | main()
174 |
175 |
--------------------------------------------------------------------------------
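For reference, the batch branch above expands batch_options['params'] into one job per combination of parameter values and derives a unique experiment name for each. A small standalone sketch of that expansion, using a hypothetical parameter grid and experiment name:

import itertools

# Hypothetical stand-ins for batch_options['params'] and base_options['logs']['exp_name']
grid = {'lr': [0.001, 0.0001], 'hidden_size': [16, 64]}
exp_name = 'hierarchical_many_phase'

individual_options = [[{key: value} for value in values] for key, values in grid.items()]
product_options = list(itertools.product(*individual_options))
jobs = [{k: v for d in option_set for k, v in d.items()} for option_set in product_options]

for job in jobs:
    name = exp_name + ''.join('_' + str(k) + '_' + str(v) for k, v in job.items())
    print(name)  # e.g. hierarchical_many_phase_lr_0.001_hidden_size_16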