├── rl_sandbox ├── rl_sandbox │ ├── __init__.py │ ├── envs │ │ ├── __init__.py │ │ ├── wrappers │ │ │ ├── __init__.py │ │ │ ├── wrapper.py │ │ │ ├── action_repeat.py │ │ │ ├── absorbing_state.py │ │ │ └── frame_stack.py │ │ ├── fake_env.py │ │ └── utils.py │ ├── agents │ │ ├── __init__.py │ │ ├── random_agents.py │ │ └── rl_agents.py │ ├── buffers │ │ ├── __init__.py │ │ ├── wrappers │ │ │ ├── __init__.py │ │ │ ├── buffer_wrapper.py │ │ │ ├── torch_buffer.py │ │ │ └── noise_wrapper.py │ │ ├── buffer.py │ │ ├── disk_buffer.py │ │ └── utils.py │ ├── examples │ │ ├── __init__.py │ │ ├── lfgp │ │ │ ├── __init__.py │ │ │ ├── experts │ │ │ │ ├── __init__.py │ │ │ │ ├── subsample_expert_data.py │ │ │ │ ├── scripted_policies.py │ │ │ │ └── create_subsampled_data.py │ │ │ ├── default_configs │ │ │ │ ├── __init__.py │ │ │ │ └── dac.py │ │ │ ├── experiment_utils.py │ │ │ └── transfer.py │ │ ├── eval_tools │ │ │ └── __init__.py │ │ └── collect_data.py │ ├── train │ │ ├── __init__.py │ │ ├── train_bc.py │ │ ├── train_bc_no_overfit.py │ │ ├── train_multitask_bc.py │ │ ├── train_multitask_bc_no_overfit.py │ │ └── train_dac_sac.py │ ├── algorithms │ │ ├── __init__.py │ │ ├── bc │ │ │ └── __init__.py │ │ ├── dac │ │ │ └── __init__.py │ │ ├── sac │ │ │ └── __init__.py │ │ ├── sac_x │ │ │ ├── __init__.py │ │ │ ├── intentions_update │ │ │ │ └── __init__.py │ │ │ ├── schedulers_update │ │ │ │ ├── __init__.py │ │ │ │ └── q_scheduler.py │ │ │ └── sac_x.py │ │ └── utils.py │ ├── transforms │ │ ├── __init__.py │ │ └── general_transforms.py │ ├── auxiliary_rewards │ │ ├── __init__.py │ │ ├── rce_envs │ │ │ ├── __init__.py │ │ │ ├── door_human_v0.py │ │ │ ├── relocate_human_v0.py │ │ │ ├── hammer_human_v0.py │ │ │ └── sawyer.py │ │ ├── manipulator_learning │ │ │ ├── __init__.py │ │ │ └── panda │ │ │ │ ├── __init__.py │ │ │ │ └── lift_xyz_state.py │ │ └── generic.py │ ├── auxiliary_tasks │ │ ├── __init__.py │ │ ├── auxiliary_tasks.py │ │ ├── utils.py │ │ └── koopman.py │ ├── model_architectures │ │ ├── __init__.py │ │ ├── actor_critics │ │ │ └── __init__.py │ │ ├── discriminators │ │ │ ├── __init__.py │ │ │ └── fully_connected_discriminators.py │ │ ├── layers_definition.py │ │ ├── shared.py │ │ └── utils.py │ └── utils.py └── setup.py ├── pytorch-a2c-ppo-acktr-gail ├── a2c_ppo_acktr │ ├── __init__.py │ ├── algo │ │ ├── __init__.py │ │ ├── a2c_acktr.py │ │ ├── ppo.py │ │ └── gail.py │ ├── utils.py │ ├── distributions.py │ └── arguments.py ├── requirements.txt ├── setup.py ├── gail_experts │ ├── README.md │ ├── convert_to_pytorch.py │ └── convert_lfgp_expert_data.py ├── scripts │ ├── gail_bring.bash │ └── gail.bash ├── LICENSE ├── generate_tmux_yaml.py ├── .gitignore ├── evaluation.py ├── enjoy.py └── README.md ├── system.png ├── scripts ├── experiments │ ├── any_all_seeds.bash │ ├── any_script_any_seeds.bash │ ├── bc.bash │ ├── bc_no_overfit.bash │ ├── multi_bc.bash │ ├── dac.bash │ ├── lfgp.bash │ └── multi_bc_no_overfit.bash ├── create_data │ ├── create_modified_data.bash │ └── create_expert_data.bash ├── evaluation │ └── visualize_model.bash └── plotting │ ├── common.py │ └── multitask_performance.py ├── LICENSE ├── .gitignore └── six_state_mdp.py /rl_sandbox/rl_sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/bc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/dac/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac_x/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/rl_sandbox/rl_sandbox/examples/eval_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/experts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/rce_envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/default_configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac_x/intentions_update/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac_x/schedulers_update/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/actor_critics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/discriminators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/manipulator_learning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /system.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utiasSTARS/lfgp/HEAD/system.png -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/manipulator_learning/panda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/requirements.txt: -------------------------------------------------------------------------------- 1 | gym 2 | matplotlib 3 | pybullet 4 | stable-baselines3 -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/algo/__init__.py: -------------------------------------------------------------------------------- 1 | from .a2c_acktr import A2C_ACKTR 2 | from .ppo import PPO -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/utils.py: -------------------------------------------------------------------------------- 1 | def aug_data(data, num_aug, aug_batch_size): 2 | return data.repeat(1, 
num_aug, *[1] * (len(data.shape) - 2)).reshape( 3 | aug_batch_size, *data.shape[1:]) 4 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='a2c-ppo-acktr', 5 | packages=find_packages(), 6 | version='0.0.1', 7 | install_requires=['gym', 'matplotlib', 'pybullet', 'stable-baselines3']) 8 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/fake_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class FakeEnv: 4 | def __init__(self, obs_dim): 5 | self._obs_dim = obs_dim 6 | 7 | def reset(self): 8 | return np.zeros(self._obs_dim) 9 | 10 | def step(self, action): 11 | return np.zeros(self._obs_dim), 0., False, {} 12 | 13 | def render(self): 14 | pass 15 | -------------------------------------------------------------------------------- /scripts/experiments/any_all_seeds.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | 7 | # expert dir should just be the lowest level directory before int_X.gz, 8 | # the rest is handled in the individual script files 9 | EXPERT_DIR=$4 10 | USER_MACHINE=$5 11 | EXPERIMENT_NAME=$6 12 | 13 | seeds=(1 2 3 4 5) 14 | for seed in "${seeds[@]}" 15 | do 16 | bash "${SCRIPT}" "${seed}" "${DEVICE}" "${MAIN_TASK}" "${EXPERT_DIR}" "${USER_MACHINE}" "${EXPERIMENT_NAME}" 17 | done -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/wrappers/wrapper.py: -------------------------------------------------------------------------------- 1 | class Wrapper: 2 | def __init__(self, env): 3 | self._env = env 4 | 5 | def reset(self, **kwargs): 6 | return self._env.reset(**kwargs) 7 | 8 | def step(self, action): 9 | return self._env.step(action) 10 | 11 | def render(self, **kwargs): 12 | return self._env.render(**kwargs) 13 | 14 | def seed(self, seed): 15 | self._env.seed(seed) 16 | 17 | def __getattr__(self, attr): 18 | return getattr(self._env, attr) 19 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/experiment_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import rl_sandbox.constants as c 3 | 4 | def get_save_path(algo_name, main_task, seed, exp_name, top_path="results"): 5 | return os.path.join(top_path, main_task, str(seed), algo_name, exp_name) 6 | 7 | 8 | def config_check(experiment_config, top_path): 9 | """ 10 | custom checks for fixing config for particular machines 11 | """ 12 | if "scratch" in top_path and experiment_config[c.ENV_SETTING][c.ENV_TYPE] == c.MANIPULATOR_LEARNING: 13 | experiment_config[c.ENV_SETTING][c.KWARGS]["egl"] = False -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/gail_experts/README.md: -------------------------------------------------------------------------------- 1 | ## Data 2 | 3 | Download from 4 | https://drive.google.com/open?id=1Ipu5k99nwewVDG1yFetUxqtwVlgBg5su 5 | 6 | and store in this folder.
7 | 8 | ## Convert to pytorch 9 | 10 | ```bash 11 | python convert_to_pytorch.py --h5-file trajs_halfcheetah.h5 12 | ``` 13 | 14 | ## Run 15 | 16 | ```bash 17 | python main.py --env-name "HalfCheetah-v2" --algo ppo --use-gae --log-interval 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --value-loss-coef 0.5 --ppo-epoch 10 --num-mini-batch 32 --gamma 0.99 --gae-lambda 0.95 --num-env-steps 10000000 --use-linear-lr-decay --use-proper-time-limits --gail 18 | ``` 19 | -------------------------------------------------------------------------------- /rl_sandbox/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='rl_sandbox', 4 | version='3.1.0+vpace', 5 | packages=[package for package in find_packages() 6 | if package.startswith('rl_sandbox')], 7 | install_requires=['gym>=0.15.4,<=0.23.0', 8 | 'numpy>=1.23.4,<2.0', 9 | 'tensorboard<=2.11', 10 | 'torch==1.13.*', 11 | 'manipulator_learning @ git+ssh://git@github.com/utiasSTARS/manipulator-learning@master#egg=manipulator_learning', 12 | 'ConfigArgParse', 13 | 'PyYAML'] 14 | ) 15 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/layers_definition.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | import rl_sandbox.constants as c 4 | import rl_sandbox.model_architectures.shared as snn 5 | 6 | 7 | CUSTOM_WIDTH_LINEAR_LAYERS = lambda in_dim, width: ( 8 | [in_dim, width, nn.ReLU(), True, 0], 9 | [width, width, nn.ReLU(), True, 0], 10 | ) 11 | 12 | VALUE_BASED_LINEAR_LAYERS = lambda in_dim: ( 13 | [in_dim, 256, nn.ReLU(), True, 0], 14 | [256, 256, nn.ReLU(), True, 0], 15 | ) 16 | 17 | SAC_DISCRIMINATOR_LINEAR_LAYERS = lambda in_dim: ( 18 | [in_dim, 256, nn.Tanh(), True, 0], 19 | [256, 256, nn.Tanh(), True, 0], 20 | ) 21 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/generic.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import numpy as np 4 | from numpy.linalg import norm 5 | 6 | from rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state import AuxiliaryReward 7 | 8 | 9 | class FromEnvAuxiliaryReward(AuxiliaryReward): 10 | def __init__(self, env, aux_rewards=()): 11 | aux_reward_funcs = [] 12 | if aux_rewards == (): 13 | aux_rewards = env.VALID_AUX_TASKS 14 | for aux_str in aux_rewards: 15 | rew_func = partial(env.get_aux_rew, tasks=(aux_str,)) 16 | rew_func.__qualname__ = aux_str 17 | aux_reward_funcs.append(rew_func) 18 | 19 | super().__init__(aux_reward_funcs, False) 20 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/scripts/gail_bring.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | eval_eps=50 4 | eval_interval=100000 5 | num_env_steps=4000000 6 | expert_file="../gail_experts/data/bring_0-expert_data/reset/int_2.pt" 7 | env="bring_0" 8 | num_processes=1 9 | seed=10 10 | 11 | python ../main.py \ 12 | --seed "$seed" \ 13 | --num-steps 2048 \ 14 | --lr 3e-4 \ 15 | --entropy-coef 0 \ 16 | --value-loss-coef 0.5 \ 17 | --ppo-epoch 10 \ 18 | --num-mini-batch 32 \ 19 | --gamma 0.99 \ 20 | --gae-lambda 0.95 \ 21 | --use-linear-lr-decay \ 22 | --use-proper-time-limits \ 23 | 
--num-processes="$num_processes" \ 24 | --use-gae \ 25 | --algo ppo \ 26 | --gail \ 27 | --eval-interval="$eval_interval" \ 28 | --num-env-steps="$num_env_steps" \ 29 | --gail-experts-file="$expert_file" \ 30 | --env-name="$env" \ 31 | --log-interval 1 \ 32 | --eval-eps "$eval_eps" \ 33 | --no-cuda 34 | # --train-render 35 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/shared.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Flatten(nn.Module): 7 | def forward(self, x): 8 | return x.view(x.size(0), -1) 9 | 10 | 11 | class Split(nn.Module): 12 | def __init__(self, feature_dims): 13 | super().__init__() 14 | self.feature_dims = feature_dims 15 | 16 | def forward(self, x): 17 | features = [] 18 | last_feature_idx = 0 19 | for feature_dim in self.feature_dims: 20 | features.append(x[..., last_feature_idx:last_feature_idx + feature_dim]) 21 | last_feature_idx += feature_dim 22 | return features 23 | 24 | 25 | class Swish(nn.Module): 26 | def __init__(self): 27 | super().__init__() 28 | 29 | def forward(self, x): 30 | x = x * nn.functional.sigmoid(x) 31 | return x 32 | 33 | 34 | class Fuse(nn.Module): 35 | def forward(self, features): 36 | return torch.cat(features, dim=-1) 37 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/transfer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def get_transfer_params(load_existing_dir, load_model, load_buffer, load_transfer_exp_settings, load_aux_old_removal): 5 | if load_model == "": 6 | load_model = False 7 | else: 8 | load_model = os.path.join(load_existing_dir, load_model) 9 | 10 | if load_buffer == "": 11 | load_buffer = False 12 | else: 13 | load_buffer = os.path.join(load_existing_dir, load_buffer) 14 | 15 | # transfer 16 | if load_transfer_exp_settings != "": 17 | # assert load_buffer and load_model 18 | assert load_buffer 19 | load_transfer_exp_settings = os.path.join(load_existing_dir, load_transfer_exp_settings) 20 | else: 21 | load_transfer_exp_settings = False 22 | 23 | if load_aux_old_removal != "": 24 | load_aux_old_removal = load_aux_old_removal.split(',') 25 | else: 26 | load_aux_old_removal = None 27 | 28 | return load_model, load_buffer, load_transfer_exp_settings, load_aux_old_removal -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/scripts/gail.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | eval_eps=50 4 | eval_interval=100000 5 | num_env_steps=4000000 6 | expert_file="../gail_experts/expert-data/${1}/reset/800_steps-90_sp_point5_play_open_200_extra_lasts/int_2.gz" 7 | env=$1 8 | num_processes=1 9 | #seed=$2 10 | 11 | seeds=(1 2 3 4 5) 12 | # seeds=(3 4 5) 13 | # seeds=(10 11) 14 | job_type="$1" 15 | 16 | for seed in "${seeds[@]}" 17 | do 18 | 19 | python ../main.py \ 20 | --seed "$seed" \ 21 | --num-steps 2048 \ 22 | --lr 3e-4 \ 23 | --entropy-coef 0 \ 24 | --value-loss-coef 0.5 \ 25 | --ppo-epoch 10 \ 26 | --num-mini-batch 32 \ 27 | --gamma 0.99 \ 28 | --gae-lambda 0.95 \ 29 | --use-linear-lr-decay \ 30 | --use-proper-time-limits \ 31 | --num-processes="$num_processes" \ 32 | --use-gae \ 33 | --algo ppo \ 34 | --gail \ 35 | --eval-interval="$eval_interval" \ 36 | 
--num-env-steps="$num_env_steps" \ 37 | --gail-experts-file="$expert_file" \ 38 | --env-name="$env" \ 39 | --log-interval 10 \ 40 | --eval-eps "$eval_eps" \ 41 | --no-cuda & 42 | # --train-render 43 | 44 | 45 | done 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2021 STARS Laboratory 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Ilya Kostrikov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/rce_envs/door_human_v0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | 4 | from rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state import AuxiliaryReward 5 | 6 | 7 | # reach reward defined based on how defined in original envs combined with how we defined datasets in rce_multitask_envs.py 8 | def reach(observation, next_observation, **kwargs): 9 | obs = observation 10 | next_obs = next_observation 11 | 12 | palm_to_handle_dist_before = np.linalg.norm(obs[-4:-1]) 13 | palm_to_handle_dist_after = np.linalg.norm(next_obs[-4:-1]) 14 | 15 | return palm_to_handle_dist_before - palm_to_handle_dist_after 16 | 17 | 18 | def grasp(observation, next_observation, **kwargs): 19 | obs = observation 20 | next_obs = next_observation 21 | 22 | latch_turn_before = obs[27] 23 | latch_turn_after = next_obs[27] 24 | 25 | return latch_turn_after - latch_turn_before 26 | 27 | 28 | class DoorHumanV0AuxiliaryReward(AuxiliaryReward): 29 | def __init__(self, env_name, aux_rewards=('reach',)): 30 | aux_reward_funcs = [globals()[ar_str] for ar_str in aux_rewards] 31 | super().__init__(aux_reward_funcs, True) 32 | -------------------------------------------------------------------------------- /scripts/create_data/create_modified_data.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAIN_TASK=$1 4 | STEPS_PER_TASK_ORIG=$2 5 | STEPS_PER_TASK_NEW=$3 6 | NUM_EXTRA_LASTS=$4 7 | KEEP_EVERY_NTH=$5 8 | 9 | 10 | INPUT_PATH_POST="${STEPS_PER_TASK_ORIG}_steps" 11 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 12 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 13 | echo "Using TOP_DIR OF ${TOP_DIR}" 14 | OUTPUT_PATH_POST="${STEPS_PER_TASK_NEW}_steps_${KEEP_EVERY_NTH}_ss_${NUM_EXTRA_LASTS}_el/" 15 | 16 | echo "Generating smaller dataset, subsampled by ${KEEP_EVERY_NTH}, getting ${NUM_EXTRA_LASTS} extra final transitions, for ${MAIN_TASK}, original: ${INPUT_PATH_POST}, new: ${OUTPUT_PATH_POST}." 17 | 18 | IN_PATH="${TOP_DIR}${MAIN_TASK}/${INPUT_PATH_POST}" 19 | OUT_PATH="${TOP_DIR}${MAIN_TASK}/${OUTPUT_PATH_POST}" 20 | 21 | echo "Getting data from ${IN_PATH}, saving new data to ${OUT_PATH}."
22 | 23 | python ../../rl_sandbox/rl_sandbox/examples/lfgp/experts/create_subsampled_data.py \ 24 | --seed=0 \ 25 | --keep_last \ 26 | --input_path="${IN_PATH}" \ 27 | --output_path="${OUT_PATH}" \ 28 | --keep_every_nth="${KEEP_EVERY_NTH}" \ 29 | --num_to_subsample_from="${STEPS_PER_TASK_NEW}" \ 30 | --num_extra_lasts="${NUM_EXTRA_LASTS}" -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/rce_envs/relocate_human_v0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | 4 | from rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state import AuxiliaryReward 5 | 6 | 7 | # reach reward defined based on how defined in original envs combined with how we defined datasets in rce_multitask_envs.py 8 | def reach(observation, next_observation, **kwargs): 9 | obs = observation 10 | next_obs = next_observation 11 | 12 | palm_to_ball_dist_before = np.linalg.norm(obs[-9:-6]) 13 | palm_to_ball_dist_after = np.linalg.norm(next_obs[-9:-6]) 14 | 15 | return palm_to_ball_dist_before - palm_to_ball_dist_after 16 | 17 | 18 | def grasp(observation, next_observation, **kwargs): 19 | obs = observation 20 | next_obs = next_observation 21 | 22 | ball_target_z_diff_before = abs(obs[-1]) 23 | ball_target_z_diff_after = abs(next_obs[-1]) 24 | 25 | return ball_target_z_diff_before - ball_target_z_diff_after 26 | 27 | 28 | class RelocateHumanV0AuxiliaryReward(AuxiliaryReward): 29 | def __init__(self, env_name, aux_rewards=('reach',)): 30 | aux_reward_funcs = [globals()[ar_str] for ar_str in aux_rewards] 31 | super().__init__(aux_reward_funcs, True) 32 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/rce_envs/hammer_human_v0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | 4 | from rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state import AuxiliaryReward 5 | 6 | 7 | # reach reward defined based on how defined in original envs combined with how we defined datasets in rce_multitask_envs.py 8 | def reach(observation, next_observation, **kwargs): 9 | obs = observation 10 | next_obs = next_observation 11 | 12 | palm_to_hammer_dist_before = np.linalg.norm(np.array(obs[-13:-10])-np.array(obs[-10:-7])) 13 | palm_to_hammer_dist_after = np.linalg.norm(np.array(next_obs[-13:-10])-np.array(next_obs[-10:-7])) 14 | 15 | return palm_to_hammer_dist_before - palm_to_hammer_dist_after 16 | 17 | 18 | def grasp(observation, next_observation, **kwargs): 19 | obs = observation 20 | next_obs = next_observation 21 | 22 | hammer_height_before = obs[-8] 23 | hammer_height_after = next_obs[-8] 24 | 25 | return hammer_height_after - hammer_height_before 26 | 27 | 28 | class HammerHumanV0AuxiliaryReward(AuxiliaryReward): 29 | def __init__(self, env_name, aux_rewards=('reach',)): 30 | aux_reward_funcs = [globals()[ar_str] for ar_str in aux_rewards] 31 | super().__init__(aux_reward_funcs, True) 32 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/experts/subsample_expert_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for subsampling expert data. 
3 | """ 4 | 5 | import copy 6 | import numpy as np 7 | 8 | 9 | def subsample_buffers(buffers, keep_every_nth, keep_first_last=False): 10 | subsampled_bufs = [] 11 | data_strs = ['observations', 'hidden_states', 'actions', 'rewards', 'dones', 'next_observations', 12 | 'next_hidden_states'] 13 | for b in buffers: 14 | initial_offset = np.random.randint(keep_every_nth) 15 | subsampled_b = copy.deepcopy(b) 16 | if keep_first_last: 17 | raise NotImplementedError() 18 | ends = np.argwhere(np.invert(np.all(b.observations[1:] == b.next_observations[:-1], axis=1))) 19 | 20 | inds = np.array(range(initial_offset, len(b), keep_every_nth)) 21 | 22 | for ds in data_strs: 23 | setattr(subsampled_b, ds, getattr(b, ds)[inds]) 24 | 25 | # infos done separately since it's a dict 26 | for k in b.infos.keys(): 27 | subsampled_b.infos[k] = b.infos[k][inds] 28 | 29 | subsampled_b._pointer = 0 30 | subsampled_b._count = len(inds) 31 | subsampled_b._memory_size = len(inds) 32 | 33 | subsampled_bufs.append(subsampled_b) 34 | 35 | return subsampled_bufs -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/buffer.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import rl_sandbox.constants as c 4 | 5 | 6 | class NoSampleError(Exception): 7 | pass 8 | 9 | 10 | class LengthMismatchError(Exception): 11 | pass 12 | 13 | 14 | class CheckpointIndexError(Exception): 15 | pass 16 | 17 | 18 | class Buffer: 19 | @property 20 | def memory_size(self): 21 | raise NotImplementedError 22 | 23 | @property 24 | def is_full(self): 25 | raise NotImplementedError 26 | 27 | def __len__(self): 28 | raise NotImplementedError 29 | 30 | def push(self, obs, h_state, act, rew, done, info, **kwargs): 31 | raise NotImplementedError 32 | 33 | def clear(self): 34 | raise NotImplementedError 35 | 36 | def sample(self, batch_size, idxes=None): 37 | raise NotImplementedError 38 | 39 | def sample_with_next_obs(self, batch_size, next_obs, next_h_state=None, idxes=None): 40 | raise NotImplementedError 41 | 42 | def sample_consecutive(self, batch_size, end_with_done=False): 43 | raise NotImplementedError 44 | 45 | def save(self, save_path, **kwargs): 46 | raise NotImplementedError 47 | 48 | def load(self, load_path, load_rng=True): 49 | raise NotImplementedError 50 | 51 | def close(self): 52 | pass 53 | -------------------------------------------------------------------------------- /scripts/experiments/any_script_any_seeds.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # seeds should be in the following format: "1 2 3 4 5" 4 | SEEDS=($1) 5 | SCRIPT=$2 6 | DEVICE=$3 7 | MAIN_TASK=$4 8 | 9 | # expert dir should just be the lowest level directory before int_X.gz, 10 | # the rest is handled in the individual script files 11 | EXPERT_DIR=$5 12 | USER_MACHINE=$6 13 | EXPERIMENT_NAME=$7 14 | 15 | # optional for DAC/LfGP only 16 | EXPBUF_LAST_SAMPLE_PROP=$8 # default is .95, 0. turns it off 17 | EXPBUF_MODEL_SAMPLE_RATE=$9 # default is .1, 0. 
turns it off 18 | 19 | # optional for LfGP only 20 | SCHEDULER=${10} 21 | 22 | if [ "${SCRIPT}" = "lfgp.bash" ]; then 23 | for seed in "${SEEDS[@]}" 24 | do 25 | bash "${SCRIPT}" "${seed}" "${DEVICE}" "${MAIN_TASK}" "${EXPERT_DIR}" "${USER_MACHINE}" "${SCHEDULER}" \ 26 | "${EXPBUF_LAST_SAMPLE_PROP}" "${EXPBUF_MODEL_SAMPLE_RATE}" "${EXPERIMENT_NAME}" 27 | done 28 | elif [ "${SCRIPT}" = "dac.bash" ]; then 29 | for seed in "${SEEDS[@]}" 30 | do 31 | bash "${SCRIPT}" "${seed}" "${DEVICE}" "${MAIN_TASK}" "${EXPERT_DIR}" "${USER_MACHINE}" \ 32 | "${EXPBUF_LAST_SAMPLE_PROP}" "${EXPBUF_MODEL_SAMPLE_RATE}" "${EXPERIMENT_NAME}" 33 | done 34 | else 35 | for seed in "${SEEDS[@]}" 36 | do 37 | bash "${SCRIPT}" "${seed}" "${DEVICE}" "${MAIN_TASK}" "${EXPERT_DIR}" "${USER_MACHINE}" "${EXPERIMENT_NAME}" 38 | done 39 | fi -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/transforms/general_transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | class Transform: 6 | def __call__(self, obs): 7 | raise NotImplementedError 8 | 9 | def reset(self): 10 | pass 11 | 12 | 13 | class Identity(Transform): 14 | def __call__(self, obs): 15 | return obs 16 | 17 | 18 | class Compose(Transform): 19 | def __init__(self, transforms): 20 | self._transforms = transforms 21 | 22 | def __call__(self, obs): 23 | for transform in self._transforms: 24 | obs = transform(obs) 25 | return obs 26 | 27 | def reset(self): 28 | for transform in self._transforms: 29 | transform.reset() 30 | 31 | 32 | class AsType(Transform): 33 | def __init__(self, dtype=np.float32): 34 | self._dtype = dtype 35 | 36 | def __call__(self, obs): 37 | return obs.astype(self._dtype) 38 | 39 | 40 | class FrameStack(Transform): 41 | def __init__(self, frame_dim): 42 | """ Stack observations along axis 0. Assumes each observation has one fewer dimension than frame_dim. 43 | """ 44 | assert len(frame_dim) > 1 45 | self._frame_dim = frame_dim 46 | self._frames = np.zeros(shape=frame_dim, dtype=np.float32) 47 | 48 | def __call__(self, obs): 49 | self._frames = np.concatenate((self._frames[1:], [obs])) 50 | return self._frames 51 | 52 | def reset(self): 53 | self._frames.fill(0) 54 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/wrappers/action_repeat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import rl_sandbox.constants as c 4 | 5 | 6 | class ActionRepeatWrapper: 7 | def __init__(self, env, action_repeat, discount_factor=1, enable_discounting=False): 8 | assert action_repeat > 0 9 | self._env = env 10 | self._action_repeat = action_repeat 11 | self._enable_discounting = enable_discounting 12 | self._discount_factor = discount_factor if enable_discounting else 1.
13 | 14 | def __getattr__(self, attr): 15 | return getattr(self._env, attr) 16 | 17 | def reset(self, **kwargs): 18 | return self._env.reset(**kwargs) 19 | 20 | def step(self, action, **kwargs): 21 | done = False 22 | cum_reward = 0 23 | num_repeated = 0 24 | infos = { 25 | c.INFOS: [] 26 | } 27 | 28 | while not done and num_repeated < self._action_repeat: 29 | obs, reward, done, info = self._env.step(action, **kwargs) 30 | cum_reward += (self._discount_factor ** num_repeated) * reward 31 | num_repeated += 1 32 | infos[c.INFOS].append(info) 33 | 34 | infos[c.DISCOUNTING] = np.array([num_repeated if self._enable_discounting else 1]) 35 | 36 | return obs, cum_reward, done, infos 37 | 38 | def render(self, **kwargs): 39 | return self._env.render(**kwargs) 40 | 41 | def seed(self, seed): 42 | self._env.seed(seed) 43 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/rce_envs/sawyer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | 4 | from rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state import AuxiliaryReward 5 | 6 | 7 | # reach reward defined based on how rewards are defined for rce envs 8 | def reach(observation, next_observation, **kwargs): 9 | obs = observation 10 | next_obs = next_observation 11 | 12 | ee_pos = obs[:3] 13 | obj_pos = obs[3:6] 14 | next_ee_pos = next_obs[:3] 15 | next_obj_pos = next_obs[3:6] 16 | 17 | d_before = norm(ee_pos - obj_pos) 18 | d_after = norm(next_ee_pos - next_obj_pos) 19 | return d_before - d_after 20 | 21 | 22 | def grasp(observation, next_observation, action, **kwargs): 23 | obs = observation 24 | next_obs = next_observation 25 | 26 | reach_rew = reach(obs, next_obs) 27 | 28 | obj_z_pos = obs[5] 29 | next_obj_z_pos = next_obs[5] 30 | z_inc = next_obj_z_pos - obj_z_pos 31 | grip_pos = obs[6:8] 32 | next_grip_pos = next_obs[6:8] 33 | grip_inc = next_grip_pos[0] - grip_pos[0] - (next_grip_pos[1] - grip_pos[1]) # 2nd index goes negative as it closes 34 | 35 | grasp_rew = z_inc + grip_inc 36 | 37 | return reach_rew + grasp_rew 38 | 39 | class SawyerAuxiliaryReward(AuxiliaryReward): 40 | def __init__(self, env_name, aux_rewards=('reach',)): 41 | aux_reward_funcs = [globals()[ar_str] for ar_str in aux_rewards] 42 | super().__init__(aux_reward_funcs, True) 43 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/gail_experts/convert_to_pytorch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | import h5py 6 | import numpy as np 7 | import torch 8 | 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser( 12 | 'Converts expert trajectories from h5 to pt format.') 13 | parser.add_argument( 14 | '--h5-file', 15 | default='trajs_halfcheetah.h5', 16 | help='input h5 file', 17 | type=str) 18 | parser.add_argument( 19 | '--pt-file', 20 | default=None, 21 | help='output pt file, by default replaces file extension with pt', 22 | type=str) 23 | args = parser.parse_args() 24 | 25 | if args.pt_file is None: 26 | args.pt_file = os.path.splitext(args.h5_file)[0] + '.pt' 27 | 28 | with h5py.File(args.h5_file, 'r') as f: 29 | dataset_size = f['obs_B_T_Do'].shape[0] # full dataset size 30 | 31 | states = f['obs_B_T_Do'][:dataset_size, ...][...] 32 | actions = f['a_B_T_Da'][:dataset_size, ...][...] 
33 | rewards = f['r_B_T'][:dataset_size, ...][...] 34 | lens = f['len_B'][:dataset_size, ...][...] 35 | 36 | states = torch.from_numpy(states).float() 37 | actions = torch.from_numpy(actions).float() 38 | rewards = torch.from_numpy(rewards).float() 39 | lens = torch.from_numpy(lens).long() 40 | 41 | data = { 42 | 'states': states, 43 | 'actions': actions, 44 | 'rewards': rewards, 45 | 'lengths': lens 46 | } 47 | 48 | torch.save(data, args.pt_file) 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_tasks/auxiliary_tasks.py: -------------------------------------------------------------------------------- 1 | class AuxiliaryTask: 2 | def load_state_dict(self, state_dict): 3 | pass 4 | 5 | def state_dict(self): 6 | pass 7 | 8 | def compute_loss(self, next_obs, next_h_state): 9 | return 0., dict() 10 | 11 | def zero_grad(self): 12 | pass 13 | 14 | def step(self): 15 | pass 16 | 17 | 18 | class AuxiliaryTasks(AuxiliaryTask): 19 | def __init__(self, aux_tasks): 20 | super().__init__() 21 | self._aux_tasks = aux_tasks 22 | 23 | def load_state_dict(self, state_dict): 24 | for task_name, task_state_dict in state_dict.items(): 25 | assert task_name in self._aux_tasks 26 | self._aux_tasks[task_name].load_state_dict(task_state_dict) 27 | 28 | def state_dict(self): 29 | state_dict = dict() 30 | for task_name, task in self._aux_tasks.items(): 31 | state_dict[task_name] = task.state_dict() 32 | return state_dict 33 | 34 | def compute_loss(self, next_obs, next_h_state): 35 | update_info = dict() 36 | 37 | total_loss = 0 38 | for task_name, task in self._aux_tasks.items(): 39 | loss = task.compute_loss(next_obs, next_h_state) 40 | update_info[task_name] = loss.detach().cpu() 41 | 42 | total_loss += loss 43 | 44 | return total_loss, update_info 45 | 46 | def zero_grad(self): 47 | for task in self._aux_tasks.values(): 48 | task.opt.zero_grad() 49 | 50 | def step(self): 51 | for task in self._aux_tasks.values(): 52 | task.opt.step() 53 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/gail_experts/convert_lfgp_expert_data.py: -------------------------------------------------------------------------------- 1 | import _pickle as pickle 2 | import gzip 3 | import numpy as np 4 | import torch 5 | import os 6 | import argparse 7 | 8 | TOP_DIR=os.path.join(os.environ['LFGP_TOP_DIR'], 'play_xyz', 'expert-data') 9 | MID="reset/800_steps-90_sp_point5_play_open_200_extra_lasts/" 10 | END="int_2.gz" 11 | TASKS = ['stack_0', 'bring_0', 'insert_0', 'unstack_stack_env_only_0'] 12 | EXPERT_PATH_DICT = { 13 | "stack_0": os.path.join(TOP_DIR, "open-close-stack-lift-reach-move", MID, END), 14 | "bring_0": os.path.join(TOP_DIR, "open-close-bring-lift-reach-move", MID, END), 15 | "insert_0": os.path.join(TOP_DIR, "open-close-insert-bring-lift-reach-move", MID, END), 16 | "unstack_stack_env_only_0": os.path.join(TOP_DIR, "open-close-unstackstack-lift-reach-move-35M", MID, END), 17 | } 18 | 19 | for t in TASKS: 20 | # src_path = f"data/{t}-expert_data/reset/int_2.gz" 21 | src_path = EXPERT_PATH_DICT[t] 22 | dst_path = f"expert-data/{t}/{MID}" 23 | dst_file = 'int_2.gz' 24 | 25 | src_data = pickle.load(gzip.open(src_path, "rb")) 26 | 27 | print(src_data.keys()) 28 | 29 | ep_start_idxes = [0] 30 | for idx, (curr_obs, next_obs) in enumerate(zip(src_data["observations"][1:], src_data["next_observations"][:-1])): 31 | if np.any(curr_obs != next_obs): 32 | 
ep_start_idxes.append(idx + 1) 33 | 34 | num_eps = len(ep_start_idxes) 35 | 36 | print(num_eps) 37 | 38 | data = { 39 | 'states': src_data["observations"][:, :-1], 40 | 'actions': src_data["actions"], 41 | } 42 | 43 | os.makedirs(dst_path, exist_ok=True) 44 | torch.save(data, os.path.join(dst_path, dst_file)) -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/generate_tmux_yaml.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import yaml 4 | 5 | parser = argparse.ArgumentParser(description='Process some integers.') 6 | parser.add_argument( 7 | '--num-seeds', 8 | type=int, 9 | default=4, 10 | help='number of random seeds to generate') 11 | parser.add_argument( 12 | '--env-names', 13 | default="PongNoFrameskip-v4", 14 | help='environment name separated by semicolons') 15 | args = parser.parse_args() 16 | 17 | ppo_mujoco_template = "python main.py --env-name {0} --algo ppo --use-gae --log-interval 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --value-loss-coef 0.5 --ppo-epoch 10 --num-mini-batch 32 --gamma 0.99 --tau 0.95 --num-env-steps 1000000 --use-linear-lr-decay --no-cuda --log-dir /tmp/gym/{1}/{1}-{2} --seed {2} --use-proper-time-limits" 18 | 19 | ppo_atari_template = "env CUDA_VISIBLE_DEVICES={2} python main.py --env-name {0} --algo ppo --use-gae --lr 2.5e-4 --clip-param 0.1 --value-loss-coef 0.5 --num-processes 8 --num-steps 128 --num-mini-batch 4 --log-interval 1 --use-linear-lr-decay --entropy-coef 0.01 --log-dir /tmp/gym/{1}/{1}-{2} --seed {2}" 20 | 21 | template = ppo_atari_template 22 | 23 | config = {"session_name": "run-all", "windows": []} 24 | 25 | for i in range(args.num_seeds): 26 | panes_list = [] 27 | for env_name in args.env_names.split(';'): 28 | panes_list.append( 29 | template.format(env_name, 30 | env_name.split('-')[0].lower(), i)) 31 | 32 | config["windows"].append({ 33 | "window_name": "seed-{}".format(i), 34 | "panes": panes_list 35 | }) 36 | 37 | yaml.dump(config, open("run_all.yaml", "w"), default_flow_style=False) 38 | -------------------------------------------------------------------------------- /scripts/evaluation/visualize_model.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | MODEL_PATH_AFTER_TOP="$2" 5 | MODEL_NAME="$3" 6 | CONFIG_NAME="$4" 7 | NUM_EPISODES="$5" 8 | INTENTION="$6" 9 | RENDER="$7" 10 | STOCHASTIC="$8" 11 | FORCED_SCHEDULE="$9" 12 | 13 | # some ideas for what you might want to try with forced schedule! 
14 | # FORCED_SCHEDULE="{0: 4, 45: 3, 90: 5, 135: 2, 180: 0}" 15 | # FORCED_SCHEDULE="{0: 3, 25: 2, 50: 4, 75: 5, 100: 1, 125: 3, 150: 4, 175: 2, 200: 1}" 16 | # FORCED_SCHEDULE="{0: 3, 90: 2, 180: 0}" 17 | # FORCED_SCHEDULE="{0: 4, 45: 3, 90: 2, 180: 0}" 18 | # FORCED_SCHEDULE="{0: 4, 45: 3, 90: 2, 135: 0, 180: 4, 225: 3, 270: 2, 315: 0}" 19 | # FORCED_SCHEDULE="{0: 3, 45: 2, 90: 0, 135: 3, 180: 2, 225: 0, 270: 3, 315: 2}" 20 | # FORCED_SCHEDULE="{0: 5, 90: 2, 180: 0}" 21 | # FORCED_SCHEDULE="{0: 5, 45: 2, 90: 4, 135: 2, 180: 3, 225: 2, 270: 2, 315: 5}" # realistic WRS ep 22 | 23 | 24 | DEFAULT_TOP_DIR="../../lfgp_data/trained_models/" 25 | TOP_DIR=${LFGP_MODEL_TOP_DIR:=${DEFAULT_TOP_DIR}} 26 | echo "Using TOP_DIR OF ${TOP_DIR}" 27 | 28 | COMMON_TOP="${TOP_DIR}/${MODEL_PATH_AFTER_TOP}" 29 | MODEL_PATH="${COMMON_TOP}/${MODEL_NAME}" 30 | CONFIG_PATH="${COMMON_TOP}/${CONFIG_NAME}" 31 | 32 | PYTHON_TO_EXEC=$(cat <<-END 33 | ../../rl_sandbox/rl_sandbox/examples/eval_tools/evaluate.py 34 | --seed=${SEED} 35 | --model_path=${MODEL_PATH} 36 | --config_path=${CONFIG_PATH} 37 | --num_episodes=${NUM_EPISODES} 38 | --intention=${INTENTION} 39 | --model_path=${MODEL_PATH} 40 | --forced_schedule=${FORCED_SCHEDULE} 41 | --force_egl 42 | END 43 | ) 44 | 45 | if [ "${RENDER}" = "true" ]; then 46 | PYTHON_TO_EXEC+=" --render" 47 | fi 48 | 49 | if [ "${STOCHASTIC}" = "true" ]; then 50 | PYTHON_TO_EXEC+=" --stochastic" 51 | fi 52 | 53 | python ${PYTHON_TO_EXEC} -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | trained_models/ 104 | .fuse_hidden* 105 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from a2c_ppo_acktr.envs import VecNormalize 8 | 9 | 10 | # Get a render function 11 | def get_render_func(venv): 12 | if hasattr(venv, 'envs'): 13 | return venv.envs[0].render 14 | elif hasattr(venv, 'venv'): 15 | return get_render_func(venv.venv) 16 | elif hasattr(venv, 'env'): 17 | return get_render_func(venv.env) 18 | 19 | return None 20 | 21 | 22 | def get_vec_normalize(venv): 23 | if isinstance(venv, VecNormalize): 24 | return venv 25 | elif hasattr(venv, 'venv'): 26 | return get_vec_normalize(venv.venv) 27 | 28 | return None 29 | 30 | 31 | # Necessary for my KFAC implementation. 
32 | class AddBias(nn.Module): 33 | def __init__(self, bias): 34 | super(AddBias, self).__init__() 35 | self._bias = nn.Parameter(bias.unsqueeze(1)) 36 | 37 | def forward(self, x): 38 | if x.dim() == 2: 39 | bias = self._bias.t().view(1, -1) 40 | else: 41 | bias = self._bias.t().view(1, -1, 1, 1) 42 | 43 | return x + bias 44 | 45 | 46 | def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr): 47 | """Decreases the learning rate linearly""" 48 | lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs))) 49 | for param_group in optimizer.param_groups: 50 | param_group['lr'] = lr 51 | 52 | 53 | def init(module, weight_init, bias_init, gain=1): 54 | weight_init(module.weight.data, gain=gain) 55 | bias_init(module.bias.data) 56 | return module 57 | 58 | 59 | def cleanup_log_dir(log_dir): 60 | try: 61 | os.makedirs(log_dir) 62 | except OSError: 63 | files = glob.glob(os.path.join(log_dir, '*.monitor.csv')) 64 | for f in files: 65 | os.remove(f) 66 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/agents/random_agents.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import rl_sandbox.constants as c 4 | 5 | 6 | class UniformContinuousAgent: 7 | def __init__(self, min_action, max_action, rng=np.random): 8 | self.min_action = np.array(min_action) 9 | self.max_action = np.array(max_action) 10 | self.entropy = np.log(max_action - min_action).astype(np.float32) 11 | self.log_prob = -self.entropy.sum(keepdims=True).astype(np.float32) 12 | self._act_info = { 13 | c.LOG_PROB: self.log_prob, 14 | c.ENTROPY: self.entropy, 15 | c.VALUE: np.array([np.nan], dtype=np.float32), 16 | c.MEAN: ((max_action + min_action) / 2).astype(np.float32), 17 | c.VARIANCE: (((max_action - min_action) ** 2) / 2).astype(np.float32), 18 | } 19 | self.rng = rng 20 | 21 | def compute_action(self, **kwargs): 22 | return self.rng.uniform(self.min_action, self.max_action).astype(np.float32), None, self._act_info 23 | 24 | def reset(self): 25 | return None 26 | 27 | 28 | class UniformContinuousActionRepeatAgent(UniformContinuousAgent): 29 | def __init__(self, min_action, max_action, max_repeat, min_repeat=1, rng=np.random): 30 | super().__init__(min_action, max_action, rng) 31 | self.min_repeat = min_repeat 32 | self.max_repeat = max_repeat 33 | self._cur_action = super().compute_action() 34 | self.reset() 35 | 36 | def compute_action(self, **kwargs): 37 | if self._ts >= self._action_repeat: 38 | self._cur_action = super().compute_action() 39 | self._action_repeat = self.rng.randint(self.min_repeat, self.max_repeat) 40 | self._ts = 0 41 | self._ts += 1 42 | 43 | return self._cur_action 44 | 45 | def reset(self): 46 | self._ts = 0 47 | self._action_repeat = self.rng.randint(self.min_repeat, self.max_repeat) 48 | return None -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_tasks/utils.py: -------------------------------------------------------------------------------- 1 | import rl_sandbox.constants as c 2 | 3 | from rl_sandbox.auxiliary_tasks.auxiliary_tasks import AuxiliaryTasks 4 | from rl_sandbox.auxiliary_tasks.koopman import Koopman, KoopmanDynamics 5 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 6 | 7 | 8 | def make_auxiliary_tasks(tasks, model, buffer, cfg): 9 | aux_tasks = dict() 10 | if tasks is not None: 11 | for task_name, task_setting in tasks.items(): 12 | assert task_name not in 
aux_tasks 13 | if task_name == c.KOOPMAN: 14 | task_setting[c.MODEL_SETTING][c.KWARGS][c.LAYERS_DIM] = model.encoder.layers_dim 15 | decoder = make_model(task_setting[c.MODEL_SETTING]).to(task_setting[c.DEVICE]) 16 | dynamics = KoopmanDynamics(z_dim=task_setting[c.Z_DIM], 17 | u_dim=task_setting[c.U_DIM], 18 | device=task_setting[c.DEVICE]) 19 | aux_opt = make_optimizer(list(decoder.parameters()) + list(dynamics.parameters()), task_setting[c.OPTIMIZER_SETTING]) 20 | 21 | aux_tasks[c.KOOPMAN] = Koopman(rec_dim=task_setting[c.REC_DIM], 22 | batch_size=task_setting[c.BATCH_SIZE], 23 | decoder=decoder, 24 | encoder=model.encoder, 25 | dynamics=dynamics, 26 | opt=aux_opt, 27 | buffer=buffer, 28 | algo_params=cfg, 29 | reduction=task_setting[c.REDUCTION], 30 | loss_coef=task_setting[c.LOSS_COEF], 31 | device=task_setting[c.DEVICE]) 32 | else: 33 | raise NotImplementedError 34 | return AuxiliaryTasks(aux_tasks) 35 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/wrappers/absorbing_state.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import rl_sandbox.constants as c 4 | 5 | from rl_sandbox.envs.wrappers.wrapper import Wrapper 6 | 7 | 8 | class AbsorbingStateWrapper(Wrapper): 9 | def __init__(self, env, create_absorbing_state, max_episode_length): 10 | super().__init__(env) 11 | self._done = False 12 | self._obs = None 13 | self._max_episode_length = max_episode_length 14 | self._create_absorbing_state = create_absorbing_state 15 | self._prev_info = None 16 | 17 | def _get_obs(self): 18 | if self._done: 19 | # Return absorbing state which is [0, ..., 0, 1] 20 | return np.eye(self._obs.size + 1)[-1] 21 | 22 | return np.concatenate((self._obs.reshape(-1), [0]), axis=0) 23 | 24 | def reset(self, **kwargs): 25 | self._curr_timestep = 0 26 | self._obs = self._env.reset(**kwargs) 27 | self._done = False 28 | return self._get_obs() 29 | 30 | def step(self, action, **kwargs): 31 | self._curr_timestep += 1 32 | if self._create_absorbing_state and self._done: 33 | return self._get_obs(), 0., True, {**self._prev_info, c.ABSORBING_STATE: True, c.DONE: False} 34 | 35 | self._obs, reward, done, info = self._env.step(action, **kwargs) 36 | self._prev_info = info 37 | info[c.ABSORBING_STATE] = False 38 | info[c.DONE] = done 39 | if self._create_absorbing_state and self._curr_timestep < self._max_episode_length and done: 40 | self._done = True 41 | done = False # otherwise env will reset without getting next absorbing state 42 | return self._get_obs(), reward, done, info 43 | 44 | def render(self, **kwargs): 45 | return self._env.render(**kwargs) 46 | 47 | def seed(self, seed): 48 | self._env.seed(seed) 49 | 50 | 51 | def check_absorbing(config): 52 | for wrapper in config[c.ENV_SETTING][c.ENV_WRAPPERS]: 53 | if wrapper[c.WRAPPER] == AbsorbingStateWrapper: 54 | return True 55 | return False 56 | -------------------------------------------------------------------------------- /scripts/experiments/bc.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | EXPERIMENT_NAME="${EXPERT_DIR}_$6" 9 | 10 | 11 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 12 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 13 | echo "Using TOP_DIR OF ${TOP_DIR}" 14 | 15 | DEFAULT_STACK_DIR="stack/" 16 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 17 | 18 | 
DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 19 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 20 | 21 | DEFAULT_BRING_DIR="bring/" 22 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 23 | 24 | DEFAULT_INSERT_DIR="insert/" 25 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 26 | 27 | 28 | if [ "${MAIN_TASK}" = "stack" ]; then 29 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 30 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 31 | EXPERT_PATH="${PRE}2.gz" 32 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 33 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 34 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 35 | EXPERT_PATH="${PRE}2.gz" 36 | elif [ "${MAIN_TASK}" = "bring" ]; then 37 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 38 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 39 | EXPERT_PATH="${PRE}2.gz" 40 | elif [ "${MAIN_TASK}" = "insert" ]; then 41 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 42 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 43 | EXPERT_PATH="${PRE}2.gz" 44 | else 45 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 46 | exit 1 47 | fi 48 | 49 | 50 | echo "Running BC (early stopping) for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 51 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 52 | 53 | PYTHON_TO_EXEC=$(cat <<-END 54 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_bc.py 55 | --seed=${SEED} 56 | --expert_path=${EXPERT_PATH} 57 | --main_task=${MAIN_TASK}_0 58 | --device=${DEVICE} 59 | --exp_name=${EXPERIMENT_NAME} 60 | --user_machine=${USER_MACHINE} 61 | END 62 | ) 63 | 64 | if [[ "${DEVICE}" == *"cuda"* ]]; then 65 | PYTHON_TO_EXEC+=" --gpu_buffer" 66 | fi 67 | 68 | python ${PYTHON_TO_EXEC} -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/wrappers/buffer_wrapper.py: -------------------------------------------------------------------------------- 1 | from rl_sandbox.buffers.buffer import Buffer 2 | 3 | 4 | class BufferWrapper(Buffer): 5 | def __init__(self, buffer): 6 | self.buffer = buffer 7 | 8 | def __getattr__(self, attr): 9 | return getattr(self.buffer, attr) 10 | 11 | def sample(self, batch_size, idxes=None): 12 | return self.buffer.sample(batch_size, idxes) 13 | 14 | # def sample_with_next_obs(self, batch_size, next_obs, next_h_state=None, idxes=None): 15 | # return self.buffer.sample_with_next_obs(batch_size, next_obs, next_h_state, idxes) 16 | def sample_with_next_obs(self, *args, **kwargs): 17 | return self.buffer.sample_with_next_obs(*args, **kwargs) 18 | 19 | def sample_consecutive(self, batch_size, end_with_done=False): 20 | return self.buffer.sample_consecutive(batch_size, end_with_done) 21 | 22 | def sample_init_obs(self, batch_size): 23 | return self.buffer.sample_init_obs(batch_size) 24 | 25 | def sample_trajs(self, batch_size, next_obs, idxes=None, horizon_length=2): 26 | return self.buffer.sample_trajs(batch_size, next_obs, idxes, horizon_length) 27 | 28 | @property 29 | def memory_size(self): 30 | return self.buffer.memory_size 31 | 32 | @property 33 | def is_full(self): 34 | return self.buffer.is_full 35 | 36 | def __len__(self): 37 | return len(self.buffer) 38 | 39 | def push(self, obs, h_state, act, rew, done, info, *args, **kwargs): 40 | self.buffer.push(obs, h_state, act, rew, done, info, *args, **kwargs) 41 | 42 | def clear(self): 43 | return self.buffer.clear() 44 | 45 | def save(self, save_path, **kwargs): 46 | return self.buffer.save(save_path, **kwargs) 47 | 48 | # def load(self, load_path, load_rng=True): 49 | # return 
self.buffer.load(load_path, load_rng=load_rng) 50 | def load(self, *args, **kwargs): 51 | return self.buffer.load(*args, **kwargs) 52 | 53 | def transfer_data(self, load_path): 54 | return self.buffer.transfer_data(load_path) 55 | 56 | def close(self): 57 | return self.buffer.close() 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # output 2 | *.out 3 | results/ 4 | lfgp_data/ 5 | rce/exp_data 6 | scripts/lfebp/create_data 7 | 8 | # gail outputs 9 | pytorch-a2c-ppo-acktr-gail/gail_experts/data/ 10 | pytorch-a2c-ppo-acktr-gail/scripts/eval_logs/ 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | .envrc 104 | 105 | # Spyder project settings 106 | .spyderproject 107 | .spyproject 108 | 109 | # Rope project settings 110 | .ropeproject 111 | 112 | # mkdocs documentation 113 | /site 114 | 115 | # mypy 116 | .mypy_cache/ 117 | 118 | # vscode 119 | .vscode 120 | 121 | # lfgp 122 | pytorch-a2c-ppo-acktr-gail/gail_experts/expert-data 123 | pytorch-a2c-ppo-acktr-gail/scripts/eval_logs/ 124 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | os.environ["MUJOCO_GL"] = "egl" 5 | 6 | import rl_sandbox.constants as c 7 | from rl_sandbox.envs.wrappers.absorbing_state import AbsorbingStateWrapper 8 | 9 | def make_env(env_config, seed=None, dummy_env=False): 10 | assert env_config[c.ENV_TYPE] in c.VALID_ENV_TYPE 11 | if env_config[c.ENV_TYPE] == c.GYM: 12 | import gym 13 | import pybullet_envs 14 | env = gym.make(**env_config[c.ENV_BASE]) 15 | elif env_config[c.ENV_TYPE] == c.DM_CONTROL: 16 | from dm_control import suite 17 | env = suite.load(**env_config[c.ENV_BASE]) 18 | elif env_config[c.ENV_TYPE] == c.MANIPULATOR_LEARNING: 19 | import manipulator_learning.sim.envs as manlearn_envs 20 | env = getattr(manlearn_envs, 21 | 
env_config[c.ENV_BASE][c.ENV_NAME])(dense_reward=False, **env_config.get(c.KWARGS, {})) 22 | elif env_config[c.ENV_TYPE] in [c.SAWYER, c.HAND_DAPG]: 23 | import rl_sandbox.envs.rce_envs as rce_envs 24 | env = rce_envs.load_env(env_config[c.ENV_BASE][c.ENV_NAME], gym_env=True, **env_config.get(c.KWARGS, {})) 25 | elif env_config[c.ENV_TYPE] == c.PANDA_RL_ENVS: 26 | import panda_rl_envs 27 | env_kwargs = env_config.get(c.KWARGS, {}) 28 | env_config_dict = env_kwargs.get("config_dict", {}) 29 | env_config_dict['dummy_env'] = dummy_env 30 | env_kwargs['config_dict'] = env_config_dict 31 | env = getattr(panda_rl_envs, env_config[c.ENV_BASE][c.ENV_NAME])(**env_kwargs) 32 | else: 33 | raise NotImplementedError 34 | 35 | for wrapper_config in env_config[c.ENV_WRAPPERS]: 36 | env = wrapper_config[c.WRAPPER](env, **wrapper_config[c.KWARGS]) 37 | 38 | if seed is None: 39 | seed = np.random.randint(0, 2 ** 32 - 1) 40 | 41 | env.seed(seed) 42 | 43 | return env 44 | 45 | 46 | def absorbing_check(algo_params): 47 | absorbing_in_settings = False 48 | if c.ENV_WRAPPERS in algo_params[c.ENV_SETTING]: 49 | for wrapper in algo_params[c.ENV_SETTING][c.ENV_WRAPPERS]: 50 | if wrapper[c.WRAPPER] == AbsorbingStateWrapper: 51 | absorbing_in_settings = True 52 | 53 | return absorbing_in_settings -------------------------------------------------------------------------------- /scripts/experiments/bc_no_overfit.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | EXPERIMENT_NAME="${EXPERT_DIR}_$6" 9 | 10 | 11 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 12 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 13 | echo "Using TOP_DIR OF ${TOP_DIR}" 14 | 15 | DEFAULT_STACK_DIR="stack/" 16 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 17 | 18 | DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 19 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 20 | 21 | DEFAULT_BRING_DIR="bring/" 22 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 23 | 24 | DEFAULT_INSERT_DIR="insert/" 25 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 26 | 27 | 28 | if [ "${MAIN_TASK}" = "stack" ]; then 29 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 30 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 31 | EXPERT_PATH="${PRE}2.gz" 32 | NUM_TRAINING=20 33 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 34 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 35 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 36 | EXPERT_PATH="${PRE}2.gz" 37 | NUM_TRAINING=20 38 | elif [ "${MAIN_TASK}" = "bring" ]; then 39 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 40 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 41 | EXPERT_PATH="${PRE}2.gz" 42 | NUM_TRAINING=20 43 | elif [ "${MAIN_TASK}" = "insert" ]; then 44 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 45 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 46 | EXPERT_PATH="${PRE}2.gz" 47 | NUM_TRAINING=40 48 | else 49 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 50 | exit 1 51 | fi 52 | 53 | 54 | echo "Running BC for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 55 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 
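# The command below is assembled as a single string (heredoc) so optional flags can be
# appended before execution -- here, --gpu_buffer is added whenever DEVICE contains "cuda".
# A minimal standalone sketch of the same pattern, with illustrative names only:
#   CMD=$(cat <<-END
#   some_script.py --seed=${SEED}
#   END
#   )
#   if [[ "${DEVICE}" == *"cuda"* ]]; then CMD+=" --extra_flag"; fi
#   python ${CMD}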
56 | 57 | PYTHON_TO_EXEC=$(cat <<-END 58 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_bc_no_overfit.py 59 | --seed=${SEED} 60 | --expert_path=${EXPERT_PATH} 61 | --main_task=${MAIN_TASK}_0 62 | --device=${DEVICE} 63 | --exp_name=${EXPERIMENT_NAME} 64 | --user_machine=${USER_MACHINE} 65 | --num_training=${NUM_TRAINING} 66 | --num_updates=100000 67 | --num_evals=50 68 | END 69 | ) 70 | 71 | if [[ "${DEVICE}" == *"cuda"* ]]; then 72 | PYTHON_TO_EXEC+=" --gpu_buffer" 73 | fi 74 | 75 | python ${PYTHON_TO_EXEC} 76 | -------------------------------------------------------------------------------- /scripts/experiments/multi_bc.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | EXPERIMENT_NAME="${EXPERT_DIR}_$6" 9 | 10 | 11 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 12 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 13 | echo "Using TOP_DIR OF ${TOP_DIR}" 14 | 15 | DEFAULT_STACK_DIR="stack/" 16 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 17 | 18 | DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 19 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 20 | 21 | DEFAULT_BRING_DIR="bring/" 22 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 23 | 24 | DEFAULT_INSERT_DIR="insert/" 25 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 26 | 27 | 28 | if [ "${MAIN_TASK}" = "stack" ]; then 29 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 30 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 31 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 32 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 33 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 34 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 35 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 36 | elif [ "${MAIN_TASK}" = "bring" ]; then 37 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 38 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 39 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 40 | elif [ "${MAIN_TASK}" = "insert" ]; then 41 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 42 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 43 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz,${PRE}6.gz" 44 | else 45 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 46 | exit 1 47 | fi 48 | 49 | 50 | echo "Running Multi BC (early stopping) for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 51 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 
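# Multitask BC consumes one expert buffer per intention, passed to --expert_path as a single
# comma-separated string (int_0.gz ... int_5.gz above, plus int_6.gz for insert).
# Example invocation, with illustrative argument values:
#   bash multi_bc.bash 1 cuda:0 stack 800_steps local my_experiment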
52 | 53 | PYTHON_TO_EXEC=$(cat <<-END 54 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_multitask_bc.py 55 | --seed=${SEED} 56 | --expert_path=${EXPERT_PATHS} 57 | --main_task=${MAIN_TASK}_0 58 | --device=${DEVICE} 59 | --exp_name=${EXPERIMENT_NAME} 60 | --user_machine=${USER_MACHINE} 61 | END 62 | ) 63 | 64 | if [[ "${DEVICE}" == *"cuda"* ]]; then 65 | PYTHON_TO_EXEC+=" --gpu_buffer" 66 | fi 67 | 68 | python ${PYTHON_TO_EXEC} 69 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/disk_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import torch 4 | 5 | from rl_sandbox.buffers.ram_buffer import NumPyBuffer 6 | 7 | class DiskNumPyBuffer(NumPyBuffer): 8 | def __init__(self, 9 | memory_size, 10 | obs_dim, 11 | h_state_dim, 12 | action_dim, 13 | reward_dim, 14 | infos=dict(), 15 | disk_dir="./", 16 | history_length=0, 17 | checkpoint_interval=0, 18 | checkpoint_path=None, 19 | rng=np.random, 20 | dtype=np.float32): 21 | self.rng = rng 22 | self._memory_size = memory_size 23 | self._dtype = dtype 24 | os.makedirs(f"{disk_dir}", exist_ok=True) 25 | self.observations = np.memmap(filename=f"{disk_dir}/observations.npy", mode="w+", shape=(memory_size, *obs_dim), dtype=dtype) 26 | self.hidden_states = np.memmap(filename=f"{disk_dir}/hidden_states.npy", mode="w+", shape=(memory_size, *h_state_dim), dtype=dtype) 27 | self.actions = np.memmap(filename=f"{disk_dir}/actions.npy", mode="w+", shape=(memory_size, *action_dim), dtype=dtype) 28 | self.rewards = np.memmap(filename=f"{disk_dir}/rewards.npy", mode="w+", shape=(memory_size, *reward_dim), dtype=dtype) 29 | self.dones = np.memmap(filename=f"{disk_dir}/dones.npy", mode="w+", shape=(memory_size, 1), dtype=np.bool) 30 | self.infos = dict() 31 | for info_name, (info_shape, info_dtype) in infos.items(): 32 | self.infos[info_name] = np.memmap(filename=f"{disk_dir}/{info_name}.npy", mode="w+", shape=(memory_size, *info_shape), dtype=info_dtype) 33 | 34 | self._checkpoint_interval = checkpoint_interval 35 | self._checkpoint_idxes = np.ones(shape=memory_size, dtype=np.bool) 36 | if checkpoint_path is not None and memory_size >= checkpoint_interval > 0: 37 | self._checkpoint_path = checkpoint_path 38 | os.makedirs(checkpoint_path, exist_ok=True) 39 | self.checkpoint = self._checkpoint 40 | self._checkpoint_count = 0 41 | else: 42 | self.checkpoint = lambda: None 43 | 44 | self._pointer = 0 45 | self._count = 0 46 | self.history_length = history_length 47 | self.history_frame = np.zeros(shape=(history_length, *obs_dim), dtype=dtype) 48 | 49 | def save(self, save_path, end_with_done=True): 50 | pass 51 | -------------------------------------------------------------------------------- /scripts/experiments/dac.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | EXPBUF_LAST_SAMPLE_PROP=$6 # default is .95, 0. turns it off 9 | EXPBUF_MODEL_SAMPLE_RATE=$7 # default is .1, 0. 
turns it off 10 | EXPERIMENT_NAME="${EXPERT_DIR}_$8" 11 | 12 | 13 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 14 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 15 | echo "Using TOP_DIR OF ${TOP_DIR}" 16 | 17 | DEFAULT_STACK_DIR="stack/" 18 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 19 | 20 | DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 21 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 22 | 23 | DEFAULT_BRING_DIR="bring/" 24 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 25 | 26 | DEFAULT_INSERT_DIR="insert/" 27 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 28 | 29 | 30 | if [ "${MAIN_TASK}" = "stack" ]; then 31 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 32 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 33 | EXPERT_PATH="${PRE}2.gz" 34 | MAX_STEPS=2000000 35 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 36 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 37 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 38 | EXPERT_PATH="${PRE}2.gz" 39 | MAX_STEPS=2000000 40 | elif [ "${MAIN_TASK}" = "bring" ]; then 41 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 42 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 43 | EXPERT_PATH="${PRE}2.gz" 44 | MAX_STEPS=2000000 45 | elif [ "${MAIN_TASK}" = "insert" ]; then 46 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 47 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 48 | EXPERT_PATH="${PRE}2.gz" 49 | MAX_STEPS=4000000 50 | else 51 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 52 | exit 1 53 | fi 54 | 55 | echo "Running DAC for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 56 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 57 | 58 | PYTHON_TO_EXEC=$(cat <<-END 59 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_dac.py 60 | --seed ${SEED} 61 | --user_machine ${USER_MACHINE} 62 | --expert_path ${EXPERT_PATH} 63 | --main_task ${MAIN_TASK}_0 64 | --exp_name ${EXPERIMENT_NAME} 65 | --device ${DEVICE} 66 | --num_evals 50 67 | --max_steps ${MAX_STEPS} 68 | --expbuf_last_sample_prop=${EXPBUF_LAST_SAMPLE_PROP} 69 | --expbuf_model_sample_rate=${EXPBUF_MODEL_SAMPLE_RATE} 70 | END 71 | ) 72 | 73 | if [[ "${DEVICE}" == *"cuda"* ]]; then 74 | PYTHON_TO_EXEC+=" --gpu_buffer" 75 | fi 76 | 77 | python ${PYTHON_TO_EXEC} 78 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac_x/sac_x.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | import rl_sandbox.constants as c 4 | 5 | 6 | class SACX: 7 | def __init__(self, update_scheduler, update_intentions, algo_params): 8 | self.update_scheduler = update_scheduler 9 | self.update_intentions = update_intentions 10 | self.algo_params = algo_params 11 | self.buffer = update_intentions.buffer 12 | self.step = 0 13 | 14 | if hasattr(self.update_intentions, '_use_absorbing_state'): 15 | self._use_absorbing_state = self.update_intentions._use_absorbing_state 16 | else: 17 | self._use_absorbing_state = False 18 | 19 | def state_dict(self): 20 | state_dict = { 21 | c.SCHEDULER: self.update_scheduler.state_dict(), 22 | c.INTENTIONS: self.update_intentions.state_dict(), 23 | } 24 | return state_dict 25 | 26 | def load_state_dict(self, state_dict): 27 | self.update_scheduler.load_state_dict(state_dict[c.SCHEDULER]) 28 | self.update_intentions.load_state_dict(state_dict[c.INTENTIONS]) 29 | 30 | def update(self, curr_obs, curr_h_state, act, rew, done, info, next_obs, next_h_state, 31 | update_intentions=True, update_scheduler=True, update_buffer=True, 
update_info={}): 32 | update_info = update_info 33 | 34 | # Intention Learning 35 | updated_intentions = False 36 | if update_intentions: 37 | tic = timeit.default_timer() 38 | updated_intentions, intentions_info = self.update_intentions.update( 39 | curr_obs, curr_h_state, act, rew, done, info, next_obs, next_h_state, update_buffer=update_buffer) 40 | toc = timeit.default_timer() 41 | if updated_intentions: 42 | update_info[c.INTENTIONS_UPDATE_TIME] = toc - tic 43 | update_info.update(intentions_info) 44 | 45 | # Scheduler Learning 46 | if update_scheduler: 47 | self.step += 1 48 | tic = timeit.default_timer() 49 | updated_scheduler, scheduler_info = self.update_scheduler.update(curr_obs, act, rew, done, info) 50 | toc = timeit.default_timer() 51 | if updated_scheduler: 52 | update_info[c.SCHEDULER_UPDATE_TIME] = toc - tic 53 | update_info.update(scheduler_info) 54 | 55 | return updated_intentions, update_info 56 | 57 | def reset(self): 58 | if hasattr(self.update_scheduler, 'reset'): 59 | self.update_scheduler.reset() -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/wrappers/torch_buffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import rl_sandbox.constants as c 4 | 5 | from rl_sandbox.buffers.wrappers.buffer_wrapper import BufferWrapper 6 | 7 | 8 | class TorchBuffer(BufferWrapper): 9 | def __init__(self, buffer): 10 | super().__init__(buffer) 11 | 12 | def _convert_batch_to_torch(self, obss, h_states, acts, rews, dones, infos, lengths): 13 | obss = torch.as_tensor(obss).float() 14 | h_states = torch.as_tensor(h_states).float() 15 | acts = torch.as_tensor(acts).float() 16 | rews = torch.as_tensor(rews).float() 17 | dones = torch.as_tensor(dones).long() 18 | infos = {k: torch.as_tensor(v) for k, v in infos.items()} 19 | lengths = torch.as_tensor(lengths).long() 20 | 21 | return obss, h_states, acts, rews, dones, infos, lengths 22 | 23 | def sample(self, batch_size, idxes=None): 24 | obss, h_states, acts, rews, dones, infos, lengths, idxes = super().sample(batch_size, idxes) 25 | return self._convert_batch_to_torch(obss, h_states, acts, rews, dones, infos, lengths) 26 | 27 | def sample_with_next_obs(self, batch_size, next_obs, next_h_state=None, idxes=None): 28 | obss, h_states, acts, rews, dones, next_obss, next_h_states, infos, lengths, _ = super().sample_with_next_obs(batch_size, next_obs, next_h_state, idxes) 29 | obss, h_states, acts, rews, dones, infos, lengths = self._convert_batch_to_torch(obss, h_states, acts, rews, dones, infos, lengths) 30 | next_obss = torch.as_tensor(next_obss).float() 31 | next_h_states = torch.as_tensor(next_h_states) 32 | 33 | return obss, h_states, acts, rews, dones, next_obss, next_h_states, infos, lengths 34 | 35 | def sample_consecutive(self, batch_size, end_with_done=False): 36 | obss, h_states, acts, rews, dones, infos, lengths, _ = super().sample_consecutive(batch_size, end_with_done) 37 | return self._convert_batch_to_torch(obss, h_states, acts, rews, dones, infos, lengths) 38 | 39 | def sample_init_obs(self, batch_size): 40 | obss, h_states = super().sample_init_obs(batch_size) 41 | return torch.as_tensor(obss).float(), torch.as_tensor(h_states).float() 42 | 43 | def sample_trajs(self, batch_size, next_obs, idxes=None, horizon_length=2): 44 | obss, h_states, acts, rews, dones, infos, lengths, ep_lengths, idxes = super().sample_trajs(batch_size, next_obs, idxes, horizon_length) 45 | obss, h_states, acts, rews, dones, 
infos, lengths = self._convert_batch_to_torch(obss, h_states, acts, rews, dones, infos, lengths) 46 | ep_lengths = torch.as_tensor(ep_lengths).long() 47 | idxes = torch.as_tensor(idxes).long() 48 | return obss, h_states, acts, rews, dones, infos, lengths, ep_lengths, idxes 49 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/evaluation.py: -------------------------------------------------------------------------------- 1 | import _pickle as pickle 2 | import numpy as np 3 | import torch 4 | 5 | from a2c_ppo_acktr import utils 6 | from a2c_ppo_acktr.envs import make_vec_envs 7 | 8 | 9 | def evaluate(actor_critic, eval_envs, eval_log_dir, device, eval_i, seed, env_name, render, reward_suc_wrapper=None, 10 | num_eval_eps=50): 11 | 12 | # obss = [[]] 13 | # rews = [[]] 14 | # sucs = [[]] 15 | # acts = [[]] 16 | # infos = [[]] 17 | returns = [] 18 | successes = [] 19 | success = None 20 | 21 | obs = eval_envs.reset() 22 | eval_recurrent_hidden_states = torch.zeros( 23 | 1, actor_critic.recurrent_hidden_state_size, device=device) 24 | eval_masks = torch.zeros(1, 1, device=device) 25 | 26 | num_eps = 0 27 | ep_return = 0 28 | success_latch = False 29 | 30 | while num_eps < num_eval_eps: 31 | with torch.no_grad(): 32 | _, action, _, eval_recurrent_hidden_states = actor_critic.act( 33 | obs.to(device), 34 | eval_recurrent_hidden_states, 35 | eval_masks, 36 | deterministic=True) 37 | 38 | if render: 39 | eval_envs.render() 40 | 41 | # Obser reward and next obs 42 | prev_obs = obs 43 | obs, rew, done, info = eval_envs.step(action) 44 | 45 | # fix reward 46 | if reward_suc_wrapper is not None: 47 | rew, success = reward_suc_wrapper.get_rew_suc(prev_obs, action, info) 48 | 49 | eval_masks = torch.tensor( 50 | [[0.0] if done_ else [1.0] for done_ in done], 51 | dtype=torch.float32, 52 | device=device) 53 | 54 | # obss[-1].append(obs) 55 | # rews[-1].append(rew) 56 | # sucs[-1].append(success) 57 | # acts[-1].append(action) 58 | # infos[-1].append(info) 59 | 60 | ep_return += rew 61 | if success: 62 | success_latch = True 63 | 64 | if done: 65 | num_eps += 1 66 | returns.append(ep_return) 67 | successes.append(int(success_latch)) 68 | ep_return = 0 69 | success_latch = False 70 | 71 | # obss.append([]) 72 | # rews.append([]) 73 | # sucs.append([]) 74 | # acts.append([]) 75 | # infos.append([]) 76 | 77 | # pickle.dump({ 78 | # "obss": obss, 79 | # "rews": rews, 80 | # "sucs": sucs, 81 | # "acts": acts, 82 | # "infos": infos, 83 | # }, open(f"{env_name}-{seed}-{eval_i}.pkl", "wb")) 84 | print(" Evaluation using {} episodes: mean reward {:.5f}, suc rate {:.5f} \n".format( 85 | num_eps, np.mean(returns), np.sum(successes) / len(successes))) 86 | 87 | return returns, successes 88 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/wrappers/frame_stack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from collections import deque 4 | 5 | from rl_sandbox.envs.wrappers.wrapper import Wrapper 6 | 7 | 8 | class FrameStackWrapper(Wrapper): 9 | def __init__(self, env, num_frames): 10 | assert num_frames > 0 11 | super().__init__(env) 12 | self._num_frames = num_frames 13 | self.frames = deque([], maxlen=num_frames) 14 | 15 | def _get_obs(self): 16 | assert len(self.frames) == self._num_frames 17 | # return np.stack(self.frames) 18 | return np.concatenate(self.frames)[None, :] 19 | 20 | def reset(self, **kwargs): 21 | obs = 
self._env.reset(**kwargs) 22 | for _ in range(self._num_frames): 23 | self.frames.append(obs) 24 | 25 | return self._get_obs() 26 | 27 | def step(self, action, **kwargs): 28 | obs, reward, done, info = self._env.step(action, **kwargs) 29 | self.frames.append(obs) 30 | 31 | return self._get_obs(), reward, done, info 32 | 33 | def render(self, **kwargs): 34 | return self._env.render(**kwargs) 35 | 36 | def seed(self, seed): 37 | self._env.seed(seed) 38 | 39 | 40 | class OfflineFrameStack: 41 | def __init__(self, num_frames): 42 | self._num_frames = num_frames 43 | self.frames = deque([], maxlen=num_frames) 44 | 45 | def get_stacked_obs(self, obs): 46 | assert len(self.frames) == self._num_frames 47 | self.frames.append(obs) 48 | # return np.stack(self.frames) 49 | return np.concatenate(self.frames) 50 | 51 | def reset(self, obs): 52 | for _ in range(self._num_frames): 53 | self.frames.append(obs) 54 | 55 | return np.concatenate(self.frames) 56 | 57 | 58 | def make_frame_stack(num_frames, obss, dones, next_obss=None): 59 | # inefficiently doing this with a for loop for now 60 | stacked_obss = [] 61 | frame_stacker = OfflineFrameStack(num_frames) 62 | if next_obss is not None: 63 | stacked_next_obss = [] 64 | next_obss_frame_stacker = OfflineFrameStack(num_frames) 65 | 66 | new_ep = True 67 | 68 | for i in range(0, len(obss)): 69 | if new_ep: 70 | stacked_obss.append(frame_stacker.reset(obss[i])) 71 | if next_obss is not None: 72 | stacked_next_obss.append(next_obss_frame_stacker.reset(next_obss[i])) 73 | else: 74 | stacked_obss.append(frame_stacker.get_stacked_obs(obss[i])) 75 | if next_obss is not None: 76 | stacked_next_obss.append(next_obss_frame_stacker.get_stacked_obs(next_obss[i])) 77 | 78 | new_ep = dones[i] 79 | 80 | if next_obss is None: 81 | return np.vstack(stacked_obss) 82 | else: 83 | return np.vstack(stacked_obss), np.vstack(stacked_next_obss) -------------------------------------------------------------------------------- /scripts/experiments/lfgp.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | SCHEDULER=$6 # can be one of [wrs_plus_handcraft, wrs, learned, no_sched] 9 | EXPBUF_LAST_SAMPLE_PROP=$7 # default is .95, 0. turns it off 10 | EXPBUF_MODEL_SAMPLE_RATE=$8 # default is .1, 0. 
turns it off 11 | EXPERIMENT_NAME="${EXPERT_DIR}_$9" 12 | 13 | 14 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 15 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 16 | echo "Using TOP_DIR OF ${TOP_DIR}" 17 | 18 | DEFAULT_STACK_DIR="stack/" 19 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 20 | 21 | DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 22 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 23 | 24 | DEFAULT_BRING_DIR="bring/" 25 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 26 | 27 | DEFAULT_INSERT_DIR="insert/" 28 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 29 | 30 | 31 | if [ "${MAIN_TASK}" = "stack" ]; then 32 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 33 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 34 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 35 | MAX_STEPS=2000000 36 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 37 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 38 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 39 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 40 | MAX_STEPS=2000000 41 | elif [ "${MAIN_TASK}" = "bring" ]; then 42 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 43 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 44 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 45 | MAX_STEPS=2000000 46 | elif [ "${MAIN_TASK}" = "insert" ]; then 47 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 48 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 49 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz,${PRE}6.gz" 50 | MAX_STEPS=4000000 51 | else 52 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 53 | exit 1 54 | fi 55 | 56 | echo "Running LfGP sched ${SCHEDULER} for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 57 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 
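# SCHEDULER chooses how intentions are selected during training (wrs_plus_handcraft, wrs,
# learned, or no_sched, per the argument comment at the top of this script), and the two
# EXPBUF_* arguments control expert-buffer sampling (defaults .95 and .1; 0. disables each).
# Example invocation, with illustrative argument values:
#   bash lfgp.bash 1 cuda:0 stack 800_steps local wrs 0.95 0.1 my_experiment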
58 | 59 | PYTHON_TO_EXEC=$(cat <<-END 60 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_lfgp.py 61 | --seed=${SEED} 62 | --expert_path=${EXPERT_PATHS} 63 | --main_task=${MAIN_TASK}_0 64 | --device=${DEVICE} 65 | --exp_name=${EXPERIMENT_NAME} 66 | --user_machine=${USER_MACHINE} 67 | --scheduler=${SCHEDULER} 68 | --max_steps=${MAX_STEPS} 69 | --expbuf_last_sample_prop=${EXPBUF_LAST_SAMPLE_PROP} 70 | --expbuf_model_sample_rate=${EXPBUF_MODEL_SAMPLE_RATE} 71 | END 72 | ) 73 | 74 | if [[ "${DEVICE}" == *"cuda"* ]]; then 75 | PYTHON_TO_EXEC+=" --gpu_buffer" 76 | fi 77 | 78 | python ${PYTHON_TO_EXEC} 79 | -------------------------------------------------------------------------------- /scripts/experiments/multi_bc_no_overfit.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | EXPERIMENT_NAME="${EXPERT_DIR}_$6" 9 | 10 | 11 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 12 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 13 | echo "Using TOP_DIR OF ${TOP_DIR}" 14 | 15 | DEFAULT_STACK_DIR="stack/" 16 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 17 | 18 | DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 19 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 20 | 21 | DEFAULT_BRING_DIR="bring/" 22 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 23 | 24 | DEFAULT_INSERT_DIR="insert/" 25 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 26 | 27 | 28 | if [ "${MAIN_TASK}" = "stack" ]; then 29 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 30 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 31 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 32 | NUM_TRAINING=20 33 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 34 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 35 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 36 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 37 | NUM_TRAINING=20 38 | elif [ "${MAIN_TASK}" = "bring" ]; then 39 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 40 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 41 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 42 | NUM_TRAINING=20 43 | elif [ "${MAIN_TASK}" = "insert" ]; then 44 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 45 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 46 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz,${PRE}6.gz" 47 | NUM_TRAINING=40 48 | else 49 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 50 | exit 1 51 | fi 52 | 53 | echo "Running Multi BC for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 54 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 
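# Unlike multi_bc.bash (the "early stopping" variant above), this script fixes the update
# budget (--num_updates=100000) and passes a per-task --num_training value (20 for stack,
# unstack_stack_env_only, and bring; 40 for insert). Note the expert buffers are passed via
# --expert_paths (plural) here, versus --expert_path in multi_bc.bash.
# Example invocation, with illustrative argument values:
#   bash multi_bc_no_overfit.bash 1 cuda:0 insert 800_steps local my_experiment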
55 | 56 | PYTHON_TO_EXEC=$(cat <<-END 57 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_multitask_bc_no_overfit.py 58 | --seed=${SEED} 59 | --expert_paths=${EXPERT_PATHS} 60 | --main_task=${MAIN_TASK}_0 61 | --device=${DEVICE} 62 | --exp_name=${EXPERIMENT_NAME} 63 | --user_machine=${USER_MACHINE} 64 | --num_training=${NUM_TRAINING} 65 | --num_updates=100000 66 | END 67 | ) 68 | 69 | if [[ "${DEVICE}" == *"cuda"* ]]; then 70 | PYTHON_TO_EXEC+=" --gpu_buffer" 71 | fi 72 | 73 | python ${PYTHON_TO_EXEC} 74 | 75 | # python ../../rl_sandbox/rl_sandbox/examples/lfgp/run_multitask_bc_no_overfit.py \ 76 | # --seed="${SEED}" \ 77 | # --expert_paths="${EXPERT_PATHS}" \ 78 | # --main_task="${MAIN_TASK}_0" \ 79 | # --device="${DEVICE}" \ 80 | # --exp_name="${EXPERIMENT_NAME}" \ 81 | # --user_machine="${USER_MACHINE}" \ 82 | # --num_training="${NUM_TRAINING}" \ 83 | # --num_updates=100000 \ 84 | # --gpu_buffer -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/enjoy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | # workaround to unpickle olf model files 4 | import sys 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from a2c_ppo_acktr.envs import VecPyTorch, make_vec_envs 10 | from a2c_ppo_acktr.utils import get_render_func, get_vec_normalize 11 | 12 | sys.path.append('a2c_ppo_acktr') 13 | 14 | parser = argparse.ArgumentParser(description='RL') 15 | parser.add_argument( 16 | '--seed', type=int, default=1, help='random seed (default: 1)') 17 | parser.add_argument( 18 | '--log-interval', 19 | type=int, 20 | default=10, 21 | help='log interval, one log per n updates (default: 10)') 22 | parser.add_argument( 23 | '--env-name', 24 | default='PongNoFrameskip-v4', 25 | help='environment to train on (default: PongNoFrameskip-v4)') 26 | parser.add_argument( 27 | '--load-dir', 28 | default='./trained_models/', 29 | help='directory to save agent logs (default: ./trained_models/)') 30 | parser.add_argument( 31 | '--non-det', 32 | action='store_true', 33 | default=False, 34 | help='whether to use a non-deterministic policy') 35 | args = parser.parse_args() 36 | 37 | args.det = not args.non_det 38 | 39 | env = make_vec_envs( 40 | args.env_name, 41 | args.seed + 1000, 42 | 1, 43 | None, 44 | None, 45 | device='cpu', 46 | allow_early_resets=False) 47 | 48 | # Get a render function 49 | render_func = get_render_func(env) 50 | 51 | # We need to use the same statistics for normalization as used in training 52 | actor_critic, obs_rms = \ 53 | torch.load(os.path.join(args.load_dir, args.env_name + ".pt"), 54 | map_location='cpu') 55 | 56 | vec_norm = get_vec_normalize(env) 57 | if vec_norm is not None: 58 | vec_norm.eval() 59 | vec_norm.obs_rms = obs_rms 60 | 61 | recurrent_hidden_states = torch.zeros(1, 62 | actor_critic.recurrent_hidden_state_size) 63 | masks = torch.zeros(1, 1) 64 | 65 | obs = env.reset() 66 | 67 | if render_func is not None: 68 | render_func('human') 69 | 70 | if args.env_name.find('Bullet') > -1: 71 | import pybullet as p 72 | 73 | torsoId = -1 74 | for i in range(p.getNumBodies()): 75 | if (p.getBodyInfo(i)[0].decode() == "torso"): 76 | torsoId = i 77 | 78 | while True: 79 | with torch.no_grad(): 80 | value, action, _, recurrent_hidden_states = actor_critic.act( 81 | obs, recurrent_hidden_states, masks, deterministic=args.det) 82 | 83 | # Obser reward and next obs 84 | obs, reward, done, _ = env.step(action) 85 | 86 | masks.fill_(0.0 if done else 1.0) 87 | 88 | if 
args.env_name.find('Bullet') > -1: 89 | if torsoId > -1: 90 | distance = 5 91 | yaw = 0 92 | humanPos, humanOrn = p.getBasePositionAndOrientation(torsoId) 93 | p.resetDebugVisualizerCamera(distance, yaw, -20, humanPos) 94 | 95 | if render_func is not None: 96 | render_func('human') 97 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/algo/a2c_acktr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | 5 | from a2c_ppo_acktr.algo.kfac import KFACOptimizer 6 | 7 | 8 | class A2C_ACKTR(): 9 | def __init__(self, 10 | actor_critic, 11 | value_loss_coef, 12 | entropy_coef, 13 | lr=None, 14 | eps=None, 15 | alpha=None, 16 | max_grad_norm=None, 17 | acktr=False): 18 | 19 | self.actor_critic = actor_critic 20 | self.acktr = acktr 21 | 22 | self.value_loss_coef = value_loss_coef 23 | self.entropy_coef = entropy_coef 24 | 25 | self.max_grad_norm = max_grad_norm 26 | 27 | if acktr: 28 | self.optimizer = KFACOptimizer(actor_critic) 29 | else: 30 | self.optimizer = optim.RMSprop( 31 | actor_critic.parameters(), lr, eps=eps, alpha=alpha) 32 | 33 | def update(self, rollouts): 34 | obs_shape = rollouts.obs.size()[2:] 35 | action_shape = rollouts.actions.size()[-1] 36 | num_steps, num_processes, _ = rollouts.rewards.size() 37 | 38 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 39 | rollouts.obs[:-1].view(-1, *obs_shape), 40 | rollouts.recurrent_hidden_states[0].view( 41 | -1, self.actor_critic.recurrent_hidden_state_size), 42 | rollouts.masks[:-1].view(-1, 1), 43 | rollouts.actions.view(-1, action_shape)) 44 | 45 | values = values.view(num_steps, num_processes, 1) 46 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 47 | 48 | advantages = rollouts.returns[:-1] - values 49 | value_loss = advantages.pow(2).mean() 50 | 51 | action_loss = -(advantages.detach() * action_log_probs).mean() 52 | 53 | if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0: 54 | # Compute fisher, see Martens 2014 55 | self.actor_critic.zero_grad() 56 | pg_fisher_loss = -action_log_probs.mean() 57 | 58 | value_noise = torch.randn(values.size()) 59 | if values.is_cuda: 60 | value_noise = value_noise.cuda() 61 | 62 | sample_values = values + value_noise 63 | vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean() 64 | 65 | fisher_loss = pg_fisher_loss + vf_fisher_loss 66 | self.optimizer.acc_stats = True 67 | fisher_loss.backward(retain_graph=True) 68 | self.optimizer.acc_stats = False 69 | 70 | self.optimizer.zero_grad() 71 | (value_loss * self.value_loss_coef + action_loss - 72 | dist_entropy * self.entropy_coef).backward() 73 | 74 | if self.acktr == False: 75 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 76 | self.max_grad_norm) 77 | 78 | self.optimizer.step() 79 | 80 | return value_loss.item(), action_loss.item(), dist_entropy.item() 81 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/experts/scripted_policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from torch.distributions import Normal, Uniform 5 | 6 | 7 | class GripperIntentions: 8 | """ 9 | Intentions that respectively open and close the gripper while following Gaussian noise for other actions 10 | """ 11 | def __init__(self, action_dim, gripper_dim, 
means, vars): 12 | self._action_dim = action_dim 13 | self._gripper_dim = gripper_dim 14 | self._non_gripper_dim = torch.ones(self._action_dim) 15 | self._non_gripper_dim[gripper_dim] = 0 16 | self._non_gripper_dim = torch.where(self._non_gripper_dim) 17 | 18 | self.action_dist = Normal(loc=means, scale=vars) 19 | self.gripper_dist = Uniform(0., 1.) 20 | 21 | self._entropies = np.zeros((2, self._action_dim)) 22 | self._means = np.zeros((2, self._action_dim)) 23 | self._vars = np.ones((2, self._action_dim)) 24 | for idx in range(2): 25 | self._entropies[idx][self._non_gripper_dim] = self.action_dist.entropy().numpy() 26 | self._entropies[idx, self._gripper_dim] = self.gripper_dist.entropy().numpy() 27 | self._means[idx][self._non_gripper_dim] = self.action_dist.mean.numpy() 28 | self._means[idx, self._gripper_dim]= self.gripper_dist.mean.numpy() * ((-1) ** (idx + 1)) 29 | self._vars[idx][self._non_gripper_dim] = self.action_dist.variance.numpy() 30 | self._vars[idx, self._gripper_dim]= self.gripper_dist.variance.numpy() 31 | 32 | def compute_action(self, x, h): 33 | act = torch.zeros((2, self._action_dim), dtype=torch.float) 34 | log_probs = np.zeros((2, self._action_dim)) 35 | 36 | for idx in range(2): 37 | act[idx][self._non_gripper_dim] = self.action_dist.sample() 38 | act[idx, self._gripper_dim] = 1 - self.gripper_dist.sample() 39 | log_probs[idx][self._non_gripper_dim] = self.action_dist.log_prob(act[idx][self._non_gripper_dim]).numpy() 40 | log_probs[idx, self._gripper_dim] = self.gripper_dist.log_prob(act[idx, self._gripper_dim]).numpy() 41 | 42 | act[0, self._gripper_dim] *= -1. 43 | log_probs = np.sum(log_probs, axis=-1) 44 | 45 | return act, np.zeros(2), h[0].cpu().numpy(), log_probs, self._entropies, self._means, self._vars 46 | 47 | def deterministic_action(self, x, h): 48 | act = torch.zeros((2, self._action_dim), dtype=torch.float) 49 | log_probs = np.zeros((2, self._action_dim)) 50 | 51 | for idx in range(2): 52 | act[idx][self._non_gripper_dim] = self.action_dist.mean 53 | act[idx, self._gripper_dim] = 1. 54 | log_probs[idx][self._non_gripper_dim] = self.action_dist.log_prob(act[idx][self._non_gripper_dim]).numpy() 55 | log_probs[idx, self._gripper_dim] = self.gripper_dist.log_prob(act[idx, self._gripper_dim]).numpy() 56 | 57 | act[0, self._gripper_dim] *= -1. 58 | log_probs = np.sum(log_probs, axis=-1) 59 | 60 | return act, np.zeros(2), h[0].cpu().numpy(), log_probs, self._entropies, self._means, self._vars 61 | -------------------------------------------------------------------------------- /scripts/create_data/create_expert_data.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # NUM_STEPS_PER_BUFFER should be either an empty string, or a comma separated list, e.g. "9000, 0, 0, 0, 0, 0" 4 | 5 | MAIN_TASK=$1 6 | STEPS_PER_TASK=$2 7 | NUM_STEPS_PER_BUFFER=$3 8 | AUX_OVERRIDE=$4 9 | 10 | 11 | SAVE_PATH_POST="${STEPS_PER_TASK}_steps_no_extra_final" 12 | DEFAULT_TOP_DIR="../../lfgp_data" 13 | TOP_DIR=${LFGP_TOP_DIR:=${DEFAULT_TOP_DIR}} 14 | echo "Using TOP_DIR OF ${TOP_DIR}" 15 | FULL_PATH="${TOP_DIR}/trained_models/experts/${MAIN_TASK}" 16 | MODEL_PATH="${FULL_PATH}/state_dict.pt" 17 | CONFIG_PATH="${FULL_PATH}/sacx_experiment_setting.pkl" 18 | SAVE_PATH="${TOP_DIR}/custom_expert_data/${MAIN_TASK}/${SAVE_PATH_POST}/" 19 | 20 | 21 | echo "Generating data for ${MAIN_TASK}, ${STEPS_PER_TASK} steps per task." 
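# Example invocations (argument values illustrative only):
#   bash create_expert_data.bash stack 800
#     -> 800 steps per task, collected for every intention of the stack expert
#   bash create_expert_data.bash stack 800 "9000, 0, 0, 0, 0, 0" 2
#     -> per-buffer step counts as listed, collecting only the intention(s) in AUX_OVERRIDE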
22 | if [ -z "${NUM_STEPS_PER_BUFFER}" ]; then 23 | echo "NUM_STEPS_PER_BUFFER is unset, running for all tasks" 24 | else 25 | echo "Getting ${NUM_STEPS_PER_BUFFER} for each task, running task ${AUX_OVERRIDE} only." 26 | fi 27 | 28 | 29 | if [ "${MAIN_TASK}" = "insert" ]; then 30 | O_PRB="0.08333333333" 31 | FORCED_SCHEDULE="{0: {0: ([0, 1, 2, 3, 4, 5, 6], [${O_PRB}, ${O_PRB}, .5, ${O_PRB}, ${O_PRB}, ${O_PRB}, ${O_PRB}], ['k', 'd', 'd', 'd', 'd', 'd', 'd']), 70: 0}, 1: {0: 3, 15: 1}}" 32 | SCHEDULER_PERIOD=90 33 | if [ -z "${NUM_STEPS_PER_BUFFER}" ]; then 34 | NUM_STEPS_PER_BUFFER="" 35 | AUX_OVERRIDE="" 36 | fi 37 | 38 | elif [ "${MAIN_TASK}" = "stack" ]; then 39 | FORCED_SCHEDULE="{0: {0: ([0, 1, 2, 3, 4, 5], [.1, .1, .5, .1, .1, .1], ['k', 'd', 'd', 'd', 'd', 'd']), 45: 0}, 1: {0: 3, 15: 1}}" 40 | SCHEDULER_PERIOD=90 41 | if [ -z "${NUM_STEPS_PER_BUFFER}" ]; then 42 | NUM_STEPS_PER_BUFFER="" 43 | AUX_OVERRIDE="" 44 | fi 45 | 46 | elif [ "${MAIN_TASK}" = "bring" ]; then 47 | FORCED_SCHEDULE="{0: {0: ([0, 1, 3, 4, 5, 6], [.1, .1, .5, .1, .1, .1], ['k', 'd', 'd', 'd', 'd', 'd']), 45: 0}, 1: {0: 3, 15: 1}}" 48 | SCHEDULER_PERIOD=90 49 | if [ -z "${NUM_STEPS_PER_BUFFER}" ]; then 50 | NUM_STEPS_PER_BUFFER="" 51 | AUX_OVERRIDE="0,1,3,4,5,6" # to ensure we skip insert from this model 52 | fi 53 | 54 | elif [ "${MAIN_TASK}" = "unstack-stack" ]; then 55 | FORCED_SCHEDULE="{0: {0: ([0, 1, 2, 4, 5, 6], [.1, .1, .5, .1, .1, .1], ['k', 'd', 'd', 'd', 'd', 'd']), 45: 0}, 1: {0: 3, 15: 1}}" 56 | SCHEDULER_PERIOD=120 57 | if [ -z "${NUM_STEPS_PER_BUFFER}" ]; then 58 | NUM_STEPS_PER_BUFFER="" 59 | AUX_OVERRIDE="0,1,2,4,5,6" # to ensure we skip unstack from this model 60 | fi 61 | 62 | fi 63 | 64 | echo "Saving to ${SAVE_PATH}" 65 | 66 | python ../../rl_sandbox/rl_sandbox/examples/lfgp/experts/create_expert_data.py \ 67 | --model_path="${MODEL_PATH}" \ 68 | --config_path="${CONFIG_PATH}" \ 69 | --save_path="${SAVE_PATH}" \ 70 | --num_episodes=10000000 \ 71 | --num_steps="${STEPS_PER_TASK}" \ 72 | --seed=1 \ 73 | --forced_schedule="${FORCED_SCHEDULE}" \ 74 | --scheduler_period="${SCHEDULER_PERIOD}" \ 75 | --success_only \ 76 | --reset_on_success \ 77 | --reset_between_intentions \ 78 | --aux_override="${AUX_OVERRIDE}" -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/distributions.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from a2c_ppo_acktr.utils import AddBias, init 8 | 9 | """ 10 | Modify standard PyTorch distributions so they are compatible with this code. 
11 | """ 12 | 13 | # 14 | # Standardize distribution interfaces 15 | # 16 | 17 | # Categorical 18 | class FixedCategorical(torch.distributions.Categorical): 19 | def sample(self): 20 | return super().sample().unsqueeze(-1) 21 | 22 | def log_probs(self, actions): 23 | return ( 24 | super() 25 | .log_prob(actions.squeeze(-1)) 26 | .view(actions.size(0), -1) 27 | .sum(-1) 28 | .unsqueeze(-1) 29 | ) 30 | 31 | def mode(self): 32 | return self.probs.argmax(dim=-1, keepdim=True) 33 | 34 | 35 | # Normal 36 | class FixedNormal(torch.distributions.Normal): 37 | def log_probs(self, actions): 38 | return super().log_prob(actions).sum(-1, keepdim=True) 39 | 40 | def entropy(self): 41 | return super().entropy().sum(-1) 42 | 43 | def mode(self): 44 | return self.mean 45 | 46 | 47 | # Bernoulli 48 | class FixedBernoulli(torch.distributions.Bernoulli): 49 | def log_probs(self, actions): 50 | return super.log_prob(actions).view(actions.size(0), -1).sum(-1).unsqueeze(-1) 51 | 52 | def entropy(self): 53 | return super().entropy().sum(-1) 54 | 55 | def mode(self): 56 | return torch.gt(self.probs, 0.5).float() 57 | 58 | 59 | class Categorical(nn.Module): 60 | def __init__(self, num_inputs, num_outputs): 61 | super(Categorical, self).__init__() 62 | 63 | init_ = lambda m: init( 64 | m, 65 | nn.init.orthogonal_, 66 | lambda x: nn.init.constant_(x, 0), 67 | gain=0.01) 68 | 69 | self.linear = init_(nn.Linear(num_inputs, num_outputs)) 70 | 71 | def forward(self, x): 72 | x = self.linear(x) 73 | return FixedCategorical(logits=x) 74 | 75 | 76 | class DiagGaussian(nn.Module): 77 | def __init__(self, num_inputs, num_outputs): 78 | super(DiagGaussian, self).__init__() 79 | 80 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 81 | constant_(x, 0)) 82 | 83 | self.fc_mean = init_(nn.Linear(num_inputs, num_outputs)) 84 | self.logstd = AddBias(torch.zeros(num_outputs)) 85 | 86 | def forward(self, x): 87 | action_mean = self.fc_mean(x) 88 | 89 | # An ugly hack for my KFAC implementation. 90 | zeros = torch.zeros(action_mean.size()) 91 | if x.is_cuda: 92 | zeros = zeros.cuda() 93 | 94 | action_logstd = self.logstd(zeros) 95 | return FixedNormal(action_mean, action_logstd.exp()) 96 | 97 | 98 | class Bernoulli(nn.Module): 99 | def __init__(self, num_inputs, num_outputs): 100 | super(Bernoulli, self).__init__() 101 | 102 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 
103 | constant_(x, 0)) 104 | 105 | self.linear = init_(nn.Linear(num_inputs, num_outputs)) 106 | 107 | def forward(self, x): 108 | x = self.linear(x) 109 | return FixedBernoulli(logits=x) 110 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/train_bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gzip 3 | import pickle 4 | 5 | import rl_sandbox.constants as c 6 | 7 | from rl_sandbox.algorithms.bc.bc import BC 8 | from rl_sandbox.auxiliary_tasks.utils import make_auxiliary_tasks 9 | from rl_sandbox.buffers.utils import make_buffer 10 | from rl_sandbox.envs.fake_env import FakeEnv 11 | from rl_sandbox.envs.utils import make_env 12 | from rl_sandbox.learning_utils import train 13 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 14 | from rl_sandbox.agents.rl_agents import ACAgent 15 | from rl_sandbox.transforms.general_transforms import Identity 16 | from rl_sandbox.utils import make_summary_writer, set_seed 17 | 18 | def train_bc(experiment_config): 19 | seed = experiment_config[c.SEED] 20 | save_path = experiment_config.get(c.SAVE_PATH, None) 21 | buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity()) 22 | 23 | set_seed(seed) 24 | train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM]) 25 | model = make_model(experiment_config[c.MODEL_SETTING]) 26 | 27 | 28 | # drop memory size for expert buffers to only what is needed 29 | load_path = experiment_config[c.EXPERT_BUFFER] 30 | with gzip.open(load_path, "rb") as f: 31 | data = pickle.load(f) 32 | experiment_config[c.BUFFER_SETTING][c.KWARGS][c.MEMORY_SIZE] = data[c.MEMORY_SIZE] 33 | expert_buffer = make_buffer(experiment_config[c.BUFFER_SETTING], seed, experiment_config[c.EXPERT_BUFFER]) 34 | optimizer = make_optimizer(model.parameters(), experiment_config[c.OPTIMIZER_SETTING][c.POLICY]) 35 | 36 | aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS], 37 | model, 38 | expert_buffer, 39 | experiment_config) 40 | 41 | learning_algorithm = BC(model=model, 42 | optimizer=optimizer, 43 | expert_buffer=expert_buffer, 44 | algo_params=experiment_config, 45 | aux_tasks=aux_tasks) 46 | 47 | load_model = experiment_config.get(c.LOAD_MODEL, False) 48 | if load_model: 49 | learning_algorithm.load_state_dict(torch.load(load_model)) 50 | 51 | agent = ACAgent(model=model, 52 | learning_algorithm=learning_algorithm, 53 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 54 | evaluation_env = None 55 | evaluation_agent = None 56 | if experiment_config.get(c.EVALUATION_FREQUENCY, 0): 57 | evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1) 58 | evaluation_agent = ACAgent(model=model, 59 | learning_algorithm=None, 60 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 61 | 62 | summary_writer, save_path = make_summary_writer(save_path=save_path, algo=c.BC, cfg=experiment_config) 63 | train(agent=agent, 64 | evaluation_agent=evaluation_agent, 65 | train_env=train_env, 66 | evaluation_env=evaluation_env, 67 | buffer_preprocess=buffer_preprocessing, 68 | experiment_settings=experiment_config, 69 | auxiliary_reward=experiment_config[c.EVALUATION_REWARD_FUNC], 70 | summary_writer=summary_writer, 71 | save_path=save_path) 72 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/train_bc_no_overfit.py: -------------------------------------------------------------------------------- 
1 | import torch 2 | import gzip 3 | import pickle 4 | 5 | import rl_sandbox.constants as c 6 | 7 | from rl_sandbox.algorithms.bc.bc_no_overfit import BC 8 | from rl_sandbox.auxiliary_tasks.utils import make_auxiliary_tasks 9 | from rl_sandbox.buffers.utils import make_buffer 10 | from rl_sandbox.envs.fake_env import FakeEnv 11 | from rl_sandbox.envs.utils import make_env 12 | from rl_sandbox.learning_utils import train 13 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 14 | from rl_sandbox.agents.rl_agents import ACAgent 15 | from rl_sandbox.transforms.general_transforms import Identity 16 | from rl_sandbox.utils import make_summary_writer, set_seed 17 | 18 | def train_bc_no_overfit(experiment_config): 19 | seed = experiment_config[c.SEED] 20 | save_path = experiment_config.get(c.SAVE_PATH, None) 21 | buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity()) 22 | 23 | set_seed(seed) 24 | train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM]) 25 | model = make_model(experiment_config[c.MODEL_SETTING]) 26 | 27 | 28 | # drop memory size for expert buffers to only what is needed 29 | load_path = experiment_config[c.EXPERT_BUFFER] 30 | with gzip.open(load_path, "rb") as f: 31 | data = pickle.load(f) 32 | experiment_config[c.BUFFER_SETTING][c.KWARGS][c.MEMORY_SIZE] = data[c.MEMORY_SIZE] 33 | expert_buffer = make_buffer(experiment_config[c.BUFFER_SETTING], seed, experiment_config[c.EXPERT_BUFFER]) 34 | optimizer = make_optimizer(model.parameters(), experiment_config[c.OPTIMIZER_SETTING][c.POLICY]) 35 | 36 | aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS], 37 | model, 38 | expert_buffer, 39 | experiment_config) 40 | 41 | learning_algorithm = BC(model=model, 42 | optimizer=optimizer, 43 | expert_buffer=expert_buffer, 44 | algo_params=experiment_config, 45 | aux_tasks=aux_tasks) 46 | 47 | load_model = experiment_config.get(c.LOAD_MODEL, False) 48 | if load_model: 49 | learning_algorithm.load_state_dict(torch.load(load_model)) 50 | 51 | agent = ACAgent(model=model, 52 | learning_algorithm=learning_algorithm, 53 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 54 | evaluation_env = None 55 | evaluation_agent = None 56 | if experiment_config.get(c.EVALUATION_FREQUENCY, 0): 57 | evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1) 58 | evaluation_agent = ACAgent(model=model, 59 | learning_algorithm=None, 60 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 61 | 62 | summary_writer, save_path = make_summary_writer(save_path=save_path, algo=c.BC, cfg=experiment_config) 63 | train(agent=agent, 64 | evaluation_agent=evaluation_agent, 65 | train_env=train_env, 66 | evaluation_env=evaluation_env, 67 | buffer_preprocess=buffer_preprocessing, 68 | experiment_settings=experiment_config, 69 | auxiliary_reward=experiment_config[c.EVALUATION_REWARD_FUNC], 70 | summary_writer=summary_writer, 71 | save_path=save_path) 72 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gzip 3 | import pickle 4 | 5 | import rl_sandbox.constants as c 6 | 7 | from rl_sandbox.buffers.disk_buffer import DiskNumPyBuffer 8 | from rl_sandbox.buffers.ram_buffer import NumPyBuffer, NextStateNumPyBuffer, TrajectoryNumPyBuffer 9 | from rl_sandbox.buffers.torch_pin_buffer import TorchPinBuffer, TrajectoryPinBuffer 10 | 11 | 12 | def 
make_buffer(buffer_cfg, seed=None, load_buffer=False, start_idx=0, end_idx=None, match_load_size=False, 13 | frame_stack_load=1): 14 | if match_load_size and load_buffer: 15 | with gzip.open(load_buffer, "rb") as f: 16 | data = pickle.load(f) 17 | original_size = buffer_cfg[c.KWARGS][c.MEMORY_SIZE] 18 | buffer_cfg[c.KWARGS][c.MEMORY_SIZE] = data[c.MEMORY_SIZE] 19 | 20 | if seed is None: 21 | seed = np.random.randint(0, 2 ** 32 - 1) 22 | 23 | buffer_cfg[c.KWARGS][c.RNG] = np.random.RandomState(seed) 24 | 25 | if buffer_cfg[c.STORAGE_TYPE] == c.DISK: 26 | buffer = DiskNumPyBuffer(**buffer_cfg[c.KWARGS]) 27 | elif buffer_cfg[c.STORAGE_TYPE] == c.RAM: 28 | buffer_type = buffer_cfg.get(c.BUFFER_TYPE, c.DEFAULT) 29 | assert buffer_type in c.VALID_BUFFER_TYPE, f"Invalid buffer type: {buffer_type}" 30 | 31 | # this line for compatibility with old code 32 | if buffer_cfg.get(c.STORE_NEXT_OBSERVATION, False): 33 | buffer_type = c.STORE_NEXT_OBSERVATION 34 | 35 | if buffer_type == c.DEFAULT: 36 | buffer = NumPyBuffer(**buffer_cfg[c.KWARGS]) 37 | elif buffer_type == c.STORE_NEXT_OBSERVATION: 38 | buffer = NextStateNumPyBuffer(**buffer_cfg[c.KWARGS]) 39 | elif buffer_type == c.TRAJECTORY: 40 | buffer = TrajectoryNumPyBuffer(**buffer_cfg[c.KWARGS]) 41 | else: 42 | raise NotImplementedError 43 | 44 | elif buffer_cfg[c.STORAGE_TYPE] == c.GPU: 45 | buffer = TorchPinBuffer(**buffer_cfg[c.KWARGS]) 46 | elif buffer_cfg[c.STORAGE_TYPE] == c.NSTEP_GPU: 47 | buffer = TrajectoryPinBuffer(**buffer_cfg[c.KWARGS]) 48 | else: 49 | raise NotImplementedError 50 | 51 | for wrapper_config in buffer_cfg[c.BUFFER_WRAPPERS]: 52 | buffer = wrapper_config[c.WRAPPER](buffer, **wrapper_config[c.KWARGS]) 53 | 54 | if load_buffer: 55 | buffer.load(load_buffer, load_rng=seed==None, start_idx=start_idx, end_idx=end_idx, frame_stack=frame_stack_load) 56 | if match_load_size: 57 | buffer_cfg[c.KWARGS][c.MEMORY_SIZE] = original_size 58 | 59 | return buffer 60 | 61 | 62 | def get_default_buffer(memory_size, obs_dim, action_dim): 63 | buffer_settings = { 64 | c.KWARGS: { 65 | c.MEMORY_SIZE: memory_size, 66 | c.OBS_DIM: (obs_dim,), 67 | c.H_STATE_DIM: (1,), 68 | c.ACTION_DIM: (action_dim,), 69 | c.REWARD_DIM: (1,), 70 | c.INFOS: {c.MEAN: ((action_dim,), np.float32), 71 | c.VARIANCE: ((action_dim,), np.float32), 72 | c.ENTROPY: ((action_dim,), np.float32), 73 | c.LOG_PROB: ((1,), np.float32), 74 | c.VALUE: ((1,), np.float32), 75 | c.DISCOUNTING: ((1,), np.float32)}, 76 | c.CHECKPOINT_INTERVAL: 0, 77 | c.CHECKPOINT_PATH: None, 78 | }, 79 | c.STORAGE_TYPE: c.RAM, 80 | c.BUFFER_TYPE: c.STORE_NEXT_OBSERVATION, 81 | c.BUFFER_WRAPPERS: [], 82 | c.LOAD_BUFFER: False, 83 | } 84 | return make_buffer(buffer_settings) -------------------------------------------------------------------------------- /scripts/plotting/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common things for many plotting files. 
3 | task_inds are: 0--stack, 1--unstack-stack, 2--bring, 3--insert 4 | """ 5 | 6 | import os 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | ALGO_TITLE_DICT = { 11 | 'lfgp_wrs': 'LfGP (multi)', 12 | 'multitask_bc': 'BC (multi)', 13 | 'dac': 'DAC (single)', 14 | 'bc': 'BC (single)' 15 | } 16 | 17 | def get_path_defaults(fig_name, task_inds=(0,1,2,3)): 18 | root_dir = "/media/starslab/users/trevor-ablett/dac-x" 19 | fig_path = root_dir + "/figures/" + fig_name 20 | experiment_root_dir = root_dir + "/play_xyz" 21 | seeds = ['1','2','3','4','5'] 22 | expert_root = os.path.join(root_dir, "play_xyz/expert-data") 23 | expert_perf_files = [ 24 | os.path.join(expert_root, "open-close-stack-lift-reach-move/policies/05-09-21_21_57_07/eval_1999999_100_eps_per_int.pkl"), 25 | os.path.join(expert_root, "open-close-unstackstack-lift-reach-move-35M/policies/08-29-21_23_05_03/eval_3499999_100_eps_per_int.pkl"), 26 | os.path.join(expert_root, "open-close-insert-bring-lift-reach-move/policies/05-30-21_20_38_48/eval_1299999_100_eps_per_int.pkl"), 27 | os.path.join(expert_root, "open-close-insert-bring-lift-reach-move/policies/05-30-21_20_38_48/eval_2699999_100_eps_per_int.pkl")] 28 | expert_perf_file_main_task_i = [2, 2, 3, 2] 29 | 30 | out_epf = [] 31 | out_epf_mti = [] 32 | for i in task_inds: 33 | out_epf.append(expert_perf_files[i]) 34 | out_epf_mti.append(expert_perf_file_main_task_i[i]) 35 | 36 | return root_dir, fig_path, experiment_root_dir, seeds, expert_root, out_epf, out_epf_mti 37 | 38 | 39 | def get_task_defaults(task_inds=(0,1,2,3)): 40 | task_dir_names = ["stack_0", "unstack_stack_env_only_0", "bring_0", "insert_0"] 41 | valid_task = [True, True, True, True] 42 | task_titles = ["Stack", "Unstack-Stack", "Bring", "Insert"] 43 | main_task_i = [2, 2, 2, 2] 44 | num_aux = [6, 6, 6, 7] 45 | task_data_filenames = ['train.pkl', 'train.pkl', 'train_rerun.pkl', 'train.pkl'] 46 | num_eval_steps_to_use = [20, 20, 20, 40] 47 | 48 | out_tdn = [] 49 | out_vt = [] 50 | out_tt = [] 51 | out_mti = [] 52 | out_na = [] 53 | out_tdf = [] 54 | out_nestu = [] 55 | for i in task_inds: 56 | out_tdn.append(task_dir_names[i]) 57 | out_vt.append(valid_task[i]) 58 | out_tt.append(task_titles[i]) 59 | out_mti.append(main_task_i[i]) 60 | out_na.append(num_aux[i]) 61 | out_tdf.append(task_data_filenames[i]) 62 | out_nestu.append(num_eval_steps_to_use[i]) 63 | 64 | return out_tdn, out_vt, out_tt, out_mti, out_na, out_tdf, out_nestu 65 | 66 | 67 | def get_algo_defaults(): 68 | algo_dir_names=['lfgp_wrs', 'multitask_bc', 'dac', 'bc'] 69 | algo_titles = ['LfGP (multi)', 'BC (multi)', 'DAC (single)', 'BC (single)'] 70 | multitask_algos = ['multitask_bc', 'lfgp_wrs'] 71 | eval_eps_per_task = 50 72 | 73 | return algo_dir_names, algo_titles, multitask_algos, eval_eps_per_task 74 | 75 | 76 | def get_fig_defaults(num_plots=4): 77 | fig_shape = [1, num_plots] # row x col 78 | plot_size = [3.2, 2.4] 79 | num_stds = 1 80 | font_size = 16 81 | eval_interval = 100000 82 | cmap = plt.get_cmap("tab10") 83 | linewidth = 1 84 | std_alpha = .5 85 | x_val_scale = 1e6 86 | subsample_rate = 1 # 1 for no subsample 87 | include_expert_baseline = True 88 | 89 | return fig_shape, plot_size, num_stds, font_size, eval_interval, cmap, linewidth, std_alpha, x_val_scale, subsample_rate, \ 90 | include_expert_baseline -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/manipulator_learning/panda/lift_xyz_state.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from numpy.linalg import norm 4 | 5 | 6 | TABLE_HEIGHT = 0.6247 # not definitively defined anywhere, just found through trial and error 7 | BLOCK_HEIGHT_ON_TABLE = 0.6550 # again, trial and error 8 | 9 | # taken directly from sac-x paper 10 | def opened(info, **kwargs): 11 | return 1 if np.all(info["infos"][-1]['grip_pos'] >= .9) else 0 12 | 13 | def closed(info, **kwargs): 14 | return 1 if np.all(info["infos"][-1]['grip_pos'] <= .1) else 0 15 | 16 | def lifted(info, max_rew_height=.1, **kwargs): 17 | block_height = info["infos"][-1]['obj_pos_world'][0][2] - BLOCK_HEIGHT_ON_TABLE 18 | if block_height > max_rew_height: 19 | return 1.5 20 | elif block_height < .005: 21 | return 0 22 | else: 23 | return block_height / max_rew_height 24 | 25 | # this is just the generic one, not meant to be used on its own as an aux reward 26 | def close(dist_thresh, obj_1_pos, obj_2_pos, tanh_multiplier=10.0, close_rew=1.5): 27 | dist = norm(obj_1_pos - obj_2_pos) 28 | if dist < dist_thresh: 29 | return close_rew 30 | else: 31 | # return 1 - (np.tanh(dist / 10))**2 # from SAC-X paper, but very poorly scaled for meters as units 32 | return 1 - np.tanh(tanh_multiplier * dist) 33 | 34 | def hand_block_close(info, **kwargs): 35 | return close(0.0, info["infos"][-1]['obj_pos'][:3], info["infos"][-1]['pos']) # only for first aka blue block 36 | 37 | # modified rewards to make more "human like" intentions 38 | def open_action(action, **kwargs): 39 | action_mag = norm(action[:3]) 40 | open_rew = 1 if action[-1] < 0 else 0 41 | return open_rew - .5 * action_mag 42 | 43 | def close_action(action, **kwargs): 44 | action_mag = norm(action[:3]) 45 | close_rew = 1 if action[-1] > 0 else 0 46 | return close_rew - .5 * action_mag 47 | 48 | def hand_block_close_speed_penalty(info, action, **kwargs): 49 | close_rew = hand_block_close(info, **kwargs) 50 | dist = norm(info["infos"][-1]['obj_pos'][:3] - info["infos"][-1]['pos']) 51 | action_mag = norm(action[:3]) 52 | speed_penalty = (1. 
- np.tanh(10 * dist)) * action_mag 53 | return close_rew 54 | 55 | 56 | class PandaLiftXYZStateAuxiliaryReward: 57 | def __init__(self, aux_rewards=(open_action, close_action, lifted, hand_block_close_speed_penalty), include_main=True): 58 | self._aux_rewards = aux_rewards 59 | self._include_main = include_main 60 | 61 | # self._done_failure_reward = -5 62 | # self._done_success_reward = 100 63 | 64 | @property 65 | def num_auxiliary_rewards(self): 66 | return len(self._aux_rewards) 67 | 68 | def reward(self, 69 | observation, 70 | action, 71 | reward, 72 | done, 73 | next_observation, 74 | info): 75 | observation = observation.reshape(-1) 76 | next_observation = next_observation.reshape(-1) 77 | reward_vector = [] 78 | if self._include_main: 79 | reward_vector.append(reward) 80 | for task_reward in self._aux_rewards: 81 | reward_vector.append(task_reward(observation=observation, 82 | action=action, 83 | reward=reward, 84 | next_observation=next_observation, 85 | done=done, 86 | info=info)) 87 | 88 | return np.array(reward_vector, dtype=np.float32) 89 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/wrappers/noise_wrapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import rl_sandbox.constants as c 4 | 5 | from rl_sandbox.buffers.wrappers.buffer_wrapper import BufferWrapper 6 | from rl_sandbox.model_architectures.utils import RunningMeanStd 7 | 8 | 9 | class NoiseBuffer(BufferWrapper): 10 | def __init__(self, buffer, noise_magnitude, update_on_sample=False): 11 | super().__init__(buffer) 12 | 13 | self._noise_mag = noise_magnitude 14 | self._obs_mean = None 15 | self._obs_std = None 16 | self._dim_noise_mag = None 17 | if update_on_sample: 18 | self._rms = None 19 | 20 | def sample(self, *args, **kwargs): 21 | buf_data = super().sample(*args, **kwargs) 22 | obss = buf_data[0] 23 | 24 | if self._dim_noise_mag is None and not hasattr(self, '_rms'): self.update_stats() 25 | 26 | if hasattr(self, '_rms'): 27 | if self._rms is None: 28 | self._rms = RunningMeanStd(shape=(obss.shape[-1],), device=self.device) 29 | self._rms.update(obss) 30 | self._obs_mean = self._rms.mean 31 | self._obs_std = self._rms.std 32 | self._dim_noise_mag = self._noise_mag * self._obs_std 33 | 34 | 35 | obss_noise = torch.randn_like(obss) * self._dim_noise_mag 36 | obss += obss_noise # changes buf_data as well 37 | 38 | return buf_data 39 | 40 | def sample_with_next_obs(self, *args, **kwargs): 41 | buf_data = super().sample_with_next_obs(*args, **kwargs) 42 | obss = buf_data[0] 43 | next_obss = buf_data[5] 44 | 45 | if self._dim_noise_mag is None and not hasattr(self, '_rms'): self.update_stats() 46 | 47 | if hasattr(self, '_rms'): 48 | if self._rms is None: 49 | self._rms = RunningMeanStd(shape=(obss.shape[-1],), device=self.device) 50 | self._rms.update(obss) 51 | self._obs_mean = self._rms.mean 52 | self._obs_std = self._rms.std 53 | self._dim_noise_mag = self._noise_mag * self._obs_std 54 | 55 | obss_noise = torch.randn_like(obss) * self._dim_noise_mag 56 | next_obss_noise = torch.randn_like(next_obss) * self._dim_noise_mag 57 | obss += obss_noise # changes buf_data as well 58 | next_obss += next_obss_noise # changes buf_data as well 59 | 60 | return buf_data 61 | 62 | def sample_trajs(self, *args, **kwargs): 63 | buf_data = super().sample_trajs(*args, **kwargs) 64 | obss = buf_data[0] 65 | next_obss = buf_data[5] # TODO this only works with the way torch pin buffer is set up for now 66 
| 67 | if self._dim_noise_mag is None and not hasattr(self, '_rms'): self.update_stats() 68 | 69 | if hasattr(self, '_rms'): 70 | if self._rms is None: 71 | self._rms = RunningMeanStd(shape=(obss.shape[-1],), device=self.device) 72 | self._rms.update(obss) 73 | self._obs_mean = self._rms.mean 74 | self._obs_std = self._rms.std 75 | self._dim_noise_mag = self._noise_mag * self._obs_std 76 | 77 | 78 | obss_noise = torch.randn_like(obss) * self._dim_noise_mag 79 | next_obss_noise = torch.randn_like(next_obss) * self._dim_noise_mag 80 | obss += obss_noise # changes buf_data as well 81 | next_obss += next_obss_noise # changes buf_data as well 82 | 83 | return buf_data 84 | 85 | def update_stats(self): 86 | self._obs_mean = self.buffer.observations.mean(axis=0) 87 | self._obs_std = self.buffer.observations.std(axis=0) 88 | self._dim_noise_mag = self._noise_mag * self._obs_std 89 | -------------------------------------------------------------------------------- /six_state_mdp.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | # CONFIG 4 | max_iters = 5000 5 | alpha = .1 6 | stop_tol = 1e-5 7 | max_action_q_learning = False # if true, q learning instead of SARSA 8 | initial_q = 0 9 | 10 | q_table = { 11 | (1, None): 0, 12 | (2, None): 0, 13 | (3, None): 0, 14 | (4, None): 0, 15 | (5, None): 0, 16 | (1, 2): initial_q, 17 | (1, 5): initial_q, 18 | (2, 3): initial_q, 19 | (2, 6): initial_q, 20 | (3, 4): initial_q, 21 | (3, 6): initial_q, 22 | (4, 5): initial_q, 23 | (4, 6): initial_q, 24 | (5, 1): initial_q, 25 | (5, 5): initial_q, 26 | (6, 1): initial_q, 27 | (6, 6): initial_q 28 | } 29 | q_new = copy.deepcopy(q_table) 30 | 31 | r_table = { 32 | (1, 2): 1, 33 | (1, 5): -1, 34 | (2, 3): 1, 35 | (2, 6): -1, 36 | (3, 4): 1, 37 | (3, 6): -1, 38 | (4, 5): 1, 39 | (4, 6): -1, 40 | (5, 1): -1, 41 | (5, 5): 1, 42 | (6, 1): -1, 43 | (6, 6): -1 44 | } 45 | 46 | buffer = [ 47 | ((1, 5), (5, 5)), # ep 2 48 | ((5, 5), (5, 5)), 49 | ((5, 5), (5, 5)), 50 | ((5, 5), (5, 5)), 51 | ((5, 5), (5, None)), 52 | ((1, 2), (2, 6)), # ep 1 53 | ((2, 6), (6, 1)), 54 | ((6, 1), (1, 5)), 55 | ((1, 5), (5, 5)), 56 | ((5, 5), (5, None)), 57 | ((1, 2), (2, 3)), # ep 3 58 | ((2, 3), (3, 6)), 59 | ((3, 6), (6, 1)), 60 | ((6, 1), (1, 5)), 61 | ((1, 5), (5, None)), 62 | ] 63 | 64 | valid_update_states = [pair[0] for pair in buffer] 65 | 66 | for ep_i in range(len(buffer) // 5): 67 | 68 | for i in range(max_iters): 69 | 70 | short_buffer = buffer[:(ep_i + 1) * 5] 71 | for (state_act, next_state_act) in short_buffer: 72 | 73 | if next_state_act[1] is None: # equivalent of done, so update exclusively uses reward 74 | q_new[state_act] = q_new[state_act] + alpha * (r_table[state_act] - q_new[state_act]) 75 | 76 | else: 77 | if max_action_q_learning: 78 | max_state_act_val = -1e100 79 | max_state_act = None 80 | 81 | for qt_state_act in q_table: 82 | if qt_state_act[0] == next_state_act[0] and q_new[qt_state_act] > max_state_act_val\ 83 | and qt_state_act[1] is not None and qt_state_act in valid_update_states: 84 | # print(f"New max: next state act {qt_state_act}, " 85 | # f"val {q_new[qt_state_act]} used for updating {state_act}") 86 | max_state_act_val = q_new[qt_state_act] 87 | max_state_act = qt_state_act 88 | 89 | # print(f"Selected max next state act {max_state_act}, " 90 | # f"val {q_new[max_state_act]} used for updating {state_act}") 91 | 92 | q_new[state_act] = q_new[state_act] + alpha * (r_table[state_act] + 93 | q_new[max_state_act] - q_new[state_act]) 94 | else: 95 | 
q_new[state_act] = q_new[state_act] + alpha * (r_table[state_act] + 96 | q_new[next_state_act] - q_new[state_act]) 97 | 98 | total_diff = 0 99 | for state_act in q_table.keys(): 100 | total_diff += abs(q_new[state_act] - q_table[state_act]) 101 | 102 | q_table = copy.deepcopy(q_new) 103 | if total_diff < stop_tol: 104 | break 105 | 106 | print(f"End of ep: {ep_i}, current q(1, 5): {q_table[(1, 5)]}, q(1, 2): {q_table[(1, 2)]}") 107 | 108 | for k in q_table: 109 | q_table[k] = round(q_table[k], 4) 110 | 111 | print(f"Final Q Table after {i} iterations: {q_table}") -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/default_configs/dac.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | import torch 5 | 6 | import rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state as p_aux 7 | import rl_sandbox.constants as c 8 | import rl_sandbox.examples.lfgp.experiment_utils as exp_utils 9 | 10 | 11 | from rl_sandbox.model_architectures.actor_critics.fully_connected_soft_actor_critic import FullyConnectedSeparate, \ 12 | FullyConnectedSquashedGaussianSAC 13 | import rl_sandbox.examples.lfgp.default_configs.common as common_default 14 | 15 | 16 | def reward_func(reward, **kwargs): return np.array([reward]) 17 | 18 | def get_settings(args): 19 | if args.env_type == c.MANIPULATOR_LEARNING: 20 | common_default.main_task_alias_set(args) 21 | obs_dim, action_dim = common_default.get_obs_action_dim(args) 22 | common_default.default_settings(args) 23 | device = torch.device(args.device) 24 | num_tasks = 1 25 | 26 | if args.env_type == c.MANIPULATOR_LEARNING: 27 | save_path = exp_utils.get_save_path(c.DAC, args.main_task, args.seed, args.exp_name, args.top_save_path) 28 | else: 29 | save_path = exp_utils.get_save_path(c.DAC, args.env_name, args.seed, args.exp_name, args.top_save_path) 30 | 31 | # expert path 32 | expert_buffer = os.path.join(args.expert_top_dir, args.expert_dir_rest, args.expert_filenames) 33 | 34 | # reward options -- ensure we get the correct aux reward 35 | if args.env_type == c.MANIPULATOR_LEARNING: 36 | aux_reward_all = p_aux.PandaPlayXYZStateAuxiliaryReward(args.main_task, include_main=False) 37 | aux_reward_names = [func.__qualname__ for func in aux_reward_all._aux_rewards] 38 | 39 | if "unstack" in args.main_task: 40 | aux_reward_name = "stack_0" 41 | elif "insert" in args.main_task: 42 | aux_reward_name = "insert_0" 43 | else: 44 | aux_reward_name = args.main_task 45 | 46 | if 'no_move' in aux_reward_name: 47 | task_name = aux_reward_name.split('_no_move_')[0] 48 | aux_reward_name = f"{task_name}_0" 49 | 50 | eval_reward = aux_reward_all._aux_rewards[aux_reward_names.index(aux_reward_name)] 51 | 52 | elif args.env_type in [c.SAWYER, c.HAND_DAPG, c.PANDA_RL_ENVS]: 53 | eval_reward = None # uses env reward 54 | 55 | else: 56 | raise NotImplementedError("Not yet implemented for other env types") 57 | 58 | buffer_settings, expert_buffer_settings = common_default.get_buffer_settings( 59 | args, obs_dim, action_dim, num_tasks, False, device) 60 | 61 | ##### populate settings dictionary ##### 62 | experiment_setting = { 63 | **common_default.get_rl_settings(args, obs_dim, action_dim, args.num_evals_per_task), 64 | **common_default.get_train_settings(args, action_dim, device), 65 | c.DISCRIMINATOR_SETTING: common_default.get_discriminator_settings(args, obs_dim, action_dim, num_tasks, device), 66 | c.OPTIMIZER_SETTING: 
common_default.get_optimizer_settings(args), 67 | c.BUFFER_SETTING: buffer_settings, 68 | c.EXPERT_BUFFER_SETTING: expert_buffer_settings, 69 | 70 | # Model 71 | c.MODEL_SETTING: { 72 | c.MODEL_ARCHITECTURE: FullyConnectedSeparate if args.no_shared_layers else FullyConnectedSquashedGaussianSAC, 73 | c.KWARGS: { 74 | **common_default.get_model_kwargs(args, obs_dim, action_dim, device), 75 | } 76 | }, 77 | 78 | # DAC 79 | c.EXPERT_BUFFER: expert_buffer, 80 | c.EXPERT_AMOUNT: int(args.expert_amounts), 81 | c.EVALUATION_REWARD_FUNC: eval_reward, 82 | 83 | # Save 84 | c.SAVE_PATH: save_path, 85 | } 86 | 87 | if args.full_traj_expert_filenames: 88 | experiment_setting[c.FT_EXPERT_BUFFER] = os.path.join(args.expert_top_dir, args.ft_expert_dir_rest, args.expert_filenames) 89 | 90 | exp_utils.config_check(experiment_setting, args.top_save_path) 91 | 92 | return experiment_setting 93 | 94 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | import rl_sandbox.constants as c 6 | 7 | def default_weight_init(m): 8 | if type(m) == nn.Linear: 9 | torch.nn.init.xavier_uniform_(m.weight) 10 | # torch.nn.init.orthogonal_(m.weight) 11 | if m.bias is not None: 12 | m.bias.data.fill_(0) 13 | elif type(m) == nn.Conv2d: 14 | torch.nn.init.kaiming_uniform_(m.weight) 15 | if m.bias is not None: 16 | m.bias.data.fill_(0) 17 | elif type(m) == nn.LSTM or type(m) == nn.GRU: 18 | torch.nn.init.xavier_uniform_(m.weight_ih_l0) 19 | torch.nn.init.orthogonal_(m.weight_hh_l0) 20 | if m.bias is not None: 21 | m.bias_ih_l0.data.fill_(0) 22 | m.bias_hh_l0.data.fill_(0) 23 | 24 | 25 | def construct_linear_layers(layers): 26 | linear_layers = nn.ModuleList() 27 | for (in_dim, out_dim, activation, use_bias, dropout_p) in layers: 28 | linear_layers.append(nn.Linear(in_dim, out_dim, bias=use_bias)) 29 | linear_layers.append(activation) 30 | if dropout_p > 0.: 31 | linear_layers.append(nn.Dropout(dropout_p)) 32 | 33 | return linear_layers 34 | 35 | 36 | def make_model(model_cfg): 37 | return model_cfg[c.MODEL_ARCHITECTURE](**model_cfg[c.KWARGS]) 38 | 39 | 40 | def make_optimizer(parameters, optimizer_cfg): 41 | return optimizer_cfg[c.OPTIMIZER](parameters, **optimizer_cfg[c.KWARGS]) 42 | 43 | 44 | class RunningMeanStd(): 45 | """ Modified from Baseline 46 | Assumes shape to be (number of inputs, input_shape) 47 | """ 48 | 49 | def __init__(self, epsilon=1e-4, shape=(), norm_dim=(0,), a_min=-5., a_max=5., device='cpu'): 50 | assert epsilon > 0. 
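        # NOTE: `epsilon` plays two roles here: it seeds `count` so the first call to
        # `update_from_moments` is well defined, and it stabilizes the square root used in
        # `normalize`. `self.std` is only assigned inside `update_from_moments`, so
        # `unnormalize` assumes `update` has been called at least once.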
51 | self.shape = shape 52 | self.device = torch.device(device) 53 | self.mean = torch.zeros(shape, dtype=torch.float) 54 | self.var = torch.ones(shape, dtype=torch.float) 55 | self.epsilon = epsilon 56 | self.count = epsilon 57 | self.a_min = a_min 58 | self.a_max = a_max 59 | self.norm_dim = norm_dim 60 | self.to(self.device) 61 | 62 | def to(self, device): 63 | self.mean = self.mean.to(device) 64 | self.var = self.var.to(device) 65 | eps = torch.tensor([self.epsilon]) 66 | self.epsilon = eps.to(device) 67 | 68 | def update(self, x): 69 | batch_mean = torch.mean(x, dim=self.norm_dim) 70 | batch_var = torch.var(x, dim=self.norm_dim) 71 | batch_count = int(torch.prod(torch.tensor( 72 | [x.shape[dim] for dim in self.norm_dim]))) 73 | self.update_from_moments(batch_mean, batch_var, batch_count) 74 | 75 | def update_from_moments(self, batch_mean, batch_var, batch_count): 76 | delta = batch_mean - self.mean 77 | tot_count = self.count + batch_count 78 | 79 | new_mean = self.mean + delta * batch_count / tot_count 80 | m_a = self.var * self.count 81 | m_b = batch_var * batch_count 82 | M2 = m_a + m_b + (delta ** 2) * self.count * batch_count / tot_count 83 | new_var = M2 / tot_count 84 | new_count = tot_count 85 | 86 | self.mean = new_mean 87 | self.var = new_var 88 | self.std = torch.sqrt(self.var + self.epsilon) 89 | self.count = new_count 90 | 91 | def normalize(self, x): 92 | x_shape = x.shape 93 | x = x.reshape(-1, *self.shape).to(self.device) 94 | normalized_x = torch.clamp((x - self.mean) / torch.sqrt(self.var + self.epsilon), 95 | min=self.a_min, 96 | max=self.a_max) 97 | normalized_x[normalized_x != normalized_x] = 0. 98 | normalized_x = normalized_x.reshape(x_shape) 99 | return normalized_x 100 | 101 | def unnormalize(self, x): 102 | # return x * torch.sqrt(self.var + self.epsilon) + self.mean 103 | return x * self.std + self.mean 104 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/algo/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | 7 | class PPO(): 8 | def __init__(self, 9 | actor_critic, 10 | clip_param, 11 | ppo_epoch, 12 | num_mini_batch, 13 | value_loss_coef, 14 | entropy_coef, 15 | lr=None, 16 | eps=None, 17 | max_grad_norm=None, 18 | use_clipped_value_loss=True): 19 | 20 | self.actor_critic = actor_critic 21 | 22 | self.clip_param = clip_param 23 | self.ppo_epoch = ppo_epoch 24 | self.num_mini_batch = num_mini_batch 25 | 26 | self.value_loss_coef = value_loss_coef 27 | self.entropy_coef = entropy_coef 28 | 29 | self.max_grad_norm = max_grad_norm 30 | self.use_clipped_value_loss = use_clipped_value_loss 31 | 32 | self.optimizer = optim.Adam(actor_critic.parameters(), lr=lr, eps=eps) 33 | 34 | def update(self, rollouts): 35 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] 36 | advantages = (advantages - advantages.mean()) / ( 37 | advantages.std() + 1e-5) 38 | 39 | value_loss_epoch = 0 40 | action_loss_epoch = 0 41 | dist_entropy_epoch = 0 42 | 43 | for e in range(self.ppo_epoch): 44 | if self.actor_critic.is_recurrent: 45 | data_generator = rollouts.recurrent_generator( 46 | advantages, self.num_mini_batch) 47 | else: 48 | data_generator = rollouts.feed_forward_generator( 49 | advantages, self.num_mini_batch) 50 | 51 | for sample in data_generator: 52 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 53 | 
value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \ 54 | adv_targ = sample 55 | 56 | # Reshape to do in a single forward pass for all steps 57 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 58 | obs_batch, recurrent_hidden_states_batch, masks_batch, 59 | actions_batch) 60 | 61 | ratio = torch.exp(action_log_probs - 62 | old_action_log_probs_batch) 63 | surr1 = ratio * adv_targ 64 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 65 | 1.0 + self.clip_param) * adv_targ 66 | action_loss = -torch.min(surr1, surr2).mean() 67 | 68 | if self.use_clipped_value_loss: 69 | value_pred_clipped = value_preds_batch + \ 70 | (values - value_preds_batch).clamp(-self.clip_param, self.clip_param) 71 | value_losses = (values - return_batch).pow(2) 72 | value_losses_clipped = ( 73 | value_pred_clipped - return_batch).pow(2) 74 | value_loss = 0.5 * torch.max(value_losses, 75 | value_losses_clipped).mean() 76 | else: 77 | value_loss = 0.5 * (return_batch - values).pow(2).mean() 78 | 79 | self.optimizer.zero_grad() 80 | (value_loss * self.value_loss_coef + action_loss - 81 | dist_entropy * self.entropy_coef).backward() 82 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 83 | self.max_grad_norm) 84 | self.optimizer.step() 85 | 86 | value_loss_epoch += value_loss.item() 87 | action_loss_epoch += action_loss.item() 88 | dist_entropy_epoch += dist_entropy.item() 89 | 90 | num_updates = self.ppo_epoch * self.num_mini_batch 91 | 92 | value_loss_epoch /= num_updates 93 | action_loss_epoch /= num_updates 94 | dist_entropy_epoch /= num_updates 95 | 96 | return value_loss_epoch, action_loss_epoch, dist_entropy_epoch 97 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/train_multitask_bc.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import gzip 3 | 4 | import torch 5 | 6 | import rl_sandbox.constants as c 7 | 8 | from rl_sandbox.algorithms.bc.bc import MultitaskBC 9 | from rl_sandbox.algorithms.sac_x.schedulers import FixedScheduler, RecycleScheduler 10 | from rl_sandbox.auxiliary_tasks.utils import make_auxiliary_tasks 11 | from rl_sandbox.buffers.utils import make_buffer 12 | from rl_sandbox.envs.fake_env import FakeEnv 13 | from rl_sandbox.envs.utils import make_env 14 | from rl_sandbox.learning_utils import train 15 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 16 | from rl_sandbox.agents.hrl_agents import SACXAgent 17 | from rl_sandbox.transforms.general_transforms import Identity 18 | from rl_sandbox.utils import make_summary_writer, set_seed 19 | 20 | def train_multitask_bc(experiment_config): 21 | seed = experiment_config[c.SEED] 22 | save_path = experiment_config.get(c.SAVE_PATH, None) 23 | buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity()) 24 | num_tasks = experiment_config[c.NUM_TASKS] 25 | 26 | set_seed(seed) 27 | train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM]) 28 | model = make_model(experiment_config[c.MODEL_SETTING]) 29 | 30 | assert num_tasks == len(experiment_config[c.EXPERT_BUFFERS]) == experiment_config[c.AUXILIARY_REWARDS].num_auxiliary_rewards 31 | expert_buffers = [] 32 | for load_path in experiment_config[c.EXPERT_BUFFERS]: 33 | # drop memory size for expert buffers to only what is needed 34 | with gzip.open(load_path, "rb") as f: 35 | data = pickle.load(f) 36 | experiment_config[c.BUFFER_SETTING][c.KWARGS][c.MEMORY_SIZE] = 
data[c.MEMORY_SIZE] 37 | 38 | expert_buffers.append(make_buffer(experiment_config[c.BUFFER_SETTING], seed, load_path)) 39 | 40 | optimizer = make_optimizer(model.parameters(), experiment_config[c.OPTIMIZER_SETTING][c.POLICY]) 41 | aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS], 42 | model, 43 | expert_buffers[0], 44 | experiment_config) 45 | 46 | learning_algorithm = MultitaskBC(model=model, 47 | optimizer=optimizer, 48 | expert_buffers=expert_buffers, 49 | algo_params=experiment_config, 50 | aux_tasks=aux_tasks) 51 | 52 | load_model = experiment_config.get(c.LOAD_MODEL, False) 53 | if load_model: 54 | learning_algorithm.load_state_dict(torch.load(load_model)) 55 | 56 | agent = SACXAgent(scheduler=FixedScheduler(num_tasks=num_tasks, 57 | intention_i=0), 58 | intentions=model, 59 | learning_algorithm=learning_algorithm, 60 | scheduler_period=c.MAX_INT, 61 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 62 | evaluation_env = None 63 | evaluation_agent = None 64 | if experiment_config.get(c.EVALUATION_FREQUENCY, 0): 65 | assert experiment_config[c.NUM_EVALUATION_EPISODES] % num_tasks == 0 66 | evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1) 67 | evaluation_agent = SACXAgent(scheduler=RecycleScheduler(num_tasks=num_tasks, 68 | scheduling=[experiment_config[c.NUM_EVALUATION_EPISODES] // num_tasks] * num_tasks), 69 | intentions=model, 70 | learning_algorithm=None, 71 | scheduler_period=c.MAX_INT, 72 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 73 | 74 | summary_writer, save_path = make_summary_writer(save_path=save_path, algo=c.MULTITASK_BC, cfg=experiment_config) 75 | train(agent=agent, 76 | evaluation_agent=evaluation_agent, 77 | train_env=train_env, 78 | evaluation_env=evaluation_env, 79 | auxiliary_reward=experiment_config[c.AUXILIARY_REWARDS].reward, 80 | buffer_preprocess=buffer_preprocessing, 81 | experiment_settings=experiment_config, 82 | summary_writer=summary_writer, 83 | save_path=save_path) 84 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_tasks/koopman.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | import rl_sandbox.constants as c 6 | 7 | from rl_sandbox.auxiliary_tasks.auxiliary_tasks import AuxiliaryTask 8 | from rl_sandbox.model_architectures.shared import Flatten 9 | 10 | class Koopman(AuxiliaryTask): 11 | def __init__(self, 12 | rec_dim, 13 | batch_size, 14 | encoder, 15 | decoder, 16 | dynamics, 17 | opt, 18 | buffer, 19 | algo_params, 20 | reduction=c.SUM, 21 | loss_coef=1., 22 | device=torch.device(c.CPU), 23 | **kwargs): 24 | # Image dim: (num_images, num_frames, height, width) 25 | assert len(rec_dim) == 4 26 | super().__init__() 27 | self._flat = Flatten() 28 | 29 | self._rec_dim = rec_dim 30 | self._flatten_dim = int(np.product(rec_dim)) 31 | self._batch_size = batch_size 32 | 33 | self._buffer = buffer 34 | self._encoder = encoder 35 | self._decoder = decoder 36 | self._dynamics = dynamics 37 | self._opt = opt 38 | 39 | self._loss_coef = loss_coef 40 | self._mse = torch.nn.MSELoss(reduction=reduction) 41 | 42 | self.device = device 43 | self.algo_params = algo_params 44 | self.train_preprocessing = algo_params[c.TRAIN_PREPROCESSING] 45 | 46 | def state_dict(self): 47 | return { 48 | c.DECODER: self._decoder.state_dict(), 49 | c.KOOPMAN_DYNAMICS: self._dynamics.state_dict(), 50 | c.KOOPMAN_OPTIMIZER: 
self._opt.state_dict() 51 | } 52 | 53 | def load_state_dict(self, state_dict): 54 | self._decoder.load_state_dict(state_dict[c.DECODER]) 55 | self._dynamics.load_state_dict(state_dict[c.KOOPMAN_DYNAMICS]) 56 | self._opt.load_state_dict(state_dict[c.KOOPMAN_OPTIMIZER]) 57 | 58 | @property 59 | def opt(self): 60 | return self._opt 61 | 62 | def compute_loss(self, next_obs, next_h_state): 63 | obss, _, acts, _, dones, next_obss, _, _ = self._buffer.sample_with_next_obs( 64 | self._batch_size, next_obs, next_h_state) 65 | 66 | obss = self.train_preprocessing(obss) 67 | next_obss = self.train_preprocessing(next_obss) 68 | 69 | batch_size = obss.shape[0] 70 | 71 | x = obss[:, :self._flatten_dim].reshape( 72 | batch_size * self._rec_dim[0], *self._rec_dim[1:]).to(self.device) 73 | 74 | z_hat = self._encoder(x) 75 | x_hat = self._decoder(z_hat) 76 | 77 | # Compute autoencoder reconstruction loss 78 | ae_loss = self._mse(x_hat, x) 79 | 80 | # This only looks at observations with valid transitions 81 | valid_ind = torch.where(dones == 0)[0] 82 | 83 | z_hat = z_hat[valid_ind] 84 | x_hat = x_hat[valid_ind] 85 | 86 | # Compute MSE K(g(x{n})) + B(u_{n}) and g(x_{n+1}) 87 | next_x = next_obss[valid_ind, :self._flatten_dim].reshape( 88 | len(valid_ind) * self._rec_dim[0], *self._rec_dim[1:]).to(self.device) 89 | z_next_hat = self._encoder(next_x) 90 | 91 | z_next_trans = self._dynamics(z_hat, acts[valid_ind]) 92 | transition_loss = self._mse(z_next_hat, z_next_trans) 93 | 94 | # Compute MSE of future state reconstruction 95 | # Compute reconstruction of K(g(x{n})) + B(u_{n}), which is approximately = g(x_{n+1}) 96 | x_next_hat = self._decoder(z_next_hat) 97 | x_next_trans = self._decoder(z_next_trans) 98 | 99 | # Compute reconstruction from z_{n+1} 100 | future_rec_loss = self._mse(x_next_hat, x_next_trans) 101 | 102 | return self._loss_coef * (ae_loss + transition_loss + future_rec_loss) 103 | 104 | 105 | class KoopmanDynamics(nn.Module): 106 | def __init__(self, z_dim, u_dim, device=torch.device(c.CPU)): 107 | super().__init__() 108 | self.device = device 109 | 110 | self.K = torch.nn.Linear(z_dim, z_dim) 111 | self.B = torch.nn.Linear(u_dim, z_dim) 112 | 113 | self.to(device) 114 | 115 | def forward(self, z, u): 116 | z, u = z.to(self.device), u.to(self.device) 117 | Kz = self.K(z) 118 | Bu = self.B(u) 119 | 120 | return Kz + Bu 121 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/train_multitask_bc_no_overfit.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import gzip 3 | 4 | import torch 5 | 6 | import rl_sandbox.constants as c 7 | 8 | from rl_sandbox.algorithms.bc.bc_no_overfit import MultitaskBC 9 | from rl_sandbox.algorithms.sac_x.schedulers import FixedScheduler, RecycleScheduler 10 | from rl_sandbox.auxiliary_tasks.utils import make_auxiliary_tasks 11 | from rl_sandbox.buffers.utils import make_buffer 12 | from rl_sandbox.envs.fake_env import FakeEnv 13 | from rl_sandbox.envs.utils import make_env 14 | from rl_sandbox.learning_utils import train 15 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 16 | from rl_sandbox.agents.hrl_agents import SACXAgent 17 | from rl_sandbox.transforms.general_transforms import Identity 18 | from rl_sandbox.utils import make_summary_writer, set_seed 19 | from rl_sandbox.examples.lfgp.experts.subsample_expert_data import subsample_buffers 20 | 21 | def train_multitask_bc_no_overfit(experiment_config): 22 | seed = 
experiment_config[c.SEED] 23 | save_path = experiment_config.get(c.SAVE_PATH, None) 24 | buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity()) 25 | num_tasks = experiment_config[c.NUM_TASKS] 26 | 27 | set_seed(seed) 28 | train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM]) 29 | model = make_model(experiment_config[c.MODEL_SETTING]) 30 | 31 | assert num_tasks == len(experiment_config[c.EXPERT_BUFFERS]) == experiment_config[c.AUXILIARY_REWARDS].num_auxiliary_rewards 32 | expert_buffers = [] 33 | for load_path in experiment_config[c.EXPERT_BUFFERS]: 34 | # drop memory size for expert buffers to only what is needed 35 | with gzip.open(load_path, "rb") as f: 36 | data = pickle.load(f) 37 | experiment_config[c.BUFFER_SETTING][c.KWARGS][c.MEMORY_SIZE] = data[c.MEMORY_SIZE] 38 | 39 | expert_buffers.append(make_buffer(experiment_config[c.BUFFER_SETTING], seed, load_path)) 40 | 41 | if experiment_config.get(c.EXPERT_BUFFER_SUBSAMPLING, None) is not None: 42 | expert_buffers = subsample_buffers(expert_buffers, experiment_config[c.EXPERT_BUFFER_SUBSAMPLING]) 43 | 44 | optimizer = make_optimizer(model.parameters(), experiment_config[c.OPTIMIZER_SETTING][c.POLICY]) 45 | aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS], 46 | model, 47 | expert_buffers[0], 48 | experiment_config) 49 | 50 | learning_algorithm = MultitaskBC(model=model, 51 | optimizer=optimizer, 52 | expert_buffers=expert_buffers, 53 | algo_params=experiment_config, 54 | aux_tasks=aux_tasks) 55 | 56 | load_model = experiment_config.get(c.LOAD_MODEL, False) 57 | if load_model: 58 | learning_algorithm.load_state_dict(torch.load(load_model)) 59 | 60 | agent = SACXAgent(scheduler=FixedScheduler(num_tasks=num_tasks, 61 | intention_i=0), 62 | intentions=model, 63 | learning_algorithm=learning_algorithm, 64 | scheduler_period=c.MAX_INT, 65 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 66 | evaluation_env = None 67 | evaluation_agent = None 68 | if experiment_config.get(c.EVALUATION_FREQUENCY, 0): 69 | assert experiment_config[c.NUM_EVALUATION_EPISODES] % num_tasks == 0 70 | evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1) 71 | evaluation_agent = SACXAgent(scheduler=RecycleScheduler(num_tasks=num_tasks, 72 | scheduling=[experiment_config[c.NUM_EVALUATION_EPISODES] // num_tasks] * num_tasks), 73 | intentions=model, 74 | learning_algorithm=None, 75 | scheduler_period=c.MAX_INT, 76 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 77 | 78 | summary_writer, save_path = make_summary_writer(save_path=save_path, algo=c.MULTITASK_BC, cfg=experiment_config) 79 | train(agent=agent, 80 | evaluation_agent=evaluation_agent, 81 | train_env=train_env, 82 | evaluation_env=evaluation_env, 83 | auxiliary_reward=experiment_config[c.AUXILIARY_REWARDS].reward, 84 | buffer_preprocess=buffer_preprocessing, 85 | experiment_settings=experiment_config, 86 | summary_writer=summary_writer, 87 | save_path=save_path) 88 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/agents/rl_agents.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from torch.distributions import Categorical, Normal 5 | 6 | import rl_sandbox.constants as c 7 | from rl_sandbox.agents.random_agents import UniformContinuousAgent 8 | 9 | 10 | class RLAgent(): 11 | def __init__(self, model, learning_algorithm): 12 | self.model = model 13 | self.learning_algorithm = 
learning_algorithm 14 | 15 |     def update(self, curr_obs, curr_h_state, action, reward, done, info, next_obs, next_h_state, **kwargs): 16 |         return self.learning_algorithm.update(curr_obs, 17 |                                                curr_h_state, 18 |                                                action, 19 |                                                reward, 20 |                                                done, 21 |                                                info, 22 |                                                next_obs, 23 |                                                next_h_state, 24 |                                                **kwargs) 25 | 26 |     def compute_action(self, obs, **kwargs): 27 |         raise NotImplementedError 28 | 29 |     def reset(self): 30 |         # Returns initial hidden state 31 |         if hasattr(self.model, c.INITIALIZE_HIDDEN_STATE): 32 |             return self.model.initialize_hidden_state().numpy().astype(np.float32) 33 |         return np.array([np.nan], dtype=np.float32) 34 | 35 | 36 | class ACAgent(RLAgent): 37 |     def __init__(self, model, learning_algorithm, preprocess=lambda obs: obs): 38 |         super().__init__(model=model, 39 |                          learning_algorithm=learning_algorithm) 40 |         self.preprocess = preprocess 41 | 42 |     def preprocess(self, obs):  # NOTE: shadowed by the self.preprocess attribute assigned in __init__, so this default is effectively unused 43 |         return obs 44 | 45 |     def compute_action(self, obs, hidden_state): 46 |         obs = torch.tensor(obs).unsqueeze(0) 47 |         obs = self.preprocess(obs) 48 |         hidden_state = torch.tensor(hidden_state).unsqueeze(0) 49 |         action, value, hidden_state, log_prob, entropy, mean, variance = self.model.compute_action( 50 |             obs, hidden_state) 51 |         act_info = {c.VALUE: value, 52 |                     c.LOG_PROB: log_prob, 53 |                     c.ENTROPY: entropy, 54 |                     c.MEAN: mean, 55 |                     c.VARIANCE: variance} 56 |         return action, hidden_state, act_info 57 | 58 |     def deterministic_action(self, obs, hidden_state): 59 |         obs = torch.tensor(obs).unsqueeze(0) 60 |         obs = self.preprocess(obs) 61 |         hidden_state = torch.tensor(hidden_state).unsqueeze(0) 62 |         action, value, hidden_state, log_prob, entropy = self.model.deterministic_action( 63 |             obs, hidden_state) 64 |         act_info = {c.VALUE: value, 65 |                     c.LOG_PROB: log_prob, 66 |                     c.ENTROPY: entropy} 67 |         return action, hidden_state, act_info 68 | 69 | 70 | class ACAgentEUniformExplorer(ACAgent): 71 |     """ Agent that enforces more exploration. 72 | 73 |     prob_explore_ep: probability of executing an "exploration" episode. Determined during call to agent.reset(). 74 |     prob_explore_act: probability of executing an exploratory action during exploration episode. 75 |     max_repeat: max number of timesteps to repeat exploratory action. 76 |     min_repeat: min number of timesteps to repeat exploratory action.
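    min_action/max_action: bounds of the uniform random exploratory actions.

    At reset(), an episode is flagged as an exploration episode with probability prob_explore_ep.
    Within such an episode, each step either keeps repeating the current exploratory action,
    starts a new uniform random action with probability prob_explore_act (repeated for a random
    number of steps in [min_repeat, max_repeat)), or falls back to the usual ACAgent.compute_action.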
77 | """ 78 | def __init__(self, model, learning_algorithm, prob_explore_ep, prob_explore_act, max_repeat, min_repeat, 79 | min_action=-1, max_action=1, preprocess=lambda obs: obs): 80 | super().__init__(model, learning_algorithm, preprocess) 81 | self._prob_explore_ep = prob_explore_ep 82 | self._prob_explore_act = prob_explore_act 83 | self._max_repeat = max_repeat 84 | self._min_repeat = min_repeat 85 | self._explore_ep = False 86 | self._cur_explore_act = None 87 | self._act_repeat_ts = 0 88 | self._act_repeat_length = 0 89 | self._action_dim = self.model._action_dim 90 | self._uni_rand_agent = UniformContinuousAgent(np.ones(self._action_dim) * min_action, 91 | np.ones(self._action_dim) * max_action) 92 | 93 | def compute_action(self, obs, hidden_state): 94 | explore_act = False 95 | if self._explore_ep: 96 | if self._cur_explore_act is not None: 97 | if self._act_repeat_ts < self._act_repeat_length: 98 | explore_act = True 99 | self._act_repeat_ts +=1 100 | else: 101 | # reset action repeat explore 102 | self._cur_explore_act = None 103 | self._act_repeat_ts = 0 104 | 105 | if self._cur_explore_act is None: 106 | explore_act = np.random.rand() < self._prob_explore_act 107 | 108 | if explore_act: 109 | self._cur_explore_act = list(self._uni_rand_agent.compute_action()) 110 | self._cur_explore_act[1] = hidden_state 111 | self._cur_explore_act = tuple(self._cur_explore_act) 112 | self._act_repeat_length = np.random.randint(self._min_repeat, self._max_repeat) 113 | self._act_repeat_ts +=1 114 | 115 | if explore_act: 116 | return self._cur_explore_act 117 | else: 118 | return super().compute_action(obs, hidden_state) 119 | 120 | def reset(self): 121 | self._explore_ep = np.random.rand() < self._prob_explore_ep 122 | self._cur_explore_act = None 123 | return super().reset() -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/collect_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script loads up a trained model and collects trajectories using the model. 3 | We choose how much data is generated by the model and random uniform policy. 4 | 5 | The model path consists of the state dict of the model. 6 | 7 | The config path consists of all the settings to load the environment 8 | and preprocessing. 
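Outputs are written under --save_path as gzipped pickles: init_obss.pkl, obss.pkl, acts.pkl,
rews.pkl, and dones.pkl, plus metadata.pkl with per-episode returns and the run arguments.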
9 | 10 | Example usage: 11 | python collect_data.py --seed=0 --model_path=./state_dict.pt \ 12 | --config_path=./experiment_setting.pkl --num_episodes=5 \ 13 | --num_samples=1000 --save_path=./data.pkl 14 | """ 15 | 16 | import _pickle as pickle 17 | import argparse 18 | import gzip 19 | import numpy as np 20 | import os 21 | 22 | from pprint import pprint 23 | from tqdm import tqdm 24 | 25 | import rl_sandbox.constants as c 26 | 27 | from rl_sandbox.examples.eval_tools.utils import load_model 28 | from rl_sandbox.utils import set_seed 29 | 30 | 31 | def collect_data(args): 32 | set_seed(args.seed) 33 | assert args.num_episodes > 0 34 | assert args.num_samples > 0 35 | assert 0 <= args.mixture_ratio <= 1 36 | 37 | dir_exists = os.path.isdir(args.save_path) 38 | assert dir_exists or not os.path.exists(args.save_path) 39 | 40 | if not dir_exists: 41 | os.makedirs(args.save_path, exist_ok=True) 42 | 43 | config, env, buffer_preprocess, agent = load_model(args.seed, 44 | args.config_path, 45 | args.model_path, 46 | args.intention) 47 | 48 | init_observations = [] 49 | observations = [] 50 | actions = [] 51 | rewards = [] 52 | dones = [] 53 | 54 | episodes_pbar = tqdm(total=args.num_episodes) 55 | samples_pbar = tqdm(total=args.num_samples) 56 | 57 | sample_i = 0 58 | eval_returns = [] 59 | for episode_i in range(args.num_episodes): 60 | eval_returns.append(0) 61 | obs = env.reset() 62 | 63 | init_observations.append(obs) 64 | 65 | buffer_preprocess.reset() 66 | obs = buffer_preprocess(obs) 67 | h_state = agent.reset() 68 | done = False 69 | 70 | while not done: 71 | if hasattr(env, c.RENDER) and args.render: 72 | env.render() 73 | 74 | if args.deterministic: 75 | action, h_state, act_info = agent.deterministic_action( 76 | obs=obs, hidden_state=h_state) 77 | else: 78 | action, h_state, act_info = agent.compute_action( 79 | obs=obs, hidden_state=h_state) 80 | 81 | if np.random.uniform() < args.mixture_ratio: 82 | action = np.random.uniform(config[c.MIN_ACTION], config[c.MAX_ACTION], config[c.ACTION_DIM]) 83 | 84 | actions.append(action) 85 | 86 | if config[c.CLIP_ACTION]: 87 | action = np.clip(action, a_min=config[c.MIN_ACTION], a_max=config[c.MAX_ACTION]) 88 | 89 | obs, reward, done, _ = env.step(action) 90 | 91 | observations.append(obs) 92 | rewards.append(reward) 93 | dones.append(done) 94 | obs = buffer_preprocess(obs) 95 | 96 | eval_returns[-1] += reward 97 | sample_i += 1 98 | samples_pbar.update(1) 99 | if sample_i >= args.num_samples: 100 | break 101 | else: 102 | episodes_pbar.update(1) 103 | continue 104 | break 105 | 106 | ret_mean = np.mean(eval_returns) 107 | ret_std = np.std(eval_returns) 108 | ret_max = np.max(eval_returns) 109 | ret_min = np.min(eval_returns) 110 | 111 | print("=" * 100) 112 | print("Interacted with {} complete episodes ({} timesteps)".format(episode_i, sample_i)) 113 | print("Average Return: {} - Std: {}".format(ret_mean, ret_std)) 114 | print("Max Return: {} - Min Return: {}".format(ret_max, ret_min)) 115 | 116 | for (filename, data) in zip(("init_obss", "obss", "acts", "rews", "dones"), 117 | (init_observations, observations, actions, rewards, dones)): 118 | with gzip.open(f"{args.save_path}/{filename}.pkl", "wb") as f: 119 | pickle.dump(data, f) 120 | 121 | with gzip.open(f"{args.save_path}/metadata.pkl", "wb") as f: 122 | pickle.dump({ 123 | "returns": eval_returns, 124 | "min": ret_min, 125 | "max": ret_max, 126 | "avg": ret_mean, 127 | "std": ret_std, 128 | **args.__dict__, 129 | }, f) 130 | 131 | if __name__ == "__main__": 132 | parser = 
argparse.ArgumentParser() 133 | parser.add_argument("--render", action="store_true", help="Render the environment") 134 | 135 | parser.add_argument("--seed", type=int, default=0, help="The random seed") 136 | parser.add_argument("--save_path", type=str, required=True, help="The directory to save the trajectories") 137 | parser.add_argument("--mixture_ratio", required=True, type=float, help="Amount of data sampled using random uniform policy") 138 | parser.add_argument("--deterministic", action="store_true", help="Whether or not to use deterministic action (the action mean) from the agent") 139 | parser.add_argument("--num_episodes", required=True, type=int, help="The maximum number of episodes") 140 | parser.add_argument("--num_samples", required=True, type=int, help="The maximum number of samples") 141 | 142 | parser.add_argument("--model_path", required=True, type=str, help="The path to load the model") 143 | parser.add_argument("--config_path", required=True, type=str, help="The path to load the config that trained the model") 144 | parser.add_argument("--intention", type=int, default=0, help="The intention to use for SAC-X") 145 | args = parser.parse_args() 146 | 147 | pprint(args) 148 | 149 | collect_data(args) 150 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser(description='RL') 8 | parser.add_argument( 9 | '--algo', default='a2c', help='algorithm to use: a2c | ppo | acktr') 10 | parser.add_argument( 11 | '--gail', 12 | action='store_true', 13 | default=False, 14 | help='do imitation learning with gail') 15 | parser.add_argument( 16 | '--gail-experts-file', 17 | help='file that contains expert demonstrations for gail') 18 | parser.add_argument( 19 | '--gail-batch-size', 20 | type=int, 21 | default=128, 22 | help='gail batch size (default: 128)') 23 | parser.add_argument( 24 | '--gail-epoch', type=int, default=5, help='gail epochs (default: 5)') 25 | parser.add_argument( 26 | '--lr', type=float, default=7e-4, help='learning rate (default: 7e-4)') 27 | parser.add_argument( 28 | '--eps', 29 | type=float, 30 | default=1e-5, 31 | help='RMSprop optimizer epsilon (default: 1e-5)') 32 | parser.add_argument( 33 | '--alpha', 34 | type=float, 35 | default=0.99, 36 | help='RMSprop optimizer apha (default: 0.99)') 37 | parser.add_argument( 38 | '--gamma', 39 | type=float, 40 | default=0.99, 41 | help='discount factor for rewards (default: 0.99)') 42 | parser.add_argument( 43 | '--use-gae', 44 | action='store_true', 45 | default=False, 46 | help='use generalized advantage estimation') 47 | parser.add_argument( 48 | '--gae-lambda', 49 | type=float, 50 | default=0.95, 51 | help='gae lambda parameter (default: 0.95)') 52 | parser.add_argument( 53 | '--entropy-coef', 54 | type=float, 55 | default=0.01, 56 | help='entropy term coefficient (default: 0.01)') 57 | parser.add_argument( 58 | '--value-loss-coef', 59 | type=float, 60 | default=0.5, 61 | help='value loss coefficient (default: 0.5)') 62 | parser.add_argument( 63 | '--max-grad-norm', 64 | type=float, 65 | default=0.5, 66 | help='max norm of gradients (default: 0.5)') 67 | parser.add_argument( 68 | '--seed', type=int, default=1, help='random seed (default: 1)') 69 | parser.add_argument( 70 | '--cuda-deterministic', 71 | action='store_true', 72 | default=False, 73 | 
help="sets flags for determinism when using CUDA (potentially slow!)") 74 | parser.add_argument( 75 | '--num-processes', 76 | type=int, 77 | default=16, 78 | help='how many training CPU processes to use (default: 16)') 79 | parser.add_argument( 80 | '--num-steps', 81 | type=int, 82 | default=5, 83 | help='number of forward steps in A2C (default: 5)') 84 | parser.add_argument( 85 | '--ppo-epoch', 86 | type=int, 87 | default=4, 88 | help='number of ppo epochs (default: 4)') 89 | parser.add_argument( 90 | '--num-mini-batch', 91 | type=int, 92 | default=32, 93 | help='number of batches for ppo (default: 32)') 94 | parser.add_argument( 95 | '--clip-param', 96 | type=float, 97 | default=0.2, 98 | help='ppo clip parameter (default: 0.2)') 99 | parser.add_argument( 100 | '--log-interval', 101 | type=int, 102 | default=10, 103 | help='log interval, one log per n updates (default: 10)') 104 | parser.add_argument( 105 | '--save-interval', 106 | type=int, 107 | default=100, 108 | help='save interval, one save per n updates (default: 100)') 109 | parser.add_argument( 110 | '--eval-interval', 111 | type=int, 112 | default=None, 113 | help='eval interval, one eval per n updates (default: None)') 114 | parser.add_argument( 115 | '--num-env-steps', 116 | type=int, 117 | default=10e6, 118 | help='number of environment steps to train (default: 10e6)') 119 | parser.add_argument( 120 | '--env-name', 121 | default='PongNoFrameskip-v4', 122 | help='environment to train on (default: PongNoFrameskip-v4)') 123 | parser.add_argument( 124 | '--log-dir', 125 | default='/tmp/gym/', 126 | help='directory to save agent logs (default: /tmp/gym)') 127 | parser.add_argument( 128 | '--save-dir', 129 | default='./trained_models/', 130 | help='directory to save agent logs (default: ./trained_models/)') 131 | parser.add_argument( 132 | '--no-cuda', 133 | action='store_true', 134 | default=False, 135 | help='disables CUDA training') 136 | parser.add_argument( 137 | '--use-proper-time-limits', 138 | action='store_true', 139 | default=False, 140 | help='compute returns taking into account time limits') 141 | parser.add_argument( 142 | '--recurrent-policy', 143 | action='store_true', 144 | default=False, 145 | help='use a recurrent policy') 146 | parser.add_argument( 147 | '--use-linear-lr-decay', 148 | action='store_true', 149 | default=False, 150 | help='use a linear schedule on the learning rate') 151 | parser.add_argument( 152 | '--train-render', 153 | action='store_true', 154 | default=False, 155 | help='render training env') 156 | parser.add_argument( 157 | '--eval-render', 158 | action='store_true', 159 | default=False, 160 | help='render eval env') 161 | parser.add_argument( 162 | '--eval-eps', 163 | type=int, 164 | default=50, 165 | help='# of evaluation episodes') 166 | args = parser.parse_args() 167 | 168 | args.cuda = not args.no_cuda and torch.cuda.is_available() 169 | 170 | assert args.algo in ['a2c', 'ppo', 'acktr'] 171 | if args.recurrent_policy: 172 | assert args.algo in ['a2c', 'ppo'], \ 173 | 'Recurrent policy is not implemented for ACKTR' 174 | 175 | return args 176 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/experts/create_subsampled_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script loads up existing buffers and generates subsampled versions. 3 | 4 | Compared with subsampling on the fly, this ensures that all methods use the exact same data. 
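By default, every keep_every_nth transition is kept, starting from a random initial offset.
The keep_first_last / keep_last / keep_last_only flags additionally retain (or exclusively keep)
trajectory endpoints, and num_to_keep caps the number of transitions kept per intention.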
5 | 6 | Example usage: 7 | python create_subsampled_data.py --seed=0 --input_path=./expert_data \ 8 | --output_path=./expert_data_subsampled --keep_every_nth=20 9 | """ 10 | 11 | import copy 12 | import glob 13 | import gzip 14 | import _pickle as pickle 15 | import argparse 16 | import os 17 | import numpy as np 18 | 19 | import rl_sandbox.constants as c 20 | 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("--seed", type=int, default=0, help="The random seed") 24 | parser.add_argument("--input_path", required=True, type=str, help="The path to .gz file(s) with expert data") 25 | parser.add_argument("--output_path", required=True, type=str, help="The path to save the new data") 26 | parser.add_argument("--keep_every_nth", required=False, type=int, help="Keep every nth piece of data.") 27 | parser.add_argument("--keep_first_last", action='store_true', help="Keep the first and last of each trajectory, "\ 28 | "and otherwise subsample normally.") 29 | parser.add_argument("--keep_last", action='store_true', help="Keep the last of each trajectory, "\ 30 | "and otherwise subsample normally.") 31 | parser.add_argument("--keep_last_only", action='store_true', help="Keep the last of each trajectory exclusively.") 32 | parser.add_argument("--num_to_keep", required=False, type=int, help="Cap the amount of data to keep in each intention.") 33 | parser.add_argument("--num_extra_lasts", required=False, type=int, help="Add a number of extra final transitions to each intention.") 34 | parser.add_argument("--num_to_subsample_from", required=False, type=int, help="Cap amount of data to subsample from, but firsts/lasts can still come from more.") 35 | 36 | args = parser.parse_args() 37 | 38 | 39 | SUBSAMPLE_KEYS = ['observations', 'hidden_states', 'actions', 'rewards', 'dones', 'next_observations', 40 | 'next_hidden_states'] 41 | 42 | np.random.seed(args.seed) 43 | data_paths = glob.glob(os.path.join(args.input_path, '*.gz')) 44 | 45 | assert(os.path.exists(args.input_path)), f"No data folder found at {args.input_path}" 46 | assert sum([args.keep_first_last, args.keep_last, args.keep_last_only]) <= 1, "Can only set one of these." 47 | # assert not (args.keep_first_last and args.keep_last), "Can't set both keep_first_last and keep_last" 48 | 49 | if os.path.exists(args.output_path): 50 | overwrite = input("Output path already exists. Overwrite? 
Anything but \"yes\" exits.") 51 | if overwrite != 'yes': 52 | exit(0) 53 | 54 | os.makedirs(args.output_path, exist_ok=True) 55 | 56 | for dp in data_paths: 57 | gz_filename = dp.split('/')[-1] 58 | out_path = os.path.join(args.output_path, gz_filename) 59 | 60 | with gzip.open(dp, 'rb') as f: 61 | data = pickle.load(f) 62 | 63 | out_data = copy.deepcopy(data) 64 | 65 | if args.keep_first_last or args.keep_last or args.keep_last_only: 66 | inds = [] 67 | ends = np.argwhere(np.invert(np.all(data['observations'][1:] == data['next_observations'][:-1], axis=1))) 68 | starts = np.concatenate([[[0]], ends + 1]) 69 | 70 | if args.keep_last_only: 71 | inds = ends 72 | else: 73 | for start, end in zip(starts, ends): 74 | if args.keep_first_last: 75 | inds.append(int(start)) 76 | if end == start: # should only happen if very first index is an end 77 | if args.keep_last: 78 | inds.append(int(end)) 79 | continue 80 | 81 | initial_offset = np.random.randint(args.keep_every_nth) 82 | next_i = start + initial_offset 83 | while next_i < end: 84 | inds.append(int(next_i)) 85 | next_i += args.keep_every_nth 86 | 87 | inds.append(int(end)) 88 | 89 | inds = np.array(inds).squeeze() 90 | 91 | if args.num_to_subsample_from is not None: 92 | inds = inds[inds < args.num_to_subsample_from] 93 | else: 94 | initial_offset = np.random.randint(args.keep_every_nth) 95 | 96 | if args.num_to_subsample_from is None: 97 | max_ind = len(data['observations']) 98 | else: 99 | max_ind = args.num_to_subsample_from 100 | 101 | # this assumes that the buffers are coming in as only being the size that they need to be 102 | inds = np.array(range(initial_offset, max_ind, args.keep_every_nth)) 103 | 104 | if args.num_to_keep is not None: 105 | # assert len(inds) >= args.num_to_keep, f"Not enough timesteps, wanted {args.num_to_keep}, found "\ 106 | # f"{len(inds)} for {gz_filename}." 
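        # Soft check in place of the commented-out assert above: if fewer transitions than
        # requested are available, a warning is printed and all available transitions are kept.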
107 | if len(inds) < args.num_to_keep: 108 | print(f"Not enough timesteps, wanted {args.num_to_keep}, found {len(inds)} for {gz_filename}.") 109 | # print(f"Skipping path {dp}") 110 | inds = inds[:args.num_to_keep] 111 | 112 | if args.num_extra_lasts is not None: 113 | ends = np.argwhere(np.invert(np.all(data['observations'][1:] == data['next_observations'][:-1], axis=1))) 114 | unused_ends = ends[ends > inds[-1]] 115 | if unused_ends.shape[0] < args.num_extra_lasts: 116 | print(f"WARNING: wanted {args.num_extra_lasts} extra lasts, but only found {unused_ends.shape[0]} for {gz_filename}") 117 | inds = np.concatenate([inds, unused_ends[:args.num_extra_lasts]]) 118 | 119 | print(f"Keeping {len(inds)} data for {gz_filename}.") 120 | 121 | for k in SUBSAMPLE_KEYS: 122 | out_data[k] = data[k][inds] 123 | 124 | for ik in data['infos'].keys(): 125 | out_data['infos'][ik] = data['infos'][ik][inds] 126 | 127 | # also need to update size parameters 128 | out_data['pointer'] = 0 129 | out_data['count'] = len(inds) 130 | out_data['memory_size'] = len(inds) 131 | 132 | with gzip.open(out_path, "wb") as f: 133 | pickle.dump(out_data, f) 134 | 135 | print(f"Subsampled data created and saved to {args.output_path}.") -------------------------------------------------------------------------------- /scripts/plotting/multitask_performance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pickle 4 | import os 5 | import glob 6 | 7 | import utils as plot_utils 8 | from data_locations import main_performance as data_locations 9 | import common as plot_common 10 | 11 | 12 | opts = plot_utils.get_default_opts() 13 | #### Options ######################################################################################################## 14 | # fig 15 | opts['font_size'] = 24 16 | opts['data_locations'] = data_locations 17 | opts['aux_names'] = [ 18 | ['Open', 'Close', 'Stack', 'Lift', 'Reach', 'Move'], 19 | ['Open', 'Close', 'Unstack-Stack', 'Lift', 'Reach', 'Move'], 20 | ['Open', 'Close', 'Bring', 'Lift', 'Reach', 'Move'], 21 | ['Open', 'Close', 'Insert', 'Bring', 'Lift', 'Reach', 'Move'] 22 | ] 23 | opts['aux_orders'] = [ 24 | [2, 0, 1, 3, 4, 5], 25 | [2, 0, 1, 3, 4, 5], 26 | [2, 0, 1, 3, 4, 5], 27 | [2, 0, 1, 3, 4, 5, 6], 28 | ] 29 | opts['algo_dir_names'] = ['dac-x', 'multitask_dac', 'multitask_bc'] 30 | opts['algo_titles'] = ['LfGP (multi)', 'LfGP-NS (multi)', 'BC (multi)'] 31 | opts['fig_path'] = opts['root_dir'] + "/figures/multitask_performance" 32 | opts['valid_task'] = [True, True, True, True] 33 | # opts['valid_task'] = [False, True, False, False] 34 | # opts['rl_eval_eps_per_task'] = 10 35 | # opts['bc_eval_eps_per_task'] = 10 36 | # opts['eval_interval'] = 10000 37 | 38 | root_dir, fig_path, experiment_root_dir, seeds, expert_root, expert_perf_files, expert_perf_file_main_task_i = \ 39 | plot_common.get_path_defaults(fig_name="multitask_performance") 40 | 41 | task_dir_names, valid_task, task_titles, main_task_i, num_aux, task_data_filenames, num_eval_steps_to_use = \ 42 | plot_common.get_task_defaults() 43 | 44 | algo_dir_names, algo_titles, multitask_algos, eval_eps_per_task = plot_common.get_algo_defaults() 45 | 46 | fig_shape, plot_size, num_stds, font_size, eval_interval, cmap, linewidth, std_alpha, x_val_scale, subsample_rate, \ 47 | include_expert_baseline = plot_common.get_fig_defaults() 48 | 49 | aux_names = [ 50 | ['Open', 'Close', 'Stack', 'Lift', 'Reach', 'Move'], 51 | ['Open', 'Close', 
'Unstack-Stack', 'Lift', 'Reach', 'Move'], 52 | ['Open', 'Close', 'Bring', 'Lift', 'Reach', 'Move'], 53 | ['Open', 'Close', 'Insert', 'Bring', 'Lift', 'Reach', 'Move'] 54 | ] 55 | aux_orders = [ 56 | [2, 0, 1, 3, 4, 5], 57 | [2, 0, 1, 3, 4, 5], 58 | [2, 0, 1, 3, 4, 5], 59 | [2, 0, 1, 3, 4, 5, 6], 60 | ] 61 | ##################################################################################################################### 62 | 63 | # pretty plotting, allow tex 64 | plt.rcParams.update({"text.usetex": True, "font.family": "serif"}) 65 | plt.rc('text.latex', preamble=r'\usepackage{amsmath}') 66 | 67 | all_successes, all_returns = plot_utils.get_returns_successes("multitask_performance", data_locations) 68 | 69 | # dicts are all_successes['task']['algo']['raw/mean/std'], 70 | # raw shape: (seed, timestep, aux task, eval ep) 71 | # mean and std shape: (timestep, aux_task) 72 | 73 | # fig 1: success rate of each aux while executing own task 74 | # own_task_s_figs = [plt.subplots(nrows=1, ncols=opts['num_aux[task_i]) 75 | nrows = len(task_dir_names) 76 | ncols = max(num_aux) 77 | own_task_s_fig = plt.figure(figsize=[plot_size[0] * ncols, plot_size[1] * nrows]) 78 | own_task_r_fig = plt.figure(figsize=[plot_size[0] * ncols, plot_size[1] * nrows]) 79 | 80 | 81 | for task_i, task in enumerate(task_dir_names): 82 | if not valid_task[task_i]: 83 | print(f"Task {task} set to false in valid_task, skipping in plotting") 84 | continue 85 | 86 | # for aux_i in range(num_aux[task_i]): 87 | for col_i, aux_i in enumerate(aux_orders[task_i]): 88 | # plt_index = task_i * ncols + aux_i + 1 89 | plt_index = task_i * ncols + col_i + 1 90 | for plot_type, fig, data in zip(['s', 'r'], [own_task_s_fig, own_task_r_fig], [all_successes, all_returns]): 91 | ax = fig.add_subplot(nrows, ncols, plt_index) 92 | ax.set_title(aux_names[task_i][aux_i], fontsize=font_size) 93 | 94 | # if aux_i == 0: 95 | if col_i == 0: 96 | ax.set_ylabel(task_titles[task_i], fontsize=font_size) 97 | 98 | for algo_i, algo in enumerate(algo_dir_names): 99 | if algo in multitask_algos or aux_i == 2: 100 | plot_utils.plot_mean_std(ax, aux_i, algo, algo_i, data[task][algo], 101 | algo_label=algo_titles[algo_i] if (task_i == 0 and aux_i == 2) else None) 102 | 103 | # pretty 104 | if plot_type == 's': 105 | ax.set_ylim(-.01, 1.05) 106 | ax.set_yticks([0, .5, 1]) 107 | ax.set_yticks([0,.25, .5, .75, 1], minor=True) 108 | ax.tick_params(labelsize=font_size - 4) 109 | if task == 'insert_0': 110 | ax.set_xlim(0, 4.1) 111 | ax.set_xticks([1, 2, 3, 4]) 112 | ax.set_xticks(np.arange(0, 4, 0.5), minor=True) 113 | ax.grid(which='both', alpha=0.5) 114 | else: 115 | ax.set_xlim(0, 2.1) 116 | ax.set_xticks([0.5, 1, 1.5, 2]) 117 | # ax.set_xticks([0,1,2,3,4], minor=True) 118 | ax.grid(which='both', alpha=0.5) 119 | 120 | 121 | for fig, fig_name in zip([own_task_s_fig, own_task_r_fig], ['s_fig.pdf', 'r_fig.pdf']): 122 | fig.tight_layout() 123 | fig.legend(fancybox=True, shadow=True, fontsize=font_size, loc="right", 124 | bbox_to_anchor=(0.98, 0.5)) 125 | # ncol=len(algo_dir_names) + 1, bbox_to_anchor=(0.5, -0.31)) 126 | 127 | ax = fig.add_subplot(111, frameon=False) 128 | ax.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False) 129 | ax.set_xlabel("Updates/steps (millions)", fontsize=font_size + 4, labelpad=10) 130 | 131 | if 's_fig' in fig_name: 132 | ax.set_ylabel("Success Rate", fontsize=font_size + 4, labelpad=32) 133 | else: 134 | ax.set_ylabel("Episode Return", fontsize=font_size + 4, labelpad=30) 135 | 136 | 
os.makedirs(fig_path, exist_ok=True) 137 | fig.savefig(os.path.join(fig_path, fig_name), bbox_inches='tight') 138 | 139 | # fig 2: success rate of each aux while running the main task 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/train_dac_sac.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 4 | import torch 5 | 6 | import rl_sandbox.constants as c 7 | 8 | from rl_sandbox.algorithms.dac.sac import SACDAC 9 | from rl_sandbox.algorithms.dac.dac import DAC 10 | from rl_sandbox.auxiliary_tasks.utils import make_auxiliary_tasks 11 | from rl_sandbox.buffers.utils import make_buffer 12 | from rl_sandbox.envs.utils import make_env 13 | from rl_sandbox.learning_utils import train 14 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 15 | from rl_sandbox.agents.rl_agents import ACAgent, ACAgentEUniformExplorer 16 | from rl_sandbox.transforms.general_transforms import Identity 17 | from rl_sandbox.utils import make_summary_writer, set_seed, set_rng_state, check_load_latest_checkpoint, check_load_as_jumpoff_point 18 | from rl_sandbox.envs.wrappers.frame_stack import FrameStackWrapper 19 | 20 | def train_dac_sac(experiment_config): 21 | seed = experiment_config[c.SEED] 22 | save_path = experiment_config.get(c.SAVE_PATH, None) 23 | buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity()) 24 | 25 | save_path, add_time_tag_to_save_path = check_load_latest_checkpoint(experiment_config, save_path) 26 | save_path, add_time_tag_to_save_path = check_load_as_jumpoff_point(experiment_config, save_path, add_time_tag_to_save_path) 27 | buffer_end_idx = None 28 | if experiment_config.get(c.LOAD_BUFFER_START_INDEX, -1) >= 0: 29 | buffer_end_idx = experiment_config[c.LOAD_BUFFER_START_INDEX] 30 | 31 | set_seed(seed) 32 | train_env = make_env(experiment_config[c.ENV_SETTING], seed) 33 | model = make_model(experiment_config[c.MODEL_SETTING]) 34 | buffer = make_buffer(experiment_config[c.BUFFER_SETTING], seed, experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False), 35 | end_idx=buffer_end_idx) 36 | 37 | policy_opt = make_optimizer(model.policy_parameters, experiment_config[c.OPTIMIZER_SETTING][c.POLICY]) 38 | qs_opt = make_optimizer(model.qs_parameters, experiment_config[c.OPTIMIZER_SETTING][c.QS]) 39 | alpha_opt = make_optimizer([model.log_alpha], experiment_config[c.OPTIMIZER_SETTING][c.ALPHA]) 40 | 41 | aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS], 42 | model, 43 | buffer, 44 | experiment_config) 45 | 46 | frame_stack = 1 47 | for wrap_dict in experiment_config[c.ENV_SETTING][c.ENV_WRAPPERS]: 48 | if wrap_dict[c.WRAPPER] == FrameStackWrapper: 49 | frame_stack = wrap_dict[c.KWARGS][c.NUM_FRAMES] 50 | 51 | # handle old code without expert amount option 52 | expert_amount = experiment_config.get(c.EXPERT_AMOUNT, None) 53 | expert_buffer_settings = experiment_config.get(c.EXPERT_BUFFER_SETTING, experiment_config[c.BUFFER_SETTING]) 54 | 55 | expert_buffer = make_buffer(expert_buffer_settings, seed, experiment_config[c.EXPERT_BUFFER], 56 | end_idx=expert_amount, match_load_size=True, frame_stack_load=frame_stack) 57 | 58 | if c.FT_EXPERT_BUFFER in experiment_config: 59 | ft_expert_buffer = make_buffer(expert_buffer_settings, seed, experiment_config[c.FT_EXPERT_BUFFER], 60 | end_idx=expert_amount, match_load_size=True, frame_stack_load=frame_stack) 61 | 
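        # If a fine-tuning expert buffer is configured, fold it into the main expert
        # buffer so that the discriminator and SAC-DAC updates below sample from the
        # combined expert data.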
expert_buffer.merge(ft_expert_buffer) 62 | 63 | learning_algorithm = SACDAC(model=model, 64 | policy_opt=policy_opt, 65 | qs_opt=qs_opt, 66 | alpha_opt=alpha_opt, 67 | learn_alpha=experiment_config[c.LEARN_ALPHA], 68 | buffer=buffer, 69 | algo_params=experiment_config, 70 | aux_tasks=aux_tasks, 71 | expert_buffer=expert_buffer) 72 | 73 | discriminator = make_model(experiment_config[c.DISCRIMINATOR_SETTING]) 74 | discriminator_opt = make_optimizer(discriminator.parameters(), experiment_config[c.OPTIMIZER_SETTING][c.DISCRIMINATOR]) 75 | dac = DAC(discriminator=discriminator, 76 | discriminator_opt=discriminator_opt, 77 | expert_buffer=expert_buffer, 78 | learning_algorithm=learning_algorithm, 79 | algo_params=experiment_config) 80 | 81 | load_model = experiment_config.get(c.LOAD_MODEL, False) 82 | if load_model: 83 | state_dict = torch.load(load_model, map_location=experiment_config[c.DEVICE]) 84 | dac.load_state_dict(state_dict) 85 | set_rng_state(state_dict[c.TORCH_RNG_STATE], state_dict[c.NP_RNG_STATE]) 86 | 87 | # TODO add this as a proper option 88 | # agent = ACAgentEUniformExplorer(model=model, learning_algorithm=dac, 89 | # prob_explore_ep=.2, prob_explore_act=.05, max_repeat=41, min_repeat=40) 90 | 91 | agent = ACAgent(model=model, 92 | learning_algorithm=dac, 93 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 94 | 95 | # overwrites the save path with a time tag 96 | summary_writer, save_path = make_summary_writer(save_path=save_path, 97 | algo=c.DAC, 98 | cfg=experiment_config, 99 | add_time_tag=add_time_tag_to_save_path) 100 | evaluation_env = None 101 | evaluation_agent = None 102 | if experiment_config.get(c.EVALUATION_FREQUENCY, 0): 103 | if experiment_config[c.ENV_SETTING][c.ENV_TYPE] == c.PANDA_RL_ENVS: 104 | evaluation_env = train_env 105 | else: 106 | evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1) 107 | evaluation_agent = ACAgent(model=model, 108 | learning_algorithm=None, 109 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 110 | 111 | train(agent=agent, 112 | evaluation_agent=evaluation_agent, 113 | train_env=train_env, 114 | evaluation_env=evaluation_env, 115 | buffer_preprocess=buffer_preprocessing, 116 | experiment_settings=experiment_config, 117 | auxiliary_reward=experiment_config[c.EVALUATION_REWARD_FUNC], 118 | summary_writer=summary_writer, 119 | save_path=save_path) 120 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/discriminators/fully_connected_discriminators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from torch.distributions import Normal 5 | 6 | from rl_sandbox.constants import CPU 7 | from rl_sandbox.model_architectures.shared import Flatten 8 | from rl_sandbox.model_architectures.utils import construct_linear_layers, RunningMeanStd 9 | 10 | 11 | class ActionConditionedFullyConnectedDiscriminator(nn.Module): 12 | def __init__(self, 13 | obs_dim, 14 | action_dim, 15 | output_dim, 16 | shared_layers=None, 17 | device=torch.device(CPU), 18 | obs_only=False, 19 | branched_outputs=False, 20 | activation=nn.Tanh(), 21 | layers=None): 22 | super().__init__() 23 | self.device = device 24 | 25 | self._obs_dim = obs_dim 26 | self._action_dim = action_dim 27 | self._output_dim = output_dim 28 | self._obs_only = obs_only 29 | 30 | self.branched_outputs = branched_outputs 31 | 32 | self._flatten = Flatten() 33 | 34 | # backwards compatibility 35 | if 
shared_layers is None and layers is not None: 36 | shared_layers = layers 37 | 38 | if shared_layers is not None: 39 | # backwards compatible with old code for layer setup, but now we can customize activation function 40 | new_shared_layers = [] 41 | for l in shared_layers: 42 | new_layer = [] 43 | new_layer.extend(l[:2]) 44 | new_layer.append(activation) 45 | new_layer.extend(l[3:]) 46 | new_shared_layers.append(new_layer) 47 | shared_layers = tuple(new_shared_layers) 48 | 49 | if shared_layers is None: 50 | assert branched_outputs, "Can't have no shared layers in multitask discriminator without branched outputs on." 51 | 52 | self.fc_layers = nn.ModuleList([nn.Identity()]) 53 | num_inputs = obs_dim + obs_only * action_dim 54 | self.output = nn.Sequential( 55 | nn.Conv1d(num_inputs * output_dim, 256 * output_dim, kernel_size=1, groups=output_dim), activation, 56 | nn.Conv1d(256 * output_dim, 256 * output_dim, kernel_size=1, groups=output_dim), activation, 57 | nn.Conv1d(256 * output_dim, output_dim, kernel_size=1, groups=output_dim) 58 | ) 59 | 60 | else: 61 | self.fc_layers = construct_linear_layers(shared_layers) 62 | if self.branched_outputs: 63 | self.output = nn.Sequential( 64 | nn.Conv1d(256 * output_dim, 256 * output_dim, kernel_size=1, groups=output_dim), activation, 65 | nn.Conv1d(256 * output_dim, output_dim, kernel_size=1, groups=output_dim) 66 | ) 67 | else: 68 | self.output = nn.Linear(shared_layers[-1][1], output_dim) 69 | 70 | self.to(device) 71 | 72 | def forward(self, obss, acts): 73 | batch_size = obss.shape[0] 74 | 75 | obss = obss.reshape(batch_size, -1) 76 | if self._obs_only: 77 | x = obss 78 | else: 79 | x = torch.cat((obss, acts), dim=-1) 80 | 81 | x = self._flatten(x) 82 | 83 | x = x.to(self.device) 84 | for layer in self.fc_layers: 85 | x = layer(x) 86 | 87 | if self.branched_outputs: 88 | x = x.repeat(1, self._output_dim).unsqueeze(-1) 89 | logits = self.output(x).squeeze(-1) 90 | else: 91 | logits = self.output(x) 92 | 93 | return logits 94 | 95 | 96 | class ActionConditionedFullyConnectedDiscriminatorPlusRewards(nn.Module): 97 | def __init__(self, obs_dim, action_dim, output_dim, handcraft_rewards, layers, device=torch.device(CPU)): 98 | """ 99 | handcraft_rewards is a dict containing the indices and corresponding reward functions that should be output 100 | in place of NN outputs. 
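        For example (hypothetical names, not taken from this repository): passing
        handcraft_rewards={2: reach_reward_fn} keeps every output except index 2 as a
        learned logit and instead fills index 2 with reach_reward_fn(None, acts, obss,
        torch_multi=True), rescaled in forward() to match the running mean/std of the
        learned logits.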
101 | """ 102 | super().__init__() 103 | self.device = device 104 | 105 | self._obs_dim = obs_dim 106 | self._action_dim = action_dim 107 | self._output_dim = output_dim 108 | self._handcraft_rewards = handcraft_rewards 109 | self._true_output_dim = output_dim + len(self._handcraft_rewards) 110 | 111 | # need these so that handcrafted reward magnitudes can be rescaled to match average of learned reward magnitudes 112 | self._trainable_logits_rms = RunningMeanStd(shape=(self._output_dim,), norm_dim=(0,)) 113 | self._trainable_logits_rms.to(device) 114 | self._handcraft_rewards_rmss = [RunningMeanStd(shape=(1,), norm_dim=(0,))] * len(self._handcraft_rewards) 115 | for rms in self._handcraft_rewards_rmss: 116 | rms.to(device) 117 | 118 | # get trainable indices of true output 119 | handcraft_indices = [index for index in self._handcraft_rewards.keys()] 120 | self._trainable_indices = sorted(list(set(range(self._true_output_dim)) ^ set(handcraft_indices))) 121 | 122 | self._flatten = Flatten() 123 | self.fc_layers = construct_linear_layers(layers) 124 | 125 | self.output = nn.Linear(layers[-1][1], output_dim) 126 | 127 | self.to(device) 128 | 129 | def forward(self, obss, acts): 130 | batch_size = obss.shape[0] 131 | 132 | obss = obss.reshape(batch_size, -1) 133 | x = torch.cat((obss, acts), dim=-1) 134 | x = self._flatten(x) 135 | 136 | x = x.to(self.device) 137 | for layer in self.fc_layers: 138 | x = layer(x) 139 | 140 | logits = self.output(x) 141 | 142 | self._trainable_logits_rms.update(logits.detach()) 143 | trainable_mean = self._trainable_logits_rms.mean.mean() 144 | trainable_var = self._trainable_logits_rms.var.mean() 145 | 146 | full_out = torch.zeros([batch_size, self._true_output_dim]).to(self.device) 147 | full_out[:, self._trainable_indices] = logits 148 | 149 | obss = obss.to(self.device) 150 | acts = acts.to(self.device) 151 | for list_index, (index, func) in enumerate(self._handcraft_rewards.items()): 152 | rms = self._handcraft_rewards_rmss[list_index] 153 | rews = func(None, acts, obss, torch_multi=True) 154 | rms.update(rews) 155 | rews_normalized = rms.normalize(rews) 156 | rews_scale_matched = rews_normalized * torch.sqrt(trainable_var + self._trainable_logits_rms.epsilon) + \ 157 | trainable_mean 158 | full_out[:, index] = rews_scale_matched 159 | 160 | return full_out -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/algo/gail.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.utils.data 7 | from torch import autograd 8 | 9 | from stable_baselines3.common.running_mean_std import RunningMeanStd 10 | 11 | class Discriminator(nn.Module): 12 | def __init__(self, input_dim, hidden_dim, device): 13 | super(Discriminator, self).__init__() 14 | 15 | self.device = device 16 | 17 | self.trunk = nn.Sequential( 18 | nn.Linear(input_dim, hidden_dim), nn.Tanh(), 19 | nn.Linear(hidden_dim, hidden_dim), nn.Tanh(), 20 | nn.Linear(hidden_dim, 1)).to(device) 21 | 22 | self.trunk.train() 23 | 24 | self.optimizer = torch.optim.Adam(self.trunk.parameters()) 25 | 26 | self.returns = None 27 | self.ret_rms = RunningMeanStd(shape=()) 28 | 29 | def compute_grad_pen(self, 30 | expert_state, 31 | expert_action, 32 | policy_state, 33 | policy_action, 34 | lambda_=10): 35 | alpha = torch.rand(expert_state.size(0), 1) 36 | expert_data = torch.cat([expert_state, 
expert_action], dim=1) 37 | policy_data = torch.cat([policy_state, policy_action], dim=1) 38 | 39 | alpha = alpha.expand_as(expert_data).to(expert_data.device) 40 | 41 | mixup_data = alpha * expert_data + (1 - alpha) * policy_data 42 | mixup_data.requires_grad = True 43 | 44 | disc = self.trunk(mixup_data) 45 | ones = torch.ones(disc.size()).to(disc.device) 46 | grad = autograd.grad( 47 | outputs=disc, 48 | inputs=mixup_data, 49 | grad_outputs=ones, 50 | create_graph=True, 51 | retain_graph=True, 52 | only_inputs=True)[0] 53 | 54 | grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean() 55 | return grad_pen 56 | 57 | def update(self, expert_loader, rollouts, obsfilt=None): 58 | self.train() 59 | 60 | policy_data_generator = rollouts.feed_forward_generator( 61 | None, mini_batch_size=expert_loader.batch_size) 62 | 63 | loss = 0 64 | n = 0 65 | for expert_batch, policy_batch in zip(expert_loader, 66 | policy_data_generator): 67 | policy_state, policy_action = policy_batch[0], policy_batch[2] 68 | policy_d = self.trunk( 69 | torch.cat([policy_state, policy_action], dim=1)) 70 | 71 | expert_state, expert_action = expert_batch 72 | # expert_state = obsfilt(expert_state.numpy(), update=False) 73 | expert_state = torch.FloatTensor(expert_state).to(self.device) 74 | expert_action = expert_action.to(self.device) 75 | # print(expert_state.shape, expert_action.shape) 76 | expert_d = self.trunk( 77 | torch.cat([expert_state, expert_action], dim=1)) 78 | 79 | expert_loss = F.binary_cross_entropy_with_logits( 80 | expert_d, 81 | torch.ones(expert_d.size()).to(self.device)) 82 | policy_loss = F.binary_cross_entropy_with_logits( 83 | policy_d, 84 | torch.zeros(policy_d.size()).to(self.device)) 85 | 86 | gail_loss = expert_loss + policy_loss 87 | grad_pen = self.compute_grad_pen(expert_state, expert_action, 88 | policy_state, policy_action) 89 | 90 | loss += (gail_loss + grad_pen).item() 91 | n += 1 92 | 93 | self.optimizer.zero_grad() 94 | (gail_loss + grad_pen).backward() 95 | self.optimizer.step() 96 | return loss / n 97 | 98 | def predict_reward(self, state, action, gamma, masks, update_rms=True): 99 | with torch.no_grad(): 100 | self.eval() 101 | d = self.trunk(torch.cat([state, action], dim=1)) 102 | s = torch.sigmoid(d) 103 | # reward = s.log() - (1 - s).log() 104 | reward = - (1 - s).log() 105 | if self.returns is None: 106 | self.returns = reward.clone() 107 | 108 | if update_rms: 109 | self.returns = self.returns * masks * gamma + reward 110 | self.ret_rms.update(self.returns.cpu().numpy()) 111 | 112 | return reward / np.sqrt(self.ret_rms.var[0] + 1e-8) 113 | 114 | 115 | class ExpertDataset(torch.utils.data.Dataset): 116 | def __init__(self, file_name, num_trajectories=4, subsample_frequency=20): 117 | all_trajectories = torch.load(file_name) 118 | 119 | perm = torch.randperm(all_trajectories['states'].size(0)) 120 | idx = perm[:num_trajectories] 121 | 122 | self.trajectories = {} 123 | 124 | # See https://github.com/pytorch/pytorch/issues/14886 125 | # .long() for fixing bug in torch v0.4.1 126 | start_idx = torch.randint( 127 | 0, subsample_frequency, size=(num_trajectories, )).long() 128 | 129 | for k, v in all_trajectories.items(): 130 | data = v[idx] 131 | 132 | if k != 'lengths': 133 | samples = [] 134 | for i in range(num_trajectories): 135 | samples.append(data[i, start_idx[i]::subsample_frequency]) 136 | self.trajectories[k] = torch.stack(samples) 137 | else: 138 | self.trajectories[k] = data // subsample_frequency 139 | 140 | self.i2traj_idx = {} 141 | self.i2i = {} 142 | 143 | 
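        # Build a flat index over all kept (state, action) pairs: length is the total
        # number of subsampled steps across trajectories, and get_idx maps a flat
        # dataset index to a (trajectory index, step index) pair used by __getitem__.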
self.length = self.trajectories['lengths'].sum().item() 144 | 145 | traj_idx = 0 146 | i = 0 147 | 148 | self.get_idx = [] 149 | 150 | for j in range(self.length): 151 | 152 | while self.trajectories['lengths'][traj_idx].item() <= i: 153 | i -= self.trajectories['lengths'][traj_idx].item() 154 | traj_idx += 1 155 | 156 | self.get_idx.append((traj_idx, i)) 157 | 158 | i += 1 159 | 160 | 161 | def __len__(self): 162 | return self.length 163 | 164 | def __getitem__(self, i): 165 | traj_idx, i = self.get_idx[i] 166 | 167 | return self.trajectories['states'][traj_idx][i], self.trajectories[ 168 | 'actions'][traj_idx][i] 169 | 170 | 171 | class LfGPExpertDataset(torch.utils.data.Dataset): 172 | def __init__(self, file_name): 173 | self.trajectories = torch.load(file_name) 174 | self.length = len(self.trajectories["states"]) 175 | 176 | def __len__(self): 177 | return self.length 178 | 179 | def __getitem__(self, i): 180 | return self.trajectories['states'][i], self.trajectories[ 181 | 'actions'][i] 182 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac_x/schedulers_update/q_scheduler.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | import torch 3 | import numpy as np 4 | 5 | import rl_sandbox.constants as c 6 | from rl_sandbox.envs.utils import absorbing_check 7 | 8 | 9 | class UpdateQScheduler: 10 | def __init__(self, model, algo_params): 11 | self.model = model 12 | self._num_tasks = algo_params.get(c.NUM_TASKS, 1) 13 | self._action_dim = algo_params[c.ACTION_DIM] 14 | 15 | self._scheduler_period = algo_params[c.SCHEDULER_SETTING][c.TRAIN][c.SCHEDULER_PERIOD] 16 | self._scheduler_tau = algo_params[c.SCHEDULER_TAU] 17 | self.main_intention = algo_params.get(c.MAIN_INTENTION, 0) 18 | 19 | self._gamma = algo_params[c.GAMMA] 20 | self._rewards = [] 21 | self._discounting = [] 22 | 23 | def state_dict(self): 24 | return self.model.state_dict 25 | 26 | def load_state_dict(self, state_dict): 27 | self.model.load_state_dict(state_dict) 28 | 29 | def _compute_returns(self): 30 | episode_length = len(self._rewards) 31 | returns = torch.zeros(episode_length + 1) 32 | for step in range(episode_length - 1, -1, -1): 33 | returns[step] = self._rewards[step] + \ 34 | (self._gamma ** self._discounting[step]) * returns[step + 1] 35 | 36 | # Only take the returns for every scheduler's action 37 | return returns[:-1][::self._scheduler_period] 38 | 39 | def update_scheduler(self, obs, act, update_info): 40 | traj = obs + [act.item()] 41 | q_first_action = self.model.compute_qs([]) 42 | 43 | print(f"Scheduler Trajectory: {traj} - Q([], a), for all a: {q_first_action}") 44 | 45 | update_info[c.SCHEDULER_TRAJ] = traj 46 | update_info[c.SCHEDULER_TRAJ_VALUE] = np.array(q_first_action) 47 | 48 | if self.model.LEARNED: 49 | tic = timeit.default_timer() 50 | # update_info[c.Q_UPDATE_TIME] = [] # breaks print epoch summary if you use multiple gradient steps!!! 
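            # Tabular scheduler update: compute a Monte Carlo return for each scheduler
            # decision point of the episode, then move Q(prefix, action) toward that
            # return with an exponential moving average,
            #   Q_new = (1 - scheduler_tau) * Q_old + scheduler_tau * G_t,
            # where the prefix traj[:step] is the sequence of intentions chosen so far.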
51 | rets = self._compute_returns() 52 | for step in range(len(traj)): 53 | old_q_value = self.model.compute_qsa(traj[:step], traj[step]) 54 | new_q_value = old_q_value * (1 - self._scheduler_tau) + rets[step] * self._scheduler_tau 55 | self.model.update_qsa(traj[:step], traj[step], new_q_value) 56 | # update_info[c.Q_UPDATE_TIME].append(timeit.default_timer() - tic) 57 | 58 | def update(self, obs, act, reward, done, info): 59 | self._rewards.append(reward[self.main_intention].item()) 60 | self._discounting.append(info[c.DISCOUNTING][0].item()) 61 | 62 | update_info = dict() 63 | if done: 64 | obs = info[c.HIGH_LEVEL_OBSERVATION] 65 | act = info[c.HIGH_LEVEL_ACTION] 66 | self.update_scheduler(obs, act, update_info) 67 | self._rewards.clear() 68 | self._discounting.clear() 69 | return True, update_info 70 | return False, update_info 71 | 72 | 73 | class UpdateDACQScheduler(UpdateQScheduler): 74 | def __init__(self, model, reward_function, algo_params): 75 | super().__init__(model=model, 76 | algo_params=algo_params) 77 | self.reward_function = reward_function 78 | self.max_ep_length = algo_params[c.MAX_EPISODE_LENGTH] 79 | self.curr_timestep = 0 80 | self.obss = [] 81 | self.acts = [] 82 | self.device = algo_params[c.DEVICE] 83 | self.train_preprocessing = algo_params[c.TRAIN_PREPROCESSING] 84 | self.main_intention = algo_params[c.MAIN_INTENTION] 85 | self.use_absorbing = absorbing_check(algo_params) 86 | 87 | def _compute_returns(self): 88 | obss = self.train_preprocessing(torch.as_tensor(np.array(self.obss)).squeeze(1).float()).to(self.device) 89 | acts = torch.as_tensor(np.array(self.acts)).float().to(self.device) 90 | 91 | with torch.no_grad(): 92 | rews = self.reward_function(obss, acts).detach() 93 | episode_length = len(rews) 94 | returns = torch.zeros(episode_length + 1) 95 | for step in range(episode_length - 1, -1, -1): 96 | returns[step] = rews[step, self.main_intention].cpu() + \ 97 | (self._gamma ** self._discounting[step]) * returns[step + 1] 98 | 99 | self.obss = [] 100 | self.acts = [] 101 | 102 | # Only take the returns for every scheduler's action 103 | return returns[:-1][::self._scheduler_period] 104 | 105 | def update(self, obs, act, rew, done, info): 106 | self.curr_timestep += 1 107 | self.obss.append(obs) 108 | self.acts.append(act) 109 | if (self.use_absorbing and obs[:, -1] == 1) or self.curr_timestep == self.max_ep_length: 110 | act[:] = 0 111 | done = True 112 | self.curr_timestep = 0 113 | 114 | return super().update(obs, act, rew, done, info) 115 | 116 | def reset(self): 117 | # Call on env reset 118 | self.obss = [] 119 | self.acts = [] 120 | self.curr_timestep = 0 121 | 122 | 123 | class UpdateDACQSchedulerPlusHandcraft(UpdateQScheduler): 124 | def __init__(self, model, reward_function, algo_params): 125 | super().__init__(model=model, 126 | algo_params=algo_params) 127 | self.reward_function = reward_function 128 | self.max_ep_length = algo_params[c.MAX_EPISODE_LENGTH] 129 | self.curr_timestep = 0 130 | self.obss = [] 131 | self.acts = [] 132 | self.device = algo_params[c.DEVICE] 133 | self.train_preprocessing = algo_params[c.TRAIN_PREPROCESSING] 134 | self.main_intention = algo_params[c.HANDCRAFT_TASKS]['main_task'][0] 135 | self.use_absorbing = absorbing_check(algo_params) 136 | 137 | def _compute_returns(self): 138 | obss = self.train_preprocessing(torch.as_tensor(self.obss).squeeze(1).float()).to(self.device) 139 | acts = torch.as_tensor(self.acts).float().to(self.device) 140 | 141 | with torch.no_grad(): 142 | rews = self.reward_function(obss, 
acts).detach() 143 | 144 | episode_length = len(rews) 145 | returns = torch.zeros(episode_length + 1) 146 | 147 | for step in range(episode_length - 1, -1, -1): 148 | returns[step] = rews[step, self.main_intention].cpu() + \ 149 | (self._gamma ** self._discounting[step]) * returns[step + 1] 150 | 151 | self.obss = [] 152 | self.acts = [] 153 | 154 | # Only take the returns for every scheduler's action 155 | return returns[:-1][::self._scheduler_period] 156 | 157 | def update(self, obs, act, rew, done, info): 158 | self.curr_timestep += 1 159 | self.obss.append(obs) 160 | self.acts.append(act) 161 | if (self.use_absorbing and obs[:, -1] == 1) or self.curr_timestep == self.max_ep_length: 162 | act[:] = 0 163 | done = True 164 | self.curr_timestep = 0 165 | 166 | return super().update(obs, act, rew, done, info) 167 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/README.md: -------------------------------------------------------------------------------- 1 | # pytorch-a2c-ppo-acktr 2 | 3 | ## Note from LfGP Authors 4 | This repository is originally hosted here https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail. We reused the code for our GAIL results. See the main LfGP README for reproducing GAIL results. 5 | 6 | ## Update (April 12th, 2021) 7 | 8 | PPO is great, but [Soft Actor Critic](https://arxiv.org/abs/1812.05905) can be better for many continuous control tasks. Please check out [my new RL](http://github.com/ikostrikov/jax-rl) repository in jax. 9 | 10 | ## Please use hyper parameters from this readme. With other hyper parameters things might not work (it's RL after all)! 11 | 12 | This is a PyTorch implementation of 13 | * Advantage Actor Critic (A2C), a synchronous deterministic version of [A3C](https://arxiv.org/pdf/1602.01783v1.pdf) 14 | * Proximal Policy Optimization [PPO](https://arxiv.org/pdf/1707.06347.pdf) 15 | * Scalable trust-region method for deep reinforcement learning using Kronecker-factored approximation [ACKTR](https://arxiv.org/abs/1708.05144) 16 | * Generative Adversarial Imitation Learning [GAIL](https://arxiv.org/abs/1606.03476) 17 | 18 | Also see the OpenAI posts: [A2C/ACKTR](https://blog.openai.com/baselines-acktr-a2c/) and [PPO](https://blog.openai.com/openai-baselines-ppo/) for more information. 19 | 20 | This implementation is inspired by the OpenAI baselines for [A2C](https://github.com/openai/baselines/tree/master/baselines/a2c), [ACKTR](https://github.com/openai/baselines/tree/master/baselines/acktr) and [PPO](https://github.com/openai/baselines/tree/master/baselines/ppo1). It uses the same hyper parameters and the model since they were well tuned for Atari games. 
21 | 22 | Please use this bibtex if you want to cite this repository in your publications: 23 | 24 | @misc{pytorchrl, 25 | author = {Kostrikov, Ilya}, 26 | title = {PyTorch Implementations of Reinforcement Learning Algorithms}, 27 | year = {2018}, 28 | publisher = {GitHub}, 29 | journal = {GitHub repository}, 30 | howpublished = {\url{https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail}}, 31 | } 32 | 33 | ## Supported (and tested) environments (via [OpenAI Gym](https://gym.openai.com)) 34 | * [Atari Learning Environment](https://github.com/mgbellemare/Arcade-Learning-Environment) 35 | * [MuJoCo](http://mujoco.org) 36 | * [PyBullet](http://pybullet.org) (including Racecar, Minitaur and Kuka) 37 | * [DeepMind Control Suite](https://github.com/deepmind/dm_control) (via [dm_control2gym](https://github.com/martinseilair/dm_control2gym)) 38 | 39 | I highly recommend PyBullet as a free open source alternative to MuJoCo for continuous control tasks. 40 | 41 | All environments are operated using exactly the same Gym interface. See their documentations for a comprehensive list. 42 | 43 | To use the DeepMind Control Suite environments, set the flag `--env-name dm..`, where `domain_name` and `task_name` are the name of a domain (e.g. `hopper`) and a task within that domain (e.g. `stand`) from the DeepMind Control Suite. Refer to their repo and their [tech report](https://arxiv.org/abs/1801.00690) for a full list of available domains and tasks. Other than setting the task, the API for interacting with the environment is exactly the same as for all the Gym environments thanks to [dm_control2gym](https://github.com/martinseilair/dm_control2gym). 44 | 45 | ## Requirements 46 | 47 | * Python 3 (it might work with Python 2, but I didn't test it) 48 | * [PyTorch](http://pytorch.org/) 49 | * [Stable baselines3](https://github.com/DLR-RM/stable-baselines3) 50 | 51 | In order to install requirements, follow: 52 | 53 | ```bash 54 | # PyTorch 55 | conda install pytorch torchvision -c soumith 56 | 57 | # Other requirements 58 | pip install -r requirements.txt 59 | ``` 60 | 61 | ## Contributions 62 | 63 | Contributions are very welcome. If you know how to make this code better, please open an issue. If you want to submit a pull request, please open an issue first. Also see a todo list below. 64 | 65 | Also I'm searching for volunteers to run all experiments on Atari and MuJoCo (with multiple random seeds). 66 | 67 | ## Disclaimer 68 | 69 | It's extremely difficult to reproduce results for Reinforcement Learning methods. See ["Deep Reinforcement Learning that Matters"](https://arxiv.org/abs/1709.06560) for more information. I tried to reproduce OpenAI results as closely as possible. However, majors differences in performance can be caused even by minor differences in TensorFlow and PyTorch libraries. 70 | 71 | ### TODO 72 | * Improve this README file. Rearrange images. 73 | * Improve performance of KFAC, see kfac.py for more information 74 | * Run evaluation for all games and algorithms 75 | 76 | ## Visualization 77 | 78 | In order to visualize the results use ```visualize.ipynb```. 
79 | 80 | 81 | ## Training 82 | 83 | ### Atari 84 | #### A2C 85 | 86 | ```bash 87 | python main.py --env-name "PongNoFrameskip-v4" 88 | ``` 89 | 90 | #### PPO 91 | 92 | ```bash 93 | python main.py --env-name "PongNoFrameskip-v4" --algo ppo --use-gae --lr 2.5e-4 --clip-param 0.1 --value-loss-coef 0.5 --num-processes 8 --num-steps 128 --num-mini-batch 4 --log-interval 1 --use-linear-lr-decay --entropy-coef 0.01 94 | ``` 95 | 96 | #### ACKTR 97 | 98 | ```bash 99 | python main.py --env-name "PongNoFrameskip-v4" --algo acktr --num-processes 32 --num-steps 20 100 | ``` 101 | 102 | ### MuJoCo 103 | 104 | Please always try to use ```--use-proper-time-limits``` flag. It properly handles partial trajectories (see https://github.com/sfujim/TD3/blob/master/main.py#L123). 105 | 106 | #### A2C 107 | 108 | ```bash 109 | python main.py --env-name "Reacher-v2" --num-env-steps 1000000 110 | ``` 111 | 112 | #### PPO 113 | 114 | ```bash 115 | python main.py --env-name "Reacher-v2" --algo ppo --use-gae --log-interval 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --value-loss-coef 0.5 --ppo-epoch 10 --num-mini-batch 32 --gamma 0.99 --gae-lambda 0.95 --num-env-steps 1000000 --use-linear-lr-decay --use-proper-time-limits 116 | ``` 117 | 118 | #### ACKTR 119 | 120 | ACKTR requires some modifications to be made specifically for MuJoCo. But at the moment, I want to keep this code as unified as possible. Thus, I'm going for better ways to integrate it into the codebase. 121 | 122 | ## Enjoy 123 | 124 | ### Atari 125 | 126 | ```bash 127 | python enjoy.py --load-dir trained_models/a2c --env-name "PongNoFrameskip-v4" 128 | ``` 129 | 130 | ### MuJoCo 131 | 132 | ```bash 133 | python enjoy.py --load-dir trained_models/ppo --env-name "Reacher-v2" 134 | ``` 135 | 136 | ## Results 137 | 138 | ### A2C 139 | 140 | ![BreakoutNoFrameskip-v4](imgs/a2c_breakout.png) 141 | 142 | ![SeaquestNoFrameskip-v4](imgs/a2c_seaquest.png) 143 | 144 | ![QbertNoFrameskip-v4](imgs/a2c_qbert.png) 145 | 146 | ![beamriderNoFrameskip-v4](imgs/a2c_beamrider.png) 147 | 148 | ### PPO 149 | 150 | 151 | ![BreakoutNoFrameskip-v4](imgs/ppo_halfcheetah.png) 152 | 153 | ![SeaquestNoFrameskip-v4](imgs/ppo_hopper.png) 154 | 155 | ![QbertNoFrameskip-v4](imgs/ppo_reacher.png) 156 | 157 | ![beamriderNoFrameskip-v4](imgs/ppo_walker.png) 158 | 159 | 160 | ### ACKTR 161 | 162 | ![BreakoutNoFrameskip-v4](imgs/acktr_breakout.png) 163 | 164 | ![SeaquestNoFrameskip-v4](imgs/acktr_seaquest.png) 165 | 166 | ![QbertNoFrameskip-v4](imgs/acktr_qbert.png) 167 | 168 | ![beamriderNoFrameskip-v4](imgs/acktr_beamrider.png) 169 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/utils.py: -------------------------------------------------------------------------------- 1 | import _pickle as pickle 2 | import json 3 | import numpy as np 4 | import os 5 | import timeit 6 | import torch 7 | import glob 8 | 9 | from datetime import datetime 10 | from torch.utils.tensorboard import SummaryWriter 11 | 12 | import rl_sandbox.constants as c 13 | 14 | 15 | def check_load_latest_checkpoint(experiment_config, save_path): 16 | if experiment_config[c.LOAD_LATEST_CHECKPOINT]: 17 | paths = glob.glob(os.path.join(save_path, '*')) 18 | if len(paths) == 0: 19 | print(f"Warning: load_latest_checkpoint set with no existing experiments at {save_path}, starting new experiment.") 20 | add_time_tag_to_save_path = True 21 | experiment_config[c.LOAD_LATEST_CHECKPOINT] = False 22 | else: 23 | latest_path = sorted(paths)[-1] 24 | if not 
os.path.isfile(os.path.join(latest_path, f"{experiment_config[c.CHECKPOINT_NAME]}_buffer.pkl")): 25 | print(f"Warning: load_latest_checkpoint set with no existing experiments at {save_path}, starting new experiment.") 26 | add_time_tag_to_save_path = True 27 | experiment_config[c.LOAD_LATEST_CHECKPOINT] = False 28 | else: 29 | save_path = latest_path 30 | print(f"Loading latest checkpoint from {save_path}/{experiment_config[c.CHECKPOINT_NAME]}") 31 | experiment_config[c.BUFFER_SETTING][c.LOAD_BUFFER] = os.path.join( 32 | save_path, f"{experiment_config[c.CHECKPOINT_NAME]}_buffer.pkl") 33 | experiment_config[c.LOAD_MODEL] = os.path.join( 34 | save_path, f"{experiment_config[c.CHECKPOINT_NAME]}.pt") 35 | add_time_tag_to_save_path = False 36 | else: 37 | add_time_tag_to_save_path = True 38 | 39 | return save_path, add_time_tag_to_save_path 40 | 41 | def check_load_as_jumpoff_point(experiment_config, save_path, add_time_tag_to_save_path): 42 | if experiment_config.get(c.LOAD_MODEL_NAME, "") != "": 43 | paths = glob.glob(os.path.join(save_path, '*')) 44 | if len(paths) == 0: 45 | raise ValueError(f"No paths found at {save_path} to load jumpoff point from") 46 | else: 47 | latest_path = sorted(paths)[-1] 48 | model_n = experiment_config[c.LOAD_MODEL_NAME] 49 | buffer_n = experiment_config[c.LOAD_BUFFER_NAME] 50 | print(f"Loading jumpoff point from {latest_path} with model name {model_n}, buffer name {buffer_n}") 51 | 52 | experiment_config[c.BUFFER_SETTING][c.LOAD_BUFFER] = os.path.join( 53 | latest_path, f"{buffer_n}_buffer.pkl") 54 | experiment_config[c.LOAD_MODEL] = os.path.join( 55 | latest_path, f"{model_n}.pt") 56 | experiment_config[c.LOAD_TRACKING_DICT] = os.path.join( 57 | latest_path, f"{model_n}_tracking_dict.pkl") 58 | 59 | save_path = ('/').join(latest_path.split('/')[:-1]) + f'_from_{model_n}' 60 | 61 | add_time_tag_to_save_path = True 62 | else: 63 | add_time_tag_to_save_path = add_time_tag_to_save_path 64 | 65 | return save_path, add_time_tag_to_save_path 66 | 67 | class DummySummaryWriter(): 68 | def add_scalar(self, arg_1, arg_2, arg_3): 69 | pass 70 | 71 | def add_scalars(self, arg_1, arg_2, arg_3): 72 | pass 73 | 74 | def add_text(self, arg_1, arg_2, arg_3): 75 | pass 76 | 77 | 78 | def make_summary_writer(save_path, algo, cfg, add_time_tag=True): 79 | summary_writer = DummySummaryWriter() 80 | cfg[c.ALGO] = algo 81 | if save_path is not None: 82 | if add_time_tag: 83 | time_tag = datetime.strftime(datetime.now(), "%m-%d-%y_%H_%M_%S") 84 | save_path = f"{save_path}/{time_tag}" 85 | os.makedirs(save_path, exist_ok=True) 86 | summary_writer = SummaryWriter(log_dir=f"{save_path}/tensorboard") 87 | pickle.dump( 88 | cfg, 89 | open(f'{save_path}/{algo}_experiment_setting.pkl', 'wb')) 90 | json.dump( 91 | cfg, 92 | open(f'{save_path}/{algo}_experiment_setting.json', 'w'), 93 | indent=4, 94 | default=lambda o: f"<>" 95 | ) 96 | 97 | return summary_writer, save_path 98 | 99 | def get_rng_state(): 100 | return {'torch_rng_state': torch.get_rng_state(), 'np_rng_state': np.random.get_state()} 101 | 102 | def set_rng_state(torch_rng_state, np_rng_state): 103 | torch.set_rng_state(torch_rng_state.cpu()) # without .cpu throws a bizarre error about not being a ByteTensor 104 | np.random.set_state(np_rng_state) 105 | 106 | def set_seed(seed=None): 107 | if seed is None: 108 | seed = np.random.randint(0, c.MAX_INT) 109 | 110 | np.random.seed(seed) 111 | torch.manual_seed(seed) 112 | 113 | 114 | class EpochSummary: 115 | def __init__(self, default_key_length=10, padding=11): 116 | 
self._key_length = default_key_length 117 | self._padding = padding 118 | self._summary = dict() 119 | self._epoch = 0 120 | self._init_tic = timeit.default_timer() 121 | 122 | def log(self, key, value, track_std=True, track_min_max=True, axis=None): 123 | self._key_length = max(self._key_length, len(key)) 124 | self._summary.setdefault(key, { 125 | c.LOG_SETTING: { 126 | c.STANDARD_DEVIATION: track_std, 127 | c.MIN_MAX: track_min_max, 128 | c.AXIS: axis, 129 | }, 130 | c.CONTENT: [] 131 | }) 132 | self._summary[key][c.CONTENT].append(value) 133 | 134 | def new_epoch(self): 135 | self._epoch += 1 136 | self._summary.clear() 137 | self._curr_tic = timeit.default_timer() 138 | 139 | def print_summary(self): 140 | toc = timeit.default_timer() 141 | key_length = self._key_length + self._padding 142 | print("=" * 100) 143 | print(f"Epoch: {self._epoch}") 144 | print(f"Epoch Time Spent: {toc - self._curr_tic}") 145 | print(f"Total Time Spent: {toc - self._init_tic}") 146 | print("=" * 100) 147 | print('|'.join(str(x).ljust(key_length) for x in ("Key", "Content"))) 148 | print("-" * 100) 149 | 150 | # temp fix for scheduler trajs that are not always same length 151 | if 'update_info/scheduler_traj' in self._summary: 152 | del self._summary['update_info/scheduler_traj'] 153 | 154 | for key in sorted(self._summary): 155 | val = self._summary[key][c.CONTENT] 156 | setting = self._summary[key][c.LOG_SETTING] 157 | try: 158 | print('|'.join(str(x).ljust(key_length) for x in (f"{key} - AVG", np.mean(val, axis=setting[c.AXIS])))) 159 | if setting[c.STANDARD_DEVIATION]: 160 | print('|'.join(str(x).ljust(key_length) for x in (f"{key} - STD DEV", np.std(val, axis=setting[c.AXIS])))) 161 | if setting[c.MIN_MAX]: 162 | print('|'.join(str(x).ljust(key_length) for x in (f"{key} - MIN", np.min(val, axis=setting[c.AXIS])))) 163 | print('|'.join(str(x).ljust(key_length) for x in (f"{key} - MAX", np.max(val, axis=setting[c.AXIS])))) 164 | except: 165 | print(val) 166 | print(key) 167 | assert 0 168 | print("=" * 100) 169 | --------------------------------------------------------------------------------
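# --- Usage sketch (added, not part of the original repository) ---
# A minimal example of how the EpochSummary helper defined above can be driven by a
# training loop. The loop structure, logged key names, and values are illustrative
# assumptions; only the EpochSummary API (new_epoch, log, print_summary) comes from
# rl_sandbox/rl_sandbox/utils.py.

from rl_sandbox.utils import EpochSummary

summary = EpochSummary()
for epoch in range(2):
    # Start a fresh epoch: clears logged values and restarts the epoch timer.
    summary.new_epoch()
    for step in range(100):
        # Each call appends a value under its key; mean/std/min/max are printed later.
        summary.log("train/episode_return", 0.01 * step)
        summary.log("train/policy_loss", 1.0 / (step + 1), track_min_max=False)
    # Prints a table of per-key statistics along with epoch and total wall-clock time.
    summary.print_summary()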