├── rl_sandbox ├── rl_sandbox │ ├── __init__.py │ ├── envs │ │ ├── __init__.py │ │ ├── wrappers │ │ │ ├── __init__.py │ │ │ ├── wrapper.py │ │ │ ├── action_repeat.py │ │ │ ├── absorbing_state.py │ │ │ └── frame_stack.py │ │ ├── fake_env.py │ │ └── utils.py │ ├── agents │ │ ├── __init__.py │ │ ├── random_agents.py │ │ └── rl_agents.py │ ├── buffers │ │ ├── __init__.py │ │ ├── wrappers │ │ │ ├── __init__.py │ │ │ ├── buffer_wrapper.py │ │ │ ├── torch_buffer.py │ │ │ └── noise_wrapper.py │ │ ├── buffer.py │ │ ├── disk_buffer.py │ │ └── utils.py │ ├── examples │ │ ├── __init__.py │ │ ├── lfgp │ │ │ ├── __init__.py │ │ │ ├── experts │ │ │ │ ├── __init__.py │ │ │ │ ├── subsample_expert_data.py │ │ │ │ ├── scripted_policies.py │ │ │ │ └── create_subsampled_data.py │ │ │ ├── default_configs │ │ │ │ ├── __init__.py │ │ │ │ └── dac.py │ │ │ ├── experiment_utils.py │ │ │ └── transfer.py │ │ ├── eval_tools │ │ │ └── __init__.py │ │ └── collect_data.py │ ├── train │ │ ├── __init__.py │ │ ├── train_bc.py │ │ ├── train_bc_no_overfit.py │ │ ├── train_multitask_bc.py │ │ ├── train_multitask_bc_no_overfit.py │ │ └── train_dac_sac.py │ ├── algorithms │ │ ├── __init__.py │ │ ├── bc │ │ │ └── __init__.py │ │ ├── dac │ │ │ └── __init__.py │ │ ├── sac │ │ │ └── __init__.py │ │ ├── sac_x │ │ │ ├── __init__.py │ │ │ ├── intentions_update │ │ │ │ └── __init__.py │ │ │ ├── schedulers_update │ │ │ │ ├── __init__.py │ │ │ │ └── q_scheduler.py │ │ │ └── sac_x.py │ │ └── utils.py │ ├── transforms │ │ ├── __init__.py │ │ └── general_transforms.py │ ├── auxiliary_rewards │ │ ├── __init__.py │ │ ├── rce_envs │ │ │ ├── __init__.py │ │ │ ├── door_human_v0.py │ │ │ ├── relocate_human_v0.py │ │ │ ├── hammer_human_v0.py │ │ │ └── sawyer.py │ │ ├── manipulator_learning │ │ │ ├── __init__.py │ │ │ └── panda │ │ │ │ ├── __init__.py │ │ │ │ └── lift_xyz_state.py │ │ └── generic.py │ ├── auxiliary_tasks │ │ ├── __init__.py │ │ ├── auxiliary_tasks.py │ │ ├── utils.py │ │ └── koopman.py │ ├── model_architectures │ │ ├── __init__.py │ │ ├── actor_critics │ │ │ └── __init__.py │ │ ├── discriminators │ │ │ ├── __init__.py │ │ │ └── fully_connected_discriminators.py │ │ ├── layers_definition.py │ │ ├── shared.py │ │ └── utils.py │ └── utils.py └── setup.py ├── pytorch-a2c-ppo-acktr-gail ├── a2c_ppo_acktr │ ├── __init__.py │ ├── algo │ │ ├── __init__.py │ │ ├── a2c_acktr.py │ │ ├── ppo.py │ │ └── gail.py │ ├── utils.py │ ├── distributions.py │ └── arguments.py ├── requirements.txt ├── setup.py ├── gail_experts │ ├── README.md │ ├── convert_to_pytorch.py │ └── convert_lfgp_expert_data.py ├── scripts │ ├── gail_bring.bash │ └── gail.bash ├── LICENSE ├── generate_tmux_yaml.py ├── .gitignore ├── evaluation.py ├── enjoy.py └── README.md ├── system.png ├── scripts ├── experiments │ ├── any_all_seeds.bash │ ├── any_script_any_seeds.bash │ ├── bc.bash │ ├── bc_no_overfit.bash │ ├── multi_bc.bash │ ├── dac.bash │ ├── lfgp.bash │ └── multi_bc_no_overfit.bash ├── create_data │ ├── create_modified_data.bash │ └── create_expert_data.bash ├── evaluation │ └── visualize_model.bash └── plotting │ ├── common.py │ └── multitask_performance.py ├── LICENSE ├── .gitignore └── six_state_mdp.py /rl_sandbox/rl_sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/bc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/dac/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac_x/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/rl_sandbox/rl_sandbox/examples/eval_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/experts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/rce_envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/default_configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac_x/intentions_update/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac_x/schedulers_update/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/actor_critics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/discriminators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/manipulator_learning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /system.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utiasSTARS/lfgp/HEAD/system.png -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/manipulator_learning/panda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/requirements.txt: -------------------------------------------------------------------------------- 1 | gym 2 | matplotlib 3 | pybullet 4 | stable-baselines3 -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/algo/__init__.py: -------------------------------------------------------------------------------- 1 | from .a2c_acktr import A2C_ACKTR 2 | from .ppo import PPO -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/utils.py: -------------------------------------------------------------------------------- 1 | def aug_data(data, num_aug, aug_batch_size): 2 | return data.repeat(1, 
num_aug, *[1] * (len(data.shape) - 2)).reshape( 3 | aug_batch_size, *data.shape[1:]) 4 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='a2c-ppo-acktr', 5 | packages=find_packages(), 6 | version='0.0.1', 7 | install_requires=['gym', 'matplotlib', 'pybullet', 'stable-baselines3']) 8 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/fake_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class FakeEnv: 4 | def __init__(self, obs_dim): 5 | self._obs_dim = obs_dim 6 | 7 | def reset(self): 8 | return np.zeros(self._obs_dim) 9 | 10 | def step(self, action): 11 | return np.zeros(self._obs_dim), 0., False, {} 12 | 13 | def render(self): 14 | pass 15 | -------------------------------------------------------------------------------- /scripts/experiments/any_all_seeds.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SCRIPT=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | 7 | # expert dir should just be the lowest level directory before int_X.gz, 8 | # the rest is handled in the individual script files 9 | EXPERT_DIR=$4 10 | USER_MACHINE=$5 11 | EXPERIMENT_NAME=$6 12 | 13 | seeds=(1 2 3 4 5) 14 | for seed in "${seeds[@]}" 15 | do 16 | bash "${SCRIPT}" "${seed}" "${DEVICE}" "${MAIN_TASK}" "${EXPERT_DIR}" "${USER_MACHINE}" "${EXPERIMENT_NAME}" 17 | done -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/wrappers/wrapper.py: -------------------------------------------------------------------------------- 1 | class Wrapper: 2 | def __init__(self, env): 3 | self._env = env 4 | 5 | def reset(self, **kwargs): 6 | return self._env.reset(**kwargs) 7 | 8 | def step(self, action): 9 | return self._env.step(action) 10 | 11 | def render(self, **kwargs): 12 | return self._env.render(**kwargs) 13 | 14 | def seed(self, seed): 15 | self._env.seed(seed) 16 | 17 | def __getattr__(self, attr): 18 | return getattr(self._env, attr) 19 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/experiment_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import rl_sandbox.constants as c 3 | 4 | def get_save_path(algo_name, main_task, seed, exp_name, top_path="results"): 5 | return os.path.join(top_path, main_task, str(seed), algo_name, exp_name) 6 | 7 | 8 | def config_check(experiment_config, top_path): 9 | """ 10 | custom checks for fixing config for particular machines 11 | """ 12 | if "scratch" in top_path and experiment_config[c.ENV_SETTING][c.ENV_TYPE] == c.MANIPULATOR_LEARNING: 13 | experiment_config[c.ENV_SETTING][c.KWARGS]["egl"] = False -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/gail_experts/README.md: -------------------------------------------------------------------------------- 1 | ## Data 2 | 3 | Download from 4 | https://drive.google.com/open?id=1Ipu5k99nwewVDG1yFetUxqtwVlgBg5su 5 | 6 | and store in this folder.
7 | 8 | ## Convert to pytorch 9 | 10 | ```bash 11 | python convert_to_pytorch.py --h5-file trajs_halfcheetah.h5 12 | ``` 13 | 14 | ## Run 15 | 16 | ```bash 17 | python main.py --env-name "HalfCheetah-v2" --algo ppo --use-gae --log-interval 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --value-loss-coef 0.5 --ppo-epoch 10 --num-mini-batch 32 --gamma 0.99 --gae-lambda 0.95 --num-env-steps 10000000 --use-linear-lr-decay --use-proper-time-limits --gail 18 | ``` 19 | -------------------------------------------------------------------------------- /rl_sandbox/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='rl_sandbox', 4 | version='3.1.0+vpace', 5 | packages=[package for package in find_packages() 6 | if package.startswith('rl_sandbox')], 7 | install_requires=['gym>=0.15.4,<=0.23.0', 8 | 'numpy>=1.23.4,<2.0', 9 | 'tensorboard<=2.11', 10 | 'torch==1.13.*', 11 | 'manipulator_learning @ git+ssh://git@github.com/utiasSTARS/manipulator-learning@master#egg=manipulator_learning', 12 | 'ConfigArgParse', 13 | 'PyYAML'] 14 | ) 15 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/layers_definition.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | import rl_sandbox.constants as c 4 | import rl_sandbox.model_architectures.shared as snn 5 | 6 | 7 | CUSTOM_WIDTH_LINEAR_LAYERS = lambda in_dim, width: ( 8 | [in_dim, width, nn.ReLU(), True, 0], 9 | [width, width, nn.ReLU(), True, 0], 10 | ) 11 | 12 | VALUE_BASED_LINEAR_LAYERS = lambda in_dim: ( 13 | [in_dim, 256, nn.ReLU(), True, 0], 14 | [256, 256, nn.ReLU(), True, 0], 15 | ) 16 | 17 | SAC_DISCRIMINATOR_LINEAR_LAYERS = lambda in_dim: ( 18 | [in_dim, 256, nn.Tanh(), True, 0], 19 | [256, 256, nn.Tanh(), True, 0], 20 | ) 21 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/generic.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import numpy as np 4 | from numpy.linalg import norm 5 | 6 | from rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state import AuxiliaryReward 7 | 8 | 9 | class FromEnvAuxiliaryReward(AuxiliaryReward): 10 | def __init__(self, env, aux_rewards=()): 11 | aux_reward_funcs = [] 12 | if aux_rewards == (): 13 | aux_rewards = env.VALID_AUX_TASKS 14 | for aux_str in aux_rewards: 15 | rew_func = partial(env.get_aux_rew, tasks=(aux_str,)) 16 | rew_func.__qualname__ = aux_str 17 | aux_reward_funcs.append(rew_func) 18 | 19 | super().__init__(aux_reward_funcs, False) 20 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/scripts/gail_bring.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | eval_eps=50 4 | eval_interval=100000 5 | num_env_steps=4000000 6 | expert_file="../gail_experts/data/bring_0-expert_data/reset/int_2.pt" 7 | env="bring_0" 8 | num_processes=1 9 | seed=10 10 | 11 | python ../main.py \ 12 | --seed "$seed" \ 13 | --num-steps 2048 \ 14 | --lr 3e-4 \ 15 | --entropy-coef 0 \ 16 | --value-loss-coef 0.5 \ 17 | --ppo-epoch 10 \ 18 | --num-mini-batch 32 \ 19 | --gamma 0.99 \ 20 | --gae-lambda 0.95 \ 21 | --use-linear-lr-decay \ 22 | --use-proper-time-limits \ 23 | 
--num-processes="$num_processes" \ 24 | --use-gae \ 25 | --algo ppo \ 26 | --gail \ 27 | --eval-interval="$eval_interval" \ 28 | --num-env-steps="$num_env_steps" \ 29 | --gail-experts-file="$expert_file" \ 30 | --env-name="$env" \ 31 | --log-interval 1 \ 32 | --eval-eps "$eval_eps" \ 33 | --no-cuda 34 | # --train-render 35 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/shared.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Flatten(nn.Module): 7 | def forward(self, x): 8 | return x.view(x.size(0), -1) 9 | 10 | 11 | class Split(nn.Module): 12 | def __init__(self, feature_dims): 13 | super().__init__() 14 | self.feature_dims = feature_dims 15 | 16 | def forward(self, x): 17 | features = [] 18 | last_feature_idx = 0 19 | for feature_dim in self.feature_dims: 20 | features.append(x[..., last_feature_idx:last_feature_idx + feature_dim]) 21 | last_feature_idx += feature_dim 22 | return features 23 | 24 | 25 | class Swish(nn.Module): 26 | def __init__(self): 27 | super().__init__() 28 | 29 | def forward(self, x): 30 | x = x * nn.functional.sigmoid(x) 31 | return x 32 | 33 | 34 | class Fuse(nn.Module): 35 | def forward(self, features): 36 | return torch.cat(features, dim=-1) 37 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/transfer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def get_transfer_params(load_existing_dir, load_model, load_buffer, load_transfer_exp_settings, load_aux_old_removal): 5 | if load_model == "": 6 | load_model = False 7 | else: 8 | load_model = os.path.join(load_existing_dir, load_model) 9 | 10 | if load_buffer == "": 11 | load_buffer = False 12 | else: 13 | load_buffer = os.path.join(load_existing_dir, load_buffer) 14 | 15 | # transfer 16 | if load_transfer_exp_settings != "": 17 | # assert load_buffer and load_model 18 | assert load_buffer 19 | load_transfer_exp_settings = os.path.join(load_existing_dir, load_transfer_exp_settings) 20 | else: 21 | load_transfer_exp_settings = False 22 | 23 | if load_aux_old_removal != "": 24 | load_aux_old_removal = load_aux_old_removal.split(',') 25 | else: 26 | load_aux_old_removal = None 27 | 28 | return load_model, load_buffer, load_transfer_exp_settings, load_aux_old_removal -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/scripts/gail.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | eval_eps=50 4 | eval_interval=100000 5 | num_env_steps=4000000 6 | expert_file="../gail_experts/expert-data/${1}/reset/800_steps-90_sp_point5_play_open_200_extra_lasts/int_2.gz" 7 | env=$1 8 | num_processes=1 9 | #seed=$2 10 | 11 | seeds=(1 2 3 4 5) 12 | # seeds=(3 4 5) 13 | # seeds=(10 11) 14 | job_type="$1" 15 | 16 | for seed in "${seeds[@]}" 17 | do 18 | 19 | python ../main.py \ 20 | --seed "$seed" \ 21 | --num-steps 2048 \ 22 | --lr 3e-4 \ 23 | --entropy-coef 0 \ 24 | --value-loss-coef 0.5 \ 25 | --ppo-epoch 10 \ 26 | --num-mini-batch 32 \ 27 | --gamma 0.99 \ 28 | --gae-lambda 0.95 \ 29 | --use-linear-lr-decay \ 30 | --use-proper-time-limits \ 31 | --num-processes="$num_processes" \ 32 | --use-gae \ 33 | --algo ppo \ 34 | --gail \ 35 | --eval-interval="$eval_interval" \ 36 | 
--num-env-steps="$num_env_steps" \ 37 | --gail-experts-file="$expert_file" \ 38 | --env-name="$env" \ 39 | --log-interval 10 \ 40 | --eval-eps "$eval_eps" \ 41 | --no-cuda & 42 | # --train-render 43 | 44 | 45 | done 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2021 STARS Laboratory 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Ilya Kostrikov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/rce_envs/door_human_v0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | 4 | from rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state import AuxiliaryReward 5 | 6 | 7 | # reach reward defined based on how defined in original envs combined with how we defined datasets in rce_multitask_envs.py 8 | def reach(observation, next_observation, **kwargs): 9 | obs = observation 10 | next_obs = next_observation 11 | 12 | palm_to_handle_dist_before = np.linalg.norm(obs[-4:-1]) 13 | palm_to_handle_dist_after = np.linalg.norm(next_obs[-4:-1]) 14 | 15 | return palm_to_handle_dist_before - palm_to_handle_dist_after 16 | 17 | 18 | def grasp(observation, next_observation, **kwargs): 19 | obs = observation 20 | next_obs = next_observation 21 | 22 | latch_turn_before = obs[27] 23 | latch_turn_after = next_obs[27] 24 | 25 | return latch_turn_after - latch_turn_before 26 | 27 | 28 | class DoorHumanV0AuxiliaryReward(AuxiliaryReward): 29 | def __init__(self, env_name, aux_rewards=('reach',)): 30 | aux_reward_funcs = [globals()[ar_str] for ar_str in aux_rewards] 31 | super().__init__(aux_reward_funcs, True) 32 | -------------------------------------------------------------------------------- /scripts/create_data/create_modified_data.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAIN_TASK=$1 4 | STEPS_PER_TASK_ORIG=$2 5 | STEPS_PER_TASK_NEW=$3 6 | NUM_EXTRA_LASTS=$4 7 | KEEP_EVERY_NTH=$5 8 | 9 | 10 | INPUT_PATH_POST="${STEPS_PER_TASK_ORIG}_steps" 11 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 12 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 13 | echo "Using TOP_DIR OF ${TOP_DIR}" 14 | OUTPUT_PATH_POST="${STEPS_PER_TASK_NEW}_steps_${KEEP_EVERY_NTH}_ss_${NUM_EXTRA_LASTS}_el/" 15 | 16 | echo "Generating smaller dataset, subsampled by ${KEEP_EVERY_NTH}, getting ${NUM_EXTRA_LASTS} extra final transitions, for ${MAIN_TASK}, original: ${INPUT_PATH_POST}, new: ${OUTPUT_PATH_POST}." 17 | 18 | IN_PATH="${TOP_DIR}${MAIN_TASK}/${INPUT_PATH_POST}" 19 | OUT_PATH="${TOP_DIR}${MAIN_TASK}/${OUTPUT_PATH_POST}" 20 | 21 | echo "Getting data from ${IN_PATH}, saving new data to ${OUT_PATH}."
22 | 23 | python ../../rl_sandbox/rl_sandbox/examples/lfgp/experts/create_subsampled_data.py \ 24 | --seed=0 \ 25 | --keep_last \ 26 | --input_path="${IN_PATH}" \ 27 | --output_path="${OUT_PATH}" \ 28 | --keep_every_nth="${KEEP_EVERY_NTH}" \ 29 | --num_to_subsample_from="${STEPS_PER_TASK_NEW}" \ 30 | --num_extra_lasts="${NUM_EXTRA_LASTS}" -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/rce_envs/relocate_human_v0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | 4 | from rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state import AuxiliaryReward 5 | 6 | 7 | # reach reward defined based on how defined in original envs combined with how we defined datasets in rce_multitask_envs.py 8 | def reach(observation, next_observation, **kwargs): 9 | obs = observation 10 | next_obs = next_observation 11 | 12 | palm_to_ball_dist_before = np.linalg.norm(obs[-9:-6]) 13 | palm_to_ball_dist_after = np.linalg.norm(next_obs[-9:-6]) 14 | 15 | return palm_to_ball_dist_before - palm_to_ball_dist_after 16 | 17 | 18 | def grasp(observation, next_observation, **kwargs): 19 | obs = observation 20 | next_obs = next_observation 21 | 22 | ball_target_z_diff_before = abs(obs[-1]) 23 | ball_target_z_diff_after = abs(next_obs[-1]) 24 | 25 | return ball_target_z_diff_before - ball_target_z_diff_after 26 | 27 | 28 | class RelocateHumanV0AuxiliaryReward(AuxiliaryReward): 29 | def __init__(self, env_name, aux_rewards=('reach',)): 30 | aux_reward_funcs = [globals()[ar_str] for ar_str in aux_rewards] 31 | super().__init__(aux_reward_funcs, True) 32 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/rce_envs/hammer_human_v0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | 4 | from rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state import AuxiliaryReward 5 | 6 | 7 | # reach reward defined based on how defined in original envs combined with how we defined datasets in rce_multitask_envs.py 8 | def reach(observation, next_observation, **kwargs): 9 | obs = observation 10 | next_obs = next_observation 11 | 12 | palm_to_hammer_dist_before = np.linalg.norm(np.array(obs[-13:-10])-np.array(obs[-10:-7])) 13 | palm_to_hammer_dist_after = np.linalg.norm(np.array(next_obs[-13:-10])-np.array(next_obs[-10:-7])) 14 | 15 | return palm_to_hammer_dist_before - palm_to_hammer_dist_after 16 | 17 | 18 | def grasp(observation, next_observation, **kwargs): 19 | obs = observation 20 | next_obs = next_observation 21 | 22 | hammer_height_before = obs[-8] 23 | hammer_height_after = next_obs[-8] 24 | 25 | return hammer_height_after - hammer_height_before 26 | 27 | 28 | class HammerHumanV0AuxiliaryReward(AuxiliaryReward): 29 | def __init__(self, env_name, aux_rewards=('reach',)): 30 | aux_reward_funcs = [globals()[ar_str] for ar_str in aux_rewards] 31 | super().__init__(aux_reward_funcs, True) 32 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/experts/subsample_expert_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tools for subsampling expert data. 
3 | """ 4 | 5 | import copy 6 | import numpy as np 7 | 8 | 9 | def subsample_buffers(buffers, keep_every_nth, keep_first_last=False): 10 | subsampled_bufs = [] 11 | data_strs = ['observations', 'hidden_states', 'actions', 'rewards', 'dones', 'next_observations', 12 | 'next_hidden_states'] 13 | for b in buffers: 14 | initial_offset = np.random.randint(keep_every_nth) 15 | subsampled_b = copy.deepcopy(b) 16 | if keep_first_last: 17 | raise NotImplementedError() 18 | ends = np.argwhere(np.invert(np.all(b.observations[1:] == b.next_observations[:-1], axis=1))) 19 | 20 | inds = np.array(range(initial_offset, len(b), keep_every_nth)) 21 | 22 | for ds in data_strs: 23 | setattr(subsampled_b, ds, getattr(b, ds)[inds]) 24 | 25 | # infos done separately since it's a dict 26 | for k in b.infos.keys(): 27 | subsampled_b.infos[k] = b.infos[k][inds] 28 | 29 | subsampled_b._pointer = 0 30 | subsampled_b._count = len(inds) 31 | subsampled_b._memory_size = len(inds) 32 | 33 | subsampled_bufs.append(subsampled_b) 34 | 35 | return subsampled_bufs -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/buffer.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import rl_sandbox.constants as c 4 | 5 | 6 | class NoSampleError(Exception): 7 | pass 8 | 9 | 10 | class LengthMismatchError(Exception): 11 | pass 12 | 13 | 14 | class CheckpointIndexError(Exception): 15 | pass 16 | 17 | 18 | class Buffer: 19 | @property 20 | def memory_size(self): 21 | raise NotImplementedError 22 | 23 | @property 24 | def is_full(self): 25 | raise NotImplementedError 26 | 27 | def __len__(self): 28 | raise NotImplementedError 29 | 30 | def push(self, obs, h_state, act, rew, done, info, **kwargs): 31 | raise NotImplementedError 32 | 33 | def clear(self): 34 | raise NotImplementedError 35 | 36 | def sample(self, batch_size, idxes=None): 37 | raise NotImplementedError 38 | 39 | def sample_with_next_obs(self, batch_size, next_obs, next_h_state=None, idxes=None): 40 | raise NotImplementedError 41 | 42 | def sample_consecutive(self, batch_size, end_with_done=False): 43 | raise NotImplementedError 44 | 45 | def save(self, save_path, **kwargs): 46 | raise NotImplementedError 47 | 48 | def load(self, load_path, load_rng=True): 49 | raise NotImplementedError 50 | 51 | def close(self): 52 | pass 53 | -------------------------------------------------------------------------------- /scripts/experiments/any_script_any_seeds.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # seeds should be in the following format: "1 2 3 4 5" 4 | SEEDS=($1) 5 | SCRIPT=$2 6 | DEVICE=$3 7 | MAIN_TASK=$4 8 | 9 | # expert dir should just be the lowest level directory before int_X.gz, 10 | # the rest is handled in the individual script files 11 | EXPERT_DIR=$5 12 | USER_MACHINE=$6 13 | EXPERIMENT_NAME=$7 14 | 15 | # optional for DAC/LfGP only 16 | EXPBUF_LAST_SAMPLE_PROP=$8 # default is .95, 0. turns it off 17 | EXPBUF_MODEL_SAMPLE_RATE=$9 # default is .1, 0. 
turns it off 18 | 19 | # optional for LfGP only 20 | SCHEDULER=${10} 21 | 22 | if [ "${SCRIPT}" = "lfgp.bash" ]; then 23 | for seed in "${SEEDS[@]}" 24 | do 25 | bash "${SCRIPT}" "${seed}" "${DEVICE}" "${MAIN_TASK}" "${EXPERT_DIR}" "${USER_MACHINE}" "${SCHEDULER}" \ 26 | "${EXPBUF_LAST_SAMPLE_PROP}" "${EXPBUF_MODEL_SAMPLE_RATE}" "${EXPERIMENT_NAME}" 27 | done 28 | elif [ "${SCRIPT}" = "dac.bash" ]; then 29 | for seed in "${SEEDS[@]}" 30 | do 31 | bash "${SCRIPT}" "${seed}" "${DEVICE}" "${MAIN_TASK}" "${EXPERT_DIR}" "${USER_MACHINE}" \ 32 | "${EXPBUF_LAST_SAMPLE_PROP}" "${EXPBUF_MODEL_SAMPLE_RATE}" "${EXPERIMENT_NAME}" 33 | done 34 | else 35 | for seed in "${SEEDS[@]}" 36 | do 37 | bash "${SCRIPT}" "${seed}" "${DEVICE}" "${MAIN_TASK}" "${EXPERT_DIR}" "${USER_MACHINE}" "${EXPERIMENT_NAME}" 38 | done 39 | fi -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/transforms/general_transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | class Transform: 6 | def __call__(self, obs): 7 | raise NotImplementedError 8 | 9 | def reset(self): 10 | pass 11 | 12 | 13 | class Identity(Transform): 14 | def __call__(self, obs): 15 | return obs 16 | 17 | 18 | class Compose(Transform): 19 | def __init__(self, transforms): 20 | self._transforms = transforms 21 | 22 | def __call__(self, obs): 23 | for transform in self._transforms: 24 | obs = transform(obs) 25 | return obs 26 | 27 | def reset(self): 28 | for transform in self._transforms: 29 | transform.reset() 30 | 31 | 32 | class AsType(Transform): 33 | def __init__(self, dtype=np.float32): 34 | self._dtype = dtype 35 | 36 | def __call__(self, obs): 37 | return obs.astype(self._dtype) 38 | 39 | 40 | class FrameStack(Transform): 41 | def __init__(self, frame_dim): 42 | """ Stack observations along axis 0. Assumes each observation has one fewer dimension than frame_dim. 43 | """ 44 | assert len(frame_dim) > 1 45 | self._frame_dim = frame_dim 46 | self._frames = np.zeros(shape=frame_dim, dtype=np.float32) 47 | 48 | def __call__(self, obs): 49 | self._frames = np.concatenate((self._frames[1:], [obs])) 50 | return self._frames 51 | 52 | def reset(self): 53 | self._frames.fill(0) 54 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/wrappers/action_repeat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import rl_sandbox.constants as c 4 | 5 | 6 | class ActionRepeatWrapper: 7 | def __init__(self, env, action_repeat, discount_factor=1, enable_discounting=False): 8 | assert action_repeat > 0 9 | self._env = env 10 | self._action_repeat = action_repeat 11 | self._enable_discounting = enable_discounting 12 | self._discount_factor = discount_factor if enable_discounting else 1.
13 | 14 | def __getattr__(self, attr): 15 | return getattr(self._env, attr) 16 | 17 | def reset(self, **kwargs): 18 | return self._env.reset(**kwargs) 19 | 20 | def step(self, action, **kwargs): 21 | done = False 22 | cum_reward = 0 23 | num_repeated = 0 24 | infos = { 25 | c.INFOS: [] 26 | } 27 | 28 | while not done and num_repeated < self._action_repeat: 29 | obs, reward, done, info = self._env.step(action, **kwargs) 30 | cum_reward += (self._discount_factor ** num_repeated) * reward 31 | num_repeated += 1 32 | infos[c.INFOS].append(info) 33 | 34 | infos[c.DISCOUNTING] = np.array([num_repeated if self._enable_discounting else 1]) 35 | 36 | return obs, cum_reward, done, infos 37 | 38 | def render(self, **kwargs): 39 | return self._env.render(**kwargs) 40 | 41 | def seed(self, seed): 42 | self._env.seed(seed) 43 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/rce_envs/sawyer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | 4 | from rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state import AuxiliaryReward 5 | 6 | 7 | # reach reward defined based on how rewards are defined for rce envs 8 | def reach(observation, next_observation, **kwargs): 9 | obs = observation 10 | next_obs = next_observation 11 | 12 | ee_pos = obs[:3] 13 | obj_pos = obs[3:6] 14 | next_ee_pos = next_obs[:3] 15 | next_obj_pos = next_obs[3:6] 16 | 17 | d_before = norm(ee_pos - obj_pos) 18 | d_after = norm(next_ee_pos - next_obj_pos) 19 | return d_before - d_after 20 | 21 | 22 | def grasp(observation, next_observation, action, **kwargs): 23 | obs = observation 24 | next_obs = next_observation 25 | 26 | reach_rew = reach(obs, next_obs) 27 | 28 | obj_z_pos = obs[5] 29 | next_obj_z_pos = next_obs[5] 30 | z_inc = next_obj_z_pos - obj_z_pos 31 | grip_pos = obs[6:8] 32 | next_grip_pos = next_obs[6:8] 33 | grip_inc = next_grip_pos[0] - grip_pos[0] - (next_grip_pos[1] - grip_pos[1]) # 2nd index goes negative as it closes 34 | 35 | grasp_rew = z_inc + grip_inc 36 | 37 | return reach_rew + grasp_rew 38 | 39 | class SawyerAuxiliaryReward(AuxiliaryReward): 40 | def __init__(self, env_name, aux_rewards=('reach',)): 41 | aux_reward_funcs = [globals()[ar_str] for ar_str in aux_rewards] 42 | super().__init__(aux_reward_funcs, True) 43 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/gail_experts/convert_to_pytorch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | import h5py 6 | import numpy as np 7 | import torch 8 | 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser( 12 | 'Converts expert trajectories from h5 to pt format.') 13 | parser.add_argument( 14 | '--h5-file', 15 | default='trajs_halfcheetah.h5', 16 | help='input h5 file', 17 | type=str) 18 | parser.add_argument( 19 | '--pt-file', 20 | default=None, 21 | help='output pt file, by default replaces file extension with pt', 22 | type=str) 23 | args = parser.parse_args() 24 | 25 | if args.pt_file is None: 26 | args.pt_file = os.path.splitext(args.h5_file)[0] + '.pt' 27 | 28 | with h5py.File(args.h5_file, 'r') as f: 29 | dataset_size = f['obs_B_T_Do'].shape[0] # full dataset size 30 | 31 | states = f['obs_B_T_Do'][:dataset_size, ...][...] 32 | actions = f['a_B_T_Da'][:dataset_size, ...][...] 
33 | rewards = f['r_B_T'][:dataset_size, ...][...] 34 | lens = f['len_B'][:dataset_size, ...][...] 35 | 36 | states = torch.from_numpy(states).float() 37 | actions = torch.from_numpy(actions).float() 38 | rewards = torch.from_numpy(rewards).float() 39 | lens = torch.from_numpy(lens).long() 40 | 41 | data = { 42 | 'states': states, 43 | 'actions': actions, 44 | 'rewards': rewards, 45 | 'lengths': lens 46 | } 47 | 48 | torch.save(data, args.pt_file) 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_tasks/auxiliary_tasks.py: -------------------------------------------------------------------------------- 1 | class AuxiliaryTask: 2 | def load_state_dict(self, state_dict): 3 | pass 4 | 5 | def state_dict(self): 6 | pass 7 | 8 | def compute_loss(self, next_obs, next_h_state): 9 | return 0., dict() 10 | 11 | def zero_grad(self): 12 | pass 13 | 14 | def step(self): 15 | pass 16 | 17 | 18 | class AuxiliaryTasks(AuxiliaryTask): 19 | def __init__(self, aux_tasks): 20 | super().__init__() 21 | self._aux_tasks = aux_tasks 22 | 23 | def load_state_dict(self, state_dict): 24 | for task_name, task_state_dict in state_dict.items(): 25 | assert task_name in self._aux_tasks 26 | self._aux_tasks[task_name].load_state_dict(task_state_dict) 27 | 28 | def state_dict(self): 29 | state_dict = dict() 30 | for task_name, task in self._aux_tasks.items(): 31 | state_dict[task_name] = task.state_dict() 32 | return state_dict 33 | 34 | def compute_loss(self, next_obs, next_h_state): 35 | update_info = dict() 36 | 37 | total_loss = 0 38 | for task_name, task in self._aux_tasks.items(): 39 | loss = task.compute_loss(next_obs, next_h_state) 40 | update_info[task_name] = loss.detach().cpu() 41 | 42 | total_loss += loss 43 | 44 | return total_loss, update_info 45 | 46 | def zero_grad(self): 47 | for task in self._aux_tasks.values(): 48 | task.opt.zero_grad() 49 | 50 | def step(self): 51 | for task in self._aux_tasks.values(): 52 | task.opt.step() 53 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/gail_experts/convert_lfgp_expert_data.py: -------------------------------------------------------------------------------- 1 | import _pickle as pickle 2 | import gzip 3 | import numpy as np 4 | import torch 5 | import os 6 | import argparse 7 | 8 | TOP_DIR=os.path.join(os.environ['LFGP_TOP_DIR'], 'play_xyz', 'expert-data') 9 | MID="reset/800_steps-90_sp_point5_play_open_200_extra_lasts/" 10 | END="int_2.gz" 11 | TASKS = ['stack_0', 'bring_0', 'insert_0', 'unstack_stack_env_only_0'] 12 | EXPERT_PATH_DICT = { 13 | "stack_0": os.path.join(TOP_DIR, "open-close-stack-lift-reach-move", MID, END), 14 | "bring_0": os.path.join(TOP_DIR, "open-close-bring-lift-reach-move", MID, END), 15 | "insert_0": os.path.join(TOP_DIR, "open-close-insert-bring-lift-reach-move", MID, END), 16 | "unstack_stack_env_only_0": os.path.join(TOP_DIR, "open-close-unstackstack-lift-reach-move-35M", MID, END), 17 | } 18 | 19 | for t in TASKS: 20 | # src_path = f"data/{t}-expert_data/reset/int_2.gz" 21 | src_path = EXPERT_PATH_DICT[t] 22 | dst_path = f"expert-data/{t}/{MID}" 23 | dst_file = 'int_2.gz' 24 | 25 | src_data = pickle.load(gzip.open(src_path, "rb")) 26 | 27 | print(src_data.keys()) 28 | 29 | ep_start_idxes = [0] 30 | for idx, (curr_obs, next_obs) in enumerate(zip(src_data["observations"][1:], src_data["next_observations"][:-1])): 31 | if np.any(curr_obs != next_obs): 32 | 
ep_start_idxes.append(idx + 1) 33 | 34 | num_eps = len(ep_start_idxes) 35 | 36 | print(num_eps) 37 | 38 | data = { 39 | 'states': src_data["observations"][:, :-1], 40 | 'actions': src_data["actions"], 41 | } 42 | 43 | os.makedirs(dst_path, exist_ok=True) 44 | torch.save(data, os.path.join(dst_path, dst_file)) -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/generate_tmux_yaml.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import yaml 4 | 5 | parser = argparse.ArgumentParser(description='Process some integers.') 6 | parser.add_argument( 7 | '--num-seeds', 8 | type=int, 9 | default=4, 10 | help='number of random seeds to generate') 11 | parser.add_argument( 12 | '--env-names', 13 | default="PongNoFrameskip-v4", 14 | help='environment name separated by semicolons') 15 | args = parser.parse_args() 16 | 17 | ppo_mujoco_template = "python main.py --env-name {0} --algo ppo --use-gae --log-interval 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --value-loss-coef 0.5 --ppo-epoch 10 --num-mini-batch 32 --gamma 0.99 --tau 0.95 --num-env-steps 1000000 --use-linear-lr-decay --no-cuda --log-dir /tmp/gym/{1}/{1}-{2} --seed {2} --use-proper-time-limits" 18 | 19 | ppo_atari_template = "env CUDA_VISIBLE_DEVICES={2} python main.py --env-name {0} --algo ppo --use-gae --lr 2.5e-4 --clip-param 0.1 --value-loss-coef 0.5 --num-processes 8 --num-steps 128 --num-mini-batch 4 --log-interval 1 --use-linear-lr-decay --entropy-coef 0.01 --log-dir /tmp/gym/{1}/{1}-{2} --seed {2}" 20 | 21 | template = ppo_atari_template 22 | 23 | config = {"session_name": "run-all", "windows": []} 24 | 25 | for i in range(args.num_seeds): 26 | panes_list = [] 27 | for env_name in args.env_names.split(';'): 28 | panes_list.append( 29 | template.format(env_name, 30 | env_name.split('-')[0].lower(), i)) 31 | 32 | config["windows"].append({ 33 | "window_name": "seed-{}".format(i), 34 | "panes": panes_list 35 | }) 36 | 37 | yaml.dump(config, open("run_all.yaml", "w"), default_flow_style=False) 38 | -------------------------------------------------------------------------------- /scripts/evaluation/visualize_model.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | MODEL_PATH_AFTER_TOP="$2" 5 | MODEL_NAME="$3" 6 | CONFIG_NAME="$4" 7 | NUM_EPISODES="$5" 8 | INTENTION="$6" 9 | RENDER="$7" 10 | STOCHASTIC="$8" 11 | FORCED_SCHEDULE="$9" 12 | 13 | # some ideas for what you might want to try with forced schedule! 
14 | # FORCED_SCHEDULE="{0: 4, 45: 3, 90: 5, 135: 2, 180: 0}" 15 | # FORCED_SCHEDULE="{0: 3, 25: 2, 50: 4, 75: 5, 100: 1, 125: 3, 150: 4, 175: 2, 200: 1}" 16 | # FORCED_SCHEDULE="{0: 3, 90: 2, 180: 0}" 17 | # FORCED_SCHEDULE="{0: 4, 45: 3, 90: 2, 180: 0}" 18 | # FORCED_SCHEDULE="{0: 4, 45: 3, 90: 2, 135: 0, 180: 4, 225: 3, 270: 2, 315: 0}" 19 | # FORCED_SCHEDULE="{0: 3, 45: 2, 90: 0, 135: 3, 180: 2, 225: 0, 270: 3, 315: 2}" 20 | # FORCED_SCHEDULE="{0: 5, 90: 2, 180: 0}" 21 | # FORCED_SCHEDULE="{0: 5, 45: 2, 90: 4, 135: 2, 180: 3, 225: 2, 270: 2, 315: 5}" # realistic WRS ep 22 | 23 | 24 | DEFAULT_TOP_DIR="../../lfgp_data/trained_models/" 25 | TOP_DIR=${LFGP_MODEL_TOP_DIR:=${DEFAULT_TOP_DIR}} 26 | echo "Using TOP_DIR OF ${TOP_DIR}" 27 | 28 | COMMON_TOP="${TOP_DIR}/${MODEL_PATH_AFTER_TOP}" 29 | MODEL_PATH="${COMMON_TOP}/${MODEL_NAME}" 30 | CONFIG_PATH="${COMMON_TOP}/${CONFIG_NAME}" 31 | 32 | PYTHON_TO_EXEC=$(cat <<-END 33 | ../../rl_sandbox/rl_sandbox/examples/eval_tools/evaluate.py 34 | --seed=${SEED} 35 | --model_path=${MODEL_PATH} 36 | --config_path=${CONFIG_PATH} 37 | --num_episodes=${NUM_EPISODES} 38 | --intention=${INTENTION} 39 | --model_path=${MODEL_PATH} 40 | --forced_schedule=${FORCED_SCHEDULE} 41 | --force_egl 42 | END 43 | ) 44 | 45 | if [ "${RENDER}" = "true" ]; then 46 | PYTHON_TO_EXEC+=" --render" 47 | fi 48 | 49 | if [ "${STOCHASTIC}" = "true" ]; then 50 | PYTHON_TO_EXEC+=" --stochastic" 51 | fi 52 | 53 | python ${PYTHON_TO_EXEC} -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | trained_models/ 104 | .fuse_hidden* 105 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from a2c_ppo_acktr.envs import VecNormalize 8 | 9 | 10 | # Get a render function 11 | def get_render_func(venv): 12 | if hasattr(venv, 'envs'): 13 | return venv.envs[0].render 14 | elif hasattr(venv, 'venv'): 15 | return get_render_func(venv.venv) 16 | elif hasattr(venv, 'env'): 17 | return get_render_func(venv.env) 18 | 19 | return None 20 | 21 | 22 | def get_vec_normalize(venv): 23 | if isinstance(venv, VecNormalize): 24 | return venv 25 | elif hasattr(venv, 'venv'): 26 | return get_vec_normalize(venv.venv) 27 | 28 | return None 29 | 30 | 31 | # Necessary for my KFAC implementation. 
32 | class AddBias(nn.Module): 33 | def __init__(self, bias): 34 | super(AddBias, self).__init__() 35 | self._bias = nn.Parameter(bias.unsqueeze(1)) 36 | 37 | def forward(self, x): 38 | if x.dim() == 2: 39 | bias = self._bias.t().view(1, -1) 40 | else: 41 | bias = self._bias.t().view(1, -1, 1, 1) 42 | 43 | return x + bias 44 | 45 | 46 | def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr): 47 | """Decreases the learning rate linearly""" 48 | lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs))) 49 | for param_group in optimizer.param_groups: 50 | param_group['lr'] = lr 51 | 52 | 53 | def init(module, weight_init, bias_init, gain=1): 54 | weight_init(module.weight.data, gain=gain) 55 | bias_init(module.bias.data) 56 | return module 57 | 58 | 59 | def cleanup_log_dir(log_dir): 60 | try: 61 | os.makedirs(log_dir) 62 | except OSError: 63 | files = glob.glob(os.path.join(log_dir, '*.monitor.csv')) 64 | for f in files: 65 | os.remove(f) 66 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/agents/random_agents.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import rl_sandbox.constants as c 4 | 5 | 6 | class UniformContinuousAgent: 7 | def __init__(self, min_action, max_action, rng=np.random): 8 | self.min_action = np.array(min_action) 9 | self.max_action = np.array(max_action) 10 | self.entropy = np.log(max_action - min_action).astype(np.float32) 11 | self.log_prob = -self.entropy.sum(keepdims=True).astype(np.float32) 12 | self._act_info = { 13 | c.LOG_PROB: self.log_prob, 14 | c.ENTROPY: self.entropy, 15 | c.VALUE: np.array([np.nan], dtype=np.float32), 16 | c.MEAN: ((max_action + min_action) / 2).astype(np.float32), 17 | c.VARIANCE: (((max_action - min_action) ** 2) / 2).astype(np.float32), 18 | } 19 | self.rng = rng 20 | 21 | def compute_action(self, **kwargs): 22 | return self.rng.uniform(self.min_action, self.max_action).astype(np.float32), None, self._act_info 23 | 24 | def reset(self): 25 | return None 26 | 27 | 28 | class UniformContinuousActionRepeatAgent(UniformContinuousAgent): 29 | def __init__(self, min_action, max_action, max_repeat, min_repeat=1, rng=np.random): 30 | super().__init__(min_action, max_action, rng) 31 | self.min_repeat = min_repeat 32 | self.max_repeat = max_repeat 33 | self._cur_action = super().compute_action() 34 | self.reset() 35 | 36 | def compute_action(self, **kwargs): 37 | if self._ts >= self._action_repeat: 38 | self._cur_action = super().compute_action() 39 | self._action_repeat = self.rng.randint(self.min_repeat, self.max_repeat) 40 | self._ts = 0 41 | self._ts += 1 42 | 43 | return self._cur_action 44 | 45 | def reset(self): 46 | self._ts = 0 47 | self._action_repeat = self.rng.randint(self.min_repeat, self.max_repeat) 48 | return None -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_tasks/utils.py: -------------------------------------------------------------------------------- 1 | import rl_sandbox.constants as c 2 | 3 | from rl_sandbox.auxiliary_tasks.auxiliary_tasks import AuxiliaryTasks 4 | from rl_sandbox.auxiliary_tasks.koopman import Koopman, KoopmanDynamics 5 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 6 | 7 | 8 | def make_auxiliary_tasks(tasks, model, buffer, cfg): 9 | aux_tasks = dict() 10 | if tasks is not None: 11 | for task_name, task_setting in tasks.items(): 12 | assert task_name not in 
aux_tasks 13 | if task_name == c.KOOPMAN: 14 | task_setting[c.MODEL_SETTING][c.KWARGS][c.LAYERS_DIM] = model.encoder.layers_dim 15 | decoder = make_model(task_setting[c.MODEL_SETTING]).to(task_setting[c.DEVICE]) 16 | dynamics = KoopmanDynamics(z_dim=task_setting[c.Z_DIM], 17 | u_dim=task_setting[c.U_DIM], 18 | device=task_setting[c.DEVICE]) 19 | aux_opt = make_optimizer(list(decoder.parameters()) + list(dynamics.parameters()), task_setting[c.OPTIMIZER_SETTING]) 20 | 21 | aux_tasks[c.KOOPMAN] = Koopman(rec_dim=task_setting[c.REC_DIM], 22 | batch_size=task_setting[c.BATCH_SIZE], 23 | decoder=decoder, 24 | encoder=model.encoder, 25 | dynamics=dynamics, 26 | opt=aux_opt, 27 | buffer=buffer, 28 | algo_params=cfg, 29 | reduction=task_setting[c.REDUCTION], 30 | loss_coef=task_setting[c.LOSS_COEF], 31 | device=task_setting[c.DEVICE]) 32 | else: 33 | raise NotImplementedError 34 | return AuxiliaryTasks(aux_tasks) 35 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/wrappers/absorbing_state.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import rl_sandbox.constants as c 4 | 5 | from rl_sandbox.envs.wrappers.wrapper import Wrapper 6 | 7 | 8 | class AbsorbingStateWrapper(Wrapper): 9 | def __init__(self, env, create_absorbing_state, max_episode_length): 10 | super().__init__(env) 11 | self._done = False 12 | self._obs = None 13 | self._max_episode_length = max_episode_length 14 | self._create_absorbing_state = create_absorbing_state 15 | self._prev_info = None 16 | 17 | def _get_obs(self): 18 | if self._done: 19 | # Return absorbing state which is [0, ..., 0, 1] 20 | return np.eye(self._obs.size + 1)[-1] 21 | 22 | return np.concatenate((self._obs.reshape(-1), [0]), axis=0) 23 | 24 | def reset(self, **kwargs): 25 | self._curr_timestep = 0 26 | self._obs = self._env.reset(**kwargs) 27 | self._done = False 28 | return self._get_obs() 29 | 30 | def step(self, action, **kwargs): 31 | self._curr_timestep += 1 32 | if self._create_absorbing_state and self._done: 33 | return self._get_obs(), 0., True, {**self._prev_info, c.ABSORBING_STATE: True, c.DONE: False} 34 | 35 | self._obs, reward, done, info = self._env.step(action, **kwargs) 36 | self._prev_info = info 37 | info[c.ABSORBING_STATE] = False 38 | info[c.DONE] = done 39 | if self._create_absorbing_state and self._curr_timestep < self._max_episode_length and done: 40 | self._done = True 41 | done = False # otherwise env will reset without getting next absorbing state 42 | return self._get_obs(), reward, done, info 43 | 44 | def render(self, **kwargs): 45 | return self._env.render(**kwargs) 46 | 47 | def seed(self, seed): 48 | self._env.seed(seed) 49 | 50 | 51 | def check_absorbing(config): 52 | for wrapper in config[c.ENV_SETTING][c.ENV_WRAPPERS]: 53 | if wrapper[c.WRAPPER] == AbsorbingStateWrapper: 54 | return True 55 | return False 56 | -------------------------------------------------------------------------------- /scripts/experiments/bc.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | EXPERIMENT_NAME="${EXPERT_DIR}_$6" 9 | 10 | 11 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 12 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 13 | echo "Using TOP_DIR OF ${TOP_DIR}" 14 | 15 | DEFAULT_STACK_DIR="stack/" 16 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 17 | 18 | 
DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 19 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 20 | 21 | DEFAULT_BRING_DIR="bring/" 22 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 23 | 24 | DEFAULT_INSERT_DIR="insert/" 25 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 26 | 27 | 28 | if [ "${MAIN_TASK}" = "stack" ]; then 29 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 30 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 31 | EXPERT_PATH="${PRE}2.gz" 32 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 33 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 34 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 35 | EXPERT_PATH="${PRE}2.gz" 36 | elif [ "${MAIN_TASK}" = "bring" ]; then 37 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 38 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 39 | EXPERT_PATH="${PRE}2.gz" 40 | elif [ "${MAIN_TASK}" = "insert" ]; then 41 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 42 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 43 | EXPERT_PATH="${PRE}2.gz" 44 | else 45 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 46 | exit 1 47 | fi 48 | 49 | 50 | echo "Running BC (early stopping) for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 51 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 52 | 53 | PYTHON_TO_EXEC=$(cat <<-END 54 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_bc.py 55 | --seed=${SEED} 56 | --expert_path=${EXPERT_PATH} 57 | --main_task=${MAIN_TASK}_0 58 | --device=${DEVICE} 59 | --exp_name=${EXPERIMENT_NAME} 60 | --user_machine=${USER_MACHINE} 61 | END 62 | ) 63 | 64 | if [[ "${DEVICE}" == *"cuda"* ]]; then 65 | PYTHON_TO_EXEC+=" --gpu_buffer" 66 | fi 67 | 68 | python ${PYTHON_TO_EXEC} -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/wrappers/buffer_wrapper.py: -------------------------------------------------------------------------------- 1 | from rl_sandbox.buffers.buffer import Buffer 2 | 3 | 4 | class BufferWrapper(Buffer): 5 | def __init__(self, buffer): 6 | self.buffer = buffer 7 | 8 | def __getattr__(self, attr): 9 | return getattr(self.buffer, attr) 10 | 11 | def sample(self, batch_size, idxes=None): 12 | return self.buffer.sample(batch_size, idxes) 13 | 14 | # def sample_with_next_obs(self, batch_size, next_obs, next_h_state=None, idxes=None): 15 | # return self.buffer.sample_with_next_obs(batch_size, next_obs, next_h_state, idxes) 16 | def sample_with_next_obs(self, *args, **kwargs): 17 | return self.buffer.sample_with_next_obs(*args, **kwargs) 18 | 19 | def sample_consecutive(self, batch_size, end_with_done=False): 20 | return self.buffer.sample_consecutive(batch_size, end_with_done) 21 | 22 | def sample_init_obs(self, batch_size): 23 | return self.buffer.sample_init_obs(batch_size) 24 | 25 | def sample_trajs(self, batch_size, next_obs, idxes=None, horizon_length=2): 26 | return self.buffer.sample_trajs(batch_size, next_obs, idxes, horizon_length) 27 | 28 | @property 29 | def memory_size(self): 30 | return self.buffer.memory_size 31 | 32 | @property 33 | def is_full(self): 34 | return self.buffer.is_full 35 | 36 | def __len__(self): 37 | return len(self.buffer) 38 | 39 | def push(self, obs, h_state, act, rew, done, info, *args, **kwargs): 40 | self.buffer.push(obs, h_state, act, rew, done, info, *args, **kwargs) 41 | 42 | def clear(self): 43 | return self.buffer.clear() 44 | 45 | def save(self, save_path, **kwargs): 46 | return self.buffer.save(save_path, **kwargs) 47 | 48 | # def load(self, load_path, load_rng=True): 49 | # return 
self.buffer.load(load_path, load_rng=load_rng) 50 | def load(self, *args, **kwargs): 51 | return self.buffer.load(*args, **kwargs) 52 | 53 | def transfer_data(self, load_path): 54 | return self.buffer.transfer_data(load_path) 55 | 56 | def close(self): 57 | return self.buffer.close() 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # output 2 | *.out 3 | results/ 4 | lfgp_data/ 5 | rce/exp_data 6 | scripts/lfebp/create_data 7 | 8 | # gail outputs 9 | pytorch-a2c-ppo-acktr-gail/gail_experts/data/ 10 | pytorch-a2c-ppo-acktr-gail/scripts/eval_logs/ 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | .envrc 104 | 105 | # Spyder project settings 106 | .spyderproject 107 | .spyproject 108 | 109 | # Rope project settings 110 | .ropeproject 111 | 112 | # mkdocs documentation 113 | /site 114 | 115 | # mypy 116 | .mypy_cache/ 117 | 118 | # vscode 119 | .vscode 120 | 121 | # lfgp 122 | pytorch-a2c-ppo-acktr-gail/gail_experts/expert-data 123 | pytorch-a2c-ppo-acktr-gail/scripts/eval_logs/ 124 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | os.environ["MUJOCO_GL"] = "egl" 5 | 6 | import rl_sandbox.constants as c 7 | from rl_sandbox.envs.wrappers.absorbing_state import AbsorbingStateWrapper 8 | 9 | def make_env(env_config, seed=None, dummy_env=False): 10 | assert env_config[c.ENV_TYPE] in c.VALID_ENV_TYPE 11 | if env_config[c.ENV_TYPE] == c.GYM: 12 | import gym 13 | import pybullet_envs 14 | env = gym.make(**env_config[c.ENV_BASE]) 15 | elif env_config[c.ENV_TYPE] == c.DM_CONTROL: 16 | from dm_control import suite 17 | env = suite.load(**env_config[c.ENV_BASE]) 18 | elif env_config[c.ENV_TYPE] == c.MANIPULATOR_LEARNING: 19 | import manipulator_learning.sim.envs as manlearn_envs 20 | env = getattr(manlearn_envs, 21 | 
env_config[c.ENV_BASE][c.ENV_NAME])(dense_reward=False, **env_config.get(c.KWARGS, {})) 22 | elif env_config[c.ENV_TYPE] in [c.SAWYER, c.HAND_DAPG]: 23 | import rl_sandbox.envs.rce_envs as rce_envs 24 | env = rce_envs.load_env(env_config[c.ENV_BASE][c.ENV_NAME], gym_env=True, **env_config.get(c.KWARGS, {})) 25 | elif env_config[c.ENV_TYPE] == c.PANDA_RL_ENVS: 26 | import panda_rl_envs 27 | env_kwargs = env_config.get(c.KWARGS, {}) 28 | env_config_dict = env_kwargs.get("config_dict", {}) 29 | env_config_dict['dummy_env'] = dummy_env 30 | env_kwargs['config_dict'] = env_config_dict 31 | env = getattr(panda_rl_envs, env_config[c.ENV_BASE][c.ENV_NAME])(**env_kwargs) 32 | else: 33 | raise NotImplementedError 34 | 35 | for wrapper_config in env_config[c.ENV_WRAPPERS]: 36 | env = wrapper_config[c.WRAPPER](env, **wrapper_config[c.KWARGS]) 37 | 38 | if seed is None: 39 | seed = np.random.randint(0, 2 ** 32 - 1) 40 | 41 | env.seed(seed) 42 | 43 | return env 44 | 45 | 46 | def absorbing_check(algo_params): 47 | absorbing_in_settings = False 48 | if c.ENV_WRAPPERS in algo_params[c.ENV_SETTING]: 49 | for wrapper in algo_params[c.ENV_SETTING][c.ENV_WRAPPERS]: 50 | if wrapper[c.WRAPPER] == AbsorbingStateWrapper: 51 | absorbing_in_settings = True 52 | 53 | return absorbing_in_settings -------------------------------------------------------------------------------- /scripts/experiments/bc_no_overfit.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | EXPERIMENT_NAME="${EXPERT_DIR}_$6" 9 | 10 | 11 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 12 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 13 | echo "Using TOP_DIR OF ${TOP_DIR}" 14 | 15 | DEFAULT_STACK_DIR="stack/" 16 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 17 | 18 | DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 19 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 20 | 21 | DEFAULT_BRING_DIR="bring/" 22 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 23 | 24 | DEFAULT_INSERT_DIR="insert/" 25 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 26 | 27 | 28 | if [ "${MAIN_TASK}" = "stack" ]; then 29 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 30 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 31 | EXPERT_PATH="${PRE}2.gz" 32 | NUM_TRAINING=20 33 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 34 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 35 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 36 | EXPERT_PATH="${PRE}2.gz" 37 | NUM_TRAINING=20 38 | elif [ "${MAIN_TASK}" = "bring" ]; then 39 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 40 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 41 | EXPERT_PATH="${PRE}2.gz" 42 | NUM_TRAINING=20 43 | elif [ "${MAIN_TASK}" = "insert" ]; then 44 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 45 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 46 | EXPERT_PATH="${PRE}2.gz" 47 | NUM_TRAINING=40 48 | else 49 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 50 | exit 1 51 | fi 52 | 53 | 54 | echo "Running BC for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 55 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 
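# The command below is assembled as a single string (heredoc) so optional flags can be
# appended before execution -- here, --gpu_buffer is added whenever DEVICE contains "cuda".
# A minimal standalone sketch of the same pattern, with illustrative names only:
#   CMD=$(cat <<-END
#   some_script.py --seed=${SEED}
#   END
#   )
#   if [[ "${DEVICE}" == *"cuda"* ]]; then CMD+=" --extra_flag"; fi
#   python ${CMD}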
56 | 57 | PYTHON_TO_EXEC=$(cat <<-END 58 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_bc_no_overfit.py 59 | --seed=${SEED} 60 | --expert_path=${EXPERT_PATH} 61 | --main_task=${MAIN_TASK}_0 62 | --device=${DEVICE} 63 | --exp_name=${EXPERIMENT_NAME} 64 | --user_machine=${USER_MACHINE} 65 | --num_training=${NUM_TRAINING} 66 | --num_updates=100000 67 | --num_evals=50 68 | END 69 | ) 70 | 71 | if [[ "${DEVICE}" == *"cuda"* ]]; then 72 | PYTHON_TO_EXEC+=" --gpu_buffer" 73 | fi 74 | 75 | python ${PYTHON_TO_EXEC} 76 | -------------------------------------------------------------------------------- /scripts/experiments/multi_bc.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | EXPERIMENT_NAME="${EXPERT_DIR}_$6" 9 | 10 | 11 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 12 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 13 | echo "Using TOP_DIR OF ${TOP_DIR}" 14 | 15 | DEFAULT_STACK_DIR="stack/" 16 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 17 | 18 | DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 19 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 20 | 21 | DEFAULT_BRING_DIR="bring/" 22 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 23 | 24 | DEFAULT_INSERT_DIR="insert/" 25 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 26 | 27 | 28 | if [ "${MAIN_TASK}" = "stack" ]; then 29 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 30 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 31 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 32 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 33 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 34 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 35 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 36 | elif [ "${MAIN_TASK}" = "bring" ]; then 37 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 38 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 39 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 40 | elif [ "${MAIN_TASK}" = "insert" ]; then 41 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 42 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 43 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz,${PRE}6.gz" 44 | else 45 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 46 | exit 1 47 | fi 48 | 49 | 50 | echo "Running Multi BC (early stopping) for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 51 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 
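# Multitask BC consumes one expert buffer per intention, passed to --expert_path as a single
# comma-separated string (int_0.gz ... int_5.gz above, plus int_6.gz for insert).
# Example invocation, with illustrative argument values:
#   bash multi_bc.bash 1 cuda:0 stack 800_steps local my_experiment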
52 | 53 | PYTHON_TO_EXEC=$(cat <<-END 54 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_multitask_bc.py 55 | --seed=${SEED} 56 | --expert_path=${EXPERT_PATHS} 57 | --main_task=${MAIN_TASK}_0 58 | --device=${DEVICE} 59 | --exp_name=${EXPERIMENT_NAME} 60 | --user_machine=${USER_MACHINE} 61 | END 62 | ) 63 | 64 | if [[ "${DEVICE}" == *"cuda"* ]]; then 65 | PYTHON_TO_EXEC+=" --gpu_buffer" 66 | fi 67 | 68 | python ${PYTHON_TO_EXEC} 69 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/disk_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import torch 4 | 5 | from rl_sandbox.buffers.ram_buffer import NumPyBuffer 6 | 7 | class DiskNumPyBuffer(NumPyBuffer): 8 | def __init__(self, 9 | memory_size, 10 | obs_dim, 11 | h_state_dim, 12 | action_dim, 13 | reward_dim, 14 | infos=dict(), 15 | disk_dir="./", 16 | history_length=0, 17 | checkpoint_interval=0, 18 | checkpoint_path=None, 19 | rng=np.random, 20 | dtype=np.float32): 21 | self.rng = rng 22 | self._memory_size = memory_size 23 | self._dtype = dtype 24 | os.makedirs(f"{disk_dir}", exist_ok=True) 25 | self.observations = np.memmap(filename=f"{disk_dir}/observations.npy", mode="w+", shape=(memory_size, *obs_dim), dtype=dtype) 26 | self.hidden_states = np.memmap(filename=f"{disk_dir}/hidden_states.npy", mode="w+", shape=(memory_size, *h_state_dim), dtype=dtype) 27 | self.actions = np.memmap(filename=f"{disk_dir}/actions.npy", mode="w+", shape=(memory_size, *action_dim), dtype=dtype) 28 | self.rewards = np.memmap(filename=f"{disk_dir}/rewards.npy", mode="w+", shape=(memory_size, *reward_dim), dtype=dtype) 29 | self.dones = np.memmap(filename=f"{disk_dir}/dones.npy", mode="w+", shape=(memory_size, 1), dtype=np.bool) 30 | self.infos = dict() 31 | for info_name, (info_shape, info_dtype) in infos.items(): 32 | self.infos[info_name] = np.memmap(filename=f"{disk_dir}/{info_name}.npy", mode="w+", shape=(memory_size, *info_shape), dtype=info_dtype) 33 | 34 | self._checkpoint_interval = checkpoint_interval 35 | self._checkpoint_idxes = np.ones(shape=memory_size, dtype=np.bool) 36 | if checkpoint_path is not None and memory_size >= checkpoint_interval > 0: 37 | self._checkpoint_path = checkpoint_path 38 | os.makedirs(checkpoint_path, exist_ok=True) 39 | self.checkpoint = self._checkpoint 40 | self._checkpoint_count = 0 41 | else: 42 | self.checkpoint = lambda: None 43 | 44 | self._pointer = 0 45 | self._count = 0 46 | self.history_length = history_length 47 | self.history_frame = np.zeros(shape=(history_length, *obs_dim), dtype=dtype) 48 | 49 | def save(self, save_path, end_with_done=True): 50 | pass 51 | -------------------------------------------------------------------------------- /scripts/experiments/dac.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | EXPBUF_LAST_SAMPLE_PROP=$6 # default is .95, 0. turns it off 9 | EXPBUF_MODEL_SAMPLE_RATE=$7 # default is .1, 0. 
turns it off 10 | EXPERIMENT_NAME="${EXPERT_DIR}_$8" 11 | 12 | 13 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 14 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 15 | echo "Using TOP_DIR OF ${TOP_DIR}" 16 | 17 | DEFAULT_STACK_DIR="stack/" 18 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 19 | 20 | DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 21 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 22 | 23 | DEFAULT_BRING_DIR="bring/" 24 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 25 | 26 | DEFAULT_INSERT_DIR="insert/" 27 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 28 | 29 | 30 | if [ "${MAIN_TASK}" = "stack" ]; then 31 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 32 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 33 | EXPERT_PATH="${PRE}2.gz" 34 | MAX_STEPS=2000000 35 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 36 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 37 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 38 | EXPERT_PATH="${PRE}2.gz" 39 | MAX_STEPS=2000000 40 | elif [ "${MAIN_TASK}" = "bring" ]; then 41 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 42 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 43 | EXPERT_PATH="${PRE}2.gz" 44 | MAX_STEPS=2000000 45 | elif [ "${MAIN_TASK}" = "insert" ]; then 46 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 47 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 48 | EXPERT_PATH="${PRE}2.gz" 49 | MAX_STEPS=4000000 50 | else 51 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 52 | exit 1 53 | fi 54 | 55 | echo "Running DAC for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 56 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 57 | 58 | PYTHON_TO_EXEC=$(cat <<-END 59 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_dac.py 60 | --seed ${SEED} 61 | --user_machine ${USER_MACHINE} 62 | --expert_path ${EXPERT_PATH} 63 | --main_task ${MAIN_TASK}_0 64 | --exp_name ${EXPERIMENT_NAME} 65 | --device ${DEVICE} 66 | --num_evals 50 67 | --max_steps ${MAX_STEPS} 68 | --expbuf_last_sample_prop=${EXPBUF_LAST_SAMPLE_PROP} 69 | --expbuf_model_sample_rate=${EXPBUF_MODEL_SAMPLE_RATE} 70 | END 71 | ) 72 | 73 | if [[ "${DEVICE}" == *"cuda"* ]]; then 74 | PYTHON_TO_EXEC+=" --gpu_buffer" 75 | fi 76 | 77 | python ${PYTHON_TO_EXEC} 78 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac_x/sac_x.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | import rl_sandbox.constants as c 4 | 5 | 6 | class SACX: 7 | def __init__(self, update_scheduler, update_intentions, algo_params): 8 | self.update_scheduler = update_scheduler 9 | self.update_intentions = update_intentions 10 | self.algo_params = algo_params 11 | self.buffer = update_intentions.buffer 12 | self.step = 0 13 | 14 | if hasattr(self.update_intentions, '_use_absorbing_state'): 15 | self._use_absorbing_state = self.update_intentions._use_absorbing_state 16 | else: 17 | self._use_absorbing_state = False 18 | 19 | def state_dict(self): 20 | state_dict = { 21 | c.SCHEDULER: self.update_scheduler.state_dict(), 22 | c.INTENTIONS: self.update_intentions.state_dict(), 23 | } 24 | return state_dict 25 | 26 | def load_state_dict(self, state_dict): 27 | self.update_scheduler.load_state_dict(state_dict[c.SCHEDULER]) 28 | self.update_intentions.load_state_dict(state_dict[c.INTENTIONS]) 29 | 30 | def update(self, curr_obs, curr_h_state, act, rew, done, info, next_obs, next_h_state, 31 | update_intentions=True, update_scheduler=True, update_buffer=True, 
update_info={}): 32 | update_info = update_info 33 | 34 | # Intention Learning 35 | updated_intentions = False 36 | if update_intentions: 37 | tic = timeit.default_timer() 38 | updated_intentions, intentions_info = self.update_intentions.update( 39 | curr_obs, curr_h_state, act, rew, done, info, next_obs, next_h_state, update_buffer=update_buffer) 40 | toc = timeit.default_timer() 41 | if updated_intentions: 42 | update_info[c.INTENTIONS_UPDATE_TIME] = toc - tic 43 | update_info.update(intentions_info) 44 | 45 | # Scheduler Learning 46 | if update_scheduler: 47 | self.step += 1 48 | tic = timeit.default_timer() 49 | updated_scheduler, scheduler_info = self.update_scheduler.update(curr_obs, act, rew, done, info) 50 | toc = timeit.default_timer() 51 | if updated_scheduler: 52 | update_info[c.SCHEDULER_UPDATE_TIME] = toc - tic 53 | update_info.update(scheduler_info) 54 | 55 | return updated_intentions, update_info 56 | 57 | def reset(self): 58 | if hasattr(self.update_scheduler, 'reset'): 59 | self.update_scheduler.reset() -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/wrappers/torch_buffer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import rl_sandbox.constants as c 4 | 5 | from rl_sandbox.buffers.wrappers.buffer_wrapper import BufferWrapper 6 | 7 | 8 | class TorchBuffer(BufferWrapper): 9 | def __init__(self, buffer): 10 | super().__init__(buffer) 11 | 12 | def _convert_batch_to_torch(self, obss, h_states, acts, rews, dones, infos, lengths): 13 | obss = torch.as_tensor(obss).float() 14 | h_states = torch.as_tensor(h_states).float() 15 | acts = torch.as_tensor(acts).float() 16 | rews = torch.as_tensor(rews).float() 17 | dones = torch.as_tensor(dones).long() 18 | infos = {k: torch.as_tensor(v) for k, v in infos.items()} 19 | lengths = torch.as_tensor(lengths).long() 20 | 21 | return obss, h_states, acts, rews, dones, infos, lengths 22 | 23 | def sample(self, batch_size, idxes=None): 24 | obss, h_states, acts, rews, dones, infos, lengths, idxes = super().sample(batch_size, idxes) 25 | return self._convert_batch_to_torch(obss, h_states, acts, rews, dones, infos, lengths) 26 | 27 | def sample_with_next_obs(self, batch_size, next_obs, next_h_state=None, idxes=None): 28 | obss, h_states, acts, rews, dones, next_obss, next_h_states, infos, lengths, _ = super().sample_with_next_obs(batch_size, next_obs, next_h_state, idxes) 29 | obss, h_states, acts, rews, dones, infos, lengths = self._convert_batch_to_torch(obss, h_states, acts, rews, dones, infos, lengths) 30 | next_obss = torch.as_tensor(next_obss).float() 31 | next_h_states = torch.as_tensor(next_h_states) 32 | 33 | return obss, h_states, acts, rews, dones, next_obss, next_h_states, infos, lengths 34 | 35 | def sample_consecutive(self, batch_size, end_with_done=False): 36 | obss, h_states, acts, rews, dones, infos, lengths, _ = super().sample_consecutive(batch_size, end_with_done) 37 | return self._convert_batch_to_torch(obss, h_states, acts, rews, dones, infos, lengths) 38 | 39 | def sample_init_obs(self, batch_size): 40 | obss, h_states = super().sample_init_obs(batch_size) 41 | return torch.as_tensor(obss).float(), torch.as_tensor(h_states).float() 42 | 43 | def sample_trajs(self, batch_size, next_obs, idxes=None, horizon_length=2): 44 | obss, h_states, acts, rews, dones, infos, lengths, ep_lengths, idxes = super().sample_trajs(batch_size, next_obs, idxes, horizon_length) 45 | obss, h_states, acts, rews, dones, 
infos, lengths = self._convert_batch_to_torch(obss, h_states, acts, rews, dones, infos, lengths) 46 | ep_lengths = torch.as_tensor(ep_lengths).long() 47 | idxes = torch.as_tensor(idxes).long() 48 | return obss, h_states, acts, rews, dones, infos, lengths, ep_lengths, idxes 49 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/evaluation.py: -------------------------------------------------------------------------------- 1 | import _pickle as pickle 2 | import numpy as np 3 | import torch 4 | 5 | from a2c_ppo_acktr import utils 6 | from a2c_ppo_acktr.envs import make_vec_envs 7 | 8 | 9 | def evaluate(actor_critic, eval_envs, eval_log_dir, device, eval_i, seed, env_name, render, reward_suc_wrapper=None, 10 | num_eval_eps=50): 11 | 12 | # obss = [[]] 13 | # rews = [[]] 14 | # sucs = [[]] 15 | # acts = [[]] 16 | # infos = [[]] 17 | returns = [] 18 | successes = [] 19 | success = None 20 | 21 | obs = eval_envs.reset() 22 | eval_recurrent_hidden_states = torch.zeros( 23 | 1, actor_critic.recurrent_hidden_state_size, device=device) 24 | eval_masks = torch.zeros(1, 1, device=device) 25 | 26 | num_eps = 0 27 | ep_return = 0 28 | success_latch = False 29 | 30 | while num_eps < num_eval_eps: 31 | with torch.no_grad(): 32 | _, action, _, eval_recurrent_hidden_states = actor_critic.act( 33 | obs.to(device), 34 | eval_recurrent_hidden_states, 35 | eval_masks, 36 | deterministic=True) 37 | 38 | if render: 39 | eval_envs.render() 40 | 41 | # Obser reward and next obs 42 | prev_obs = obs 43 | obs, rew, done, info = eval_envs.step(action) 44 | 45 | # fix reward 46 | if reward_suc_wrapper is not None: 47 | rew, success = reward_suc_wrapper.get_rew_suc(prev_obs, action, info) 48 | 49 | eval_masks = torch.tensor( 50 | [[0.0] if done_ else [1.0] for done_ in done], 51 | dtype=torch.float32, 52 | device=device) 53 | 54 | # obss[-1].append(obs) 55 | # rews[-1].append(rew) 56 | # sucs[-1].append(success) 57 | # acts[-1].append(action) 58 | # infos[-1].append(info) 59 | 60 | ep_return += rew 61 | if success: 62 | success_latch = True 63 | 64 | if done: 65 | num_eps += 1 66 | returns.append(ep_return) 67 | successes.append(int(success_latch)) 68 | ep_return = 0 69 | success_latch = False 70 | 71 | # obss.append([]) 72 | # rews.append([]) 73 | # sucs.append([]) 74 | # acts.append([]) 75 | # infos.append([]) 76 | 77 | # pickle.dump({ 78 | # "obss": obss, 79 | # "rews": rews, 80 | # "sucs": sucs, 81 | # "acts": acts, 82 | # "infos": infos, 83 | # }, open(f"{env_name}-{seed}-{eval_i}.pkl", "wb")) 84 | print(" Evaluation using {} episodes: mean reward {:.5f}, suc rate {:.5f} \n".format( 85 | num_eps, np.mean(returns), np.sum(successes) / len(successes))) 86 | 87 | return returns, successes 88 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/envs/wrappers/frame_stack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from collections import deque 4 | 5 | from rl_sandbox.envs.wrappers.wrapper import Wrapper 6 | 7 | 8 | class FrameStackWrapper(Wrapper): 9 | def __init__(self, env, num_frames): 10 | assert num_frames > 0 11 | super().__init__(env) 12 | self._num_frames = num_frames 13 | self.frames = deque([], maxlen=num_frames) 14 | 15 | def _get_obs(self): 16 | assert len(self.frames) == self._num_frames 17 | # return np.stack(self.frames) 18 | return np.concatenate(self.frames)[None, :] 19 | 20 | def reset(self, **kwargs): 21 | obs = 
self._env.reset(**kwargs) 22 | for _ in range(self._num_frames): 23 | self.frames.append(obs) 24 | 25 | return self._get_obs() 26 | 27 | def step(self, action, **kwargs): 28 | obs, reward, done, info = self._env.step(action, **kwargs) 29 | self.frames.append(obs) 30 | 31 | return self._get_obs(), reward, done, info 32 | 33 | def render(self, **kwargs): 34 | return self._env.render(**kwargs) 35 | 36 | def seed(self, seed): 37 | self._env.seed(seed) 38 | 39 | 40 | class OfflineFrameStack: 41 | def __init__(self, num_frames): 42 | self._num_frames = num_frames 43 | self.frames = deque([], maxlen=num_frames) 44 | 45 | def get_stacked_obs(self, obs): 46 | assert len(self.frames) == self._num_frames 47 | self.frames.append(obs) 48 | # return np.stack(self.frames) 49 | return np.concatenate(self.frames) 50 | 51 | def reset(self, obs): 52 | for _ in range(self._num_frames): 53 | self.frames.append(obs) 54 | 55 | return np.concatenate(self.frames) 56 | 57 | 58 | def make_frame_stack(num_frames, obss, dones, next_obss=None): 59 | # inefficiently doing this with a for loop for now 60 | stacked_obss = [] 61 | frame_stacker = OfflineFrameStack(num_frames) 62 | if next_obss is not None: 63 | stacked_next_obss = [] 64 | next_obss_frame_stacker = OfflineFrameStack(num_frames) 65 | 66 | new_ep = True 67 | 68 | for i in range(0, len(obss)): 69 | if new_ep: 70 | stacked_obss.append(frame_stacker.reset(obss[i])) 71 | if next_obss is not None: 72 | stacked_next_obss.append(next_obss_frame_stacker.reset(next_obss[i])) 73 | else: 74 | stacked_obss.append(frame_stacker.get_stacked_obs(obss[i])) 75 | if next_obss is not None: 76 | stacked_next_obss.append(next_obss_frame_stacker.get_stacked_obs(next_obss[i])) 77 | 78 | new_ep = dones[i] 79 | 80 | if next_obss is None: 81 | return np.vstack(stacked_obss) 82 | else: 83 | return np.vstack(stacked_obss), np.vstack(stacked_next_obss) -------------------------------------------------------------------------------- /scripts/experiments/lfgp.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | SCHEDULER=$6 # can be one of [wrs_plus_handcraft, wrs, learned, no_sched] 9 | EXPBUF_LAST_SAMPLE_PROP=$7 # default is .95, 0. turns it off 10 | EXPBUF_MODEL_SAMPLE_RATE=$8 # default is .1, 0. 
turns it off 11 | EXPERIMENT_NAME="${EXPERT_DIR}_$9" 12 | 13 | 14 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 15 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 16 | echo "Using TOP_DIR OF ${TOP_DIR}" 17 | 18 | DEFAULT_STACK_DIR="stack/" 19 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 20 | 21 | DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 22 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 23 | 24 | DEFAULT_BRING_DIR="bring/" 25 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 26 | 27 | DEFAULT_INSERT_DIR="insert/" 28 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 29 | 30 | 31 | if [ "${MAIN_TASK}" = "stack" ]; then 32 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 33 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 34 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 35 | MAX_STEPS=2000000 36 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 37 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 38 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 39 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 40 | MAX_STEPS=2000000 41 | elif [ "${MAIN_TASK}" = "bring" ]; then 42 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 43 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 44 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 45 | MAX_STEPS=2000000 46 | elif [ "${MAIN_TASK}" = "insert" ]; then 47 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 48 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 49 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz,${PRE}6.gz" 50 | MAX_STEPS=4000000 51 | else 52 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 53 | exit 1 54 | fi 55 | 56 | echo "Running LfGP sched ${SCHEDULER} for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 57 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 
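# SCHEDULER chooses how intentions are selected during training (wrs_plus_handcraft, wrs,
# learned, or no_sched, per the argument comment at the top of this script), and the two
# EXPBUF_* arguments control expert-buffer sampling (defaults .95 and .1; 0. disables each).
# Example invocation, with illustrative argument values:
#   bash lfgp.bash 1 cuda:0 stack 800_steps local wrs 0.95 0.1 my_experiment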
58 | 59 | PYTHON_TO_EXEC=$(cat <<-END 60 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_lfgp.py 61 | --seed=${SEED} 62 | --expert_path=${EXPERT_PATHS} 63 | --main_task=${MAIN_TASK}_0 64 | --device=${DEVICE} 65 | --exp_name=${EXPERIMENT_NAME} 66 | --user_machine=${USER_MACHINE} 67 | --scheduler=${SCHEDULER} 68 | --max_steps=${MAX_STEPS} 69 | --expbuf_last_sample_prop=${EXPBUF_LAST_SAMPLE_PROP} 70 | --expbuf_model_sample_rate=${EXPBUF_MODEL_SAMPLE_RATE} 71 | END 72 | ) 73 | 74 | if [[ "${DEVICE}" == *"cuda"* ]]; then 75 | PYTHON_TO_EXEC+=" --gpu_buffer" 76 | fi 77 | 78 | python ${PYTHON_TO_EXEC} 79 | -------------------------------------------------------------------------------- /scripts/experiments/multi_bc_no_overfit.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SEED=$1 4 | DEVICE=$2 5 | MAIN_TASK=$3 6 | EXPERT_DIR=$4 7 | USER_MACHINE=$5 8 | EXPERIMENT_NAME="${EXPERT_DIR}_$6" 9 | 10 | 11 | DEFAULT_TOP_DIR="../../lfgp_data/expert_data/" 12 | TOP_DIR=${LFGP_DATA_TOP_DIR:=${DEFAULT_TOP_DIR}} 13 | echo "Using TOP_DIR OF ${TOP_DIR}" 14 | 15 | DEFAULT_STACK_DIR="stack/" 16 | STACK_DIR=${STACK_DIR:=${DEFAULT_STACK_DIR}} 17 | 18 | DEFAULT_UNSTACK_DIR="unstack_stack_env_only/" 19 | UNSTACK_DIR=${UNSTACK_DIR:=${DEFAULT_UNSTACK_DIR}} 20 | 21 | DEFAULT_BRING_DIR="bring/" 22 | BRING_DIR=${BRING_DIR:=${DEFAULT_BRING_DIR}} 23 | 24 | DEFAULT_INSERT_DIR="insert/" 25 | INSERT_DIR=${INSERT_DIR:=${DEFAULT_INSERT_DIR}} 26 | 27 | 28 | if [ "${MAIN_TASK}" = "stack" ]; then 29 | EXPERT_PATH_MID="${TOP_DIR}${STACK_DIR}" 30 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 31 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 32 | NUM_TRAINING=20 33 | elif [ "${MAIN_TASK}" = "unstack_stack_env_only" ]; then 34 | EXPERT_PATH_MID="${TOP_DIR}${UNSTACK_DIR}" 35 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 36 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 37 | NUM_TRAINING=20 38 | elif [ "${MAIN_TASK}" = "bring" ]; then 39 | EXPERT_PATH_MID="${TOP_DIR}${BRING_DIR}" 40 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 41 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz" 42 | NUM_TRAINING=20 43 | elif [ "${MAIN_TASK}" = "insert" ]; then 44 | EXPERT_PATH_MID="${TOP_DIR}${INSERT_DIR}" 45 | PRE="${EXPERT_PATH_MID}${EXPERT_DIR}/int_" 46 | EXPERT_PATHS="${PRE}0.gz,${PRE}1.gz,${PRE}2.gz,${PRE}3.gz,${PRE}4.gz,${PRE}5.gz,${PRE}6.gz" 47 | NUM_TRAINING=40 48 | else 49 | echo "Invalid MAIN_TASK ${MAIN_TASK}" 50 | exit 1 51 | fi 52 | 53 | echo "Running Multi BC for seed ${SEED}, on device ${DEVICE}, main task ${MAIN_TASK}, expert dir ${EXPERT_DIR}." 54 | echo "User machine ${USER_MACHINE}, Experiment name ${EXPERIMENT_NAME}." 
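# Unlike multi_bc.bash (the "early stopping" variant above), this script fixes the update
# budget (--num_updates=100000) and passes a per-task --num_training value (20 for stack,
# unstack_stack_env_only, and bring; 40 for insert). Note the expert buffers are passed via
# --expert_paths (plural) here, versus --expert_path in multi_bc.bash.
# Example invocation, with illustrative argument values:
#   bash multi_bc_no_overfit.bash 1 cuda:0 insert 800_steps local my_experiment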
55 | 56 | PYTHON_TO_EXEC=$(cat <<-END 57 | ../../rl_sandbox/rl_sandbox/examples/lfgp/run_multitask_bc_no_overfit.py 58 | --seed=${SEED} 59 | --expert_paths=${EXPERT_PATHS} 60 | --main_task=${MAIN_TASK}_0 61 | --device=${DEVICE} 62 | --exp_name=${EXPERIMENT_NAME} 63 | --user_machine=${USER_MACHINE} 64 | --num_training=${NUM_TRAINING} 65 | --num_updates=100000 66 | END 67 | ) 68 | 69 | if [[ "${DEVICE}" == *"cuda"* ]]; then 70 | PYTHON_TO_EXEC+=" --gpu_buffer" 71 | fi 72 | 73 | python ${PYTHON_TO_EXEC} 74 | 75 | # python ../../rl_sandbox/rl_sandbox/examples/lfgp/run_multitask_bc_no_overfit.py \ 76 | # --seed="${SEED}" \ 77 | # --expert_paths="${EXPERT_PATHS}" \ 78 | # --main_task="${MAIN_TASK}_0" \ 79 | # --device="${DEVICE}" \ 80 | # --exp_name="${EXPERIMENT_NAME}" \ 81 | # --user_machine="${USER_MACHINE}" \ 82 | # --num_training="${NUM_TRAINING}" \ 83 | # --num_updates=100000 \ 84 | # --gpu_buffer -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/enjoy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | # workaround to unpickle olf model files 4 | import sys 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from a2c_ppo_acktr.envs import VecPyTorch, make_vec_envs 10 | from a2c_ppo_acktr.utils import get_render_func, get_vec_normalize 11 | 12 | sys.path.append('a2c_ppo_acktr') 13 | 14 | parser = argparse.ArgumentParser(description='RL') 15 | parser.add_argument( 16 | '--seed', type=int, default=1, help='random seed (default: 1)') 17 | parser.add_argument( 18 | '--log-interval', 19 | type=int, 20 | default=10, 21 | help='log interval, one log per n updates (default: 10)') 22 | parser.add_argument( 23 | '--env-name', 24 | default='PongNoFrameskip-v4', 25 | help='environment to train on (default: PongNoFrameskip-v4)') 26 | parser.add_argument( 27 | '--load-dir', 28 | default='./trained_models/', 29 | help='directory to save agent logs (default: ./trained_models/)') 30 | parser.add_argument( 31 | '--non-det', 32 | action='store_true', 33 | default=False, 34 | help='whether to use a non-deterministic policy') 35 | args = parser.parse_args() 36 | 37 | args.det = not args.non_det 38 | 39 | env = make_vec_envs( 40 | args.env_name, 41 | args.seed + 1000, 42 | 1, 43 | None, 44 | None, 45 | device='cpu', 46 | allow_early_resets=False) 47 | 48 | # Get a render function 49 | render_func = get_render_func(env) 50 | 51 | # We need to use the same statistics for normalization as used in training 52 | actor_critic, obs_rms = \ 53 | torch.load(os.path.join(args.load_dir, args.env_name + ".pt"), 54 | map_location='cpu') 55 | 56 | vec_norm = get_vec_normalize(env) 57 | if vec_norm is not None: 58 | vec_norm.eval() 59 | vec_norm.obs_rms = obs_rms 60 | 61 | recurrent_hidden_states = torch.zeros(1, 62 | actor_critic.recurrent_hidden_state_size) 63 | masks = torch.zeros(1, 1) 64 | 65 | obs = env.reset() 66 | 67 | if render_func is not None: 68 | render_func('human') 69 | 70 | if args.env_name.find('Bullet') > -1: 71 | import pybullet as p 72 | 73 | torsoId = -1 74 | for i in range(p.getNumBodies()): 75 | if (p.getBodyInfo(i)[0].decode() == "torso"): 76 | torsoId = i 77 | 78 | while True: 79 | with torch.no_grad(): 80 | value, action, _, recurrent_hidden_states = actor_critic.act( 81 | obs, recurrent_hidden_states, masks, deterministic=args.det) 82 | 83 | # Obser reward and next obs 84 | obs, reward, done, _ = env.step(action) 85 | 86 | masks.fill_(0.0 if done else 1.0) 87 | 88 | if 
args.env_name.find('Bullet') > -1: 89 | if torsoId > -1: 90 | distance = 5 91 | yaw = 0 92 | humanPos, humanOrn = p.getBasePositionAndOrientation(torsoId) 93 | p.resetDebugVisualizerCamera(distance, yaw, -20, humanPos) 94 | 95 | if render_func is not None: 96 | render_func('human') 97 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/algo/a2c_acktr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | 5 | from a2c_ppo_acktr.algo.kfac import KFACOptimizer 6 | 7 | 8 | class A2C_ACKTR(): 9 | def __init__(self, 10 | actor_critic, 11 | value_loss_coef, 12 | entropy_coef, 13 | lr=None, 14 | eps=None, 15 | alpha=None, 16 | max_grad_norm=None, 17 | acktr=False): 18 | 19 | self.actor_critic = actor_critic 20 | self.acktr = acktr 21 | 22 | self.value_loss_coef = value_loss_coef 23 | self.entropy_coef = entropy_coef 24 | 25 | self.max_grad_norm = max_grad_norm 26 | 27 | if acktr: 28 | self.optimizer = KFACOptimizer(actor_critic) 29 | else: 30 | self.optimizer = optim.RMSprop( 31 | actor_critic.parameters(), lr, eps=eps, alpha=alpha) 32 | 33 | def update(self, rollouts): 34 | obs_shape = rollouts.obs.size()[2:] 35 | action_shape = rollouts.actions.size()[-1] 36 | num_steps, num_processes, _ = rollouts.rewards.size() 37 | 38 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 39 | rollouts.obs[:-1].view(-1, *obs_shape), 40 | rollouts.recurrent_hidden_states[0].view( 41 | -1, self.actor_critic.recurrent_hidden_state_size), 42 | rollouts.masks[:-1].view(-1, 1), 43 | rollouts.actions.view(-1, action_shape)) 44 | 45 | values = values.view(num_steps, num_processes, 1) 46 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 47 | 48 | advantages = rollouts.returns[:-1] - values 49 | value_loss = advantages.pow(2).mean() 50 | 51 | action_loss = -(advantages.detach() * action_log_probs).mean() 52 | 53 | if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0: 54 | # Compute fisher, see Martens 2014 55 | self.actor_critic.zero_grad() 56 | pg_fisher_loss = -action_log_probs.mean() 57 | 58 | value_noise = torch.randn(values.size()) 59 | if values.is_cuda: 60 | value_noise = value_noise.cuda() 61 | 62 | sample_values = values + value_noise 63 | vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean() 64 | 65 | fisher_loss = pg_fisher_loss + vf_fisher_loss 66 | self.optimizer.acc_stats = True 67 | fisher_loss.backward(retain_graph=True) 68 | self.optimizer.acc_stats = False 69 | 70 | self.optimizer.zero_grad() 71 | (value_loss * self.value_loss_coef + action_loss - 72 | dist_entropy * self.entropy_coef).backward() 73 | 74 | if self.acktr == False: 75 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 76 | self.max_grad_norm) 77 | 78 | self.optimizer.step() 79 | 80 | return value_loss.item(), action_loss.item(), dist_entropy.item() 81 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/experts/scripted_policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from torch.distributions import Normal, Uniform 5 | 6 | 7 | class GripperIntentions: 8 | """ 9 | Intentions that respectively open and close the gripper while following Gaussian noise for other actions 10 | """ 11 | def __init__(self, action_dim, gripper_dim, 
means, vars): 12 | self._action_dim = action_dim 13 | self._gripper_dim = gripper_dim 14 | self._non_gripper_dim = torch.ones(self._action_dim) 15 | self._non_gripper_dim[gripper_dim] = 0 16 | self._non_gripper_dim = torch.where(self._non_gripper_dim) 17 | 18 | self.action_dist = Normal(loc=means, scale=vars) 19 | self.gripper_dist = Uniform(0., 1.) 20 | 21 | self._entropies = np.zeros((2, self._action_dim)) 22 | self._means = np.zeros((2, self._action_dim)) 23 | self._vars = np.ones((2, self._action_dim)) 24 | for idx in range(2): 25 | self._entropies[idx][self._non_gripper_dim] = self.action_dist.entropy().numpy() 26 | self._entropies[idx, self._gripper_dim] = self.gripper_dist.entropy().numpy() 27 | self._means[idx][self._non_gripper_dim] = self.action_dist.mean.numpy() 28 | self._means[idx, self._gripper_dim]= self.gripper_dist.mean.numpy() * ((-1) ** (idx + 1)) 29 | self._vars[idx][self._non_gripper_dim] = self.action_dist.variance.numpy() 30 | self._vars[idx, self._gripper_dim]= self.gripper_dist.variance.numpy() 31 | 32 | def compute_action(self, x, h): 33 | act = torch.zeros((2, self._action_dim), dtype=torch.float) 34 | log_probs = np.zeros((2, self._action_dim)) 35 | 36 | for idx in range(2): 37 | act[idx][self._non_gripper_dim] = self.action_dist.sample() 38 | act[idx, self._gripper_dim] = 1 - self.gripper_dist.sample() 39 | log_probs[idx][self._non_gripper_dim] = self.action_dist.log_prob(act[idx][self._non_gripper_dim]).numpy() 40 | log_probs[idx, self._gripper_dim] = self.gripper_dist.log_prob(act[idx, self._gripper_dim]).numpy() 41 | 42 | act[0, self._gripper_dim] *= -1. 43 | log_probs = np.sum(log_probs, axis=-1) 44 | 45 | return act, np.zeros(2), h[0].cpu().numpy(), log_probs, self._entropies, self._means, self._vars 46 | 47 | def deterministic_action(self, x, h): 48 | act = torch.zeros((2, self._action_dim), dtype=torch.float) 49 | log_probs = np.zeros((2, self._action_dim)) 50 | 51 | for idx in range(2): 52 | act[idx][self._non_gripper_dim] = self.action_dist.mean 53 | act[idx, self._gripper_dim] = 1. 54 | log_probs[idx][self._non_gripper_dim] = self.action_dist.log_prob(act[idx][self._non_gripper_dim]).numpy() 55 | log_probs[idx, self._gripper_dim] = self.gripper_dist.log_prob(act[idx, self._gripper_dim]).numpy() 56 | 57 | act[0, self._gripper_dim] *= -1. 58 | log_probs = np.sum(log_probs, axis=-1) 59 | 60 | return act, np.zeros(2), h[0].cpu().numpy(), log_probs, self._entropies, self._means, self._vars 61 | -------------------------------------------------------------------------------- /scripts/create_data/create_expert_data.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # NUM_STEPS_PER_BUFFER should be either an empty string, or a comma separated list, e.g. "9000, 0, 0, 0, 0, 0" 4 | 5 | MAIN_TASK=$1 6 | STEPS_PER_TASK=$2 7 | NUM_STEPS_PER_BUFFER=$3 8 | AUX_OVERRIDE=$4 9 | 10 | 11 | SAVE_PATH_POST="${STEPS_PER_TASK}_steps_no_extra_final" 12 | DEFAULT_TOP_DIR="../../lfgp_data" 13 | TOP_DIR=${LFGP_TOP_DIR:=${DEFAULT_TOP_DIR}} 14 | echo "Using TOP_DIR OF ${TOP_DIR}" 15 | FULL_PATH="${TOP_DIR}/trained_models/experts/${MAIN_TASK}" 16 | MODEL_PATH="${FULL_PATH}/state_dict.pt" 17 | CONFIG_PATH="${FULL_PATH}/sacx_experiment_setting.pkl" 18 | SAVE_PATH="${TOP_DIR}/custom_expert_data/${MAIN_TASK}/${SAVE_PATH_POST}/" 19 | 20 | 21 | echo "Generating data for ${MAIN_TASK}, ${STEPS_PER_TASK} steps per task." 
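# Example invocations (argument values illustrative only):
#   bash create_expert_data.bash stack 800
#     -> 800 steps per task, collected for every intention of the stack expert
#   bash create_expert_data.bash stack 800 "9000, 0, 0, 0, 0, 0" 2
#     -> per-buffer step counts as listed, collecting only the intention(s) in AUX_OVERRIDE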
22 | if [ -z "${NUM_STEPS_PER_BUFFER}" ]; then 23 | echo "NUM_STEPS_PER_BUFFER is unset, running for all tasks" 24 | else 25 | echo "Getting ${NUM_STEPS_PER_BUFFER} for each task, running task ${AUX_OVERRIDE} only." 26 | fi 27 | 28 | 29 | if [ "${MAIN_TASK}" = "insert" ]; then 30 | O_PRB="0.08333333333" 31 | FORCED_SCHEDULE="{0: {0: ([0, 1, 2, 3, 4, 5, 6], [${O_PRB}, ${O_PRB}, .5, ${O_PRB}, ${O_PRB}, ${O_PRB}, ${O_PRB}], ['k', 'd', 'd', 'd', 'd', 'd', 'd']), 70: 0}, 1: {0: 3, 15: 1}}" 32 | SCHEDULER_PERIOD=90 33 | if [ -z "${NUM_STEPS_PER_BUFFER}" ]; then 34 | NUM_STEPS_PER_BUFFER="" 35 | AUX_OVERRIDE="" 36 | fi 37 | 38 | elif [ "${MAIN_TASK}" = "stack" ]; then 39 | FORCED_SCHEDULE="{0: {0: ([0, 1, 2, 3, 4, 5], [.1, .1, .5, .1, .1, .1], ['k', 'd', 'd', 'd', 'd', 'd']), 45: 0}, 1: {0: 3, 15: 1}}" 40 | SCHEDULER_PERIOD=90 41 | if [ -z "${NUM_STEPS_PER_BUFFER}" ]; then 42 | NUM_STEPS_PER_BUFFER="" 43 | AUX_OVERRIDE="" 44 | fi 45 | 46 | elif [ "${MAIN_TASK}" = "bring" ]; then 47 | FORCED_SCHEDULE="{0: {0: ([0, 1, 3, 4, 5, 6], [.1, .1, .5, .1, .1, .1], ['k', 'd', 'd', 'd', 'd', 'd']), 45: 0}, 1: {0: 3, 15: 1}}" 48 | SCHEDULER_PERIOD=90 49 | if [ -z "${NUM_STEPS_PER_BUFFER}" ]; then 50 | NUM_STEPS_PER_BUFFER="" 51 | AUX_OVERRIDE="0,1,3,4,5,6" # to ensure we skip insert from this model 52 | fi 53 | 54 | elif [ "${MAIN_TASK}" = "unstack-stack" ]; then 55 | FORCED_SCHEDULE="{0: {0: ([0, 1, 2, 4, 5, 6], [.1, .1, .5, .1, .1, .1], ['k', 'd', 'd', 'd', 'd', 'd']), 45: 0}, 1: {0: 3, 15: 1}}" 56 | SCHEDULER_PERIOD=120 57 | if [ -z "${NUM_STEPS_PER_BUFFER}" ]; then 58 | NUM_STEPS_PER_BUFFER="" 59 | AUX_OVERRIDE="0,1,2,4,5,6" # to ensure we skip unstack from this model 60 | fi 61 | 62 | fi 63 | 64 | echo "Saving to ${SAVE_PATH}" 65 | 66 | python ../../rl_sandbox/rl_sandbox/examples/lfgp/experts/create_expert_data.py \ 67 | --model_path="${MODEL_PATH}" \ 68 | --config_path="${CONFIG_PATH}" \ 69 | --save_path="${SAVE_PATH}" \ 70 | --num_episodes=10000000 \ 71 | --num_steps="${STEPS_PER_TASK}" \ 72 | --seed=1 \ 73 | --forced_schedule="${FORCED_SCHEDULE}" \ 74 | --scheduler_period="${SCHEDULER_PERIOD}" \ 75 | --success_only \ 76 | --reset_on_success \ 77 | --reset_between_intentions \ 78 | --aux_override="${AUX_OVERRIDE}" -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/distributions.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from a2c_ppo_acktr.utils import AddBias, init 8 | 9 | """ 10 | Modify standard PyTorch distributions so they are compatible with this code. 
11 | """ 12 | 13 | # 14 | # Standardize distribution interfaces 15 | # 16 | 17 | # Categorical 18 | class FixedCategorical(torch.distributions.Categorical): 19 | def sample(self): 20 | return super().sample().unsqueeze(-1) 21 | 22 | def log_probs(self, actions): 23 | return ( 24 | super() 25 | .log_prob(actions.squeeze(-1)) 26 | .view(actions.size(0), -1) 27 | .sum(-1) 28 | .unsqueeze(-1) 29 | ) 30 | 31 | def mode(self): 32 | return self.probs.argmax(dim=-1, keepdim=True) 33 | 34 | 35 | # Normal 36 | class FixedNormal(torch.distributions.Normal): 37 | def log_probs(self, actions): 38 | return super().log_prob(actions).sum(-1, keepdim=True) 39 | 40 | def entropy(self): 41 | return super().entropy().sum(-1) 42 | 43 | def mode(self): 44 | return self.mean 45 | 46 | 47 | # Bernoulli 48 | class FixedBernoulli(torch.distributions.Bernoulli): 49 | def log_probs(self, actions): 50 | return super.log_prob(actions).view(actions.size(0), -1).sum(-1).unsqueeze(-1) 51 | 52 | def entropy(self): 53 | return super().entropy().sum(-1) 54 | 55 | def mode(self): 56 | return torch.gt(self.probs, 0.5).float() 57 | 58 | 59 | class Categorical(nn.Module): 60 | def __init__(self, num_inputs, num_outputs): 61 | super(Categorical, self).__init__() 62 | 63 | init_ = lambda m: init( 64 | m, 65 | nn.init.orthogonal_, 66 | lambda x: nn.init.constant_(x, 0), 67 | gain=0.01) 68 | 69 | self.linear = init_(nn.Linear(num_inputs, num_outputs)) 70 | 71 | def forward(self, x): 72 | x = self.linear(x) 73 | return FixedCategorical(logits=x) 74 | 75 | 76 | class DiagGaussian(nn.Module): 77 | def __init__(self, num_inputs, num_outputs): 78 | super(DiagGaussian, self).__init__() 79 | 80 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 81 | constant_(x, 0)) 82 | 83 | self.fc_mean = init_(nn.Linear(num_inputs, num_outputs)) 84 | self.logstd = AddBias(torch.zeros(num_outputs)) 85 | 86 | def forward(self, x): 87 | action_mean = self.fc_mean(x) 88 | 89 | # An ugly hack for my KFAC implementation. 90 | zeros = torch.zeros(action_mean.size()) 91 | if x.is_cuda: 92 | zeros = zeros.cuda() 93 | 94 | action_logstd = self.logstd(zeros) 95 | return FixedNormal(action_mean, action_logstd.exp()) 96 | 97 | 98 | class Bernoulli(nn.Module): 99 | def __init__(self, num_inputs, num_outputs): 100 | super(Bernoulli, self).__init__() 101 | 102 | init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init. 
103 | constant_(x, 0)) 104 | 105 | self.linear = init_(nn.Linear(num_inputs, num_outputs)) 106 | 107 | def forward(self, x): 108 | x = self.linear(x) 109 | return FixedBernoulli(logits=x) 110 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/train_bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gzip 3 | import pickle 4 | 5 | import rl_sandbox.constants as c 6 | 7 | from rl_sandbox.algorithms.bc.bc import BC 8 | from rl_sandbox.auxiliary_tasks.utils import make_auxiliary_tasks 9 | from rl_sandbox.buffers.utils import make_buffer 10 | from rl_sandbox.envs.fake_env import FakeEnv 11 | from rl_sandbox.envs.utils import make_env 12 | from rl_sandbox.learning_utils import train 13 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 14 | from rl_sandbox.agents.rl_agents import ACAgent 15 | from rl_sandbox.transforms.general_transforms import Identity 16 | from rl_sandbox.utils import make_summary_writer, set_seed 17 | 18 | def train_bc(experiment_config): 19 | seed = experiment_config[c.SEED] 20 | save_path = experiment_config.get(c.SAVE_PATH, None) 21 | buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity()) 22 | 23 | set_seed(seed) 24 | train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM]) 25 | model = make_model(experiment_config[c.MODEL_SETTING]) 26 | 27 | 28 | # drop memory size for expert buffers to only what is needed 29 | load_path = experiment_config[c.EXPERT_BUFFER] 30 | with gzip.open(load_path, "rb") as f: 31 | data = pickle.load(f) 32 | experiment_config[c.BUFFER_SETTING][c.KWARGS][c.MEMORY_SIZE] = data[c.MEMORY_SIZE] 33 | expert_buffer = make_buffer(experiment_config[c.BUFFER_SETTING], seed, experiment_config[c.EXPERT_BUFFER]) 34 | optimizer = make_optimizer(model.parameters(), experiment_config[c.OPTIMIZER_SETTING][c.POLICY]) 35 | 36 | aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS], 37 | model, 38 | expert_buffer, 39 | experiment_config) 40 | 41 | learning_algorithm = BC(model=model, 42 | optimizer=optimizer, 43 | expert_buffer=expert_buffer, 44 | algo_params=experiment_config, 45 | aux_tasks=aux_tasks) 46 | 47 | load_model = experiment_config.get(c.LOAD_MODEL, False) 48 | if load_model: 49 | learning_algorithm.load_state_dict(torch.load(load_model)) 50 | 51 | agent = ACAgent(model=model, 52 | learning_algorithm=learning_algorithm, 53 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 54 | evaluation_env = None 55 | evaluation_agent = None 56 | if experiment_config.get(c.EVALUATION_FREQUENCY, 0): 57 | evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1) 58 | evaluation_agent = ACAgent(model=model, 59 | learning_algorithm=None, 60 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 61 | 62 | summary_writer, save_path = make_summary_writer(save_path=save_path, algo=c.BC, cfg=experiment_config) 63 | train(agent=agent, 64 | evaluation_agent=evaluation_agent, 65 | train_env=train_env, 66 | evaluation_env=evaluation_env, 67 | buffer_preprocess=buffer_preprocessing, 68 | experiment_settings=experiment_config, 69 | auxiliary_reward=experiment_config[c.EVALUATION_REWARD_FUNC], 70 | summary_writer=summary_writer, 71 | save_path=save_path) 72 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/train_bc_no_overfit.py: -------------------------------------------------------------------------------- 
1 | import torch 2 | import gzip 3 | import pickle 4 | 5 | import rl_sandbox.constants as c 6 | 7 | from rl_sandbox.algorithms.bc.bc_no_overfit import BC 8 | from rl_sandbox.auxiliary_tasks.utils import make_auxiliary_tasks 9 | from rl_sandbox.buffers.utils import make_buffer 10 | from rl_sandbox.envs.fake_env import FakeEnv 11 | from rl_sandbox.envs.utils import make_env 12 | from rl_sandbox.learning_utils import train 13 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 14 | from rl_sandbox.agents.rl_agents import ACAgent 15 | from rl_sandbox.transforms.general_transforms import Identity 16 | from rl_sandbox.utils import make_summary_writer, set_seed 17 | 18 | def train_bc_no_overfit(experiment_config): 19 | seed = experiment_config[c.SEED] 20 | save_path = experiment_config.get(c.SAVE_PATH, None) 21 | buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity()) 22 | 23 | set_seed(seed) 24 | train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM]) 25 | model = make_model(experiment_config[c.MODEL_SETTING]) 26 | 27 | 28 | # drop memory size for expert buffers to only what is needed 29 | load_path = experiment_config[c.EXPERT_BUFFER] 30 | with gzip.open(load_path, "rb") as f: 31 | data = pickle.load(f) 32 | experiment_config[c.BUFFER_SETTING][c.KWARGS][c.MEMORY_SIZE] = data[c.MEMORY_SIZE] 33 | expert_buffer = make_buffer(experiment_config[c.BUFFER_SETTING], seed, experiment_config[c.EXPERT_BUFFER]) 34 | optimizer = make_optimizer(model.parameters(), experiment_config[c.OPTIMIZER_SETTING][c.POLICY]) 35 | 36 | aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS], 37 | model, 38 | expert_buffer, 39 | experiment_config) 40 | 41 | learning_algorithm = BC(model=model, 42 | optimizer=optimizer, 43 | expert_buffer=expert_buffer, 44 | algo_params=experiment_config, 45 | aux_tasks=aux_tasks) 46 | 47 | load_model = experiment_config.get(c.LOAD_MODEL, False) 48 | if load_model: 49 | learning_algorithm.load_state_dict(torch.load(load_model)) 50 | 51 | agent = ACAgent(model=model, 52 | learning_algorithm=learning_algorithm, 53 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 54 | evaluation_env = None 55 | evaluation_agent = None 56 | if experiment_config.get(c.EVALUATION_FREQUENCY, 0): 57 | evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1) 58 | evaluation_agent = ACAgent(model=model, 59 | learning_algorithm=None, 60 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 61 | 62 | summary_writer, save_path = make_summary_writer(save_path=save_path, algo=c.BC, cfg=experiment_config) 63 | train(agent=agent, 64 | evaluation_agent=evaluation_agent, 65 | train_env=train_env, 66 | evaluation_env=evaluation_env, 67 | buffer_preprocess=buffer_preprocessing, 68 | experiment_settings=experiment_config, 69 | auxiliary_reward=experiment_config[c.EVALUATION_REWARD_FUNC], 70 | summary_writer=summary_writer, 71 | save_path=save_path) 72 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gzip 3 | import pickle 4 | 5 | import rl_sandbox.constants as c 6 | 7 | from rl_sandbox.buffers.disk_buffer import DiskNumPyBuffer 8 | from rl_sandbox.buffers.ram_buffer import NumPyBuffer, NextStateNumPyBuffer, TrajectoryNumPyBuffer 9 | from rl_sandbox.buffers.torch_pin_buffer import TorchPinBuffer, TrajectoryPinBuffer 10 | 11 | 12 | def 
make_buffer(buffer_cfg, seed=None, load_buffer=False, start_idx=0, end_idx=None, match_load_size=False, 13 | frame_stack_load=1): 14 | if match_load_size and load_buffer: 15 | with gzip.open(load_buffer, "rb") as f: 16 | data = pickle.load(f) 17 | original_size = buffer_cfg[c.KWARGS][c.MEMORY_SIZE] 18 | buffer_cfg[c.KWARGS][c.MEMORY_SIZE] = data[c.MEMORY_SIZE] 19 | 20 | if seed is None: 21 | seed = np.random.randint(0, 2 ** 32 - 1) 22 | 23 | buffer_cfg[c.KWARGS][c.RNG] = np.random.RandomState(seed) 24 | 25 | if buffer_cfg[c.STORAGE_TYPE] == c.DISK: 26 | buffer = DiskNumPyBuffer(**buffer_cfg[c.KWARGS]) 27 | elif buffer_cfg[c.STORAGE_TYPE] == c.RAM: 28 | buffer_type = buffer_cfg.get(c.BUFFER_TYPE, c.DEFAULT) 29 | assert buffer_type in c.VALID_BUFFER_TYPE, f"Invalid buffer type: {buffer_type}" 30 | 31 | # this line for compatibility with old code 32 | if buffer_cfg.get(c.STORE_NEXT_OBSERVATION, False): 33 | buffer_type = c.STORE_NEXT_OBSERVATION 34 | 35 | if buffer_type == c.DEFAULT: 36 | buffer = NumPyBuffer(**buffer_cfg[c.KWARGS]) 37 | elif buffer_type == c.STORE_NEXT_OBSERVATION: 38 | buffer = NextStateNumPyBuffer(**buffer_cfg[c.KWARGS]) 39 | elif buffer_type == c.TRAJECTORY: 40 | buffer = TrajectoryNumPyBuffer(**buffer_cfg[c.KWARGS]) 41 | else: 42 | raise NotImplementedError 43 | 44 | elif buffer_cfg[c.STORAGE_TYPE] == c.GPU: 45 | buffer = TorchPinBuffer(**buffer_cfg[c.KWARGS]) 46 | elif buffer_cfg[c.STORAGE_TYPE] == c.NSTEP_GPU: 47 | buffer = TrajectoryPinBuffer(**buffer_cfg[c.KWARGS]) 48 | else: 49 | raise NotImplementedError 50 | 51 | for wrapper_config in buffer_cfg[c.BUFFER_WRAPPERS]: 52 | buffer = wrapper_config[c.WRAPPER](buffer, **wrapper_config[c.KWARGS]) 53 | 54 | if load_buffer: 55 | buffer.load(load_buffer, load_rng=seed==None, start_idx=start_idx, end_idx=end_idx, frame_stack=frame_stack_load) 56 | if match_load_size: 57 | buffer_cfg[c.KWARGS][c.MEMORY_SIZE] = original_size 58 | 59 | return buffer 60 | 61 | 62 | def get_default_buffer(memory_size, obs_dim, action_dim): 63 | buffer_settings = { 64 | c.KWARGS: { 65 | c.MEMORY_SIZE: memory_size, 66 | c.OBS_DIM: (obs_dim,), 67 | c.H_STATE_DIM: (1,), 68 | c.ACTION_DIM: (action_dim,), 69 | c.REWARD_DIM: (1,), 70 | c.INFOS: {c.MEAN: ((action_dim,), np.float32), 71 | c.VARIANCE: ((action_dim,), np.float32), 72 | c.ENTROPY: ((action_dim,), np.float32), 73 | c.LOG_PROB: ((1,), np.float32), 74 | c.VALUE: ((1,), np.float32), 75 | c.DISCOUNTING: ((1,), np.float32)}, 76 | c.CHECKPOINT_INTERVAL: 0, 77 | c.CHECKPOINT_PATH: None, 78 | }, 79 | c.STORAGE_TYPE: c.RAM, 80 | c.BUFFER_TYPE: c.STORE_NEXT_OBSERVATION, 81 | c.BUFFER_WRAPPERS: [], 82 | c.LOAD_BUFFER: False, 83 | } 84 | return make_buffer(buffer_settings) -------------------------------------------------------------------------------- /scripts/plotting/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common things for many plotting files. 
3 | task_inds are: 0--stack, 1--unstack-stack, 2--bring, 3--insert 4 | """ 5 | 6 | import os 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | ALGO_TITLE_DICT = { 11 | 'lfgp_wrs': 'LfGP (multi)', 12 | 'multitask_bc': 'BC (multi)', 13 | 'dac': 'DAC (single)', 14 | 'bc': 'BC (single)' 15 | } 16 | 17 | def get_path_defaults(fig_name, task_inds=(0,1,2,3)): 18 | root_dir = "/media/starslab/users/trevor-ablett/dac-x" 19 | fig_path = root_dir + "/figures/" + fig_name 20 | experiment_root_dir = root_dir + "/play_xyz" 21 | seeds = ['1','2','3','4','5'] 22 | expert_root = os.path.join(root_dir, "play_xyz/expert-data") 23 | expert_perf_files = [ 24 | os.path.join(expert_root, "open-close-stack-lift-reach-move/policies/05-09-21_21_57_07/eval_1999999_100_eps_per_int.pkl"), 25 | os.path.join(expert_root, "open-close-unstackstack-lift-reach-move-35M/policies/08-29-21_23_05_03/eval_3499999_100_eps_per_int.pkl"), 26 | os.path.join(expert_root, "open-close-insert-bring-lift-reach-move/policies/05-30-21_20_38_48/eval_1299999_100_eps_per_int.pkl"), 27 | os.path.join(expert_root, "open-close-insert-bring-lift-reach-move/policies/05-30-21_20_38_48/eval_2699999_100_eps_per_int.pkl")] 28 | expert_perf_file_main_task_i = [2, 2, 3, 2] 29 | 30 | out_epf = [] 31 | out_epf_mti = [] 32 | for i in task_inds: 33 | out_epf.append(expert_perf_files[i]) 34 | out_epf_mti.append(expert_perf_file_main_task_i[i]) 35 | 36 | return root_dir, fig_path, experiment_root_dir, seeds, expert_root, out_epf, out_epf_mti 37 | 38 | 39 | def get_task_defaults(task_inds=(0,1,2,3)): 40 | task_dir_names = ["stack_0", "unstack_stack_env_only_0", "bring_0", "insert_0"] 41 | valid_task = [True, True, True, True] 42 | task_titles = ["Stack", "Unstack-Stack", "Bring", "Insert"] 43 | main_task_i = [2, 2, 2, 2] 44 | num_aux = [6, 6, 6, 7] 45 | task_data_filenames = ['train.pkl', 'train.pkl', 'train_rerun.pkl', 'train.pkl'] 46 | num_eval_steps_to_use = [20, 20, 20, 40] 47 | 48 | out_tdn = [] 49 | out_vt = [] 50 | out_tt = [] 51 | out_mti = [] 52 | out_na = [] 53 | out_tdf = [] 54 | out_nestu = [] 55 | for i in task_inds: 56 | out_tdn.append(task_dir_names[i]) 57 | out_vt.append(valid_task[i]) 58 | out_tt.append(task_titles[i]) 59 | out_mti.append(main_task_i[i]) 60 | out_na.append(num_aux[i]) 61 | out_tdf.append(task_data_filenames[i]) 62 | out_nestu.append(num_eval_steps_to_use[i]) 63 | 64 | return out_tdn, out_vt, out_tt, out_mti, out_na, out_tdf, out_nestu 65 | 66 | 67 | def get_algo_defaults(): 68 | algo_dir_names=['lfgp_wrs', 'multitask_bc', 'dac', 'bc'] 69 | algo_titles = ['LfGP (multi)', 'BC (multi)', 'DAC (single)', 'BC (single)'] 70 | multitask_algos = ['multitask_bc', 'lfgp_wrs'] 71 | eval_eps_per_task = 50 72 | 73 | return algo_dir_names, algo_titles, multitask_algos, eval_eps_per_task 74 | 75 | 76 | def get_fig_defaults(num_plots=4): 77 | fig_shape = [1, num_plots] # row x col 78 | plot_size = [3.2, 2.4] 79 | num_stds = 1 80 | font_size = 16 81 | eval_interval = 100000 82 | cmap = plt.get_cmap("tab10") 83 | linewidth = 1 84 | std_alpha = .5 85 | x_val_scale = 1e6 86 | subsample_rate = 1 # 1 for no subsample 87 | include_expert_baseline = True 88 | 89 | return fig_shape, plot_size, num_stds, font_size, eval_interval, cmap, linewidth, std_alpha, x_val_scale, subsample_rate, \ 90 | include_expert_baseline -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_rewards/manipulator_learning/panda/lift_xyz_state.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from numpy.linalg import norm 4 | 5 | 6 | TABLE_HEIGHT = 0.6247 # not definitively defined anywhere, just found through trial and error 7 | BLOCK_HEIGHT_ON_TABLE = 0.6550 # again, trial and error 8 | 9 | # taken directly from sac-x paper 10 | def opened(info, **kwargs): 11 | return 1 if np.all(info["infos"][-1]['grip_pos'] >= .9) else 0 12 | 13 | def closed(info, **kwargs): 14 | return 1 if np.all(info["infos"][-1]['grip_pos'] <= .1) else 0 15 | 16 | def lifted(info, max_rew_height=.1, **kwargs): 17 | block_height = info["infos"][-1]['obj_pos_world'][0][2] - BLOCK_HEIGHT_ON_TABLE 18 | if block_height > max_rew_height: 19 | return 1.5 20 | elif block_height < .005: 21 | return 0 22 | else: 23 | return block_height / max_rew_height 24 | 25 | # this is just the generic one, not meant to be used on its own as an aux reward 26 | def close(dist_thresh, obj_1_pos, obj_2_pos, tanh_multiplier=10.0, close_rew=1.5): 27 | dist = norm(obj_1_pos - obj_2_pos) 28 | if dist < dist_thresh: 29 | return close_rew 30 | else: 31 | # return 1 - (np.tanh(dist / 10))**2 # from SAC-X paper, but very poorly scaled for meters as units 32 | return 1 - np.tanh(tanh_multiplier * dist) 33 | 34 | def hand_block_close(info, **kwargs): 35 | return close(0.0, info["infos"][-1]['obj_pos'][:3], info["infos"][-1]['pos']) # only for first aka blue block 36 | 37 | # modified rewards to make more "human like" intentions 38 | def open_action(action, **kwargs): 39 | action_mag = norm(action[:3]) 40 | open_rew = 1 if action[-1] < 0 else 0 41 | return open_rew - .5 * action_mag 42 | 43 | def close_action(action, **kwargs): 44 | action_mag = norm(action[:3]) 45 | close_rew = 1 if action[-1] > 0 else 0 46 | return close_rew - .5 * action_mag 47 | 48 | def hand_block_close_speed_penalty(info, action, **kwargs): 49 | close_rew = hand_block_close(info, **kwargs) 50 | dist = norm(info["infos"][-1]['obj_pos'][:3] - info["infos"][-1]['pos']) 51 | action_mag = norm(action[:3]) 52 | speed_penalty = (1. 
- np.tanh(10 * dist)) * action_mag 53 | return close_rew 54 | 55 | 56 | class PandaLiftXYZStateAuxiliaryReward: 57 | def __init__(self, aux_rewards=(open_action, close_action, lifted, hand_block_close_speed_penalty), include_main=True): 58 | self._aux_rewards = aux_rewards 59 | self._include_main = include_main 60 | 61 | # self._done_failure_reward = -5 62 | # self._done_success_reward = 100 63 | 64 | @property 65 | def num_auxiliary_rewards(self): 66 | return len(self._aux_rewards) 67 | 68 | def reward(self, 69 | observation, 70 | action, 71 | reward, 72 | done, 73 | next_observation, 74 | info): 75 | observation = observation.reshape(-1) 76 | next_observation = next_observation.reshape(-1) 77 | reward_vector = [] 78 | if self._include_main: 79 | reward_vector.append(reward) 80 | for task_reward in self._aux_rewards: 81 | reward_vector.append(task_reward(observation=observation, 82 | action=action, 83 | reward=reward, 84 | next_observation=next_observation, 85 | done=done, 86 | info=info)) 87 | 88 | return np.array(reward_vector, dtype=np.float32) 89 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/buffers/wrappers/noise_wrapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import rl_sandbox.constants as c 4 | 5 | from rl_sandbox.buffers.wrappers.buffer_wrapper import BufferWrapper 6 | from rl_sandbox.model_architectures.utils import RunningMeanStd 7 | 8 | 9 | class NoiseBuffer(BufferWrapper): 10 | def __init__(self, buffer, noise_magnitude, update_on_sample=False): 11 | super().__init__(buffer) 12 | 13 | self._noise_mag = noise_magnitude 14 | self._obs_mean = None 15 | self._obs_std = None 16 | self._dim_noise_mag = None 17 | if update_on_sample: 18 | self._rms = None 19 | 20 | def sample(self, *args, **kwargs): 21 | buf_data = super().sample(*args, **kwargs) 22 | obss = buf_data[0] 23 | 24 | if self._dim_noise_mag is None and not hasattr(self, '_rms'): self.update_stats() 25 | 26 | if hasattr(self, '_rms'): 27 | if self._rms is None: 28 | self._rms = RunningMeanStd(shape=(obss.shape[-1],), device=self.device) 29 | self._rms.update(obss) 30 | self._obs_mean = self._rms.mean 31 | self._obs_std = self._rms.std 32 | self._dim_noise_mag = self._noise_mag * self._obs_std 33 | 34 | 35 | obss_noise = torch.randn_like(obss) * self._dim_noise_mag 36 | obss += obss_noise # changes buf_data as well 37 | 38 | return buf_data 39 | 40 | def sample_with_next_obs(self, *args, **kwargs): 41 | buf_data = super().sample_with_next_obs(*args, **kwargs) 42 | obss = buf_data[0] 43 | next_obss = buf_data[5] 44 | 45 | if self._dim_noise_mag is None and not hasattr(self, '_rms'): self.update_stats() 46 | 47 | if hasattr(self, '_rms'): 48 | if self._rms is None: 49 | self._rms = RunningMeanStd(shape=(obss.shape[-1],), device=self.device) 50 | self._rms.update(obss) 51 | self._obs_mean = self._rms.mean 52 | self._obs_std = self._rms.std 53 | self._dim_noise_mag = self._noise_mag * self._obs_std 54 | 55 | obss_noise = torch.randn_like(obss) * self._dim_noise_mag 56 | next_obss_noise = torch.randn_like(next_obss) * self._dim_noise_mag 57 | obss += obss_noise # changes buf_data as well 58 | next_obss += next_obss_noise # changes buf_data as well 59 | 60 | return buf_data 61 | 62 | def sample_trajs(self, *args, **kwargs): 63 | buf_data = super().sample_trajs(*args, **kwargs) 64 | obss = buf_data[0] 65 | next_obss = buf_data[5] # TODO this only works with the way torch pin buffer is set up for now 66 
| 67 | if self._dim_noise_mag is None and not hasattr(self, '_rms'): self.update_stats() 68 | 69 | if hasattr(self, '_rms'): 70 | if self._rms is None: 71 | self._rms = RunningMeanStd(shape=(obss.shape[-1],), device=self.device) 72 | self._rms.update(obss) 73 | self._obs_mean = self._rms.mean 74 | self._obs_std = self._rms.std 75 | self._dim_noise_mag = self._noise_mag * self._obs_std 76 | 77 | 78 | obss_noise = torch.randn_like(obss) * self._dim_noise_mag 79 | next_obss_noise = torch.randn_like(next_obss) * self._dim_noise_mag 80 | obss += obss_noise # changes buf_data as well 81 | next_obss += next_obss_noise # changes buf_data as well 82 | 83 | return buf_data 84 | 85 | def update_stats(self): 86 | self._obs_mean = self.buffer.observations.mean(axis=0) 87 | self._obs_std = self.buffer.observations.std(axis=0) 88 | self._dim_noise_mag = self._noise_mag * self._obs_std 89 | -------------------------------------------------------------------------------- /six_state_mdp.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | # CONFIG 4 | max_iters = 5000 5 | alpha = .1 6 | stop_tol = 1e-5 7 | max_action_q_learning = False # if true, q learning instead of SARSA 8 | initial_q = 0 9 | 10 | q_table = { 11 | (1, None): 0, 12 | (2, None): 0, 13 | (3, None): 0, 14 | (4, None): 0, 15 | (5, None): 0, 16 | (1, 2): initial_q, 17 | (1, 5): initial_q, 18 | (2, 3): initial_q, 19 | (2, 6): initial_q, 20 | (3, 4): initial_q, 21 | (3, 6): initial_q, 22 | (4, 5): initial_q, 23 | (4, 6): initial_q, 24 | (5, 1): initial_q, 25 | (5, 5): initial_q, 26 | (6, 1): initial_q, 27 | (6, 6): initial_q 28 | } 29 | q_new = copy.deepcopy(q_table) 30 | 31 | r_table = { 32 | (1, 2): 1, 33 | (1, 5): -1, 34 | (2, 3): 1, 35 | (2, 6): -1, 36 | (3, 4): 1, 37 | (3, 6): -1, 38 | (4, 5): 1, 39 | (4, 6): -1, 40 | (5, 1): -1, 41 | (5, 5): 1, 42 | (6, 1): -1, 43 | (6, 6): -1 44 | } 45 | 46 | buffer = [ 47 | ((1, 5), (5, 5)), # ep 2 48 | ((5, 5), (5, 5)), 49 | ((5, 5), (5, 5)), 50 | ((5, 5), (5, 5)), 51 | ((5, 5), (5, None)), 52 | ((1, 2), (2, 6)), # ep 1 53 | ((2, 6), (6, 1)), 54 | ((6, 1), (1, 5)), 55 | ((1, 5), (5, 5)), 56 | ((5, 5), (5, None)), 57 | ((1, 2), (2, 3)), # ep 3 58 | ((2, 3), (3, 6)), 59 | ((3, 6), (6, 1)), 60 | ((6, 1), (1, 5)), 61 | ((1, 5), (5, None)), 62 | ] 63 | 64 | valid_update_states = [pair[0] for pair in buffer] 65 | 66 | for ep_i in range(len(buffer) // 5): 67 | 68 | for i in range(max_iters): 69 | 70 | short_buffer = buffer[:(ep_i + 1) * 5] 71 | for (state_act, next_state_act) in short_buffer: 72 | 73 | if next_state_act[1] is None: # equivalent of done, so update exclusively uses reward 74 | q_new[state_act] = q_new[state_act] + alpha * (r_table[state_act] - q_new[state_act]) 75 | 76 | else: 77 | if max_action_q_learning: 78 | max_state_act_val = -1e100 79 | max_state_act = None 80 | 81 | for qt_state_act in q_table: 82 | if qt_state_act[0] == next_state_act[0] and q_new[qt_state_act] > max_state_act_val\ 83 | and qt_state_act[1] is not None and qt_state_act in valid_update_states: 84 | # print(f"New max: next state act {qt_state_act}, " 85 | # f"val {q_new[qt_state_act]} used for updating {state_act}") 86 | max_state_act_val = q_new[qt_state_act] 87 | max_state_act = qt_state_act 88 | 89 | # print(f"Selected max next state act {max_state_act}, " 90 | # f"val {q_new[max_state_act]} used for updating {state_act}") 91 | 92 | q_new[state_act] = q_new[state_act] + alpha * (r_table[state_act] + 93 | q_new[max_state_act] - q_new[state_act]) 94 | else: 95 | 
q_new[state_act] = q_new[state_act] + alpha * (r_table[state_act] + 96 | q_new[next_state_act] - q_new[state_act]) 97 | 98 | total_diff = 0 99 | for state_act in q_table.keys(): 100 | total_diff += abs(q_new[state_act] - q_table[state_act]) 101 | 102 | q_table = copy.deepcopy(q_new) 103 | if total_diff < stop_tol: 104 | break 105 | 106 | print(f"End of ep: {ep_i}, current q(1, 5): {q_table[(1, 5)]}, q(1, 2): {q_table[(1, 2)]}") 107 | 108 | for k in q_table: 109 | q_table[k] = round(q_table[k], 4) 110 | 111 | print(f"Final Q Table after {i} iterations: {q_table}") -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/default_configs/dac.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | import torch 5 | 6 | import rl_sandbox.auxiliary_rewards.manipulator_learning.panda.play_xyz_state as p_aux 7 | import rl_sandbox.constants as c 8 | import rl_sandbox.examples.lfgp.experiment_utils as exp_utils 9 | 10 | 11 | from rl_sandbox.model_architectures.actor_critics.fully_connected_soft_actor_critic import FullyConnectedSeparate, \ 12 | FullyConnectedSquashedGaussianSAC 13 | import rl_sandbox.examples.lfgp.default_configs.common as common_default 14 | 15 | 16 | def reward_func(reward, **kwargs): return np.array([reward]) 17 | 18 | def get_settings(args): 19 | if args.env_type == c.MANIPULATOR_LEARNING: 20 | common_default.main_task_alias_set(args) 21 | obs_dim, action_dim = common_default.get_obs_action_dim(args) 22 | common_default.default_settings(args) 23 | device = torch.device(args.device) 24 | num_tasks = 1 25 | 26 | if args.env_type == c.MANIPULATOR_LEARNING: 27 | save_path = exp_utils.get_save_path(c.DAC, args.main_task, args.seed, args.exp_name, args.top_save_path) 28 | else: 29 | save_path = exp_utils.get_save_path(c.DAC, args.env_name, args.seed, args.exp_name, args.top_save_path) 30 | 31 | # expert path 32 | expert_buffer = os.path.join(args.expert_top_dir, args.expert_dir_rest, args.expert_filenames) 33 | 34 | # reward options -- ensure we get the correct aux reward 35 | if args.env_type == c.MANIPULATOR_LEARNING: 36 | aux_reward_all = p_aux.PandaPlayXYZStateAuxiliaryReward(args.main_task, include_main=False) 37 | aux_reward_names = [func.__qualname__ for func in aux_reward_all._aux_rewards] 38 | 39 | if "unstack" in args.main_task: 40 | aux_reward_name = "stack_0" 41 | elif "insert" in args.main_task: 42 | aux_reward_name = "insert_0" 43 | else: 44 | aux_reward_name = args.main_task 45 | 46 | if 'no_move' in aux_reward_name: 47 | task_name = aux_reward_name.split('_no_move_')[0] 48 | aux_reward_name = f"{task_name}_0" 49 | 50 | eval_reward = aux_reward_all._aux_rewards[aux_reward_names.index(aux_reward_name)] 51 | 52 | elif args.env_type in [c.SAWYER, c.HAND_DAPG, c.PANDA_RL_ENVS]: 53 | eval_reward = None # uses env reward 54 | 55 | else: 56 | raise NotImplementedError("Not yet implemented for other env types") 57 | 58 | buffer_settings, expert_buffer_settings = common_default.get_buffer_settings( 59 | args, obs_dim, action_dim, num_tasks, False, device) 60 | 61 | ##### populate settings dictionary ##### 62 | experiment_setting = { 63 | **common_default.get_rl_settings(args, obs_dim, action_dim, args.num_evals_per_task), 64 | **common_default.get_train_settings(args, action_dim, device), 65 | c.DISCRIMINATOR_SETTING: common_default.get_discriminator_settings(args, obs_dim, action_dim, num_tasks, device), 66 | c.OPTIMIZER_SETTING: 
common_default.get_optimizer_settings(args), 67 | c.BUFFER_SETTING: buffer_settings, 68 | c.EXPERT_BUFFER_SETTING: expert_buffer_settings, 69 | 70 | # Model 71 | c.MODEL_SETTING: { 72 | c.MODEL_ARCHITECTURE: FullyConnectedSeparate if args.no_shared_layers else FullyConnectedSquashedGaussianSAC, 73 | c.KWARGS: { 74 | **common_default.get_model_kwargs(args, obs_dim, action_dim, device), 75 | } 76 | }, 77 | 78 | # DAC 79 | c.EXPERT_BUFFER: expert_buffer, 80 | c.EXPERT_AMOUNT: int(args.expert_amounts), 81 | c.EVALUATION_REWARD_FUNC: eval_reward, 82 | 83 | # Save 84 | c.SAVE_PATH: save_path, 85 | } 86 | 87 | if args.full_traj_expert_filenames: 88 | experiment_setting[c.FT_EXPERT_BUFFER] = os.path.join(args.expert_top_dir, args.ft_expert_dir_rest, args.expert_filenames) 89 | 90 | exp_utils.config_check(experiment_setting, args.top_save_path) 91 | 92 | return experiment_setting 93 | 94 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | import rl_sandbox.constants as c 6 | 7 | def default_weight_init(m): 8 | if type(m) == nn.Linear: 9 | torch.nn.init.xavier_uniform_(m.weight) 10 | # torch.nn.init.orthogonal_(m.weight) 11 | if m.bias is not None: 12 | m.bias.data.fill_(0) 13 | elif type(m) == nn.Conv2d: 14 | torch.nn.init.kaiming_uniform_(m.weight) 15 | if m.bias is not None: 16 | m.bias.data.fill_(0) 17 | elif type(m) == nn.LSTM or type(m) == nn.GRU: 18 | torch.nn.init.xavier_uniform_(m.weight_ih_l0) 19 | torch.nn.init.orthogonal_(m.weight_hh_l0) 20 | if m.bias is not None: 21 | m.bias_ih_l0.data.fill_(0) 22 | m.bias_hh_l0.data.fill_(0) 23 | 24 | 25 | def construct_linear_layers(layers): 26 | linear_layers = nn.ModuleList() 27 | for (in_dim, out_dim, activation, use_bias, dropout_p) in layers: 28 | linear_layers.append(nn.Linear(in_dim, out_dim, bias=use_bias)) 29 | linear_layers.append(activation) 30 | if dropout_p > 0.: 31 | linear_layers.append(nn.Dropout(dropout_p)) 32 | 33 | return linear_layers 34 | 35 | 36 | def make_model(model_cfg): 37 | return model_cfg[c.MODEL_ARCHITECTURE](**model_cfg[c.KWARGS]) 38 | 39 | 40 | def make_optimizer(parameters, optimizer_cfg): 41 | return optimizer_cfg[c.OPTIMIZER](parameters, **optimizer_cfg[c.KWARGS]) 42 | 43 | 44 | class RunningMeanStd(): 45 | """ Modified from Baseline 46 | Assumes shape to be (number of inputs, input_shape) 47 | """ 48 | 49 | def __init__(self, epsilon=1e-4, shape=(), norm_dim=(0,), a_min=-5., a_max=5., device='cpu'): 50 | assert epsilon > 0. 
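        # NOTE: `epsilon` plays two roles here: it seeds `count` so the first call to
        # `update_from_moments` is well defined, and it stabilizes the square root used in
        # `normalize`. `self.std` is only assigned inside `update_from_moments`, so
        # `unnormalize` assumes `update` has been called at least once.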
51 | self.shape = shape 52 | self.device = torch.device(device) 53 | self.mean = torch.zeros(shape, dtype=torch.float) 54 | self.var = torch.ones(shape, dtype=torch.float) 55 | self.epsilon = epsilon 56 | self.count = epsilon 57 | self.a_min = a_min 58 | self.a_max = a_max 59 | self.norm_dim = norm_dim 60 | self.to(self.device) 61 | 62 | def to(self, device): 63 | self.mean = self.mean.to(device) 64 | self.var = self.var.to(device) 65 | eps = torch.tensor([self.epsilon]) 66 | self.epsilon = eps.to(device) 67 | 68 | def update(self, x): 69 | batch_mean = torch.mean(x, dim=self.norm_dim) 70 | batch_var = torch.var(x, dim=self.norm_dim) 71 | batch_count = int(torch.prod(torch.tensor( 72 | [x.shape[dim] for dim in self.norm_dim]))) 73 | self.update_from_moments(batch_mean, batch_var, batch_count) 74 | 75 | def update_from_moments(self, batch_mean, batch_var, batch_count): 76 | delta = batch_mean - self.mean 77 | tot_count = self.count + batch_count 78 | 79 | new_mean = self.mean + delta * batch_count / tot_count 80 | m_a = self.var * self.count 81 | m_b = batch_var * batch_count 82 | M2 = m_a + m_b + (delta ** 2) * self.count * batch_count / tot_count 83 | new_var = M2 / tot_count 84 | new_count = tot_count 85 | 86 | self.mean = new_mean 87 | self.var = new_var 88 | self.std = torch.sqrt(self.var + self.epsilon) 89 | self.count = new_count 90 | 91 | def normalize(self, x): 92 | x_shape = x.shape 93 | x = x.reshape(-1, *self.shape).to(self.device) 94 | normalized_x = torch.clamp((x - self.mean) / torch.sqrt(self.var + self.epsilon), 95 | min=self.a_min, 96 | max=self.a_max) 97 | normalized_x[normalized_x != normalized_x] = 0. 98 | normalized_x = normalized_x.reshape(x_shape) 99 | return normalized_x 100 | 101 | def unnormalize(self, x): 102 | # return x * torch.sqrt(self.var + self.epsilon) + self.mean 103 | return x * self.std + self.mean 104 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/algo/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | 7 | class PPO(): 8 | def __init__(self, 9 | actor_critic, 10 | clip_param, 11 | ppo_epoch, 12 | num_mini_batch, 13 | value_loss_coef, 14 | entropy_coef, 15 | lr=None, 16 | eps=None, 17 | max_grad_norm=None, 18 | use_clipped_value_loss=True): 19 | 20 | self.actor_critic = actor_critic 21 | 22 | self.clip_param = clip_param 23 | self.ppo_epoch = ppo_epoch 24 | self.num_mini_batch = num_mini_batch 25 | 26 | self.value_loss_coef = value_loss_coef 27 | self.entropy_coef = entropy_coef 28 | 29 | self.max_grad_norm = max_grad_norm 30 | self.use_clipped_value_loss = use_clipped_value_loss 31 | 32 | self.optimizer = optim.Adam(actor_critic.parameters(), lr=lr, eps=eps) 33 | 34 | def update(self, rollouts): 35 | advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] 36 | advantages = (advantages - advantages.mean()) / ( 37 | advantages.std() + 1e-5) 38 | 39 | value_loss_epoch = 0 40 | action_loss_epoch = 0 41 | dist_entropy_epoch = 0 42 | 43 | for e in range(self.ppo_epoch): 44 | if self.actor_critic.is_recurrent: 45 | data_generator = rollouts.recurrent_generator( 46 | advantages, self.num_mini_batch) 47 | else: 48 | data_generator = rollouts.feed_forward_generator( 49 | advantages, self.num_mini_batch) 50 | 51 | for sample in data_generator: 52 | obs_batch, recurrent_hidden_states_batch, actions_batch, \ 53 | 
value_preds_batch, return_batch, masks_batch, old_action_log_probs_batch, \ 54 | adv_targ = sample 55 | 56 | # Reshape to do in a single forward pass for all steps 57 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 58 | obs_batch, recurrent_hidden_states_batch, masks_batch, 59 | actions_batch) 60 | 61 | ratio = torch.exp(action_log_probs - 62 | old_action_log_probs_batch) 63 | surr1 = ratio * adv_targ 64 | surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 65 | 1.0 + self.clip_param) * adv_targ 66 | action_loss = -torch.min(surr1, surr2).mean() 67 | 68 | if self.use_clipped_value_loss: 69 | value_pred_clipped = value_preds_batch + \ 70 | (values - value_preds_batch).clamp(-self.clip_param, self.clip_param) 71 | value_losses = (values - return_batch).pow(2) 72 | value_losses_clipped = ( 73 | value_pred_clipped - return_batch).pow(2) 74 | value_loss = 0.5 * torch.max(value_losses, 75 | value_losses_clipped).mean() 76 | else: 77 | value_loss = 0.5 * (return_batch - values).pow(2).mean() 78 | 79 | self.optimizer.zero_grad() 80 | (value_loss * self.value_loss_coef + action_loss - 81 | dist_entropy * self.entropy_coef).backward() 82 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 83 | self.max_grad_norm) 84 | self.optimizer.step() 85 | 86 | value_loss_epoch += value_loss.item() 87 | action_loss_epoch += action_loss.item() 88 | dist_entropy_epoch += dist_entropy.item() 89 | 90 | num_updates = self.ppo_epoch * self.num_mini_batch 91 | 92 | value_loss_epoch /= num_updates 93 | action_loss_epoch /= num_updates 94 | dist_entropy_epoch /= num_updates 95 | 96 | return value_loss_epoch, action_loss_epoch, dist_entropy_epoch 97 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/train_multitask_bc.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import gzip 3 | 4 | import torch 5 | 6 | import rl_sandbox.constants as c 7 | 8 | from rl_sandbox.algorithms.bc.bc import MultitaskBC 9 | from rl_sandbox.algorithms.sac_x.schedulers import FixedScheduler, RecycleScheduler 10 | from rl_sandbox.auxiliary_tasks.utils import make_auxiliary_tasks 11 | from rl_sandbox.buffers.utils import make_buffer 12 | from rl_sandbox.envs.fake_env import FakeEnv 13 | from rl_sandbox.envs.utils import make_env 14 | from rl_sandbox.learning_utils import train 15 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 16 | from rl_sandbox.agents.hrl_agents import SACXAgent 17 | from rl_sandbox.transforms.general_transforms import Identity 18 | from rl_sandbox.utils import make_summary_writer, set_seed 19 | 20 | def train_multitask_bc(experiment_config): 21 | seed = experiment_config[c.SEED] 22 | save_path = experiment_config.get(c.SAVE_PATH, None) 23 | buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity()) 24 | num_tasks = experiment_config[c.NUM_TASKS] 25 | 26 | set_seed(seed) 27 | train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM]) 28 | model = make_model(experiment_config[c.MODEL_SETTING]) 29 | 30 | assert num_tasks == len(experiment_config[c.EXPERT_BUFFERS]) == experiment_config[c.AUXILIARY_REWARDS].num_auxiliary_rewards 31 | expert_buffers = [] 32 | for load_path in experiment_config[c.EXPERT_BUFFERS]: 33 | # drop memory size for expert buffers to only what is needed 34 | with gzip.open(load_path, "rb") as f: 35 | data = pickle.load(f) 36 | experiment_config[c.BUFFER_SETTING][c.KWARGS][c.MEMORY_SIZE] = 
data[c.MEMORY_SIZE] 37 | 38 | expert_buffers.append(make_buffer(experiment_config[c.BUFFER_SETTING], seed, load_path)) 39 | 40 | optimizer = make_optimizer(model.parameters(), experiment_config[c.OPTIMIZER_SETTING][c.POLICY]) 41 | aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS], 42 | model, 43 | expert_buffers[0], 44 | experiment_config) 45 | 46 | learning_algorithm = MultitaskBC(model=model, 47 | optimizer=optimizer, 48 | expert_buffers=expert_buffers, 49 | algo_params=experiment_config, 50 | aux_tasks=aux_tasks) 51 | 52 | load_model = experiment_config.get(c.LOAD_MODEL, False) 53 | if load_model: 54 | learning_algorithm.load_state_dict(torch.load(load_model)) 55 | 56 | agent = SACXAgent(scheduler=FixedScheduler(num_tasks=num_tasks, 57 | intention_i=0), 58 | intentions=model, 59 | learning_algorithm=learning_algorithm, 60 | scheduler_period=c.MAX_INT, 61 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 62 | evaluation_env = None 63 | evaluation_agent = None 64 | if experiment_config.get(c.EVALUATION_FREQUENCY, 0): 65 | assert experiment_config[c.NUM_EVALUATION_EPISODES] % num_tasks == 0 66 | evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1) 67 | evaluation_agent = SACXAgent(scheduler=RecycleScheduler(num_tasks=num_tasks, 68 | scheduling=[experiment_config[c.NUM_EVALUATION_EPISODES] // num_tasks] * num_tasks), 69 | intentions=model, 70 | learning_algorithm=None, 71 | scheduler_period=c.MAX_INT, 72 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 73 | 74 | summary_writer, save_path = make_summary_writer(save_path=save_path, algo=c.MULTITASK_BC, cfg=experiment_config) 75 | train(agent=agent, 76 | evaluation_agent=evaluation_agent, 77 | train_env=train_env, 78 | evaluation_env=evaluation_env, 79 | auxiliary_reward=experiment_config[c.AUXILIARY_REWARDS].reward, 80 | buffer_preprocess=buffer_preprocessing, 81 | experiment_settings=experiment_config, 82 | summary_writer=summary_writer, 83 | save_path=save_path) 84 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/auxiliary_tasks/koopman.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | import rl_sandbox.constants as c 6 | 7 | from rl_sandbox.auxiliary_tasks.auxiliary_tasks import AuxiliaryTask 8 | from rl_sandbox.model_architectures.shared import Flatten 9 | 10 | class Koopman(AuxiliaryTask): 11 | def __init__(self, 12 | rec_dim, 13 | batch_size, 14 | encoder, 15 | decoder, 16 | dynamics, 17 | opt, 18 | buffer, 19 | algo_params, 20 | reduction=c.SUM, 21 | loss_coef=1., 22 | device=torch.device(c.CPU), 23 | **kwargs): 24 | # Image dim: (num_images, num_frames, height, width) 25 | assert len(rec_dim) == 4 26 | super().__init__() 27 | self._flat = Flatten() 28 | 29 | self._rec_dim = rec_dim 30 | self._flatten_dim = int(np.product(rec_dim)) 31 | self._batch_size = batch_size 32 | 33 | self._buffer = buffer 34 | self._encoder = encoder 35 | self._decoder = decoder 36 | self._dynamics = dynamics 37 | self._opt = opt 38 | 39 | self._loss_coef = loss_coef 40 | self._mse = torch.nn.MSELoss(reduction=reduction) 41 | 42 | self.device = device 43 | self.algo_params = algo_params 44 | self.train_preprocessing = algo_params[c.TRAIN_PREPROCESSING] 45 | 46 | def state_dict(self): 47 | return { 48 | c.DECODER: self._decoder.state_dict(), 49 | c.KOOPMAN_DYNAMICS: self._dynamics.state_dict(), 50 | c.KOOPMAN_OPTIMIZER: 
self._opt.state_dict() 51 | } 52 | 53 | def load_state_dict(self, state_dict): 54 | self._decoder.load_state_dict(state_dict[c.DECODER]) 55 | self._dynamics.load_state_dict(state_dict[c.KOOPMAN_DYNAMICS]) 56 | self._opt.load_state_dict(state_dict[c.KOOPMAN_OPTIMIZER]) 57 | 58 | @property 59 | def opt(self): 60 | return self._opt 61 | 62 | def compute_loss(self, next_obs, next_h_state): 63 | obss, _, acts, _, dones, next_obss, _, _ = self._buffer.sample_with_next_obs( 64 | self._batch_size, next_obs, next_h_state) 65 | 66 | obss = self.train_preprocessing(obss) 67 | next_obss = self.train_preprocessing(next_obss) 68 | 69 | batch_size = obss.shape[0] 70 | 71 | x = obss[:, :self._flatten_dim].reshape( 72 | batch_size * self._rec_dim[0], *self._rec_dim[1:]).to(self.device) 73 | 74 | z_hat = self._encoder(x) 75 | x_hat = self._decoder(z_hat) 76 | 77 | # Compute autoencoder reconstruction loss 78 | ae_loss = self._mse(x_hat, x) 79 | 80 | # This only looks at observations with valid transitions 81 | valid_ind = torch.where(dones == 0)[0] 82 | 83 | z_hat = z_hat[valid_ind] 84 | x_hat = x_hat[valid_ind] 85 | 86 | # Compute MSE K(g(x{n})) + B(u_{n}) and g(x_{n+1}) 87 | next_x = next_obss[valid_ind, :self._flatten_dim].reshape( 88 | len(valid_ind) * self._rec_dim[0], *self._rec_dim[1:]).to(self.device) 89 | z_next_hat = self._encoder(next_x) 90 | 91 | z_next_trans = self._dynamics(z_hat, acts[valid_ind]) 92 | transition_loss = self._mse(z_next_hat, z_next_trans) 93 | 94 | # Compute MSE of future state reconstruction 95 | # Compute reconstruction of K(g(x{n})) + B(u_{n}), which is approximately = g(x_{n+1}) 96 | x_next_hat = self._decoder(z_next_hat) 97 | x_next_trans = self._decoder(z_next_trans) 98 | 99 | # Compute reconstruction from z_{n+1} 100 | future_rec_loss = self._mse(x_next_hat, x_next_trans) 101 | 102 | return self._loss_coef * (ae_loss + transition_loss + future_rec_loss) 103 | 104 | 105 | class KoopmanDynamics(nn.Module): 106 | def __init__(self, z_dim, u_dim, device=torch.device(c.CPU)): 107 | super().__init__() 108 | self.device = device 109 | 110 | self.K = torch.nn.Linear(z_dim, z_dim) 111 | self.B = torch.nn.Linear(u_dim, z_dim) 112 | 113 | self.to(device) 114 | 115 | def forward(self, z, u): 116 | z, u = z.to(self.device), u.to(self.device) 117 | Kz = self.K(z) 118 | Bu = self.B(u) 119 | 120 | return Kz + Bu 121 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/train_multitask_bc_no_overfit.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import gzip 3 | 4 | import torch 5 | 6 | import rl_sandbox.constants as c 7 | 8 | from rl_sandbox.algorithms.bc.bc_no_overfit import MultitaskBC 9 | from rl_sandbox.algorithms.sac_x.schedulers import FixedScheduler, RecycleScheduler 10 | from rl_sandbox.auxiliary_tasks.utils import make_auxiliary_tasks 11 | from rl_sandbox.buffers.utils import make_buffer 12 | from rl_sandbox.envs.fake_env import FakeEnv 13 | from rl_sandbox.envs.utils import make_env 14 | from rl_sandbox.learning_utils import train 15 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 16 | from rl_sandbox.agents.hrl_agents import SACXAgent 17 | from rl_sandbox.transforms.general_transforms import Identity 18 | from rl_sandbox.utils import make_summary_writer, set_seed 19 | from rl_sandbox.examples.lfgp.experts.subsample_expert_data import subsample_buffers 20 | 21 | def train_multitask_bc_no_overfit(experiment_config): 22 | seed = 
experiment_config[c.SEED] 23 | save_path = experiment_config.get(c.SAVE_PATH, None) 24 | buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity()) 25 | num_tasks = experiment_config[c.NUM_TASKS] 26 | 27 | set_seed(seed) 28 | train_env = FakeEnv(obs_dim=experiment_config[c.OBS_DIM]) 29 | model = make_model(experiment_config[c.MODEL_SETTING]) 30 | 31 | assert num_tasks == len(experiment_config[c.EXPERT_BUFFERS]) == experiment_config[c.AUXILIARY_REWARDS].num_auxiliary_rewards 32 | expert_buffers = [] 33 | for load_path in experiment_config[c.EXPERT_BUFFERS]: 34 | # drop memory size for expert buffers to only what is needed 35 | with gzip.open(load_path, "rb") as f: 36 | data = pickle.load(f) 37 | experiment_config[c.BUFFER_SETTING][c.KWARGS][c.MEMORY_SIZE] = data[c.MEMORY_SIZE] 38 | 39 | expert_buffers.append(make_buffer(experiment_config[c.BUFFER_SETTING], seed, load_path)) 40 | 41 | if experiment_config.get(c.EXPERT_BUFFER_SUBSAMPLING, None) is not None: 42 | expert_buffers = subsample_buffers(expert_buffers, experiment_config[c.EXPERT_BUFFER_SUBSAMPLING]) 43 | 44 | optimizer = make_optimizer(model.parameters(), experiment_config[c.OPTIMIZER_SETTING][c.POLICY]) 45 | aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS], 46 | model, 47 | expert_buffers[0], 48 | experiment_config) 49 | 50 | learning_algorithm = MultitaskBC(model=model, 51 | optimizer=optimizer, 52 | expert_buffers=expert_buffers, 53 | algo_params=experiment_config, 54 | aux_tasks=aux_tasks) 55 | 56 | load_model = experiment_config.get(c.LOAD_MODEL, False) 57 | if load_model: 58 | learning_algorithm.load_state_dict(torch.load(load_model)) 59 | 60 | agent = SACXAgent(scheduler=FixedScheduler(num_tasks=num_tasks, 61 | intention_i=0), 62 | intentions=model, 63 | learning_algorithm=learning_algorithm, 64 | scheduler_period=c.MAX_INT, 65 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 66 | evaluation_env = None 67 | evaluation_agent = None 68 | if experiment_config.get(c.EVALUATION_FREQUENCY, 0): 69 | assert experiment_config[c.NUM_EVALUATION_EPISODES] % num_tasks == 0 70 | evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1) 71 | evaluation_agent = SACXAgent(scheduler=RecycleScheduler(num_tasks=num_tasks, 72 | scheduling=[experiment_config[c.NUM_EVALUATION_EPISODES] // num_tasks] * num_tasks), 73 | intentions=model, 74 | learning_algorithm=None, 75 | scheduler_period=c.MAX_INT, 76 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 77 | 78 | summary_writer, save_path = make_summary_writer(save_path=save_path, algo=c.MULTITASK_BC, cfg=experiment_config) 79 | train(agent=agent, 80 | evaluation_agent=evaluation_agent, 81 | train_env=train_env, 82 | evaluation_env=evaluation_env, 83 | auxiliary_reward=experiment_config[c.AUXILIARY_REWARDS].reward, 84 | buffer_preprocess=buffer_preprocessing, 85 | experiment_settings=experiment_config, 86 | summary_writer=summary_writer, 87 | save_path=save_path) 88 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/agents/rl_agents.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from torch.distributions import Categorical, Normal 5 | 6 | import rl_sandbox.constants as c 7 | from rl_sandbox.agents.random_agents import UniformContinuousAgent 8 | 9 | 10 | class RLAgent(): 11 | def __init__(self, model, learning_algorithm): 12 | self.model = model 13 | self.learning_algorithm = 
learning_algorithm 14 | 15 |     def update(self, curr_obs, curr_h_state, action, reward, done, info, next_obs, next_h_state, **kwargs): 16 |         return self.learning_algorithm.update(curr_obs, 17 |                                                curr_h_state, 18 |                                                action, 19 |                                                reward, 20 |                                                done, 21 |                                                info, 22 |                                                next_obs, 23 |                                                next_h_state, 24 |                                                **kwargs) 25 | 26 |     def compute_action(self, obs, **kwargs): 27 |         raise NotImplementedError 28 | 29 |     def reset(self): 30 |         # Returns initial hidden state 31 |         if hasattr(self.model, c.INITIALIZE_HIDDEN_STATE): 32 |             return self.model.initialize_hidden_state().numpy().astype(np.float32) 33 |         return np.array([np.nan], dtype=np.float32) 34 | 35 | 36 | class ACAgent(RLAgent): 37 |     def __init__(self, model, learning_algorithm, preprocess=lambda obs: obs): 38 |         super().__init__(model=model, 39 |                          learning_algorithm=learning_algorithm) 40 |         self.preprocess = preprocess 41 | 42 |     def preprocess(self, obs):  # NOTE: shadowed by the self.preprocess attribute assigned in __init__, so this default is effectively unused 43 |         return obs 44 | 45 |     def compute_action(self, obs, hidden_state): 46 |         obs = torch.tensor(obs).unsqueeze(0) 47 |         obs = self.preprocess(obs) 48 |         hidden_state = torch.tensor(hidden_state).unsqueeze(0) 49 |         action, value, hidden_state, log_prob, entropy, mean, variance = self.model.compute_action( 50 |             obs, hidden_state) 51 |         act_info = {c.VALUE: value, 52 |                     c.LOG_PROB: log_prob, 53 |                     c.ENTROPY: entropy, 54 |                     c.MEAN: mean, 55 |                     c.VARIANCE: variance} 56 |         return action, hidden_state, act_info 57 | 58 |     def deterministic_action(self, obs, hidden_state): 59 |         obs = torch.tensor(obs).unsqueeze(0) 60 |         obs = self.preprocess(obs) 61 |         hidden_state = torch.tensor(hidden_state).unsqueeze(0) 62 |         action, value, hidden_state, log_prob, entropy = self.model.deterministic_action( 63 |             obs, hidden_state) 64 |         act_info = {c.VALUE: value, 65 |                     c.LOG_PROB: log_prob, 66 |                     c.ENTROPY: entropy} 67 |         return action, hidden_state, act_info 68 | 69 | 70 | class ACAgentEUniformExplorer(ACAgent): 71 |     """ Agent that enforces more exploration. 72 | 73 |     prob_explore_ep: probability of executing an "exploration" episode. Determined during call to agent.reset(). 74 |     prob_explore_act: probability of executing an exploratory action during exploration episode. 75 |     max_repeat: max number of timesteps to repeat exploratory action. 76 |     min_repeat: min number of timesteps to repeat exploratory action.
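    min_action/max_action: bounds of the uniform random exploratory actions.

    At reset(), an episode is flagged as an exploration episode with probability prob_explore_ep.
    Within such an episode, each step either keeps repeating the current exploratory action,
    starts a new uniform random action with probability prob_explore_act (repeated for a random
    number of steps in [min_repeat, max_repeat)), or falls back to the usual ACAgent.compute_action.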
77 | """ 78 | def __init__(self, model, learning_algorithm, prob_explore_ep, prob_explore_act, max_repeat, min_repeat, 79 | min_action=-1, max_action=1, preprocess=lambda obs: obs): 80 | super().__init__(model, learning_algorithm, preprocess) 81 | self._prob_explore_ep = prob_explore_ep 82 | self._prob_explore_act = prob_explore_act 83 | self._max_repeat = max_repeat 84 | self._min_repeat = min_repeat 85 | self._explore_ep = False 86 | self._cur_explore_act = None 87 | self._act_repeat_ts = 0 88 | self._act_repeat_length = 0 89 | self._action_dim = self.model._action_dim 90 | self._uni_rand_agent = UniformContinuousAgent(np.ones(self._action_dim) * min_action, 91 | np.ones(self._action_dim) * max_action) 92 | 93 | def compute_action(self, obs, hidden_state): 94 | explore_act = False 95 | if self._explore_ep: 96 | if self._cur_explore_act is not None: 97 | if self._act_repeat_ts < self._act_repeat_length: 98 | explore_act = True 99 | self._act_repeat_ts +=1 100 | else: 101 | # reset action repeat explore 102 | self._cur_explore_act = None 103 | self._act_repeat_ts = 0 104 | 105 | if self._cur_explore_act is None: 106 | explore_act = np.random.rand() < self._prob_explore_act 107 | 108 | if explore_act: 109 | self._cur_explore_act = list(self._uni_rand_agent.compute_action()) 110 | self._cur_explore_act[1] = hidden_state 111 | self._cur_explore_act = tuple(self._cur_explore_act) 112 | self._act_repeat_length = np.random.randint(self._min_repeat, self._max_repeat) 113 | self._act_repeat_ts +=1 114 | 115 | if explore_act: 116 | return self._cur_explore_act 117 | else: 118 | return super().compute_action(obs, hidden_state) 119 | 120 | def reset(self): 121 | self._explore_ep = np.random.rand() < self._prob_explore_ep 122 | self._cur_explore_act = None 123 | return super().reset() -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/collect_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script loads up a trained model and collects trajectories using the model. 3 | We choose how much data is generated by the model and random uniform policy. 4 | 5 | The model path consists of the state dict of the model. 6 | 7 | The config path consists of all the settings to load the environment 8 | and preprocessing. 
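Outputs are written under --save_path as gzipped pickles: init_obss.pkl, obss.pkl, acts.pkl,
rews.pkl, and dones.pkl, plus metadata.pkl with per-episode returns and the run arguments.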
9 | 10 | Example usage: 11 | python collect_data.py --seed=0 --model_path=./state_dict.pt \ 12 | --config_path=./experiment_setting.pkl --num_episodes=5 \ 13 | --num_samples=1000 --save_path=./data.pkl 14 | """ 15 | 16 | import _pickle as pickle 17 | import argparse 18 | import gzip 19 | import numpy as np 20 | import os 21 | 22 | from pprint import pprint 23 | from tqdm import tqdm 24 | 25 | import rl_sandbox.constants as c 26 | 27 | from rl_sandbox.examples.eval_tools.utils import load_model 28 | from rl_sandbox.utils import set_seed 29 | 30 | 31 | def collect_data(args): 32 | set_seed(args.seed) 33 | assert args.num_episodes > 0 34 | assert args.num_samples > 0 35 | assert 0 <= args.mixture_ratio <= 1 36 | 37 | dir_exists = os.path.isdir(args.save_path) 38 | assert dir_exists or not os.path.exists(args.save_path) 39 | 40 | if not dir_exists: 41 | os.makedirs(args.save_path, exist_ok=True) 42 | 43 | config, env, buffer_preprocess, agent = load_model(args.seed, 44 | args.config_path, 45 | args.model_path, 46 | args.intention) 47 | 48 | init_observations = [] 49 | observations = [] 50 | actions = [] 51 | rewards = [] 52 | dones = [] 53 | 54 | episodes_pbar = tqdm(total=args.num_episodes) 55 | samples_pbar = tqdm(total=args.num_samples) 56 | 57 | sample_i = 0 58 | eval_returns = [] 59 | for episode_i in range(args.num_episodes): 60 | eval_returns.append(0) 61 | obs = env.reset() 62 | 63 | init_observations.append(obs) 64 | 65 | buffer_preprocess.reset() 66 | obs = buffer_preprocess(obs) 67 | h_state = agent.reset() 68 | done = False 69 | 70 | while not done: 71 | if hasattr(env, c.RENDER) and args.render: 72 | env.render() 73 | 74 | if args.deterministic: 75 | action, h_state, act_info = agent.deterministic_action( 76 | obs=obs, hidden_state=h_state) 77 | else: 78 | action, h_state, act_info = agent.compute_action( 79 | obs=obs, hidden_state=h_state) 80 | 81 | if np.random.uniform() < args.mixture_ratio: 82 | action = np.random.uniform(config[c.MIN_ACTION], config[c.MAX_ACTION], config[c.ACTION_DIM]) 83 | 84 | actions.append(action) 85 | 86 | if config[c.CLIP_ACTION]: 87 | action = np.clip(action, a_min=config[c.MIN_ACTION], a_max=config[c.MAX_ACTION]) 88 | 89 | obs, reward, done, _ = env.step(action) 90 | 91 | observations.append(obs) 92 | rewards.append(reward) 93 | dones.append(done) 94 | obs = buffer_preprocess(obs) 95 | 96 | eval_returns[-1] += reward 97 | sample_i += 1 98 | samples_pbar.update(1) 99 | if sample_i >= args.num_samples: 100 | break 101 | else: 102 | episodes_pbar.update(1) 103 | continue 104 | break 105 | 106 | ret_mean = np.mean(eval_returns) 107 | ret_std = np.std(eval_returns) 108 | ret_max = np.max(eval_returns) 109 | ret_min = np.min(eval_returns) 110 | 111 | print("=" * 100) 112 | print("Interacted with {} complete episodes ({} timesteps)".format(episode_i, sample_i)) 113 | print("Average Return: {} - Std: {}".format(ret_mean, ret_std)) 114 | print("Max Return: {} - Min Return: {}".format(ret_max, ret_min)) 115 | 116 | for (filename, data) in zip(("init_obss", "obss", "acts", "rews", "dones"), 117 | (init_observations, observations, actions, rewards, dones)): 118 | with gzip.open(f"{args.save_path}/{filename}.pkl", "wb") as f: 119 | pickle.dump(data, f) 120 | 121 | with gzip.open(f"{args.save_path}/metadata.pkl", "wb") as f: 122 | pickle.dump({ 123 | "returns": eval_returns, 124 | "min": ret_min, 125 | "max": ret_max, 126 | "avg": ret_mean, 127 | "std": ret_std, 128 | **args.__dict__, 129 | }, f) 130 | 131 | if __name__ == "__main__": 132 | parser = 
argparse.ArgumentParser() 133 | parser.add_argument("--render", action="store_true", help="Render the environment") 134 | 135 | parser.add_argument("--seed", type=int, default=0, help="The random seed") 136 | parser.add_argument("--save_path", type=str, required=True, help="The directory to save the trajectories") 137 | parser.add_argument("--mixture_ratio", required=True, type=float, help="Amount of data sampled using random uniform policy") 138 | parser.add_argument("--deterministic", action="store_true", help="Whether or not to use deterministic action (the action mean) from the agent") 139 | parser.add_argument("--num_episodes", required=True, type=int, help="The maximum number of episodes") 140 | parser.add_argument("--num_samples", required=True, type=int, help="The maximum number of samples") 141 | 142 | parser.add_argument("--model_path", required=True, type=str, help="The path to load the model") 143 | parser.add_argument("--config_path", required=True, type=str, help="The path to load the config that trained the model") 144 | parser.add_argument("--intention", type=int, default=0, help="The intention to use for SAC-X") 145 | args = parser.parse_args() 146 | 147 | pprint(args) 148 | 149 | collect_data(args) 150 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser(description='RL') 8 | parser.add_argument( 9 | '--algo', default='a2c', help='algorithm to use: a2c | ppo | acktr') 10 | parser.add_argument( 11 | '--gail', 12 | action='store_true', 13 | default=False, 14 | help='do imitation learning with gail') 15 | parser.add_argument( 16 | '--gail-experts-file', 17 | help='file that contains expert demonstrations for gail') 18 | parser.add_argument( 19 | '--gail-batch-size', 20 | type=int, 21 | default=128, 22 | help='gail batch size (default: 128)') 23 | parser.add_argument( 24 | '--gail-epoch', type=int, default=5, help='gail epochs (default: 5)') 25 | parser.add_argument( 26 | '--lr', type=float, default=7e-4, help='learning rate (default: 7e-4)') 27 | parser.add_argument( 28 | '--eps', 29 | type=float, 30 | default=1e-5, 31 | help='RMSprop optimizer epsilon (default: 1e-5)') 32 | parser.add_argument( 33 | '--alpha', 34 | type=float, 35 | default=0.99, 36 | help='RMSprop optimizer apha (default: 0.99)') 37 | parser.add_argument( 38 | '--gamma', 39 | type=float, 40 | default=0.99, 41 | help='discount factor for rewards (default: 0.99)') 42 | parser.add_argument( 43 | '--use-gae', 44 | action='store_true', 45 | default=False, 46 | help='use generalized advantage estimation') 47 | parser.add_argument( 48 | '--gae-lambda', 49 | type=float, 50 | default=0.95, 51 | help='gae lambda parameter (default: 0.95)') 52 | parser.add_argument( 53 | '--entropy-coef', 54 | type=float, 55 | default=0.01, 56 | help='entropy term coefficient (default: 0.01)') 57 | parser.add_argument( 58 | '--value-loss-coef', 59 | type=float, 60 | default=0.5, 61 | help='value loss coefficient (default: 0.5)') 62 | parser.add_argument( 63 | '--max-grad-norm', 64 | type=float, 65 | default=0.5, 66 | help='max norm of gradients (default: 0.5)') 67 | parser.add_argument( 68 | '--seed', type=int, default=1, help='random seed (default: 1)') 69 | parser.add_argument( 70 | '--cuda-deterministic', 71 | action='store_true', 72 | default=False, 73 | 
help="sets flags for determinism when using CUDA (potentially slow!)") 74 | parser.add_argument( 75 | '--num-processes', 76 | type=int, 77 | default=16, 78 | help='how many training CPU processes to use (default: 16)') 79 | parser.add_argument( 80 | '--num-steps', 81 | type=int, 82 | default=5, 83 | help='number of forward steps in A2C (default: 5)') 84 | parser.add_argument( 85 | '--ppo-epoch', 86 | type=int, 87 | default=4, 88 | help='number of ppo epochs (default: 4)') 89 | parser.add_argument( 90 | '--num-mini-batch', 91 | type=int, 92 | default=32, 93 | help='number of batches for ppo (default: 32)') 94 | parser.add_argument( 95 | '--clip-param', 96 | type=float, 97 | default=0.2, 98 | help='ppo clip parameter (default: 0.2)') 99 | parser.add_argument( 100 | '--log-interval', 101 | type=int, 102 | default=10, 103 | help='log interval, one log per n updates (default: 10)') 104 | parser.add_argument( 105 | '--save-interval', 106 | type=int, 107 | default=100, 108 | help='save interval, one save per n updates (default: 100)') 109 | parser.add_argument( 110 | '--eval-interval', 111 | type=int, 112 | default=None, 113 | help='eval interval, one eval per n updates (default: None)') 114 | parser.add_argument( 115 | '--num-env-steps', 116 | type=int, 117 | default=10e6, 118 | help='number of environment steps to train (default: 10e6)') 119 | parser.add_argument( 120 | '--env-name', 121 | default='PongNoFrameskip-v4', 122 | help='environment to train on (default: PongNoFrameskip-v4)') 123 | parser.add_argument( 124 | '--log-dir', 125 | default='/tmp/gym/', 126 | help='directory to save agent logs (default: /tmp/gym)') 127 | parser.add_argument( 128 | '--save-dir', 129 | default='./trained_models/', 130 | help='directory to save agent logs (default: ./trained_models/)') 131 | parser.add_argument( 132 | '--no-cuda', 133 | action='store_true', 134 | default=False, 135 | help='disables CUDA training') 136 | parser.add_argument( 137 | '--use-proper-time-limits', 138 | action='store_true', 139 | default=False, 140 | help='compute returns taking into account time limits') 141 | parser.add_argument( 142 | '--recurrent-policy', 143 | action='store_true', 144 | default=False, 145 | help='use a recurrent policy') 146 | parser.add_argument( 147 | '--use-linear-lr-decay', 148 | action='store_true', 149 | default=False, 150 | help='use a linear schedule on the learning rate') 151 | parser.add_argument( 152 | '--train-render', 153 | action='store_true', 154 | default=False, 155 | help='render training env') 156 | parser.add_argument( 157 | '--eval-render', 158 | action='store_true', 159 | default=False, 160 | help='render eval env') 161 | parser.add_argument( 162 | '--eval-eps', 163 | type=int, 164 | default=50, 165 | help='# of evaluation episodes') 166 | args = parser.parse_args() 167 | 168 | args.cuda = not args.no_cuda and torch.cuda.is_available() 169 | 170 | assert args.algo in ['a2c', 'ppo', 'acktr'] 171 | if args.recurrent_policy: 172 | assert args.algo in ['a2c', 'ppo'], \ 173 | 'Recurrent policy is not implemented for ACKTR' 174 | 175 | return args 176 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/examples/lfgp/experts/create_subsampled_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script loads up existing buffers and generates subsampled versions. 3 | 4 | Compared with subsampling on the fly, this ensures that all methods use the exact same data. 
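By default, every keep_every_nth transition is kept, starting from a random initial offset.
The keep_first_last / keep_last / keep_last_only flags additionally retain (or exclusively keep)
trajectory endpoints, and num_to_keep caps the number of transitions kept per intention.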
5 | 6 | Example usage: 7 | python create_subsampled_data.py --seed=0 --input_path=./expert_data \ 8 | --output_path=./expert_data_subsampled --keep_every_nth=20 9 | """ 10 | 11 | import copy 12 | import glob 13 | import gzip 14 | import _pickle as pickle 15 | import argparse 16 | import os 17 | import numpy as np 18 | 19 | import rl_sandbox.constants as c 20 | 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("--seed", type=int, default=0, help="The random seed") 24 | parser.add_argument("--input_path", required=True, type=str, help="The path to .gz file(s) with expert data") 25 | parser.add_argument("--output_path", required=True, type=str, help="The path to save the new data") 26 | parser.add_argument("--keep_every_nth", required=False, type=int, help="Keep every nth piece of data.") 27 | parser.add_argument("--keep_first_last", action='store_true', help="Keep the first and last of each trajectory, "\ 28 | "and otherwise subsample normally.") 29 | parser.add_argument("--keep_last", action='store_true', help="Keep the last of each trajectory, "\ 30 | "and otherwise subsample normally.") 31 | parser.add_argument("--keep_last_only", action='store_true', help="Keep the last of each trajectory exclusively.") 32 | parser.add_argument("--num_to_keep", required=False, type=int, help="Cap the amount of data to keep in each intention.") 33 | parser.add_argument("--num_extra_lasts", required=False, type=int, help="Add a number of extra final transitions to each intention.") 34 | parser.add_argument("--num_to_subsample_from", required=False, type=int, help="Cap amount of data to subsample from, but firsts/lasts can still come from more.") 35 | 36 | args = parser.parse_args() 37 | 38 | 39 | SUBSAMPLE_KEYS = ['observations', 'hidden_states', 'actions', 'rewards', 'dones', 'next_observations', 40 | 'next_hidden_states'] 41 | 42 | np.random.seed(args.seed) 43 | data_paths = glob.glob(os.path.join(args.input_path, '*.gz')) 44 | 45 | assert(os.path.exists(args.input_path)), f"No data folder found at {args.input_path}" 46 | assert sum([args.keep_first_last, args.keep_last, args.keep_last_only]) <= 1, "Can only set one of these." 47 | # assert not (args.keep_first_last and args.keep_last), "Can't set both keep_first_last and keep_last" 48 | 49 | if os.path.exists(args.output_path): 50 | overwrite = input("Output path already exists. Overwrite? 
Anything but \"yes\" exits.") 51 | if overwrite != 'yes': 52 | exit(0) 53 | 54 | os.makedirs(args.output_path, exist_ok=True) 55 | 56 | for dp in data_paths: 57 | gz_filename = dp.split('/')[-1] 58 | out_path = os.path.join(args.output_path, gz_filename) 59 | 60 | with gzip.open(dp, 'rb') as f: 61 | data = pickle.load(f) 62 | 63 | out_data = copy.deepcopy(data) 64 | 65 | if args.keep_first_last or args.keep_last or args.keep_last_only: 66 | inds = [] 67 | ends = np.argwhere(np.invert(np.all(data['observations'][1:] == data['next_observations'][:-1], axis=1))) 68 | starts = np.concatenate([[[0]], ends + 1]) 69 | 70 | if args.keep_last_only: 71 | inds = ends 72 | else: 73 | for start, end in zip(starts, ends): 74 | if args.keep_first_last: 75 | inds.append(int(start)) 76 | if end == start: # should only happen if very first index is an end 77 | if args.keep_last: 78 | inds.append(int(end)) 79 | continue 80 | 81 | initial_offset = np.random.randint(args.keep_every_nth) 82 | next_i = start + initial_offset 83 | while next_i < end: 84 | inds.append(int(next_i)) 85 | next_i += args.keep_every_nth 86 | 87 | inds.append(int(end)) 88 | 89 | inds = np.array(inds).squeeze() 90 | 91 | if args.num_to_subsample_from is not None: 92 | inds = inds[inds < args.num_to_subsample_from] 93 | else: 94 | initial_offset = np.random.randint(args.keep_every_nth) 95 | 96 | if args.num_to_subsample_from is None: 97 | max_ind = len(data['observations']) 98 | else: 99 | max_ind = args.num_to_subsample_from 100 | 101 | # this assumes that the buffers are coming in as only being the size that they need to be 102 | inds = np.array(range(initial_offset, max_ind, args.keep_every_nth)) 103 | 104 | if args.num_to_keep is not None: 105 | # assert len(inds) >= args.num_to_keep, f"Not enough timesteps, wanted {args.num_to_keep}, found "\ 106 | # f"{len(inds)} for {gz_filename}." 
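        # Soft check in place of the commented-out assert above: if fewer transitions than
        # requested are available, a warning is printed and all available transitions are kept.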
107 | if len(inds) < args.num_to_keep: 108 | print(f"Not enough timesteps, wanted {args.num_to_keep}, found {len(inds)} for {gz_filename}.") 109 | # print(f"Skipping path {dp}") 110 | inds = inds[:args.num_to_keep] 111 | 112 | if args.num_extra_lasts is not None: 113 | ends = np.argwhere(np.invert(np.all(data['observations'][1:] == data['next_observations'][:-1], axis=1))) 114 | unused_ends = ends[ends > inds[-1]] 115 | if unused_ends.shape[0] < args.num_extra_lasts: 116 | print(f"WARNING: wanted {args.num_extra_lasts} extra lasts, but only found {unused_ends.shape[0]} for {gz_filename}") 117 | inds = np.concatenate([inds, unused_ends[:args.num_extra_lasts]]) 118 | 119 | print(f"Keeping {len(inds)} data for {gz_filename}.") 120 | 121 | for k in SUBSAMPLE_KEYS: 122 | out_data[k] = data[k][inds] 123 | 124 | for ik in data['infos'].keys(): 125 | out_data['infos'][ik] = data['infos'][ik][inds] 126 | 127 | # also need to update size parameters 128 | out_data['pointer'] = 0 129 | out_data['count'] = len(inds) 130 | out_data['memory_size'] = len(inds) 131 | 132 | with gzip.open(out_path, "wb") as f: 133 | pickle.dump(out_data, f) 134 | 135 | print(f"Subsampled data created and saved to {args.output_path}.") -------------------------------------------------------------------------------- /scripts/plotting/multitask_performance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pickle 4 | import os 5 | import glob 6 | 7 | import utils as plot_utils 8 | from data_locations import main_performance as data_locations 9 | import common as plot_common 10 | 11 | 12 | opts = plot_utils.get_default_opts() 13 | #### Options ######################################################################################################## 14 | # fig 15 | opts['font_size'] = 24 16 | opts['data_locations'] = data_locations 17 | opts['aux_names'] = [ 18 | ['Open', 'Close', 'Stack', 'Lift', 'Reach', 'Move'], 19 | ['Open', 'Close', 'Unstack-Stack', 'Lift', 'Reach', 'Move'], 20 | ['Open', 'Close', 'Bring', 'Lift', 'Reach', 'Move'], 21 | ['Open', 'Close', 'Insert', 'Bring', 'Lift', 'Reach', 'Move'] 22 | ] 23 | opts['aux_orders'] = [ 24 | [2, 0, 1, 3, 4, 5], 25 | [2, 0, 1, 3, 4, 5], 26 | [2, 0, 1, 3, 4, 5], 27 | [2, 0, 1, 3, 4, 5, 6], 28 | ] 29 | opts['algo_dir_names'] = ['dac-x', 'multitask_dac', 'multitask_bc'] 30 | opts['algo_titles'] = ['LfGP (multi)', 'LfGP-NS (multi)', 'BC (multi)'] 31 | opts['fig_path'] = opts['root_dir'] + "/figures/multitask_performance" 32 | opts['valid_task'] = [True, True, True, True] 33 | # opts['valid_task'] = [False, True, False, False] 34 | # opts['rl_eval_eps_per_task'] = 10 35 | # opts['bc_eval_eps_per_task'] = 10 36 | # opts['eval_interval'] = 10000 37 | 38 | root_dir, fig_path, experiment_root_dir, seeds, expert_root, expert_perf_files, expert_perf_file_main_task_i = \ 39 | plot_common.get_path_defaults(fig_name="multitask_performance") 40 | 41 | task_dir_names, valid_task, task_titles, main_task_i, num_aux, task_data_filenames, num_eval_steps_to_use = \ 42 | plot_common.get_task_defaults() 43 | 44 | algo_dir_names, algo_titles, multitask_algos, eval_eps_per_task = plot_common.get_algo_defaults() 45 | 46 | fig_shape, plot_size, num_stds, font_size, eval_interval, cmap, linewidth, std_alpha, x_val_scale, subsample_rate, \ 47 | include_expert_baseline = plot_common.get_fig_defaults() 48 | 49 | aux_names = [ 50 | ['Open', 'Close', 'Stack', 'Lift', 'Reach', 'Move'], 51 | ['Open', 'Close', 
'Unstack-Stack', 'Lift', 'Reach', 'Move'], 52 | ['Open', 'Close', 'Bring', 'Lift', 'Reach', 'Move'], 53 | ['Open', 'Close', 'Insert', 'Bring', 'Lift', 'Reach', 'Move'] 54 | ] 55 | aux_orders = [ 56 | [2, 0, 1, 3, 4, 5], 57 | [2, 0, 1, 3, 4, 5], 58 | [2, 0, 1, 3, 4, 5], 59 | [2, 0, 1, 3, 4, 5, 6], 60 | ] 61 | ##################################################################################################################### 62 | 63 | # pretty plotting, allow tex 64 | plt.rcParams.update({"text.usetex": True, "font.family": "serif"}) 65 | plt.rc('text.latex', preamble=r'\usepackage{amsmath}') 66 | 67 | all_successes, all_returns = plot_utils.get_returns_successes("multitask_performance", data_locations) 68 | 69 | # dicts are all_successes['task']['algo']['raw/mean/std'], 70 | # raw shape: (seed, timestep, aux task, eval ep) 71 | # mean and std shape: (timestep, aux_task) 72 | 73 | # fig 1: success rate of each aux while executing own task 74 | # own_task_s_figs = [plt.subplots(nrows=1, ncols=opts['num_aux[task_i]) 75 | nrows = len(task_dir_names) 76 | ncols = max(num_aux) 77 | own_task_s_fig = plt.figure(figsize=[plot_size[0] * ncols, plot_size[1] * nrows]) 78 | own_task_r_fig = plt.figure(figsize=[plot_size[0] * ncols, plot_size[1] * nrows]) 79 | 80 | 81 | for task_i, task in enumerate(task_dir_names): 82 | if not valid_task[task_i]: 83 | print(f"Task {task} set to false in valid_task, skipping in plotting") 84 | continue 85 | 86 | # for aux_i in range(num_aux[task_i]): 87 | for col_i, aux_i in enumerate(aux_orders[task_i]): 88 | # plt_index = task_i * ncols + aux_i + 1 89 | plt_index = task_i * ncols + col_i + 1 90 | for plot_type, fig, data in zip(['s', 'r'], [own_task_s_fig, own_task_r_fig], [all_successes, all_returns]): 91 | ax = fig.add_subplot(nrows, ncols, plt_index) 92 | ax.set_title(aux_names[task_i][aux_i], fontsize=font_size) 93 | 94 | # if aux_i == 0: 95 | if col_i == 0: 96 | ax.set_ylabel(task_titles[task_i], fontsize=font_size) 97 | 98 | for algo_i, algo in enumerate(algo_dir_names): 99 | if algo in multitask_algos or aux_i == 2: 100 | plot_utils.plot_mean_std(ax, aux_i, algo, algo_i, data[task][algo], 101 | algo_label=algo_titles[algo_i] if (task_i == 0 and aux_i == 2) else None) 102 | 103 | # pretty 104 | if plot_type == 's': 105 | ax.set_ylim(-.01, 1.05) 106 | ax.set_yticks([0, .5, 1]) 107 | ax.set_yticks([0,.25, .5, .75, 1], minor=True) 108 | ax.tick_params(labelsize=font_size - 4) 109 | if task == 'insert_0': 110 | ax.set_xlim(0, 4.1) 111 | ax.set_xticks([1, 2, 3, 4]) 112 | ax.set_xticks(np.arange(0, 4, 0.5), minor=True) 113 | ax.grid(which='both', alpha=0.5) 114 | else: 115 | ax.set_xlim(0, 2.1) 116 | ax.set_xticks([0.5, 1, 1.5, 2]) 117 | # ax.set_xticks([0,1,2,3,4], minor=True) 118 | ax.grid(which='both', alpha=0.5) 119 | 120 | 121 | for fig, fig_name in zip([own_task_s_fig, own_task_r_fig], ['s_fig.pdf', 'r_fig.pdf']): 122 | fig.tight_layout() 123 | fig.legend(fancybox=True, shadow=True, fontsize=font_size, loc="right", 124 | bbox_to_anchor=(0.98, 0.5)) 125 | # ncol=len(algo_dir_names) + 1, bbox_to_anchor=(0.5, -0.31)) 126 | 127 | ax = fig.add_subplot(111, frameon=False) 128 | ax.tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False) 129 | ax.set_xlabel("Updates/steps (millions)", fontsize=font_size + 4, labelpad=10) 130 | 131 | if 's_fig' in fig_name: 132 | ax.set_ylabel("Success Rate", fontsize=font_size + 4, labelpad=32) 133 | else: 134 | ax.set_ylabel("Episode Return", fontsize=font_size + 4, labelpad=30) 135 | 136 | 
os.makedirs(fig_path, exist_ok=True) 137 | fig.savefig(os.path.join(fig_path, fig_name), bbox_inches='tight') 138 | 139 | # fig 2: success rate of each aux while running the main task 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/train/train_dac_sac.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 4 | import torch 5 | 6 | import rl_sandbox.constants as c 7 | 8 | from rl_sandbox.algorithms.dac.sac import SACDAC 9 | from rl_sandbox.algorithms.dac.dac import DAC 10 | from rl_sandbox.auxiliary_tasks.utils import make_auxiliary_tasks 11 | from rl_sandbox.buffers.utils import make_buffer 12 | from rl_sandbox.envs.utils import make_env 13 | from rl_sandbox.learning_utils import train 14 | from rl_sandbox.model_architectures.utils import make_model, make_optimizer 15 | from rl_sandbox.agents.rl_agents import ACAgent, ACAgentEUniformExplorer 16 | from rl_sandbox.transforms.general_transforms import Identity 17 | from rl_sandbox.utils import make_summary_writer, set_seed, set_rng_state, check_load_latest_checkpoint, check_load_as_jumpoff_point 18 | from rl_sandbox.envs.wrappers.frame_stack import FrameStackWrapper 19 | 20 | def train_dac_sac(experiment_config): 21 | seed = experiment_config[c.SEED] 22 | save_path = experiment_config.get(c.SAVE_PATH, None) 23 | buffer_preprocessing = experiment_config.get(c.BUFFER_PREPROCESSING, Identity()) 24 | 25 | save_path, add_time_tag_to_save_path = check_load_latest_checkpoint(experiment_config, save_path) 26 | save_path, add_time_tag_to_save_path = check_load_as_jumpoff_point(experiment_config, save_path, add_time_tag_to_save_path) 27 | buffer_end_idx = None 28 | if experiment_config.get(c.LOAD_BUFFER_START_INDEX, -1) >= 0: 29 | buffer_end_idx = experiment_config[c.LOAD_BUFFER_START_INDEX] 30 | 31 | set_seed(seed) 32 | train_env = make_env(experiment_config[c.ENV_SETTING], seed) 33 | model = make_model(experiment_config[c.MODEL_SETTING]) 34 | buffer = make_buffer(experiment_config[c.BUFFER_SETTING], seed, experiment_config[c.BUFFER_SETTING].get(c.LOAD_BUFFER, False), 35 | end_idx=buffer_end_idx) 36 | 37 | policy_opt = make_optimizer(model.policy_parameters, experiment_config[c.OPTIMIZER_SETTING][c.POLICY]) 38 | qs_opt = make_optimizer(model.qs_parameters, experiment_config[c.OPTIMIZER_SETTING][c.QS]) 39 | alpha_opt = make_optimizer([model.log_alpha], experiment_config[c.OPTIMIZER_SETTING][c.ALPHA]) 40 | 41 | aux_tasks = make_auxiliary_tasks(experiment_config[c.AUXILIARY_TASKS], 42 | model, 43 | buffer, 44 | experiment_config) 45 | 46 | frame_stack = 1 47 | for wrap_dict in experiment_config[c.ENV_SETTING][c.ENV_WRAPPERS]: 48 | if wrap_dict[c.WRAPPER] == FrameStackWrapper: 49 | frame_stack = wrap_dict[c.KWARGS][c.NUM_FRAMES] 50 | 51 | # handle old code without expert amount option 52 | expert_amount = experiment_config.get(c.EXPERT_AMOUNT, None) 53 | expert_buffer_settings = experiment_config.get(c.EXPERT_BUFFER_SETTING, experiment_config[c.BUFFER_SETTING]) 54 | 55 | expert_buffer = make_buffer(expert_buffer_settings, seed, experiment_config[c.EXPERT_BUFFER], 56 | end_idx=expert_amount, match_load_size=True, frame_stack_load=frame_stack) 57 | 58 | if c.FT_EXPERT_BUFFER in experiment_config: 59 | ft_expert_buffer = make_buffer(expert_buffer_settings, seed, experiment_config[c.FT_EXPERT_BUFFER], 60 | end_idx=expert_amount, match_load_size=True, frame_stack_load=frame_stack) 61 | 
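        # If a fine-tuning expert buffer is configured, fold it into the main expert
        # buffer so that the discriminator and SAC-DAC updates below sample from the
        # combined expert data.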
expert_buffer.merge(ft_expert_buffer) 62 | 63 | learning_algorithm = SACDAC(model=model, 64 | policy_opt=policy_opt, 65 | qs_opt=qs_opt, 66 | alpha_opt=alpha_opt, 67 | learn_alpha=experiment_config[c.LEARN_ALPHA], 68 | buffer=buffer, 69 | algo_params=experiment_config, 70 | aux_tasks=aux_tasks, 71 | expert_buffer=expert_buffer) 72 | 73 | discriminator = make_model(experiment_config[c.DISCRIMINATOR_SETTING]) 74 | discriminator_opt = make_optimizer(discriminator.parameters(), experiment_config[c.OPTIMIZER_SETTING][c.DISCRIMINATOR]) 75 | dac = DAC(discriminator=discriminator, 76 | discriminator_opt=discriminator_opt, 77 | expert_buffer=expert_buffer, 78 | learning_algorithm=learning_algorithm, 79 | algo_params=experiment_config) 80 | 81 | load_model = experiment_config.get(c.LOAD_MODEL, False) 82 | if load_model: 83 | state_dict = torch.load(load_model, map_location=experiment_config[c.DEVICE]) 84 | dac.load_state_dict(state_dict) 85 | set_rng_state(state_dict[c.TORCH_RNG_STATE], state_dict[c.NP_RNG_STATE]) 86 | 87 | # TODO add this as a proper option 88 | # agent = ACAgentEUniformExplorer(model=model, learning_algorithm=dac, 89 | # prob_explore_ep=.2, prob_explore_act=.05, max_repeat=41, min_repeat=40) 90 | 91 | agent = ACAgent(model=model, 92 | learning_algorithm=dac, 93 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 94 | 95 | # overwrites the save path with a time tag 96 | summary_writer, save_path = make_summary_writer(save_path=save_path, 97 | algo=c.DAC, 98 | cfg=experiment_config, 99 | add_time_tag=add_time_tag_to_save_path) 100 | evaluation_env = None 101 | evaluation_agent = None 102 | if experiment_config.get(c.EVALUATION_FREQUENCY, 0): 103 | if experiment_config[c.ENV_SETTING][c.ENV_TYPE] == c.PANDA_RL_ENVS: 104 | evaluation_env = train_env 105 | else: 106 | evaluation_env = make_env(experiment_config[c.ENV_SETTING], seed + 1) 107 | evaluation_agent = ACAgent(model=model, 108 | learning_algorithm=None, 109 | preprocess=experiment_config[c.EVALUATION_PREPROCESSING]) 110 | 111 | train(agent=agent, 112 | evaluation_agent=evaluation_agent, 113 | train_env=train_env, 114 | evaluation_env=evaluation_env, 115 | buffer_preprocess=buffer_preprocessing, 116 | experiment_settings=experiment_config, 117 | auxiliary_reward=experiment_config[c.EVALUATION_REWARD_FUNC], 118 | summary_writer=summary_writer, 119 | save_path=save_path) 120 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/model_architectures/discriminators/fully_connected_discriminators.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from torch.distributions import Normal 5 | 6 | from rl_sandbox.constants import CPU 7 | from rl_sandbox.model_architectures.shared import Flatten 8 | from rl_sandbox.model_architectures.utils import construct_linear_layers, RunningMeanStd 9 | 10 | 11 | class ActionConditionedFullyConnectedDiscriminator(nn.Module): 12 | def __init__(self, 13 | obs_dim, 14 | action_dim, 15 | output_dim, 16 | shared_layers=None, 17 | device=torch.device(CPU), 18 | obs_only=False, 19 | branched_outputs=False, 20 | activation=nn.Tanh(), 21 | layers=None): 22 | super().__init__() 23 | self.device = device 24 | 25 | self._obs_dim = obs_dim 26 | self._action_dim = action_dim 27 | self._output_dim = output_dim 28 | self._obs_only = obs_only 29 | 30 | self.branched_outputs = branched_outputs 31 | 32 | self._flatten = Flatten() 33 | 34 | # backwards compatibility 35 | if 
shared_layers is None and layers is not None: 36 | shared_layers = layers 37 | 38 | if shared_layers is not None: 39 | # backwards compatible with old code for layer setup, but now we can customize activation function 40 | new_shared_layers = [] 41 | for l in shared_layers: 42 | new_layer = [] 43 | new_layer.extend(l[:2]) 44 | new_layer.append(activation) 45 | new_layer.extend(l[3:]) 46 | new_shared_layers.append(new_layer) 47 | shared_layers = tuple(new_shared_layers) 48 | 49 | if shared_layers is None: 50 | assert branched_outputs, "Can't have no shared layers in multitask discriminator without branched outputs on." 51 | 52 | self.fc_layers = nn.ModuleList([nn.Identity()]) 53 | num_inputs = obs_dim + obs_only * action_dim 54 | self.output = nn.Sequential( 55 | nn.Conv1d(num_inputs * output_dim, 256 * output_dim, kernel_size=1, groups=output_dim), activation, 56 | nn.Conv1d(256 * output_dim, 256 * output_dim, kernel_size=1, groups=output_dim), activation, 57 | nn.Conv1d(256 * output_dim, output_dim, kernel_size=1, groups=output_dim) 58 | ) 59 | 60 | else: 61 | self.fc_layers = construct_linear_layers(shared_layers) 62 | if self.branched_outputs: 63 | self.output = nn.Sequential( 64 | nn.Conv1d(256 * output_dim, 256 * output_dim, kernel_size=1, groups=output_dim), activation, 65 | nn.Conv1d(256 * output_dim, output_dim, kernel_size=1, groups=output_dim) 66 | ) 67 | else: 68 | self.output = nn.Linear(shared_layers[-1][1], output_dim) 69 | 70 | self.to(device) 71 | 72 | def forward(self, obss, acts): 73 | batch_size = obss.shape[0] 74 | 75 | obss = obss.reshape(batch_size, -1) 76 | if self._obs_only: 77 | x = obss 78 | else: 79 | x = torch.cat((obss, acts), dim=-1) 80 | 81 | x = self._flatten(x) 82 | 83 | x = x.to(self.device) 84 | for layer in self.fc_layers: 85 | x = layer(x) 86 | 87 | if self.branched_outputs: 88 | x = x.repeat(1, self._output_dim).unsqueeze(-1) 89 | logits = self.output(x).squeeze(-1) 90 | else: 91 | logits = self.output(x) 92 | 93 | return logits 94 | 95 | 96 | class ActionConditionedFullyConnectedDiscriminatorPlusRewards(nn.Module): 97 | def __init__(self, obs_dim, action_dim, output_dim, handcraft_rewards, layers, device=torch.device(CPU)): 98 | """ 99 | handcraft_rewards is a dict containing the indices and corresponding reward functions that should be output 100 | in place of NN outputs. 
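        For example (hypothetical names, not taken from this repository): passing
        handcraft_rewards={2: reach_reward_fn} keeps every output except index 2 as a
        learned logit and instead fills index 2 with reach_reward_fn(None, acts, obss,
        torch_multi=True), rescaled in forward() to match the running mean/std of the
        learned logits.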
101 | """ 102 | super().__init__() 103 | self.device = device 104 | 105 | self._obs_dim = obs_dim 106 | self._action_dim = action_dim 107 | self._output_dim = output_dim 108 | self._handcraft_rewards = handcraft_rewards 109 | self._true_output_dim = output_dim + len(self._handcraft_rewards) 110 | 111 | # need these so that handcrafted reward magnitudes can be rescaled to match average of learned reward magnitudes 112 | self._trainable_logits_rms = RunningMeanStd(shape=(self._output_dim,), norm_dim=(0,)) 113 | self._trainable_logits_rms.to(device) 114 | self._handcraft_rewards_rmss = [RunningMeanStd(shape=(1,), norm_dim=(0,))] * len(self._handcraft_rewards) 115 | for rms in self._handcraft_rewards_rmss: 116 | rms.to(device) 117 | 118 | # get trainable indices of true output 119 | handcraft_indices = [index for index in self._handcraft_rewards.keys()] 120 | self._trainable_indices = sorted(list(set(range(self._true_output_dim)) ^ set(handcraft_indices))) 121 | 122 | self._flatten = Flatten() 123 | self.fc_layers = construct_linear_layers(layers) 124 | 125 | self.output = nn.Linear(layers[-1][1], output_dim) 126 | 127 | self.to(device) 128 | 129 | def forward(self, obss, acts): 130 | batch_size = obss.shape[0] 131 | 132 | obss = obss.reshape(batch_size, -1) 133 | x = torch.cat((obss, acts), dim=-1) 134 | x = self._flatten(x) 135 | 136 | x = x.to(self.device) 137 | for layer in self.fc_layers: 138 | x = layer(x) 139 | 140 | logits = self.output(x) 141 | 142 | self._trainable_logits_rms.update(logits.detach()) 143 | trainable_mean = self._trainable_logits_rms.mean.mean() 144 | trainable_var = self._trainable_logits_rms.var.mean() 145 | 146 | full_out = torch.zeros([batch_size, self._true_output_dim]).to(self.device) 147 | full_out[:, self._trainable_indices] = logits 148 | 149 | obss = obss.to(self.device) 150 | acts = acts.to(self.device) 151 | for list_index, (index, func) in enumerate(self._handcraft_rewards.items()): 152 | rms = self._handcraft_rewards_rmss[list_index] 153 | rews = func(None, acts, obss, torch_multi=True) 154 | rms.update(rews) 155 | rews_normalized = rms.normalize(rews) 156 | rews_scale_matched = rews_normalized * torch.sqrt(trainable_var + self._trainable_logits_rms.epsilon) + \ 157 | trainable_mean 158 | full_out[:, index] = rews_scale_matched 159 | 160 | return full_out -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/a2c_ppo_acktr/algo/gail.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.utils.data 7 | from torch import autograd 8 | 9 | from stable_baselines3.common.running_mean_std import RunningMeanStd 10 | 11 | class Discriminator(nn.Module): 12 | def __init__(self, input_dim, hidden_dim, device): 13 | super(Discriminator, self).__init__() 14 | 15 | self.device = device 16 | 17 | self.trunk = nn.Sequential( 18 | nn.Linear(input_dim, hidden_dim), nn.Tanh(), 19 | nn.Linear(hidden_dim, hidden_dim), nn.Tanh(), 20 | nn.Linear(hidden_dim, 1)).to(device) 21 | 22 | self.trunk.train() 23 | 24 | self.optimizer = torch.optim.Adam(self.trunk.parameters()) 25 | 26 | self.returns = None 27 | self.ret_rms = RunningMeanStd(shape=()) 28 | 29 | def compute_grad_pen(self, 30 | expert_state, 31 | expert_action, 32 | policy_state, 33 | policy_action, 34 | lambda_=10): 35 | alpha = torch.rand(expert_state.size(0), 1) 36 | expert_data = torch.cat([expert_state, 
expert_action], dim=1) 37 | policy_data = torch.cat([policy_state, policy_action], dim=1) 38 | 39 | alpha = alpha.expand_as(expert_data).to(expert_data.device) 40 | 41 | mixup_data = alpha * expert_data + (1 - alpha) * policy_data 42 | mixup_data.requires_grad = True 43 | 44 | disc = self.trunk(mixup_data) 45 | ones = torch.ones(disc.size()).to(disc.device) 46 | grad = autograd.grad( 47 | outputs=disc, 48 | inputs=mixup_data, 49 | grad_outputs=ones, 50 | create_graph=True, 51 | retain_graph=True, 52 | only_inputs=True)[0] 53 | 54 | grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean() 55 | return grad_pen 56 | 57 | def update(self, expert_loader, rollouts, obsfilt=None): 58 | self.train() 59 | 60 | policy_data_generator = rollouts.feed_forward_generator( 61 | None, mini_batch_size=expert_loader.batch_size) 62 | 63 | loss = 0 64 | n = 0 65 | for expert_batch, policy_batch in zip(expert_loader, 66 | policy_data_generator): 67 | policy_state, policy_action = policy_batch[0], policy_batch[2] 68 | policy_d = self.trunk( 69 | torch.cat([policy_state, policy_action], dim=1)) 70 | 71 | expert_state, expert_action = expert_batch 72 | # expert_state = obsfilt(expert_state.numpy(), update=False) 73 | expert_state = torch.FloatTensor(expert_state).to(self.device) 74 | expert_action = expert_action.to(self.device) 75 | # print(expert_state.shape, expert_action.shape) 76 | expert_d = self.trunk( 77 | torch.cat([expert_state, expert_action], dim=1)) 78 | 79 | expert_loss = F.binary_cross_entropy_with_logits( 80 | expert_d, 81 | torch.ones(expert_d.size()).to(self.device)) 82 | policy_loss = F.binary_cross_entropy_with_logits( 83 | policy_d, 84 | torch.zeros(policy_d.size()).to(self.device)) 85 | 86 | gail_loss = expert_loss + policy_loss 87 | grad_pen = self.compute_grad_pen(expert_state, expert_action, 88 | policy_state, policy_action) 89 | 90 | loss += (gail_loss + grad_pen).item() 91 | n += 1 92 | 93 | self.optimizer.zero_grad() 94 | (gail_loss + grad_pen).backward() 95 | self.optimizer.step() 96 | return loss / n 97 | 98 | def predict_reward(self, state, action, gamma, masks, update_rms=True): 99 | with torch.no_grad(): 100 | self.eval() 101 | d = self.trunk(torch.cat([state, action], dim=1)) 102 | s = torch.sigmoid(d) 103 | # reward = s.log() - (1 - s).log() 104 | reward = - (1 - s).log() 105 | if self.returns is None: 106 | self.returns = reward.clone() 107 | 108 | if update_rms: 109 | self.returns = self.returns * masks * gamma + reward 110 | self.ret_rms.update(self.returns.cpu().numpy()) 111 | 112 | return reward / np.sqrt(self.ret_rms.var[0] + 1e-8) 113 | 114 | 115 | class ExpertDataset(torch.utils.data.Dataset): 116 | def __init__(self, file_name, num_trajectories=4, subsample_frequency=20): 117 | all_trajectories = torch.load(file_name) 118 | 119 | perm = torch.randperm(all_trajectories['states'].size(0)) 120 | idx = perm[:num_trajectories] 121 | 122 | self.trajectories = {} 123 | 124 | # See https://github.com/pytorch/pytorch/issues/14886 125 | # .long() for fixing bug in torch v0.4.1 126 | start_idx = torch.randint( 127 | 0, subsample_frequency, size=(num_trajectories, )).long() 128 | 129 | for k, v in all_trajectories.items(): 130 | data = v[idx] 131 | 132 | if k != 'lengths': 133 | samples = [] 134 | for i in range(num_trajectories): 135 | samples.append(data[i, start_idx[i]::subsample_frequency]) 136 | self.trajectories[k] = torch.stack(samples) 137 | else: 138 | self.trajectories[k] = data // subsample_frequency 139 | 140 | self.i2traj_idx = {} 141 | self.i2i = {} 142 | 143 | 
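        # Build a flat index over all kept (state, action) pairs: length is the total
        # number of subsampled steps across trajectories, and get_idx maps a flat
        # dataset index to a (trajectory index, step index) pair used by __getitem__.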
self.length = self.trajectories['lengths'].sum().item() 144 | 145 | traj_idx = 0 146 | i = 0 147 | 148 | self.get_idx = [] 149 | 150 | for j in range(self.length): 151 | 152 | while self.trajectories['lengths'][traj_idx].item() <= i: 153 | i -= self.trajectories['lengths'][traj_idx].item() 154 | traj_idx += 1 155 | 156 | self.get_idx.append((traj_idx, i)) 157 | 158 | i += 1 159 | 160 | 161 | def __len__(self): 162 | return self.length 163 | 164 | def __getitem__(self, i): 165 | traj_idx, i = self.get_idx[i] 166 | 167 | return self.trajectories['states'][traj_idx][i], self.trajectories[ 168 | 'actions'][traj_idx][i] 169 | 170 | 171 | class LfGPExpertDataset(torch.utils.data.Dataset): 172 | def __init__(self, file_name): 173 | self.trajectories = torch.load(file_name) 174 | self.length = len(self.trajectories["states"]) 175 | 176 | def __len__(self): 177 | return self.length 178 | 179 | def __getitem__(self, i): 180 | return self.trajectories['states'][i], self.trajectories[ 181 | 'actions'][i] 182 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/algorithms/sac_x/schedulers_update/q_scheduler.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | import torch 3 | import numpy as np 4 | 5 | import rl_sandbox.constants as c 6 | from rl_sandbox.envs.utils import absorbing_check 7 | 8 | 9 | class UpdateQScheduler: 10 | def __init__(self, model, algo_params): 11 | self.model = model 12 | self._num_tasks = algo_params.get(c.NUM_TASKS, 1) 13 | self._action_dim = algo_params[c.ACTION_DIM] 14 | 15 | self._scheduler_period = algo_params[c.SCHEDULER_SETTING][c.TRAIN][c.SCHEDULER_PERIOD] 16 | self._scheduler_tau = algo_params[c.SCHEDULER_TAU] 17 | self.main_intention = algo_params.get(c.MAIN_INTENTION, 0) 18 | 19 | self._gamma = algo_params[c.GAMMA] 20 | self._rewards = [] 21 | self._discounting = [] 22 | 23 | def state_dict(self): 24 | return self.model.state_dict 25 | 26 | def load_state_dict(self, state_dict): 27 | self.model.load_state_dict(state_dict) 28 | 29 | def _compute_returns(self): 30 | episode_length = len(self._rewards) 31 | returns = torch.zeros(episode_length + 1) 32 | for step in range(episode_length - 1, -1, -1): 33 | returns[step] = self._rewards[step] + \ 34 | (self._gamma ** self._discounting[step]) * returns[step + 1] 35 | 36 | # Only take the returns for every scheduler's action 37 | return returns[:-1][::self._scheduler_period] 38 | 39 | def update_scheduler(self, obs, act, update_info): 40 | traj = obs + [act.item()] 41 | q_first_action = self.model.compute_qs([]) 42 | 43 | print(f"Scheduler Trajectory: {traj} - Q([], a), for all a: {q_first_action}") 44 | 45 | update_info[c.SCHEDULER_TRAJ] = traj 46 | update_info[c.SCHEDULER_TRAJ_VALUE] = np.array(q_first_action) 47 | 48 | if self.model.LEARNED: 49 | tic = timeit.default_timer() 50 | # update_info[c.Q_UPDATE_TIME] = [] # breaks print epoch summary if you use multiple gradient steps!!! 
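            # Tabular scheduler update: compute a Monte Carlo return for each scheduler
            # decision point of the episode, then move Q(prefix, action) toward that
            # return with an exponential moving average,
            #   Q_new = (1 - scheduler_tau) * Q_old + scheduler_tau * G_t,
            # where the prefix traj[:step] is the sequence of intentions chosen so far.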
51 | rets = self._compute_returns() 52 | for step in range(len(traj)): 53 | old_q_value = self.model.compute_qsa(traj[:step], traj[step]) 54 | new_q_value = old_q_value * (1 - self._scheduler_tau) + rets[step] * self._scheduler_tau 55 | self.model.update_qsa(traj[:step], traj[step], new_q_value) 56 | # update_info[c.Q_UPDATE_TIME].append(timeit.default_timer() - tic) 57 | 58 | def update(self, obs, act, reward, done, info): 59 | self._rewards.append(reward[self.main_intention].item()) 60 | self._discounting.append(info[c.DISCOUNTING][0].item()) 61 | 62 | update_info = dict() 63 | if done: 64 | obs = info[c.HIGH_LEVEL_OBSERVATION] 65 | act = info[c.HIGH_LEVEL_ACTION] 66 | self.update_scheduler(obs, act, update_info) 67 | self._rewards.clear() 68 | self._discounting.clear() 69 | return True, update_info 70 | return False, update_info 71 | 72 | 73 | class UpdateDACQScheduler(UpdateQScheduler): 74 | def __init__(self, model, reward_function, algo_params): 75 | super().__init__(model=model, 76 | algo_params=algo_params) 77 | self.reward_function = reward_function 78 | self.max_ep_length = algo_params[c.MAX_EPISODE_LENGTH] 79 | self.curr_timestep = 0 80 | self.obss = [] 81 | self.acts = [] 82 | self.device = algo_params[c.DEVICE] 83 | self.train_preprocessing = algo_params[c.TRAIN_PREPROCESSING] 84 | self.main_intention = algo_params[c.MAIN_INTENTION] 85 | self.use_absorbing = absorbing_check(algo_params) 86 | 87 | def _compute_returns(self): 88 | obss = self.train_preprocessing(torch.as_tensor(np.array(self.obss)).squeeze(1).float()).to(self.device) 89 | acts = torch.as_tensor(np.array(self.acts)).float().to(self.device) 90 | 91 | with torch.no_grad(): 92 | rews = self.reward_function(obss, acts).detach() 93 | episode_length = len(rews) 94 | returns = torch.zeros(episode_length + 1) 95 | for step in range(episode_length - 1, -1, -1): 96 | returns[step] = rews[step, self.main_intention].cpu() + \ 97 | (self._gamma ** self._discounting[step]) * returns[step + 1] 98 | 99 | self.obss = [] 100 | self.acts = [] 101 | 102 | # Only take the returns for every scheduler's action 103 | return returns[:-1][::self._scheduler_period] 104 | 105 | def update(self, obs, act, rew, done, info): 106 | self.curr_timestep += 1 107 | self.obss.append(obs) 108 | self.acts.append(act) 109 | if (self.use_absorbing and obs[:, -1] == 1) or self.curr_timestep == self.max_ep_length: 110 | act[:] = 0 111 | done = True 112 | self.curr_timestep = 0 113 | 114 | return super().update(obs, act, rew, done, info) 115 | 116 | def reset(self): 117 | # Call on env reset 118 | self.obss = [] 119 | self.acts = [] 120 | self.curr_timestep = 0 121 | 122 | 123 | class UpdateDACQSchedulerPlusHandcraft(UpdateQScheduler): 124 | def __init__(self, model, reward_function, algo_params): 125 | super().__init__(model=model, 126 | algo_params=algo_params) 127 | self.reward_function = reward_function 128 | self.max_ep_length = algo_params[c.MAX_EPISODE_LENGTH] 129 | self.curr_timestep = 0 130 | self.obss = [] 131 | self.acts = [] 132 | self.device = algo_params[c.DEVICE] 133 | self.train_preprocessing = algo_params[c.TRAIN_PREPROCESSING] 134 | self.main_intention = algo_params[c.HANDCRAFT_TASKS]['main_task'][0] 135 | self.use_absorbing = absorbing_check(algo_params) 136 | 137 | def _compute_returns(self): 138 | obss = self.train_preprocessing(torch.as_tensor(self.obss).squeeze(1).float()).to(self.device) 139 | acts = torch.as_tensor(self.acts).float().to(self.device) 140 | 141 | with torch.no_grad(): 142 | rews = self.reward_function(obss, 
acts).detach() 143 | 144 | episode_length = len(rews) 145 | returns = torch.zeros(episode_length + 1) 146 | 147 | for step in range(episode_length - 1, -1, -1): 148 | returns[step] = rews[step, self.main_intention].cpu() + \ 149 | (self._gamma ** self._discounting[step]) * returns[step + 1] 150 | 151 | self.obss = [] 152 | self.acts = [] 153 | 154 | # Only take the returns for every scheduler's action 155 | return returns[:-1][::self._scheduler_period] 156 | 157 | def update(self, obs, act, rew, done, info): 158 | self.curr_timestep += 1 159 | self.obss.append(obs) 160 | self.acts.append(act) 161 | if (self.use_absorbing and obs[:, -1] == 1) or self.curr_timestep == self.max_ep_length: 162 | act[:] = 0 163 | done = True 164 | self.curr_timestep = 0 165 | 166 | return super().update(obs, act, rew, done, info) 167 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail/README.md: -------------------------------------------------------------------------------- 1 | # pytorch-a2c-ppo-acktr 2 | 3 | ## Note from LfGP Authors 4 | This repository is originally hosted here https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail. We reused the code for our GAIL results. See the main LfGP README for reproducing GAIL results. 5 | 6 | ## Update (April 12th, 2021) 7 | 8 | PPO is great, but [Soft Actor Critic](https://arxiv.org/abs/1812.05905) can be better for many continuous control tasks. Please check out [my new RL](http://github.com/ikostrikov/jax-rl) repository in jax. 9 | 10 | ## Please use hyper parameters from this readme. With other hyper parameters things might not work (it's RL after all)! 11 | 12 | This is a PyTorch implementation of 13 | * Advantage Actor Critic (A2C), a synchronous deterministic version of [A3C](https://arxiv.org/pdf/1602.01783v1.pdf) 14 | * Proximal Policy Optimization [PPO](https://arxiv.org/pdf/1707.06347.pdf) 15 | * Scalable trust-region method for deep reinforcement learning using Kronecker-factored approximation [ACKTR](https://arxiv.org/abs/1708.05144) 16 | * Generative Adversarial Imitation Learning [GAIL](https://arxiv.org/abs/1606.03476) 17 | 18 | Also see the OpenAI posts: [A2C/ACKTR](https://blog.openai.com/baselines-acktr-a2c/) and [PPO](https://blog.openai.com/openai-baselines-ppo/) for more information. 19 | 20 | This implementation is inspired by the OpenAI baselines for [A2C](https://github.com/openai/baselines/tree/master/baselines/a2c), [ACKTR](https://github.com/openai/baselines/tree/master/baselines/acktr) and [PPO](https://github.com/openai/baselines/tree/master/baselines/ppo1). It uses the same hyper parameters and the model since they were well tuned for Atari games. 
21 | 22 | Please use this bibtex if you want to cite this repository in your publications: 23 | 24 | @misc{pytorchrl, 25 | author = {Kostrikov, Ilya}, 26 | title = {PyTorch Implementations of Reinforcement Learning Algorithms}, 27 | year = {2018}, 28 | publisher = {GitHub}, 29 | journal = {GitHub repository}, 30 | howpublished = {\url{https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail}}, 31 | } 32 | 33 | ## Supported (and tested) environments (via [OpenAI Gym](https://gym.openai.com)) 34 | * [Atari Learning Environment](https://github.com/mgbellemare/Arcade-Learning-Environment) 35 | * [MuJoCo](http://mujoco.org) 36 | * [PyBullet](http://pybullet.org) (including Racecar, Minitaur and Kuka) 37 | * [DeepMind Control Suite](https://github.com/deepmind/dm_control) (via [dm_control2gym](https://github.com/martinseilair/dm_control2gym)) 38 | 39 | I highly recommend PyBullet as a free open source alternative to MuJoCo for continuous control tasks. 40 | 41 | All environments are operated using exactly the same Gym interface. See their documentations for a comprehensive list. 42 | 43 | To use the DeepMind Control Suite environments, set the flag `--env-name dm..`, where `domain_name` and `task_name` are the name of a domain (e.g. `hopper`) and a task within that domain (e.g. `stand`) from the DeepMind Control Suite. Refer to their repo and their [tech report](https://arxiv.org/abs/1801.00690) for a full list of available domains and tasks. Other than setting the task, the API for interacting with the environment is exactly the same as for all the Gym environments thanks to [dm_control2gym](https://github.com/martinseilair/dm_control2gym). 44 | 45 | ## Requirements 46 | 47 | * Python 3 (it might work with Python 2, but I didn't test it) 48 | * [PyTorch](http://pytorch.org/) 49 | * [Stable baselines3](https://github.com/DLR-RM/stable-baselines3) 50 | 51 | In order to install requirements, follow: 52 | 53 | ```bash 54 | # PyTorch 55 | conda install pytorch torchvision -c soumith 56 | 57 | # Other requirements 58 | pip install -r requirements.txt 59 | ``` 60 | 61 | ## Contributions 62 | 63 | Contributions are very welcome. If you know how to make this code better, please open an issue. If you want to submit a pull request, please open an issue first. Also see a todo list below. 64 | 65 | Also I'm searching for volunteers to run all experiments on Atari and MuJoCo (with multiple random seeds). 66 | 67 | ## Disclaimer 68 | 69 | It's extremely difficult to reproduce results for Reinforcement Learning methods. See ["Deep Reinforcement Learning that Matters"](https://arxiv.org/abs/1709.06560) for more information. I tried to reproduce OpenAI results as closely as possible. However, majors differences in performance can be caused even by minor differences in TensorFlow and PyTorch libraries. 70 | 71 | ### TODO 72 | * Improve this README file. Rearrange images. 73 | * Improve performance of KFAC, see kfac.py for more information 74 | * Run evaluation for all games and algorithms 75 | 76 | ## Visualization 77 | 78 | In order to visualize the results use ```visualize.ipynb```. 
79 | 80 | 81 | ## Training 82 | 83 | ### Atari 84 | #### A2C 85 | 86 | ```bash 87 | python main.py --env-name "PongNoFrameskip-v4" 88 | ``` 89 | 90 | #### PPO 91 | 92 | ```bash 93 | python main.py --env-name "PongNoFrameskip-v4" --algo ppo --use-gae --lr 2.5e-4 --clip-param 0.1 --value-loss-coef 0.5 --num-processes 8 --num-steps 128 --num-mini-batch 4 --log-interval 1 --use-linear-lr-decay --entropy-coef 0.01 94 | ``` 95 | 96 | #### ACKTR 97 | 98 | ```bash 99 | python main.py --env-name "PongNoFrameskip-v4" --algo acktr --num-processes 32 --num-steps 20 100 | ``` 101 | 102 | ### MuJoCo 103 | 104 | Please always try to use ```--use-proper-time-limits``` flag. It properly handles partial trajectories (see https://github.com/sfujim/TD3/blob/master/main.py#L123). 105 | 106 | #### A2C 107 | 108 | ```bash 109 | python main.py --env-name "Reacher-v2" --num-env-steps 1000000 110 | ``` 111 | 112 | #### PPO 113 | 114 | ```bash 115 | python main.py --env-name "Reacher-v2" --algo ppo --use-gae --log-interval 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --value-loss-coef 0.5 --ppo-epoch 10 --num-mini-batch 32 --gamma 0.99 --gae-lambda 0.95 --num-env-steps 1000000 --use-linear-lr-decay --use-proper-time-limits 116 | ``` 117 | 118 | #### ACKTR 119 | 120 | ACKTR requires some modifications to be made specifically for MuJoCo. But at the moment, I want to keep this code as unified as possible. Thus, I'm going for better ways to integrate it into the codebase. 121 | 122 | ## Enjoy 123 | 124 | ### Atari 125 | 126 | ```bash 127 | python enjoy.py --load-dir trained_models/a2c --env-name "PongNoFrameskip-v4" 128 | ``` 129 | 130 | ### MuJoCo 131 | 132 | ```bash 133 | python enjoy.py --load-dir trained_models/ppo --env-name "Reacher-v2" 134 | ``` 135 | 136 | ## Results 137 | 138 | ### A2C 139 | 140 | ![BreakoutNoFrameskip-v4](imgs/a2c_breakout.png) 141 | 142 | ![SeaquestNoFrameskip-v4](imgs/a2c_seaquest.png) 143 | 144 | ![QbertNoFrameskip-v4](imgs/a2c_qbert.png) 145 | 146 | ![beamriderNoFrameskip-v4](imgs/a2c_beamrider.png) 147 | 148 | ### PPO 149 | 150 | 151 | ![BreakoutNoFrameskip-v4](imgs/ppo_halfcheetah.png) 152 | 153 | ![SeaquestNoFrameskip-v4](imgs/ppo_hopper.png) 154 | 155 | ![QbertNoFrameskip-v4](imgs/ppo_reacher.png) 156 | 157 | ![beamriderNoFrameskip-v4](imgs/ppo_walker.png) 158 | 159 | 160 | ### ACKTR 161 | 162 | ![BreakoutNoFrameskip-v4](imgs/acktr_breakout.png) 163 | 164 | ![SeaquestNoFrameskip-v4](imgs/acktr_seaquest.png) 165 | 166 | ![QbertNoFrameskip-v4](imgs/acktr_qbert.png) 167 | 168 | ![beamriderNoFrameskip-v4](imgs/acktr_beamrider.png) 169 | -------------------------------------------------------------------------------- /rl_sandbox/rl_sandbox/utils.py: -------------------------------------------------------------------------------- 1 | import _pickle as pickle 2 | import json 3 | import numpy as np 4 | import os 5 | import timeit 6 | import torch 7 | import glob 8 | 9 | from datetime import datetime 10 | from torch.utils.tensorboard import SummaryWriter 11 | 12 | import rl_sandbox.constants as c 13 | 14 | 15 | def check_load_latest_checkpoint(experiment_config, save_path): 16 | if experiment_config[c.LOAD_LATEST_CHECKPOINT]: 17 | paths = glob.glob(os.path.join(save_path, '*')) 18 | if len(paths) == 0: 19 | print(f"Warning: load_latest_checkpoint set with no existing experiments at {save_path}, starting new experiment.") 20 | add_time_tag_to_save_path = True 21 | experiment_config[c.LOAD_LATEST_CHECKPOINT] = False 22 | else: 23 | latest_path = sorted(paths)[-1] 24 | if not 
os.path.isfile(os.path.join(latest_path, f"{experiment_config[c.CHECKPOINT_NAME]}_buffer.pkl")): 25 | print(f"Warning: load_latest_checkpoint set with no existing experiments at {save_path}, starting new experiment.") 26 | add_time_tag_to_save_path = True 27 | experiment_config[c.LOAD_LATEST_CHECKPOINT] = False 28 | else: 29 | save_path = latest_path 30 | print(f"Loading latest checkpoint from {save_path}/{experiment_config[c.CHECKPOINT_NAME]}") 31 | experiment_config[c.BUFFER_SETTING][c.LOAD_BUFFER] = os.path.join( 32 | save_path, f"{experiment_config[c.CHECKPOINT_NAME]}_buffer.pkl") 33 | experiment_config[c.LOAD_MODEL] = os.path.join( 34 | save_path, f"{experiment_config[c.CHECKPOINT_NAME]}.pt") 35 | add_time_tag_to_save_path = False 36 | else: 37 | add_time_tag_to_save_path = True 38 | 39 | return save_path, add_time_tag_to_save_path 40 | 41 | def check_load_as_jumpoff_point(experiment_config, save_path, add_time_tag_to_save_path): 42 | if experiment_config.get(c.LOAD_MODEL_NAME, "") != "": 43 | paths = glob.glob(os.path.join(save_path, '*')) 44 | if len(paths) == 0: 45 | raise ValueError(f"No paths found at {save_path} to load jumpoff point from") 46 | else: 47 | latest_path = sorted(paths)[-1] 48 | model_n = experiment_config[c.LOAD_MODEL_NAME] 49 | buffer_n = experiment_config[c.LOAD_BUFFER_NAME] 50 | print(f"Loading jumpoff point from {latest_path} with model name {model_n}, buffer name {buffer_n}") 51 | 52 | experiment_config[c.BUFFER_SETTING][c.LOAD_BUFFER] = os.path.join( 53 | latest_path, f"{buffer_n}_buffer.pkl") 54 | experiment_config[c.LOAD_MODEL] = os.path.join( 55 | latest_path, f"{model_n}.pt") 56 | experiment_config[c.LOAD_TRACKING_DICT] = os.path.join( 57 | latest_path, f"{model_n}_tracking_dict.pkl") 58 | 59 | save_path = ('/').join(latest_path.split('/')[:-1]) + f'_from_{model_n}' 60 | 61 | add_time_tag_to_save_path = True 62 | else: 63 | add_time_tag_to_save_path = add_time_tag_to_save_path 64 | 65 | return save_path, add_time_tag_to_save_path 66 | 67 | class DummySummaryWriter(): 68 | def add_scalar(self, arg_1, arg_2, arg_3): 69 | pass 70 | 71 | def add_scalars(self, arg_1, arg_2, arg_3): 72 | pass 73 | 74 | def add_text(self, arg_1, arg_2, arg_3): 75 | pass 76 | 77 | 78 | def make_summary_writer(save_path, algo, cfg, add_time_tag=True): 79 | summary_writer = DummySummaryWriter() 80 | cfg[c.ALGO] = algo 81 | if save_path is not None: 82 | if add_time_tag: 83 | time_tag = datetime.strftime(datetime.now(), "%m-%d-%y_%H_%M_%S") 84 | save_path = f"{save_path}/{time_tag}" 85 | os.makedirs(save_path, exist_ok=True) 86 | summary_writer = SummaryWriter(log_dir=f"{save_path}/tensorboard") 87 | pickle.dump( 88 | cfg, 89 | open(f'{save_path}/{algo}_experiment_setting.pkl', 'wb')) 90 | json.dump( 91 | cfg, 92 | open(f'{save_path}/{algo}_experiment_setting.json', 'w'), 93 | indent=4, 94 | default=lambda o: f"<>" 95 | ) 96 | 97 | return summary_writer, save_path 98 | 99 | def get_rng_state(): 100 | return {'torch_rng_state': torch.get_rng_state(), 'np_rng_state': np.random.get_state()} 101 | 102 | def set_rng_state(torch_rng_state, np_rng_state): 103 | torch.set_rng_state(torch_rng_state.cpu()) # without .cpu throws a bizarre error about not being a ByteTensor 104 | np.random.set_state(np_rng_state) 105 | 106 | def set_seed(seed=None): 107 | if seed is None: 108 | seed = np.random.randint(0, c.MAX_INT) 109 | 110 | np.random.seed(seed) 111 | torch.manual_seed(seed) 112 | 113 | 114 | class EpochSummary: 115 | def __init__(self, default_key_length=10, padding=11): 116 | 
self._key_length = default_key_length 117 | self._padding = padding 118 | self._summary = dict() 119 | self._epoch = 0 120 | self._init_tic = timeit.default_timer() 121 | 122 | def log(self, key, value, track_std=True, track_min_max=True, axis=None): 123 | self._key_length = max(self._key_length, len(key)) 124 | self._summary.setdefault(key, { 125 | c.LOG_SETTING: { 126 | c.STANDARD_DEVIATION: track_std, 127 | c.MIN_MAX: track_min_max, 128 | c.AXIS: axis, 129 | }, 130 | c.CONTENT: [] 131 | }) 132 | self._summary[key][c.CONTENT].append(value) 133 | 134 | def new_epoch(self): 135 | self._epoch += 1 136 | self._summary.clear() 137 | self._curr_tic = timeit.default_timer() 138 | 139 | def print_summary(self): 140 | toc = timeit.default_timer() 141 | key_length = self._key_length + self._padding 142 | print("=" * 100) 143 | print(f"Epoch: {self._epoch}") 144 | print(f"Epoch Time Spent: {toc - self._curr_tic}") 145 | print(f"Total Time Spent: {toc - self._init_tic}") 146 | print("=" * 100) 147 | print('|'.join(str(x).ljust(key_length) for x in ("Key", "Content"))) 148 | print("-" * 100) 149 | 150 | # temp fix for scheduler trajs that are not always same length 151 | if 'update_info/scheduler_traj' in self._summary: 152 | del self._summary['update_info/scheduler_traj'] 153 | 154 | for key in sorted(self._summary): 155 | val = self._summary[key][c.CONTENT] 156 | setting = self._summary[key][c.LOG_SETTING] 157 | try: 158 | print('|'.join(str(x).ljust(key_length) for x in (f"{key} - AVG", np.mean(val, axis=setting[c.AXIS])))) 159 | if setting[c.STANDARD_DEVIATION]: 160 | print('|'.join(str(x).ljust(key_length) for x in (f"{key} - STD DEV", np.std(val, axis=setting[c.AXIS])))) 161 | if setting[c.MIN_MAX]: 162 | print('|'.join(str(x).ljust(key_length) for x in (f"{key} - MIN", np.min(val, axis=setting[c.AXIS])))) 163 | print('|'.join(str(x).ljust(key_length) for x in (f"{key} - MAX", np.max(val, axis=setting[c.AXIS])))) 164 | except: 165 | print(val) 166 | print(key) 167 | assert 0 168 | print("=" * 100) 169 | --------------------------------------------------------------------------------
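# --- Usage sketch (added, not part of the original repository) ---
# A minimal example of how the EpochSummary helper defined above can be driven by a
# training loop. The loop structure, logged key names, and values are illustrative
# assumptions; only the EpochSummary API (new_epoch, log, print_summary) comes from
# rl_sandbox/rl_sandbox/utils.py.

from rl_sandbox.utils import EpochSummary

summary = EpochSummary()
for epoch in range(2):
    # Start a fresh epoch: clears logged values and restarts the epoch timer.
    summary.new_epoch()
    for step in range(100):
        # Each call appends a value under its key; mean/std/min/max are printed later.
        summary.log("train/episode_return", 0.01 * step)
        summary.log("train/policy_loss", 1.0 / (step + 1), track_min_max=False)
    # Prints a table of per-key statistics along with epoch and total wall-clock time.
    summary.print_summary()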