├── .gitignore ├── LICENSE.md ├── README.md ├── agent_data └── readme.txt ├── data └── readme.txt ├── environment.yml ├── experiment_configs ├── algorithms │ ├── batch.py │ ├── batch_mbrl.py │ ├── mbrl.py │ ├── offline.py │ ├── offline_mbrl.py │ └── online.py ├── base_experiment.py └── configs │ ├── dads │ └── dads_config.py │ ├── lisp │ └── lisp_config.py │ ├── mpc │ ├── loop_config.py │ ├── make_mpc_policy.py │ └── mpc.py │ ├── pg │ ├── npg_config.py │ ├── ppo_config.py │ └── vpg_config.py │ └── q_learning │ ├── cql_config.py │ ├── mbpo_config.py │ └── sac_config.py ├── experiment_utils ├── __init__.py ├── config.py ├── launch_experiment.py ├── sweeper.py ├── teacher_data.py └── utils.py ├── lifelong_rl ├── __init__.py ├── core │ ├── __init__.py │ ├── logging │ │ ├── logging.py │ │ ├── logging_setup.py │ │ └── tabulate.py │ └── rl_algorithms │ │ ├── __init__.py │ │ ├── batch │ │ ├── __init__.py │ │ ├── batch_rl_algorithm.py │ │ └── mb_batch_rl_algorithm.py │ │ ├── offline │ │ ├── __init__.py │ │ ├── mb_offline_rl_algorithm.py │ │ └── offline_rl_algorithm.py │ │ ├── online │ │ ├── __init__.py │ │ ├── mbrl_algorithm.py │ │ └── online_rl_algorithm.py │ │ ├── rl_algorithm.py │ │ └── torch_rl_algorithm.py ├── data_management │ ├── replay_buffers │ │ ├── env_replay_buffer.py │ │ ├── mujoco_replay_buffer.py │ │ ├── replay_buffer.py │ │ └── simple_replay_buffer.py │ └── utils │ │ └── path_builder.py ├── envs │ ├── __init__.py │ ├── env_processor.py │ ├── env_utils.py │ ├── environments │ │ ├── __init__.py │ │ ├── ant_env.py │ │ ├── assets │ │ │ ├── ant.xml │ │ │ └── inverted_pendulum.xml │ │ ├── continuous_gridworld │ │ │ ├── __init__.py │ │ │ ├── cont_gridworld.py │ │ │ ├── grids │ │ │ │ ├── blank.txt │ │ │ │ ├── minecraft │ │ │ │ │ └── world_1.txt │ │ │ │ ├── one_goal.txt │ │ │ │ ├── volcano_1.txt │ │ │ │ └── volcano_2.txt │ │ │ └── tiles.py │ │ ├── hopper_env.py │ │ └── humanoid_env.py │ └── wrappers.py ├── models │ ├── __init__.py │ ├── dynamics_models │ │ ├── __init__.py │ │ └── probabilistic_ensemble.py │ └── networks.py ├── optimizers │ ├── __init__.py │ ├── optimizer.py │ └── random_shooting │ │ ├── __init__.py │ │ ├── cem.py │ │ ├── mppi.py │ │ └── rs_optimizer.py ├── policies │ ├── __init__.py │ ├── base │ │ ├── base.py │ │ ├── latent_prior_policy.py │ │ └── simple.py │ ├── models │ │ └── gaussian_policy.py │ └── mpc │ │ ├── mpc.py │ │ └── policy_mpc.py ├── samplers │ ├── __init__.py │ ├── data_collector │ │ ├── __init__.py │ │ ├── base.py │ │ ├── path_collector.py │ │ └── step_collector.py │ └── utils │ │ ├── model_rollout_functions.py │ │ ├── path_functions.py │ │ └── rollout_functions.py ├── torch │ ├── distributions.py │ ├── modules.py │ ├── pytorch_util.py │ └── risk_aversion.py ├── trainers │ ├── __init__.py │ ├── dads │ │ ├── __init__.py │ │ ├── dads.py │ │ ├── empowerment_functions.py │ │ └── skill_dynamics.py │ ├── her │ │ ├── __init__.py │ │ └── her.py │ ├── lisp │ │ ├── __init__.py │ │ ├── lisp.py │ │ └── mb_skill.py │ ├── mbrl │ │ ├── __init__.py │ │ └── mbrl.py │ ├── mpc │ │ ├── __init__.py │ │ └── mpc_trainer.py │ ├── multi_trainer.py │ ├── pg │ │ ├── __init__.py │ │ ├── npg.py │ │ ├── pg.py │ │ └── ppo.py │ ├── q_learning │ │ ├── __init__.py │ │ ├── ddpg.py │ │ ├── mbpo.py │ │ ├── sac.py │ │ └── td3.py │ └── trainer.py └── util │ ├── eval_util.py │ ├── pythonplusplus.py │ └── visualize_mujoco.py ├── run_scripts ├── dads.py ├── lisp.py ├── loop.py ├── mbpo.py ├── mopo.py ├── morel.py ├── mpc.py ├── npg.py ├── ppo.py ├── sac.py └── vpg.py └── scripts ├── 
download_d4rl_dataset.py └── viz_hist.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Random files 2 | **/.DS_STORE 3 | **/*.pyc 4 | **/*.swp 5 | MANIFEST 6 | *.egg-info 7 | \.idea/ 8 | /.idea 9 | MUJOCO_LOG.TXT 10 | 11 | # Misc lifelong_rl directories 12 | /agent_data 13 | /data 14 | /misc 15 | /video 16 | /wandb 17 | /tmp 18 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 LiSP (Reset-Free Lifelong Learning via Skill-Space Planning) Authors (https://arxiv.org/abs/2012.03548) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /agent_data/readme.txt: -------------------------------------------------------------------------------- 1 | The agent_data folder stores files of transitions (to be loaded in as offline data/demos for an agent). 2 | 3 | The files should be .pkl files containing numpy arrays (pickled as bytes) with shape: 4 | (N, obs_dim + action_dim + 1 (reward) + 1 (terminal) + obs_dim) 5 | i.e. a concatenated array of N transitions: (obs, action, reward, terminal, next_obs) 6 | 7 | Snapshots typically contain this in the "replay_buffer/transitions" key of the .pt dict. 8 | -------------------------------------------------------------------------------- /data/readme.txt: -------------------------------------------------------------------------------- 1 | Logs from experiments are saved in the data folder. 2 | 3 | Folders represent experiment names, and subfolders contain individual seeds/runs. 4 | 5 | The folder for a seed contains: 6 | - variant.json: JSON file containing the hyperparameters of the experiment 7 | - stdout.log: standard terminal output from the experiment 8 | - progress.csv: metrics logged by the agent at every timestep 9 | - itr_X.pt: PyTorch snapshot of the agent after epoch X 10 | 11 | Offline experiments contain the following variants of these files: 12 | - offline_progress.csv: analogous to progress.csv 13 | - offline_itr_X.pt: analogous to itr_X.pt 14 | 15 | Experiments with both an online and an offline phase will contain both types of files. 
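The transition layout described in agent_data/readme.txt above is what experiment_utils/teacher_data.py (later in this repository) reads back: add_transitions(replay_buffer, data_file, obs_dim, action_dim) opens agent_data/<data_file>.pkl and replays each row into a replay buffer. As a minimal sketch (not part of the repository), the snippet below shows one way such a file could be produced from a saved snapshot, assuming the snapshot stores the flat transition array under the "replay_buffer/transitions" key as noted above; export_transitions and the example paths are illustrative names only.

import pickle

import numpy as np
import torch


def export_transitions(snapshot_path, out_name, max_transitions=None):
    # Load a saved snapshot (e.g. data/<exp>/<seed>/itr_X.pt) onto the CPU.
    snapshot = torch.load(snapshot_path, map_location='cpu')

    # Flat array of shape (N, obs_dim + action_dim + 1 + 1 + obs_dim),
    # i.e. rows of (obs, action, reward, terminal, next_obs).
    transitions = np.asarray(snapshot['replay_buffer/transitions'])
    if max_transitions is not None:
        transitions = transitions[:max_transitions]

    # Store as agent_data/<out_name>.pkl, matching what
    # experiment_utils.teacher_data.add_transitions expects to read.
    with open('agent_data/%s.pkl' % out_name, 'wb') as f:
        pickle.dump(transitions, f)
    return transitions.shape

The resulting file can then be loaded as offline data/demos via add_transitions(replay_buffer, out_name, obs_dim, action_dim) from experiment_utils/teacher_data.py.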
16 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: lifelong_rl 2 | dependencies: 3 | - python=3.7 4 | - patchelf 5 | - swig 6 | - pip: 7 | - atari-py==0.2.6 8 | - backcall==0.1.0 9 | - boto3==1.13.1 10 | - botocore==1.16.1 11 | - Box2D==2.3.10 12 | - Box2D-kengz==2.3.3 13 | - certifi==2020.4.5.1 14 | - cffi==1.14.0 15 | - chardet==3.0.4 16 | - click==7.1.2 17 | - cloudpickle==1.3.0 18 | - cycler==0.10.0 19 | - Cython==0.29.17 20 | - decorator==4.4.2 21 | - docutils==0.15.2 22 | - fasteners==0.15 23 | - Flask==1.1.2 24 | - future==0.18.2 25 | - gitdb==4.0.4 26 | - gitdb2==4.0.2 27 | - GitPython==3.1.1 28 | - glfw==1.11.0 29 | - gtimer==1.0.0b5 30 | - gym==0.17.1 31 | - idna==2.9 32 | - imageio==2.8.0 33 | - ipdb==0.13.2 34 | - ipython==7.14.0 35 | - ipython-genutils==0.2.0 36 | - itsdangerous==1.1.0 37 | - jedi==0.17.0 38 | - Jinja2==2.11.2 39 | - jmespath==0.9.5 40 | - joblib==0.14.1 41 | - kiwisolver==1.2.0 42 | - llvmlite==0.32.0 43 | - lockfile==0.12.2 44 | - Mako==1.1.2 45 | - MarkupSafe==1.1.1 46 | - matplotlib==3.2.1 47 | - monotonic==1.5 48 | - mujoco-py==2.0.2.9 49 | - numba==0.49.0 50 | - numpy==1.18.3 51 | - packaging==20.3 52 | - pandas==1.0.3 53 | - parso==0.7.0 54 | - pexpect==4.8.0 55 | - pickleshare==0.7.5 56 | - Pillow==7.1.2 57 | - plotly==4.6.0 58 | - prompt-toolkit==3.0.5 59 | - ptyprocess==0.6.0 60 | - pycparser==2.20 61 | - pygame==1.9.6 62 | - pyglet==1.5.0 63 | - Pygments==2.6.1 64 | - PyOpenGL==3.1.5 65 | - pyparsing==2.4.7 66 | - python-dateutil==2.8.1 67 | - pytz==2020.1 68 | - requests==2.23.0 69 | - retrying==1.3.3 70 | - s3transfer==0.3.3 71 | - scipy==1.4.1 72 | - seaborn==0.10.1 73 | - simplegeneric==0.8.1 74 | - sip==5.2.0 75 | - six==1.14.0 76 | - smmap==3.0.2 77 | - toml==0.10.0 78 | - torch==1.4.0 79 | - torchvision==0.5.0 80 | - traitlets==4.3.3 81 | - urllib3==1.25.9 82 | - wcwidth==0.1.9 83 | - Werkzeug==1.0.1 84 | -------------------------------------------------------------------------------- /experiment_configs/algorithms/batch.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.core.rl_algorithms.torch_rl_algorithm import TorchBatchRLAlgorithm 2 | 3 | 4 | def get_algorithm(config, expl_path_collector, eval_path_collector): 5 | 6 | algorithm = TorchBatchRLAlgorithm( 7 | trainer=config['trainer'], 8 | exploration_policy=config['exploration_policy'], 9 | exploration_env=config['exploration_env'], 10 | evaluation_env=config['evaluation_env'], 11 | replay_buffer=config['replay_buffer'], 12 | exploration_data_collector=expl_path_collector, 13 | evaluation_data_collector=eval_path_collector, 14 | **config['algorithm_kwargs'] 15 | ) 16 | 17 | return algorithm 18 | -------------------------------------------------------------------------------- /experiment_configs/algorithms/batch_mbrl.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.core.rl_algorithms.torch_rl_algorithm import TorchMBBatchRLAlgorithm 2 | 3 | 4 | def get_algorithm(config, expl_path_collector, eval_path_collector): 5 | 6 | algorithm = TorchMBBatchRLAlgorithm( 7 | trainer=config['trainer'], 8 | exploration_policy=config['exploration_policy'], 9 | model_trainer=config['model_trainer'], 10 | exploration_env=config['exploration_env'], 11 | evaluation_env=config['evaluation_env'], 12 | replay_buffer=config['replay_buffer'], 13 | 
exploration_data_collector=expl_path_collector, 14 | evaluation_data_collector=eval_path_collector, 15 | **config['algorithm_kwargs'] 16 | ) 17 | 18 | return algorithm 19 | -------------------------------------------------------------------------------- /experiment_configs/algorithms/mbrl.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.core.rl_algorithms.torch_rl_algorithm import TorchMBRLAlgorithm 2 | 3 | 4 | def get_algorithm(config, expl_path_collector, eval_path_collector): 5 | 6 | algorithm = TorchMBRLAlgorithm( 7 | trainer=config['trainer'], 8 | exploration_policy=config['exploration_policy'], 9 | model_trainer=config['model_trainer'], 10 | exploration_env=config['exploration_env'], 11 | evaluation_env=config['evaluation_env'], 12 | replay_buffer=config['replay_buffer'], 13 | exploration_data_collector=expl_path_collector, 14 | evaluation_data_collector=eval_path_collector, 15 | **config['algorithm_kwargs'] 16 | ) 17 | 18 | return algorithm 19 | -------------------------------------------------------------------------------- /experiment_configs/algorithms/offline.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.core.rl_algorithms.torch_rl_algorithm import TorchOfflineRLAlgorithm 2 | 3 | 4 | def get_offline_algorithm(config, eval_path_collector): 5 | 6 | algorithm = TorchOfflineRLAlgorithm( 7 | trainer=config['trainer'], 8 | evaluation_policy=config['evaluation_policy'], 9 | evaluation_env=config['evaluation_env'], 10 | replay_buffer=config['replay_buffer'], 11 | evaluation_data_collector=eval_path_collector, 12 | **config['offline_kwargs'] 13 | ) 14 | 15 | return algorithm 16 | -------------------------------------------------------------------------------- /experiment_configs/algorithms/offline_mbrl.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.core.rl_algorithms.torch_rl_algorithm import TorchOfflineMBRLAlgorithm 2 | 3 | 4 | def get_offline_algorithm(config, eval_path_collector): 5 | 6 | algorithm = TorchOfflineMBRLAlgorithm( 7 | trainer=config['trainer'], 8 | evaluation_policy=config['evaluation_policy'], 9 | model_trainer=config['model_trainer'], 10 | evaluation_env=config['evaluation_env'], 11 | replay_buffer=config['replay_buffer'], 12 | evaluation_data_collector=eval_path_collector, 13 | **config['offline_kwargs'] 14 | ) 15 | 16 | return algorithm 17 | -------------------------------------------------------------------------------- /experiment_configs/algorithms/online.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.core.rl_algorithms.torch_rl_algorithm import TorchOnlineRLAlgorithm 2 | 3 | 4 | def get_algorithm(config, expl_path_collector, eval_path_collector): 5 | 6 | algorithm = TorchOnlineRLAlgorithm( 7 | trainer=config['trainer'], 8 | exploration_policy=config['exploration_policy'], 9 | exploration_env=config['exploration_env'], 10 | evaluation_env=config['evaluation_env'], 11 | replay_buffer=config['replay_buffer'], 12 | exploration_data_collector=expl_path_collector, 13 | evaluation_data_collector=eval_path_collector, 14 | **config['algorithm_kwargs'] 15 | ) 16 | 17 | return algorithm 18 | -------------------------------------------------------------------------------- /experiment_configs/configs/dads/dads_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from 
lifelong_rl.policies.base.latent_prior_policy import PriorLatentPolicy 4 | from lifelong_rl.policies.models.gaussian_policy import TanhGaussianPolicy 5 | from lifelong_rl.models.networks import FlattenMlp 6 | from lifelong_rl.trainers.dads.dads import DADSTrainer 7 | from lifelong_rl.trainers.dads.skill_dynamics import SkillDynamics 8 | from lifelong_rl.trainers.q_learning.sac import SACTrainer 9 | import lifelong_rl.torch.pytorch_util as ptu 10 | import lifelong_rl.util.pythonplusplus as ppp 11 | 12 | 13 | def get_config( 14 | variant, 15 | expl_env, 16 | eval_env, 17 | obs_dim, 18 | action_dim, 19 | replay_buffer, 20 | ): 21 | 22 | """ 23 | Policy construction 24 | """ 25 | 26 | M = variant['policy_kwargs']['layer_size'] 27 | latent_dim = variant['policy_kwargs']['latent_dim'] 28 | restrict_dim = variant['discriminator_kwargs']['restrict_input_size'] 29 | 30 | control_policy = TanhGaussianPolicy( 31 | obs_dim=obs_dim + latent_dim, 32 | action_dim=action_dim, 33 | hidden_sizes=[M, M], 34 | restrict_obs_dim=restrict_dim, 35 | ) 36 | 37 | prior = torch.distributions.uniform.Uniform( 38 | -ptu.ones(latent_dim), ptu.ones(latent_dim), 39 | ) 40 | 41 | policy = PriorLatentPolicy( 42 | policy=control_policy, 43 | prior=prior, 44 | unconditional=True, 45 | ) 46 | 47 | qf1, qf2, target_qf1, target_qf2 = ppp.group_init( 48 | 4, 49 | FlattenMlp, 50 | input_size=obs_dim + latent_dim + action_dim, 51 | output_size=1, 52 | hidden_sizes=[M, M], 53 | ) 54 | 55 | """ 56 | Discriminator 57 | """ 58 | 59 | discrim_kwargs = variant['discriminator_kwargs'] 60 | discriminator = SkillDynamics( 61 | observation_size=obs_dim if restrict_dim == 0 else restrict_dim, 62 | action_size=action_dim, 63 | latent_size=latent_dim, 64 | normalize_observations=discrim_kwargs.get('normalize_observations', True), 65 | fix_variance=discrim_kwargs.get('fix_variance', True), 66 | fc_layer_params=[discrim_kwargs['layer_size']] * discrim_kwargs['num_layers'], 67 | ) 68 | 69 | """ 70 | Policy trainer 71 | """ 72 | 73 | policy_trainer = SACTrainer( 74 | env=expl_env, 75 | policy=control_policy, 76 | qf1=qf1, 77 | qf2=qf2, 78 | target_qf1=target_qf1, 79 | target_qf2=target_qf2, 80 | **variant['policy_trainer_kwargs'], 81 | ) 82 | 83 | """ 84 | Setup of intrinsic control 85 | """ 86 | 87 | dads_type = variant.get('dads_type', 'onpolicy') 88 | if dads_type == 'onpolicy': 89 | trainer_class = DADSTrainer 90 | else: 91 | raise NotImplementedError('dads_type not recognized') 92 | 93 | trainer = trainer_class( 94 | control_policy=control_policy, 95 | discriminator=discriminator, 96 | replay_buffer=replay_buffer, 97 | replay_size=variant['generated_replay_buffer_size'], 98 | policy_trainer=policy_trainer, 99 | restrict_input_size=restrict_dim, 100 | **variant['trainer_kwargs'], 101 | ) 102 | 103 | """ 104 | Create config dict 105 | """ 106 | 107 | config = dict() 108 | config.update(dict( 109 | trainer=trainer, 110 | exploration_policy=policy, 111 | evaluation_policy=policy, 112 | exploration_env=expl_env, 113 | evaluation_env=eval_env, 114 | replay_buffer=replay_buffer, 115 | prior=prior, 116 | control_policy=control_policy, 117 | latent_dim=latent_dim, 118 | policy_trainer=policy_trainer, 119 | )) 120 | config['algorithm_kwargs'] = variant.get('algorithm_kwargs', dict()) 121 | config['offline_kwargs'] = variant.get('offline_kwargs', dict()) 122 | 123 | return config 124 | -------------------------------------------------------------------------------- /experiment_configs/configs/lisp/lisp_config.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from lifelong_rl.models.dynamics_models.probabilistic_ensemble import ProbabilisticEnsemble 4 | from lifelong_rl.policies.base.latent_prior_policy import PriorLatentPolicy 5 | from lifelong_rl.policies.models.gaussian_policy import TanhGaussianPolicy 6 | from lifelong_rl.models.networks import FlattenMlp 7 | from lifelong_rl.trainers.lisp.lisp import LiSPTrainer 8 | from lifelong_rl.trainers.dads.skill_dynamics import SkillDynamics 9 | from lifelong_rl.trainers.mbrl.mbrl import MBRLTrainer 10 | from lifelong_rl.trainers.q_learning.sac import SACTrainer 11 | import lifelong_rl.torch.pytorch_util as ptu 12 | import lifelong_rl.util.pythonplusplus as ppp 13 | 14 | 15 | def get_config( 16 | variant, 17 | expl_env, 18 | eval_env, 19 | obs_dim, 20 | action_dim, 21 | replay_buffer, 22 | ): 23 | 24 | """ 25 | Policy construction 26 | """ 27 | 28 | M = variant['policy_kwargs']['layer_size'] 29 | latent_dim = variant['policy_kwargs']['latent_dim'] 30 | restrict_dim = variant['discriminator_kwargs']['restrict_input_size'] 31 | 32 | control_policy = TanhGaussianPolicy( 33 | obs_dim=obs_dim + latent_dim, 34 | action_dim=action_dim, 35 | hidden_sizes=[M, M], 36 | restrict_obs_dim=restrict_dim, 37 | ) 38 | 39 | prior = torch.distributions.uniform.Uniform( 40 | -ptu.ones(latent_dim), ptu.ones(latent_dim), 41 | ) 42 | 43 | policy = PriorLatentPolicy( 44 | policy=control_policy, 45 | prior=prior, 46 | unconditional=True, 47 | ) 48 | 49 | qf1, qf2, target_qf1, target_qf2 = ppp.group_init( 50 | 4, 51 | FlattenMlp, 52 | input_size=obs_dim + latent_dim + action_dim, 53 | output_size=1, 54 | hidden_sizes=[M, M], 55 | ) 56 | 57 | """ 58 | Learned skill-practice distribution 59 | """ 60 | 61 | skill_practice_dist = TanhGaussianPolicy( 62 | obs_dim=obs_dim, 63 | action_dim=latent_dim, 64 | hidden_sizes=[M, M], 65 | ) 66 | 67 | prior_qf1, prior_qf2, prior_target_qf1, prior_target_qf2 = ppp.group_init( 68 | 4, 69 | FlattenMlp, 70 | input_size=obs_dim + latent_dim, 71 | output_size=1, 72 | hidden_sizes=[M, M], 73 | ) 74 | 75 | skill_practice_trainer = SACTrainer( 76 | env=expl_env, 77 | policy=skill_practice_dist, 78 | qf1=prior_qf1, 79 | qf2=prior_qf2, 80 | target_qf1=prior_target_qf1, 81 | target_qf2=prior_target_qf2, 82 | **variant['skill_practice_trainer_kwargs'], 83 | ) 84 | 85 | """ 86 | Discriminator 87 | """ 88 | 89 | discrim_kwargs = variant['discriminator_kwargs'] 90 | discriminator = SkillDynamics( 91 | observation_size=obs_dim if restrict_dim == 0 else restrict_dim, 92 | action_size=action_dim, 93 | latent_size=latent_dim, 94 | normalize_observations=True, 95 | fix_variance=True, 96 | fc_layer_params=[discrim_kwargs['layer_size']] * discrim_kwargs['num_layers'], 97 | # restrict_observation=0, # we handle this outside of skill-dynamics 98 | # use_latents_as_delta=variant.get('use_latents_as_delta', False), 99 | ) 100 | 101 | """ 102 | Policy trainer 103 | """ 104 | 105 | policy_trainer = SACTrainer( 106 | env=expl_env, 107 | policy=control_policy, 108 | qf1=qf1, 109 | qf2=qf2, 110 | target_qf1=target_qf1, 111 | target_qf2=target_qf2, 112 | **variant['policy_trainer_kwargs'], 113 | ) 114 | 115 | """ 116 | Model-based reinforcement learning (MBRL) dynamics models 117 | """ 118 | 119 | M = variant['mbrl_kwargs']['layer_size'] 120 | 121 | dynamics_model = ProbabilisticEnsemble( 122 | ensemble_size=variant['mbrl_kwargs']['ensemble_size'], 123 | obs_dim=obs_dim, 124 | action_dim=action_dim, 125 | 
hidden_sizes=[M, M, M, M], 126 | ) 127 | model_trainer = MBRLTrainer( 128 | ensemble=dynamics_model, 129 | **variant['mbrl_kwargs'], 130 | ) 131 | 132 | rollout_len_schedule = variant['rollout_len_schedule'] 133 | 134 | def rollout_len(train_steps): 135 | """ 136 | rollout_len_schedule: [a, b, len_a, len_b] 137 | linearly increase length from len_a -> len_b over epochs a -> b 138 | """ 139 | epoch = train_steps // 1000 140 | if epoch < rollout_len_schedule[0]: 141 | return 1 142 | elif epoch >= rollout_len_schedule[1]: 143 | return rollout_len_schedule[3] 144 | else: 145 | return int( 146 | (epoch - rollout_len_schedule[0]) / 147 | (rollout_len_schedule[1] - rollout_len_schedule[0]) * 148 | (rollout_len_schedule[3] - rollout_len_schedule[2]) 149 | ) + rollout_len_schedule[2] 150 | 151 | """ 152 | Setup of intrinsic control 153 | """ 154 | 155 | trainer = LiSPTrainer( 156 | skill_practice_dist=skill_practice_dist, 157 | skill_practice_trainer=skill_practice_trainer, 158 | dynamics_model=dynamics_model, 159 | rollout_len_func=rollout_len, 160 | control_policy=control_policy, 161 | discriminator=discriminator, 162 | replay_buffer=replay_buffer, 163 | replay_size=variant['generated_replay_buffer_size'], 164 | policy_trainer=policy_trainer, 165 | restrict_input_size=restrict_dim, 166 | **variant['trainer_kwargs'], 167 | ) 168 | 169 | """ 170 | Create config dict 171 | """ 172 | 173 | config = dict() 174 | config.update(dict( 175 | trainer=trainer, 176 | model_trainer=model_trainer, 177 | exploration_policy=policy, 178 | evaluation_policy=policy, 179 | exploration_env=expl_env, 180 | evaluation_env=eval_env, 181 | replay_buffer=replay_buffer, 182 | dynamics_model=dynamics_model, 183 | prior=prior, 184 | learned_prior=skill_practice_dist, 185 | skill_practice_trainer=skill_practice_trainer, 186 | control_policy=control_policy, 187 | latent_dim=latent_dim, 188 | policy_trainer=policy_trainer, 189 | rollout_len_func=rollout_len, 190 | )) 191 | config['algorithm_kwargs'] = variant.get('algorithm_kwargs', dict()) 192 | config['offline_kwargs'] = variant.get('offline_kwargs', dict()) 193 | 194 | return config 195 | -------------------------------------------------------------------------------- /experiment_configs/configs/mpc/loop_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from lifelong_rl.models.dynamics_models.probabilistic_ensemble import ProbabilisticEnsemble 4 | from lifelong_rl.policies.models.gaussian_policy import TanhGaussianPolicy 5 | from lifelong_rl.policies.mpc.mpc import MPCPolicy 6 | from lifelong_rl.models.networks import FlattenMlp 7 | from lifelong_rl.trainers.mbrl.mbrl import MBRLTrainer 8 | from lifelong_rl.trainers.mpc.mpc_trainer import MPPITrainer 9 | from lifelong_rl.trainers.multi_trainer import MultiTrainer 10 | from lifelong_rl.trainers.q_learning.sac import SACTrainer 11 | import lifelong_rl.util.pythonplusplus as ppp 12 | 13 | 14 | def value_func(obs, critic_policy=None, qf1=None, qf2=None): 15 | actions, *_ = critic_policy(obs) 16 | sa = torch.cat([obs, actions], dim=-1) 17 | q1, q2 = qf1(sa), qf2(sa) 18 | min_q = torch.min(q1, q2) 19 | return min_q 20 | 21 | 22 | def get_config( 23 | variant, 24 | expl_env, 25 | eval_env, 26 | obs_dim, 27 | action_dim, 28 | replay_buffer, 29 | ): 30 | """ 31 | Set up terminal value function 32 | """ 33 | 34 | M = variant['policy_kwargs']['layer_size'] 35 | 36 | critic_policy = TanhGaussianPolicy( 37 | obs_dim=obs_dim, 38 | action_dim=action_dim, 39 | 
hidden_sizes=[M, M], 40 | ) 41 | 42 | qf1, qf2, target_qf1, target_qf2 = ppp.group_init( 43 | 4, 44 | FlattenMlp, 45 | input_size=obs_dim + action_dim, 46 | output_size=1, 47 | hidden_sizes=[M, M], 48 | ) 49 | 50 | critic_policy_trainer = SACTrainer( 51 | env=expl_env, 52 | policy=critic_policy, 53 | qf1=qf1, 54 | qf2=qf2, 55 | target_qf1=target_qf1, 56 | target_qf2=target_qf2, 57 | **variant['policy_trainer_kwargs'], 58 | ) 59 | 60 | """ 61 | Set up dynamics model 62 | """ 63 | 64 | M = variant['mbrl_kwargs']['layer_size'] 65 | 66 | dynamics_model = ProbabilisticEnsemble( 67 | ensemble_size=variant['mbrl_kwargs']['ensemble_size'], 68 | obs_dim=obs_dim, 69 | action_dim=action_dim, 70 | hidden_sizes=[M, M, M, M], 71 | ) 72 | model_trainer = MBRLTrainer( 73 | ensemble=dynamics_model, 74 | **variant['mbrl_kwargs'], 75 | ) 76 | 77 | """ 78 | Set up MPC 79 | """ 80 | 81 | policy = MPCPolicy( 82 | env=expl_env, 83 | dynamics_model=dynamics_model, 84 | plan_dim=action_dim, 85 | value_func=value_func, 86 | value_func_kwargs=dict( 87 | critic_policy=critic_policy, 88 | qf1=qf1, 89 | qf2=qf2, 90 | ), 91 | **variant['mpc_kwargs'], 92 | ) 93 | trainer = MPPITrainer( 94 | policy=policy, 95 | ) 96 | 97 | trainer = MultiTrainer( 98 | trainers=[trainer, critic_policy_trainer], 99 | trainer_steps=[1, 1], 100 | trainer_names=['mpc_trainer', 'sac_trainer'], 101 | ) 102 | 103 | config = dict() 104 | config.update(dict( 105 | trainer=trainer, 106 | model_trainer=model_trainer, 107 | exploration_policy=policy, 108 | evaluation_policy=critic_policy, 109 | exploration_env=expl_env, 110 | evaluation_env=eval_env, 111 | replay_buffer=replay_buffer, 112 | dynamics_model=dynamics_model, 113 | )) 114 | config['algorithm_kwargs'] = variant.get('algorithm_kwargs', dict()) 115 | 116 | return config 117 | -------------------------------------------------------------------------------- /experiment_configs/configs/mpc/make_mpc_policy.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.policies.mpc.policy_mpc import PolicyMPCController 2 | 3 | 4 | def make_get_config(base_get_config): 5 | 6 | """ 7 | Convert an algorithm that has a skill policy into one which performs MPC 8 | over the space of skills. 
9 | """ 10 | 11 | def get_config( 12 | variant, 13 | expl_env, 14 | eval_env, 15 | obs_dim, 16 | action_dim, 17 | replay_buffer, 18 | ): 19 | config = base_get_config( 20 | variant, 21 | expl_env, 22 | eval_env, 23 | obs_dim, 24 | action_dim, 25 | replay_buffer, 26 | ) 27 | 28 | policy = PolicyMPCController( 29 | env=expl_env, 30 | dynamics_model=config['dynamics_model'], 31 | policy=config['control_policy'], 32 | latent_dim=config['latent_dim'], 33 | **variant['mppi_kwargs'], 34 | ) 35 | 36 | config['exploration_policy'] = policy 37 | 38 | if variant['use_as_eval_policy'] == 'mppi': 39 | config['evaluation_policy'] = policy 40 | 41 | return config 42 | 43 | return get_config 44 | -------------------------------------------------------------------------------- /experiment_configs/configs/mpc/mpc.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.models.dynamics_models.probabilistic_ensemble import ProbabilisticEnsemble 2 | from lifelong_rl.policies.mpc.mpc import MPCPolicy 3 | from lifelong_rl.trainers.mbrl.mbrl import MBRLTrainer 4 | from lifelong_rl.trainers.mpc.mpc_trainer import MPPITrainer 5 | 6 | 7 | def get_config( 8 | variant, 9 | expl_env, 10 | eval_env, 11 | obs_dim, 12 | action_dim, 13 | replay_buffer, 14 | ): 15 | 16 | """ 17 | Model-based reinforcement learning (MBRL) dynamics models 18 | """ 19 | 20 | M = variant['mbrl_kwargs']['layer_size'] 21 | 22 | dynamics_model = ProbabilisticEnsemble( 23 | ensemble_size=variant['mbrl_kwargs']['ensemble_size'], 24 | obs_dim=obs_dim, 25 | action_dim=action_dim, 26 | hidden_sizes=[M, M, M, M], 27 | ) 28 | model_trainer = MBRLTrainer( 29 | ensemble=dynamics_model, 30 | **variant['mbrl_kwargs'], 31 | ) 32 | 33 | """ 34 | Setup of MPPI policies 35 | """ 36 | 37 | policy = MPCPolicy( 38 | env=expl_env, 39 | dynamics_model=dynamics_model, 40 | plan_dim=action_dim, 41 | **variant['mpc_kwargs'], 42 | ) 43 | eval_policy = MPCPolicy( 44 | env=eval_env, 45 | dynamics_model=dynamics_model, 46 | plan_dim=action_dim, 47 | **variant['mpc_kwargs'], 48 | ) 49 | trainer = MPPITrainer( 50 | policy=policy, 51 | ) 52 | 53 | """ 54 | Create config dict 55 | """ 56 | 57 | config = dict() 58 | config.update(dict( 59 | trainer=trainer, 60 | model_trainer=model_trainer, 61 | exploration_policy=policy, 62 | evaluation_policy=eval_policy, 63 | exploration_env=expl_env, 64 | evaluation_env=eval_env, 65 | replay_buffer=replay_buffer, 66 | )) 67 | config['algorithm_kwargs'] = variant['algorithm_kwargs'] 68 | 69 | return config 70 | -------------------------------------------------------------------------------- /experiment_configs/configs/pg/npg_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from lifelong_rl.policies.base.base import MakeDeterministic 4 | from lifelong_rl.policies.models.gaussian_policy import TanhGaussianPolicy 5 | from lifelong_rl.models.networks import FlattenMlp 6 | from lifelong_rl.trainers.pg.npg import NPGTrainer 7 | import lifelong_rl.torch.pytorch_util as ptu 8 | 9 | 10 | def get_config( 11 | variant, 12 | expl_env, 13 | eval_env, 14 | obs_dim, 15 | action_dim, 16 | replay_buffer, 17 | ): 18 | 19 | M = variant['policy_kwargs']['layer_size'] 20 | 21 | # PG is very finicky with weight initializations 22 | 23 | policy = TanhGaussianPolicy( 24 | obs_dim=obs_dim, 25 | action_dim=action_dim, 26 | hidden_sizes=[M, M], 27 | hidden_activation=torch.tanh, 28 | b_init_value=0, 29 | w_scale=1.41, 30 | init_w=0.01, 31 | 
final_init_scale=0.01, 32 | std=0.5, 33 | hidden_init=ptu.orthogonal_init, 34 | ) 35 | 36 | M = variant['value_kwargs']['layer_size'] 37 | 38 | value_func = FlattenMlp( 39 | input_size=obs_dim, 40 | output_size=1, 41 | hidden_sizes=[M, M], 42 | hidden_activation=torch.tanh, 43 | hidden_init=ptu.orthogonal_init, 44 | b_init_value=0, 45 | final_init_scale=1, 46 | ) 47 | 48 | trainer = NPGTrainer( 49 | env=eval_env, 50 | policy=policy, 51 | value_func=value_func, 52 | **variant['policy_trainer_kwargs'], 53 | ) 54 | 55 | config = dict() 56 | config.update(dict( 57 | trainer=trainer, 58 | exploration_policy=policy, 59 | evaluation_policy=MakeDeterministic(policy), 60 | exploration_env=expl_env, 61 | evaluation_env=eval_env, 62 | replay_buffer=replay_buffer, 63 | )) 64 | config['algorithm_kwargs'] = variant['algorithm_kwargs'] 65 | 66 | return config 67 | -------------------------------------------------------------------------------- /experiment_configs/configs/pg/ppo_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from lifelong_rl.policies.base.base import MakeDeterministic 4 | from lifelong_rl.policies.models.gaussian_policy import TanhGaussianPolicy 5 | from lifelong_rl.models.networks import FlattenMlp 6 | from lifelong_rl.trainers.pg.ppo import PPOTrainer 7 | import lifelong_rl.torch.pytorch_util as ptu 8 | 9 | 10 | def get_config( 11 | variant, 12 | expl_env, 13 | eval_env, 14 | obs_dim, 15 | action_dim, 16 | replay_buffer, 17 | ): 18 | 19 | M = variant['policy_kwargs']['layer_size'] 20 | 21 | # PPO is very finicky with weight initializations 22 | 23 | policy = TanhGaussianPolicy( 24 | obs_dim=obs_dim, 25 | action_dim=action_dim, 26 | hidden_sizes=[M, M], 27 | hidden_activation=torch.tanh, 28 | b_init_value=0, 29 | w_scale=1.41, 30 | init_w=0.01, 31 | final_init_scale=0.01, 32 | std=0.5, 33 | hidden_init=ptu.orthogonal_init, 34 | ) 35 | 36 | M = variant['value_kwargs']['layer_size'] 37 | 38 | value_func = FlattenMlp( 39 | input_size=obs_dim, 40 | output_size=1, 41 | hidden_sizes=[M, M], 42 | hidden_activation=torch.tanh, 43 | hidden_init=ptu.orthogonal_init, 44 | b_init_value=0, 45 | final_init_scale=1, 46 | ) 47 | 48 | trainer = PPOTrainer( 49 | env=eval_env, 50 | policy=policy, 51 | value_func=value_func, 52 | **variant['policy_trainer_kwargs'], 53 | ) 54 | 55 | config = dict() 56 | config.update(dict( 57 | trainer=trainer, 58 | exploration_policy=policy, 59 | evaluation_policy=MakeDeterministic(policy), 60 | exploration_env=expl_env, 61 | evaluation_env=eval_env, 62 | replay_buffer=replay_buffer, 63 | )) 64 | config['algorithm_kwargs'] = variant['algorithm_kwargs'] 65 | 66 | return config 67 | -------------------------------------------------------------------------------- /experiment_configs/configs/pg/vpg_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from lifelong_rl.policies.base.base import MakeDeterministic 4 | from lifelong_rl.policies.models.gaussian_policy import TanhGaussianPolicy 5 | from lifelong_rl.models.networks import FlattenMlp 6 | from lifelong_rl.trainers.pg.pg import PGTrainer 7 | import lifelong_rl.torch.pytorch_util as ptu 8 | 9 | 10 | def get_config( 11 | variant, 12 | expl_env, 13 | eval_env, 14 | obs_dim, 15 | action_dim, 16 | replay_buffer, 17 | ): 18 | 19 | M = variant['policy_kwargs']['layer_size'] 20 | 21 | policy = TanhGaussianPolicy( 22 | obs_dim=obs_dim, 23 | action_dim=action_dim, 24 | hidden_sizes=[M, M], 25 | 
hidden_activation=torch.tanh, 26 | b_init_value=0, 27 | w_scale=1.41, 28 | init_w=0.01, 29 | final_init_scale=0.01, 30 | std=0.5, 31 | hidden_init=ptu.orthogonal_init, 32 | ) 33 | 34 | M = variant['value_kwargs']['layer_size'] 35 | 36 | value_func = FlattenMlp( 37 | input_size=obs_dim, 38 | output_size=1, 39 | hidden_sizes=[M, M], 40 | hidden_activation=torch.tanh, 41 | # hidden_init=ptu.orthogonal_init, 42 | # w_scale=1.41, 43 | b_init_value=0, 44 | final_init_scale=1, 45 | ) 46 | 47 | trainer = PGTrainer( 48 | env=eval_env, 49 | policy=policy, 50 | value_func=value_func, 51 | **variant['policy_trainer_kwargs'], 52 | ) 53 | 54 | config = dict() 55 | config.update(dict( 56 | trainer=trainer, 57 | exploration_policy=policy, 58 | evaluation_policy=MakeDeterministic(policy), 59 | exploration_env=expl_env, 60 | evaluation_env=eval_env, 61 | replay_buffer=replay_buffer, 62 | )) 63 | config['algorithm_kwargs'] = variant['algorithm_kwargs'] 64 | 65 | return config 66 | -------------------------------------------------------------------------------- /experiment_configs/configs/q_learning/cql_config.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.models.networks import FlattenMlp 2 | from lifelong_rl.policies.base.base import MakeDeterministic 3 | from lifelong_rl.policies.models.gaussian_policy import TanhGaussianPolicy 4 | from lifelong_rl.trainers.q_learning.cql import CQLTrainer 5 | import lifelong_rl.util.pythonplusplus as ppp 6 | 7 | 8 | def get_config( 9 | variant, 10 | expl_env, 11 | eval_env, 12 | obs_dim, 13 | action_dim, 14 | replay_buffer, 15 | ): 16 | """ 17 | Policy construction 18 | """ 19 | 20 | M = variant['policy_kwargs']['layer_size'] 21 | 22 | qf1, qf2, target_qf1, target_qf2 = ppp.group_init( 23 | 4, 24 | FlattenMlp, 25 | input_size=obs_dim + action_dim, 26 | output_size=1, 27 | hidden_sizes=[M, M], 28 | ) 29 | policy = TanhGaussianPolicy( 30 | obs_dim=obs_dim, 31 | action_dim=action_dim, 32 | hidden_sizes=[M, M], 33 | ) 34 | 35 | trainer = CQLTrainer( 36 | env=eval_env, 37 | policy=policy, 38 | qf1=qf1, 39 | qf2=qf2, 40 | target_qf1=target_qf1, 41 | target_qf2=target_qf2, 42 | **variant['trainer_kwargs'], 43 | ) 44 | 45 | """ 46 | Create config dict 47 | """ 48 | 49 | config = dict() 50 | config.update(dict( 51 | trainer=trainer, 52 | exploration_policy=policy, 53 | evaluation_policy=MakeDeterministic(policy), 54 | exploration_env=expl_env, 55 | evaluation_env=eval_env, 56 | replay_buffer=replay_buffer, 57 | )) 58 | 59 | return config 60 | -------------------------------------------------------------------------------- /experiment_configs/configs/q_learning/mbpo_config.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.data_management.replay_buffers.env_replay_buffer import EnvReplayBuffer 2 | from lifelong_rl.models.dynamics_models.probabilistic_ensemble import ProbabilisticEnsemble 3 | from lifelong_rl.models.networks import FlattenMlp 4 | from lifelong_rl.policies.base.base import MakeDeterministic 5 | from lifelong_rl.policies.models.gaussian_policy import TanhGaussianPolicy 6 | from lifelong_rl.trainers.mbrl.mbrl import MBRLTrainer 7 | from lifelong_rl.trainers.q_learning.mbpo import MBPOTrainer 8 | from lifelong_rl.trainers.q_learning.sac import SACTrainer 9 | import lifelong_rl.util.pythonplusplus as ppp 10 | 11 | 12 | def get_config( 13 | variant, 14 | expl_env, 15 | eval_env, 16 | obs_dim, 17 | action_dim, 18 | replay_buffer, 19 | ): 20 | 21 | """ 22 | 
Setup of soft actor critic (SAC), used as the policy optimization procedure of MBPO 23 | """ 24 | 25 | M = variant['policy_kwargs']['layer_size'] 26 | 27 | qf1, qf2, target_qf1, target_qf2 = ppp.group_init( 28 | 4, 29 | FlattenMlp, 30 | input_size=obs_dim + action_dim, 31 | output_size=1, 32 | hidden_sizes=[M, M], 33 | ) 34 | policy = TanhGaussianPolicy( 35 | obs_dim=obs_dim, 36 | action_dim=action_dim, 37 | hidden_sizes=[M, M], 38 | ) 39 | 40 | policy_trainer = SACTrainer( 41 | env=eval_env, 42 | policy=policy, 43 | qf1=qf1, 44 | qf2=qf2, 45 | target_qf1=target_qf1, 46 | target_qf2=target_qf2, 47 | **variant['trainer_kwargs']['policy_kwargs'] 48 | ) 49 | 50 | """ 51 | Model-based reinforcement learning (MBRL) dynamics models 52 | """ 53 | 54 | dynamics_model = ProbabilisticEnsemble( 55 | ensemble_size=variant['mbrl_kwargs']['ensemble_size'], 56 | obs_dim=obs_dim, 57 | action_dim=action_dim, 58 | hidden_sizes=variant['mbrl_kwargs']['hidden_sizes'], 59 | ) 60 | model_trainer = MBRLTrainer( 61 | ensemble=dynamics_model, 62 | **variant['mbrl_kwargs'], 63 | ) 64 | 65 | """ 66 | Setup of model-based policy optimization (MBPO) 67 | """ 68 | 69 | generated_replay_buffer = EnvReplayBuffer( 70 | variant['trainer_kwargs']['generated_buffer_size'], 71 | expl_env, 72 | ) 73 | 74 | rollout_len_schedule = variant['trainer_kwargs']['rollout_len_schedule'] 75 | 76 | def rollout_len(train_steps): 77 | """ 78 | rollout_len_schedule: [a, b, len_a, len_b] 79 | Linearly increase length from len_a -> len_b over epochs a -> b 80 | """ 81 | if 'algorithm_kwargs' in variant: 82 | epoch = train_steps // variant['algorithm_kwargs']['num_trains_per_train_loop'] 83 | else: 84 | epoch = 1 85 | if epoch < rollout_len_schedule[0]: 86 | return 1 87 | elif epoch >= rollout_len_schedule[1]: 88 | return rollout_len_schedule[3] 89 | else: 90 | return int( 91 | (epoch - rollout_len_schedule[0]) / \ 92 | (rollout_len_schedule[1] - rollout_len_schedule[0]) * \ 93 | (rollout_len_schedule[3] - rollout_len_schedule[2]) 94 | ) + 1 95 | 96 | trainer = MBPOTrainer( 97 | policy_trainer=policy_trainer, 98 | dynamics_model=dynamics_model, 99 | replay_buffer=replay_buffer, 100 | generated_data_buffer=generated_replay_buffer, 101 | rollout_len_func=rollout_len, 102 | **variant['trainer_kwargs'] 103 | ) 104 | 105 | """ 106 | Create config dict 107 | """ 108 | 109 | config = dict() 110 | config.update(dict( 111 | trainer=trainer, 112 | model_trainer=model_trainer, 113 | exploration_policy=policy, 114 | evaluation_policy=MakeDeterministic(policy), 115 | exploration_env=expl_env, 116 | evaluation_env=eval_env, 117 | replay_buffer=replay_buffer, 118 | )) 119 | 120 | return config 121 | -------------------------------------------------------------------------------- /experiment_configs/configs/q_learning/sac_config.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.models.networks import FlattenMlp 2 | from lifelong_rl.policies.base.base import MakeDeterministic 3 | from lifelong_rl.policies.models.gaussian_policy import TanhGaussianPolicy 4 | from lifelong_rl.trainers.q_learning.sac import SACTrainer 5 | import lifelong_rl.util.pythonplusplus as ppp 6 | 7 | 8 | def get_config( 9 | variant, 10 | expl_env, 11 | eval_env, 12 | obs_dim, 13 | action_dim, 14 | replay_buffer, 15 | ): 16 | """ 17 | Policy construction 18 | """ 19 | 20 | M = variant['policy_kwargs']['layer_size'] 21 | 22 | qf1, qf2, target_qf1, target_qf2 = ppp.group_init( 23 | 4, 24 | FlattenMlp, 25 | input_size=obs_dim + 
action_dim, 26 | output_size=1, 27 | hidden_sizes=[M, M], 28 | ) 29 | policy = TanhGaussianPolicy( 30 | obs_dim=obs_dim, 31 | action_dim=action_dim, 32 | hidden_sizes=[M, M], 33 | ) 34 | 35 | trainer = SACTrainer( 36 | env=eval_env, 37 | policy=policy, 38 | qf1=qf1, 39 | qf2=qf2, 40 | target_qf1=target_qf1, 41 | target_qf2=target_qf2, 42 | **variant['trainer_kwargs'], 43 | ) 44 | 45 | """ 46 | Create config dict 47 | """ 48 | 49 | config = dict() 50 | config.update(dict( 51 | trainer=trainer, 52 | exploration_policy=policy, 53 | evaluation_policy=MakeDeterministic(policy), 54 | exploration_env=expl_env, 55 | evaluation_env=eval_env, 56 | replay_buffer=replay_buffer, 57 | )) 58 | config['algorithm_kwargs'] = variant['algorithm_kwargs'] 59 | 60 | return config 61 | -------------------------------------------------------------------------------- /experiment_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/experiment_utils/__init__.py -------------------------------------------------------------------------------- /experiment_utils/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 4 | 5 | DOCKER_MOUNT_DIR = '/root/code/data' 6 | 7 | DATA_DIR = os.path.join(BASE_DIR, 'data') 8 | 9 | DOCKER_IMAGE = None 10 | 11 | S3_BUCKET_NAME = None 12 | 13 | WANDB_PROJECT = None 14 | -------------------------------------------------------------------------------- /experiment_utils/sweeper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | import random 4 | 5 | 6 | def set_dict_key(dict, path, value): 7 | if len(path) == 1: 8 | dict[path[0]] = value 9 | else: 10 | set_dict_key(dict[path[0]], path[1:], value) 11 | 12 | 13 | def generate_variants(base_variant, sweep_values, num_seeds=1): 14 | variants = [] 15 | for _ in range(num_seeds): 16 | for params in itertools.product(*[s for s in sweep_values.values()]): 17 | variant = copy.deepcopy(base_variant) 18 | for i, loc in enumerate(sweep_values.keys()): 19 | path = loc.split('/') 20 | set_dict_key(variant, path, params[i]) 21 | variant['seed'] = random.randint(0, int(1e8)) 22 | variants.append(variant) 23 | return variants 24 | -------------------------------------------------------------------------------- /experiment_utils/teacher_data.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | 4 | def add_transitions(replay_buffer, data_file, obs_dim, action_dim, max_transitions=int(1e8)): 5 | with open('agent_data/%s.pkl' % data_file, 'rb') as f: 6 | transitions = pickle.load(f) 7 | 8 | # method signature: add_sample(obs, act, r, d, next_obs, info) 9 | n_transitions = min(len(transitions), max_transitions) 10 | 11 | # in form (s, a, r, d, s') 12 | for t in range(n_transitions): 13 | replay_buffer.add_sample( 14 | transitions[t, :obs_dim], 15 | transitions[t, obs_dim:obs_dim + action_dim], 16 | transitions[t, obs_dim + action_dim:obs_dim + action_dim + 1], 17 | transitions[t, obs_dim + action_dim + 1:obs_dim + action_dim + 2], 18 | transitions[t, -obs_dim:], 19 | env_info={}, 20 | ) 21 | -------------------------------------------------------------------------------- /experiment_utils/utils.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import glob 5 | 6 | 7 | def query_yes_no(question, default="no", allow_skip=False): 8 | """Ask a yes/no question via raw_input() and return their answer. 9 | 10 | "question" is a string that is presented to the user. 11 | "default" is the presumed answer if the user just hits . 12 | It must be "yes" (the default), "no" or None (meaning 13 | an answer is required of the user). 14 | 15 | The "answer" return value is True for "yes" or False for "no". 16 | """ 17 | valid = {"yes": True, "y": True, "ye": True, 18 | "no": False, "n": False} 19 | if allow_skip: 20 | valid["skip"] = "skip" 21 | if default is None: 22 | prompt = " [y/n] " 23 | elif default == "yes": 24 | prompt = " [Y/n] " 25 | elif default == "no": 26 | prompt = " [y/N] " 27 | else: 28 | raise ValueError("invalid default answer: '%s'" % default) 29 | if allow_skip: 30 | prompt += " or skip" 31 | while True: 32 | sys.stdout.write(question + prompt) 33 | choice = input().lower() 34 | if default is not None and choice == '': 35 | return valid[default] 36 | elif choice in valid: 37 | return valid[choice] 38 | else: 39 | sys.stdout.write("Please respond with 'yes' or 'no' " 40 | "(or 'y' or 'n').\n") 41 | 42 | 43 | def load_exps_data(exp_path, gap=1, max=None): 44 | exp_folder_paths = [os.path.abspath(x) for x in glob.iglob(exp_path)] 45 | exps = [] 46 | for exp_folder_path in exp_folder_paths: 47 | exps += [x[0] for x in os.walk(exp_folder_path)] 48 | exps_data = [] 49 | for exp in exps: 50 | try: 51 | exp_path = exp 52 | params_json = load_json(os.path.join(exp_path, "params.json")) 53 | progress_csv_path = os.path.join(exp_path, "progress.csv") 54 | pkl_paths = [] 55 | if gap > 0: 56 | for pkl_path in glob.iglob(os.path.join(exp_path, '*.pkl')): 57 | pkl_paths.append(pkl_path) 58 | pkl_paths.sort(key=lambda x: int(x.split('_')[-1][:-4])) 59 | pkl_paths = pkl_paths[:max:gap] 60 | exps_data.append(dict(csv=progress_csv_path, json=params_json, 61 | pkl=pkl_paths, exp_name=exp_path)) 62 | except IOError as e: 63 | print(e) 64 | return exps_data 65 | 66 | 67 | def load_json(params_json_path): 68 | with open(params_json_path, 'r') as f: 69 | data = json.loads(f.read()) 70 | if "args_data" in data: 71 | del data["args_data"] 72 | if "exp_name" not in data: 73 | data["exp_name"] = params_json_path.split("/")[-2] 74 | return data 75 | -------------------------------------------------------------------------------- /lifelong_rl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/core/__init__.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.core.logging.logging import logger 2 | 3 | 4 | logger = logger 5 | -------------------------------------------------------------------------------- /lifelong_rl/core/logging/logging_setup.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import os 4 | import os.path as osp 5 | from collections import namedtuple 6 | import dateutil.tz 7 | import wandb 8 | 9 | from lifelong_rl.core.logging.logging import logger 10 | import experiment_utils.config as config 11 | 12 | GitInfo = namedtuple( 13 | 'GitInfo', 
14 | [ 15 | 'directory', 16 | 'code_diff', 17 | 'code_diff_staged', 18 | 'commit_hash', 19 | 'branch_name', 20 | ], 21 | ) 22 | 23 | 24 | def get_git_infos(dirs): 25 | try: 26 | import git 27 | git_infos = [] 28 | for directory in dirs: 29 | # Idk how to query these things, so I'm just doing try-catch 30 | try: 31 | repo = git.Repo(directory) 32 | try: 33 | branch_name = repo.active_branch.name 34 | except TypeError: 35 | branch_name = '[DETACHED]' 36 | git_infos.append(GitInfo( 37 | directory=directory, 38 | code_diff=repo.git.diff(None), 39 | code_diff_staged=repo.git.diff('--staged'), 40 | commit_hash=repo.head.commit.hexsha, 41 | branch_name=branch_name, 42 | )) 43 | except git.exc.InvalidGitRepositoryError as e: 44 | print("Not a valid git repo: {}".format(directory)) 45 | except ImportError: 46 | git_infos = None 47 | return git_infos 48 | 49 | 50 | def create_exp_name(exp_prefix, exp_id=0, seed=0): 51 | now = datetime.datetime.now(dateutil.tz.tzlocal()) 52 | timestamp = now.strftime('%Y_%m_%d_%H_%M_%S') 53 | return "%s_%s_%04d--s-%d" % (exp_prefix, timestamp, exp_id, seed) 54 | 55 | 56 | def create_log_dir( 57 | exp_prefix, 58 | exp_id=0, 59 | seed=0, 60 | base_log_dir=None, 61 | include_exp_prefix_sub_dir=True, 62 | ): 63 | exp_name = create_exp_name(exp_prefix, exp_id=exp_id, 64 | seed=seed) 65 | if base_log_dir is None: 66 | base_log_dir = os.getcwd() + '/data/' 67 | if include_exp_prefix_sub_dir: 68 | log_dir = osp.join(base_log_dir, exp_prefix.replace("_", "-"), exp_name) 69 | else: 70 | log_dir = osp.join(base_log_dir, exp_name) 71 | if osp.exists(log_dir): 72 | print("WARNING: Log directory already exists {}".format(log_dir)) 73 | os.makedirs(log_dir, exist_ok=True) 74 | return log_dir 75 | 76 | 77 | def setup_logger( 78 | exp_prefix="default", 79 | variant=None, 80 | text_log_file="stdout.log", 81 | variant_log_file="variant.json", 82 | tabular_log_file="progress.csv", 83 | log_to_wandb=False, 84 | snapshot_mode="all", 85 | snapshot_gap=1, 86 | log_tabular_only=False, 87 | log_dir=None, 88 | git_infos=None, 89 | script_name=None, 90 | **create_log_dir_kwargs 91 | ): 92 | log_dir = create_log_dir(exp_prefix, **create_log_dir_kwargs) 93 | logger.log_dir = log_dir 94 | 95 | text_log_path = osp.join(log_dir, text_log_file) 96 | tabular_log_path = osp.join(log_dir, tabular_log_file) 97 | 98 | logger.set_text_output(text_log_path) 99 | logger.set_tabular_output(tabular_log_path) 100 | 101 | logger.set_snapshot_dir(log_dir) 102 | 103 | logger.set_snapshot_mode(snapshot_mode) 104 | logger.set_snapshot_gap(snapshot_gap) 105 | logger.set_log_tabular_only(log_tabular_only) 106 | 107 | exp_name = log_dir.split("/")[-1] 108 | logger.push_prefix("[%s] " % exp_name) 109 | 110 | if variant is not None: 111 | logger.log("Variant:") 112 | logger.log(json.dumps(dict_to_safe_json(variant), indent=2)) 113 | variant_log_path = osp.join(log_dir, variant_log_file) 114 | logger.log_variant(variant_log_path, variant) 115 | 116 | if log_to_wandb: 117 | logger.log_to_wandb = True 118 | name = os.path.split(log_dir)[1][len(exp_prefix)+1:] 119 | wandb.init(name=name, config=variant, project=config.WANDB_PROJECT, group=exp_prefix) 120 | 121 | if git_infos is not None: 122 | for ( 123 | directory, code_diff, code_diff_staged, commit_hash, branch_name 124 | ) in git_infos: 125 | if directory[-1] == '/': 126 | directory = directory[:-1] 127 | diff_file_name = directory[1:].replace("/", "-") + ".patch" 128 | diff_staged_file_name = ( 129 | directory[1:].replace("/", "-") + "_staged.patch" 130 | ) 131 | if 
code_diff is not None and len(code_diff) > 0: 132 | with open(osp.join(log_dir, diff_file_name), "w") as f: 133 | f.write(code_diff + '\n') 134 | if code_diff_staged is not None and len(code_diff_staged) > 0: 135 | with open(osp.join(log_dir, diff_staged_file_name), "w") as f: 136 | f.write(code_diff_staged + '\n') 137 | with open(osp.join(log_dir, "git_infos.txt"), "a") as f: 138 | f.write("directory: {}\n".format(directory)) 139 | f.write("git hash: {}\n".format(commit_hash)) 140 | f.write("git branch name: {}\n\n".format(branch_name)) 141 | if script_name is not None: 142 | with open(osp.join(log_dir, "script_name.txt"), "w") as f: 143 | f.write(script_name) 144 | return log_dir 145 | 146 | 147 | def dict_to_safe_json(d): 148 | new_d = {} 149 | for key, item in d.items(): 150 | if safe_json(item): 151 | new_d[key] = item 152 | else: 153 | if isinstance(item, dict): 154 | new_d[key] = dict_to_safe_json(item) 155 | else: 156 | new_d[key] = str(item) 157 | return new_d 158 | 159 | 160 | def safe_json(data): 161 | if data is None: 162 | return True 163 | elif isinstance(data, (bool, int, float)): 164 | return True 165 | elif isinstance(data, (tuple, list)): 166 | return all(safe_json(x) for x in data) 167 | elif isinstance(data, dict): 168 | return all(isinstance(k, str) and safe_json(v) for k, v in data.items()) 169 | return False 170 | -------------------------------------------------------------------------------- /lifelong_rl/core/rl_algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/core/rl_algorithms/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/core/rl_algorithms/batch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/core/rl_algorithms/batch/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/core/rl_algorithms/batch/batch_rl_algorithm.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | import gtimer as gt 4 | from lifelong_rl.core.rl_algorithms.rl_algorithm import BaseRLAlgorithm 5 | 6 | 7 | class BatchRLAlgorithm(BaseRLAlgorithm, metaclass=abc.ABCMeta): 8 | 9 | def __init__( 10 | self, 11 | trainer, 12 | exploration_policy, 13 | exploration_env, 14 | evaluation_env, 15 | exploration_data_collector, 16 | evaluation_data_collector, 17 | replay_buffer, 18 | max_path_length, 19 | num_epochs, 20 | num_eval_steps_per_epoch, 21 | num_expl_steps_per_train_loop, 22 | num_trains_per_train_loop=1, 23 | num_train_loops_per_epoch=1, 24 | min_num_steps_before_training=0, 25 | save_snapshot_freq=100, 26 | post_epoch_funcs=None, 27 | ): 28 | super().__init__( 29 | trainer, 30 | exploration_policy, 31 | exploration_env, 32 | evaluation_env, 33 | exploration_data_collector, 34 | evaluation_data_collector, 35 | replay_buffer, 36 | save_snapshot_freq=save_snapshot_freq, 37 | post_epoch_funcs=post_epoch_funcs, 38 | ) 39 | 40 | self.max_path_length = max_path_length 41 | self.num_epochs = num_epochs 42 | self.num_eval_steps_per_epoch = num_eval_steps_per_epoch 43 | self.num_trains_per_train_loop = num_trains_per_train_loop 44 | self.num_train_loops_per_epoch = num_train_loops_per_epoch 45 | self.num_expl_steps_per_train_loop = 
num_expl_steps_per_train_loop 46 | self.min_num_steps_before_training = min_num_steps_before_training 47 | 48 | def _train(self): 49 | if self.min_num_steps_before_training > 0: 50 | init_expl_paths = self.expl_data_collector.collect_new_paths( 51 | self.max_path_length, 52 | self.min_num_steps_before_training, 53 | discard_incomplete_paths=False, 54 | ) 55 | self.replay_buffer.add_paths(init_expl_paths) 56 | self.expl_data_collector.end_epoch(-1) 57 | 58 | self._fit_input_stats() 59 | 60 | for epoch in gt.timed_for( 61 | range(self._start_epoch, self.num_epochs), 62 | save_itrs=True, 63 | ): 64 | self.eval_data_collector.collect_new_paths( 65 | self.max_path_length, 66 | self.num_eval_steps_per_epoch, 67 | discard_incomplete_paths=True, 68 | ) 69 | gt.stamp('evaluation sampling') 70 | 71 | for _ in range(self.num_train_loops_per_epoch): 72 | new_expl_paths = self.expl_data_collector.collect_new_paths( 73 | self.max_path_length, 74 | self.num_expl_steps_per_train_loop, 75 | discard_incomplete_paths=False, 76 | ) 77 | gt.stamp('exploration sampling', unique=False) 78 | 79 | self.replay_buffer.add_paths(new_expl_paths) 80 | gt.stamp('data storing', unique=False) 81 | 82 | self.training_mode(True) 83 | for _ in range(self.num_trains_per_train_loop): 84 | self.trainer.train_from_paths(new_expl_paths) 85 | gt.stamp('training', unique=False) 86 | self.training_mode(False) 87 | 88 | self._fit_input_stats() 89 | 90 | self._end_epoch(epoch) 91 | -------------------------------------------------------------------------------- /lifelong_rl/core/rl_algorithms/batch/mb_batch_rl_algorithm.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | import gtimer as gt 4 | 5 | from lifelong_rl.core.rl_algorithms.batch.batch_rl_algorithm import BatchRLAlgorithm 6 | 7 | 8 | class MBBatchRLAlgorithm(BatchRLAlgorithm, metaclass=abc.ABCMeta): 9 | 10 | def __init__( 11 | self, 12 | model_trainer, 13 | model_batch_size, 14 | model_max_grad_steps=int(1e3), 15 | model_epochs_since_last_update=5, 16 | *args, 17 | **kwargs, 18 | ): 19 | super().__init__(*args, **kwargs) 20 | 21 | self.model_trainer = model_trainer 22 | self.model_batch_size = model_batch_size 23 | self.model_max_grad_steps = model_max_grad_steps 24 | self.model_epochs_since_last_update = model_epochs_since_last_update 25 | 26 | def _train(self): 27 | if self.min_num_steps_before_training > 0: 28 | init_expl_paths = self.expl_data_collector.collect_new_paths( 29 | self.max_path_length, 30 | self.min_num_steps_before_training, 31 | discard_incomplete_paths=False, 32 | ) 33 | self.replay_buffer.add_paths(init_expl_paths) 34 | self.expl_data_collector.end_epoch(-1) 35 | 36 | self._fit_input_stats() 37 | 38 | for epoch in gt.timed_for( 39 | range(self._start_epoch, self.num_epochs), 40 | save_itrs=True, 41 | ): 42 | self.eval_data_collector.collect_new_paths( 43 | self.max_path_length, 44 | self.num_eval_steps_per_epoch, 45 | discard_incomplete_paths=True, 46 | ) 47 | gt.stamp('evaluation sampling') 48 | 49 | self.training_mode(True) 50 | if self.replay_buffer.num_steps_can_sample() > 0: 51 | self.model_trainer.train_from_buffer( 52 | self.replay_buffer, 53 | max_grad_steps=self.model_max_grad_steps, 54 | epochs_since_last_update=self.model_epochs_since_last_update, 55 | ) 56 | gt.stamp('model training', unique=False) 57 | 58 | for _ in range(self.num_train_loops_per_epoch): 59 | new_expl_paths = self.expl_data_collector.collect_new_paths( 60 | self.max_path_length, 61 | self.num_expl_steps_per_train_loop, 
62 | discard_incomplete_paths=False, 63 | ) 64 | gt.stamp('exploration sampling', unique=False) 65 | 66 | self.replay_buffer.add_paths(new_expl_paths) 67 | gt.stamp('data storing', unique=False) 68 | 69 | self.training_mode(True) 70 | for _ in range(self.num_trains_per_train_loop): 71 | self.trainer.train_from_paths(new_expl_paths) 72 | gt.stamp('training', unique=False) 73 | self.training_mode(False) 74 | 75 | self._fit_input_stats() 76 | 77 | self._end_epoch(epoch) 78 | 79 | def _get_training_diagnostics_dict(self): 80 | training_diagnostics = super()._get_training_diagnostics_dict() 81 | training_diagnostics['model_trainer'] = self.model_trainer.get_diagnostics() 82 | return training_diagnostics 83 | 84 | def _get_snapshot(self): 85 | snapshot = super()._get_snapshot() 86 | for k, v in self.model_trainer.get_snapshot().items(): 87 | snapshot['model/' + k] = v 88 | return snapshot 89 | 90 | def _end_epochs(self, epoch): 91 | super()._end_epochs(epoch) 92 | self.model_trainer.end_epoch(epoch) 93 | 94 | -------------------------------------------------------------------------------- /lifelong_rl/core/rl_algorithms/offline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/core/rl_algorithms/offline/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/core/rl_algorithms/offline/mb_offline_rl_algorithm.py: -------------------------------------------------------------------------------- 1 | import gtimer as gt 2 | 3 | import abc 4 | 5 | from lifelong_rl.core.rl_algorithms.offline.offline_rl_algorithm import OfflineRLAlgorithm 6 | 7 | 8 | class OfflineMBRLAlgorithm(OfflineRLAlgorithm, metaclass=abc.ABCMeta): 9 | 10 | def __init__( 11 | self, 12 | model_trainer, 13 | model_batch_size, 14 | model_max_grad_steps=int(1e7), # The model will train until either this number of grad steps 15 | model_epochs_since_last_update=10, # or until holdout loss converged for this number of epochs 16 | train_at_start=True, # Flag for debugging 17 | *args, 18 | **kwargs, 19 | ): 20 | super().__init__(*args, **kwargs) 21 | 22 | self.model_trainer = model_trainer 23 | self.model_batch_size = model_batch_size 24 | self.model_max_grad_steps = model_max_grad_steps 25 | self.model_epochs_since_last_update = model_epochs_since_last_update 26 | self.train_at_start = train_at_start 27 | 28 | def _train(self): 29 | # Pretrain the model at the beginning of training until convergence 30 | # Note that convergence is measured against a holdout set of max size 8192 31 | if self.train_at_start: 32 | self.model_trainer.train_from_buffer( 33 | self.replay_buffer, 34 | max_grad_steps=self.model_max_grad_steps, 35 | epochs_since_last_update=self.model_epochs_since_last_update, 36 | ) 37 | gt.stamp('model training', unique=False) 38 | 39 | for epoch in gt.timed_for( 40 | range(self._start_epoch, self.num_epochs), 41 | save_itrs=True, 42 | ): 43 | self.eval_data_collector.collect_new_paths( 44 | self.max_path_length, 45 | self.num_eval_steps_per_epoch, 46 | discard_incomplete_paths=True, 47 | ) 48 | gt.stamp('evaluation sampling') 49 | 50 | self.training_mode(True) 51 | for _ in range(self.num_train_loops_per_epoch): 52 | for t in range(self.num_trains_per_train_loop): 53 | train_data = self.replay_buffer.random_batch(self.batch_size) 54 | self.trainer.train(train_data) 55 | gt.stamp('policy training', unique=False) 56 | 
self.training_mode(False) 57 | 58 | self._end_epoch(epoch) 59 | 60 | def _get_training_diagnostics_dict(self): 61 | training_diagnostics = super()._get_training_diagnostics_dict() 62 | training_diagnostics['model_trainer'] = self.model_trainer.get_diagnostics() 63 | return training_diagnostics 64 | 65 | def _get_snapshot(self): 66 | snapshot = super()._get_snapshot() 67 | for k, v in self.model_trainer.get_snapshot().items(): 68 | snapshot['model/' + k] = v 69 | return snapshot 70 | 71 | def _end_epochs(self, epoch): 72 | super()._end_epochs(epoch) 73 | self.model_trainer.end_epoch(epoch) 74 | -------------------------------------------------------------------------------- /lifelong_rl/core/rl_algorithms/offline/offline_rl_algorithm.py: -------------------------------------------------------------------------------- 1 | import gtimer as gt 2 | 3 | import abc 4 | 5 | from lifelong_rl.core import logger 6 | from lifelong_rl.core.rl_algorithms.rl_algorithm import _get_epoch_timings 7 | from lifelong_rl.util import eval_util 8 | 9 | 10 | class OfflineRLAlgorithm(object, metaclass=abc.ABCMeta): 11 | 12 | def __init__( 13 | self, 14 | trainer, 15 | evaluation_policy, 16 | evaluation_env, 17 | evaluation_data_collector, 18 | replay_buffer, 19 | batch_size, 20 | max_path_length, 21 | num_epochs, 22 | num_eval_steps_per_epoch, 23 | num_trains_per_train_loop, 24 | num_train_loops_per_epoch=1, 25 | save_snapshot_freq=10, 26 | ): 27 | self.trainer = trainer 28 | self.eval_policy = evaluation_policy 29 | self.eval_env = evaluation_env 30 | self.eval_data_collector = evaluation_data_collector 31 | self.replay_buffer = replay_buffer 32 | 33 | self.batch_size = batch_size 34 | self.max_path_length = max_path_length 35 | self.num_epochs = num_epochs 36 | self.num_eval_steps_per_epoch = num_eval_steps_per_epoch 37 | self.num_trains_per_train_loop = num_trains_per_train_loop 38 | self.num_train_loops_per_epoch = num_train_loops_per_epoch 39 | self.save_snapshot_freq = save_snapshot_freq 40 | 41 | self._start_epoch = 0 42 | self.post_epoch_funcs = [] 43 | 44 | def _train(self): 45 | for epoch in gt.timed_for( 46 | range(self._start_epoch, self.num_epochs), 47 | save_itrs=True, 48 | ): 49 | self.eval_data_collector.collect_new_paths( 50 | self.max_path_length, 51 | self.num_eval_steps_per_epoch, 52 | discard_incomplete_paths=True, 53 | ) 54 | gt.stamp('evaluation sampling') 55 | 56 | self.training_mode(True) 57 | for _ in range(self.num_train_loops_per_epoch): 58 | for _ in range(self.num_trains_per_train_loop): 59 | train_data = self.replay_buffer.random_batch(self.batch_size) 60 | self.trainer.train(train_data) 61 | self.training_mode(False) 62 | gt.stamp('training') 63 | 64 | self._end_epoch(epoch) 65 | 66 | def train(self, start_epoch=0): 67 | self._start_epoch = start_epoch 68 | self._train() 69 | 70 | def _end_epoch(self, epoch): 71 | snapshot = self._get_snapshot() 72 | if self.save_snapshot_freq is not None and \ 73 | (epoch + 1) % self.save_snapshot_freq == 0: 74 | logger.save_itr_params(epoch, snapshot, prefix='offline_itr') 75 | gt.stamp('saving', unique=False) 76 | 77 | self._log_stats(epoch) 78 | 79 | self._end_epochs(epoch) 80 | 81 | for post_epoch_func in self.post_epoch_funcs: 82 | post_epoch_func(self, epoch) 83 | 84 | def _get_snapshot(self): 85 | snapshot = {} 86 | for k, v in self.trainer.get_snapshot().items(): 87 | snapshot['trainer/' + k] = v 88 | for k, v in self.eval_data_collector.get_snapshot().items(): 89 | snapshot['evaluation/' + k] = v 90 | for k, v in 
self.replay_buffer.get_snapshot().items(): 91 | snapshot['replay_buffer/' + k] = v 92 | return snapshot 93 | 94 | def _end_epochs(self, epoch): 95 | self.eval_data_collector.end_epoch(epoch) 96 | self.trainer.end_epoch(epoch) 97 | 98 | if hasattr(self.eval_policy, 'end_epoch'): 99 | self.eval_policy.end_epoch(epoch) 100 | 101 | def _get_trainer_diagnostics(self): 102 | return self.trainer.get_diagnostics() 103 | 104 | def _get_training_diagnostics_dict(self): 105 | return {'policy_trainer': self._get_trainer_diagnostics()} 106 | 107 | def _log_stats(self, epoch): 108 | logger.log("Epoch {} finished".format(epoch), with_timestamp=True) 109 | 110 | """ 111 | Replay Buffer 112 | """ 113 | logger.record_dict( 114 | self.replay_buffer.get_diagnostics(), 115 | prefix='replay_buffer/' 116 | ) 117 | 118 | """ 119 | Trainer 120 | """ 121 | training_diagnostics = self._get_training_diagnostics_dict() 122 | for prefix in training_diagnostics: 123 | logger.record_dict(training_diagnostics[prefix], prefix=prefix + '/') 124 | 125 | """ 126 | Evaluation 127 | """ 128 | if self.num_eval_steps_per_epoch > 0: 129 | logger.record_dict( 130 | self.eval_data_collector.get_diagnostics(), 131 | prefix='evaluation/', 132 | ) 133 | eval_paths = self.eval_data_collector.get_epoch_paths() 134 | if hasattr(self.eval_env, 'get_diagnostics'): 135 | logger.record_dict( 136 | self.eval_env.get_diagnostics(eval_paths), 137 | prefix='evaluation/', 138 | ) 139 | logger.record_dict( 140 | eval_util.get_generic_path_information(eval_paths), 141 | prefix="evaluation/", 142 | ) 143 | 144 | """ 145 | Misc 146 | """ 147 | logger.record_dict(_get_epoch_timings()) 148 | logger.record_tabular('Epoch', epoch) 149 | logger.dump_tabular(with_prefix=False, with_timestamp=False) 150 | gt.stamp('logging', unique=False) 151 | 152 | @abc.abstractmethod 153 | def training_mode(self, mode): 154 | """ 155 | Set training mode to `mode`. 156 | :param mode: If True, training will happen (e.g. set the dropout 157 | probabilities to not all ones). 
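        A minimal concrete implementation (a sketch only; the Torch* subclasses
        in torch_rl_algorithm.py below do exactly this) toggles train/eval on
        every network owned by the trainer:

        ```
        def training_mode(self, mode):
            for net in self.trainer.networks:
                net.train(mode)
        ```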
158 | """ 159 | pass 160 | -------------------------------------------------------------------------------- /lifelong_rl/core/rl_algorithms/online/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/core/rl_algorithms/online/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/core/rl_algorithms/online/mbrl_algorithm.py: -------------------------------------------------------------------------------- 1 | import gtimer as gt 2 | 3 | import abc 4 | 5 | from lifelong_rl.core.rl_algorithms.rl_algorithm import BaseRLAlgorithm 6 | 7 | 8 | class MBRLAlgorithm(BaseRLAlgorithm, metaclass=abc.ABCMeta): 9 | 10 | def __init__( 11 | self, 12 | trainer, 13 | model_trainer, 14 | exploration_policy, 15 | exploration_env, 16 | evaluation_env, 17 | exploration_data_collector, 18 | evaluation_data_collector, 19 | replay_buffer, 20 | batch_size, 21 | model_batch_size, 22 | max_path_length, 23 | num_epochs, 24 | num_eval_steps_per_epoch, 25 | num_expl_steps_per_train_loop, 26 | num_trains_per_train_loop, 27 | num_model_trains_per_train_loop, 28 | num_train_loops_per_epoch=1, 29 | min_num_steps_before_training=0, 30 | initial_training_steps=0, 31 | save_snapshot_freq=10, 32 | post_epoch_funcs=None, 33 | ): 34 | super().__init__( 35 | trainer, 36 | exploration_policy, 37 | exploration_env, 38 | evaluation_env, 39 | exploration_data_collector, 40 | evaluation_data_collector, 41 | replay_buffer, 42 | save_snapshot_freq=save_snapshot_freq, 43 | post_epoch_funcs=post_epoch_funcs, 44 | ) 45 | 46 | self.model_trainer = model_trainer 47 | self.batch_size = batch_size 48 | self.model_batch_size = model_batch_size 49 | self.max_path_length = max_path_length 50 | self.num_epochs = num_epochs 51 | self.num_eval_steps_per_epoch = num_eval_steps_per_epoch 52 | self.num_trains_per_train_loop = num_trains_per_train_loop 53 | self.num_model_trains_per_train_loop = num_model_trains_per_train_loop 54 | self.num_train_loops_per_epoch = num_train_loops_per_epoch 55 | self.num_expl_steps_per_train_loop = num_expl_steps_per_train_loop 56 | self.min_num_steps_before_training = min_num_steps_before_training 57 | self.initial_training_steps = initial_training_steps 58 | 59 | def _get_training_diagnostics_dict(self): 60 | training_diagnostics = super()._get_training_diagnostics_dict() 61 | training_diagnostics['model_trainer'] = self.model_trainer.get_diagnostics() 62 | return training_diagnostics 63 | 64 | def _get_snapshot(self): 65 | snapshot = super()._get_snapshot() 66 | for k, v in self.model_trainer.get_snapshot().items(): 67 | snapshot['model/' + k] = v 68 | return snapshot 69 | 70 | def _end_epochs(self, epoch): 71 | super()._end_epochs(epoch) 72 | self.model_trainer.end_epoch(epoch) 73 | 74 | def _train(self): 75 | self.training_mode(False) 76 | 77 | if self.min_num_steps_before_training > 0: 78 | for _ in range(self.min_num_steps_before_training): 79 | s, a, r, d, ns, info = self.expl_data_collector.collect_one_step( 80 | self.max_path_length, 81 | discard_incomplete_paths=False, 82 | initial_expl=True, 83 | ) 84 | 85 | self.replay_buffer.add_sample(s, a, r, d, ns, env_info=info) 86 | self.expl_data_collector.end_epoch(-1) 87 | 88 | gt.stamp('initial exploration', unique=False) 89 | 90 | num_trains_per_expl_step = self.num_trains_per_train_loop // self.num_expl_steps_per_train_loop 91 | if self.num_model_trains_per_train_loop == 0: 
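            # If no model updates are requested per train loop, periodic model
            # retraining is skipped entirely (model_train_freq stays None).
            # Otherwise, model_train_freq = num_expl_steps_per_train_loop //
            # num_model_trains_per_train_loop; e.g. with 1000 exploration steps
            # and 4 model trains per loop (illustrative numbers only), the
            # dynamics model is refit every 250 environment steps in the loop below.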
92 | model_train_freq = None 93 | else: 94 | model_train_freq = self.num_expl_steps_per_train_loop // self.num_model_trains_per_train_loop 95 | 96 | if self.replay_buffer.num_steps_can_sample() > 0 and model_train_freq is not None: 97 | self.model_trainer.train_from_buffer(self.replay_buffer, max_grad_steps=100000) 98 | gt.stamp('model training', unique=False) 99 | 100 | for epoch in gt.timed_for( 101 | range(self._start_epoch, self.num_epochs), 102 | save_itrs=True, 103 | ): 104 | if self.num_eval_steps_per_epoch > 0: 105 | self.eval_data_collector.collect_new_paths( 106 | self.max_path_length, 107 | self.num_eval_steps_per_epoch, 108 | discard_incomplete_paths=True, 109 | ) 110 | gt.stamp('evaluation sampling', unique=False) 111 | 112 | for _ in range(self.num_train_loops_per_epoch): 113 | for t in range(self.num_expl_steps_per_train_loop): 114 | self.training_mode(True) 115 | if model_train_freq is not None and \ 116 | ((t+1) % model_train_freq == 0 or \ 117 | (epoch == 0 and t == 0 and \ 118 | self.replay_buffer.num_steps_can_sample() > 0)): 119 | self.model_trainer.train_from_buffer(self.replay_buffer) 120 | gt.stamp('model training', unique=False) 121 | 122 | if (epoch == 0 and t == 0) and self.initial_training_steps > 0: 123 | for _ in range(self.initial_training_steps): 124 | train_data = self.replay_buffer.random_batch( 125 | self.batch_size) 126 | self.trainer.train(train_data) 127 | gt.stamp('initial policy training', unique=False) 128 | 129 | s, a, r, d, ns, info = self.expl_data_collector.collect_one_step( 130 | self.max_path_length, 131 | discard_incomplete_paths=False, 132 | ) 133 | gt.stamp('exploration sampling', unique=False) 134 | 135 | self.replay_buffer.add_sample(s, a, r, d, ns, env_info=info) 136 | gt.stamp('data storing', unique=False) 137 | 138 | for _ in range(num_trains_per_expl_step): 139 | train_data = self.replay_buffer.random_batch( 140 | self.batch_size) 141 | self.trainer.train(train_data) 142 | gt.stamp('policy training', unique=False) 143 | self.training_mode(False) 144 | 145 | self._end_epoch(epoch) 146 | -------------------------------------------------------------------------------- /lifelong_rl/core/rl_algorithms/online/online_rl_algorithm.py: -------------------------------------------------------------------------------- 1 | import gtimer as gt 2 | 3 | import abc 4 | 5 | from lifelong_rl.core.rl_algorithms.rl_algorithm import BaseRLAlgorithm 6 | 7 | 8 | class OnlineRLAlgorithm(BaseRLAlgorithm, metaclass=abc.ABCMeta): 9 | 10 | def __init__( 11 | self, 12 | trainer, 13 | exploration_policy, 14 | exploration_env, 15 | evaluation_env, 16 | exploration_data_collector, 17 | evaluation_data_collector, 18 | replay_buffer, 19 | batch_size, 20 | max_path_length, 21 | num_epochs, 22 | num_eval_steps_per_epoch, 23 | num_expl_steps_per_train_loop, 24 | num_trains_per_train_loop, 25 | num_train_loops_per_epoch=1, 26 | min_num_steps_before_training=0, 27 | save_snapshot_freq=100, 28 | ): 29 | super().__init__( 30 | trainer, 31 | exploration_policy, 32 | exploration_env, 33 | evaluation_env, 34 | exploration_data_collector, 35 | evaluation_data_collector, 36 | replay_buffer, 37 | save_snapshot_freq=save_snapshot_freq, 38 | ) 39 | self.batch_size = batch_size 40 | self.max_path_length = max_path_length 41 | self.num_epochs = num_epochs 42 | self.num_eval_steps_per_epoch = num_eval_steps_per_epoch 43 | self.num_trains_per_train_loop = num_trains_per_train_loop 44 | self.num_train_loops_per_epoch = num_train_loops_per_epoch 45 | self.num_expl_steps_per_train_loop = 
num_expl_steps_per_train_loop 46 | self.min_num_steps_before_training = min_num_steps_before_training 47 | 48 | assert self.num_trains_per_train_loop >= self.num_expl_steps_per_train_loop, \ 49 | 'Online training presumes num_trains_per_train_loop >= num_expl_steps_per_train_loop' 50 | 51 | def _train(self): 52 | self.training_mode(False) 53 | if self.min_num_steps_before_training > 0: 54 | for _ in range(self.min_num_steps_before_training): 55 | s, a, r, d, ns, info = self.expl_data_collector.collect_one_step( 56 | self.max_path_length, 57 | discard_incomplete_paths=False, 58 | ) 59 | 60 | self.replay_buffer.add_sample(s, a, r, d, ns, env_info=info) 61 | 62 | self.expl_data_collector.end_epoch(-1) 63 | gt.stamp('initial exploration', unique=False) 64 | 65 | num_trains_per_expl_step = self.num_trains_per_train_loop // self.num_expl_steps_per_train_loop 66 | for epoch in gt.timed_for( 67 | range(self._start_epoch, self.num_epochs), 68 | save_itrs=True, 69 | ): 70 | self.eval_data_collector.collect_new_paths( 71 | self.max_path_length, 72 | self.num_eval_steps_per_epoch, 73 | discard_incomplete_paths=True, 74 | ) 75 | gt.stamp('evaluation sampling', unique=False) 76 | 77 | for _ in range(self.num_train_loops_per_epoch): 78 | for _ in range(self.num_expl_steps_per_train_loop): 79 | s, a, r, d, ns, info = self.expl_data_collector.collect_one_step( 80 | self.max_path_length, 81 | discard_incomplete_paths=False, 82 | ) 83 | gt.stamp('exploration sampling', unique=False) 84 | 85 | self.replay_buffer.add_sample(s, a, r, d, ns, env_info=info) 86 | gt.stamp('data storing', unique=False) 87 | 88 | self.training_mode(True) 89 | for _ in range(num_trains_per_expl_step): 90 | train_data = self.replay_buffer.random_batch( 91 | self.batch_size) 92 | self.trainer.train(train_data) 93 | gt.stamp('training', unique=False) 94 | self.training_mode(False) 95 | 96 | self._end_epoch(epoch) 97 | -------------------------------------------------------------------------------- /lifelong_rl/core/rl_algorithms/torch_rl_algorithm.py: -------------------------------------------------------------------------------- 1 | from torch import nn as nn 2 | import wandb 3 | 4 | import abc 5 | from collections import OrderedDict 6 | from typing import Iterable 7 | 8 | from lifelong_rl.core.rl_algorithms.batch.batch_rl_algorithm import BatchRLAlgorithm 9 | from lifelong_rl.core.rl_algorithms.batch.mb_batch_rl_algorithm import MBBatchRLAlgorithm 10 | from lifelong_rl.core.rl_algorithms.offline.offline_rl_algorithm import OfflineRLAlgorithm 11 | from lifelong_rl.core.rl_algorithms.offline.mb_offline_rl_algorithm import OfflineMBRLAlgorithm 12 | from lifelong_rl.core.rl_algorithms.online.online_rl_algorithm import OnlineRLAlgorithm 13 | from lifelong_rl.core.rl_algorithms.online.mbrl_algorithm import MBRLAlgorithm 14 | from lifelong_rl.trainers.trainer import Trainer 15 | from lifelong_rl.torch.pytorch_util import np_to_pytorch_batch 16 | 17 | 18 | class TorchOnlineRLAlgorithm(OnlineRLAlgorithm): 19 | def configure_logging(self): 20 | for net in set(self.trainer.networks): 21 | wandb.watch(net) 22 | 23 | def to(self, device): 24 | for net in self.trainer.networks: 25 | net.to(device) 26 | 27 | def training_mode(self, mode): 28 | for net in self.trainer.networks: 29 | net.train(mode) 30 | 31 | 32 | class TorchBatchRLAlgorithm(BatchRLAlgorithm): 33 | def configure_logging(self): 34 | for net in set(self.trainer.networks): 35 | wandb.watch(net) 36 | 37 | def to(self, device): 38 | for net in self.trainer.networks: 39 | net.to(device) 40 
| 41 | def training_mode(self, mode): 42 | for net in self.trainer.networks: 43 | net.train(mode) 44 | 45 | 46 | class TorchMBRLAlgorithm(MBRLAlgorithm): 47 | def configure_logging(self): 48 | for net in set(self.trainer.networks + self.model_trainer.networks): 49 | wandb.watch(net) 50 | 51 | def to(self, device): 52 | for net in self.trainer.networks: 53 | net.to(device) 54 | for net in self.model_trainer.networks: 55 | net.to(device) 56 | 57 | def training_mode(self, mode): 58 | for net in self.trainer.networks: 59 | net.train(mode) 60 | for net in self.model_trainer.networks: 61 | net.train(mode) 62 | 63 | 64 | class TorchMBBatchRLAlgorithm(MBBatchRLAlgorithm): 65 | def configure_logging(self): 66 | for net in set(self.trainer.networks + self.model_trainer.networks): 67 | wandb.watch(net) 68 | 69 | def to(self, device): 70 | for net in self.trainer.networks: 71 | net.to(device) 72 | for net in self.model_trainer.networks: 73 | net.to(device) 74 | 75 | def training_mode(self, mode): 76 | for net in self.trainer.networks: 77 | net.train(mode) 78 | for net in self.model_trainer.networks: 79 | net.train(mode) 80 | 81 | 82 | class TorchOfflineRLAlgorithm(OfflineRLAlgorithm): 83 | def configure_logging(self): 84 | for net in set(self.trainer.networks): 85 | wandb.watch(net) 86 | 87 | def to(self, device): 88 | for net in self.trainer.networks: 89 | net.to(device) 90 | 91 | def training_mode(self, mode): 92 | for net in self.trainer.networks: 93 | net.train(mode) 94 | 95 | 96 | class TorchOfflineMBRLAlgorithm(OfflineMBRLAlgorithm): 97 | def configure_logging(self): 98 | for net in set(self.trainer.networks + self.model_trainer.networks): 99 | wandb.watch(net) 100 | 101 | def to(self, device): 102 | for net in self.trainer.networks: 103 | net.to(device) 104 | for net in self.model_trainer.networks: 105 | net.to(device) 106 | 107 | def training_mode(self, mode): 108 | for net in self.trainer.networks: 109 | net.train(mode) 110 | for net in self.model_trainer.networks: 111 | net.train(mode) 112 | 113 | 114 | class TorchTrainer(Trainer, metaclass=abc.ABCMeta): 115 | def __init__(self): 116 | self._num_train_steps = 0 117 | 118 | def train(self, np_batch): 119 | self._num_train_steps += 1 120 | batch = np_to_pytorch_batch(np_batch) 121 | self.train_from_torch(batch) 122 | 123 | def get_diagnostics(self): 124 | return OrderedDict([ 125 | ('num train calls', self._num_train_steps), 126 | ]) 127 | 128 | def train_from_torch(self, batch): 129 | pass 130 | 131 | @property 132 | def networks(self) -> Iterable[nn.Module]: 133 | pass 134 | -------------------------------------------------------------------------------- /lifelong_rl/data_management/replay_buffers/env_replay_buffer.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.spaces import Discrete 3 | 4 | from lifelong_rl.data_management.replay_buffers.simple_replay_buffer import SimpleReplayBuffer 5 | from lifelong_rl.envs.env_utils import get_dim 6 | import numpy as np 7 | 8 | 9 | class EnvReplayBuffer(SimpleReplayBuffer): 10 | 11 | def __init__( 12 | self, 13 | max_replay_buffer_size, 14 | env, 15 | env_info_sizes=None, 16 | ): 17 | """ 18 | :param max_replay_buffer_size: 19 | :param env: 20 | """ 21 | self.env = env 22 | self._ob_space = env.observation_space 23 | self._action_space = env.action_space 24 | self._meta_infos = [] 25 | 26 | if env_info_sizes is None: 27 | if hasattr(env, 'info_sizes'): 28 | env_info_sizes = env.info_sizes 29 | else: 30 | env_info_sizes = dict() 31 | 32 | 
if isinstance(self._ob_space, gym.spaces.Box): 33 | self._ob_shape = self._ob_space.shape 34 | else: 35 | self._ob_shape = None 36 | 37 | super().__init__( 38 | max_replay_buffer_size=max_replay_buffer_size, 39 | observation_dim=get_dim(self._ob_space), 40 | action_dim=get_dim(self._action_space), 41 | env_info_sizes=env_info_sizes 42 | ) 43 | 44 | def obs_preproc(self, obs): 45 | if len(obs.shape) > len(self._ob_space.shape): 46 | obs = np.reshape(obs, (obs.shape[0], self._observation_dim)) 47 | else: 48 | obs = np.reshape(obs, (self._observation_dim,)) 49 | return obs 50 | 51 | def obs_postproc(self, obs): 52 | if self._ob_shape is None: 53 | return obs 54 | if len(obs.shape) > 1: 55 | obs = np.reshape(obs, (obs.shape[0], *self._ob_shape)) 56 | else: 57 | obs = np.reshape(obs, self._ob_shape) 58 | return obs 59 | 60 | def add_sample(self, observation, action, reward, terminal, 61 | next_observation, env_info=None, **kwargs): 62 | if hasattr(self.env, 'get_meta_infos'): 63 | self._meta_infos.append(self.env.get_meta_infos()) 64 | if env_info is None: 65 | env_info = dict() 66 | if isinstance(self._action_space, Discrete): 67 | new_action = np.zeros(self._action_dim) 68 | new_action[action] = 1 69 | else: 70 | new_action = action 71 | return super().add_sample( 72 | observation=observation, 73 | action=new_action, 74 | reward=reward, 75 | next_observation=next_observation, 76 | terminal=terminal, 77 | env_info=env_info, 78 | **kwargs 79 | ) 80 | 81 | def get_snapshot(self): 82 | snapshot = super().get_snapshot() 83 | snapshot['meta_infos'] = self._meta_infos 84 | return snapshot 85 | -------------------------------------------------------------------------------- /lifelong_rl/data_management/replay_buffers/mujoco_replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import copy 4 | 5 | from lifelong_rl.data_management.replay_buffers.env_replay_buffer import EnvReplayBuffer 6 | from lifelong_rl.util.visualize_mujoco import visualize_mujoco_from_states 7 | 8 | 9 | class MujocoReplayBuffer(EnvReplayBuffer): 10 | 11 | def __init__( 12 | self, 13 | max_replay_buffer_size, 14 | env, 15 | env_info_sizes=None 16 | ): 17 | """ 18 | :param max_replay_buffer_size: 19 | :param env: 20 | """ 21 | super().__init__( 22 | max_replay_buffer_size=max_replay_buffer_size, 23 | env=env, 24 | env_info_sizes=env_info_sizes 25 | ) 26 | 27 | self.body_xpos_shape = env.sim.data.body_xpos.shape 28 | self._body_xpos = np.zeros((max_replay_buffer_size, *self.body_xpos_shape)) 29 | 30 | self.qpos_shape = env.sim.data.qpos.shape 31 | self._qpos = np.zeros((max_replay_buffer_size, *self.qpos_shape)) 32 | 33 | self.env_states = [] 34 | 35 | def add_sample(self, observation, action, reward, terminal, 36 | next_observation, **kwargs): 37 | self._body_xpos[self._top] = self.env.sim.data.body_xpos 38 | self._qpos[self._top] = self.env.sim.data.qpos 39 | if len(self.env_states) >= self.max_replay_buffer_size(): 40 | self.env_states[self._top] = self.env.sim.get_state() 41 | else: 42 | self.env_states.append(copy.deepcopy(self.env.sim.get_state())) 43 | return super().add_sample( 44 | observation=observation, 45 | action=action, 46 | reward=reward, 47 | next_observation=next_observation, 48 | terminal=terminal, 49 | **kwargs 50 | ) 51 | 52 | def get_snapshot(self): 53 | snapshot = super().get_snapshot() 54 | snapshot.update(dict( 55 | body_xpos=self._body_xpos[:self._size], 56 | qpos=self._qpos[:self._size], 57 | env_states=self.env_states[:self._size], 
58 | )) 59 | return snapshot 60 | 61 | def visualize_agent(self, start_idx, end_idx): 62 | visualize_mujoco_from_states(self.env, self.env_states[start_idx:end_idx]) 63 | -------------------------------------------------------------------------------- /lifelong_rl/data_management/replay_buffers/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class ReplayBuffer(object, metaclass=abc.ABCMeta): 5 | """ 6 | A class used to save and replay data. 7 | """ 8 | 9 | @abc.abstractmethod 10 | def add_sample(self, observation, action, reward, next_observation, 11 | terminal, **kwargs): 12 | """ 13 | Add a transition tuple. 14 | """ 15 | pass 16 | 17 | @abc.abstractmethod 18 | def terminate_episode(self): 19 | """ 20 | Let the replay buffer know that the episode has terminated in case some 21 | special book-keeping has to happen. 22 | :return: 23 | """ 24 | pass 25 | 26 | @abc.abstractmethod 27 | def num_steps_can_sample(self, **kwargs): 28 | """ 29 | :return: # of unique items that can be sampled. 30 | """ 31 | pass 32 | 33 | def add_path(self, path): 34 | """ 35 | Add a path to the replay buffer. 36 | 37 | This default implementation naively goes through every step, but you 38 | may want to optimize this. 39 | 40 | NOTE: You should NOT call "terminate_episode" after calling add_path. 41 | It's assumed that this function handles the episode termination. 42 | 43 | :param path: Dict like one outputted by lifelong_rl.samplers.util.rollout 44 | """ 45 | for i, ( 46 | obs, 47 | action, 48 | reward, 49 | next_obs, 50 | terminal, 51 | agent_info, 52 | env_info 53 | ) in enumerate(zip( 54 | path["observations"], 55 | path["actions"], 56 | path["rewards"], 57 | path["next_observations"], 58 | path["terminals"], 59 | path["agent_infos"], 60 | path["env_infos"], 61 | )): 62 | self.add_sample( 63 | observation=obs, 64 | action=action, 65 | reward=reward, 66 | next_observation=next_obs, 67 | terminal=terminal, 68 | agent_info=agent_info, 69 | env_info=env_info, 70 | ) 71 | self.terminate_episode() 72 | 73 | def add_paths(self, paths): 74 | for path in paths: 75 | self.add_path(path) 76 | 77 | @abc.abstractmethod 78 | def random_batch(self, batch_size): 79 | """ 80 | Return a batch of size `batch_size`. 81 | :param batch_size: 82 | :return: 83 | """ 84 | pass 85 | 86 | def get_diagnostics(self): 87 | return {} 88 | 89 | def get_snapshot(self): 90 | return {} 91 | 92 | def end_epoch(self, epoch): 93 | return 94 | 95 | -------------------------------------------------------------------------------- /lifelong_rl/data_management/utils/path_builder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class PathBuilder(dict): 5 | """ 6 | Usage: 7 | ``` 8 | path_builder = PathBuilder() 9 | path.add_sample( 10 | observations=1, 11 | actions=2, 12 | next_observations=3, 13 | ... 14 | ) 15 | path.add_sample( 16 | observations=4, 17 | actions=5, 18 | next_observations=6, 19 | ... 20 | ) 21 | 22 | path = path_builder.get_all_stacked() 23 | 24 | path['observations'] 25 | # output: [1, 4] 26 | path['actions'] 27 | # output: [2, 5] 28 | ``` 29 | 30 | Note that the key should be "actions" and not "action" since the 31 | resulting dictionary will have those keys. 
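    The method actually defined below is ``add_all`` (called on the builder
    itself), so a runnable version of the example above is:

    ```
    path_builder = PathBuilder()
    path_builder.add_all(observations=1, actions=2, next_observations=3)
    path_builder.add_all(observations=4, actions=5, next_observations=6)
    path = path_builder.get_all_stacked()
    path['observations']  # -> array([1, 4])
    ```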
32 | """ 33 | 34 | def __init__(self): 35 | super().__init__() 36 | self._path_length = 0 37 | 38 | def add_all(self, **key_to_value): 39 | for k, v in key_to_value.items(): 40 | if k not in self: 41 | self[k] = [v] 42 | else: 43 | self[k].append(v) 44 | self._path_length += 1 45 | 46 | def get_all_stacked(self): 47 | output_dict = dict() 48 | for k, v in self.items(): 49 | output_dict[k] = stack_list(v) 50 | return output_dict 51 | 52 | def __len__(self): 53 | return self._path_length 54 | 55 | 56 | def stack_list(lst): 57 | if isinstance(lst[0], dict): 58 | return lst 59 | else: 60 | return np.array(lst) 61 | -------------------------------------------------------------------------------- /lifelong_rl/envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/envs/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/envs/env_processor.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from lifelong_rl.envs.wrappers import NormalizedBoxEnv, NonTerminatingEnv, SwapColorEnv 4 | 5 | 6 | gym.logger.set_level(40) # stop annoying Box bound precision error 7 | 8 | 9 | def make_env(env_name, terminates=True, **kwargs): 10 | env = None 11 | base_env = None 12 | env_infos = dict() 13 | 14 | """ 15 | Episodic reinforcement learning 16 | """ 17 | if env_name == 'HalfCheetah': 18 | from gym.envs.mujoco import HalfCheetahEnv 19 | base_env = HalfCheetahEnv 20 | env_infos['mujoco'] = True 21 | elif env_name == 'Hopper': 22 | from gym.envs.mujoco import HopperEnv 23 | base_env = HopperEnv 24 | env_infos['mujoco'] = True 25 | elif env_name == 'InvertedPendulum': 26 | from gym.envs.mujoco import InvertedPendulumEnv 27 | base_env = InvertedPendulumEnv 28 | env_infos['mujoco'] = True 29 | elif env_name == 'Humanoid': 30 | from lifelong_rl.envs.environments.humanoid_env import HumanoidTruncatedObsEnv as HumanoidEnv 31 | from gym.envs.mujoco import HumanoidEnv 32 | base_env = HumanoidEnv 33 | env_infos['mujoco'] = True 34 | 35 | """ 36 | Lifelong reinforcement learning 37 | """ 38 | if env_name == 'LifelongHopper': 39 | from lifelong_rl.envs.environments.hopper_env import LifelongHopperEnv 40 | base_env = LifelongHopperEnv 41 | env_infos['mujoco'] = True 42 | elif env_name == 'LifelongAnt': 43 | from lifelong_rl.envs.environments.ant_env import LifelongAntEnv 44 | base_env = LifelongAntEnv 45 | env_infos['mujoco'] = True 46 | elif env_name == 'Gridworld': 47 | from lifelong_rl.envs.environments.continuous_gridworld.cont_gridworld import ContinuousGridworld 48 | base_env = ContinuousGridworld 49 | env_infos['mujoco'] = False 50 | 51 | if env is None and base_env is None: 52 | raise NameError('env_name not recognized') 53 | 54 | if env is None: 55 | env = base_env(**kwargs) 56 | 57 | if not isinstance(env.action_space, gym.spaces.Discrete): 58 | env = NormalizedBoxEnv(env) 59 | 60 | if not terminates: 61 | env = NonTerminatingEnv(env) 62 | 63 | return env, env_infos 64 | -------------------------------------------------------------------------------- /lifelong_rl/envs/env_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from gym.spaces import Box, Discrete, Tuple 4 | 5 | ENV_ASSET_DIR = os.path.join(os.path.dirname(__file__), 'assets') 6 | 7 | 8 | def get_asset_full_path(file_name): 9 | return 
os.path.join(ENV_ASSET_DIR, file_name) 10 | 11 | 12 | def get_dim(space): 13 | if isinstance(space, Box): 14 | return space.low.size 15 | elif isinstance(space, Discrete): 16 | return space.n 17 | elif isinstance(space, Tuple): 18 | return sum(get_dim(subspace) for subspace in space.spaces) 19 | elif hasattr(space, 'flat_dim'): 20 | return space.flat_dim 21 | else: 22 | raise TypeError("Unknown space: {}".format(space)) 23 | 24 | 25 | def mode(env, mode_type): 26 | try: 27 | getattr(env, mode_type)() 28 | except AttributeError: 29 | pass 30 | -------------------------------------------------------------------------------- /lifelong_rl/envs/environments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/envs/environments/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/envs/environments/assets/ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 82 | -------------------------------------------------------------------------------- /lifelong_rl/envs/environments/assets/inverted_pendulum.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 28 | -------------------------------------------------------------------------------- /lifelong_rl/envs/environments/continuous_gridworld/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/envs/environments/continuous_gridworld/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/envs/environments/continuous_gridworld/grids/blank.txt: -------------------------------------------------------------------------------- 1 | ########## 2 | # # 3 | # # 4 | # # 5 | # # 6 | # # 7 | # # 8 | ########## 9 | -------------------------------------------------------------------------------- /lifelong_rl/envs/environments/continuous_gridworld/grids/minecraft/world_1.txt: -------------------------------------------------------------------------------- 1 | ############# 2 | # # 3 | # # 4 | # w r # 5 | # # 6 | # # 7 | # # 8 | # # 9 | # C i # 10 | # # 11 | # # 12 | # # 13 | ############# 14 | -------------------------------------------------------------------------------- /lifelong_rl/envs/environments/continuous_gridworld/grids/one_goal.txt: -------------------------------------------------------------------------------- 1 | ################## 2 | # G# 3 | # # 4 | # # 5 | # # 6 | # # 7 | # # 8 | # # 9 | # # 10 | # # 11 | # # 12 | # # 13 | # # 14 | # # 15 | # # 16 | # # 17 | # # 18 | ################## 19 | -------------------------------------------------------------------------------- /lifelong_rl/envs/environments/continuous_gridworld/grids/volcano_1.txt: -------------------------------------------------------------------------------- 1 | ############# 2 | #LLLLLGLLLLL# 3 | #LLLL LLLL# 4 | #LLLL H LLLL# 5 | #LLLL LLLL# 6 | #LLLLL LLLLL# 7 | #LLLLL LLLLL# 8 | #LLLLL LLLLL# 9 | #LLLLL LLLLL# 10 | #LLLLL LLLLL# 11 | #LLLLL LLLLL# 12 | #LLLLL LLLLL# 13 | ############# 14 | -------------------------------------------------------------------------------- /lifelong_rl/envs/environments/continuous_gridworld/grids/volcano_2.txt: 
-------------------------------------------------------------------------------- 1 | ############# 2 | #LLLLL LLLLL# 3 | #LLLLL LLLLL# 4 | #LLLLL LLLLL# 5 | #LLLLL LLLLL# 6 | #LLLLL LLLLL# 7 | #LLLLL LLLLL# 8 | #LLLLL LLLLL# 9 | #LLLL LLLL# 10 | #LLLL H LLLL# 11 | #LLLL LLLL# 12 | #LLLLLGLLLLL# 13 | ############# 14 | -------------------------------------------------------------------------------- /lifelong_rl/envs/environments/hopper_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import mujoco_env 3 | from gym import utils 4 | 5 | DEFAULT_VEL = 3 6 | 7 | DEFAULT_CAMERA_CONFIG = { 8 | 'trackbodyid': 2, 9 | 'distance': 3.0, 10 | 'lookat': np.array((0.0, 0.0, 1.15)), 11 | 'elevation': -20.0, 12 | } 13 | 14 | 15 | class LifelongHopperEnv(mujoco_env.MujocoEnv, utils.EzPickle): 16 | 17 | """ 18 | Described in Lu et al. 2020. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | xml_file='hopper.xml', 24 | forward_reward_weight=1.0, 25 | ctrl_cost_weight=1e-3, 26 | healthy_reward=1.0, 27 | terminate_when_unhealthy=True, 28 | healthy_state_range=(-100.0, 100.0), 29 | healthy_z_range=(0.7, float('inf')), 30 | healthy_angle_range=(-0.2, 0.2), 31 | reset_noise_scale=5e-3, 32 | exclude_current_positions_from_observation=True, 33 | target_vel=None, 34 | target_vel_in_obs=False, 35 | rgb_rendering_tracking=True, 36 | ): 37 | utils.EzPickle.__init__(**locals()) 38 | 39 | self._forward_reward_weight = forward_reward_weight 40 | 41 | self._ctrl_cost_weight = ctrl_cost_weight 42 | 43 | self._healthy_reward = healthy_reward 44 | self._terminate_when_unhealthy = terminate_when_unhealthy 45 | 46 | self._healthy_state_range = healthy_state_range 47 | self._healthy_z_range = healthy_z_range 48 | self._healthy_angle_range = healthy_angle_range 49 | 50 | self._reset_noise_scale = reset_noise_scale 51 | 52 | self._exclude_current_positions_from_observation = ( 53 | exclude_current_positions_from_observation) 54 | 55 | self._target_vel = target_vel 56 | self._target_vel_in_obs = target_vel_in_obs 57 | self._target_vel_reward_weight = 1 58 | 59 | mujoco_env.MujocoEnv.__init__(self, xml_file, 4) 60 | 61 | """ 62 | Required for compatibility with lifelong_rl lifelong environment setting 63 | """ 64 | 65 | def get_env_state(self): 66 | return self.sim.get_state() 67 | 68 | def set_env_state(self, state): 69 | self.sim.set_state(state) 70 | 71 | """ 72 | ================================================================= 73 | """ 74 | 75 | @property 76 | def healthy_reward(self): 77 | return float( 78 | self.is_healthy 79 | or self._terminate_when_unhealthy 80 | ) * self._healthy_reward 81 | 82 | def control_cost(self, action): 83 | control_cost = self._ctrl_cost_weight * np.sum(np.square(action)) 84 | return control_cost 85 | 86 | def set_target_vel(self, vel): 87 | self._target_vel = vel 88 | 89 | def get_target_vel(self): 90 | if self._target_vel is not None: 91 | return self._target_vel 92 | else: 93 | return DEFAULT_VEL 94 | 95 | @property 96 | def is_healthy(self): 97 | z, angle = self.sim.data.qpos[1:3] 98 | state = self.state_vector()[2:] 99 | 100 | min_state, max_state = self._healthy_state_range 101 | min_z, max_z = self._healthy_z_range 102 | min_angle, max_angle = self._healthy_angle_range 103 | 104 | healthy_state = np.all( 105 | np.logical_and(min_state < state, state < max_state)) 106 | healthy_z = min_z < z < max_z 107 | healthy_angle = min_angle < angle < max_angle 108 | 109 | is_healthy = all((healthy_state, healthy_z, 
healthy_angle)) 110 | 111 | return is_healthy 112 | 113 | @property 114 | def done(self): 115 | done = (not self.is_healthy 116 | if self._terminate_when_unhealthy 117 | else False) 118 | return done 119 | 120 | def _get_obs(self): 121 | position = self.sim.data.qpos.flat.copy() 122 | velocity = np.clip( 123 | self.sim.data.qvel.flat.copy(), -10., 10.) 124 | 125 | if self._exclude_current_positions_from_observation: 126 | position = position[1:] 127 | 128 | if self._target_vel is not None and self._target_vel_in_obs: 129 | target_vel = np.array([self.get_target_vel()]) 130 | else: 131 | target_vel = [] 132 | 133 | observation = np.concatenate((position, velocity, target_vel)).ravel() 134 | return observation 135 | 136 | def get_obs(self): 137 | return self._get_obs() 138 | 139 | def step(self, action): 140 | x_position_before = self.sim.data.qpos[0] 141 | self.do_simulation(action, self.frame_skip) 142 | x_position_after = self.sim.data.qpos[0] 143 | x_velocity = ((x_position_after - x_position_before) 144 | / self.dt) 145 | 146 | z, z_des = self.sim.data.qpos[1], 1.8 147 | height_cost = 5 * ((z - z_des) ** 2) 148 | vel_cost = abs(x_velocity-self.get_target_vel()) 149 | ctrl_cost = .1 * np.sum(np.square(action)) 150 | 151 | rewards = abs(self.get_target_vel()) 152 | costs = height_cost + vel_cost + ctrl_cost 153 | 154 | reward = rewards - costs 155 | info = { 156 | 'x_velocity': x_velocity, 157 | 'z': z, 158 | } 159 | 160 | return self._get_obs(), reward, False, info 161 | 162 | def reset_model(self): 163 | noise_low = -self._reset_noise_scale 164 | noise_high = self._reset_noise_scale 165 | 166 | qpos = self.init_qpos + self.np_random.uniform( 167 | low=noise_low, high=noise_high, size=self.model.nq) 168 | qvel = self.init_qvel + self.np_random.uniform( 169 | low=noise_low, high=noise_high, size=self.model.nv) 170 | 171 | self.set_state(qpos, qvel) 172 | 173 | observation = self._get_obs() 174 | return observation 175 | 176 | def viewer_setup(self): 177 | for key, value in DEFAULT_CAMERA_CONFIG.items(): 178 | if isinstance(value, np.ndarray): 179 | getattr(self.viewer.cam, key)[:] = value 180 | else: 181 | setattr(self.viewer.cam, key, value) 182 | 183 | -------------------------------------------------------------------------------- /lifelong_rl/envs/environments/humanoid_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import mujoco_env 3 | from gym import utils 4 | 5 | 6 | def mass_center(model, sim): 7 | mass = np.expand_dims(model.body_mass, 1) 8 | xpos = sim.data.xipos 9 | return (np.sum(mass * xpos, 0) / np.sum(mass))[0] 10 | 11 | 12 | class HumanoidTruncatedObsEnv(mujoco_env.MujocoEnv, utils.EzPickle): 13 | 14 | """ 15 | COM inertia (cinert), COM velocity (cvel), actuator forces (qfrc_actuator), 16 | and external forces (cfrc_ext) are removed from the observation. 
17 | Otherwise identical to Humanoid-v2 from 18 | https://github.com/openai/gym/blob/master/gym/envs/mujoco/humanoid.py 19 | """ 20 | 21 | def __init__(self): 22 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5) 23 | utils.EzPickle.__init__(self) 24 | 25 | def _get_obs(self): 26 | data = self.sim.data 27 | return np.concatenate([data.qpos.flat[2:], 28 | data.qvel.flat, 29 | # data.cinert.flat, 30 | # data.cvel.flat, 31 | # data.qfrc_actuator.flat, 32 | # data.cfrc_ext.flat 33 | ]) 34 | 35 | def step(self, a): 36 | pos_before = mass_center(self.model, self.sim) 37 | self.do_simulation(a, self.frame_skip) 38 | pos_after = mass_center(self.model, self.sim) 39 | alive_bonus = 5.0 40 | data = self.sim.data 41 | lin_vel_cost = 0.25 * (pos_after - pos_before) / self.model.opt.timestep 42 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 43 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 44 | quad_impact_cost = min(quad_impact_cost, 10) 45 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 46 | qpos = self.sim.data.qpos 47 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 48 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 49 | 50 | def reset_model(self): 51 | c = 0.01 52 | self.set_state( 53 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 54 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 55 | ) 56 | return self._get_obs() 57 | 58 | def viewer_setup(self): 59 | self.viewer.cam.trackbodyid = 1 60 | self.viewer.cam.distance = self.model.stat.extent * 1.0 61 | self.viewer.cam.lookat[2] = 2.0 62 | self.viewer.cam.elevation = -20 63 | -------------------------------------------------------------------------------- /lifelong_rl/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/models/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/models/dynamics_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/models/dynamics_models/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/models/dynamics_models/probabilistic_ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | import torch.nn.functional as F 6 | 7 | import lifelong_rl.torch.pytorch_util as ptu 8 | from lifelong_rl.models.networks import ParallelizedEnsemble 9 | 10 | 11 | class ProbabilisticEnsemble(ParallelizedEnsemble): 12 | 13 | """ 14 | Probabilistic ensemble (Chua et al. 2018). 15 | Implementation is parallelized such that every model uses one forward call. 16 | Each member predicts the mean and variance of the next state. 17 | Sampling is done either uniformly or via trajectory sampling. 
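    Concretely, each member outputs a mean and a log-std over the target vector
    (reward, done, next_obs - obs), so the raw network output has size
    2 * (obs_dim + 2) and is split in half inside ``forward``. A construction
    sketch (hyperparameter values here are illustrative only):

    ```
    model = ProbabilisticEnsemble(
        ensemble_size=7,
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[256, 256],
    )
    # Input is the concatenated (obs, action); with deterministic=False a
    # stochastic sample of (reward, done, delta_obs) is returned per member.
    prediction = model(torch.cat([obs, action], dim=-1))
    ```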
18 | """ 19 | 20 | def __init__( 21 | self, 22 | ensemble_size, # Number of members in ensemble 23 | obs_dim, # Observation dim of environment 24 | action_dim, # Action dim of environment 25 | hidden_sizes, # Hidden sizes for each model 26 | spectral_norm=False, # Apply spectral norm to every hidden layer 27 | **kwargs 28 | ): 29 | super().__init__( 30 | ensemble_size=ensemble_size, 31 | hidden_sizes=hidden_sizes, 32 | input_size=obs_dim + action_dim, 33 | output_size=2*(obs_dim + 2), # We predict (reward, done, next_state - state) 34 | hidden_activation=torch.tanh, 35 | spectral_norm=spectral_norm, 36 | **kwargs 37 | ) 38 | 39 | self.obs_dim, self.action_dim = obs_dim, action_dim 40 | self.output_size = obs_dim + 2 41 | 42 | # Note: we do not learn the logstd here, but some implementations do 43 | self.max_logstd = nn.Parameter( 44 | ptu.ones(obs_dim + 2), requires_grad=False) 45 | self.min_logstd = nn.Parameter( 46 | -ptu.ones(obs_dim + 2) * 5, requires_grad=False) 47 | 48 | def forward(self, input, deterministic=False, return_dist=False): 49 | output = super().forward(input) 50 | mean, logstd = torch.chunk(output, 2, dim=-1) 51 | 52 | # Variance clamping to prevent poor numerical predictions 53 | logstd = self.max_logstd - F.softplus(self.max_logstd - logstd) 54 | logstd = self.min_logstd + F.softplus(logstd - self.min_logstd) 55 | 56 | if deterministic: 57 | return mean, logstd if return_dist else mean 58 | 59 | std = torch.exp(logstd) 60 | eps = ptu.randn(std.shape) 61 | samples = mean + std * eps 62 | 63 | if return_dist: 64 | return samples, mean, logstd 65 | else: 66 | return samples 67 | 68 | def get_loss(self, x, y, split_by_model=False, return_l2_error=False): 69 | # Note: we assume y here already accounts for the delta of the next state 70 | 71 | mean, logstd = self.forward(x, deterministic=True, return_dist=True) 72 | if len(y.shape) < 3: 73 | y = y.unsqueeze(0).repeat(self.ensemble_size, 1, 1) 74 | 75 | # Maximize log-probability of transitions 76 | inv_var = torch.exp(-2 * logstd) 77 | sq_l2_error = (mean - y)**2 78 | if return_l2_error: 79 | l2_error = torch.sqrt(sq_l2_error).mean(dim=-1).mean(dim=-1) 80 | 81 | loss = (sq_l2_error * inv_var + 2 * logstd).sum(dim=-1).mean(dim=-1) 82 | 83 | if split_by_model: 84 | losses = [loss[i] for i in range(self.ensemble_size)] 85 | if return_l2_error: 86 | l2_errors = [l2_error[i] for i in range(self.ensemble_size)] 87 | return losses, l2_errors 88 | else: 89 | return losses 90 | else: 91 | if return_l2_error: 92 | return loss.mean(), l2_error.mean() 93 | else: 94 | return loss.mean() 95 | 96 | def sample_with_disagreement(self, input, return_dist=False, disagreement_type='mean'): 97 | preds, mean, logstd = self.forward(input, deterministic=False, return_dist=True) 98 | 99 | # Standard uniformly from the ensemble 100 | inds = torch.randint(0, preds.shape[0], input.shape[:-1]) 101 | 102 | # Ensure we don't use the same member to estimate disagreement 103 | inds_b = torch.randint(0, mean.shape[0], input.shape[:-1]) 104 | inds_b[inds == inds_b] = torch.fmod(inds_b[inds == inds_b] + 1, mean.shape[0]) 105 | 106 | # Repeat for multiplication 107 | inds = inds.unsqueeze(dim=-1).to(device=ptu.device) 108 | inds = inds.repeat(1, preds.shape[2]) 109 | inds_b = inds_b.unsqueeze(dim=-1).to(device=ptu.device) 110 | inds_b = inds_b.repeat(1, preds.shape[2]) 111 | 112 | # Uniformly sample from ensemble 113 | samples = (inds == 0).float() * preds[0] 114 | for i in range(1, preds.shape[0]): 115 | samples += (inds == i).float() * preds[i] 116 | 117 | if 
disagreement_type == 'mean': 118 | # Disagreement = mean squared difference in mean predictions (Kidambi et al. 2020) 119 | means_a = (inds == 0).float() * mean[0] 120 | means_b = (inds_b == 0).float() * mean[0] 121 | for i in range(1, preds.shape[0]): 122 | means_a += (inds == i).float() * mean[i] 123 | means_b += (inds_b == i).float() * mean[i] 124 | 125 | disagreements = torch.mean((means_a - means_b) ** 2, dim=-1, keepdim=True) 126 | 127 | elif disagreement_type == 'var': 128 | # Disagreement = max Frobenius norm of covariance matrix (Yu et al. 2020) 129 | vars = (2 * logstd).exp() 130 | frobenius = torch.sqrt(vars.sum(dim=-1)) 131 | disagreements, *_ = frobenius.max(dim=0) 132 | disagreements = disagreements.reshape(-1, 1) 133 | 134 | else: 135 | raise NotImplementedError 136 | 137 | if return_dist: 138 | return samples, disagreements, mean, logstd 139 | else: 140 | return samples, disagreements 141 | -------------------------------------------------------------------------------- /lifelong_rl/optimizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/optimizers/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/optimizers/optimizer.py: -------------------------------------------------------------------------------- 1 | 2 | class Optimizer: 3 | 4 | def __init__(self, sol_dim): 5 | self.sol_dim = sol_dim 6 | 7 | def optimize(self, *args, **kwargs): 8 | return 9 | -------------------------------------------------------------------------------- /lifelong_rl/optimizers/random_shooting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/optimizers/random_shooting/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/optimizers/random_shooting/cem.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from lifelong_rl.optimizers.random_shooting.rs_optimizer import RSOptimizer 4 | 5 | 6 | class CEMOptimizer(RSOptimizer): 7 | 8 | def __init__( 9 | self, 10 | sol_dim, 11 | num_iters, 12 | population_size, 13 | elites_frac, 14 | cost_function, 15 | upper_bound=1, 16 | lower_bound=-1, 17 | epsilon=1e-3, 18 | polyak=0.2, 19 | ): 20 | super().__init__( 21 | sol_dim, 22 | num_iters, 23 | population_size, 24 | cost_function, 25 | upper_bound=upper_bound, 26 | lower_bound=lower_bound, 27 | epsilon=epsilon, 28 | polyak=polyak, 29 | ) 30 | 31 | self.elites_frac = max(min(elites_frac, 1), .01) 32 | 33 | def update_sol(self, costs, samples, noise, init_mean, init_var): 34 | elites = samples[np.argsort(costs)][:int(self.elites_frac * self.population_size)] 35 | updated_mean = np.mean(elites, axis=0) 36 | updated_var = np.var(elites, axis=0) 37 | return updated_mean, updated_var 38 | -------------------------------------------------------------------------------- /lifelong_rl/optimizers/random_shooting/mppi.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from lifelong_rl.optimizers.random_shooting.rs_optimizer import RSOptimizer 4 | 5 | 6 | class MPPIOptimizer(RSOptimizer): 7 | 8 | def __init__( 9 | self, 10 | sol_dim, 11 | num_iters, 12 | population_size, 13 | temperature, 14 | 
cost_function, 15 | upper_bound=1, 16 | lower_bound=-1, 17 | epsilon=1e-3, 18 | polyak=0.2, 19 | *args, 20 | **kwargs, 21 | ): 22 | super().__init__( 23 | sol_dim, 24 | num_iters, 25 | population_size, 26 | cost_function, 27 | upper_bound=upper_bound, 28 | lower_bound=lower_bound, 29 | epsilon=epsilon, 30 | polyak=polyak, 31 | *args, 32 | **kwargs, 33 | ) 34 | 35 | self.temperature = temperature 36 | 37 | def update_sol(self, costs, samples, noise, init_mean, init_var): 38 | w = np.exp(-costs / self.temperature) 39 | w_total = np.sum(w) + 1e-6 40 | updated_mean = np.sum((w * samples.T).T, axis=0) / w_total 41 | updated_var = np.sum((w * np.square(noise).T).T, axis=0) / w_total 42 | return updated_mean, updated_var 43 | -------------------------------------------------------------------------------- /lifelong_rl/optimizers/random_shooting/rs_optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from collections import OrderedDict 4 | 5 | from lifelong_rl.optimizers.optimizer import Optimizer 6 | 7 | 8 | class RSOptimizer(Optimizer): 9 | 10 | def __init__( 11 | self, 12 | sol_dim, 13 | num_iters, 14 | population_size, 15 | cost_function, 16 | upper_bound=1, 17 | lower_bound=-1, 18 | epsilon=1e-3, 19 | polyak=0.2, 20 | min_var=0.5, 21 | learn_variance=False, 22 | filter_noise=None, 23 | ): 24 | super().__init__(sol_dim) 25 | self.num_iters = num_iters 26 | self.population_size = population_size 27 | self.cost_function = cost_function 28 | self.filter_noise = filter_noise 29 | 30 | self.upper_bound = upper_bound 31 | self.lower_bound = lower_bound 32 | self.epsilon = epsilon 33 | self.polyak = polyak 34 | self.min_var = min_var 35 | self.learn_variance = learn_variance 36 | 37 | def optimize(self, init_mean, init_var): 38 | mean, var = init_mean, init_var 39 | 40 | diagnostics = OrderedDict() 41 | for it in range(self.num_iters): 42 | noise = np.random.randn(self.population_size, self.sol_dim) * np.sqrt(var) 43 | if self.filter_noise is not None: 44 | noise = self.filter_noise(noise) 45 | 46 | samples = mean + noise 47 | samples = np.minimum(np.maximum(samples, self.lower_bound), self.upper_bound) 48 | 49 | costs = self.cost_function(samples, it) 50 | 51 | # normalization technique: puts costs in [0, 1], so softmax will be over [-1, 0] 52 | costs[costs != costs] = np.max(costs) 53 | costs = (costs - np.max(costs)) / (np.max(costs) - np.min(costs) + 1e-6) + 1 54 | 55 | updated_mean, updated_var = self.update_sol(costs, samples, noise, mean, var) 56 | 57 | mean = self.polyak * mean + (1 - self.polyak) * updated_mean 58 | if self.learn_variance: 59 | var = self.polyak * var + (1 - self.polyak) * updated_var 60 | var = np.maximum(var, self.min_var) 61 | 62 | diagnostics['Iteration %d Variance Mean' % it] = np.mean(var) 63 | diagnostics['Iteration %d Variance Std' % it] = np.std(var) 64 | 65 | return mean, diagnostics 66 | 67 | def update_sol(self, costs, samples, noise, init_mean, init_var): 68 | return samples[np.argmin(costs)], init_var 69 | -------------------------------------------------------------------------------- /lifelong_rl/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/policies/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/policies/base/base.py: 
-------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | import abc 4 | 5 | 6 | class Policy(object, metaclass=abc.ABCMeta): 7 | 8 | """ 9 | General policy interface. 10 | """ 11 | 12 | @abc.abstractmethod 13 | def get_action(self, observation): 14 | """ 15 | 16 | :param observation: 17 | :return: action, debug_dictionary 18 | """ 19 | pass 20 | 21 | def reset(self): 22 | pass 23 | 24 | 25 | class ExplorationPolicy(Policy, metaclass=abc.ABCMeta): 26 | def set_num_steps_total(self, t): 27 | pass 28 | 29 | 30 | class MakeDeterministic(nn.Module, Policy): 31 | def __init__(self, stochastic_policy): 32 | super().__init__() 33 | self.stochastic_policy = stochastic_policy 34 | 35 | def get_action(self, observation): 36 | return self.stochastic_policy.get_action(observation, 37 | deterministic=True) 38 | -------------------------------------------------------------------------------- /lifelong_rl/policies/base/latent_prior_policy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from lifelong_rl.policies.base.base import ExplorationPolicy 4 | import lifelong_rl.torch.pytorch_util as ptu 5 | 6 | 7 | class PriorLatentPolicy(ExplorationPolicy): 8 | 9 | """ 10 | Policy sampling according to some internal latent. 11 | TODO: This class needs refactoring. 12 | """ 13 | 14 | def __init__( 15 | self, 16 | policy, 17 | prior, 18 | unconditional=False, 19 | steps_between_sampling=100, 20 | ): 21 | self.policy = policy 22 | self.prior = prior 23 | self.unconditional = unconditional 24 | self.steps_between_sampling = steps_between_sampling 25 | 26 | self.fixed_latent = False 27 | 28 | self._steps_since_last_sample = 0 29 | self._last_latent = None 30 | 31 | def set_latent(self, latent): 32 | self._last_latent = latent 33 | 34 | def get_current_latent(self): 35 | return ptu.get_numpy(self._last_latent) 36 | 37 | def sample_latent(self, state=None): 38 | if self.unconditional or state is None: # this will probably be changed 39 | latent = self.prior.sample() # n=1).squeeze(0) 40 | else: 41 | latent = self.prior.forward(ptu.from_numpy(state)) 42 | self.set_latent(latent) 43 | return latent 44 | 45 | def get_action(self, state): 46 | if (self._steps_since_last_sample >= self.steps_between_sampling or 47 | self._last_latent is None) and not self.fixed_latent: 48 | latent = self.sample_latent(state) 49 | self._steps_since_last_sample = 0 50 | else: 51 | latent = self._last_latent 52 | self._steps_since_last_sample += 1 53 | 54 | state = ptu.from_numpy(state) 55 | sz = torch.cat((state, latent)) 56 | action, *_ = self.policy.forward(sz) 57 | return ptu.get_numpy(action), dict() 58 | 59 | def eval(self): 60 | self.policy.eval() 61 | 62 | def train(self): 63 | self.policy.train() 64 | -------------------------------------------------------------------------------- /lifelong_rl/policies/base/simple.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.policies.base.base import Policy 2 | 3 | 4 | class RandomPolicy(Policy): 5 | 6 | """ 7 | Policy that outputs random actions. 
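Returns action_space.sample() together with an empty info dict, so it satisfies the standard Policy.get_action interface.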
8 | """ 9 | 10 | def __init__(self, action_space): 11 | self.action_space = action_space 12 | 13 | def get_action(self, obs): 14 | return self.action_space.sample(), {} 15 | -------------------------------------------------------------------------------- /lifelong_rl/policies/models/gaussian_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn as nn 4 | 5 | from lifelong_rl.policies.base.base import ExplorationPolicy 6 | from lifelong_rl.torch.pytorch_util import eval_np 7 | from lifelong_rl.torch.distributions import TanhNormal 8 | from lifelong_rl.models.networks import Mlp 9 | import lifelong_rl.torch.pytorch_util as ptu 10 | 11 | 12 | LOG_SIG_MAX = 2 13 | LOG_SIG_MIN = -20 14 | 15 | 16 | class TanhGaussianPolicy(Mlp, ExplorationPolicy): 17 | 18 | """ 19 | Usage: 20 | 21 | ``` 22 | policy = TanhGaussianPolicy(...) 23 | action, mean, log_std, _ = policy(obs) 24 | action, mean, log_std, _ = policy(obs, deterministic=True) 25 | action, mean, log_std, log_prob = policy(obs, return_log_prob=True) 26 | ``` 27 | 28 | Here, mean and log_std are the mean and log_std of the Gaussian that is 29 | sampled from. 30 | 31 | If deterministic is True, action = tanh(mean). 32 | If return_log_prob is False (default), log_prob = None 33 | This is done because computing the log_prob can be a bit expensive. 34 | """ 35 | 36 | def __init__( 37 | self, 38 | hidden_sizes, 39 | obs_dim, 40 | action_dim, 41 | std=None, 42 | init_w=1e-3, 43 | restrict_obs_dim=0, 44 | **kwargs 45 | ): 46 | super().__init__( 47 | hidden_sizes, 48 | input_size=obs_dim, 49 | output_size=action_dim, 50 | init_w=init_w, 51 | **kwargs 52 | ) 53 | self.log_std = None 54 | self.std = std 55 | self.restrict_obs_dim = restrict_obs_dim 56 | 57 | if std is None: 58 | last_hidden_size = obs_dim 59 | if len(hidden_sizes) > 0: 60 | last_hidden_size = hidden_sizes[-1] 61 | self.last_fc_log_std = nn.Linear(last_hidden_size, action_dim) 62 | self.last_fc_log_std.weight.data.uniform_(-init_w, init_w) 63 | self.last_fc_log_std.bias.data.uniform_(-init_w, init_w) 64 | else: 65 | init_logstd = ptu.ones(1, action_dim) * np.log(std) 66 | self.log_std = torch.nn.Parameter(init_logstd, requires_grad=True) 67 | 68 | # for NPG 69 | model_parameters = filter(lambda p: p.requires_grad, self.parameters()) 70 | self.trainable_params = list(model_parameters) + [self.log_std] 71 | self.param_shapes = [p.cpu().data.numpy().shape for p in self.trainable_params] 72 | self.param_sizes = [p.cpu().data.numpy().size for p in self.trainable_params] 73 | 74 | def get_action(self, obs_np, deterministic=False): 75 | actions = self.get_actions(obs_np[None], deterministic=deterministic) 76 | return actions[0, :], {} 77 | 78 | def get_actions(self, obs_np, deterministic=False): 79 | return eval_np(self, obs_np, deterministic=deterministic)[0] 80 | 81 | def forward( 82 | self, 83 | obs, 84 | reparameterize=True, 85 | deterministic=False, 86 | return_log_prob=False, 87 | ): 88 | """ 89 | :param obs: Observation 90 | :param deterministic: If True, do not sample 91 | :param return_log_prob: If True, return a sample and its log probability 92 | """ 93 | if len(obs.shape) == 1: 94 | obs = obs[self.restrict_obs_dim:] 95 | else: 96 | obs = obs[:,self.restrict_obs_dim:] 97 | 98 | h = obs 99 | for i, fc in enumerate(self.fcs): 100 | h = self.hidden_activation(fc(h)) 101 | mean = self.last_fc(h) 102 | if self.std is None: 103 | log_std = self.last_fc_log_std(h) 104 | log_std = 
torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX) 105 | std = torch.exp(log_std) 106 | else: 107 | log_std = self.log_std * ptu.ones(*mean.shape) 108 | log_std = torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX) 109 | std = log_std.exp() 110 | 111 | log_prob = None 112 | entropy = None 113 | mean_action_log_prob = None 114 | pre_tanh_value = None 115 | if deterministic: 116 | action = torch.tanh(mean) 117 | else: 118 | tanh_normal = TanhNormal(mean, std) 119 | if return_log_prob: 120 | if reparameterize is True: 121 | action, pre_tanh_value = tanh_normal.rsample( 122 | return_pretanh_value=True 123 | ) 124 | else: 125 | action, pre_tanh_value = tanh_normal.sample( 126 | return_pretanh_value=True 127 | ) 128 | log_prob = tanh_normal.log_prob( 129 | action, 130 | pre_tanh_value=pre_tanh_value 131 | ) 132 | log_prob = log_prob.sum(dim=1, keepdim=True) 133 | else: 134 | if reparameterize is True: 135 | action = tanh_normal.rsample() 136 | else: 137 | action = tanh_normal.sample() 138 | 139 | return ( 140 | action, mean, log_std, log_prob, entropy, std, 141 | mean_action_log_prob, pre_tanh_value, 142 | ) 143 | 144 | def get_log_probs(self, obs, actions): 145 | _, mean, log_std, *_ = self.forward(obs, deterministic=True) 146 | tanh_normal = TanhNormal(mean, log_std.exp()) 147 | return tanh_normal.log_prob(actions).sum(dim=-1, keepdim=True) 148 | 149 | def get_param_values(self): 150 | params = np.concatenate([p.contiguous().view(-1).cpu().data.numpy() for p in self.trainable_params]) 151 | return params.copy() 152 | 153 | def set_param_values(self, new_params): 154 | current_idx = 0 155 | for idx, param in enumerate(self.trainable_params): 156 | vals = new_params[current_idx:current_idx + self.param_sizes[idx]] 157 | vals = vals.reshape(self.param_shapes[idx]) 158 | param.data = ptu.from_numpy(vals).float() 159 | current_idx += self.param_sizes[idx] 160 | self.trainable_params[-1].data = torch.clamp(self.trainable_params[-1], LOG_SIG_MIN) 161 | -------------------------------------------------------------------------------- /lifelong_rl/policies/mpc/policy_mpc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from lifelong_rl.policies.mpc.mpc import MPCPolicy 5 | 6 | 7 | class PolicyMPCController(MPCPolicy): 8 | 9 | """ 10 | Perform MPC planning over a policy that takes in an additional latent. 
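The planner searches over sequences of latents rather than raw actions; convert_plan_to_action / convert_plans_to_actions below decode a latent plan into actions by conditioning the wrapped skill policy on the concatenation of observation and latent.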
11 | """ 12 | 13 | def __init__( 14 | self, 15 | policy, # control policy to run that takes in a latent 16 | latent_dim, # dimension of the latent to feed the policy 17 | *args, 18 | **kwargs 19 | ): 20 | super().__init__(plan_dim=latent_dim, *args, **kwargs) 21 | self.policy = policy 22 | 23 | def convert_plan_to_action(self, obs, plan, deterministic=False): 24 | action, *_ = self.policy.get_action( 25 | np.concatenate((obs, plan), axis=-1), 26 | deterministic=True, 27 | ) 28 | return action 29 | 30 | def convert_plans_to_actions(self, obs, plans, deterministic=True): 31 | actions, *_ = self.policy( 32 | torch.cat((obs, plans), dim=-1), 33 | deterministic=deterministic, 34 | ) 35 | return actions 36 | -------------------------------------------------------------------------------- /lifelong_rl/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from lifelong_rl.samplers.data_collector.base import DataCollector, PathCollector, StepCollector 2 | from lifelong_rl.samplers.utils.rollout_functions import rollout, multitask_rollout 3 | 4 | 5 | DataCollector, PathCollector, StepCollector = DataCollector, PathCollector, StepCollector 6 | rollout, multitask_rollout = rollout, multitask_rollout 7 | -------------------------------------------------------------------------------- /lifelong_rl/samplers/data_collector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/samplers/data_collector/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/samplers/data_collector/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class DataCollector(object, metaclass=abc.ABCMeta): 5 | def end_epoch(self, epoch): 6 | pass 7 | 8 | def get_diagnostics(self): 9 | return {} 10 | 11 | def get_snapshot(self): 12 | return {} 13 | 14 | @abc.abstractmethod 15 | def get_epoch_paths(self): 16 | pass 17 | 18 | 19 | class PathCollector(DataCollector, metaclass=abc.ABCMeta): 20 | @abc.abstractmethod 21 | def collect_new_paths( 22 | self, 23 | max_path_length, 24 | num_steps, 25 | discard_incomplete_paths, 26 | ): 27 | pass 28 | 29 | 30 | class StepCollector(DataCollector, metaclass=abc.ABCMeta): 31 | @abc.abstractmethod 32 | def collect_new_steps( 33 | self, 34 | max_path_length, 35 | num_steps, 36 | discard_incomplete_paths, 37 | ): 38 | pass 39 | -------------------------------------------------------------------------------- /lifelong_rl/samplers/utils/path_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | import lifelong_rl.torch.pytorch_util as ptu 5 | 6 | 7 | """ 8 | Various path utilities (mostly used by policy gradient experiment_configs) 9 | """ 10 | 11 | 12 | def calculate_baselines(paths, value_func): 13 | for path in paths: 14 | obs = ptu.from_numpy(np.concatenate( 15 | [path['observations'], path['next_observations'][-1:]], axis=0 16 | )) 17 | values = torch.squeeze(value_func(obs), dim=-1) 18 | path['baselines'] = ptu.get_numpy(values) 19 | if path['terminals'][-1]: 20 | path['baselines'][-1] = 0 21 | 22 | 23 | def calculate_returns(paths, discount): 24 | for path in paths: 25 | rewards, dones = path['rewards'], path['terminals'] 26 | if 'baselines' in path: 27 | terminal_value = path['baselines'][-1] 28 | 
else: 29 | terminal_value = 0 30 | rewards = np.append(rewards, terminal_value) 31 | path['returns'] = discount_cumsum(rewards, dones, discount)[:-1] 32 | assert len(path['returns']) == len(dones) 33 | 34 | 35 | def calculate_advantages(paths, discount, gae_lambda=None, normalize=False): 36 | total_advs = [] 37 | for path in paths: 38 | returns = path['returns'] 39 | if 'baselines' not in path: 40 | advantages = returns 41 | elif gae_lambda is None: 42 | advantages = returns - path['baselines'][:-1] 43 | else: 44 | rewards, baselines, dones = path['rewards'], path['baselines'], path['terminals'] 45 | assert len(baselines) == len(rewards)+1 46 | td_deltas = rewards + discount * baselines[1:] - baselines[:-1] 47 | assert td_deltas.shape == rewards.shape 48 | advantages = discount_cumsum(td_deltas, dones, gae_lambda * discount) 49 | assert advantages.shape == rewards.shape 50 | path['advantages'] = advantages 51 | if normalize: 52 | total_advs = np.append(total_advs, advantages) 53 | if normalize: 54 | mean, std = total_advs.mean(), total_advs.std() 55 | for path in paths: 56 | path['advantages'] = (path['advantages'] - mean) / (std + 1e-6) 57 | 58 | 59 | def discount_cumsum(x, dones, gamma): 60 | discount_cumsum = np.zeros_like(x) 61 | discount_cumsum[-1] = x[-1] 62 | for t in reversed(range(x.shape[0]-1)): 63 | discount_cumsum[t] = x[t] + gamma * discount_cumsum[t+1] * (1-dones[t]) 64 | return discount_cumsum 65 | -------------------------------------------------------------------------------- /lifelong_rl/torch/modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class LayerNorm(nn.Module): 5 | 6 | """ 7 | Simple 1D LayerNorm. 8 | """ 9 | 10 | def __init__(self, features, center=True, scale=False, eps=1e-6): 11 | super().__init__() 12 | self.center = center 13 | self.scale = scale 14 | self.eps = eps 15 | if self.scale: 16 | self.scale_param = nn.Parameter(torch.ones(features)) 17 | else: 18 | self.scale_param = None 19 | if self.center: 20 | self.center_param = nn.Parameter(torch.zeros(features)) 21 | else: 22 | self.center_param = None 23 | 24 | def forward(self, x): 25 | mean = x.mean(-1, keepdim=True) 26 | std = x.std(-1, keepdim=True) 27 | output = (x - mean) / (std + self.eps) 28 | if self.scale: 29 | output = output * self.scale_param 30 | if self.center: 31 | output = output + self.center_param 32 | return output 33 | -------------------------------------------------------------------------------- /lifelong_rl/torch/pytorch_util.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def soft_update_from_to(source, target, tau): 6 | for target_param, param in zip(target.parameters(), source.parameters()): 7 | target_param.data.copy_( 8 | target_param.data * (1.0 - tau) + param.data * tau 9 | ) 10 | 11 | 12 | def copy_model_params_from_to(source, target): 13 | for target_param, param in zip(target.parameters(), source.parameters()): 14 | target_param.data.copy_(param.data) 15 | 16 | 17 | def fanin_init(tensor, scale=1): 18 | size = tensor.size() 19 | if len(size) == 2: 20 | fan_in = size[0] 21 | elif len(size) > 2: 22 | fan_in = np.prod(size[1:]) 23 | else: 24 | raise Exception("Shape must be have dimension at least 2.") 25 | bound = scale / np.sqrt(fan_in) 26 | return tensor.data.uniform_(-bound, bound) 27 | 28 | 29 | def orthogonal_init(tensor, gain=0.01): 30 | torch.nn.init.orthogonal_(tensor, gain=gain) 31 | 
32 | 33 | def fanin_init_weights_like(tensor): 34 | size = tensor.size() 35 | if len(size) == 2: 36 | fan_in = size[0] 37 | elif len(size) > 2: 38 | fan_in = np.prod(size[1:]) 39 | else: 40 | raise Exception("Shape must be have dimension at least 2.") 41 | bound = 1. / np.sqrt(fan_in) 42 | new_tensor = FloatTensor(tensor.size()) 43 | new_tensor.uniform_(-bound, bound) 44 | return new_tensor 45 | 46 | 47 | """ 48 | GPU wrappers 49 | """ 50 | 51 | _use_gpu = False 52 | device = None 53 | _gpu_id = 0 54 | 55 | 56 | def set_gpu_mode(mode, gpu_id=0): 57 | global _use_gpu 58 | global device 59 | global _gpu_id 60 | _gpu_id = gpu_id 61 | _use_gpu = mode 62 | device = torch.device("cuda:" + str(gpu_id) if _use_gpu else "cpu") 63 | 64 | 65 | def gpu_enabled(): 66 | return _use_gpu 67 | 68 | 69 | def set_device(gpu_id): 70 | torch.cuda.set_device(gpu_id) 71 | 72 | 73 | # noinspection PyPep8Naming 74 | def FloatTensor(*args, torch_device=None, **kwargs): 75 | if torch_device is None: 76 | torch_device = device 77 | return torch.FloatTensor(*args, **kwargs, device=torch_device) 78 | 79 | 80 | def from_numpy(*args, **kwargs): 81 | return torch.from_numpy(*args, **kwargs).float().to(device) 82 | 83 | 84 | def get_numpy(tensor): 85 | return tensor.to('cpu').detach().numpy() 86 | 87 | 88 | def zeros(*sizes, torch_device=None, **kwargs): 89 | if torch_device is None: 90 | torch_device = device 91 | return torch.zeros(*sizes, **kwargs, device=torch_device) 92 | 93 | 94 | def ones(*sizes, torch_device=None, **kwargs): 95 | if torch_device is None: 96 | torch_device = device 97 | return torch.ones(*sizes, **kwargs, device=torch_device) 98 | 99 | 100 | def ones_like(*args, torch_device=None, **kwargs): 101 | if torch_device is None: 102 | torch_device = device 103 | return torch.ones_like(*args, **kwargs, device=torch_device) 104 | 105 | 106 | def rand(*args, torch_device=None, **kwargs): 107 | if torch_device is None: 108 | torch_device = device 109 | return torch.rand(*args, **kwargs, device=torch_device) 110 | 111 | 112 | def randn(*args, torch_device=None, **kwargs): 113 | if torch_device is None: 114 | torch_device = device 115 | return torch.randn(*args, **kwargs, device=torch_device) 116 | 117 | 118 | def zeros_like(*args, torch_device=None, **kwargs): 119 | if torch_device is None: 120 | torch_device = device 121 | return torch.zeros_like(*args, **kwargs, device=torch_device) 122 | 123 | 124 | def tensor(*args, torch_device=None, **kwargs): 125 | if torch_device is None: 126 | torch_device = device 127 | return torch.tensor(*args, **kwargs, device=torch_device) 128 | 129 | 130 | def normal(*args, **kwargs): 131 | return torch.normal(*args, **kwargs).to(device) 132 | 133 | 134 | def eval_np(module, *args, **kwargs): 135 | """ 136 | Eval this module with a numpy interface 137 | 138 | Same as a call to __call__ except all Variable input/outputs are 139 | replaced with numpy equivalents. 140 | 141 | Assumes the output is either a single object or a tuple of objects. 
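    Example usage (module and array names here are hypothetical):

    ```
    q_values = eval_np(qf, obs_np, actions_np)  # numpy in, numpy out
    ```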
142 | """ 143 | torch_args = tuple(torch_ify(x) for x in args) 144 | torch_kwargs = {k: torch_ify(v) for k, v in kwargs.items()} 145 | outputs = module(*torch_args, **torch_kwargs) 146 | if isinstance(outputs, tuple): 147 | return tuple(np_ify(x) for x in outputs) 148 | else: 149 | return np_ify(outputs) 150 | 151 | 152 | def torch_ify(np_array_or_other): 153 | if isinstance(np_array_or_other, np.ndarray): 154 | return from_numpy(np_array_or_other) 155 | else: 156 | return np_array_or_other 157 | 158 | 159 | def np_ify(tensor_or_other): 160 | if isinstance(tensor_or_other, torch.autograd.Variable): 161 | return get_numpy(tensor_or_other) 162 | else: 163 | return tensor_or_other 164 | 165 | 166 | def _elem_or_tuple_to_variable(elem_or_tuple): 167 | if isinstance(elem_or_tuple, tuple): 168 | return tuple( 169 | _elem_or_tuple_to_variable(e) for e in elem_or_tuple 170 | ) 171 | return from_numpy(elem_or_tuple).float() 172 | 173 | 174 | def _filter_batch(np_batch): 175 | for k, v in np_batch.items(): 176 | if v.dtype == np.bool: 177 | yield k, v.astype(int) 178 | else: 179 | yield k, v 180 | 181 | 182 | def np_to_pytorch_batch(np_batch): 183 | return { 184 | k: _elem_or_tuple_to_variable(x) 185 | for k, x in _filter_batch(np_batch) 186 | if x.dtype != np.dtype('O') # ignore object (e.g. dictionaries) 187 | } 188 | -------------------------------------------------------------------------------- /lifelong_rl/torch/risk_aversion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | import lifelong_rl.torch.pytorch_util as ptu 5 | 6 | """ 7 | Risk aversion interface. Risk aversion (or seeking) can be represented as w^T x where x is 8 | a sorted vector of the values from the distribution drawn iid and w is a mask. This 9 | module handles the generation of these masks via defining the inverse beta function. 10 | """ 11 | 12 | 13 | def get_mask(mask_type, n_quantiles, risk_kwargs): 14 | """ 15 | Return a torch mask corresponding to the input parameters. 16 | """ 17 | if mask_type in _masks: 18 | if n_quantiles in _masks[mask_type]: 19 | return _masks[mask_type][n_quantiles] 20 | else: 21 | _masks[mask_type] = dict() 22 | 23 | if mask_type in _inverse_beta_funcs: 24 | _masks[mask_type][n_quantiles] = create_mask( 25 | _inverse_beta_funcs[mask_type], 26 | n_quantiles=n_quantiles, 27 | risk_kwargs=risk_kwargs, 28 | ) 29 | else: 30 | raise NotImplementedError('mask_type not recognized') 31 | 32 | return _masks[mask_type][n_quantiles] 33 | 34 | 35 | """ 36 | Utility functions 37 | """ 38 | 39 | 40 | _masks = dict() 41 | 42 | 43 | def create_mask(inverse_beta_func, n_quantiles, risk_kwargs): 44 | """ 45 | x in [0, 1] represents the CDF of the input. 46 | beta(x) represents the cumulative weight assigned to the lower x% of 47 | values, e.g. it is analogous to the CDF. This is typically easier 48 | to represent via the inverse of the beta function, so we take the 49 | inverse of the inverse beta function to get the original function. 
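    For example, CVaR with parameter alpha uses the inverse beta function min(tau / alpha, 1) (cvar_func below), so the resulting mask places all of its weight on the lowest alpha fraction of quantiles.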
50 | The reweighted function becomes: 51 | R(f, beta) = sum_i f(i/n) * (beta((i+1)/(n+1)) - beta(i/(n+1)) 52 | """ 53 | 54 | tau = np.linspace(0, 1, n_quantiles + 1) 55 | betas = np.zeros(n_quantiles + 1) 56 | mask = np.zeros(n_quantiles) 57 | 58 | # TODO: there are some issues with mask and risk_kwarg caching 59 | 60 | for i in range(n_quantiles + 1): 61 | betas[i] = inverse_beta_func(tau[i], risk_kwargs) 62 | for i in range(n_quantiles): 63 | mask[i] = betas[i+1] - betas[i] 64 | 65 | return ptu.from_numpy(mask) 66 | 67 | 68 | def get_inverse(func, x, n_bins=1024, risk_kwargs=None): 69 | # assumes domain/range is (0, 1), and function is monotonically increasing 70 | 71 | # assume we don't need things finer than 1024 for now, just 72 | # going to use a slow linear search 73 | # TODO: this function can be rewritten much better 74 | 75 | risk_kwargs = risk_kwargs if risk_kwargs is not None else dict() 76 | 77 | for i in range(n_bins): 78 | new_val = func(i / n_bins, risk_kwargs) 79 | if x <= new_val: 80 | return i / n_bins 81 | return 1. 82 | 83 | 84 | """ 85 | Types of risk aversion 86 | """ 87 | 88 | 89 | def neutral_func(tau, risk_kwargs): 90 | # Neutral risk preference / expected value / identity function 91 | return tau 92 | 93 | 94 | def cvar_func(tau, risk_kwargs): 95 | # Conditional Value at Risk (only consider bottom alpha% of outcomes) 96 | alpha = risk_kwargs['alpha'] 97 | if tau < alpha: 98 | return tau / alpha 99 | else: 100 | return 1. 101 | 102 | 103 | def _cpw_weight(tau, risk_kwargs): 104 | eta = risk_kwargs['eta'] 105 | return (tau ** eta) / (((tau ** eta) + (1 - tau) ** eta) ** (1 / eta)) 106 | 107 | 108 | def cpw_func(tau, risk_kwargs): 109 | # Cumulative Probability Weighting (from prospect theory) 110 | return get_inverse(_cpw_weight, tau, risk_kwargs=risk_kwargs) 111 | 112 | 113 | _inverse_beta_funcs = dict( 114 | neutral=neutral_func, 115 | cvar=cvar_func, 116 | cpw=cpw_func, 117 | ) 118 | -------------------------------------------------------------------------------- /lifelong_rl/trainers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/trainers/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/trainers/dads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/trainers/dads/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/trainers/dads/empowerment_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | import lifelong_rl.torch.pytorch_util as ptu 5 | 6 | 7 | def calculate_contrastive_empowerment( 8 | discriminator, 9 | obs, 10 | next_obs, 11 | latents, 12 | num_prior_samples=512, 13 | distribution_type='uniform', 14 | split_group=4096*32, 15 | obs_mean=None, 16 | obs_std=None, 17 | return_diagnostics=False, 18 | prior=None, 19 | ): 20 | """ 21 | Described in Sharma et al 2019. 22 | Approximate variational lower bound using estimate of s' from s, z. 23 | Uses contrastive negatives to approximate denominator. 
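    Concretely, with L = num_prior_samples alternative latents z_i (drawn uniformly or from the prior, depending on distribution_type), the intrinsic reward computed at the end of this function is
        r = log(L + 1) - log(1 + sum_i exp(clip(log q(s'|s, z_i) - log q(s'|s, z), -50, 50))),
    where q is the skill-dynamics discriminator.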
24 | """ 25 | 26 | discriminator.eval() 27 | 28 | if obs_mean is not None: 29 | obs = (obs - obs_mean) / (obs_std + 1e-6) 30 | next_obs = (next_obs - obs_mean) / (obs_std + 1e-6) 31 | 32 | obs_deltas = ptu.from_numpy(next_obs - obs) 33 | obs_altz = np.concatenate([obs] * num_prior_samples, axis=0) 34 | 35 | with torch.no_grad(): 36 | logp = discriminator.get_log_prob( 37 | ptu.from_numpy(obs), 38 | ptu.from_numpy(latents), 39 | obs_deltas, 40 | ) 41 | logp = ptu.get_numpy(logp) 42 | 43 | if distribution_type == 'uniform': 44 | latent_altz = np.random.uniform(low=-1, high=1, size=(obs_altz.shape[0], latents.shape[1])) 45 | elif distribution_type == 'prior': 46 | if prior is None: 47 | raise AssertionError('prior specified but not passed in') 48 | obs_t = ptu.from_numpy(obs_altz) 49 | latent_altz, *_ = prior.get_action(obs_t, deterministic=False) 50 | else: 51 | raise NotImplementedError('distribution_type not found') 52 | 53 | # keep track of next obs/delta 54 | next_obs_altz = np.concatenate([next_obs - obs] * num_prior_samples, axis=0) 55 | 56 | with torch.no_grad(): 57 | if obs_altz.shape[0] <= split_group: 58 | logp_altz = ptu.get_numpy(discriminator.get_log_prob( 59 | ptu.from_numpy(obs_altz), 60 | ptu.from_numpy(latent_altz), 61 | ptu.from_numpy(next_obs_altz), 62 | )) 63 | else: 64 | logp_altz = [] 65 | for split_idx in range(obs_altz.shape[0] // split_group): 66 | start_split = split_idx * split_group 67 | end_split = (split_idx + 1) * split_group 68 | logp_altz.append( 69 | ptu.get_numpy(discriminator.get_log_prob( 70 | ptu.from_numpy(obs_altz[start_split:end_split]), 71 | ptu.from_numpy(latent_altz[start_split:end_split]), 72 | ptu.from_numpy(next_obs_altz[start_split:end_split]), 73 | ))) 74 | if obs_altz.shape[0] % split_group: 75 | start_split = obs_altz.shape[0] % split_group 76 | logp_altz.append( 77 | ptu.get_numpy(discriminator.get_log_prob( 78 | ptu.from_numpy(obs_altz[-start_split:]), 79 | ptu.from_numpy(latent_altz[-start_split:]), 80 | ptu.from_numpy(next_obs_altz[-start_split:]), 81 | ))) 82 | logp_altz = np.concatenate(logp_altz) 83 | logp_altz = np.array(np.array_split(logp_altz, num_prior_samples)) 84 | 85 | if return_diagnostics: 86 | diagnostics = dict() 87 | orig_rep = np.repeat(np.expand_dims(logp, axis=0), axis=0, repeats=num_prior_samples) 88 | diagnostics['Pct Random Skills > Original'] = (orig_rep < logp_altz).mean() 89 | 90 | # final DADS reward 91 | intrinsic_reward = np.log(num_prior_samples + 1) - np.log(1 + np.exp( 92 | np.clip(logp_altz - logp.reshape(1, -1), -50, 50)).sum(axis=0)) 93 | 94 | if not return_diagnostics: 95 | return intrinsic_reward, (logp, logp_altz, logp - intrinsic_reward) 96 | else: 97 | return intrinsic_reward, (logp, logp_altz, logp - intrinsic_reward), diagnostics 98 | -------------------------------------------------------------------------------- /lifelong_rl/trainers/dads/skill_dynamics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | import lifelong_rl.torch.pytorch_util as ptu 5 | 6 | 7 | class SkillDynamics(torch.nn.Module): 8 | 9 | def __init__( 10 | self, 11 | observation_size, 12 | action_size, 13 | latent_size, 14 | normalize_observations=True, 15 | fc_layer_params=(256, 256), 16 | fix_variance=True, 17 | activation_func=torch.nn.ReLU, 18 | ): 19 | super().__init__() 20 | 21 | self._observation_size = observation_size 22 | self._action_size = action_size 23 | self._latent_size = latent_size 24 | self._normalize_observations = 
normalize_observations 25 | 26 | self._fc_layer_params = fc_layer_params 27 | self._fix_variance = fix_variance 28 | 29 | layers = [] 30 | for i in range(len(fc_layer_params)-1): 31 | if i == 0: 32 | layers.append(activation_func()) 33 | layers.append(torch.nn.Linear(fc_layer_params[i], fc_layer_params[i+1])) 34 | layers.append(activation_func()) 35 | self.model = torch.nn.Sequential(*layers) 36 | 37 | in_layers = [] 38 | if self._normalize_observations: 39 | in_layers.append(torch.nn.BatchNorm1d(observation_size + latent_size)) 40 | self.out_preproc = torch.nn.BatchNorm1d(observation_size, affine=False) 41 | else: 42 | print('not normalization observations') 43 | in_layers.append(torch.nn.Linear(observation_size + latent_size, fc_layer_params[0])) 44 | 45 | self.in_func = torch.nn.Sequential(*in_layers) 46 | 47 | self.out_mean = torch.nn.Linear(fc_layer_params[-1], observation_size) 48 | if not self._fix_variance: 49 | self.out_std = torch.nn.Linear(fc_layer_params[-1], observation_size) 50 | # TODO: implement clipping 51 | raise NotImplementedError 52 | 53 | self._normalize_output = True 54 | 55 | def forward(self, obs, latents): 56 | x = torch.cat([obs, latents], dim=-1) 57 | x = self.in_func(x) 58 | x = self.model(x) 59 | if self._fix_variance: 60 | return self.out_mean(x) 61 | else: 62 | return self.out_mean(x), self.out_std(x) 63 | 64 | def _get_distribution(self, obs, latents): 65 | x = torch.cat([obs, latents], dim=-1) 66 | x = self.in_func(x) 67 | x = self.model(x) 68 | 69 | mean = self.out_mean(x) 70 | if self._fix_variance: 71 | std = ptu.ones(*mean.shape) 72 | dist = torch.distributions.independent.Independent( 73 | torch.distributions.Normal(mean, std), 1 74 | ) 75 | else: 76 | raise NotImplementedError 77 | 78 | return dist 79 | 80 | def get_log_prob(self, obs, latents, next_obs): 81 | if self._normalize_observations: 82 | next_obs = self.out_preproc(next_obs) 83 | dist = self._get_distribution(obs, latents) 84 | return dist.log_prob(next_obs) 85 | 86 | def get_loss(self, obs, latents, next_obs, weights=None): 87 | log_probs = self.get_log_prob(obs, latents, next_obs) 88 | if weights is not None: 89 | log_probs = log_probs * weights 90 | return -log_probs.mean() 91 | -------------------------------------------------------------------------------- /lifelong_rl/trainers/her/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/trainers/her/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/trainers/her/her.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import copy 4 | 5 | from lifelong_rl.core.rl_algorithms.torch_rl_algorithm import TorchTrainer 6 | import lifelong_rl.torch.pytorch_util as ptu 7 | 8 | 9 | class HERTrainer(TorchTrainer): 10 | 11 | """ 12 | Hindsight Experience Replay (Andrychowicz et al. 2017). 13 | Duplicates transitions using different goals with particular reward function. 
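    With the default 'future' relabeling strategy, each transition at time t is duplicated using goals resampled (via relabel_goal_func) from states visited later in the same trajectory; rewards are recomputed with reward_func(s, a, s', g) before the relabeled transitions are added to the replay buffer.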
14 | """ 15 | 16 | def __init__( 17 | self, 18 | policy_trainer, 19 | replay_buffer, 20 | state_dim, 21 | reward_func=None, 22 | relabel_goal_func=None, 23 | num_sampled_goals=1, 24 | relabel_method='future', 25 | policy_batch_size=256, 26 | num_policy_steps=1, 27 | ): 28 | super().__init__() 29 | 30 | self.policy_trainer = policy_trainer 31 | self.replay_buffer = replay_buffer 32 | self.state_dim = state_dim 33 | self.reward_func = reward_func 34 | self.relabel_goal_func = relabel_goal_func 35 | self.num_sampled_goals = num_sampled_goals 36 | self.relabel_method = relabel_method 37 | self.policy_batch_size = policy_batch_size 38 | self.num_policy_steps = num_policy_steps 39 | 40 | # Default goal methods: L2 distance & goal = desired state 41 | if self.reward_func is None: 42 | self.reward_func = lambda s, a, ns, g: np.linalg.norm(ns[:2] - g) < .1 43 | if self.relabel_goal_func is None: 44 | self.relabel_goal_func = lambda s, a, ns, g: ns 45 | 46 | def train_from_paths(self, paths): 47 | 48 | """ 49 | Path processing 50 | """ 51 | 52 | paths = copy.deepcopy(paths) 53 | for path in paths: 54 | obs, next_obs = path['observations'], path['next_observations'] 55 | states, next_states = obs[:,:self.state_dim], next_obs[:,:self.state_dim] 56 | goals = obs[:,self.state_dim:2*self.state_dim] 57 | actions = path['actions'] 58 | terminals = path['terminals'] # this is probably always False, but might want it? 59 | path_len = len(obs) 60 | 61 | # Relabel goals based on transitions taken 62 | relabeled_goals = [] 63 | for t in range(len(obs)): 64 | relabeled_goals.append(self.relabel_goal_func( 65 | states[t], actions[t], next_states[t], goals[t], 66 | )) 67 | relabeled_goals = np.array(relabeled_goals) 68 | 69 | # Add transitions & resampled goals to replay buffer 70 | for t in range(path_len): 71 | goals_t = goals[t:t+1] 72 | for _ in range(self.num_sampled_goals): 73 | if self.relabel_method == 'future': 74 | goal_inds = np.random.randint(t, path_len, self.num_sampled_goals) 75 | goals_t = np.concatenate([goals_t, relabeled_goals[goal_inds]], axis=0) 76 | else: 77 | raise NotImplementedError 78 | 79 | for k in range(len(goals_t)): 80 | if not self.learn_reward_func: 81 | r = self.reward_func(states[t], actions[t], next_states[t], goals_t[k]) 82 | else: 83 | r = ptu.get_numpy( 84 | self.learned_reward_func( 85 | ptu.from_numpy( 86 | np.concatenate([next_states[t], goals[t]])))).mean() 87 | self.replay_buffer.add_sample( 88 | observation=np.concatenate([states[t], goals_t[k], obs[t,2*self.state_dim:]]), 89 | action=actions[t], 90 | reward=r, 91 | terminal=terminals[t], # not obvious what desired behavior is 92 | next_observation=np.concatenate( 93 | [next_states[t,:self.state_dim], goals_t[k], obs[t,2*self.state_dim:]]), 94 | env_info=None, 95 | ) 96 | 97 | """ 98 | Off-policy training 99 | """ 100 | 101 | for _ in range(self.num_policy_steps): 102 | train_data = self.replay_buffer.random_batch(self.policy_batch_size) 103 | self.policy_trainer.train(train_data) 104 | 105 | def get_diagnostics(self): 106 | return self.policy_trainer.get_diagnostics() 107 | 108 | def end_epoch(self, epoch): 109 | self.policy_trainer.end_epoch(epoch) 110 | 111 | @property 112 | def networks(self): 113 | return self.policy_trainer.networks 114 | 115 | def get_snapshot(self): 116 | return self.policy_trainer.get_snapshot() 117 | -------------------------------------------------------------------------------- /lifelong_rl/trainers/lisp/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/trainers/lisp/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/trainers/mbrl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/trainers/mbrl/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/trainers/mbrl/mbrl.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import numpy as np 4 | import torch 5 | import torch.optim as optim 6 | 7 | import lifelong_rl.torch.pytorch_util as ptu 8 | from lifelong_rl.core.rl_algorithms.torch_rl_algorithm import TorchTrainer 9 | 10 | 11 | class MBRLTrainer(TorchTrainer): 12 | def __init__( 13 | self, 14 | ensemble, 15 | num_elites=None, 16 | learning_rate=1e-3, 17 | batch_size=256, 18 | optimizer_class=optim.Adam, 19 | train_call_freq=1, 20 | **kwargs 21 | ): 22 | super().__init__() 23 | 24 | self.ensemble = ensemble 25 | self.ensemble_size = ensemble.ensemble_size 26 | self.num_elites = min(num_elites, self.ensemble_size) if num_elites \ 27 | else self.ensemble_size 28 | 29 | self.obs_dim = ensemble.obs_dim 30 | self.action_dim = ensemble.action_dim 31 | self.batch_size = batch_size 32 | self.train_call_freq = train_call_freq 33 | 34 | self.optimizer = self.construct_optimizer( 35 | ensemble, optimizer_class, learning_rate) 36 | 37 | self._n_train_steps_total = 0 38 | self._need_to_update_eval_statistics = True 39 | self.eval_statistics = OrderedDict() 40 | 41 | def construct_optimizer(self, model, optimizer_class, lr): 42 | decays = [.000025, .00005, .000075, .000075, .0001] 43 | 44 | fcs = model.fcs + [model.last_fc] 45 | optimizer = optimizer_class([ 46 | {'params': fcs[i].parameters(), 'weight_decay': decays[i]} \ 47 | for i in range(len(fcs)) 48 | ], lr=lr 49 | ) 50 | 51 | return optimizer 52 | 53 | def train_from_buffer(self, replay_buffer, holdout_pct=0.2, max_grad_steps=1000, epochs_since_last_update=5): 54 | self._n_train_steps_total += 1 55 | if self._n_train_steps_total % self.train_call_freq > 0 and self._n_train_steps_total > 1: 56 | return 57 | 58 | data = replay_buffer.get_transitions() 59 | x = data[:,:self.obs_dim + self.action_dim] # inputs s, a 60 | y = data[:,self.obs_dim + self.action_dim:] # predict r, d, ns 61 | y[:,-self.obs_dim:] -= x[:,:self.obs_dim] # predict delta in the state 62 | 63 | # normalize network inputs 64 | self.ensemble.fit_input_stats(x) 65 | 66 | # generate holdout set 67 | inds = np.random.permutation(data.shape[0]) 68 | x, y = x[inds], y[inds] 69 | 70 | n_train = max(int((1-holdout_pct) * data.shape[0]), data.shape[0] - 8092) 71 | n_test = data.shape[0] - n_train 72 | 73 | x_train, y_train = x[:n_train], y[:n_train] 74 | x_test, y_test = x[n_train:], y[n_train:] 75 | x_test, y_test = ptu.from_numpy(x_test), ptu.from_numpy(y_test) 76 | 77 | # train until holdout set convergence 78 | num_epochs, num_steps = 0, 0 79 | num_epochs_since_last_update = 0 80 | best_holdout_loss = float('inf') 81 | num_batches = int(np.ceil(n_train / self.batch_size)) 82 | 83 | while num_epochs_since_last_update < epochs_since_last_update and num_steps < max_grad_steps: 84 | # generate idx for each model to bootstrap 85 | 
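# (each of the ensemble_size models gets its own batch of indices drawn with
# replacement from the training set, so the reshape below gives every ensemble
# member a different bootstrap minibatch)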
self.ensemble.train() 86 | for b in range(num_batches): 87 | b_idxs = np.random.randint(n_train, size=(self.ensemble_size*self.batch_size)) 88 | x_batch, y_batch = x_train[b_idxs], y_train[b_idxs] 89 | x_batch, y_batch = ptu.from_numpy(x_batch), ptu.from_numpy(y_batch) 90 | x_batch = x_batch.view(self.ensemble_size, self.batch_size, -1) 91 | y_batch = y_batch.view(self.ensemble_size, self.batch_size, -1) 92 | loss = self.ensemble.get_loss(x_batch, y_batch) 93 | self.optimizer.zero_grad() 94 | loss.backward() 95 | self.optimizer.step() 96 | num_steps += num_batches 97 | 98 | # stop training based on holdout loss improvement 99 | self.ensemble.eval() 100 | with torch.no_grad(): 101 | holdout_losses, holdout_errors = self.ensemble.get_loss( 102 | x_test, y_test, split_by_model=True, return_l2_error=True) 103 | holdout_loss = sum(sorted(holdout_losses)[:self.num_elites]) / self.num_elites 104 | 105 | if num_epochs == 0 or \ 106 | (best_holdout_loss - holdout_loss) / abs(best_holdout_loss) > 0.01: 107 | best_holdout_loss = holdout_loss 108 | num_epochs_since_last_update = 0 109 | else: 110 | num_epochs_since_last_update += 1 111 | 112 | num_epochs += 1 113 | 114 | self.ensemble.elites = np.argsort(holdout_losses) 115 | 116 | if self._need_to_update_eval_statistics: 117 | self._need_to_update_eval_statistics = False 118 | 119 | self.eval_statistics['Model Elites Holdout Loss'] = \ 120 | np.mean(ptu.get_numpy(holdout_loss)) 121 | self.eval_statistics['Model Holdout Loss'] = \ 122 | np.mean(ptu.get_numpy(sum(holdout_losses))) / self.ensemble_size 123 | self.eval_statistics['Model Training Epochs'] = num_epochs 124 | self.eval_statistics['Model Training Steps'] = num_steps 125 | 126 | for i in range(self.ensemble_size): 127 | name = 'M%d' % (i+1) 128 | self.eval_statistics[name + ' Loss'] = \ 129 | np.mean(ptu.get_numpy(holdout_losses[i])) 130 | self.eval_statistics[name + ' L2 Error'] = \ 131 | np.mean(ptu.get_numpy(holdout_errors[i])) 132 | 133 | def train_from_torch(self, batch, idx=None): 134 | raise NotImplementedError 135 | 136 | def get_diagnostics(self): 137 | return self.eval_statistics 138 | 139 | def end_epoch(self, epoch): 140 | self._need_to_update_eval_statistics = True 141 | 142 | @property 143 | def networks(self): 144 | return [ 145 | self.ensemble 146 | ] 147 | 148 | def get_snapshot(self): 149 | return dict( 150 | ensemble=self.ensemble 151 | ) 152 | -------------------------------------------------------------------------------- /lifelong_rl/trainers/mpc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/trainers/mpc/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/trainers/mpc/mpc_trainer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from collections import OrderedDict 5 | 6 | from lifelong_rl.core.rl_algorithms.torch_rl_algorithm import TorchTrainer 7 | import lifelong_rl.torch.pytorch_util as ptu 8 | 9 | 10 | class MPPITrainer(TorchTrainer): 11 | 12 | """ 13 | Just a placeholder trainer since MPC does not require training 14 | """ 15 | 16 | def __init__( 17 | self, 18 | policy, 19 | ): 20 | super().__init__() 21 | 22 | self.policy = policy 23 | 24 | self.eval_statistics = OrderedDict() 25 | self._need_to_update_eval_statistics = True 26 | 27 | def train_from_torch(self, batch): 28 | return 
29 | 30 | def get_diagnostics(self): 31 | return self.eval_statistics 32 | 33 | def end_epoch(self, epoch): 34 | self.policy.end_epoch(epoch) 35 | self._need_to_update_eval_statistics = True 36 | 37 | @property 38 | def networks(self): 39 | return [self.policy.dynamics_model] 40 | 41 | def get_snapshot(self): 42 | return dict( 43 | dynamics_model=self.policy.dynamics_model, 44 | ) 45 | -------------------------------------------------------------------------------- /lifelong_rl/trainers/multi_trainer.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from lifelong_rl.core.rl_algorithms.torch_rl_algorithm import TorchTrainer 4 | 5 | 6 | class MultiTrainer(TorchTrainer): 7 | 8 | """ 9 | Interface for combining multiple trainers into one trainer. 10 | """ 11 | 12 | def __init__( 13 | self, 14 | trainers, # List of trainers 15 | trainer_steps, # List of number of steps to call each trainer per call of MultiTrainer 16 | trainer_names=None, # Optionally, specify the names (used for printing/logging) 17 | ): 18 | super().__init__() 19 | 20 | assert len(trainers) == len(trainer_steps), 'Must specify number of steps for each trainer' 21 | 22 | self.trainers = trainers 23 | self.trainer_steps = trainer_steps 24 | 25 | if trainer_names is None: 26 | self.trainer_names = ['trainer_%d' % i for i in range(1, len(trainers)+1)] 27 | else: 28 | self.trainer_names = trainer_names 29 | while len(self.trainer_names) < len(trainers): 30 | self.trainer_names.append('trainer_%d' % (len(self.trainer_names)+1)) 31 | 32 | self.eval_statistics = OrderedDict() 33 | 34 | def train_from_torch(self, batch): 35 | for i in range(len(self.trainers)): 36 | self.trainers[i].train_from_torch(batch) 37 | for k, v in self.trainers[i].get_diagnostics().items(): 38 | self.eval_statistics[self.trainer_names[i] + '/' + k] = v 39 | 40 | def get_diagnostics(self): 41 | return self.eval_statistics 42 | 43 | def end_epoch(self, epoch): 44 | for trainer in self.trainers: 45 | trainer.end_epoch(epoch) 46 | 47 | @property 48 | def networks(self): 49 | networks = [] 50 | for trainer in self.trainers: 51 | networks.extend(trainer.networks) 52 | return networks 53 | 54 | def get_snapshot(self): 55 | snapshot = dict() 56 | for i in range(len(self.trainers)): 57 | for k, v in self.trainers[i].get_diagnostics().items(): 58 | snapshot[self.trainer_names[i] + '/' + k] = v 59 | return snapshot 60 | -------------------------------------------------------------------------------- /lifelong_rl/trainers/pg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/trainers/pg/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/trainers/pg/npg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from lifelong_rl.trainers.pg.pg import PGTrainer 5 | import lifelong_rl.torch.pytorch_util as ptu 6 | 7 | 8 | def cg_solve(f_Ax, b, x_0=None, cg_iters=10, residual_tol=1e-10): 9 | x = np.zeros_like(b) #if x_0 is None else x_0 10 | r = b.copy() #if x_0 is None else b-f_Ax(x_0) 11 | p = r.copy() 12 | rdotr = r.dot(r) 13 | 14 | for i in range(cg_iters): 15 | z = f_Ax(p) 16 | v = rdotr / p.dot(z) 17 | x += v * p 18 | r -= v * z 19 | newrdotr = r.dot(r) 20 | mu = newrdotr / rdotr 21 | p = r + mu * p 22 | 23 | rdotr = 
newrdotr 24 | if rdotr < residual_tol: 25 | break 26 | 27 | return x 28 | 29 | 30 | class NPGTrainer(PGTrainer): 31 | 32 | """ 33 | Natural Policy Gradient with conjugate gradient to estimate the Hessian. 34 | Policy gradient algorithm with normalized update step. 35 | See https://github.com/aravindr93/mjrl/blob/master/mjrl/algos/npg_cg.py. 36 | """ 37 | 38 | def __init__( 39 | self, 40 | normalized_step_size=0.01, 41 | FIM_invert_args=None, 42 | hvp_sample_frac=1, 43 | *args, 44 | **kwargs, 45 | ): 46 | super().__init__(*args, **kwargs) 47 | 48 | self.normalized_step_size = normalized_step_size 49 | self.FIM_invert_args = FIM_invert_args if FIM_invert_args is not None else {'iters': 10, 'damping': 1e-4} 50 | self.hvp_sample_frac = hvp_sample_frac 51 | 52 | def CPI_surrogate(self, obs, actions, advantages, old_policy): 53 | adv_var = torch.autograd.Variable(advantages, requires_grad=False) 54 | log_probs = torch.squeeze(self.policy.get_log_probs(obs, actions), dim=-1) 55 | log_probs_old = torch.squeeze(old_policy.get_log_probs(obs, actions), dim=-1) 56 | LR = torch.exp(log_probs - log_probs_old) 57 | surr = torch.mean(LR * adv_var) 58 | return surr 59 | 60 | def flat_vpg(self, obs, actions, advantages, old_policy): 61 | cpi_surr = self.CPI_surrogate(obs, actions, advantages, old_policy) 62 | vpg_grad = torch.autograd.grad(cpi_surr, self.policy.trainable_params) 63 | vpg_grad = np.concatenate([g.contiguous().view(-1).cpu().data.numpy() for g in vpg_grad]) 64 | return vpg_grad, cpi_surr 65 | 66 | def HVP(self, observations, actions, old_policy, vector, regu_coef=None): 67 | regu_coef = self.FIM_invert_args['damping'] if regu_coef is None else regu_coef 68 | vec = torch.autograd.Variable(ptu.from_numpy(vector).float(), requires_grad=False) 69 | if self.hvp_sample_frac is not None and self.hvp_sample_frac < 0.99: 70 | num_samples = observations.shape[0] 71 | rand_idx = np.random.choice(num_samples, size=int(self.hvp_sample_frac*num_samples)) 72 | obs = observations[rand_idx] 73 | act = actions[rand_idx] 74 | else: 75 | obs = observations 76 | act = actions 77 | log_probs = torch.squeeze(self.policy.get_log_probs(obs, act), dim=-1) 78 | log_probs_old = torch.squeeze(old_policy.get_log_probs(obs, act), dim=-1) 79 | mean_kl = (log_probs_old - log_probs).mean() 80 | grad_fo = torch.autograd.grad(mean_kl, self.policy.trainable_params, create_graph=True) 81 | flat_grad = torch.cat([g.contiguous().view(-1) for g in grad_fo]) 82 | h = torch.sum(flat_grad*vec) 83 | hvp = torch.autograd.grad(h, self.policy.trainable_params) 84 | hvp_flat = np.concatenate([g.contiguous().view(-1).cpu().data.numpy() for g in hvp]) 85 | return hvp_flat + regu_coef * vector 86 | 87 | def build_Hvp_eval(self, inputs, regu_coef=None): 88 | def eval(v): 89 | full_inp = inputs + [v] + [regu_coef] 90 | Hvp = self.HVP(*full_inp) 91 | return Hvp 92 | return eval 93 | 94 | def train_policy(self, batch, old_policy): 95 | obs = ptu.from_numpy(batch['observations']) 96 | actions = ptu.from_numpy(batch['actions']) 97 | advantages = ptu.from_numpy(batch['advantages']) 98 | 99 | log_probs = torch.squeeze(self.policy.get_log_probs(obs, actions), dim=-1) 100 | log_probs_old = torch.squeeze(old_policy.get_log_probs(obs, actions), dim=-1) 101 | kl = (log_probs_old - log_probs).mean() 102 | 103 | vpg_grad, cpi_surr = self.flat_vpg(obs, actions, advantages, old_policy) 104 | hvp = self.build_Hvp_eval([obs, actions, old_policy], regu_coef=self.FIM_invert_args['damping']) 105 | npg_grad = cg_solve(hvp, vpg_grad, x_0=vpg_grad.copy(), 
cg_iters=self.FIM_invert_args['iters']) 106 | 107 | alpha = np.sqrt(np.abs(self.normalized_step_size / (np.dot(vpg_grad.T, npg_grad) + 1e-20))) 108 | 109 | cur_params = self.policy.get_param_values() 110 | new_params = cur_params + alpha * npg_grad 111 | self.policy.set_param_values(new_params) 112 | 113 | return -cpi_surr, kl 114 | -------------------------------------------------------------------------------- /lifelong_rl/trainers/pg/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from lifelong_rl.trainers.pg.pg import PGTrainer 4 | 5 | 6 | class PPOTrainer(PGTrainer): 7 | 8 | """ 9 | Proximal Policy Optimization (Schulman et al. 2016). 10 | Policy gradient algorithm with clipped surrogate loss. 11 | """ 12 | 13 | def __init__( 14 | self, 15 | ppo_epsilon=0.2, # Epsilon for clipping 16 | *args, 17 | **kwargs, 18 | ): 19 | super().__init__(*args, **kwargs) 20 | 21 | self.ppo_epsilon = ppo_epsilon 22 | 23 | def policy_objective(self, obs, actions, advantages, old_policy): 24 | log_probs = torch.squeeze(self.policy.get_log_probs(obs, actions), dim=-1) 25 | log_probs_old = torch.squeeze(old_policy.get_log_probs(obs, actions), dim=-1) 26 | 27 | ratio = torch.exp(log_probs - log_probs_old) 28 | policy_loss_1 = advantages * ratio 29 | policy_loss_2 = advantages * torch.clamp(ratio, 1-self.ppo_epsilon, 1+self.ppo_epsilon) 30 | objective = torch.min(policy_loss_1, policy_loss_2).mean() 31 | objective += self.entropy_coeff * (-log_probs).mean() 32 | 33 | kl = (log_probs_old - log_probs).mean() 34 | 35 | return objective, kl 36 | -------------------------------------------------------------------------------- /lifelong_rl/trainers/q_learning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kzl/lifelong_rl/44815713f6732abccac6c69bb0e3fe9e68e08210/lifelong_rl/trainers/q_learning/__init__.py -------------------------------------------------------------------------------- /lifelong_rl/trainers/trainer.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class Trainer(object, metaclass=abc.ABCMeta): 5 | 6 | @abc.abstractmethod 7 | def train(self, data): 8 | pass 9 | 10 | def end_epoch(self, epoch): 11 | pass 12 | 13 | def get_snapshot(self): 14 | return {} 15 | 16 | def get_diagnostics(self): 17 | return {} 18 | -------------------------------------------------------------------------------- /lifelong_rl/util/eval_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common evaluation utilities. 3 | """ 4 | 5 | from collections import OrderedDict 6 | from numbers import Number 7 | 8 | import numpy as np 9 | 10 | import lifelong_rl.util.pythonplusplus as ppp 11 | 12 | 13 | def get_generic_path_information(paths, stat_prefix=''): 14 | """ 15 | Get an OrderedDict with a bunch of statistic names and values. 
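    The statistics include Mean/Std/Max/Min of per-path rewards, returns, and actions, the number of paths, the average return, and initial/final/overall aggregates of any env_infos and agent_infos entries, all optionally prefixed by stat_prefix.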
16 | """ 17 | statistics = OrderedDict() 18 | returns = [sum(path["rewards"]) for path in paths] 19 | 20 | rewards = np.vstack([path["rewards"] for path in paths]) 21 | statistics.update(create_stats_ordered_dict('Rewards', rewards, 22 | stat_prefix=stat_prefix)) 23 | statistics.update(create_stats_ordered_dict('Returns', returns, 24 | stat_prefix=stat_prefix)) 25 | actions = [path["actions"] for path in paths] 26 | if len(actions[0].shape) == 1: 27 | actions = np.hstack([path["actions"] for path in paths]) 28 | else: 29 | actions = np.vstack([path["actions"] for path in paths]) 30 | statistics.update(create_stats_ordered_dict( 31 | 'Actions', actions, stat_prefix=stat_prefix 32 | )) 33 | statistics['Num Paths'] = len(paths) 34 | statistics[stat_prefix + 'Average Returns'] = get_average_returns(paths) 35 | 36 | for info_key in ['env_infos', 'agent_infos']: 37 | if info_key in paths[0]: 38 | all_env_infos = [ 39 | ppp.list_of_dicts__to__dict_of_lists(p[info_key]) 40 | for p in paths 41 | ] 42 | for k in all_env_infos[0].keys(): 43 | final_ks = np.array([info[k][-1] for info in all_env_infos]) 44 | first_ks = np.array([info[k][0] for info in all_env_infos]) 45 | all_ks = np.concatenate([info[k] for info in all_env_infos]) 46 | statistics.update(create_stats_ordered_dict( 47 | stat_prefix + k, 48 | final_ks, 49 | stat_prefix='{}/final/'.format(info_key), 50 | )) 51 | statistics.update(create_stats_ordered_dict( 52 | stat_prefix + k, 53 | first_ks, 54 | stat_prefix='{}/initial/'.format(info_key), 55 | )) 56 | statistics.update(create_stats_ordered_dict( 57 | stat_prefix + k, 58 | all_ks, 59 | stat_prefix='{}/'.format(info_key), 60 | )) 61 | 62 | return statistics 63 | 64 | 65 | def get_average_returns(paths): 66 | returns = [sum(path["rewards"]) for path in paths] 67 | return np.mean(returns) 68 | 69 | 70 | def create_stats_ordered_dict( 71 | name, 72 | data, 73 | stat_prefix=None, 74 | always_show_all_stats=True, 75 | exclude_max_min=False, 76 | ): 77 | if stat_prefix is not None: 78 | name = "{}{}".format(stat_prefix, name) 79 | if isinstance(data, Number): 80 | return OrderedDict({name: data}) 81 | 82 | if len(data) == 0: 83 | return OrderedDict() 84 | 85 | if isinstance(data, tuple): 86 | ordered_dict = OrderedDict() 87 | for number, d in enumerate(data): 88 | sub_dict = create_stats_ordered_dict( 89 | "{0}_{1}".format(name, number), 90 | d, 91 | ) 92 | ordered_dict.update(sub_dict) 93 | return ordered_dict 94 | 95 | if isinstance(data, list): 96 | try: 97 | iter(data[0]) 98 | except TypeError: 99 | pass 100 | else: 101 | data = np.concatenate(data) 102 | 103 | if (isinstance(data, np.ndarray) and data.size == 1 104 | and not always_show_all_stats): 105 | return OrderedDict({name: float(data)}) 106 | 107 | stats = OrderedDict([ 108 | (name + ' Mean', np.mean(data)), 109 | (name + ' Std', np.std(data)), 110 | ]) 111 | if not exclude_max_min: 112 | stats[name + ' Max'] = np.max(data) 113 | stats[name + ' Min'] = np.min(data) 114 | return stats 115 | -------------------------------------------------------------------------------- /lifelong_rl/util/visualize_mujoco.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | import copy 5 | import os 6 | import time 7 | 8 | 9 | def visualize_mujoco_from_states(env, sim_states, time_delay=0.008): 10 | """ 11 | Given the states of the simulator, we can visualize the past Mujoco timesteps. 
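    time_delay is the pause in seconds between rendered frames, i.e. it controls playback speed.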
12 | - Simulator states are obtained via env.sim.get_state() 13 | """ 14 | for t in range(len(sim_states)): 15 | env.sim.set_state(sim_states[t]) 16 | env.sim.forward() 17 | env.render() 18 | time.sleep(time_delay) 19 | env.close() 20 | 21 | 22 | def mujoco_rgb_from_states(env, sim_states, time_delay=0.008): 23 | """ 24 | Given the states of the simulator, we can visualize the past Mujoco timesteps. 25 | - Simulator states are obtained via env.sim.get_state() 26 | """ 27 | rgb = [] 28 | for t in range(len(sim_states)): 29 | env.sim.set_state(sim_states[t]) 30 | env.sim.forward() 31 | rgb.append(env.render(mode='rgb_array')) 32 | time.sleep(time_delay) 33 | env.close() 34 | return rgb 35 | 36 | 37 | def record_mujoco_video_from_states(env, file_name, sim_states, time_delay=0.008, video_params=None): 38 | rgb = mujoco_rgb_from_states(env, sim_states, time_delay=0) 39 | 40 | os.makedirs(os.path.dirname(file_name), exist_ok=True) 41 | 42 | if video_params is None: 43 | video_params = dict( 44 | size=(rgb[0].shape[0], rgb[0].shape[1]), # (width, height) 45 | fourcc=cv2.VideoWriter_fourcc(*'mp4v'), # use 'XVID' if not mp4 46 | fps=int(1/time_delay), 47 | ) 48 | 49 | out = cv2.VideoWriter(file_name, video_params['fourcc'], video_params['fps'], video_params['size']) 50 | for i in range(len(rgb)): 51 | img = cv2.cvtColor(rgb[i], cv2.COLOR_BGR2RGB) 52 | out.write(img) 53 | out.release() 54 | -------------------------------------------------------------------------------- /run_scripts/dads.py: -------------------------------------------------------------------------------- 1 | from experiment_utils.launch_experiment import launch_experiment 2 | 3 | from experiment_configs.configs.dads.dads_config import get_config 4 | from experiment_configs.algorithms.batch import get_algorithm 5 | 6 | ENV_NAME = 'Gridworld' 7 | experiment_kwargs = dict( 8 | exp_name='dads-gridworld', 9 | num_seeds=1, 10 | instance_type='c4.4xlarge', 11 | use_gpu=True, 12 | ) 13 | 14 | 15 | if __name__ == "__main__": 16 | variant = dict( 17 | algorithm='DADS', 18 | collector_type='batch_latent', 19 | replay_buffer_size=int(1e6), # for DADS, only used to store past history 20 | generated_replay_buffer_size=10000, # off-policy replay buffer helps learning 21 | env_name=ENV_NAME, 22 | env_kwargs=dict( 23 | grid_files=['blank'], # specifies which file to load for gridworld 24 | terminates=False, 25 | ), 26 | policy_kwargs=dict( 27 | layer_size=256, 28 | latent_dim=2, 29 | ), 30 | discriminator_kwargs=dict( 31 | layer_size=512, 32 | num_layers=2, 33 | restrict_input_size=0, 34 | ), 35 | trainer_kwargs=dict( 36 | num_prior_samples=512, 37 | num_discrim_updates=32, 38 | num_policy_updates=128, 39 | discrim_learning_rate=3e-4, 40 | policy_batch_size=256, 41 | reward_bounds=(-30, 30), 42 | reward_scale=5, # increasing reward scale helps learning signal 43 | ), 44 | policy_trainer_kwargs=dict( 45 | discount=0.99, 46 | policy_lr=3e-4, 47 | qf_lr=3e-4, 48 | soft_target_tau=5e-3, 49 | ), 50 | algorithm_kwargs=dict( 51 | num_epochs=100, 52 | num_eval_steps_per_epoch=5000, 53 | num_trains_per_train_loop=1, 54 | num_expl_steps_per_train_loop=2000, 55 | min_num_steps_before_training=0, 56 | max_path_length=100, 57 | save_snapshot_freq=100, 58 | ), 59 | ) 60 | 61 | sweep_values = { 62 | } 63 | 64 | launch_experiment( 65 | get_config=get_config, 66 | get_algorithm=get_algorithm, 67 | variant=variant, 68 | sweep_values=sweep_values, 69 | **experiment_kwargs 70 | ) 71 | -------------------------------------------------------------------------------- 
/run_scripts/lisp.py: -------------------------------------------------------------------------------- 1 | from experiment_utils.launch_experiment import launch_experiment 2 | 3 | from experiment_configs.algorithms.mbrl import get_algorithm 4 | from experiment_configs.algorithms.offline_mbrl import get_offline_algorithm 5 | from experiment_configs.configs.lisp.lisp_config import get_config 6 | from experiment_configs.configs.mpc.make_mpc_policy import make_get_config 7 | 8 | 9 | ENV_NAME = 'LifelongHopper' 10 | experiment_kwargs = dict( 11 | exp_name='lisp-lifelong-hopper', 12 | num_seeds=1, 13 | instance_type='c4.4xlarge', 14 | use_gpu=True, 15 | ) 16 | 17 | 18 | if __name__ == "__main__": 19 | variant = dict( 20 | algorithm='LiSP', 21 | collector_type='rf', # reset-free exploration environment 22 | env_name=ENV_NAME, 23 | env_kwargs=dict( 24 | terminates=False, 25 | ), 26 | do_offline_training=True, # perform both offline and online training (offline always first) 27 | do_online_training=True, 28 | teacher_data_files=['lifelong_hopper_full'], # see README to download 29 | replay_buffer_size=int(1e6), 30 | generated_replay_buffer_size=5000, # off-policy buffer for policy training 31 | use_as_eval_policy='uniform', # sample uniformly from skill policy for evaluation 32 | policy_kwargs=dict( 33 | layer_size=256, 34 | latent_dim=4, 35 | ), 36 | discriminator_kwargs=dict( 37 | layer_size=512, 38 | num_layers=2, 39 | restrict_input_size=0, 40 | ), 41 | rollout_len_schedule=[-1, -1, 1, 1], 42 | trainer_kwargs=dict( 43 | num_model_samples=400, 44 | num_prior_samples=32, 45 | num_discrim_updates=4, 46 | num_policy_updates=8, 47 | discrim_learning_rate=3e-4, 48 | policy_batch_size=256, 49 | reward_bounds=(-30, 30), 50 | empowerment_horizon=1, 51 | reward_scale=10, 52 | disagreement_threshold=.1, 53 | relabel_rewards=True, 54 | train_every=10, 55 | practice_batch_size=256, 56 | practice_train_steps=4, 57 | epsilon_greedy=0.2, 58 | ), 59 | policy_trainer_kwargs=dict( 60 | discount=0.99, 61 | policy_lr=3e-4, 62 | qf_lr=3e-4, 63 | soft_target_tau=5e-3, 64 | ), 65 | skill_practice_trainer_kwargs=dict( 66 | discount=0.99, 67 | policy_lr=3e-4, 68 | qf_lr=3e-4, 69 | soft_target_tau=5e-3, 70 | use_automatic_entropy_tuning=True, 71 | target_entropy=-4, 72 | ), 73 | mppi_kwargs=dict( 74 | discount=.99, 75 | horizon=60, 76 | repeat_length=3, 77 | plan_every=1, 78 | temperature=0.01, 79 | noise_std=1, 80 | num_rollouts=400, 81 | num_particles=5, 82 | planning_iters=10, 83 | polyak=0.2, 84 | sampling_mode='ts', 85 | sampling_kwargs=dict( 86 | reward_penalty=-20, 87 | disagreement_threshold=0.1, 88 | ), 89 | filter_coefs=(0.05, 0.8, 0), 90 | ), 91 | mbrl_kwargs=dict( 92 | ensemble_size=5, 93 | num_elites=5, 94 | layer_size=256, 95 | learning_rate=1e-3, 96 | batch_size=256, 97 | ), 98 | offline_kwargs=dict( 99 | num_epochs=2000, 100 | num_eval_steps_per_epoch=1000, 101 | num_trains_per_train_loop=100, 102 | model_batch_size=256, 103 | max_path_length=200, 104 | batch_size=256, 105 | save_snapshot_freq=1000, 106 | ), 107 | algorithm_kwargs=dict( 108 | num_epochs=10000, 109 | num_eval_steps_per_epoch=0, 110 | num_trains_per_train_loop=10, 111 | num_expl_steps_per_train_loop=10, 112 | min_num_steps_before_training=0, 113 | num_model_trains_per_train_loop=1, 114 | max_path_length=200, 115 | batch_size=256, 116 | model_batch_size=256, 117 | save_snapshot_freq=2500, 118 | ), 119 | ) 120 | 121 | sweep_values = { 122 | } 123 | 124 | launch_experiment( 125 | get_config=make_get_config(get_config), 126 | 
get_algorithm=get_algorithm, 127 | get_offline_algorithm=get_offline_algorithm, 128 | variant=variant, 129 | sweep_values=sweep_values, 130 | **experiment_kwargs 131 | ) 132 | -------------------------------------------------------------------------------- /run_scripts/loop.py: -------------------------------------------------------------------------------- 1 | from experiment_utils.launch_experiment import launch_experiment 2 | 3 | from experiment_configs.configs.mpc.loop_config import get_config 4 | from experiment_configs.algorithms.mbrl import get_algorithm 5 | 6 | ENV_NAME = 'InvertedPendulum' 7 | experiment_kwargs = dict( 8 | exp_name='loop-pendulum', 9 | num_seeds=1, 10 | instance_type='c4.4xlarge', 11 | use_gpu=True, 12 | ) 13 | 14 | """ 15 | Note: the current implementation does not include some of the tricks used to reduce policy 16 | divergence which is important for high-dimensional environments since the value function is 17 | learned off-policy. POLO and AOP have on-policy value function learning, with implementations here: 18 | https://github.com/kzl/aop 19 | """ 20 | 21 | 22 | if __name__ == "__main__": 23 | variant = dict( 24 | algorithm='LOOP', 25 | collector_type='step', 26 | env_name=ENV_NAME, 27 | env_kwargs=dict(), 28 | replay_buffer_size=int(1e6), 29 | mpc_kwargs=dict( 30 | discount=.99, 31 | horizon=5, 32 | repeat_length=1, 33 | plan_every=1, 34 | temperature=.01, 35 | noise_std=.5, 36 | num_rollouts=400, 37 | num_particles=5, # this is the num_particles PER ensemble member 38 | planning_iters=5, 39 | polyak=0., 40 | sampling_mode='ts', # note that model is written specifically for trajectory sampling 41 | filter_coefs=(0.2, 0.8, 0), # smoothing of noise for planning 42 | predict_terminal=True, 43 | ), 44 | mbrl_kwargs=dict( 45 | ensemble_size=4, 46 | layer_size=256, 47 | learning_rate=1e-3, 48 | batch_size=256, 49 | ), 50 | policy_kwargs=dict( 51 | layer_size=256, 52 | ), 53 | policy_trainer_kwargs=dict( 54 | discount=0.99, 55 | soft_target_tau=5e-3, 56 | target_update_period=1, 57 | policy_lr=3e-4, 58 | qf_lr=3e-4, 59 | ), 60 | algorithm_kwargs=dict( 61 | num_epochs=500, 62 | num_eval_steps_per_epoch=200, # Note for LOOP it's set so the SAC policy is used for eval 63 | num_trains_per_train_loop=200, 64 | num_expl_steps_per_train_loop=200, 65 | min_num_steps_before_training=200, 66 | num_model_trains_per_train_loop=1, 67 | max_path_length=200, 68 | batch_size=256, 69 | model_batch_size=256, 70 | save_snapshot_freq=500, 71 | ), 72 | ) 73 | 74 | sweep_values = dict() 75 | 76 | launch_experiment( 77 | get_config=get_config, 78 | get_algorithm=get_algorithm, 79 | variant=variant, 80 | sweep_values=sweep_values, 81 | **experiment_kwargs 82 | ) 83 | -------------------------------------------------------------------------------- /run_scripts/mbpo.py: -------------------------------------------------------------------------------- 1 | from experiment_utils.launch_experiment import launch_experiment 2 | 3 | from experiment_configs.configs.q_learning.mbpo_config import get_config 4 | from experiment_configs.algorithms.mbrl import get_algorithm 5 | 6 | ENV_NAME = 'Hopper' 7 | experiment_kwargs = dict( 8 | exp_name='mbpo-hopper', 9 | num_seeds=1, 10 | instance_type='c4.4xlarge', 11 | use_gpu=True, 12 | ) 13 | 14 | 15 | if __name__ == "__main__": 16 | variant = dict( 17 | algorithm='MBPO', 18 | collector_type='step', 19 | env_name=ENV_NAME, 20 | env_kwargs=dict(), 21 | replay_buffer_size=int(1e6), 22 | policy_kwargs=dict( 23 | layer_size=256, 24 | ), 25 | trainer_kwargs=dict( 
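        # Note on rollout_len_schedule a few lines below: following the MBPO codebase
        # convention, [a, b, x, y] linearly anneals the model rollout length from x to y
        # as the epoch goes from a to b, so [20, 100, 1, 15] grows rollouts from 1 to 15
        # over epochs 20-100, while [1, 1, 5, 5] in the MOPO/MOReL scripts keeps a fixed
        # length of 5. (This reading is inferred from the inline comments in these scripts;
        # see lifelong_rl/trainers/q_learning/mbpo.py for the authoritative behavior.)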
26 | num_model_rollouts=400, 27 | rollout_generation_freq=250, 28 | rollout_len_schedule=[20, 100, 1, 15], # same format as MBPO codebase 29 | generated_buffer_size=int(1e5), # size of synthetic generated replay buffer 30 | num_policy_updates=20, 31 | real_data_pct=0.05, 32 | policy_kwargs=dict( # kwargs for training the policy (note: inside trainer_kwargs) 33 | discount=0.99, 34 | soft_target_tau=5e-3, 35 | target_update_period=1, 36 | policy_lr=3e-4, 37 | qf_lr=3e-4, 38 | ), 39 | ), 40 | mbrl_kwargs=dict( 41 | ensemble_size=7, 42 | num_elites=5, 43 | learning_rate=1e-3, 44 | batch_size=256, 45 | hidden_sizes=[256,256,256,256], 46 | ), 47 | algorithm_kwargs=dict( 48 | num_epochs=100, 49 | num_eval_steps_per_epoch=5000, 50 | num_trains_per_train_loop=1000, 51 | num_expl_steps_per_train_loop=1000, 52 | min_num_steps_before_training=1000, 53 | num_model_trains_per_train_loop=4, 54 | max_path_length=1000, 55 | batch_size=256, 56 | model_batch_size=256, 57 | save_snapshot_freq=100, 58 | ), 59 | ) 60 | 61 | sweep_values = {} 62 | 63 | launch_experiment( 64 | get_config=get_config, 65 | get_algorithm=get_algorithm, 66 | variant=variant, 67 | sweep_values=sweep_values, 68 | **experiment_kwargs 69 | ) 70 | -------------------------------------------------------------------------------- /run_scripts/mopo.py: -------------------------------------------------------------------------------- 1 | from experiment_utils.launch_experiment import launch_experiment 2 | 3 | from experiment_configs.configs.q_learning.mbpo_config import get_config 4 | from experiment_configs.algorithms.offline_mbrl import get_offline_algorithm 5 | 6 | ENV_NAME = 'Hopper' 7 | experiment_kwargs = dict( 8 | exp_name='mopo-hopper-medexp', 9 | num_seeds=1, 10 | instance_type='c4.4xlarge', 11 | use_gpu=True, 12 | ) 13 | 14 | 15 | if __name__ == "__main__": 16 | variant = dict( 17 | algorithm='MOPO', 18 | collector_type='step', 19 | env_name=ENV_NAME, 20 | env_kwargs=dict(), 21 | do_offline_training=True, # here we specify we want to do offline training 22 | do_online_training=False, 23 | teacher_data_files=['d4rl-hopper-medium-expert'], # download this with example script 24 | replay_buffer_size=int(1e6), 25 | policy_kwargs=dict( 26 | layer_size=256, 27 | ), 28 | trainer_kwargs=dict( 29 | num_model_rollouts=400, 30 | rollout_generation_freq=250, 31 | rollout_len_schedule=[1, 1, 5, 5], # Using a constant rollout length of 5 32 | generated_buffer_size=int(1e5), 33 | num_policy_updates=20, 34 | real_data_pct=0.05, 35 | sampling_mode='var_disagreement', # Here we specify the MOPO algorithm (special case of MBPO) 36 | sampling_kwargs=dict( 37 | reward_penalty=1, # Original paper searched this out of (1, 5) 38 | ), 39 | policy_kwargs=dict( 40 | discount=0.99, 41 | soft_target_tau=5e-3, 42 | target_update_period=1, 43 | policy_lr=3e-4, 44 | qf_lr=3e-4, 45 | reward_scale=1, 46 | ), 47 | ), 48 | mbrl_kwargs=dict( 49 | ensemble_size=7, 50 | num_elites=5, 51 | learning_rate=3e-4, 52 | batch_size=256, 53 | hidden_sizes=[256] * 4, 54 | ), 55 | offline_kwargs=dict( 56 | num_epochs=10000, 57 | num_eval_steps_per_epoch=5000, 58 | num_trains_per_train_loop=100, 59 | model_batch_size=256, 60 | max_path_length=1000, 61 | batch_size=256, 62 | save_snapshot_freq=10000, 63 | ), 64 | ) 65 | 66 | sweep_values = { 67 | 'trainer_kwargs/sampling_kwargs/reward_penalty': [1, 5], # grid search over hyperparameter 68 | } 69 | 70 | launch_experiment( 71 | get_config=get_config, 72 | get_offline_algorithm=get_offline_algorithm, 73 | variant=variant, 74 | 
sweep_values=sweep_values, 75 | **experiment_kwargs 76 | ) 77 | -------------------------------------------------------------------------------- /run_scripts/morel.py: -------------------------------------------------------------------------------- 1 | from experiment_utils.launch_experiment import launch_experiment 2 | 3 | from experiment_configs.configs.q_learning.mbpo_config import get_config 4 | from experiment_configs.algorithms.offline_mbrl import get_offline_algorithm 5 | 6 | ENV_NAME = 'Hopper' 7 | experiment_kwargs = dict( 8 | exp_name='morel-hopper-medexp', 9 | num_seeds=1, 10 | instance_type='c4.4xlarge', 11 | use_gpu=True, 12 | ) 13 | 14 | """ 15 | Note: implementation uses MBPO/SAC as base instead of NPG. 16 | """ 17 | 18 | if __name__ == "__main__": 19 | variant = dict( 20 | algorithm='MOReL', 21 | collector_type='rf', 22 | env_name=ENV_NAME, 23 | env_kwargs=dict(), 24 | do_offline_training=True, # here we specify we want to train offline 25 | do_online_training=False, 26 | teacher_data_files=['d4rl-hopper-medium-expert'], # download this from example script 27 | replay_buffer_size=int(1e6), 28 | policy_kwargs=dict( 29 | layer_size=256, 30 | ), 31 | trainer_kwargs=dict( 32 | num_model_rollouts=400, 33 | rollout_generation_freq=250, 34 | rollout_len_schedule=[1, 1, 5, 5], # Using a constant rollout length of 5 35 | generated_buffer_size=int(1e5), 36 | num_policy_updates=20, 37 | real_data_pct=0.05, 38 | sampling_mode='mean_disagreement', # Here we specify the MOReL algorithm (special case of MBPO) 39 | sampling_kwargs=dict( 40 | threshold=0.05, 41 | penalty=100, # Kappa in paper 42 | ), 43 | policy_kwargs=dict( 44 | discount=0.99, 45 | soft_target_tau=5e-3, 46 | target_update_period=1, 47 | policy_lr=3e-4, 48 | qf_lr=3e-4, 49 | ), 50 | ), 51 | mbrl_kwargs=dict( 52 | ensemble_size=7, 53 | num_elites=5, 54 | learning_rate=3e-4, 55 | batch_size=256, 56 | hidden_sizes=[256] * 4, 57 | ), 58 | offline_kwargs=dict( 59 | num_epochs=10000, 60 | num_eval_steps_per_epoch=5000, 61 | num_trains_per_train_loop=100, 62 | model_batch_size=256, 63 | max_path_length=1000, 64 | batch_size=256, 65 | save_snapshot_freq=10000, 66 | ), 67 | ) 68 | 69 | sweep_values = {} 70 | 71 | launch_experiment( 72 | get_config=get_config, 73 | get_offline_algorithm=get_offline_algorithm, 74 | variant=variant, 75 | sweep_values=sweep_values, 76 | **experiment_kwargs 77 | ) 78 | -------------------------------------------------------------------------------- /run_scripts/mpc.py: -------------------------------------------------------------------------------- 1 | from experiment_utils.launch_experiment import launch_experiment 2 | 3 | from experiment_configs.configs.mpc.mpc import get_config 4 | from experiment_configs.algorithms.mbrl import get_algorithm 5 | 6 | ENV_NAME = 'InvertedPendulum' 7 | experiment_kwargs = dict( 8 | exp_name='mpc-pendulum', 9 | num_seeds=1, 10 | instance_type='c4.4xlarge', 11 | use_gpu=True, 12 | ) 13 | 14 | 15 | if __name__ == "__main__": 16 | variant = dict( 17 | algorithm='PETS', # most closely resembles PETS but can be adjusted with hyperparameters 18 | collector_type='step', 19 | env_name=ENV_NAME, 20 | env_kwargs=dict(), 21 | replay_buffer_size=int(1e6), 22 | mpc_kwargs=dict( 23 | discount=.99, 24 | horizon=25, 25 | repeat_length=1, 26 | plan_every=1, 27 | temperature=.01, 28 | noise_std=.5, 29 | num_rollouts=400, 30 | num_particles=5, # this is the num_particles PER ensemble member 31 | planning_iters=5, 32 | polyak=0., 33 | sampling_mode='ts', # note that model is written 
specifically for trajectory sampling 34 | filter_coefs=(0.2, 0.8, 0), # smoothing of noise for planning 35 | predict_terminal=True, 36 | ), 37 | mbrl_kwargs=dict( 38 | ensemble_size=4, 39 | layer_size=256, 40 | learning_rate=1e-3, 41 | batch_size=256, 42 | ), 43 | trainer_kwargs=dict(), 44 | algorithm_kwargs=dict( 45 | num_epochs=500, 46 | num_eval_steps_per_epoch=0, 47 | num_trains_per_train_loop=200, 48 | num_expl_steps_per_train_loop=200, 49 | min_num_steps_before_training=200, 50 | num_model_trains_per_train_loop=1, 51 | max_path_length=200, 52 | batch_size=256, 53 | model_batch_size=256, 54 | save_snapshot_freq=500, 55 | ), 56 | ) 57 | 58 | sweep_values = dict() 59 | 60 | launch_experiment( 61 | get_config=get_config, 62 | get_algorithm=get_algorithm, 63 | variant=variant, 64 | sweep_values=sweep_values, 65 | **experiment_kwargs 66 | ) 67 | -------------------------------------------------------------------------------- /run_scripts/npg.py: -------------------------------------------------------------------------------- 1 | from experiment_utils.launch_experiment import launch_experiment 2 | 3 | from experiment_configs.configs.pg.npg_config import get_config 4 | from experiment_configs.algorithms.batch import get_algorithm 5 | 6 | ENV_NAME = 'Hopper' 7 | experiment_kwargs = dict( 8 | exp_name='npg-hopper', 9 | num_seeds=1, 10 | instance_type='c4.4xlarge', 11 | use_gpu=False, 12 | ) 13 | 14 | 15 | if __name__ == "__main__": 16 | variant = dict( 17 | algorithm='NPG', 18 | collector_type='batch', 19 | env_name=ENV_NAME, 20 | env_kwargs=dict(), 21 | replay_buffer_size=int(1e6), 22 | policy_kwargs=dict( 23 | layer_size=64, 24 | ), 25 | value_kwargs=dict( 26 | layer_size=256, 27 | ), 28 | policy_trainer_kwargs=dict( 29 | discount=0.99, 30 | normalized_step_size=0.01, 31 | target_kl=0.01, 32 | gae_lambda=0.97, 33 | policy_lr=None, # no fixed learning rate in NPG, instead use normalized_step_size 34 | value_lr=3e-4, 35 | num_epochs=10, 36 | policy_batch_size=2048, 37 | value_batch_size=64, 38 | normalize_advantages=True, 39 | num_policy_epochs=None, 40 | ), 41 | algorithm_kwargs=dict( 42 | num_epochs=1000, 43 | num_eval_steps_per_epoch=5000, 44 | num_trains_per_train_loop=1, 45 | num_expl_steps_per_train_loop=2048, 46 | min_num_steps_before_training=1000, 47 | max_path_length=1000, 48 | save_snapshot_freq=1000, 49 | ), 50 | ) 51 | 52 | sweep_values = { 53 | } 54 | 55 | launch_experiment( 56 | get_config=get_config, 57 | get_algorithm=get_algorithm, 58 | variant=variant, 59 | sweep_values=sweep_values, 60 | **experiment_kwargs 61 | ) 62 | -------------------------------------------------------------------------------- /run_scripts/ppo.py: -------------------------------------------------------------------------------- 1 | from experiment_utils.launch_experiment import launch_experiment 2 | 3 | from experiment_configs.configs.pg.ppo_config import get_config 4 | from experiment_configs.algorithms.batch import get_algorithm 5 | 6 | ENV_NAME = 'Hopper' 7 | experiment_kwargs = dict( 8 | exp_name='ppo-hopper', 9 | num_seeds=1, 10 | instance_type='c4.4xlarge', 11 | use_gpu=False, 12 | ) 13 | 14 | 15 | if __name__ == "__main__": 16 | variant = dict( 17 | algorithm='PPO', 18 | collector_type='batch', 19 | env_name=ENV_NAME, 20 | env_kwargs=dict(), 21 | replay_buffer_size=int(1e6), 22 | policy_kwargs=dict( 23 | layer_size=64, 24 | ), 25 | value_kwargs=dict( 26 | layer_size=256, 27 | ), 28 | policy_trainer_kwargs=dict( 29 | discount=0.99, 30 | gae_lambda=0.97, 31 | ppo_epsilon=0.2, 32 | policy_lr=3e-4, 
33 | value_lr=3e-4, 34 | target_kl=None, 35 | num_epochs=10, 36 | policy_batch_size=64, 37 | value_batch_size=64, 38 | normalize_advantages=True, 39 | ), 40 | algorithm_kwargs=dict( 41 | num_epochs=1000, 42 | num_eval_steps_per_epoch=5000, 43 | num_trains_per_train_loop=1, 44 | num_expl_steps_per_train_loop=2048, 45 | min_num_steps_before_training=1000, 46 | max_path_length=1000, 47 | save_snapshot_freq=1000, 48 | ), 49 | ) 50 | 51 | sweep_values = { 52 | } 53 | 54 | launch_experiment( 55 | get_config=get_config, 56 | get_algorithm=get_algorithm, 57 | variant=variant, 58 | sweep_values=sweep_values, 59 | **experiment_kwargs 60 | ) 61 | -------------------------------------------------------------------------------- /run_scripts/sac.py: -------------------------------------------------------------------------------- 1 | from experiment_utils.launch_experiment import launch_experiment 2 | 3 | from experiment_configs.configs.q_learning.sac_config import get_config 4 | from experiment_configs.algorithms.online import get_algorithm 5 | 6 | ENV_NAME = 'Hopper' 7 | experiment_kwargs = dict( 8 | exp_name='sac-hopper', 9 | num_seeds=1, 10 | instance_type='c4.4xlarge', 11 | use_gpu=False, 12 | ) 13 | 14 | 15 | if __name__ == "__main__": 16 | variant = dict( 17 | algorithm='SAC', 18 | collector_type='step', 19 | env_name=ENV_NAME, 20 | env_kwargs=dict(), 21 | replay_buffer_size=int(1e6), 22 | policy_kwargs=dict( 23 | layer_size=256, 24 | ), 25 | trainer_kwargs=dict( 26 | discount=0.99, 27 | soft_target_tau=5e-3, 28 | target_update_period=1, 29 | policy_lr=3e-4, 30 | qf_lr=3e-4, 31 | ), 32 | algorithm_kwargs=dict( 33 | num_epochs=1000, 34 | num_eval_steps_per_epoch=5000, 35 | num_trains_per_train_loop=1000, 36 | num_expl_steps_per_train_loop=1000, 37 | min_num_steps_before_training=1000, 38 | max_path_length=1000, 39 | batch_size=256, 40 | save_snapshot_freq=1000, 41 | ), 42 | ) 43 | 44 | sweep_values = {} 45 | 46 | launch_experiment( 47 | get_config=get_config, 48 | get_algorithm=get_algorithm, 49 | variant=variant, 50 | sweep_values=sweep_values, 51 | **experiment_kwargs 52 | ) 53 | -------------------------------------------------------------------------------- /run_scripts/vpg.py: -------------------------------------------------------------------------------- 1 | from experiment_utils.launch_experiment import launch_experiment 2 | 3 | from experiment_configs.configs.pg.vpg_config import get_config 4 | from experiment_configs.algorithms.batch import get_algorithm 5 | 6 | ENV_NAME = 'Hopper' 7 | experiment_kwargs = dict( 8 | exp_name='vpg-hopper', 9 | num_seeds=1, 10 | instance_type='c4.4xlarge', 11 | use_gpu=False, 12 | ) 13 | 14 | 15 | if __name__ == "__main__": 16 | variant = dict( 17 | algorithm='VPG', 18 | collector_type='batch', 19 | env_name=ENV_NAME, 20 | env_kwargs=dict(), 21 | replay_buffer_size=int(1e6), 22 | policy_kwargs=dict( 23 | layer_size=64, 24 | ), 25 | value_kwargs=dict( 26 | layer_size=256, 27 | ), 28 | policy_trainer_kwargs=dict( 29 | discount=0.99, 30 | gae_lambda=.95, 31 | policy_lr=3e-4, 32 | value_lr=3e-4, 33 | num_epochs=10, 34 | policy_batch_size=8192, 35 | value_batch_size=64, 36 | normalize_advantages=False, 37 | input_normalization=True, 38 | ), 39 | algorithm_kwargs=dict( 40 | num_epochs=1000, 41 | num_eval_steps_per_epoch=5000, 42 | num_trains_per_train_loop=1, 43 | num_expl_steps_per_train_loop=8192, 44 | min_num_steps_before_training=5000, # for input normalization 45 | max_path_length=1000, 46 | save_snapshot_freq=1000, 47 | ), 48 | ) 49 | 50 | sweep_values = { 
51 | } 52 | 53 | launch_experiment( 54 | get_config=get_config, 55 | get_algorithm=get_algorithm, 56 | variant=variant, 57 | sweep_values=sweep_values, 58 | **experiment_kwargs 59 | ) 60 | -------------------------------------------------------------------------------- /scripts/download_d4rl_dataset.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | import argparse 5 | import pickle 6 | import sys 7 | 8 | 9 | """ 10 | Download D4RL dataset and store in lifelong_rl format 11 | - Note this script requires having D4RL installed 12 | - See: https://github.com/rail-berkeley/d4rl 13 | """ 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--task', type=str, 17 | help='Which task to download dataset of (ex. halfcheetah-random-v0)') 18 | parser.add_argument('--output', type=str, default='$$$', 19 | help='What to name the output file of transitions (default: same as task)') 20 | args = parser.parse_args(sys.argv[1:]) 21 | 22 | print('Getting dataset for %s' % args.task) 23 | 24 | env = gym.make(args.task) 25 | dataset = env.get_dataset() 26 | dataset_len = len(dataset['observations']) 27 | 28 | print('%d transitions found with average reward %.4f' % (dataset_len, dataset['rewards'].mean())) 29 | 30 | # Note we store data as (obs, act, r, d, next_obs) 31 | np_dataset = np.concatenate([ 32 | dataset['observations'][:dataset_len-1], 33 | dataset['actions'][:dataset_len-1], 34 | dataset['rewards'][:dataset_len-1].reshape(dataset_len-1, 1), 35 | dataset['terminals'][:dataset_len-1].reshape(dataset_len-1, 1), 36 | dataset['observations'][1:], 37 | ], axis=-1) 38 | 39 | output_file = args.output 40 | if output_file == '$$$': 41 | output_file = args.task 42 | 43 | with open('agent_data/%s.pkl' % output_file, 'wb') as f: 44 | pickle.dump(np_dataset, f) 45 | 46 | print('Stored output in agent_data/%s.pkl' % output_file) 47 | -------------------------------------------------------------------------------- /scripts/viz_hist.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import argparse 4 | import sys 5 | 6 | from lifelong_rl.envs.env_processor import make_env 7 | import lifelong_rl.torch.pytorch_util as ptu 8 | from lifelong_rl.util.visualize_mujoco import record_mujoco_video_from_states 9 | 10 | from mujoco_py import GlfwContext 11 | GlfwContext(offscreen=True) # Create a window to init GLFW. 12 | 13 | 14 | """ 15 | Visualize replay buffer of agent and store as .mp4 16 | """ 17 | 18 | 19 | def get_env_states(snapshot_name): 20 | with open(snapshot_name + '.pt', 'rb') as f: 21 | snapshot = torch.load(f, map_location='cpu') 22 | env_states = snapshot['replay_buffer/env_states'] 23 | return env_states 24 | 25 | 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--snapshot', '-name', type=str, 28 | help='Name of snapshot to visualize (ex. 
12-07-hopper/run_1/itr_999') 29 | parser.add_argument('--env', type=str, 30 | help='Which environment to visualize for') 31 | parser.add_argument('--output', '-o', type=str, 32 | help='Name of file to output mp4 video to') 33 | parser.add_argument('--start', '-s', type=int, 34 | help='Timestep to start visualization from') 35 | parser.add_argument('--end', '-e', type=int, 36 | help='Timestep to end visualization (should be > start)') 37 | parser.add_argument('--time_delay', '-dt', type=float, default=0.008, 38 | help='Length of time between frames') 39 | args = parser.parse_args(sys.argv[1:]) 40 | 41 | 42 | ptu.set_gpu_mode(False) 43 | 44 | env_states = get_env_states(args.snapshot) 45 | env, _ = make_env(args.env) 46 | record_mujoco_video_from_states( 47 | env,  # pass the constructed env object, not the env name string 48 | args.output, 49 | env_states[args.start:args.end], 50 | time_delay=args.time_delay, 51 | ) 52 | --------------------------------------------------------------------------------
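A usage note on the video utilities in lifelong_rl/util/visualize_mujoco.py: record_mujoco_video_from_states builds its default video_params from the first rendered frame, and cv2.VideoWriter expects size as (width, height) while rendered frames are (height, width, 3) arrays, so for non-square render resolutions it is safer to pass video_params explicitly. A minimal sketch, assuming a Hopper snapshot saved under data/ and a 500x500 render size (both illustrative):

import cv2
import torch

from lifelong_rl.envs.env_processor import make_env
from lifelong_rl.util.visualize_mujoco import record_mujoco_video_from_states

# Illustrative snapshot path; the snapshot must contain 'replay_buffer/env_states'.
with open('data/sac-hopper/run_1/itr_999.pt', 'rb') as f:
    snapshot = torch.load(f, map_location='cpu')
sim_states = snapshot['replay_buffer/env_states'][:1000]

env, _ = make_env('Hopper')

width, height = 500, 500  # assumed offscreen render resolution; adjust to your setup

record_mujoco_video_from_states(
    env,
    'video/replay.mp4',
    sim_states,
    video_params=dict(
        size=(width, height),                    # cv2.VideoWriter wants (width, height)
        fourcc=cv2.VideoWriter_fourcc(*'mp4v'),
        fps=125,                                 # roughly 1 frame per 0.008 s
    ),
)

As in scripts/viz_hist.py, offscreen rgb_array rendering may additionally require creating a GLFW context first (mujoco_py.GlfwContext(offscreen=True)).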