├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── download_data.sh ├── examples └── 01_LSIQ │ ├── 01_episode │ ├── launcher.py │ └── lsiq_experiments.py │ ├── 02_episode_5 │ ├── launcher.py │ └── lsiq_experiments.py │ ├── 03_episode_10 │ ├── launcher.py │ └── lsiq_experiments.py │ └── 04_episode_25 │ ├── launcher.py │ └── lsiq_experiments.py ├── img └── Divergence_Minimization.gif ├── imitation_lib ├── __init__.py ├── imitation │ ├── __init__.py │ ├── gail_TRPO.py │ ├── iq_sac.py │ ├── iqfo_orig.py │ ├── iqfo_sac.py │ ├── lsiq.py │ ├── lsiq_h.py │ ├── lsiq_hc.py │ ├── lsiqfo.py │ ├── lsiqfo_h.py │ ├── lsiqfo_hc.py │ ├── offline │ │ ├── __init__.py │ │ ├── behavioral_cloning.py │ │ ├── iq_offline.py │ │ ├── lsiq_offline.py │ │ └── lsiq_offline_dm.py │ ├── sqil_sac.py │ └── vail_TRPO.py └── utils │ ├── __init__.py │ ├── action_models.py │ ├── distributions.py │ ├── math.py │ ├── networks.py │ ├── preprocessor.py │ └── training.py └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | .npz filter=lfs diff=lfs merge=lfs -text 2 | *.npz filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # examples 88 | logs/ 89 | 90 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 91 | __pypackages__/ 92 | 93 | # Celery stuff 94 | celerybeat-schedule 95 | celerybeat.pid 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # Environments 101 | .env 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | 127 | # expert datasets 128 | *.npz 129 | .idea/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Al-Hafez 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LS-IQ: Implicit Reward Regularization for Inverse Reinforcement Learning 2 | This is the official code base of the paper [*LS-IQ: Implicit Reward Regularization for Inverse Reinforcement Learning*](https://arxiv.org/pdf/2303.00599.pdf), 3 | which was presented at the eleventh International Conference on Learning Representations ([ICLR 2023](https://iclr.cc/Conferences/2023)) 4 | in Kigali Ruanda. Here, we also provide all the baselines for the [LocoMuJoCo](https://github.com/robfiras/loco-mujoco) imitation learning benchmark [*LocoMuJoCo: A Comprehensive Imitation Learning Benchmark for Locomotion*](https://arxiv.org/pdf/2311.02496.pdf) presented at the Robot Learning workshop at [NeurIPS 2023](https://nips.cc/). 5 | 6 | --- 7 | ![Divergence_Minimization](img/Divergence_Minimization.gif) 8 | ## Method 9 | Within this work, we analyze the effect of a squared norm regularizer on the implicit reward function in the inverse reinforcement learning setting. 10 | We build on previous work ([IQ-Learn](https://arxiv.org/pdf/2106.12142.pdf)), and show that this regularizer results in a minimzation 11 | of the Chi^2-Divergence between the expert and a mixture distribution. **We show that - unlike previously used divergences - this divergence is bounded 12 | and the resulting reward function is also bounded**. An example is given in the picture above, where the target distribution is blue, 13 | the current policy distribution is green, and the mixture is orange. As can be seen, the vanilla Chi^2 divergence can reach very high values - despite the support area being non-zero - 14 | while the divergence on the mixture is bounded. Both optimization share the same optimal solution. 15 | 16 | Also, this regularizer provides a particularly illuminating perspective: the original **objective can be understood as 17 | squared Bellman error minimization with fixed rewards for the expert and the policy**. This setting can be further used to 18 | stabilize training as shown in our paper. 19 | 20 | ### Key Advatanages 21 | ✅ Simple implementation on top of SAC \ 22 | ✅ Bounded objective with bounded reward yields stable and convenient training\ 23 | ✅ Retains performance even without expert actions\ 24 | ✅ Performs even when only 1 expert trajectory is given\ 25 | ✅ Works in complex and realistic environments such as on the Atlas Locomotion task\ 26 | ✅ Unlike previous methods, no survival bias! 27 | 28 | --- 29 | ## Installation 30 | You can install this repo by cloning and then 31 | 32 | ```shell 33 | cd ls-iq 34 | pip install -e . 35 | ``` 36 | 37 | ### Download the Datasets [not needed for LocoMuJoCo] 38 | In order to run the examples and reproduce the results, you have to download the datasets used in our paper. To do so, you have to install `gdown`: 39 | 40 | ```shell 41 | pip install gdown 42 | ``` 43 | Then you can just run the download script: 44 | ```shell 45 | chmod u+x ./download_data.sh 46 | ./download_data.sh 47 | ``` 48 | 49 | --- 50 | ## Examples 51 | You can find launcher files in the example folder to launch all different versions of LSIQ and to reproduce the main results 52 | of the paper. 53 | 54 | Here is how you run the training of LSIQ with 5 expert trajectories on all Mujoco Gym Tasks: 55 | 56 | ```shell 57 | cd examples/02_episode_5/ 58 | python launcher.py 59 | ``` 60 | To monitor the training, you have to use Tensorboard. Once the training is launched, the directory `logs` will be created, which contains 61 | the Tensorboard logging data. Here is how you run Tensorboard: 62 | 63 | ```shell 64 | tensorboard --logdir logs 65 | ``` 66 | 67 | 68 | Some experiments were such as the Atlas locomotion task were conducted on environment, which are yet not 69 | available on Mushroom-RL, but will be available soon! Once the environments are part of Mushroom-RL, the experiment files will be added here. 70 | Follow Mushroom-RL on Twitter [@Mushroom_RL](https://twitter.com/Mushroom_RL) to immediately get notified once the 71 | new environment package is available! 72 | 73 | --- 74 | ## Citation 75 | ``` 76 | @inproceedings{alhafez2023, 77 | title={LS-IQ: Implicit Reward Regularization for Inverse Reinforcement Learning}, 78 | author={Firas Al-Hafez and Davide Tateo and Oleg Arenz and Guoping Zhao and Jan Peters}, 79 | booktitle={Eleventh International Conference on Learning Representations (ICLR)}, 80 | year={2023}, 81 | url={https://openreview.net/pdf?id=o3Q4m8jg4BR}} 82 | ``` 83 | -------------------------------------------------------------------------------- /download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd examples 4 | gdown --folder https://drive.google.com/drive/folders/1I246M9aPzW1rAqyRC1hcqEeNno0X4usi?usp=share_link 5 | 6 | -------------------------------------------------------------------------------- /examples/01_LSIQ/01_episode/launcher.py: -------------------------------------------------------------------------------- 1 | import os 2 | from itertools import product 3 | from experiment_launcher import Launcher 4 | 5 | from experiment_launcher.utils import bool_local_cluster 6 | 7 | if __name__ == '__main__': 8 | LOCAL = bool_local_cluster() 9 | TEST = False 10 | USE_CUDA = False 11 | 12 | JOBLIB_PARALLEL_JOBS = 1 # or os.cpu_count() to use all cores 13 | N_SEEDS = 5 14 | 15 | launcher = Launcher(exp_name='lsiq_1', 16 | python_file='lsiq_experiments', 17 | n_exps=N_SEEDS, 18 | joblib_n_jobs=JOBLIB_PARALLEL_JOBS, 19 | n_cores=JOBLIB_PARALLEL_JOBS * 1, 20 | memory_per_core=JOBLIB_PARALLEL_JOBS * 6000, 21 | days=2, 22 | hours=0, 23 | minutes=0, 24 | seconds=0, 25 | use_timestamp=True, 26 | ) 27 | 28 | default_params = dict(n_epochs=150, 29 | n_steps_per_epoch=10000, 30 | n_eval_episodes=10, 31 | n_steps_per_fit=1, 32 | n_epochs_save=-1, 33 | logging_iter=10000, 34 | gamma=0.99, 35 | use_cuda=USE_CUDA, 36 | tau=0.005, 37 | use_target=True, 38 | loss_mode_exp="fix", 39 | regularizer_mode="plcy", 40 | learnable_alpha=False) 41 | 42 | log_std = [(-5, 2)] 43 | envs = ["Ant-v3", 44 | "HalfCheetah-v3", 45 | "Hopper-v3", 46 | "Humanoid-v3", 47 | "Walker2d-v3"] 48 | path_to_datasets = "../../00_Datasets/1_episode/" 49 | expert_data_filenames = ["expert_dataset_Ant-v3_6321.34_1_SAC.npz", 50 | "expert_dataset_HalfCheetah-v3_12312.93_1_SAC.npz", 51 | "expert_dataset_Hopper-v3_3729.74_1_SAC.npz", 52 | "expert_dataset_Humanoid-v3_6335.31_1_SAC.npz", 53 | "expert_dataset_Walker2d-v3_5830.37_1_SAC.npz"] 54 | 55 | expert_data_paths = [path_to_datasets + name for name in expert_data_filenames] 56 | 57 | # Ant 58 | launcher.add_experiment(env_id__=envs[0], expert_data_path=expert_data_paths[0], 59 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 60 | 61 | # HalfCheetah 62 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 63 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 64 | 65 | # Hopper 66 | launcher.add_experiment(env_id__=envs[2], expert_data_path=expert_data_paths[2], 67 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 68 | 69 | # Humanoid 70 | launcher.add_experiment(env_id__=envs[3], expert_data_path=expert_data_paths[3], 71 | plcy_loss_mode__="q_old_policy", init_alpha__=0.1, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 72 | 73 | # Walker2d 74 | launcher.add_experiment(env_id__=envs[4], expert_data_path=expert_data_paths[4], 75 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 76 | 77 | launcher.run(LOCAL, TEST) 78 | -------------------------------------------------------------------------------- /examples/01_LSIQ/01_episode/lsiq_experiments.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import perf_counter 3 | from contextlib import contextmanager 4 | 5 | import numpy as np 6 | import torch 7 | import torch.optim as optim 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | from mushroom_rl.core import Core 11 | from mushroom_rl.environments import Gym 12 | from mushroom_rl.utils.dataset import compute_J, compute_episodes_length 13 | from mushroom_rl.core.logger.logger import Logger 14 | 15 | 16 | from imitation_lib.imitation import LSIQ 17 | from imitation_lib.utils import FullyConnectedNetwork 18 | from imitation_lib.utils import prepare_expert_data, BestAgentSaver 19 | 20 | 21 | from experiment_launcher import run_experiment 22 | 23 | 24 | def _create_agent(mdp, expert_data, sw, lr_critic, lr_actor, plcy_loss_mode, 25 | regularizer_mode, use_target, lossQ_type, use_cuda, tau, 26 | learnable_alpha, init_alpha, reg_mult, Q_exp_loss, gamma, 27 | loss_mode_exp, log_std_min, log_std_max, delay_Q, n_fits, 28 | logging_iter): 29 | 30 | # calculate the minimum and maximum Q-function 31 | Q_max = 1.0 / (reg_mult * (1 - gamma)) 32 | Q_min = - 1.0 / (reg_mult * (1 - gamma)) 33 | 34 | # Settings 35 | initial_replay_size = 10000 36 | max_replay_size = 1000000 37 | batch_size = 256 # the real batch size is double the size as an expert batch is going to be added 38 | warmup_transitions = 15000 39 | 40 | lr_alpha = 2e-6 41 | weight_decay_actor = 0.0 42 | weight_decay_critic = 0.0 43 | 44 | target_entropy = -22.0 45 | 46 | # Approximator 47 | actor_input_shape = mdp.info.observation_space.shape 48 | actor_output_shape = (mdp.info.action_space.shape[0]*2,) 49 | actor_params = dict(network=FullyConnectedNetwork, 50 | n_features=[256, 256], 51 | input_shape=actor_input_shape, 52 | output_shape=actor_output_shape, 53 | activations=["relu", "relu", "identity"], 54 | use_cuda=use_cuda) 55 | 56 | actor_optimizer = {'class': optim.Adam, 57 | 'params': {'lr': lr_actor, 'weight_decay': weight_decay_actor}} 58 | 59 | critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) 60 | critic_params = dict(network=FullyConnectedNetwork, 61 | optimizer={'class': optim.Adam, 62 | 'params': {'lr': lr_critic, 'weight_decay': weight_decay_critic}}, 63 | n_features=[256, 256], 64 | input_shape=critic_input_shape, 65 | activations=["relu", "relu", "identity"], 66 | squeeze_out=False, 67 | output_shape=(1,), 68 | use_cuda=use_cuda) 69 | 70 | # create IQfO agent 71 | agent = LSIQ(mdp_info=mdp.info, batch_size=batch_size, initial_replay_size=initial_replay_size, 72 | max_replay_size=max_replay_size, demonstrations=expert_data, sw=sw, use_target=use_target, 73 | warmup_transitions=warmup_transitions, tau=tau, lr_alpha=lr_alpha, actor_params=actor_params, 74 | actor_optimizer=actor_optimizer, critic_params=critic_params, delay_Q=delay_Q, lossQ_type=lossQ_type, 75 | target_entropy=target_entropy, critic_fit_params=None, plcy_loss_mode=plcy_loss_mode, 76 | regularizer_mode=regularizer_mode, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 77 | reg_mult=reg_mult, Q_min=Q_min, Q_max=Q_max, log_std_min=log_std_min, log_std_max=log_std_max, 78 | loss_mode_exp=loss_mode_exp, Q_exp_loss=Q_exp_loss, n_fits=n_fits, logging_iter=logging_iter) 79 | 80 | return agent 81 | 82 | 83 | def experiment(env_id: str = "HalfCheetah-v2", 84 | n_epochs: int = 500, 85 | n_steps_per_epoch: int = 10000, 86 | n_steps_per_fit: int = 1, 87 | n_eval_episodes: int = 50, 88 | n_epochs_save: int = 100, 89 | logging_iter: int = 100, 90 | expert_data_path: str = None, 91 | use_cuda: bool = False, 92 | lr_critic: float = 3e-4, 93 | lr_actor: float = 3e-5, 94 | results_dir: str = "./logs", 95 | plcy_loss_mode: str = "value", 96 | regularizer_mode: str = "exp_and_plcy", 97 | reg_mult: float = 0.5, 98 | Q_exp_loss: str = "MSE", 99 | n_fits: int = 1, 100 | loss_mode_exp: str = "fix", 101 | log_std_min: float = -5.0, 102 | log_std_max: float = 2.0, 103 | learnable_alpha: bool = False, 104 | use_target: bool = True, 105 | init_alpha: float = 0.001, 106 | tau: float = 0.005, 107 | delay_Q: int = 1, 108 | lossQ_type: str = "sqil_like", 109 | gamma: float = 0.99, 110 | horizon: int = 1000, 111 | seed: int = 0): 112 | 113 | np.random.seed(seed) 114 | torch.random.manual_seed(seed) 115 | 116 | logger_stoch = Logger(results_dir=results_dir, log_name="stochastic_logging", seed=seed, append=True) 117 | logger_deter = Logger(results_dir=results_dir, log_name="deterministic_logging", seed=seed, append=True) 118 | 119 | results_dir = os.path.join(results_dir, str(seed)) 120 | 121 | env_params = dict(name=env_id, horizon=horizon, gamma=gamma) 122 | 123 | mdp = Gym(**env_params) 124 | 125 | # load expert data 126 | expert_data = prepare_expert_data(data_path=expert_data_path) 127 | 128 | # logging stuff 129 | tb_writer = SummaryWriter(log_dir=results_dir) 130 | agent_saver = BestAgentSaver(save_path=results_dir, n_epochs_save=n_epochs_save) 131 | 132 | # create agent and core 133 | agent = _create_agent(mdp, expert_data, sw=tb_writer, lr_critic=lr_critic, lr_actor=lr_actor, 134 | plcy_loss_mode=plcy_loss_mode, regularizer_mode=regularizer_mode, 135 | use_cuda=use_cuda, use_target=use_target, lossQ_type=lossQ_type, 136 | delay_Q=delay_Q, tau=tau, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 137 | reg_mult=reg_mult, gamma=gamma, Q_exp_loss=Q_exp_loss, 138 | loss_mode_exp=loss_mode_exp, log_std_min=log_std_min, 139 | n_fits=n_fits, log_std_max=log_std_max, logging_iter=logging_iter) 140 | 141 | core = Core(agent, mdp) 142 | 143 | # iqfo train loop 144 | for epoch in range(n_epochs): 145 | with catchtime() as t: 146 | # training 147 | core.learn(n_steps=n_steps_per_epoch, n_steps_per_fit=n_steps_per_fit, quiet=True) 148 | print('Epoch %d | Time %fs ' % (epoch + 1, float(t()))) 149 | 150 | # evaluate with deterministic policy 151 | agent.policy.use_mean = True 152 | dataset = core.evaluate(n_episodes=n_eval_episodes) 153 | R_mean = np.mean(compute_J(dataset)) 154 | J_mean = np.mean(compute_J(dataset, gamma=gamma)) 155 | L = np.mean(compute_episodes_length(dataset)) 156 | logger_deter.log_numpy(Epoch=epoch, R_mean=R_mean, J_mean=J_mean, L=L) 157 | tb_writer.add_scalar("Eval_R-deterministic", R_mean, epoch) 158 | tb_writer.add_scalar("Eval_J-deterministic", J_mean, epoch) 159 | tb_writer.add_scalar("Eval_L-deterministic", L, epoch) 160 | agent.policy.use_mean = False 161 | 162 | # evaluate with stochastic policy 163 | dataset = core.evaluate(n_episodes=n_eval_episodes) 164 | R_mean_stoch = np.mean(compute_J(dataset)) 165 | J_mean_stoch = np.mean(compute_J(dataset, gamma=gamma)) 166 | L = np.mean(compute_episodes_length(dataset)) 167 | logger_stoch.log_numpy(Epoch=epoch, R_mean=R_mean_stoch, J_mean=J_mean_stoch, L=L) 168 | tb_writer.add_scalar("Eval_R-stochastic", R_mean_stoch, epoch) 169 | tb_writer.add_scalar("Eval_J-stochastic", J_mean_stoch, epoch) 170 | tb_writer.add_scalar("Eval_L-stochastic", L, epoch) 171 | 172 | print("R_mean (deter): %f | R_mean (stoch): %f" % (R_mean, R_mean_stoch)) 173 | 174 | # save agent if needed 175 | agent_saver.save(core.agent, J_mean) 176 | 177 | agent_saver.save_curr_best_agent() 178 | print("Finished.") 179 | 180 | @contextmanager 181 | def catchtime() -> float: 182 | start = perf_counter() 183 | yield lambda: perf_counter() - start 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | # Leave unchanged 189 | run_experiment(experiment) 190 | -------------------------------------------------------------------------------- /examples/01_LSIQ/02_episode_5/launcher.py: -------------------------------------------------------------------------------- 1 | from experiment_launcher import Launcher 2 | 3 | from experiment_launcher.utils import bool_local_cluster 4 | 5 | if __name__ == '__main__': 6 | LOCAL = bool_local_cluster() 7 | TEST = False 8 | USE_CUDA = False 9 | 10 | JOBLIB_PARALLEL_JOBS = 1 # or os.cpu_count() to use all cores 11 | N_SEEDS = 5 12 | 13 | launcher = Launcher(exp_name='lsiq_5', 14 | python_file='lsiq_experiments', 15 | n_exps=N_SEEDS, 16 | joblib_n_jobs=JOBLIB_PARALLEL_JOBS, 17 | n_cores=JOBLIB_PARALLEL_JOBS * 1, 18 | memory_per_core=JOBLIB_PARALLEL_JOBS * 6000, 19 | days=2, 20 | hours=0, 21 | minutes=0, 22 | seconds=0, 23 | use_timestamp=True, 24 | ) 25 | 26 | default_params = dict(n_epochs=1500, 27 | n_steps_per_epoch=1000, 28 | n_eval_episodes=10, 29 | n_steps_per_fit=1, 30 | n_epochs_save=-1, 31 | logging_iter=10000, 32 | gamma=0.99, 33 | use_cuda=USE_CUDA, 34 | tau=0.005, 35 | use_target=True, 36 | loss_mode_exp="fix", 37 | regularizer_mode="plcy", 38 | learnable_alpha=False) 39 | 40 | log_std = [(-5, 2)] 41 | envs = ["Ant-v3", 42 | "HalfCheetah-v3", 43 | "Hopper-v3", 44 | "Humanoid-v3", 45 | "Walker2d-v3"] 46 | path_to_datasets = "../../00_Datasets/5_episodes/" 47 | expert_data_filenames = ["expert_dataset_Ant-v3_6424.22_5_SAC.npz", 48 | "expert_dataset_HalfCheetah-v3_12543.01_5_SAC.npz", 49 | "expert_dataset_Hopper-v3_3348.59_5_SAC.npz", 50 | "expert_dataset_Humanoid-v3_6321.39_5_SAC.npz", 51 | "expert_dataset_Walker2d-v3_5854.7_5_SAC.npz"] 52 | 53 | expert_data_paths = [path_to_datasets + name for name in expert_data_filenames] 54 | 55 | # Ant 56 | launcher.add_experiment(env_id__=envs[0], expert_data_path=expert_data_paths[0], 57 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 58 | 59 | # HalfCheetah 60 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 61 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 62 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 63 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=10.0, **default_params) 64 | 65 | # Hopper 66 | launcher.add_experiment(env_id__=envs[2], expert_data_path=expert_data_paths[2], 67 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 68 | 69 | # Humanoid 70 | launcher.add_experiment(env_id__=envs[3], expert_data_path=expert_data_paths[3], 71 | plcy_loss_mode__="value", init_alpha__=0.1, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 72 | 73 | # Walker2d 74 | launcher.add_experiment(env_id__=envs[4], expert_data_path=expert_data_paths[4], 75 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 76 | 77 | 78 | launcher.run(LOCAL, TEST) 79 | -------------------------------------------------------------------------------- /examples/01_LSIQ/02_episode_5/lsiq_experiments.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import perf_counter 3 | from contextlib import contextmanager 4 | 5 | import numpy as np 6 | import torch 7 | import torch.optim as optim 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | from mushroom_rl.core import Core 11 | from mushroom_rl.environments import Gym 12 | from mushroom_rl.utils.dataset import compute_J, compute_episodes_length 13 | from mushroom_rl.core.logger.logger import Logger 14 | 15 | 16 | from imitation_lib.imitation import LSIQ 17 | from imitation_lib.utils import FullyConnectedNetwork 18 | from imitation_lib.utils import prepare_expert_data, BestAgentSaver 19 | 20 | 21 | from experiment_launcher import run_experiment 22 | 23 | 24 | def _create_agent(mdp, expert_data, sw, lr_critic, lr_actor, plcy_loss_mode, 25 | regularizer_mode, use_target, lossQ_type, use_cuda, tau, 26 | learnable_alpha, init_alpha, reg_mult, Q_exp_loss, gamma, 27 | loss_mode_exp, log_std_min, log_std_max, delay_Q, n_fits, 28 | logging_iter): 29 | 30 | # calculate the minimum and maximum Q-function 31 | Q_max = 1.0 / (reg_mult * (1 - gamma)) 32 | Q_min = - 1.0 / (reg_mult * (1 - gamma)) 33 | 34 | # Settings 35 | initial_replay_size = 10000 36 | max_replay_size = 1000000 37 | batch_size = 256 # the real batch size is double the size as an expert batch is going to be added 38 | warmup_transitions = 15000 39 | 40 | lr_alpha = 2e-6 41 | weight_decay_actor = 0.0 42 | weight_decay_critic = 0.0 43 | 44 | target_entropy = -22.0 45 | 46 | # Approximator 47 | actor_input_shape = mdp.info.observation_space.shape 48 | actor_output_shape = (mdp.info.action_space.shape[0]*2,) 49 | actor_params = dict(network=FullyConnectedNetwork, 50 | n_features=[256, 256], 51 | input_shape=actor_input_shape, 52 | output_shape=actor_output_shape, 53 | activations=["relu", "relu", "identity"], 54 | use_cuda=use_cuda) 55 | 56 | actor_optimizer = {'class': optim.Adam, 57 | 'params': {'lr': lr_actor, 'weight_decay': weight_decay_actor}} 58 | 59 | critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) 60 | critic_params = dict(network=FullyConnectedNetwork, 61 | optimizer={'class': optim.Adam, 62 | 'params': {'lr': lr_critic, 'weight_decay': weight_decay_critic}}, 63 | n_features=[256, 256], 64 | input_shape=critic_input_shape, 65 | activations=["relu", "relu", "identity"], 66 | squeeze_out=False, 67 | output_shape=(1,), 68 | use_cuda=use_cuda) 69 | 70 | # create IQfO agent 71 | agent = LSIQ(mdp_info=mdp.info, batch_size=batch_size, initial_replay_size=initial_replay_size, 72 | max_replay_size=max_replay_size, demonstrations=expert_data, sw=sw, use_target=use_target, 73 | warmup_transitions=warmup_transitions, tau=tau, lr_alpha=lr_alpha, actor_params=actor_params, 74 | actor_optimizer=actor_optimizer, critic_params=critic_params, delay_Q=delay_Q, lossQ_type=lossQ_type, 75 | target_entropy=target_entropy, critic_fit_params=None, plcy_loss_mode=plcy_loss_mode, 76 | regularizer_mode=regularizer_mode, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 77 | reg_mult=reg_mult, Q_min=Q_min, Q_max=Q_max, log_std_min=log_std_min, log_std_max=log_std_max, 78 | loss_mode_exp=loss_mode_exp, Q_exp_loss=Q_exp_loss, n_fits=n_fits, logging_iter=logging_iter) 79 | 80 | return agent 81 | 82 | 83 | def experiment(env_id: str = "HalfCheetah-v2", 84 | n_epochs: int = 500, 85 | n_steps_per_epoch: int = 10000, 86 | n_steps_per_fit: int = 1, 87 | n_eval_episodes: int = 50, 88 | n_epochs_save: int = 100, 89 | logging_iter: int = 100, 90 | expert_data_path: str = None, 91 | use_cuda: bool = False, 92 | lr_critic: float = 3e-4, 93 | lr_actor: float = 3e-5, 94 | results_dir: str = "./logs", 95 | plcy_loss_mode: str = "value", 96 | regularizer_mode: str = "exp_and_plcy", 97 | reg_mult: float = 0.5, 98 | Q_exp_loss: str = "MSE", 99 | n_fits: int = 1, 100 | loss_mode_exp: str = "fix", 101 | log_std_min: float = -5.0, 102 | log_std_max: float = 2.0, 103 | learnable_alpha: bool = False, 104 | use_target: bool = True, 105 | init_alpha: float = 0.001, 106 | tau: float = 0.005, 107 | delay_Q: int = 1, 108 | lossQ_type: str = "sqil_like", 109 | gamma: float = 0.99, 110 | horizon: int = 1000, 111 | seed: int = 0): 112 | 113 | np.random.seed(seed) 114 | torch.random.manual_seed(seed) 115 | 116 | logger_stoch = Logger(results_dir=results_dir, log_name="stochastic_logging", seed=seed, append=True) 117 | logger_deter = Logger(results_dir=results_dir, log_name="deterministic_logging", seed=seed, append=True) 118 | 119 | results_dir = os.path.join(results_dir, str(seed)) 120 | 121 | env_params = dict(name=env_id, horizon=horizon, gamma=gamma) 122 | 123 | mdp = Gym(**env_params) 124 | 125 | # load expert data 126 | expert_data = prepare_expert_data(data_path=expert_data_path) 127 | 128 | # logging stuff 129 | tb_writer = SummaryWriter(log_dir=results_dir) 130 | agent_saver = BestAgentSaver(save_path=results_dir, n_epochs_save=n_epochs_save) 131 | 132 | # create agent and core 133 | agent = _create_agent(mdp, expert_data, sw=tb_writer, lr_critic=lr_critic, lr_actor=lr_actor, 134 | plcy_loss_mode=plcy_loss_mode, regularizer_mode=regularizer_mode, 135 | use_cuda=use_cuda, use_target=use_target, lossQ_type=lossQ_type, 136 | delay_Q=delay_Q, tau=tau, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 137 | reg_mult=reg_mult, gamma=gamma, Q_exp_loss=Q_exp_loss, 138 | loss_mode_exp=loss_mode_exp, log_std_min=log_std_min, 139 | n_fits=n_fits, log_std_max=log_std_max, logging_iter=logging_iter) 140 | 141 | core = Core(agent, mdp) 142 | 143 | # iqfo train loop 144 | for epoch in range(n_epochs): 145 | with catchtime() as t: 146 | # training 147 | core.learn(n_steps=n_steps_per_epoch, n_steps_per_fit=n_steps_per_fit, quiet=True) 148 | print('Epoch %d | Time %fs ' % (epoch + 1, float(t()))) 149 | 150 | # evaluate with deterministic policy 151 | agent.policy.use_mean = True 152 | dataset = core.evaluate(n_episodes=n_eval_episodes) 153 | R_mean = np.mean(compute_J(dataset)) 154 | J_mean = np.mean(compute_J(dataset, gamma=gamma)) 155 | L = np.mean(compute_episodes_length(dataset)) 156 | logger_deter.log_numpy(Epoch=epoch, R_mean=R_mean, J_mean=J_mean, L=L) 157 | tb_writer.add_scalar("Eval_R-deterministic", R_mean, epoch) 158 | tb_writer.add_scalar("Eval_J-deterministic", J_mean, epoch) 159 | tb_writer.add_scalar("Eval_L-deterministic", L, epoch) 160 | agent.policy.use_mean = False 161 | 162 | # evaluate with stochastic policy 163 | dataset = core.evaluate(n_episodes=n_eval_episodes) 164 | R_mean_stoch = np.mean(compute_J(dataset)) 165 | J_mean_stoch = np.mean(compute_J(dataset, gamma=gamma)) 166 | L = np.mean(compute_episodes_length(dataset)) 167 | logger_stoch.log_numpy(Epoch=epoch, R_mean=R_mean_stoch, J_mean=J_mean_stoch, L=L) 168 | tb_writer.add_scalar("Eval_R-stochastic", R_mean_stoch, epoch) 169 | tb_writer.add_scalar("Eval_J-stochastic", J_mean_stoch, epoch) 170 | tb_writer.add_scalar("Eval_L-stochastic", L, epoch) 171 | 172 | print("R_mean (deter): %f | R_mean (stoch): %f" % (R_mean, R_mean_stoch)) 173 | 174 | # save agent if needed 175 | agent_saver.save(core.agent, J_mean) 176 | 177 | agent_saver.save_curr_best_agent() 178 | print("Finished.") 179 | 180 | @contextmanager 181 | def catchtime() -> float: 182 | start = perf_counter() 183 | yield lambda: perf_counter() - start 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | # Leave unchanged 189 | run_experiment(experiment) 190 | -------------------------------------------------------------------------------- /examples/01_LSIQ/03_episode_10/launcher.py: -------------------------------------------------------------------------------- 1 | from experiment_launcher import Launcher 2 | 3 | from experiment_launcher.utils import bool_local_cluster 4 | 5 | if __name__ == '__main__': 6 | LOCAL = bool_local_cluster() 7 | TEST = False 8 | USE_CUDA = False 9 | 10 | JOBLIB_PARALLEL_JOBS = 1 # or os.cpu_count() to use all cores 11 | N_SEEDS = 5 12 | 13 | launcher = Launcher(exp_name='lsiq_10', 14 | python_file='lsiq_experiments', 15 | n_exps=N_SEEDS, 16 | joblib_n_jobs=JOBLIB_PARALLEL_JOBS, 17 | n_cores=JOBLIB_PARALLEL_JOBS * 1, 18 | memory_per_core=JOBLIB_PARALLEL_JOBS * 6000, 19 | days=2, 20 | hours=0, 21 | minutes=0, 22 | seconds=0, 23 | use_timestamp=True, 24 | ) 25 | 26 | default_params = dict(n_epochs=150, 27 | n_steps_per_epoch=10000, 28 | n_eval_episodes=10, 29 | n_steps_per_fit=1, 30 | n_epochs_save=-1, 31 | logging_iter=10000, 32 | gamma=0.99, 33 | use_cuda=USE_CUDA, 34 | tau=0.005, 35 | use_target=True, 36 | loss_mode_exp="fix", 37 | regularizer_mode="plcy", 38 | learnable_alpha=False) 39 | 40 | log_std = [(-5, 2)] 41 | envs = ["Ant-v3", 42 | "HalfCheetah-v3", 43 | "Hopper-v3", 44 | "Humanoid-v3", 45 | "Walker2d-v3"] 46 | path_to_datasets = "../../00_Datasets/10_episodes/" 47 | expert_data_filenames = ["expert_dataset_Ant-v3_6421.34_10_SAC.npz", 48 | "expert_dataset_HalfCheetah-v3_12360.31_10_SAC.npz", 49 | "expert_dataset_Hopper-v3_3549.94_10_SAC.npz", 50 | "expert_dataset_Humanoid-v3_6346.43_10_SAC.npz", 51 | "expert_dataset_Walker2d-v3_5852.24_10_SAC.npz"] 52 | 53 | expert_data_paths = [path_to_datasets + name for name in expert_data_filenames] 54 | 55 | # Ant 56 | launcher.add_experiment(env_id__=envs[0], expert_data_path=expert_data_paths[0], 57 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 58 | 59 | 60 | # HalfCheetah 61 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 62 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 63 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 64 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=10.0, **default_params) 65 | 66 | # Hopper 67 | launcher.add_experiment(env_id__=envs[2], expert_data_path=expert_data_paths[2], 68 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 69 | 70 | # Humanoid 71 | launcher.add_experiment(env_id__=envs[3], expert_data_path=expert_data_paths[3], 72 | plcy_loss_mode__="value", init_alpha__=0.1, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 73 | 74 | # Walker2d 75 | launcher.add_experiment(env_id__=envs[4], expert_data_path=expert_data_paths[4], 76 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 77 | 78 | launcher.run(LOCAL, TEST) 79 | -------------------------------------------------------------------------------- /examples/01_LSIQ/03_episode_10/lsiq_experiments.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import perf_counter 3 | from contextlib import contextmanager 4 | 5 | import numpy as np 6 | import torch 7 | import torch.optim as optim 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | from mushroom_rl.core import Core 11 | from mushroom_rl.environments import Gym 12 | from mushroom_rl.utils.dataset import compute_J, compute_episodes_length 13 | from mushroom_rl.core.logger.logger import Logger 14 | 15 | 16 | from imitation_lib.imitation import LSIQ 17 | from imitation_lib.utils import FullyConnectedNetwork 18 | from imitation_lib.utils import prepare_expert_data, BestAgentSaver 19 | 20 | 21 | from experiment_launcher import run_experiment 22 | 23 | 24 | def _create_agent(mdp, expert_data, sw, lr_critic, lr_actor, plcy_loss_mode, 25 | regularizer_mode, use_target, lossQ_type, use_cuda, tau, 26 | learnable_alpha, init_alpha, reg_mult, Q_exp_loss, gamma, 27 | loss_mode_exp, log_std_min, log_std_max, delay_Q, n_fits, 28 | logging_iter): 29 | 30 | # calculate the minimum and maximum Q-function 31 | Q_max = 1.0 / (reg_mult * (1 - gamma)) 32 | Q_min = - 1.0 / (reg_mult * (1 - gamma)) 33 | 34 | # Settings 35 | initial_replay_size = 10000 36 | max_replay_size = 1000000 37 | batch_size = 256 # the real batch size is double the size as an expert batch is going to be added 38 | warmup_transitions = 15000 39 | 40 | lr_alpha = 2e-6 41 | weight_decay_actor = 0.0 42 | weight_decay_critic = 0.0 43 | 44 | target_entropy = -22.0 45 | 46 | # Approximator 47 | actor_input_shape = mdp.info.observation_space.shape 48 | actor_output_shape = (mdp.info.action_space.shape[0]*2,) 49 | actor_params = dict(network=FullyConnectedNetwork, 50 | n_features=[256, 256], 51 | input_shape=actor_input_shape, 52 | output_shape=actor_output_shape, 53 | activations=["relu", "relu", "identity"], 54 | use_cuda=use_cuda) 55 | 56 | actor_optimizer = {'class': optim.Adam, 57 | 'params': {'lr': lr_actor, 'weight_decay': weight_decay_actor}} 58 | 59 | critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) 60 | critic_params = dict(network=FullyConnectedNetwork, 61 | optimizer={'class': optim.Adam, 62 | 'params': {'lr': lr_critic, 'weight_decay': weight_decay_critic}}, 63 | n_features=[256, 256], 64 | input_shape=critic_input_shape, 65 | activations=["relu", "relu", "identity"], 66 | squeeze_out=False, 67 | output_shape=(1,), 68 | use_cuda=use_cuda) 69 | 70 | # create IQfO agent 71 | agent = LSIQ(mdp_info=mdp.info, batch_size=batch_size, initial_replay_size=initial_replay_size, 72 | max_replay_size=max_replay_size, demonstrations=expert_data, sw=sw, use_target=use_target, 73 | warmup_transitions=warmup_transitions, tau=tau, lr_alpha=lr_alpha, actor_params=actor_params, 74 | actor_optimizer=actor_optimizer, critic_params=critic_params, delay_Q=delay_Q, lossQ_type=lossQ_type, 75 | target_entropy=target_entropy, critic_fit_params=None, plcy_loss_mode=plcy_loss_mode, 76 | regularizer_mode=regularizer_mode, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 77 | reg_mult=reg_mult, Q_min=Q_min, Q_max=Q_max, log_std_min=log_std_min, log_std_max=log_std_max, 78 | loss_mode_exp=loss_mode_exp, Q_exp_loss=Q_exp_loss, n_fits=n_fits, logging_iter=logging_iter) 79 | 80 | return agent 81 | 82 | 83 | def experiment(env_id: str = "HalfCheetah-v2", 84 | n_epochs: int = 500, 85 | n_steps_per_epoch: int = 10000, 86 | n_steps_per_fit: int = 1, 87 | n_eval_episodes: int = 50, 88 | n_epochs_save: int = 100, 89 | logging_iter: int = 100, 90 | expert_data_path: str = None, 91 | use_cuda: bool = False, 92 | lr_critic: float = 3e-4, 93 | lr_actor: float = 3e-5, 94 | results_dir: str = "./logs", 95 | plcy_loss_mode: str = "value", 96 | regularizer_mode: str = "exp_and_plcy", 97 | reg_mult: float = 0.5, 98 | Q_exp_loss: str = "MSE", 99 | n_fits: int = 1, 100 | loss_mode_exp: str = "fix", 101 | log_std_min: float = -5.0, 102 | log_std_max: float = 2.0, 103 | learnable_alpha: bool = False, 104 | use_target: bool = True, 105 | init_alpha: float = 0.001, 106 | tau: float = 0.005, 107 | delay_Q: int = 1, 108 | lossQ_type: str = "sqil_like", 109 | gamma: float = 0.99, 110 | horizon: int = 1000, 111 | seed: int = 0): 112 | 113 | np.random.seed(seed) 114 | torch.random.manual_seed(seed) 115 | 116 | logger_stoch = Logger(results_dir=results_dir, log_name="stochastic_logging", seed=seed, append=True) 117 | logger_deter = Logger(results_dir=results_dir, log_name="deterministic_logging", seed=seed, append=True) 118 | 119 | results_dir = os.path.join(results_dir, str(seed)) 120 | 121 | env_params = dict(name=env_id, horizon=horizon, gamma=gamma) 122 | 123 | mdp = Gym(**env_params) 124 | 125 | # load expert data 126 | expert_data = prepare_expert_data(data_path=expert_data_path) 127 | 128 | # logging stuff 129 | tb_writer = SummaryWriter(log_dir=results_dir) 130 | agent_saver = BestAgentSaver(save_path=results_dir, n_epochs_save=n_epochs_save) 131 | 132 | # create agent and core 133 | agent = _create_agent(mdp, expert_data, sw=tb_writer, lr_critic=lr_critic, lr_actor=lr_actor, 134 | plcy_loss_mode=plcy_loss_mode, regularizer_mode=regularizer_mode, 135 | use_cuda=use_cuda, use_target=use_target, lossQ_type=lossQ_type, 136 | delay_Q=delay_Q, tau=tau, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 137 | reg_mult=reg_mult, gamma=gamma, Q_exp_loss=Q_exp_loss, 138 | loss_mode_exp=loss_mode_exp, log_std_min=log_std_min, 139 | n_fits=n_fits, log_std_max=log_std_max, logging_iter=logging_iter) 140 | 141 | core = Core(agent, mdp) 142 | 143 | # iqfo train loop 144 | for epoch in range(n_epochs): 145 | with catchtime() as t: 146 | # training 147 | core.learn(n_steps=n_steps_per_epoch, n_steps_per_fit=n_steps_per_fit, quiet=True) 148 | print('Epoch %d | Time %fs ' % (epoch + 1, float(t()))) 149 | 150 | # evaluate with deterministic policy 151 | agent.policy.use_mean = True 152 | dataset = core.evaluate(n_episodes=n_eval_episodes) 153 | R_mean = np.mean(compute_J(dataset)) 154 | J_mean = np.mean(compute_J(dataset, gamma=gamma)) 155 | L = np.mean(compute_episodes_length(dataset)) 156 | logger_deter.log_numpy(Epoch=epoch, R_mean=R_mean, J_mean=J_mean, L=L) 157 | tb_writer.add_scalar("Eval_R-deterministic", R_mean, epoch) 158 | tb_writer.add_scalar("Eval_J-deterministic", J_mean, epoch) 159 | tb_writer.add_scalar("Eval_L-deterministic", L, epoch) 160 | agent.policy.use_mean = False 161 | 162 | # evaluate with stochastic policy 163 | dataset = core.evaluate(n_episodes=n_eval_episodes) 164 | R_mean_stoch = np.mean(compute_J(dataset)) 165 | J_mean_stoch = np.mean(compute_J(dataset, gamma=gamma)) 166 | L = np.mean(compute_episodes_length(dataset)) 167 | logger_stoch.log_numpy(Epoch=epoch, R_mean=R_mean_stoch, J_mean=J_mean_stoch, L=L) 168 | tb_writer.add_scalar("Eval_R-stochastic", R_mean_stoch, epoch) 169 | tb_writer.add_scalar("Eval_J-stochastic", J_mean_stoch, epoch) 170 | tb_writer.add_scalar("Eval_L-stochastic", L, epoch) 171 | 172 | print("R_mean (deter): %f | R_mean (stoch): %f" % (R_mean, R_mean_stoch)) 173 | 174 | # save agent if needed 175 | agent_saver.save(core.agent, J_mean) 176 | 177 | agent_saver.save_curr_best_agent() 178 | print("Finished.") 179 | 180 | @contextmanager 181 | def catchtime() -> float: 182 | start = perf_counter() 183 | yield lambda: perf_counter() - start 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | # Leave unchanged 189 | run_experiment(experiment) 190 | -------------------------------------------------------------------------------- /examples/01_LSIQ/04_episode_25/launcher.py: -------------------------------------------------------------------------------- 1 | from experiment_launcher import Launcher 2 | 3 | from experiment_launcher.utils import bool_local_cluster 4 | 5 | if __name__ == '__main__': 6 | LOCAL = bool_local_cluster() 7 | TEST = False 8 | USE_CUDA = False 9 | 10 | JOBLIB_PARALLEL_JOBS = 1 # or os.cpu_count() to use all cores 11 | N_SEEDS = 5 12 | 13 | launcher = Launcher(exp_name='lsiq_25', 14 | python_file='lsiq_experiments', 15 | n_exps=N_SEEDS, 16 | joblib_n_jobs=JOBLIB_PARALLEL_JOBS, 17 | n_cores=JOBLIB_PARALLEL_JOBS * 1, 18 | memory_per_core=JOBLIB_PARALLEL_JOBS * 6000, 19 | days=2, 20 | hours=0, 21 | minutes=0, 22 | seconds=0, 23 | use_timestamp=True, 24 | ) 25 | 26 | default_params = dict(n_epochs=150, 27 | n_steps_per_epoch=10000, 28 | n_eval_episodes=10, 29 | n_steps_per_fit=1, 30 | n_epochs_save=-1, 31 | logging_iter=10000, 32 | gamma=0.99, 33 | use_cuda=USE_CUDA, 34 | tau=0.005, 35 | use_target=True, 36 | loss_mode_exp="fix", 37 | regularizer_mode="plcy", 38 | learnable_alpha=False) 39 | 40 | log_std = [(-5, 2)] 41 | envs = ["Ant-v3", 42 | "HalfCheetah-v3", 43 | "Hopper-v3", 44 | "Humanoid-v3", 45 | "Walker2d-v3"] 46 | path_to_datasets = "../../00_Datasets/25_episodes/" 47 | expert_data_filenames = ["expert_dataset_Ant-v3_6399.04_25_SAC.npz", 48 | "expert_dataset_HalfCheetah-v3_12328.78_25_SAC.npz", 49 | "expert_dataset_Hopper-v3_3299.81_25_SAC.npz", 50 | "expert_dataset_Humanoid-v3_6273.29_25_SAC.npz", 51 | "expert_dataset_Walker2d-v3_5841.73_25_SAC.npz"] 52 | 53 | expert_data_paths = [path_to_datasets + name for name in expert_data_filenames] 54 | 55 | # Ant 56 | launcher.add_experiment(env_id__=envs[0], expert_data_path=expert_data_paths[0], 57 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 58 | 59 | # HalfCheetah 60 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 61 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 62 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 63 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=10.0, **default_params) 64 | 65 | # Hopper 66 | launcher.add_experiment(env_id__=envs[2], expert_data_path=expert_data_paths[2], 67 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 68 | 69 | # Humanoid 70 | launcher.add_experiment(env_id__=envs[3], expert_data_path=expert_data_paths[3], 71 | plcy_loss_mode__="value", init_alpha__=0.1, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 72 | 73 | # Walker2d 74 | launcher.add_experiment(env_id__=envs[4], expert_data_path=expert_data_paths[4], 75 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 76 | 77 | 78 | launcher.run(LOCAL, TEST) 79 | -------------------------------------------------------------------------------- /examples/01_LSIQ/04_episode_25/lsiq_experiments.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import perf_counter 3 | from contextlib import contextmanager 4 | 5 | import numpy as np 6 | import torch 7 | import torch.optim as optim 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | from mushroom_rl.core import Core 11 | from mushroom_rl.environments import Gym 12 | from mushroom_rl.utils.dataset import compute_J, compute_episodes_length 13 | from mushroom_rl.core.logger.logger import Logger 14 | 15 | 16 | from imitation_lib.imitation import LSIQ 17 | from imitation_lib.utils import FullyConnectedNetwork 18 | from imitation_lib.utils import prepare_expert_data, BestAgentSaver 19 | 20 | 21 | from experiment_launcher import run_experiment 22 | 23 | 24 | def _create_agent(mdp, expert_data, sw, lr_critic, lr_actor, plcy_loss_mode, 25 | regularizer_mode, use_target, lossQ_type, use_cuda, tau, 26 | learnable_alpha, init_alpha, reg_mult, Q_exp_loss, gamma, 27 | loss_mode_exp, log_std_min, log_std_max, delay_Q, n_fits, 28 | logging_iter): 29 | 30 | # calculate the minimum and maximum Q-function 31 | Q_max = 1.0 / (reg_mult * (1 - gamma)) 32 | Q_min = - 1.0 / (reg_mult * (1 - gamma)) 33 | 34 | # Settings 35 | initial_replay_size = 10000 36 | max_replay_size = 1000000 37 | batch_size = 256 # the real batch size is double the size as an expert batch is going to be added 38 | warmup_transitions = 15000 39 | 40 | lr_alpha = 2e-6 41 | weight_decay_actor = 0.0 42 | weight_decay_critic = 0.0 43 | 44 | target_entropy = -22.0 45 | 46 | # Approximator 47 | actor_input_shape = mdp.info.observation_space.shape 48 | actor_output_shape = (mdp.info.action_space.shape[0]*2,) 49 | actor_params = dict(network=FullyConnectedNetwork, 50 | n_features=[256, 256], 51 | input_shape=actor_input_shape, 52 | output_shape=actor_output_shape, 53 | activations=["relu", "relu", "identity"], 54 | use_cuda=use_cuda) 55 | 56 | actor_optimizer = {'class': optim.Adam, 57 | 'params': {'lr': lr_actor, 'weight_decay': weight_decay_actor}} 58 | 59 | critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) 60 | critic_params = dict(network=FullyConnectedNetwork, 61 | optimizer={'class': optim.Adam, 62 | 'params': {'lr': lr_critic, 'weight_decay': weight_decay_critic}}, 63 | n_features=[256, 256], 64 | input_shape=critic_input_shape, 65 | activations=["relu", "relu", "identity"], 66 | squeeze_out=False, 67 | output_shape=(1,), 68 | use_cuda=use_cuda) 69 | 70 | # create IQfO agent 71 | agent = LSIQ(mdp_info=mdp.info, batch_size=batch_size, initial_replay_size=initial_replay_size, 72 | max_replay_size=max_replay_size, demonstrations=expert_data, sw=sw, use_target=use_target, 73 | warmup_transitions=warmup_transitions, tau=tau, lr_alpha=lr_alpha, actor_params=actor_params, 74 | actor_optimizer=actor_optimizer, critic_params=critic_params, delay_Q=delay_Q, lossQ_type=lossQ_type, 75 | target_entropy=target_entropy, critic_fit_params=None, plcy_loss_mode=plcy_loss_mode, 76 | regularizer_mode=regularizer_mode, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 77 | reg_mult=reg_mult, Q_min=Q_min, Q_max=Q_max, log_std_min=log_std_min, log_std_max=log_std_max, 78 | loss_mode_exp=loss_mode_exp, Q_exp_loss=Q_exp_loss, n_fits=n_fits, logging_iter=logging_iter) 79 | 80 | return agent 81 | 82 | 83 | def experiment(env_id: str = "HalfCheetah-v2", 84 | n_epochs: int = 500, 85 | n_steps_per_epoch: int = 10000, 86 | n_steps_per_fit: int = 1, 87 | n_eval_episodes: int = 50, 88 | n_epochs_save: int = 100, 89 | logging_iter: int = 100, 90 | expert_data_path: str = None, 91 | use_cuda: bool = False, 92 | lr_critic: float = 3e-4, 93 | lr_actor: float = 3e-5, 94 | results_dir: str = "./logs", 95 | plcy_loss_mode: str = "value", 96 | regularizer_mode: str = "exp_and_plcy", 97 | reg_mult: float = 0.5, 98 | Q_exp_loss: str = "MSE", 99 | n_fits: int = 1, 100 | loss_mode_exp: str = "fix", 101 | log_std_min: float = -5.0, 102 | log_std_max: float = 2.0, 103 | learnable_alpha: bool = False, 104 | use_target: bool = True, 105 | init_alpha: float = 0.001, 106 | tau: float = 0.005, 107 | delay_Q: int = 1, 108 | lossQ_type: str = "sqil_like", 109 | gamma: float = 0.99, 110 | horizon: int = 1000, 111 | seed: int = 0): 112 | 113 | np.random.seed(seed) 114 | torch.random.manual_seed(seed) 115 | 116 | logger_stoch = Logger(results_dir=results_dir, log_name="stochastic_logging", seed=seed, append=True) 117 | logger_deter = Logger(results_dir=results_dir, log_name="deterministic_logging", seed=seed, append=True) 118 | 119 | results_dir = os.path.join(results_dir, str(seed)) 120 | 121 | env_params = dict(name=env_id, horizon=horizon, gamma=gamma) 122 | 123 | mdp = Gym(**env_params) 124 | 125 | # load expert data 126 | expert_data = prepare_expert_data(data_path=expert_data_path) 127 | 128 | # logging stuff 129 | tb_writer = SummaryWriter(log_dir=results_dir) 130 | agent_saver = BestAgentSaver(save_path=results_dir, n_epochs_save=n_epochs_save) 131 | 132 | # create agent and core 133 | agent = _create_agent(mdp, expert_data, sw=tb_writer, lr_critic=lr_critic, lr_actor=lr_actor, 134 | plcy_loss_mode=plcy_loss_mode, regularizer_mode=regularizer_mode, 135 | use_cuda=use_cuda, use_target=use_target, lossQ_type=lossQ_type, 136 | delay_Q=delay_Q, tau=tau, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 137 | reg_mult=reg_mult, gamma=gamma, Q_exp_loss=Q_exp_loss, 138 | loss_mode_exp=loss_mode_exp, log_std_min=log_std_min, 139 | n_fits=n_fits, log_std_max=log_std_max, logging_iter=logging_iter) 140 | 141 | core = Core(agent, mdp) 142 | 143 | # iqfo train loop 144 | for epoch in range(n_epochs): 145 | with catchtime() as t: 146 | # training 147 | core.learn(n_steps=n_steps_per_epoch, n_steps_per_fit=n_steps_per_fit, quiet=True) 148 | print('Epoch %d | Time %fs ' % (epoch + 1, float(t()))) 149 | 150 | # evaluate with deterministic policy 151 | agent.policy.use_mean = True 152 | dataset = core.evaluate(n_episodes=n_eval_episodes) 153 | R_mean = np.mean(compute_J(dataset)) 154 | J_mean = np.mean(compute_J(dataset, gamma=gamma)) 155 | L = np.mean(compute_episodes_length(dataset)) 156 | logger_deter.log_numpy(Epoch=epoch, R_mean=R_mean, J_mean=J_mean, L=L) 157 | tb_writer.add_scalar("Eval_R-deterministic", R_mean, epoch) 158 | tb_writer.add_scalar("Eval_J-deterministic", J_mean, epoch) 159 | tb_writer.add_scalar("Eval_L-deterministic", L, epoch) 160 | agent.policy.use_mean = False 161 | 162 | # evaluate with stochastic policy 163 | dataset = core.evaluate(n_episodes=n_eval_episodes) 164 | R_mean_stoch = np.mean(compute_J(dataset)) 165 | J_mean_stoch = np.mean(compute_J(dataset, gamma=gamma)) 166 | L = np.mean(compute_episodes_length(dataset)) 167 | logger_stoch.log_numpy(Epoch=epoch, R_mean=R_mean_stoch, J_mean=J_mean_stoch, L=L) 168 | tb_writer.add_scalar("Eval_R-stochastic", R_mean_stoch, epoch) 169 | tb_writer.add_scalar("Eval_J-stochastic", J_mean_stoch, epoch) 170 | tb_writer.add_scalar("Eval_L-stochastic", L, epoch) 171 | 172 | print("R_mean (deter): %f | R_mean (stoch): %f" % (R_mean, R_mean_stoch)) 173 | 174 | # save agent if needed 175 | agent_saver.save(core.agent, J_mean) 176 | 177 | agent_saver.save_curr_best_agent() 178 | print("Finished.") 179 | 180 | @contextmanager 181 | def catchtime() -> float: 182 | start = perf_counter() 183 | yield lambda: perf_counter() - start 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | # Leave unchanged 189 | run_experiment(experiment) 190 | -------------------------------------------------------------------------------- /img/Divergence_Minimization.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robfiras/ls-iq/b097abd97f5e51c16d583bc9805cb40fb8e2ac01/img/Divergence_Minimization.gif -------------------------------------------------------------------------------- /imitation_lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robfiras/ls-iq/b097abd97f5e51c16d583bc9805cb40fb8e2ac01/imitation_lib/__init__.py -------------------------------------------------------------------------------- /imitation_lib/imitation/__init__.py: -------------------------------------------------------------------------------- 1 | from .gail_TRPO import GAIL as GAIL_TRPO 2 | from .vail_TRPO import VAIL as VAIL_TRPO 3 | from .iq_sac import IQ_SAC 4 | from .iqfo_orig import IQfO_ORIG 5 | from .sqil_sac import SQIL 6 | 7 | from .lsiq import LSIQ 8 | from .lsiq_h import LSIQ_H 9 | from .lsiq_hc import LSIQ_HC 10 | 11 | from .iqfo_sac import IQfO_SAC 12 | from .lsiqfo import LSIQfO 13 | from .lsiqfo_h import LSIQfO_H 14 | from .lsiqfo_hc import LSIQfO_HC 15 | 16 | 17 | from .offline import IQ_Offline, LSIQ_Offline, LSIQ_Offline_DM, BehavioralCloning 18 | __all__ = ['GAIL_TRPO', 'VAIL_TRPO', 'IQ_SAC', 'IQfO_SAC', 'IQfO_ORIG', 19 | 'LSIQ', 'SQIL', 'LSIQfO', 'LSIQ_H','LSIQ_HC', 'LSIQfO_HC', 20 | 'LSIQfO_H', "IQ_Offline", "LSIQ_Offline", "LSIQ_Offline_DM", "BehavioralCloning"] 21 | -------------------------------------------------------------------------------- /imitation_lib/imitation/iqfo_orig.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from mushroom_rl.approximators import Regressor 5 | from mushroom_rl.approximators.parametric import TorchApproximator 6 | from imitation_lib.imitation.iq_sac import IQ_SAC 7 | from mushroom_rl.utils.minibatches import minibatch_generator 8 | from mushroom_rl.utils.torch import to_float_tensor 9 | 10 | 11 | class IQfO_ORIG(IQ_SAC): 12 | 13 | 14 | def fit(self, dataset): 15 | 16 | # add to replay memory 17 | self._replay_memory.add(dataset) 18 | 19 | if self._replay_memory.initialized: 20 | 21 | # sample batch from policy replay buffer 22 | state, action, reward, next_state, absorbing, _ = \ 23 | self._replay_memory.get(self._batch_size()) 24 | 25 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 26 | demo_obs, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 27 | self._demonstrations["states"], 28 | self._demonstrations["next_states"], 29 | self._demonstrations["absorbing"])) 30 | 31 | # the action by the expert is predicted by the policy 32 | with torch.no_grad(): 33 | demo_act, _ = self.policy.compute_action_and_log_prob_t(demo_obs) 34 | demo_act = demo_act.detach().numpy() 35 | 36 | # prepare data for IQ update 37 | input_states = to_float_tensor(np.concatenate([state, demo_obs.astype(np.float32)])) 38 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 39 | input_n_states = to_float_tensor(np.concatenate([next_state, demo_nobs.astype(np.float32)])) 40 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 41 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 42 | torch.ones(len(state), dtype=torch.bool)]) 43 | # make IQ update 44 | loss1, loss2, chi2_loss = self._lossQ(input_states, input_actions, input_n_states, input_absorbing, 45 | is_expert) 46 | self._sw.add_scalar('IQ-Loss/Loss1', loss1, self._iter) 47 | self._sw.add_scalar('IQ-Loss/Loss2', loss2, self._iter) 48 | self._sw.add_scalar('IQ-Loss/Chi2 Loss', chi2_loss, self._iter) 49 | self._sw.add_scalar('IQ-Loss/Alpha', self._alpha, self._iter) 50 | 51 | # update policy 52 | if self._replay_memory.size > self._warmup_transitions() and self._iter % self._delay_pi == 0: 53 | action_new, log_prob = self.policy.compute_action_and_log_prob_t(input_states) 54 | loss = self._actor_loss(input_states, action_new, log_prob) 55 | self._optimize_actor_parameters(loss) 56 | grads = [] 57 | for param in self.policy._approximator.model.network.parameters(): 58 | grads.append(param.grad.view(-1)) 59 | grads = torch.cat(grads) 60 | norm = grads.norm(dim=0, p=2) 61 | self._sw.add_scalar('Gradients/Norm2 Gradient Q wrt. Pi-parameters', norm, 62 | self._iter) 63 | self._sw.add_scalar('Actor/Loss', loss, self._iter) 64 | self._sw.add_scalar('Actor/Entropy', torch.mean(-log_prob).detach().item(), self._iter) 65 | if self._learnable_alpha: 66 | self._update_alpha(log_prob.detach()) 67 | 68 | self._update_target(self._critic_approximator, 69 | self._target_critic_approximator) 70 | 71 | self._iter += 1 72 | 73 | -------------------------------------------------------------------------------- /imitation_lib/imitation/iqfo_sac.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from copy import deepcopy 5 | 6 | from mushroom_rl.core import Serializable 7 | from mushroom_rl.approximators import Regressor 8 | from mushroom_rl.approximators.parametric import TorchApproximator 9 | from imitation_lib.imitation.iq_sac import IQ_SAC 10 | from mushroom_rl.utils.minibatches import minibatch_generator 11 | from mushroom_rl.utils.torch import to_float_tensor 12 | from mushroom_rl.utils.parameters import to_parameter 13 | from imitation_lib.utils.action_models import GaussianInvActionModel, LearnableVarGaussianInvActionModel,\ 14 | GCPActionModel, KLGCPActionModel, KLGaussianInvActionModel 15 | 16 | from imitation_lib.utils.distributions import InverseGamma 17 | 18 | 19 | class IQfO_SAC(IQ_SAC): 20 | 21 | def __init__(self, action_model, action_model_params, action_model_fit_params=None, action_model_noise_std=0.0, 22 | action_model_noise_clip=None, add_noise_to_obs=False, ext_normalizer_action_model=None, 23 | interpolate_expert_states=False, interpolation_coef=1.0, **kwargs): 24 | 25 | super().__init__(**kwargs) 26 | 27 | if action_model == GaussianInvActionModel or action_model == GCPActionModel \ 28 | or action_model == KLGCPActionModel or action_model == KLGaussianInvActionModel: 29 | action_model_params.setdefault("min_a", self.mdp_info.action_space.low) 30 | action_model_params.setdefault("max_a", self.mdp_info.action_space.high) 31 | action_model_params.setdefault("use_cuda", self._use_cuda) 32 | elif action_model == LearnableVarGaussianInvActionModel: 33 | action_model_params.setdefault("use_cuda", self._use_cuda) 34 | 35 | # setup the action model 36 | self._action_model = action_model(**action_model_params, demonstration=self._demonstrations) 37 | 38 | self._action_model_fit_params = dict(fits_per_step=1, init_epochs=0, )\ 39 | if action_model_fit_params is None else action_model_fit_params 40 | self._action_model_initialized = True if self._action_model_fit_params["init_epochs"] > 0 else False 41 | self._action_model_batch_size = action_model_params["batch_size"] 42 | 43 | self._action_model_noise_std = action_model_noise_std 44 | self._action_model_noise_clip = action_model_noise_clip 45 | self.ext_normalizer_action_model = ext_normalizer_action_model 46 | self._add_noise_to_obs = add_noise_to_obs 47 | self._interpolate_expert_states = interpolate_expert_states 48 | self._interpolation_coef = interpolation_coef 49 | 50 | self._add_save_attr( 51 | _action_model='mushroom', 52 | _action_model_fit_params='pickle', 53 | _action_model_noise_std='primitive', 54 | _action_model_noise_clip='primitive', 55 | ext_normalizer_action_model='pickle', 56 | _add_noise_to_obs='primitive' 57 | ) 58 | 59 | def fit(self, dataset): 60 | 61 | # add to replay memory 62 | self._replay_memory.add(dataset) 63 | 64 | if self._replay_memory.initialized: 65 | 66 | # train the action model 67 | if not self._action_model_initialized: 68 | self.train_action_model(init=True) 69 | self._action_model_initialized = True 70 | else: 71 | self.train_action_model() 72 | 73 | # sample batch from policy replay buffer 74 | state, action, reward, next_state, absorbing, _ = \ 75 | self._replay_memory.get(self._batch_size()) 76 | 77 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 78 | demo_obs, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 79 | self._demonstrations["states"], 80 | self._demonstrations["next_states"], 81 | self._demonstrations["absorbing"])) 82 | 83 | # predict the actions for our expert dataset 84 | demo_obs_act = demo_obs.astype(np.float32)[:, self._state_mask] 85 | demo_nobs_act = demo_nobs.astype(np.float32)[:, self._state_mask] 86 | demo_act = self._action_model.draw_action(to_float_tensor(demo_obs_act), 87 | to_float_tensor(demo_nobs_act)) 88 | 89 | if self._add_noise_to_obs: 90 | assert self.ext_normalizer_action_model is not None, "Normalizer is needed to be defined." 91 | 92 | demo_obs = self.ext_normalizer_action_model(demo_obs) 93 | demo_nobs = self.ext_normalizer_action_model(demo_nobs) 94 | demo_obs += self._get_noise(demo_obs) 95 | demo_nobs += self._get_noise(demo_nobs) 96 | demo_obs = self.ext_normalizer_action_model.inv(demo_obs) 97 | demo_nobs = self.ext_normalizer_action_model.inv(demo_nobs) 98 | 99 | # make interpolation if needed 100 | if self._interpolate_expert_states: 101 | demo_obs = self.interpolate(demo_obs[:, self._state_mask], state[:, self._state_mask], 102 | mixing_coef=self._interpolation_coef) 103 | demo_act = self.interpolate(demo_act, action, 104 | mixing_coef=self._interpolation_coef) 105 | 106 | # prepare data for IQ update 107 | input_states = to_float_tensor(np.concatenate([state, 108 | demo_obs.astype(np.float32)[:, self._state_mask]])) 109 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 110 | input_n_states = to_float_tensor(np.concatenate([next_state, 111 | demo_nobs.astype(np.float32)[:, self._state_mask]])) 112 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 113 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 114 | torch.ones(len(state), dtype=torch.bool)]) 115 | 116 | # make IQ update 117 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 118 | 119 | self._iter += 1 120 | self.policy.iter += 1 121 | 122 | def _get_noise(self, x): 123 | noise = np.random.normal(loc=0.0, scale=self._action_model_noise_std, 124 | size=np.size(x)).reshape(x.shape) 125 | noise = np.clip(noise, -self._action_model_noise_clip, self._action_model_noise_clip) \ 126 | if self._action_model_noise_clip is not None else noise 127 | return noise 128 | 129 | def interpolate(self, expert_data, policy_data, mixing_coef=None): 130 | interpolated = mixing_coef * expert_data + (1 - mixing_coef) * policy_data 131 | return interpolated 132 | 133 | def train_action_model(self, init=False): 134 | 135 | if init and self._action_model_fit_params["init_epochs"] > 0: 136 | n_epochs = self._action_model_fit_params["init_epochs"] 137 | # initialize the model 138 | state, action, _, next_state, _, _ = self._replay_memory.get(self._replay_memory.size) 139 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 140 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 141 | state_train = state[0:int(len(state)*0.9), :] 142 | state_val = state[int(len(state)*0.9):, :] 143 | next_state_train = next_state[0:int(len(next_state)*0.9), :] 144 | next_state_val = next_state[int(len(next_state)*0.9):, :] 145 | action_train = action[0:int(len(next_state)*0.9), :] 146 | action_val = action[int(len(next_state)*0.9):, :] 147 | state_nstate_train = np.concatenate([state_train, next_state_train], axis=1) 148 | state_nstate_val = np.concatenate([state_val, next_state_val], axis=1) 149 | 150 | # make eval before training 151 | action_pred = self._action_model(state_nstate_val) 152 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 153 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 154 | print("Action Model Validation Loss before training: ", loss) 155 | action_pred = self._action_model(state_nstate_train) 156 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 157 | print("Action Model Training Loss before training: ", loss) 158 | w = self._action_model.get_weights() 159 | norm = np.linalg.norm(w) 160 | self.sw_add_scalar("Action-Model/Norm", norm, self._iter) 161 | 162 | # make training 163 | self._action_model.fit(state_nstate_train, action_train, n_epochs=n_epochs) 164 | 165 | # make eval after training 166 | action_pred = self._action_model(state_nstate_val) 167 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 168 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 169 | print("Action Model Validation Loss After training: ", loss) 170 | action_pred = self._action_model(state_nstate_train) 171 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 172 | print("Action Model Validation Loss After training: ", loss) 173 | 174 | else: 175 | state_nstates = [] 176 | actions = [] 177 | for i in range(self._action_model_fit_params["fits_per_step"]): 178 | # sample batch from policy replay buffer 179 | state, action, reward, next_state, absorbing, _ = \ 180 | self._replay_memory.get(self._action_model_batch_size) 181 | 182 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 183 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 184 | self._action_model.fit(state, next_state, action) 185 | 186 | state_nstates.append([state, next_state]) 187 | actions.append(action) 188 | 189 | if self._iter % self._logging_iter == 0: 190 | 191 | # sample batch from policy replay buffer 192 | states, actions, rewards, next_states, absorbings, _ = \ 193 | self._replay_memory.get(self._action_model_batch_size) 194 | 195 | # we need to check if we have a dataset with expert actions available or not 196 | try: 197 | exp_states, exp_next_states, exp_actions = next( 198 | minibatch_generator(self._action_model_batch_size, 199 | self._demonstrations["states"], 200 | self._demonstrations["next_states"], 201 | self._demonstrations["actions"])) 202 | except KeyError: 203 | exp_states, exp_next_states = next(minibatch_generator(self._action_model_batch_size, 204 | self._demonstrations["states"], 205 | self._demonstrations["next_states"])) 206 | exp_actions = None 207 | 208 | # log mse 209 | action_pred = self._action_model(states[:, self._state_mask], next_states[:, self._state_mask]) 210 | mse = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(actions)) 211 | self.sw_add_scalar('Action-Model/Loss Policy', mse, self._iter) 212 | if exp_actions is not None: 213 | action_pred_exp = self._action_model(exp_states[:, self._state_mask], 214 | exp_next_states[:, self._state_mask]) 215 | mse_exp = F.mse_loss(to_float_tensor(action_pred_exp), to_float_tensor(exp_actions)) 216 | self.sw_add_scalar('Action-Model/Loss Exp', mse_exp, self._iter) 217 | 218 | # log entropy 219 | ent_plcy = self._action_model.entropy(states[:, self._state_mask], 220 | next_states[:, self._state_mask]) 221 | ent_exp = self._action_model.entropy(exp_states[:, self._state_mask], 222 | exp_next_states[:, self._state_mask]) 223 | self.sw_add_scalar('Action-Model/Entropy Plcy', ent_plcy, 224 | self._iter) 225 | self.sw_add_scalar('Action-Model/Entropy Exp', ent_exp, 226 | self._iter) 227 | 228 | # log mu, lam, alpha, beta 229 | if type(self._action_model) == GCPActionModel or type(self._action_model) == KLGCPActionModel: 230 | mu, lam, alpha, beta = self._action_model.get_prior_params(states[:, self._state_mask], 231 | next_states[:, self._state_mask]) 232 | self.sw_add_scalar('Action-Model/Mu', np.mean(mu.detach().cpu().numpy()), self._iter) 233 | self.sw_add_scalar('Action-Model/Lambda', np.mean(lam.detach().cpu().numpy()), self._iter) 234 | self.sw_add_scalar('Action-Model/Lambda Counter', self._action_model.lam_counter, self._iter) 235 | self.sw_add_scalar('Action-Model/Alpha', np.mean(alpha.detach().cpu().numpy()), self._iter) 236 | self.sw_add_scalar('Action-Model/Beta', np.mean(beta.detach().cpu().numpy()), self._iter) 237 | self.sw_add_scalar('Action-Model/Var', 238 | np.mean(self._action_model.get_corrected_pred_var(lam, 239 | alpha, 240 | beta).detach().cpu().numpy()), 241 | self._iter) 242 | mu_exp, lam_exp, alpha_exp, beta_exp = \ 243 | self._action_model.get_prior_params(exp_states[:, self._state_mask], 244 | exp_next_states[:, self._state_mask]) 245 | self.sw_add_scalar('Action-Model/Mu Exp', np.mean(mu_exp.detach().cpu().numpy()), self._iter) 246 | self.sw_add_scalar('Action-Model/Lambda Exp', np.mean(lam_exp.detach().cpu().numpy()), self._iter) 247 | self.sw_add_scalar('Action-Model/Alpha Exp', np.mean(alpha_exp.detach().cpu().numpy()), self._iter) 248 | self.sw_add_scalar('Action-Model/Beta Exp', np.mean(beta_exp.detach().cpu().numpy()), self._iter) 249 | self.sw_add_scalar('Action-Model/Var Exp', 250 | np.mean(self._action_model.get_corrected_pred_var(lam_exp, 251 | alpha_exp, 252 | beta_exp).detach().cpu().numpy()), 253 | self._iter) 254 | elif type(self._action_model) == GaussianInvActionModel or \ 255 | type(self._action_model) == KLGaussianInvActionModel: 256 | mu, log_sigma = self._action_model.get_mu_log_sigma(state[:, self._state_mask], 257 | next_state[:, self._state_mask]) 258 | mu_exp, log_sigma_exp = self._action_model.get_mu_log_sigma(exp_states.astype(np.float32)[:, self._state_mask], 259 | exp_next_states.astype(np.float32)[:, self._state_mask]) 260 | 261 | self._sw.add_scalar('Action-Model/Std Exp', torch.mean(torch.exp(log_sigma_exp)), self._iter) 262 | self._sw.add_scalar('Action-Model/Std', torch.mean(torch.exp(log_sigma)), self._iter) 263 | self._sw.add_scalar('Action-Model/Mu Exp', torch.mean(mu_exp), self._iter) 264 | self._sw.add_scalar('Action-Model/Mu', torch.mean(mu), self._iter) 265 | 266 | # log norm 267 | #w = self._action_model.get_weights() 268 | #norm = np.linalg.norm(w) 269 | #self.sw_add_scalar("Action-Model/Norm", norm, self._iter) 270 | -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from .iq_sac import IQ_SAC 4 | import torch.nn.functional as F 5 | 6 | from mushroom_rl.utils.torch import to_float_tensor 7 | 8 | 9 | class LSIQ(IQ_SAC): 10 | 11 | def __init__(self, Q_max=1.0, Q_min =-1.0, loss_mode_exp="fix", Q_exp_loss=None, 12 | treat_absorbing_states=False, target_clipping=True, lossQ_type="iq_like", **kwargs): 13 | 14 | # call parent 15 | super(LSIQ, self).__init__(**kwargs) 16 | 17 | self._Q_max = Q_max 18 | self._Q_min = Q_min 19 | self._loss_mode_exp = loss_mode_exp # or bootstrap 20 | self._Q_exp_loss = Q_exp_loss 21 | self._treat_absorbing_states = treat_absorbing_states 22 | self._target_clipping = target_clipping 23 | self._lossQ_type = lossQ_type 24 | 25 | def _lossQ(self, obs, act, next_obs, absorbing, is_expert): 26 | if self._lossQ_type == "sqil_like": 27 | return self._lossQ_sqil_like(obs, act, next_obs, absorbing, is_expert) 28 | elif self._lossQ_type == "iq_like": 29 | return self._lossQ_iq_like(obs, act, next_obs, absorbing, is_expert) 30 | else: 31 | raise ValueError("Unsupported lossQ type %s" % self._lossQ_type) 32 | 33 | def _lossQ_iq_like(self, obs, act, next_obs, absorbing, is_expert): 34 | 35 | # 1st expert term of loss 36 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 37 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 38 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 39 | if not self._use_target: 40 | next_v = self.getV(next_obs) 41 | else: 42 | with torch.no_grad(): 43 | next_v = self.get_targetV(next_obs).detach() 44 | absorbing = torch.unsqueeze(absorbing, 1) 45 | 46 | if self._target_clipping: 47 | y = (1 - absorbing) * gamma.detach() * torch.clip(next_v, self._Q_min, self._Q_max) 48 | else: 49 | y = (1 - absorbing) * gamma.detach() * next_v 50 | 51 | reward = (current_Q - y) 52 | exp_reward = reward[is_expert] 53 | 54 | if self._loss_mode_exp == "bootstrap": 55 | loss_term1 = - exp_reward.mean() 56 | elif self._loss_mode_exp == "fix": 57 | if self._Q_exp_loss == "MSE": 58 | loss_term1 = F.mse_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 59 | elif self._Q_exp_loss == "Huber": 60 | loss_term1 = F.huber_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 61 | elif self._Q_exp_loss is None: 62 | raise ValueError("If you choose loss_mode_exp == fix, you have to specify Q_exp_loss. Setting it to" 63 | "None is not valid.") 64 | else: 65 | raise ValueError( 66 | "Choosen Q_exp_loss %s is not supported. Choose either MSE or Huber." % self._Q_exp_loss) 67 | 68 | # do the logging 69 | self.logging_loss(current_Q, y, reward, is_expert, obs, act, absorbing) 70 | 71 | # 2nd policy term for our loss 72 | V = self.getV(obs) 73 | value = (V - y) 74 | self.sw_add_scalar('V for policy on all states', V.mean(), self._iter) 75 | value_loss = value 76 | if self._plcy_loss_mode == "value": 77 | loss_term2 = value_loss.mean() 78 | elif self._plcy_loss_mode == "value_expert": 79 | value_loss_exp = value_loss[is_expert] 80 | loss_term2 = value_loss_exp.mean() 81 | elif self._plcy_loss_mode == "value_policy": 82 | value_loss_plcy = value_loss[~is_expert] 83 | loss_term2 = value_loss_plcy.mean() 84 | elif self._plcy_loss_mode == "q_old_policy": 85 | reward_plcy = reward[~is_expert] 86 | loss_term2 = reward_plcy.mean() 87 | elif self._plcy_loss_mode == "value_q_old_policy": 88 | reward_plcy = reward[~is_expert] 89 | loss_term2 = reward_plcy.mean() + value_loss.mean() 90 | elif self._plcy_loss_mode == "v0": 91 | value_loss_v0 = (1-gamma.detach()) * self.getV(obs[is_expert]) 92 | loss_term2 = value_loss_v0.mean() 93 | elif self._plcy_loss_mode == "off": 94 | loss_term2 = 0.0 95 | else: 96 | raise ValueError("Undefined policy loss mode: %s" % self._plcy_loss_mode) 97 | 98 | # regularize 99 | chi2_loss = self.regularizer_loss(absorbing, reward, gamma, is_expert, treat_absorbing_states=self._treat_absorbing_states) 100 | 101 | loss_Q = loss_term1 + loss_term2 + chi2_loss 102 | self.update_Q_parameters(loss_Q) 103 | 104 | if self._iter % self._logging_iter == 0: 105 | grads = [] 106 | for param in self._critic_approximator.model.network.parameters(): 107 | grads.append(param.grad.view(-1)) 108 | grads = torch.cat(grads) 109 | norm = grads.norm(dim=0, p=2) 110 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. Q-parameters', norm, self._iter) 111 | 112 | return loss_term1, loss_term2, chi2_loss 113 | 114 | def _lossQ_sqil_like(self, obs, act, next_obs, absorbing, is_expert): 115 | 116 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 117 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 118 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 119 | if not self._use_target: 120 | next_v = self.getV(next_obs) 121 | else: 122 | with torch.no_grad(): 123 | next_v = self.get_targetV(next_obs).detach() 124 | absorbing = torch.unsqueeze(absorbing, 1) 125 | if self._target_clipping: 126 | y = (1 - absorbing) * gamma.detach() * torch.clip(next_v, self._Q_min, self._Q_max) 127 | else: 128 | y = (1 - absorbing) * gamma.detach() * next_v 129 | 130 | # define the rewards 131 | if self._treat_absorbing_states: 132 | r_max = (1 - absorbing) * ((1 / self._reg_mult)) \ 133 | + absorbing * (1 / (1 - gamma.detach())) * ((1 / self._reg_mult)) 134 | r_min = (1 - absorbing) * (-(1 / self._reg_mult))\ 135 | + absorbing * (1 / (1 - gamma.detach())) * (-(1 / self._reg_mult)) 136 | else: 137 | r_max = torch.ones_like(absorbing) * ((1 / self._reg_mult)) 138 | r_min = torch.ones_like(absorbing) * (-(1 / self._reg_mult)) 139 | 140 | r_max = r_max[is_expert] 141 | r_min = r_min[~is_expert] 142 | 143 | # expert part 144 | if self._loss_mode_exp == "bootstrap": 145 | if self._Q_exp_loss == "MSE": 146 | loss_term1 = torch.mean(torch.square(current_Q[is_expert] - (r_max + y[is_expert]))) 147 | elif self._Q_exp_loss == "Huber": 148 | loss_term1 = F.huber_loss(current_Q[is_expert], (r_max + y[is_expert])) 149 | else: 150 | raise ValueError("Unknown loss.") 151 | elif self._loss_mode_exp == "fix": 152 | if self._Q_exp_loss == "MSE": 153 | loss_term1 = F.mse_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 154 | elif self._Q_exp_loss == "Huber": 155 | loss_term1 = F.huber_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 156 | else: 157 | raise ValueError("Unknown loss.") 158 | else: 159 | raise ValueError("Unknown expert loss mode.") 160 | 161 | # policy part 162 | if self._plcy_loss_mode == "value": 163 | value = self.getV(obs) 164 | target = y 165 | r_min = torch.concat([r_min, torch.ones_like(r_min) * (-(1 / self._reg_mult))]) 166 | elif self._plcy_loss_mode == "value_plcy": 167 | value = self.getV(obs[~is_expert]) 168 | target = y[~is_expert] 169 | elif self._plcy_loss_mode == "q_old_policy": 170 | value = current_Q[~is_expert] 171 | target = y[~is_expert] 172 | 173 | if self._Q_exp_loss == "MSE": 174 | loss_term2 = torch.mean(torch.square(value - (r_min + target))) 175 | elif self._Q_exp_loss == "Huber": 176 | loss_term2 = F.huber_loss(value, (r_min + target)) 177 | else: 178 | raise ValueError("Unknown loss.") 179 | 180 | # do the logging 181 | reward = (current_Q - y) 182 | self.logging_loss(current_Q, y, reward, is_expert, obs, act, absorbing) 183 | 184 | loss_Q = loss_term1 + loss_term2 185 | self.update_Q_parameters(loss_Q) 186 | 187 | grads = [] 188 | for param in self._critic_approximator.model.network.parameters(): 189 | grads.append(param.grad.view(-1)) 190 | grads = torch.cat(grads) 191 | norm = grads.norm(dim=0, p=2) 192 | if self._iter % self._logging_iter == 0: 193 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. Q-parameters', norm, self._iter) 194 | 195 | return loss_term1, loss_term2, 0.0 -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiq_h.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import torch 3 | import numpy as np 4 | from .lsiq import LSIQ 5 | import torch.nn.functional as F 6 | from mushroom_rl.approximators import Regressor 7 | from mushroom_rl.approximators.parametric import TorchApproximator 8 | 9 | from mushroom_rl.utils.torch import to_float_tensor 10 | 11 | 12 | class LSIQ_H(LSIQ): 13 | 14 | def __init__(self, H_params=None, clip_expert_entropy_to_policy_max=True , 15 | max_H_policy_tau_down = 1e-4, max_H_policy_tau_up = 1e-2, **kwargs): 16 | 17 | # call parent 18 | super().__init__(**kwargs) 19 | 20 | # define the H function with the target 21 | target_H_params = deepcopy(H_params) 22 | self._H_approximator = Regressor(TorchApproximator, 23 | **H_params) 24 | self._target_H_approximator = Regressor(TorchApproximator, 25 | **target_H_params) 26 | self._clip_expert_entropy_to_policy_max = clip_expert_entropy_to_policy_max 27 | self._max_H_policy = None 28 | self._max_H_policy_tau_down = max_H_policy_tau_down 29 | self._max_H_policy_tau_up = max_H_policy_tau_up 30 | 31 | # define the optimizer for the H function 32 | net_params = self._H_approximator.model.network.parameters() 33 | self._H_optimizer = H_params["optimizer"]["class"](net_params, **H_params["optimizer"]["params"]) 34 | 35 | def _lossQ_iq_like(self, obs, act, next_obs, absorbing, is_expert): 36 | 37 | # update Q according to lsiq_update 38 | loss_term1, loss_term2, chi2_loss = super()._lossQ_iq_like(obs, act, next_obs, absorbing, is_expert) 39 | 40 | # update the H function 41 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 42 | self.update_H_function(obs, act, next_obs, absorbing, gamma.detach(), is_expert) 43 | 44 | return loss_term1, loss_term2, chi2_loss 45 | 46 | def _lossQ_sqil_like(self, obs, act, next_obs, absorbing, is_expert): 47 | 48 | # update Q according to lsiq_update 49 | loss_term1, loss_term2, chi2_loss = super(LSIQ_H, self)._lossQ_sqil_like(obs, act, next_obs, absorbing, is_expert) 50 | 51 | # update the H function 52 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 53 | self.update_H_function(obs, act, next_obs, absorbing, gamma.detach(), is_expert) 54 | 55 | return loss_term1, loss_term2, chi2_loss 56 | 57 | def update_H_function(self, obs, action, next_obs, absorbing, gamma, is_expert): 58 | H = self._H_approximator(obs, action, output_tensor=True) 59 | with torch.no_grad(): 60 | next_action, log_pi = self.policy.compute_action_and_log_prob_t(next_obs) 61 | 62 | # restrict the target H of the expert to the maximum one of the policy 63 | neg_log_pi = -log_pi 64 | if self._clip_expert_entropy_to_policy_max: 65 | if self._max_H_policy is None: 66 | self._max_H_policy = torch.max(neg_log_pi[~is_expert]) 67 | else: 68 | curr_max_H_policy = torch.max(neg_log_pi[~is_expert]) 69 | if curr_max_H_policy > self._max_H_policy: 70 | self._max_H_policy = (1 - self._max_H_policy_tau_up) * self._max_H_policy + \ 71 | self._max_H_policy_tau_up * curr_max_H_policy 72 | else: 73 | self._max_H_policy = (1 - self._max_H_policy_tau_down) * self._max_H_policy + \ 74 | self._max_H_policy_tau_down * curr_max_H_policy 75 | neg_log_pi[is_expert] = torch.clip(neg_log_pi[is_expert], self._max_H_policy, 100000) 76 | 77 | next_H = (self._target_H_approximator(next_obs, next_action, output_tensor=True).detach() + 78 | self._alpha.detach() * torch.unsqueeze(neg_log_pi, 1)) 79 | target_H = (1 - absorbing) * gamma * next_H 80 | 81 | # clip the target for numerical stability 82 | target_H = torch.clip(target_H, -10000, 1000) 83 | loss_H = F.mse_loss(H, target_H) 84 | 85 | self._H_optimizer.zero_grad() 86 | loss_H.backward() 87 | self._H_optimizer.step() 88 | 89 | H = H.detach().cpu().numpy() 90 | log_pi = log_pi.detach().cpu().numpy() 91 | 92 | # do some additional logging 93 | if self._iter % self._logging_iter == 0: 94 | self.sw_add_scalar('H function/Loss', loss_H, self._iter) 95 | self.sw_add_scalar('H function/H', np.mean(H), self._iter) 96 | self.sw_add_scalar('H function/H plcy', np.mean(H[~is_expert]), self._iter) 97 | self.sw_add_scalar('H function/H expert', np.mean(H[is_expert]), self._iter) 98 | self.sw_add_scalar('H function/H_step', np.mean(-log_pi), self._iter) 99 | self.sw_add_scalar('H function/H_step plcy', np.mean(-log_pi[~is_expert]), self._iter) 100 | self.sw_add_scalar('H function/H_step expert', np.mean(-log_pi[is_expert]), self._iter) 101 | 102 | return loss_H, H, log_pi 103 | 104 | def _actor_loss(self, state, action_new, log_prob): 105 | q = self._critic_approximator(state, action_new, output_tensor=True) 106 | H = self._H_approximator(state, action_new, output_tensor=True) 107 | soft_q = q + H 108 | return (self._alpha.detach() * log_prob - soft_q).mean() 109 | 110 | def getV(self, obs): 111 | with torch.no_grad(): 112 | action, _ = self.policy.compute_action_and_log_prob_t(obs) 113 | current_V = self._critic_approximator(obs, action.detach().cpu().numpy(), output_tensor=True) 114 | return current_V 115 | 116 | def get_targetV(self, obs): 117 | with torch.no_grad(): 118 | action, _ = self.policy.compute_action_and_log_prob_t(obs) 119 | target_V = self._target_critic_approximator(obs, action.detach().cpu().numpy(), output_tensor=True) 120 | return target_V 121 | 122 | def _update_all_targets(self): 123 | self._update_target(self._critic_approximator, 124 | self._target_critic_approximator) 125 | self._update_target(self._H_approximator, 126 | self._target_H_approximator) 127 | -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiq_hc.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import torch 3 | import numpy as np 4 | from .lsiq_h import LSIQ_H 5 | import torch.nn.functional as F 6 | from mushroom_rl.utils.parameters import to_parameter 7 | from mushroom_rl.approximators import Regressor 8 | from mushroom_rl.approximators.parametric import TorchApproximator 9 | 10 | from mushroom_rl.utils.torch import to_float_tensor 11 | 12 | 13 | class LSIQ_HC(LSIQ_H): 14 | 15 | def __init__(self, H_tau, H_loss_mode="Huber", **kwargs): 16 | 17 | # call parent 18 | super().__init__(**kwargs) 19 | 20 | self._H_tau = to_parameter(H_tau) 21 | self._H_loss_mode = H_loss_mode # either MSE or Huber 22 | 23 | def update_H_function(self, obs, action, next_obs, absorbing, gamma, is_expert): 24 | 25 | # calculate the squared reward of the current Q 26 | H = self._H_approximator(obs, action, output_tensor=True) 27 | with torch.no_grad(): 28 | next_action, log_pi = self.policy.compute_action_and_log_prob_t(next_obs) 29 | Q_plcy = self._target_critic_approximator(obs, action, output_tensor=True) 30 | V_plcy = self.get_targetV(obs) 31 | y = (1 - absorbing) * gamma.detach() * torch.clip(V_plcy, self._Q_min, 32 | self._Q_max) 33 | 34 | reward_non_abs = torch.square(torch.clip(Q_plcy - y, -1/self._reg_mult, 1/self._reg_mult)).detach() 35 | reward_abs = torch.square(torch.clip(Q_plcy - y, self._Q_min, self._Q_max)).detach() 36 | 37 | squared_reg_reward_plcy = (1 - absorbing) * self._reg_mult * reward_non_abs \ 38 | + absorbing * (1.0 - gamma.detach()) * self._reg_mult * reward_abs 39 | 40 | # restrict the target H of the expert to the maximum one of the policy 41 | neg_log_pi = -log_pi 42 | if self._clip_expert_entropy_to_policy_max: 43 | if self._max_H_policy is None: 44 | self._max_H_policy = torch.max(neg_log_pi[~is_expert]) 45 | else: 46 | curr_max_H_policy = torch.max(neg_log_pi[~is_expert]) 47 | if curr_max_H_policy > self._max_H_policy: 48 | self._max_H_policy = (1 - self._max_H_policy_tau_up) * self._max_H_policy + \ 49 | self._max_H_policy_tau_up * curr_max_H_policy 50 | else: 51 | self._max_H_policy = (1 - self._max_H_policy_tau_down) * self._max_H_policy + \ 52 | self._max_H_policy_tau_down * curr_max_H_policy 53 | neg_log_pi[is_expert] = torch.clip(neg_log_pi[is_expert], self._max_H_policy, 100000) 54 | 55 | # calculate the target for the HC-function 56 | next_H = (self._target_H_approximator(next_obs, next_action, output_tensor=True).detach() + 57 | self._alpha.detach() * torch.unsqueeze(neg_log_pi, 1)) 58 | target_H = squared_reg_reward_plcy + (1 - absorbing) * gamma * next_H 59 | 60 | # clip the target for numerical stability 61 | Q2_max = (1.0/self._reg_mult)**2 / (1 - gamma.detach()) 62 | target_H = torch.clip(target_H, -1000, Q2_max+100) 63 | 64 | if self._H_loss_mode == "Huber": 65 | loss_H = F.huber_loss(H, target_H) 66 | elif self._H_loss_mode == "MSE": 67 | loss_H = F.mse_loss(H, target_H) 68 | else: 69 | raise ValueError("Unsupported H_loss %s" % self._H_loss_mode) 70 | 71 | self._H_optimizer.zero_grad() 72 | loss_H.backward() 73 | self._H_optimizer.step() 74 | 75 | H = H.detach().cpu().numpy() 76 | log_pi = log_pi.detach().cpu().numpy() 77 | 78 | # do some additional logging 79 | if self._iter % self._logging_iter == 0: 80 | self.sw_add_scalar('H function/Loss', loss_H, self._iter) 81 | self.sw_add_scalar('H function/H', np.mean(H), self._iter) 82 | self.sw_add_scalar('H function/H plcy', np.mean(H[~is_expert]), self._iter) 83 | self.sw_add_scalar('H function/H expert', np.mean(H[is_expert]), self._iter) 84 | self.sw_add_scalar('H function/H_step', np.mean(-log_pi), self._iter) 85 | self.sw_add_scalar('H function/H_step plcy', np.mean(-log_pi[~is_expert]), self._iter) 86 | self.sw_add_scalar('H function/H_step expert', np.mean(-log_pi[is_expert]), self._iter) 87 | 88 | return loss_H, H, log_pi 89 | 90 | def _update_all_targets(self): 91 | self._update_target(self._critic_approximator, 92 | self._target_critic_approximator) 93 | self._update_target_H(self._H_approximator, 94 | self._target_H_approximator) 95 | 96 | def _update_target_H(self, online, target): 97 | for i in range(len(target)): 98 | weights = self._H_tau() * online[i].get_weights() 99 | weights += (1 - self._H_tau.get_value()) * target[i].get_weights() 100 | target[i].set_weights(weights) 101 | -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiqfo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from copy import deepcopy 5 | 6 | from mushroom_rl.core import Serializable 7 | from mushroom_rl.approximators import Regressor 8 | from mushroom_rl.approximators.parametric import TorchApproximator 9 | from imitation_lib.imitation.lsiq import LSIQ 10 | from mushroom_rl.utils.minibatches import minibatch_generator 11 | from mushroom_rl.utils.torch import to_float_tensor 12 | from mushroom_rl.utils.parameters import to_parameter 13 | from imitation_lib.utils.action_models import GaussianInvActionModel, LearnableVarGaussianInvActionModel,\ 14 | GCPActionModel, KLGCPActionModel, KLGaussianInvActionModel 15 | 16 | from imitation_lib.utils.distributions import InverseGamma 17 | 18 | 19 | class LSIQfO(LSIQ): 20 | 21 | def __init__(self, action_model, action_model_params, action_model_fit_params=None, action_model_noise_std=0.0, 22 | action_model_noise_clip=None, add_noise_to_obs=False, ext_normalizer_action_model=None, 23 | interpolate_expert_states=False, interpolation_coef=1.0, **kwargs): 24 | 25 | super().__init__(**kwargs) 26 | 27 | if action_model == GaussianInvActionModel or action_model == GCPActionModel \ 28 | or action_model == KLGCPActionModel or action_model == KLGaussianInvActionModel: 29 | action_model_params.setdefault("min_a", self.mdp_info.action_space.low) 30 | action_model_params.setdefault("max_a", self.mdp_info.action_space.high) 31 | action_model_params.setdefault("use_cuda", self._use_cuda) 32 | elif action_model == LearnableVarGaussianInvActionModel: 33 | action_model_params.setdefault("use_cuda", self._use_cuda) 34 | 35 | # setup the action model 36 | self._action_model = action_model(**action_model_params, demonstration=self._demonstrations) 37 | 38 | self._action_model_fit_params = dict(fits_per_step=1, init_epochs=0, )\ 39 | if action_model_fit_params is None else action_model_fit_params 40 | self._action_model_initialized = True if self._action_model_fit_params["init_epochs"] > 0 else False 41 | self._action_model_batch_size = action_model_params["batch_size"] 42 | 43 | self._action_model_noise_std = action_model_noise_std 44 | self._action_model_noise_clip = action_model_noise_clip 45 | self.ext_normalizer_action_model = ext_normalizer_action_model 46 | self._add_noise_to_obs = add_noise_to_obs 47 | self._interpolate_expert_states = interpolate_expert_states 48 | self._interpolation_coef = interpolation_coef 49 | 50 | self._add_save_attr( 51 | _action_model='mushroom', 52 | _action_model_fit_params='pickle', 53 | _action_model_noise_std='primitive', 54 | _action_model_noise_clip='primitive', 55 | ext_normalizer_action_model='pickle', 56 | _add_noise_to_obs='primitive' 57 | ) 58 | 59 | def fit(self, dataset): 60 | 61 | # add to replay memory 62 | self._replay_memory.add(dataset) 63 | 64 | if self._replay_memory.initialized: 65 | 66 | # train the action model 67 | if not self._action_model_initialized: 68 | self.train_action_model(init=True) 69 | self._action_model_initialized = True 70 | else: 71 | self.train_action_model() 72 | 73 | # sample batch from policy replay buffer 74 | state, action, reward, next_state, absorbing, _ = \ 75 | self._replay_memory.get(self._batch_size()) 76 | 77 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 78 | demo_obs, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 79 | self._demonstrations["states"], 80 | self._demonstrations["next_states"], 81 | self._demonstrations["absorbing"])) 82 | 83 | # predict the actions for our expert dataset 84 | demo_obs_act = demo_obs.astype(np.float32)[:, self._state_mask] 85 | demo_nobs_act = demo_nobs.astype(np.float32)[:, self._state_mask] 86 | demo_act = self._action_model.draw_action(to_float_tensor(demo_obs_act), 87 | to_float_tensor(demo_nobs_act)) 88 | 89 | # clip predicted action to action range 90 | demo_act = np.clip(demo_act, self.mdp_info.action_space.low, self.mdp_info.action_space.high) 91 | 92 | if self._add_noise_to_obs: 93 | assert self.ext_normalizer_action_model is not None, "Normalizer is needed to be defined." 94 | 95 | demo_obs = self.ext_normalizer_action_model(demo_obs) 96 | demo_nobs = self.ext_normalizer_action_model(demo_nobs) 97 | demo_obs += self._get_noise(demo_obs) 98 | demo_nobs += self._get_noise(demo_nobs) 99 | demo_obs = self.ext_normalizer_action_model.inv(demo_obs) 100 | demo_nobs = self.ext_normalizer_action_model.inv(demo_nobs) 101 | 102 | # make interpolation if needed 103 | if self._interpolate_expert_states: 104 | demo_obs = self.interpolate(demo_obs[:, self._state_mask], state[:, self._state_mask], 105 | mixing_coef=self._interpolation_coef) 106 | demo_act = self.interpolate(demo_act, action, 107 | mixing_coef=self._interpolation_coef) 108 | 109 | # prepare data for IQ update 110 | input_states = to_float_tensor(np.concatenate([state, 111 | demo_obs.astype(np.float32)[:, self._state_mask]])) 112 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 113 | input_n_states = to_float_tensor(np.concatenate([next_state, 114 | demo_nobs.astype(np.float32)[:, self._state_mask]])) 115 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 116 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 117 | torch.ones(len(state), dtype=torch.bool)]) 118 | 119 | # make IQ update 120 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 121 | 122 | self._iter += 1 123 | self.policy.iter += 1 124 | 125 | def _get_noise(self, x): 126 | noise = np.random.normal(loc=0.0, scale=self._action_model_noise_std, 127 | size=np.size(x)).reshape(x.shape) 128 | noise = np.clip(noise, -self._action_model_noise_clip, self._action_model_noise_clip) \ 129 | if self._action_model_noise_clip is not None else noise 130 | return noise 131 | 132 | def interpolate(self, expert_data, policy_data, mixing_coef=None): 133 | interpolated = mixing_coef * expert_data + (1 - mixing_coef) * policy_data 134 | return interpolated 135 | 136 | def train_action_model(self, init=False): 137 | 138 | if init and self._action_model_fit_params["init_epochs"] > 0: 139 | n_epochs = self._action_model_fit_params["init_epochs"] 140 | # initialize the model 141 | state, action, _, next_state, _, _ = self._replay_memory.get(self._replay_memory.size) 142 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 143 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 144 | state_train = state[0:int(len(state)*0.9), :] 145 | state_val = state[int(len(state)*0.9):, :] 146 | next_state_train = next_state[0:int(len(next_state)*0.9), :] 147 | next_state_val = next_state[int(len(next_state)*0.9):, :] 148 | action_train = action[0:int(len(next_state)*0.9), :] 149 | action_val = action[int(len(next_state)*0.9):, :] 150 | state_nstate_train = np.concatenate([state_train, next_state_train], axis=1) 151 | state_nstate_val = np.concatenate([state_val, next_state_val], axis=1) 152 | 153 | # make eval before training 154 | action_pred = self._action_model(state_nstate_val) 155 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 156 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 157 | print("Action Model Validation Loss before training: ", loss) 158 | action_pred = self._action_model(state_nstate_train) 159 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 160 | print("Action Model Training Loss before training: ", loss) 161 | w = self._action_model.get_weights() 162 | norm = np.linalg.norm(w) 163 | self.sw_add_scalar("Action-Model/Norm", norm, self._iter) 164 | 165 | # make training 166 | self._action_model.fit(state_nstate_train, action_train, n_epochs=n_epochs) 167 | 168 | # make eval after training 169 | action_pred = self._action_model(state_nstate_val) 170 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 171 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 172 | print("Action Model Validation Loss After training: ", loss) 173 | action_pred = self._action_model(state_nstate_train) 174 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 175 | print("Action Model Validation Loss After training: ", loss) 176 | 177 | else: 178 | state_nstates = [] 179 | actions = [] 180 | for i in range(self._action_model_fit_params["fits_per_step"]): 181 | # sample batch from policy replay buffer 182 | state, action, reward, next_state, absorbing, _ = \ 183 | self._replay_memory.get(self._action_model_batch_size) 184 | 185 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 186 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 187 | self._action_model.fit(state, next_state, action) 188 | 189 | state_nstates.append([state, next_state]) 190 | actions.append(action) 191 | 192 | if self._iter % self._logging_iter == 0: 193 | 194 | # sample batch from policy replay buffer 195 | states, actions, rewards, next_states, absorbings, _ = \ 196 | self._replay_memory.get(self._action_model_batch_size) 197 | 198 | # we need to check if we have a dataset with expert actions available or not 199 | try: 200 | exp_states, exp_next_states, exp_actions = next( 201 | minibatch_generator(self._action_model_batch_size, 202 | self._demonstrations["states"], 203 | self._demonstrations["next_states"], 204 | self._demonstrations["actions"])) 205 | except KeyError: 206 | exp_states, exp_next_states = next(minibatch_generator(self._action_model_batch_size, 207 | self._demonstrations["states"], 208 | self._demonstrations["next_states"])) 209 | exp_actions = None 210 | 211 | # log mse 212 | action_pred = self._action_model(states[:, self._state_mask], next_states[:, self._state_mask]) 213 | mse = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(actions)) 214 | self.sw_add_scalar('Action-Model/Loss Policy', mse, self._iter) 215 | if exp_actions is not None: 216 | action_pred_exp = self._action_model(exp_states[:, self._state_mask], 217 | exp_next_states[:, self._state_mask]) 218 | mse_exp = F.mse_loss(to_float_tensor(action_pred_exp), to_float_tensor(exp_actions)) 219 | self.sw_add_scalar('Action-Model/Loss Exp', mse_exp, self._iter) 220 | 221 | # log entropy 222 | ent_plcy = self._action_model.entropy(states[:, self._state_mask], 223 | next_states[:, self._state_mask]) 224 | ent_exp = self._action_model.entropy(exp_states[:, self._state_mask], 225 | exp_next_states[:, self._state_mask]) 226 | self.sw_add_scalar('Action-Model/Entropy Plcy', ent_plcy, 227 | self._iter) 228 | self.sw_add_scalar('Action-Model/Entropy Exp', ent_exp, 229 | self._iter) 230 | 231 | # log mu, lam, alpha, beta 232 | if type(self._action_model) == GCPActionModel or type(self._action_model) == KLGCPActionModel: 233 | mu, lam, alpha, beta = self._action_model.get_prior_params(states[:, self._state_mask], 234 | next_states[:, self._state_mask]) 235 | self.sw_add_scalar('Action-Model/Mu', np.mean(mu.detach().cpu().numpy()), self._iter) 236 | self.sw_add_scalar('Action-Model/Lambda', np.mean(lam.detach().cpu().numpy()), self._iter) 237 | self.sw_add_scalar('Action-Model/Lambda Counter', self._action_model.lam_counter, self._iter) 238 | self.sw_add_scalar('Action-Model/Alpha', np.mean(alpha.detach().cpu().numpy()), self._iter) 239 | self.sw_add_scalar('Action-Model/Beta', np.mean(beta.detach().cpu().numpy()), self._iter) 240 | self.sw_add_scalar('Action-Model/Var', 241 | np.mean(self._action_model.get_corrected_pred_var(lam, 242 | alpha, 243 | beta).detach().cpu().numpy()), 244 | self._iter) 245 | mu_exp, lam_exp, alpha_exp, beta_exp = \ 246 | self._action_model.get_prior_params(exp_states[:, self._state_mask], 247 | exp_next_states[:, self._state_mask]) 248 | self.sw_add_scalar('Action-Model/Mu Exp', np.mean(mu_exp.detach().cpu().numpy()), self._iter) 249 | self.sw_add_scalar('Action-Model/Lambda Exp', np.mean(lam_exp.detach().cpu().numpy()), self._iter) 250 | self.sw_add_scalar('Action-Model/Alpha Exp', np.mean(alpha_exp.detach().cpu().numpy()), self._iter) 251 | self.sw_add_scalar('Action-Model/Beta Exp', np.mean(beta_exp.detach().cpu().numpy()), self._iter) 252 | self.sw_add_scalar('Action-Model/Var Exp', 253 | np.mean(self._action_model.get_corrected_pred_var(lam_exp, 254 | alpha_exp, 255 | beta_exp).detach().cpu().numpy()), 256 | self._iter) 257 | elif type(self._action_model) == GaussianInvActionModel or \ 258 | type(self._action_model) == KLGaussianInvActionModel: 259 | mu, log_sigma = self._action_model.get_mu_log_sigma(state[:, self._state_mask], 260 | next_state[:, self._state_mask]) 261 | mu_exp, log_sigma_exp = self._action_model.get_mu_log_sigma(exp_states.astype(np.float32)[:, self._state_mask], 262 | exp_next_states.astype(np.float32)[:, self._state_mask]) 263 | 264 | self._sw.add_scalar('Action-Model/Std Exp', torch.mean(torch.exp(log_sigma_exp)), self._iter) 265 | self._sw.add_scalar('Action-Model/Std', torch.mean(torch.exp(log_sigma)), self._iter) 266 | self._sw.add_scalar('Action-Model/Mu Exp', torch.mean(mu_exp), self._iter) 267 | self._sw.add_scalar('Action-Model/Mu', torch.mean(mu), self._iter) 268 | -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiqfo_h.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | from imitation_lib.imitation.lsiq_h import LSIQ_H 6 | from mushroom_rl.utils.minibatches import minibatch_generator 7 | from mushroom_rl.utils.torch import to_float_tensor 8 | from imitation_lib.utils.action_models import GaussianInvActionModel, LearnableVarGaussianInvActionModel,\ 9 | GCPActionModel, KLGCPActionModel, KLGaussianInvActionModel 10 | 11 | 12 | class LSIQfO_H(LSIQ_H): 13 | 14 | def __init__(self, action_model, action_model_params, action_model_fit_params=None, action_model_noise_std=0.0, 15 | action_model_noise_clip=None, add_noise_to_obs=False, ext_normalizer_action_model=None, 16 | interpolate_expert_states=False, interpolation_coef=1.0, **kwargs): 17 | 18 | super().__init__(**kwargs) 19 | 20 | if action_model == GaussianInvActionModel or action_model == GCPActionModel \ 21 | or action_model == KLGCPActionModel or action_model == KLGaussianInvActionModel: 22 | action_model_params.setdefault("min_a", self.mdp_info.action_space.low) 23 | action_model_params.setdefault("max_a", self.mdp_info.action_space.high) 24 | action_model_params.setdefault("use_cuda", self._use_cuda) 25 | elif action_model == LearnableVarGaussianInvActionModel: 26 | action_model_params.setdefault("use_cuda", self._use_cuda) 27 | 28 | # setup the action model 29 | self._action_model = action_model(**action_model_params, demonstration=self._demonstrations) 30 | 31 | self._action_model_fit_params = dict(fits_per_step=1, init_epochs=0, )\ 32 | if action_model_fit_params is None else action_model_fit_params 33 | self._action_model_initialized = True if self._action_model_fit_params["init_epochs"] > 0 else False 34 | self._action_model_batch_size = action_model_params["batch_size"] 35 | 36 | self._action_model_noise_std = action_model_noise_std 37 | self._action_model_noise_clip = action_model_noise_clip 38 | self.ext_normalizer_action_model = ext_normalizer_action_model 39 | self._add_noise_to_obs = add_noise_to_obs 40 | self._interpolate_expert_states = interpolate_expert_states 41 | self._interpolation_coef = interpolation_coef 42 | 43 | self._add_save_attr( 44 | _action_model='mushroom', 45 | _action_model_fit_params='pickle', 46 | _action_model_noise_std='primitive', 47 | _action_model_noise_clip='primitive', 48 | ext_normalizer_action_model='pickle', 49 | _add_noise_to_obs='primitive' 50 | ) 51 | 52 | def fit(self, dataset): 53 | 54 | # add to replay memory 55 | self._replay_memory.add(dataset) 56 | 57 | if self._replay_memory.initialized: 58 | 59 | # train the action model 60 | if not self._action_model_initialized: 61 | self.train_action_model(init=True) 62 | self._action_model_initialized = True 63 | else: 64 | self.train_action_model() 65 | 66 | # sample batch from policy replay buffer 67 | state, action, reward, next_state, absorbing, _ = \ 68 | self._replay_memory.get(self._batch_size()) 69 | 70 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 71 | demo_obs, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 72 | self._demonstrations["states"], 73 | self._demonstrations["next_states"], 74 | self._demonstrations["absorbing"])) 75 | 76 | # predict the actions for our expert dataset 77 | demo_obs_act = demo_obs.astype(np.float32)[:, self._state_mask] 78 | demo_nobs_act = demo_nobs.astype(np.float32)[:, self._state_mask] 79 | demo_act = self._action_model.draw_action(to_float_tensor(demo_obs_act), 80 | to_float_tensor(demo_nobs_act)) 81 | 82 | if self._add_noise_to_obs: 83 | assert self.ext_normalizer_action_model is not None, "Normalizer is needed to be defined." 84 | 85 | demo_obs = self.ext_normalizer_action_model(demo_obs) 86 | demo_nobs = self.ext_normalizer_action_model(demo_nobs) 87 | demo_obs += self._get_noise(demo_obs) 88 | demo_nobs += self._get_noise(demo_nobs) 89 | demo_obs = self.ext_normalizer_action_model.inv(demo_obs) 90 | demo_nobs = self.ext_normalizer_action_model.inv(demo_nobs) 91 | 92 | # make interpolation if needed 93 | if self._interpolate_expert_states: 94 | demo_obs = self.interpolate(demo_obs[:, self._state_mask], state[:, self._state_mask], 95 | mixing_coef=self._interpolation_coef) 96 | demo_act = self.interpolate(demo_act, action, 97 | mixing_coef=self._interpolation_coef) 98 | 99 | # prepare data for IQ update 100 | input_states = to_float_tensor(np.concatenate([state, 101 | demo_obs.astype(np.float32)[:, self._state_mask]])) 102 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 103 | input_n_states = to_float_tensor(np.concatenate([next_state, 104 | demo_nobs.astype(np.float32)[:, self._state_mask]])) 105 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 106 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 107 | torch.ones(len(state), dtype=torch.bool)]) 108 | 109 | # make IQ update 110 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 111 | 112 | self._iter += 1 113 | self.policy.iter += 1 114 | 115 | def _get_noise(self, x): 116 | noise = np.random.normal(loc=0.0, scale=self._action_model_noise_std, 117 | size=np.size(x)).reshape(x.shape) 118 | noise = np.clip(noise, -self._action_model_noise_clip, self._action_model_noise_clip) \ 119 | if self._action_model_noise_clip is not None else noise 120 | return noise 121 | 122 | def interpolate(self, expert_data, policy_data, mixing_coef=None): 123 | interpolated = mixing_coef * expert_data + (1 - mixing_coef) * policy_data 124 | return interpolated 125 | 126 | def train_action_model(self, init=False): 127 | 128 | if init and self._action_model_fit_params["init_epochs"] > 0: 129 | n_epochs = self._action_model_fit_params["init_epochs"] 130 | # initialize the model 131 | state, action, _, next_state, _, _ = self._replay_memory.get(self._replay_memory.size) 132 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 133 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 134 | state_train = state[0:int(len(state)*0.9), :] 135 | state_val = state[int(len(state)*0.9):, :] 136 | next_state_train = next_state[0:int(len(next_state)*0.9), :] 137 | next_state_val = next_state[int(len(next_state)*0.9):, :] 138 | action_train = action[0:int(len(next_state)*0.9), :] 139 | action_val = action[int(len(next_state)*0.9):, :] 140 | state_nstate_train = np.concatenate([state_train, next_state_train], axis=1) 141 | state_nstate_val = np.concatenate([state_val, next_state_val], axis=1) 142 | 143 | # make eval before training 144 | action_pred = self._action_model(state_nstate_val) 145 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 146 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 147 | print("Action Model Validation Loss before training: ", loss) 148 | action_pred = self._action_model(state_nstate_train) 149 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 150 | print("Action Model Training Loss before training: ", loss) 151 | w = self._action_model.get_weights() 152 | norm = np.linalg.norm(w) 153 | self.sw_add_scalar("Action-Model/Norm", norm, self._iter) 154 | 155 | # make training 156 | self._action_model.fit(state_nstate_train, action_train, n_epochs=n_epochs) 157 | 158 | # make eval after training 159 | action_pred = self._action_model(state_nstate_val) 160 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 161 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 162 | print("Action Model Validation Loss After training: ", loss) 163 | action_pred = self._action_model(state_nstate_train) 164 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 165 | print("Action Model Validation Loss After training: ", loss) 166 | 167 | else: 168 | state_nstates = [] 169 | actions = [] 170 | for i in range(self._action_model_fit_params["fits_per_step"]): 171 | # sample batch from policy replay buffer 172 | state, action, reward, next_state, absorbing, _ = \ 173 | self._replay_memory.get(self._action_model_batch_size) 174 | 175 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 176 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 177 | self._action_model.fit(state, next_state, action) 178 | 179 | state_nstates.append([state, next_state]) 180 | actions.append(action) 181 | 182 | if self._iter % self._logging_iter == 0: 183 | 184 | # sample batch from policy replay buffer 185 | states, actions, rewards, next_states, absorbings, _ = \ 186 | self._replay_memory.get(self._action_model_batch_size) 187 | 188 | # we need to check if we have a dataset with expert actions available or not 189 | try: 190 | exp_states, exp_next_states, exp_actions = next( 191 | minibatch_generator(self._action_model_batch_size, 192 | self._demonstrations["states"], 193 | self._demonstrations["next_states"], 194 | self._demonstrations["actions"])) 195 | except KeyError: 196 | exp_states, exp_next_states = next(minibatch_generator(self._action_model_batch_size, 197 | self._demonstrations["states"], 198 | self._demonstrations["next_states"])) 199 | exp_actions = None 200 | 201 | # log mse 202 | action_pred = self._action_model(states[:, self._state_mask], next_states[:, self._state_mask]) 203 | mse = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(actions)) 204 | self.sw_add_scalar('Action-Model/Loss Policy', mse, self._iter) 205 | if exp_actions is not None: 206 | action_pred_exp = self._action_model(exp_states[:, self._state_mask], 207 | exp_next_states[:, self._state_mask]) 208 | mse_exp = F.mse_loss(to_float_tensor(action_pred_exp), to_float_tensor(exp_actions)) 209 | self.sw_add_scalar('Action-Model/Loss Exp', mse_exp, self._iter) 210 | 211 | # log entropy 212 | ent_plcy = self._action_model.entropy(states[:, self._state_mask], 213 | next_states[:, self._state_mask]) 214 | ent_exp = self._action_model.entropy(exp_states[:, self._state_mask], 215 | exp_next_states[:, self._state_mask]) 216 | self.sw_add_scalar('Action-Model/Entropy Plcy', ent_plcy, 217 | self._iter) 218 | self.sw_add_scalar('Action-Model/Entropy Exp', ent_exp, 219 | self._iter) 220 | 221 | # log mu, lam, alpha, beta 222 | if type(self._action_model) == GCPActionModel or type(self._action_model) == KLGCPActionModel: 223 | mu, lam, alpha, beta = self._action_model.get_prior_params(states[:, self._state_mask], 224 | next_states[:, self._state_mask]) 225 | self.sw_add_scalar('Action-Model/Mu', np.mean(mu.detach().cpu().numpy()), self._iter) 226 | self.sw_add_scalar('Action-Model/Lambda', np.mean(lam.detach().cpu().numpy()), self._iter) 227 | self.sw_add_scalar('Action-Model/Lambda Counter', self._action_model.lam_counter, self._iter) 228 | self.sw_add_scalar('Action-Model/Alpha', np.mean(alpha.detach().cpu().numpy()), self._iter) 229 | self.sw_add_scalar('Action-Model/Beta', np.mean(beta.detach().cpu().numpy()), self._iter) 230 | self.sw_add_scalar('Action-Model/Var', 231 | np.mean(self._action_model.get_corrected_pred_var(lam, 232 | alpha, 233 | beta).detach().cpu().numpy()), 234 | self._iter) 235 | mu_exp, lam_exp, alpha_exp, beta_exp = \ 236 | self._action_model.get_prior_params(exp_states[:, self._state_mask], 237 | exp_next_states[:, self._state_mask]) 238 | self.sw_add_scalar('Action-Model/Mu Exp', np.mean(mu_exp.detach().cpu().numpy()), self._iter) 239 | self.sw_add_scalar('Action-Model/Lambda Exp', np.mean(lam_exp.detach().cpu().numpy()), self._iter) 240 | self.sw_add_scalar('Action-Model/Alpha Exp', np.mean(alpha_exp.detach().cpu().numpy()), self._iter) 241 | self.sw_add_scalar('Action-Model/Beta Exp', np.mean(beta_exp.detach().cpu().numpy()), self._iter) 242 | self.sw_add_scalar('Action-Model/Var Exp', 243 | np.mean(self._action_model.get_corrected_pred_var(lam_exp, 244 | alpha_exp, 245 | beta_exp).detach().cpu().numpy()), 246 | self._iter) 247 | elif type(self._action_model) == GaussianInvActionModel or \ 248 | type(self._action_model) == KLGaussianInvActionModel: 249 | mu, log_sigma = self._action_model.get_mu_log_sigma(state[:, self._state_mask], 250 | next_state[:, self._state_mask]) 251 | mu_exp, log_sigma_exp = self._action_model.get_mu_log_sigma(exp_states.astype(np.float32)[:, self._state_mask], 252 | exp_next_states.astype(np.float32)[:, self._state_mask]) 253 | 254 | self._sw.add_scalar('Action-Model/Std Exp', torch.mean(torch.exp(log_sigma_exp)), self._iter) 255 | self._sw.add_scalar('Action-Model/Std', torch.mean(torch.exp(log_sigma)), self._iter) 256 | self._sw.add_scalar('Action-Model/Mu Exp', torch.mean(mu_exp), self._iter) 257 | self._sw.add_scalar('Action-Model/Mu', torch.mean(mu), self._iter) 258 | -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiqfo_hc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | from imitation_lib.imitation.lsiq_hc import LSIQ_HC 6 | from mushroom_rl.utils.minibatches import minibatch_generator 7 | from mushroom_rl.utils.torch import to_float_tensor 8 | from imitation_lib.utils.action_models import GaussianInvActionModel, LearnableVarGaussianInvActionModel,\ 9 | GCPActionModel, KLGCPActionModel, KLGaussianInvActionModel 10 | 11 | 12 | class LSIQfO_HC(LSIQ_HC): 13 | 14 | def __init__(self, action_model, action_model_params, action_model_fit_params=None, action_model_noise_std=0.0, 15 | action_model_noise_clip=None, add_noise_to_obs=False, ext_normalizer_action_model=None, 16 | interpolate_expert_states=False, interpolation_coef=1.0, **kwargs): 17 | 18 | super().__init__(**kwargs) 19 | 20 | if action_model == GaussianInvActionModel or action_model == GCPActionModel \ 21 | or action_model == KLGCPActionModel or action_model == KLGaussianInvActionModel: 22 | action_model_params.setdefault("min_a", self.mdp_info.action_space.low) 23 | action_model_params.setdefault("max_a", self.mdp_info.action_space.high) 24 | action_model_params.setdefault("use_cuda", self._use_cuda) 25 | elif action_model == LearnableVarGaussianInvActionModel: 26 | action_model_params.setdefault("use_cuda", self._use_cuda) 27 | 28 | # setup the action model 29 | self._action_model = action_model(**action_model_params, demonstration=self._demonstrations) 30 | 31 | self._action_model_fit_params = dict(fits_per_step=1, init_epochs=0, )\ 32 | if action_model_fit_params is None else action_model_fit_params 33 | self._action_model_initialized = True if self._action_model_fit_params["init_epochs"] > 0 else False 34 | self._action_model_batch_size = action_model_params["batch_size"] 35 | 36 | self._action_model_noise_std = action_model_noise_std 37 | self._action_model_noise_clip = action_model_noise_clip 38 | self.ext_normalizer_action_model = ext_normalizer_action_model 39 | self._add_noise_to_obs = add_noise_to_obs 40 | self._interpolate_expert_states = interpolate_expert_states 41 | self._interpolation_coef = interpolation_coef 42 | 43 | self._add_save_attr( 44 | _action_model='mushroom', 45 | _action_model_fit_params='pickle', 46 | _action_model_noise_std='primitive', 47 | _action_model_noise_clip='primitive', 48 | ext_normalizer_action_model='pickle', 49 | _add_noise_to_obs='primitive' 50 | ) 51 | 52 | def fit(self, dataset): 53 | 54 | # add to replay memory 55 | self._replay_memory.add(dataset) 56 | 57 | if self._replay_memory.initialized: 58 | 59 | # train the action model 60 | if not self._action_model_initialized: 61 | self.train_action_model(init=True) 62 | self._action_model_initialized = True 63 | else: 64 | self.train_action_model() 65 | 66 | # sample batch from policy replay buffer 67 | state, action, reward, next_state, absorbing, _ = \ 68 | self._replay_memory.get(self._batch_size()) 69 | 70 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 71 | demo_obs, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 72 | self._demonstrations["states"], 73 | self._demonstrations["next_states"], 74 | self._demonstrations["absorbing"])) 75 | 76 | # predict the actions for our expert dataset 77 | demo_obs_act = demo_obs.astype(np.float32)[:, self._state_mask] 78 | demo_nobs_act = demo_nobs.astype(np.float32)[:, self._state_mask] 79 | demo_act = self._action_model.draw_action(to_float_tensor(demo_obs_act), 80 | to_float_tensor(demo_nobs_act)) 81 | 82 | if self._add_noise_to_obs: 83 | assert self.ext_normalizer_action_model is not None, "Normalizer is needed to be defined." 84 | 85 | demo_obs = self.ext_normalizer_action_model(demo_obs) 86 | demo_nobs = self.ext_normalizer_action_model(demo_nobs) 87 | demo_obs += self._get_noise(demo_obs) 88 | demo_nobs += self._get_noise(demo_nobs) 89 | demo_obs = self.ext_normalizer_action_model.inv(demo_obs) 90 | demo_nobs = self.ext_normalizer_action_model.inv(demo_nobs) 91 | 92 | # make interpolation if needed 93 | if self._interpolate_expert_states: 94 | demo_obs = self.interpolate(demo_obs[:, self._state_mask], state[:, self._state_mask], 95 | mixing_coef=self._interpolation_coef) 96 | demo_act = self.interpolate(demo_act, action, 97 | mixing_coef=self._interpolation_coef) 98 | 99 | # prepare data for IQ update 100 | input_states = to_float_tensor(np.concatenate([state, 101 | demo_obs.astype(np.float32)[:, self._state_mask]])) 102 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 103 | input_n_states = to_float_tensor(np.concatenate([next_state, 104 | demo_nobs.astype(np.float32)[:, self._state_mask]])) 105 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 106 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 107 | torch.ones(len(state), dtype=torch.bool)]) 108 | 109 | # make IQ update 110 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 111 | 112 | self._iter += 1 113 | self.policy.iter += 1 114 | 115 | def _get_noise(self, x): 116 | noise = np.random.normal(loc=0.0, scale=self._action_model_noise_std, 117 | size=np.size(x)).reshape(x.shape) 118 | noise = np.clip(noise, -self._action_model_noise_clip, self._action_model_noise_clip) \ 119 | if self._action_model_noise_clip is not None else noise 120 | return noise 121 | 122 | def interpolate(self, expert_data, policy_data, mixing_coef=None): 123 | interpolated = mixing_coef * expert_data + (1 - mixing_coef) * policy_data 124 | return interpolated 125 | 126 | def train_action_model(self, init=False): 127 | 128 | if init and self._action_model_fit_params["init_epochs"] > 0: 129 | n_epochs = self._action_model_fit_params["init_epochs"] 130 | # initialize the model 131 | state, action, _, next_state, _, _ = self._replay_memory.get(self._replay_memory.size) 132 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 133 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 134 | state_train = state[0:int(len(state)*0.9), :] 135 | state_val = state[int(len(state)*0.9):, :] 136 | next_state_train = next_state[0:int(len(next_state)*0.9), :] 137 | next_state_val = next_state[int(len(next_state)*0.9):, :] 138 | action_train = action[0:int(len(next_state)*0.9), :] 139 | action_val = action[int(len(next_state)*0.9):, :] 140 | state_nstate_train = np.concatenate([state_train, next_state_train], axis=1) 141 | state_nstate_val = np.concatenate([state_val, next_state_val], axis=1) 142 | 143 | # make eval before training 144 | action_pred = self._action_model(state_nstate_val) 145 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 146 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 147 | print("Action Model Validation Loss before training: ", loss) 148 | action_pred = self._action_model(state_nstate_train) 149 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 150 | print("Action Model Training Loss before training: ", loss) 151 | w = self._action_model.get_weights() 152 | norm = np.linalg.norm(w) 153 | self.sw_add_scalar("Action-Model/Norm", norm, self._iter) 154 | 155 | # make training 156 | self._action_model.fit(state_nstate_train, action_train, n_epochs=n_epochs) 157 | 158 | # make eval after training 159 | action_pred = self._action_model(state_nstate_val) 160 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 161 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 162 | print("Action Model Validation Loss After training: ", loss) 163 | action_pred = self._action_model(state_nstate_train) 164 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 165 | print("Action Model Validation Loss After training: ", loss) 166 | 167 | else: 168 | state_nstates = [] 169 | actions = [] 170 | for i in range(self._action_model_fit_params["fits_per_step"]): 171 | # sample batch from policy replay buffer 172 | state, action, reward, next_state, absorbing, _ = \ 173 | self._replay_memory.get(self._action_model_batch_size) 174 | 175 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 176 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 177 | self._action_model.fit(state, next_state, action) 178 | 179 | state_nstates.append([state, next_state]) 180 | actions.append(action) 181 | 182 | if self._iter % self._logging_iter == 0: 183 | 184 | # sample batch from policy replay buffer 185 | states, actions, rewards, next_states, absorbings, _ = \ 186 | self._replay_memory.get(self._action_model_batch_size) 187 | 188 | # we need to check if we have a dataset with expert actions available or not 189 | try: 190 | exp_states, exp_next_states, exp_actions = next( 191 | minibatch_generator(self._action_model_batch_size, 192 | self._demonstrations["states"], 193 | self._demonstrations["next_states"], 194 | self._demonstrations["actions"])) 195 | except KeyError: 196 | exp_states, exp_next_states = next(minibatch_generator(self._action_model_batch_size, 197 | self._demonstrations["states"], 198 | self._demonstrations["next_states"])) 199 | exp_actions = None 200 | 201 | # log mse 202 | action_pred = self._action_model(states[:, self._state_mask], next_states[:, self._state_mask]) 203 | mse = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(actions)) 204 | self.sw_add_scalar('Action-Model/Loss Policy', mse, self._iter) 205 | if exp_actions is not None: 206 | action_pred_exp = self._action_model(exp_states[:, self._state_mask], 207 | exp_next_states[:, self._state_mask]) 208 | mse_exp = F.mse_loss(to_float_tensor(action_pred_exp), to_float_tensor(exp_actions)) 209 | self.sw_add_scalar('Action-Model/Loss Exp', mse_exp, self._iter) 210 | 211 | # log entropy 212 | ent_plcy = self._action_model.entropy(states[:, self._state_mask], 213 | next_states[:, self._state_mask]) 214 | ent_exp = self._action_model.entropy(exp_states[:, self._state_mask], 215 | exp_next_states[:, self._state_mask]) 216 | self.sw_add_scalar('Action-Model/Entropy Plcy', ent_plcy, 217 | self._iter) 218 | self.sw_add_scalar('Action-Model/Entropy Exp', ent_exp, 219 | self._iter) 220 | 221 | # log mu, lam, alpha, beta 222 | if type(self._action_model) == GCPActionModel or type(self._action_model) == KLGCPActionModel: 223 | mu, lam, alpha, beta = self._action_model.get_prior_params(states[:, self._state_mask], 224 | next_states[:, self._state_mask]) 225 | self.sw_add_scalar('Action-Model/Mu', np.mean(mu.detach().cpu().numpy()), self._iter) 226 | self.sw_add_scalar('Action-Model/Lambda', np.mean(lam.detach().cpu().numpy()), self._iter) 227 | self.sw_add_scalar('Action-Model/Lambda Counter', self._action_model.lam_counter, self._iter) 228 | self.sw_add_scalar('Action-Model/Alpha', np.mean(alpha.detach().cpu().numpy()), self._iter) 229 | self.sw_add_scalar('Action-Model/Beta', np.mean(beta.detach().cpu().numpy()), self._iter) 230 | self.sw_add_scalar('Action-Model/Var', 231 | np.mean(self._action_model.get_corrected_pred_var(lam, 232 | alpha, 233 | beta).detach().cpu().numpy()), 234 | self._iter) 235 | mu_exp, lam_exp, alpha_exp, beta_exp = \ 236 | self._action_model.get_prior_params(exp_states[:, self._state_mask], 237 | exp_next_states[:, self._state_mask]) 238 | self.sw_add_scalar('Action-Model/Mu Exp', np.mean(mu_exp.detach().cpu().numpy()), self._iter) 239 | self.sw_add_scalar('Action-Model/Lambda Exp', np.mean(lam_exp.detach().cpu().numpy()), self._iter) 240 | self.sw_add_scalar('Action-Model/Alpha Exp', np.mean(alpha_exp.detach().cpu().numpy()), self._iter) 241 | self.sw_add_scalar('Action-Model/Beta Exp', np.mean(beta_exp.detach().cpu().numpy()), self._iter) 242 | self.sw_add_scalar('Action-Model/Var Exp', 243 | np.mean(self._action_model.get_corrected_pred_var(lam_exp, 244 | alpha_exp, 245 | beta_exp).detach().cpu().numpy()), 246 | self._iter) 247 | elif type(self._action_model) == GaussianInvActionModel or \ 248 | type(self._action_model) == KLGaussianInvActionModel: 249 | mu, log_sigma = self._action_model.get_mu_log_sigma(state[:, self._state_mask], 250 | next_state[:, self._state_mask]) 251 | mu_exp, log_sigma_exp = self._action_model.get_mu_log_sigma(exp_states.astype(np.float32)[:, self._state_mask], 252 | exp_next_states.astype(np.float32)[:, self._state_mask]) 253 | 254 | self._sw.add_scalar('Action-Model/Std Exp', torch.mean(torch.exp(log_sigma_exp)), self._iter) 255 | self._sw.add_scalar('Action-Model/Std', torch.mean(torch.exp(log_sigma)), self._iter) 256 | self._sw.add_scalar('Action-Model/Mu Exp', torch.mean(mu_exp), self._iter) 257 | self._sw.add_scalar('Action-Model/Mu', torch.mean(mu), self._iter) 258 | -------------------------------------------------------------------------------- /imitation_lib/imitation/offline/__init__.py: -------------------------------------------------------------------------------- 1 | from .iq_offline import IQ_Offline 2 | from .lsiq_offline import LSIQ_Offline 3 | from .lsiq_offline_dm import LSIQ_Offline_DM 4 | from .behavioral_cloning import BehavioralCloning -------------------------------------------------------------------------------- /imitation_lib/imitation/offline/behavioral_cloning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.nn import GaussianNLLLoss 4 | import torch.nn.functional as F 5 | from mushroom_rl.core import Agent 6 | from mushroom_rl.approximators import Regressor 7 | from mushroom_rl.utils.torch import to_float_tensor 8 | from mushroom_rl.utils.minibatches import minibatch_generator 9 | from mushroom_rl.approximators.parametric import TorchApproximator 10 | from imitation_lib.imitation.iq_sac import IQ_Learn_Policy 11 | 12 | 13 | class BehavioralCloning(Agent): 14 | 15 | def __init__(self, mdp_info, actor_params, actor_optimizer, demonstrations, log_std_min=-20, 16 | log_std_max=2, use_cuda=False, logging_iter=1, batch_size=32, sw=None): 17 | 18 | actor_approximator = Regressor(TorchApproximator, 19 | **actor_params) 20 | policy = IQ_Learn_Policy(actor_approximator, 21 | mdp_info.action_space.low, 22 | mdp_info.action_space.high, 23 | log_std_min, 24 | log_std_max) 25 | 26 | policy_parameters = actor_approximator.model.network.parameters() 27 | 28 | self._demonstrations = demonstrations 29 | self._optimizer = actor_optimizer['class'](policy_parameters, **actor_optimizer['params']) 30 | self._actor_loss = GaussianNLLLoss() 31 | self._use_cuda = use_cuda 32 | self._iter = 0 33 | self._batch_size = batch_size 34 | self._logging_iter = logging_iter 35 | 36 | if sw: 37 | self._sw = sw 38 | setattr(self._sw, '__deepcopy__', lambda self: None) # dont need to be copyable, causes pickle error otherwise 39 | 40 | super(BehavioralCloning, self).__init__(mdp_info, policy) 41 | 42 | def fit(self, dataset): 43 | raise AttributeError("This is a behavior cloning algorithms, which is meant to run offline. It is not supposed" 44 | "to use the fit function. Use the fit_offline function instead.") 45 | 46 | def fit_offline(self, n_steps): 47 | 48 | for i in range(n_steps): 49 | 50 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 51 | demo_obs, demo_act, demo_nobs, demo_absorbing = next(minibatch_generator(self._batch_size, 52 | self._demonstrations["states"], 53 | self._demonstrations["actions"], 54 | self._demonstrations["next_states"], 55 | self._demonstrations["absorbing"])) 56 | 57 | # prepare tensors 58 | states = to_float_tensor(demo_obs, self._use_cuda) \ 59 | if self._use_cuda else to_float_tensor(demo_obs) 60 | target_actions = to_float_tensor(demo_act, self._use_cuda) \ 61 | if self._use_cuda else to_float_tensor(demo_act) 62 | 63 | # do unsquashing of target actions 64 | central, delta = self.policy.get_central_delta() 65 | target_actions = torch.clip((target_actions - central) / delta, -1.0 + 1e-7, 1.0 - 1e-7) 66 | target_actions = torch.arctanh(target_actions) 67 | 68 | # predict mu and log_sigma 69 | mu, log_sigma = self.policy.get_mu_log_sigma(states) 70 | 71 | # calculate loss and do an optimizer step 72 | loss = self._actor_loss(input=mu, target=target_actions, var=torch.square(log_sigma.exp())) 73 | self._optimizer.zero_grad() 74 | loss.backward() 75 | self._optimizer.step() 76 | 77 | # make some logging 78 | self.logging(states, target_actions, loss, mu, log_sigma) 79 | 80 | self._iter += 1 81 | 82 | def logging(self, states, target_actions, loss, mu, log_sigma): 83 | # log some useful information 84 | if self._iter % self._logging_iter == 0: 85 | self.sw_add_scalar("GaussianNLLLoss", np.mean(loss.detach().cpu().numpy())) 86 | 87 | gauss_ent = self.policy.entropy_from_logsigma(log_sigma) 88 | self.sw_add_scalar("Squashed Gaussian Entropy", np.mean(gauss_ent.detach().cpu().numpy())) 89 | act, log_prob = self.policy.compute_action_and_log_prob(states) 90 | squashed_gauss_ent = -np.mean(log_prob) 91 | self.sw_add_scalar("Squashed Gaussian Entropy (Empirical)", squashed_gauss_ent) 92 | 93 | mse_loss = F.mse_loss(mu, target_actions) 94 | self.sw_add_scalar("MSELoss (between mean & target actions)", np.mean(mse_loss.detach().cpu().numpy())) 95 | 96 | def sw_add_scalar(self, name, val): 97 | if self._iter % self._logging_iter == 0: 98 | self._sw.add_scalar(name, val, self._iter) 99 | -------------------------------------------------------------------------------- /imitation_lib/imitation/offline/iq_offline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from mushroom_rl.utils.minibatches import minibatch_generator 4 | from mushroom_rl.utils.torch import to_float_tensor 5 | from imitation_lib.imitation.iq_sac import IQ_SAC 6 | 7 | 8 | class IQ_Offline(IQ_SAC): 9 | 10 | def __init__(self, **kwargs): 11 | 12 | if "regularizer_mode" in kwargs.keys(): 13 | if kwargs["regularizer_mode"] != "exp": 14 | raise ValueError("This is the offline implementation of IQ, which expects the regularizer to take only" 15 | "samples from the expert.") 16 | else: 17 | kwargs["regularizer_mode"] = "exp" 18 | if "plcy_loss_mode" in kwargs.keys(): 19 | if kwargs["plcy_loss_mode"] != "v0": 20 | raise ValueError("This is the offline implementation of IQ, which expects: plcy_loss_mode=\"v0\".") 21 | else: 22 | kwargs["plcy_loss_mode"] = "v0" 23 | 24 | super(IQ_Offline, self).__init__(**kwargs) 25 | 26 | def fit(self, dataset): 27 | raise AttributeError("This is the offline implementation of IQ, it is not supposed to use the fit function. " 28 | "Use the fit_offline function instead.") 29 | 30 | def fit_offline(self, n_steps): 31 | 32 | for i in range(n_steps): 33 | 34 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 35 | assert self._act_mask.size > 0, "IQ-Learn needs demo actions!" 36 | demo_obs, demo_act, demo_nobs, demo_absorbing = next(minibatch_generator(self._batch_size(), 37 | self._demonstrations["states"], 38 | self._demonstrations["actions"], 39 | self._demonstrations["next_states"], 40 | self._demonstrations["absorbing"])) 41 | 42 | # prepare data for IQ update 43 | input_states = to_float_tensor(demo_obs.astype(np.float32)[:, self._state_mask]) 44 | input_actions = to_float_tensor(demo_act.astype(np.float32)) 45 | input_n_states = to_float_tensor(demo_nobs.astype(np.float32)[:, self._state_mask]) 46 | input_absorbing = to_float_tensor(demo_absorbing.astype(np.float32)) 47 | is_expert = torch.ones(len(demo_obs), dtype=torch.bool) 48 | 49 | # make IQ update 50 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 51 | 52 | self._iter += 1 53 | self.policy.iter += 1 54 | 55 | def _lossQ(self, obs, act, next_obs, absorbing, is_expert): 56 | """ 57 | Main contribution of the IQ-learn paper. This function is based on the repository of the paper: 58 | https://github.com/Div99/IQ-Learn 59 | """ 60 | # Calculate 1st term of loss: -E_(ρ_expert)[Q(s, a) - γV(s')] 61 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 62 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 63 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 64 | if not self._use_target: 65 | next_v = self.getV(next_obs) 66 | else: 67 | with torch.no_grad(): 68 | next_v = self.get_targetV(next_obs).detach() 69 | 70 | y = (1 - torch.unsqueeze(absorbing, 1)) * gamma.detach() * self._Q_Q_multiplier * next_v 71 | 72 | reward = (self._Q_Q_multiplier * current_Q - y) 73 | exp_reward = reward[is_expert] 74 | loss_term1 = -exp_reward.mean() 75 | 76 | # do the logging 77 | self.logging_loss(current_Q, y, reward, is_expert, obs, act, absorbing) 78 | 79 | # 2nd term for our loss (use expert and policy states): E_(ρ)[Q(s,a) - γV(s')] 80 | V = self._Q_Q_multiplier * self.getV(obs) 81 | value = (V - y) 82 | self.sw_add_scalar('V for policy on all states', self._Q_Q_multiplier * V.mean(), self._iter) 83 | value_loss = value 84 | if self._plcy_loss_mode == "value": 85 | loss_term2 = value_loss.mean() 86 | elif self._plcy_loss_mode == "value_expert": 87 | value_loss_exp = value_loss[is_expert] 88 | loss_term2 = value_loss_exp.mean() 89 | elif self._plcy_loss_mode == "value_policy": 90 | value_loss_plcy = value_loss[~is_expert] 91 | loss_term2 = value_loss_plcy.mean() 92 | elif self._plcy_loss_mode == "q_old_policy": 93 | reward = (current_Q - y) 94 | reward_plcy = reward[~is_expert] 95 | loss_term2 = reward_plcy.mean() 96 | elif self._plcy_loss_mode == "v0": 97 | value_loss_v0 = (1 - gamma.detach()) * self.getV(obs[is_expert]) 98 | loss_term2 = value_loss_v0.mean() 99 | else: 100 | raise ValueError("Undefined policy loss mode: %s" % self._plcy_loss_mode) 101 | 102 | # regularize 103 | if not self._use_target: 104 | next_v = self.getV(next_obs) 105 | else: 106 | with torch.no_grad(): 107 | next_v = self.get_targetV(next_obs).detach() 108 | # WARNING: TURNED OFF absorbing in regularization TODO: check if this works, if not go back 109 | # y = (1 - torch.unsqueeze(absorbing, 1)) * gamma.detach() * self._Q_Q_multiplier * next_v 110 | abs_mult = 1.0 if self._reg_no_absorbing else (1 - torch.unsqueeze(absorbing, 1)) 111 | y = abs_mult * gamma.detach() * self._Q_Q_multiplier * next_v 112 | current_Q = self._Q_Q_multiplier * self._critic_approximator(obs, act, output_tensor=True) 113 | if self._turn_off_reg_absorbing: 114 | reward = (1 - torch.unsqueeze(absorbing, 1)) * (current_Q - y) 115 | else: 116 | reward = current_Q - y 117 | 118 | reg_multiplier = (1.0 / (1 - gamma.detach())) if self._normalized_val_func else 1.0 119 | if self._regularizer_mode == "exp_and_plcy": 120 | chi2_loss = torch.tensor(self._reg_mult) * reg_multiplier * (torch.square(reward)).mean() 121 | elif self._regularizer_mode == "exp": 122 | chi2_loss = torch.tensor(self._reg_mult) * reg_multiplier * (torch.square(reward[is_expert])).mean() 123 | elif self._regularizer_mode == "plcy": 124 | chi2_loss = torch.tensor(self._reg_mult) * reg_multiplier * (torch.square(reward[~is_expert])).mean() 125 | elif self._regularizer_mode == "value": 126 | V = self._Q_Q_multiplier * self.getV(obs) 127 | value = (V - y) 128 | chi2_loss = torch.tensor(self._reg_mult) * reg_multiplier * (torch.square(value)).mean() 129 | elif self._regularizer_mode == "exp_and_plcy_and_value": 130 | V = self._Q_Q_multiplier * self.getV(obs[is_expert]) 131 | value = (V - y[is_expert]) 132 | reward = torch.concat([reward, value]) 133 | chi2_loss = torch.tensor(self._reg_mult) * reg_multiplier * (torch.square(reward)).mean() 134 | elif self._regularizer_mode == "off": 135 | chi2_loss = 0.0 136 | else: 137 | raise ValueError("Undefined regularizer mode %s." % (self._regularizer_mode)) 138 | 139 | # Add Q penalty TODO: maybe remove, since it did not work that great 140 | if self._use_Q_regularizer: 141 | loss_Q_pen = self._Q_reg_mult * torch.mean( 142 | torch.square(current_Q - torch.ones_like(current_Q) * self._Q_reg_target)) 143 | else: 144 | loss_Q_pen = 0.0 145 | 146 | # add gradient penalty if needed 147 | if self._gp_lambda > 0: 148 | with torch.no_grad(): 149 | act_plcy, _ = self.policy.compute_action_and_log_prob_t(obs[is_expert]) 150 | loss_gp = self._gradient_penalty(obs[is_expert], act[is_expert], 151 | obs[is_expert], act_plcy, self._gp_lambda) 152 | else: 153 | loss_gp = 0.0 154 | 155 | loss_Q = loss_term1 + loss_term2 + chi2_loss + loss_Q_pen + loss_gp 156 | self.update_Q_parameters(loss_Q) 157 | 158 | grads = [] 159 | for param in self._critic_approximator.model.network.parameters(): 160 | grads.append(param.grad.view(-1)) 161 | grads = torch.cat(grads) 162 | norm = grads.norm(dim=0, p=2) 163 | if self._iter % self._logging_iter == 0: 164 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. Q-parameters', norm, self._iter) 165 | 166 | return loss_term1, loss_term2, chi2_loss 167 | 168 | def iq_update(self, input_states, input_actions, input_n_states, input_absorbing, is_expert): 169 | 170 | # update Q function 171 | if self._iter % self._delay_Q == 0: 172 | self.update_Q_function(input_states, input_actions, input_n_states, input_absorbing, is_expert) 173 | 174 | # update policy 175 | if self._iter % self._delay_pi == 0: 176 | self.update_policy(input_states, is_expert) 177 | 178 | if self._iter % self._delay_Q == 0: 179 | self._update_target(self._critic_approximator, 180 | self._target_critic_approximator) 181 | 182 | def logging_loss(self, current_Q, y, reward, is_expert, obs, act, absorbing): 183 | 184 | if self._iter % self._logging_iter == 0: 185 | self.sw_add_scalar('Action-Value/Q for expert', self._Q_Q_multiplier * current_Q[is_expert].mean(), self._iter) 186 | self.sw_add_scalar('Action-Value/Q^2 for expert', self._Q_Q_multiplier * torch.square(current_Q[is_expert]).mean(), self._iter) 187 | self.sw_add_scalar('Action-Value/Reward_Expert', reward[is_expert].mean(), self._iter) 188 | 189 | Q_exp = current_Q[is_expert] 190 | Q_plcy = current_Q[~is_expert] 191 | abs_exp = absorbing[is_expert].bool() 192 | abs_plcy = absorbing[~is_expert].bool() 193 | self.sw_add_scalar('Action-Value/Q Absorbing state exp', torch.mean(Q_exp[abs_exp]), self._iter) 194 | self.sw_add_scalar('Action-Value/Q Absorbing state plcy', torch.mean(Q_plcy[abs_plcy]), self._iter) 195 | 196 | # norm 197 | w = self._critic_approximator.get_weights() 198 | self.sw_add_scalar("Action-Value/Norm of Q net: ",np.linalg.norm(w), self._iter) 199 | self.sw_add_scalar('Targets/expert data', y[is_expert].mean(), self._iter) 200 | # log mean squared action 201 | self.sw_add_scalar('Actions/mean squared action expert (from data)', torch.square(act[is_expert]).mean(), self._iter) 202 | self.sw_add_scalar('Actions/mean squared action expert (from policy)', np.square(self.policy.draw_action(obs[is_expert])).mean(), self._iter) 203 | 204 | # log mean of each action 205 | n_actions = len(act[0]) 206 | for i in range(n_actions): 207 | self.sw_add_scalar('All Actions means/action %d expert' % i, act[is_expert, i].mean(), 208 | self._iter) 209 | self.sw_add_scalar('All Actions variances/action %d expert' % i, torch.var(act[is_expert, i]), 210 | self._iter) 211 | self.sw_add_scalar('All Actions mins/action %d expert' % i, torch.min(act[is_expert, i]), 212 | self._iter) 213 | self.sw_add_scalar('All Actions mins/action %d expert' % i, torch.min(act[is_expert, i]), 214 | self._iter) 215 | self.sw_add_scalar('All Actions maxs/action %d expert' % i, torch.max(act[is_expert, i]), 216 | self._iter) 217 | -------------------------------------------------------------------------------- /imitation_lib/imitation/offline/lsiq_offline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.functional import F 3 | import numpy as np 4 | from mushroom_rl.utils.minibatches import minibatch_generator 5 | from mushroom_rl.utils.torch import to_float_tensor 6 | from imitation_lib.imitation.lsiq import LSIQ 7 | 8 | 9 | class LSIQ_Offline(LSIQ): 10 | 11 | def __init__(self, loss_mode_exp="fix", regularizer_mode="off", **kwargs): 12 | 13 | if "plcy_loss_mode" in kwargs.keys(): 14 | if kwargs["plcy_loss_mode"] != "v0": 15 | raise ValueError("This is the offline implementation of IQ, which expects: plcy_loss_mode=\"v0\".") 16 | else: 17 | kwargs["plcy_loss_mode"] = "v0" 18 | 19 | super().__init__(loss_mode_exp=loss_mode_exp, regularizer_mode=regularizer_mode, **kwargs) 20 | 21 | def fit(self, dataset): 22 | raise AttributeError("This is the offline implementation of IQ, it is not supposed to use the fit function. " 23 | "Use the fit_offline function instead.") 24 | 25 | def fit_offline(self, n_steps): 26 | 27 | for i in range(n_steps): 28 | 29 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 30 | assert self._act_mask.size > 0, "IQ-Learn needs demo actions!" 31 | demo_obs, demo_act, demo_nobs, demo_absorbing = next(minibatch_generator(self._batch_size(), 32 | self._demonstrations["states"], 33 | self._demonstrations["actions"], 34 | self._demonstrations["next_states"], 35 | self._demonstrations["absorbing"])) 36 | 37 | # prepare data for IQ update 38 | input_states = to_float_tensor(demo_obs.astype(np.float32)[:, self._state_mask], self._use_cuda) 39 | input_actions = to_float_tensor(demo_act.astype(np.float32), self._use_cuda) 40 | input_n_states = to_float_tensor(demo_nobs.astype(np.float32)[:, self._state_mask], self._use_cuda) 41 | input_absorbing = to_float_tensor(demo_absorbing.astype(np.float32), self._use_cuda) 42 | is_expert = torch.ones(len(demo_obs), dtype=torch.bool).cuda() if self._use_cuda else torch.ones(len(demo_obs), dtype=torch.bool) 43 | 44 | # make IQ update 45 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 46 | 47 | self._iter += 1 48 | self.policy.iter += 1 49 | 50 | def _lossQ(self, obs, act, next_obs, absorbing, is_expert): 51 | 52 | # Calculate 1st term of loss: -E_(ρ_expert)[Q(s, a) - γV(s')] 53 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 54 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 55 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 56 | if not self._use_target: 57 | next_v = self.getV(next_obs) 58 | else: 59 | with torch.no_grad(): 60 | next_v = self.get_targetV(next_obs).detach() 61 | absorbing = torch.unsqueeze(absorbing, 1) 62 | y = (1 - absorbing) * gamma.detach() * self._Q_Q_multiplier * torch.clip(next_v, self._Q_min, self._Q_max) 63 | 64 | reward = (self._Q_Q_multiplier*current_Q - y) 65 | exp_reward = reward[is_expert] 66 | 67 | if self._loss_mode_exp == "bootstrap": 68 | loss_term1 = - exp_reward.mean() 69 | elif self._loss_mode_exp == "fix": 70 | if self._Q_exp_loss == "MSE": 71 | loss_term1 = F.mse_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 72 | elif self._Q_exp_loss == "Huber": 73 | loss_term1 = F.huber_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 74 | elif self._Q_exp_loss is None: 75 | raise ValueError("If you choose loss_mode_exp == fix, you have to specify Q_exp_loss. Setting it to" 76 | "None is not valid.") 77 | else: 78 | raise ValueError( 79 | "Choosen Q_exp_loss %s is not supported. Choose either MSE or Huber." % self._Q_exp_loss) 80 | 81 | # do the logging 82 | self.logging_loss(current_Q, y, reward, is_expert, obs, act, absorbing) 83 | 84 | # 2nd term for our loss (use expert and policy states): E_(ρ)[Q(s,a) - γV(s')] 85 | V = self._Q_Q_multiplier * self.getV(obs) 86 | value = (V - y) 87 | self.sw_add_scalar('V for policy on all states', self._Q_Q_multiplier * V.mean(), self._iter) 88 | value_loss = value 89 | if self._plcy_loss_mode == "value": 90 | loss_term2 = value_loss.mean() 91 | elif self._plcy_loss_mode == "value_expert": 92 | value_loss_exp = value_loss[is_expert] 93 | loss_term2 = value_loss_exp.mean() 94 | elif self._plcy_loss_mode == "value_policy": 95 | value_loss_plcy = value_loss[~is_expert] 96 | loss_term2 = value_loss_plcy.mean() 97 | elif self._plcy_loss_mode == "q_old_policy": 98 | reward_plcy = reward[~is_expert] 99 | loss_term2 = reward_plcy.mean() 100 | elif self._plcy_loss_mode == "value_q_old_policy": 101 | reward_plcy = reward[~is_expert] 102 | loss_term2 = reward_plcy.mean() + value_loss.mean() 103 | elif self._plcy_loss_mode == "v0": 104 | #value_loss_v0 = (1-gamma.detach()) * self.getV(obs[is_expert]) 105 | value_loss_v0 = (1-gamma.detach()) * self.getV(obs[is_expert]) 106 | loss_term2 = value_loss_v0.mean() 107 | elif self._plcy_loss_mode == "off": 108 | loss_term2 = 0.0 109 | else: 110 | raise ValueError("Undefined policy loss mode: %s" % self._plcy_loss_mode) 111 | 112 | # regularize 113 | if self._regularizer_mode == "exp_and_plcy": 114 | chi2_loss = ((1 - absorbing) * torch.tensor(self._reg_mult) * torch.square(reward) 115 | + self._abs_mult * absorbing * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 116 | * (torch.square(reward))).mean() 117 | elif self._regularizer_mode == "exp": 118 | chi2_loss = ((1 - absorbing[is_expert]) * torch.tensor(self._reg_mult) * torch.square(reward[is_expert]) 119 | + self._abs_mult * absorbing[is_expert] * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 120 | * (torch.square(reward[is_expert]))).mean() 121 | elif self._regularizer_mode == "plcy": 122 | chi2_loss = ((1 - absorbing[~is_expert]) * torch.tensor(self._reg_mult) * torch.square(reward[~is_expert]) 123 | + self._abs_mult * absorbing[~is_expert] * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 124 | * (torch.square(reward[~is_expert]))).mean() 125 | elif self._regularizer_mode == "value": 126 | V = self._Q_Q_multiplier * self.getV(obs) 127 | value = (V - y) 128 | chi2_loss = torch.tensor(self._reg_mult) * (torch.square(value)).mean() 129 | elif self._regularizer_mode == "exp_and_plcy_and_value": 130 | V = self._Q_Q_multiplier * self.getV(obs[is_expert]) 131 | value = (V - y[is_expert]) 132 | reward = torch.concat([reward, value]) 133 | chi2_loss = torch.tensor(self._reg_mult) * (torch.square(reward)).mean() 134 | elif self._regularizer_mode == "off": 135 | chi2_loss = 0.0 136 | else: 137 | raise ValueError("Undefined regularizer mode %s." % (self._regularizer_mode)) 138 | 139 | # add gradient penalty if needed 140 | if self._gp_lambda > 0: 141 | with torch.no_grad(): 142 | act_plcy, _ = self.policy.compute_action_and_log_prob_t(obs[is_expert]) 143 | loss_gp = self._gradient_penalty(obs[is_expert], act[is_expert], 144 | obs[is_expert], act_plcy, self._gp_lambda) 145 | else: 146 | loss_gp = 0.0 147 | 148 | loss_Q = loss_term1 + loss_term2 + chi2_loss + loss_gp 149 | self.update_Q_parameters(loss_Q) 150 | 151 | grads = [] 152 | for param in self._critic_approximator.model.network.parameters(): 153 | grads.append(param.grad.view(-1)) 154 | grads = torch.cat(grads) 155 | norm = grads.norm(dim=0, p=2) 156 | if self._iter % self._logging_iter == 0: 157 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. Q-parameters', norm, self._iter) 158 | 159 | return loss_term1, loss_term2, chi2_loss 160 | 161 | def iq_update(self, input_states, input_actions, input_n_states, input_absorbing, is_expert): 162 | 163 | # update Q function 164 | if self._iter % self._delay_Q == 0: 165 | self.update_Q_function(input_states, input_actions, input_n_states, input_absorbing, is_expert) 166 | 167 | # update policy 168 | if self._iter % self._delay_pi == 0: 169 | self.update_policy(input_states, input_actions, is_expert) 170 | 171 | if self._iter % self._delay_Q == 0: 172 | self._update_target(self._critic_approximator, 173 | self._target_critic_approximator) 174 | 175 | def update_policy(self, input_states, input_actions, is_expert): 176 | 177 | if self._train_policy_only_on_own_states: 178 | policy_training_states = input_states[~is_expert] 179 | else: 180 | policy_training_states = input_states 181 | action_new, log_prob = self.policy.compute_action_and_log_prob_t(policy_training_states) 182 | loss = self._actor_loss(policy_training_states, action_new, log_prob) 183 | 184 | self._optimize_actor_parameters(loss) 185 | grads = [] 186 | for param in self.policy._approximator.model.network.parameters(): 187 | grads.append(param.grad.view(-1)) 188 | grads = torch.cat(grads) 189 | norm = grads.norm(dim=0, p=2) 190 | if self._iter % self._logging_iter == 0: 191 | self.sw_add_scalar('Gradients/Norm2 Gradient Q wrt. Pi-parameters', norm, 192 | self._iter) 193 | self.sw_add_scalar('Actor/Loss', loss, self._iter) 194 | _, log_prob = self.policy.compute_action_and_log_prob_t(input_states) 195 | self.sw_add_scalar('Actor/Entropy Expert States', torch.mean(-log_prob[is_expert]).detach().item(), 196 | self._iter) 197 | self.sw_add_scalar('Actor/Entropy Policy States', torch.mean(-log_prob[~is_expert]).detach().item(), 198 | self._iter) 199 | _, logsigma = self.policy.get_mu_log_sigma(input_states[~is_expert]) 200 | ent_gauss = self.policy.entropy_from_logsigma(logsigma) 201 | e_lb = self.policy.get_e_lb() 202 | self.sw_add_scalar('Actor/Entropy from Gaussian Policy States', torch.mean(ent_gauss).detach().item(), 203 | self._iter) 204 | self.sw_add_scalar('Actor/Entropy Lower Bound', e_lb, self._iter) 205 | _, logsigma = self.policy.get_mu_log_sigma(input_states[is_expert]) 206 | ent_gauss = self.policy.entropy_from_logsigma(logsigma) 207 | self.sw_add_scalar('Actor/Entropy from Gaussian Expert States', torch.mean(ent_gauss).detach().item(), 208 | self._iter) 209 | if self._learnable_alpha: 210 | self._update_alpha(log_prob.detach()) 211 | 212 | def logging_loss(self, current_Q, y, reward, is_expert, obs, act, absorbing): 213 | 214 | if self._iter % self._logging_iter == 0: 215 | self.sw_add_scalar('Action-Value/Q for expert', self._Q_Q_multiplier * current_Q[is_expert].mean(), self._iter) 216 | self.sw_add_scalar('Action-Value/Q^2 for expert', self._Q_Q_multiplier * torch.square(current_Q[is_expert]).mean(), self._iter) 217 | self.sw_add_scalar('Action-Value/Reward_Expert', reward[is_expert].mean(), self._iter) 218 | 219 | Q_exp = current_Q[is_expert] 220 | Q_plcy = current_Q[~is_expert] 221 | abs_exp = absorbing[is_expert].bool() 222 | abs_plcy = absorbing[~is_expert].bool() 223 | self.sw_add_scalar('Action-Value/Q Absorbing state exp', torch.mean(Q_exp[abs_exp]), self._iter) 224 | self.sw_add_scalar('Action-Value/Q Absorbing state plcy', torch.mean(Q_plcy[abs_plcy]), self._iter) 225 | 226 | # norm 227 | w = self._critic_approximator.get_weights() 228 | self.sw_add_scalar("Action-Value/Norm of Q net: ",np.linalg.norm(w), self._iter) 229 | self.sw_add_scalar('Targets/expert data', y[is_expert].mean(), self._iter) 230 | # log mean squared action 231 | self.sw_add_scalar('Actions/mean squared action expert (from data)', torch.square(act[is_expert]).mean(), self._iter) 232 | self.sw_add_scalar('Actions/mean squared action expert (from policy)', np.square(self.policy.draw_action(obs[is_expert])).mean(), self._iter) 233 | 234 | # log mean of each action 235 | n_actions = len(act[0]) 236 | for i in range(n_actions): 237 | self.sw_add_scalar('All Actions means/action %d expert' % i, act[is_expert, i].mean(), 238 | self._iter) 239 | self.sw_add_scalar('All Actions variances/action %d expert' % i, torch.var(act[is_expert, i]), 240 | self._iter) 241 | self.sw_add_scalar('All Actions mins/action %d expert' % i, torch.min(act[is_expert, i]), 242 | self._iter) 243 | self.sw_add_scalar('All Actions mins/action %d expert' % i, torch.min(act[is_expert, i]), 244 | self._iter) 245 | self.sw_add_scalar('All Actions maxs/action %d expert' % i, torch.max(act[is_expert, i]), 246 | self._iter) 247 | -------------------------------------------------------------------------------- /imitation_lib/imitation/offline/lsiq_offline_dm.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import torch 3 | from torch.functional import F 4 | import numpy as np 5 | from mushroom_rl.utils.minibatches import minibatch_generator 6 | from mushroom_rl.utils.torch import to_float_tensor 7 | from mushroom_rl.approximators import Regressor 8 | from mushroom_rl.approximators.parametric import TorchApproximator 9 | from imitation_lib.imitation.lsiq import LSIQ 10 | 11 | 12 | class LSIQ_Offline_DM(LSIQ): 13 | 14 | def __init__(self, dynamics_model_params, dynamics_model_init_epochs=250, 15 | random_demonstrations=None, loss_mode_exp="fix", regularizer_mode="off", **kwargs): 16 | 17 | super().__init__(loss_mode_exp=loss_mode_exp, regularizer_mode=regularizer_mode, **kwargs) 18 | 19 | self._dynamics_model = Regressor(TorchApproximator, **dynamics_model_params) 20 | self._dynamics_model_init_epochs = dynamics_model_init_epochs 21 | self._dynamics_model_initialized = False 22 | 23 | expert_demonstrations = deepcopy(kwargs["demonstrations"]) 24 | if random_demonstrations is not None: 25 | self._dynamics_model_training_data = dict() 26 | for key, value in expert_demonstrations.items(): 27 | if key != "episode_starts": 28 | self._dynamics_model_training_data[key] = np.concatenate([value, random_demonstrations[key]]) 29 | self.add_dataset_to_replay_memory(random_demonstrations) 30 | else: 31 | self._dynamics_model_training_data = expert_demonstrations 32 | 33 | low, high = self.mdp_info.observation_space.low.copy(),\ 34 | self.mdp_info.observation_space.high.copy() 35 | self.norm_act_mean = (high + low) / 2.0 36 | self.norm_act_delta = (high - low) / 2.0 37 | 38 | self._state = None 39 | self._idx_state = 0 40 | self._max_traj_len = 200 41 | 42 | def fit(self, dataset): 43 | raise AttributeError("This is the offline implementation of IQ, it is not supposed to use the fit function. " 44 | "Use the fit_offline function instead.") 45 | 46 | def fit_offline(self, n_steps): 47 | 48 | if not self._dynamics_model_initialized: 49 | self.fit_dynamics_model(self._dynamics_model_init_epochs) 50 | self.predict_trajectories_and_add_to_replay_buffer(100, 100) 51 | self._dynamics_model_initialized = True 52 | #else: 53 | # self.fit_dynamics_model(1) 54 | 55 | 56 | for i in range(n_steps): 57 | 58 | self.add_next_step_to_buffer() 59 | 60 | # sample batch from policy replay buffer 61 | state, action, reward, next_state, absorbing, _ = \ 62 | self._replay_memory.get(self._batch_size()) 63 | 64 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 65 | assert self._act_mask.size > 0, "IQ-Learn needs demo actions!" 66 | demo_obs, demo_act, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 67 | self._demonstrations["states"], 68 | self._demonstrations["actions"], 69 | self._demonstrations[ 70 | "next_states"], 71 | self._demonstrations["absorbing"])) 72 | 73 | # prepare data for IQ update 74 | input_states = to_float_tensor(np.concatenate([state, demo_obs.astype(np.float32)[:, self._state_mask]])) 75 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 76 | input_n_states = to_float_tensor(np.concatenate([next_state, 77 | demo_nobs.astype(np.float32)[:, self._state_mask]])) 78 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 79 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 80 | torch.ones(len(state), dtype=torch.bool)]) 81 | 82 | # make IQ update 83 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 84 | 85 | self._iter += 1 86 | self.policy.iter += 1 87 | 88 | def _lossQ(self, obs, act, next_obs, absorbing, is_expert): 89 | 90 | # Calculate 1st term of loss: -E_(ρ_expert)[Q(s, a) - γV(s')] 91 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 92 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 93 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 94 | if not self._use_target: 95 | next_v = self.getV(next_obs) 96 | else: 97 | with torch.no_grad(): 98 | next_v = self.get_targetV(next_obs).detach() 99 | absorbing = torch.unsqueeze(absorbing, 1) 100 | y = (1 - absorbing) * gamma.detach() * self._Q_Q_multiplier * torch.clip(next_v, self._Q_min, self._Q_max) 101 | 102 | reward = (self._Q_Q_multiplier*current_Q - y) 103 | exp_reward = reward[is_expert] 104 | 105 | #if self._loss_mode_exp == "bootstrap": # todo: remove this was just for testing 106 | # loss_term1 = F.mse_loss(current_Q[is_expert], 107 | # torch.ones_like(current_Q[is_expert]) * (1.0/self._reg_mult) + gamma.detach() * current_Q[is_expert].detach().cpu()) 108 | if self._loss_mode_exp == "bootstrap": 109 | loss_term1 = - exp_reward.mean() 110 | elif self._loss_mode_exp == "fix": 111 | if self._Q_exp_loss == "MSE": 112 | loss_term1 = F.mse_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 113 | elif self._Q_exp_loss == "Huber": 114 | loss_term1 = F.huber_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 115 | elif self._Q_exp_loss is None: 116 | raise ValueError("If you choose loss_mode_exp == fix, you have to specify Q_exp_loss. Setting it to" 117 | "None is not valid.") 118 | else: 119 | raise ValueError( 120 | "Choosen Q_exp_loss %s is not supported. Choose either MSE or Huber." % self._Q_exp_loss) 121 | 122 | # do the logging 123 | self.logging_loss(current_Q, y, reward, is_expert, obs, act, absorbing) 124 | 125 | # 2nd term for our loss (use expert and policy states): E_(ρ)[Q(s,a) - γV(s')] 126 | V = self._Q_Q_multiplier * self.getV(obs) 127 | value = (V - y) 128 | self.sw_add_scalar('V for policy on all states', self._Q_Q_multiplier * V.mean(), self._iter) 129 | value_loss = value 130 | if self._plcy_loss_mode == "value": 131 | loss_term2 = value_loss.mean() 132 | elif self._plcy_loss_mode == "value_expert": 133 | value_loss_exp = value_loss[is_expert] 134 | loss_term2 = value_loss_exp.mean() 135 | elif self._plcy_loss_mode == "value_policy": 136 | value_loss_plcy = value_loss[~is_expert] 137 | loss_term2 = value_loss_plcy.mean() 138 | elif self._plcy_loss_mode == "q_old_policy": 139 | reward_plcy = reward[~is_expert] 140 | loss_term2 = reward_plcy.mean() 141 | elif self._plcy_loss_mode == "value_q_old_policy": 142 | reward_plcy = reward[~is_expert] 143 | loss_term2 = reward_plcy.mean() + value_loss.mean() 144 | elif self._plcy_loss_mode == "v0": 145 | value_loss_v0 = (1-gamma.detach()) * self.getV(obs[is_expert]) 146 | loss_term2 = value_loss_v0.mean() 147 | elif self._plcy_loss_mode == "off": 148 | loss_term2 = 0.0 149 | else: 150 | raise ValueError("Undefined policy loss mode: %s" % self._plcy_loss_mode) 151 | 152 | # regularize 153 | if self._regularizer_mode == "exp_and_plcy": 154 | chi2_loss = ((1 - absorbing) * torch.tensor(self._reg_mult) * torch.square(reward) 155 | + self._abs_mult * absorbing * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 156 | * (torch.square(reward))).mean() 157 | elif self._regularizer_mode == "exp": 158 | chi2_loss = ((1 - absorbing[is_expert]) * torch.tensor(self._reg_mult) * torch.square(reward[is_expert]) 159 | + self._abs_mult * absorbing[is_expert] * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 160 | * (torch.square(reward[is_expert]))).mean() 161 | elif self._regularizer_mode == "plcy": 162 | chi2_loss = ((1 - absorbing[~is_expert]) * torch.tensor(self._reg_mult) * torch.square(reward[~is_expert]) 163 | + self._abs_mult * absorbing[~is_expert] * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 164 | * (torch.square(reward[~is_expert]))).mean() 165 | elif self._regularizer_mode == "value": 166 | V = self._Q_Q_multiplier * self.getV(obs) 167 | value = (V - y) 168 | chi2_loss = torch.tensor(self._reg_mult) * (torch.square(value)).mean() 169 | elif self._regularizer_mode == "exp_and_plcy_and_value": 170 | V = self._Q_Q_multiplier * self.getV(obs[is_expert]) 171 | value = (V - y[is_expert]) 172 | reward = torch.concat([reward, value]) 173 | chi2_loss = torch.tensor(self._reg_mult) * (torch.square(reward)).mean() 174 | elif self._regularizer_mode == "off": 175 | chi2_loss = 0.0 176 | else: 177 | raise ValueError("Undefined regularizer mode %s." % (self._regularizer_mode)) 178 | 179 | # add gradient penalty if needed 180 | if self._gp_lambda > 0: 181 | with torch.no_grad(): 182 | act_plcy, _ = self.policy.compute_action_and_log_prob_t(obs[is_expert]) 183 | loss_gp = self._gradient_penalty(obs[is_expert], act[is_expert], 184 | obs[is_expert], act_plcy, self._gp_lambda) 185 | else: 186 | loss_gp = 0.0 187 | 188 | loss_Q = loss_term1 + loss_term2 + chi2_loss + loss_gp 189 | self.update_Q_parameters(loss_Q) 190 | 191 | grads = [] 192 | for param in self._critic_approximator.model.network.parameters(): 193 | grads.append(param.grad.view(-1)) 194 | grads = torch.cat(grads) 195 | norm = grads.norm(dim=0, p=2) 196 | if self._iter % self._logging_iter == 0: 197 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. Q-parameters', norm, self._iter) 198 | 199 | return loss_term1, loss_term2, chi2_loss 200 | 201 | def iq_update(self, input_states, input_actions, input_n_states, input_absorbing, is_expert): 202 | 203 | # update Q function 204 | if self._iter % self._delay_Q == 0: 205 | self.update_Q_function(input_states, input_actions, input_n_states, input_absorbing, is_expert) 206 | 207 | # update policy 208 | if self._iter % self._delay_pi == 0: 209 | self.update_policy(input_states, is_expert) 210 | 211 | if self._iter % self._delay_Q == 0: 212 | self._update_target(self._critic_approximator, 213 | self._target_critic_approximator) 214 | 215 | def fit_dynamics_model(self, n_epochs=1): 216 | 217 | states = self._dynamics_model_training_data["states"] 218 | actions = self._dynamics_model_training_data["actions"] 219 | inputs = np.concatenate([states, actions], axis=1) 220 | targets = self._dynamics_model_training_data["next_states"] 221 | 222 | # normalize targets 223 | targets = (targets - self.norm_act_mean) / self.norm_act_delta 224 | 225 | self._dynamics_model.fit(inputs, targets, n_epochs=n_epochs) 226 | 227 | preds = self._dynamics_model.predict(inputs) 228 | loss = F.mse_loss(to_float_tensor(preds), to_float_tensor(targets)) 229 | self.sw_add_scalar("Forward_DM/Loss", torch.mean(loss), self._iter) 230 | print("Loss", torch.mean(loss).detach().cpu().numpy()) 231 | 232 | def add_next_step_to_buffer(self): 233 | 234 | if self._idx_state >= self._max_traj_len or self._state is None: 235 | init_state_idx = np.random.randint(len(self._dynamics_model_training_data["states"])*0.8) 236 | self._state = self._dynamics_model_training_data["states"][init_state_idx] 237 | self._idx_state = 0 238 | 239 | action = self.policy.draw_action(self._state) 240 | action = np.clip(action, self.mdp_info.action_space.low, self.mdp_info.action_space.high) 241 | inputs = np.concatenate([self._state, action]) 242 | next_state = self._dynamics_model.predict(inputs) 243 | 244 | # unnormalize next state 245 | next_state = (next_state * self.norm_act_delta) + self.norm_act_mean 246 | 247 | self._replay_memory.add([[self._state, action, 0.0, next_state, 0, 0]]) 248 | 249 | self._state = next_state 250 | self._idx_state += 1 251 | 252 | 253 | def predict_trajectories_and_add_to_replay_buffer(self, n_trajec, trajec_len): 254 | 255 | for i in range(n_trajec): 256 | # get initial state 257 | init_state_idx = np.random.randint(len(self._dynamics_model_training_data["states"])*0.8) 258 | state = self._dynamics_model_training_data["states"][init_state_idx] 259 | for j in range(trajec_len): 260 | action = self.policy.draw_action(state) 261 | action = np.clip(action, self.mdp_info.action_space.low, self.mdp_info.action_space.high) 262 | inputs = np.concatenate([state, action]) 263 | next_state = self._dynamics_model.predict(inputs) 264 | 265 | # unnormalize next state 266 | next_state = (next_state * self.norm_act_delta) + self.norm_act_mean 267 | 268 | self._replay_memory.add([[state, action, 0.0, next_state, 0, 0]]) 269 | 270 | state = next_state 271 | 272 | def add_dataset_to_replay_memory(self, dataset): 273 | 274 | for i in range(len(dataset["states"])): 275 | self._replay_memory.add([[dataset["states"][i], dataset["actions"][i], dataset["rewards"][i], 276 | dataset["next_states"][i], dataset["absorbing"][i], dataset["last"][i]]]) 277 | -------------------------------------------------------------------------------- /imitation_lib/imitation/sqil_sac.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from .iq_sac import IQ_SAC 4 | from mushroom_rl.utils.torch import to_float_tensor 5 | 6 | 7 | class SQIL(IQ_SAC): 8 | 9 | def __init__(self, R_min=0.0, R_max=1.0, plcy_loss_mode="plcy", **kwargs): 10 | 11 | super(SQIL, self).__init__(plcy_loss_mode=plcy_loss_mode, **kwargs) 12 | 13 | self._R_min = R_min 14 | self._R_max = R_max 15 | 16 | def iq_update(self, input_states, input_actions, input_n_states, input_absorbing, is_expert): 17 | """ This function overrides the respective function from iq. It makes only slight changes. """ 18 | if self._iter % self._delay_Q == 0: 19 | lossQ = self._lossQ(input_states, input_actions, input_n_states, input_absorbing, 20 | is_expert) 21 | if self._iter % self._logging_iter == 0: 22 | self.sw_add_scalar('IQ-Loss/LossQ', lossQ, self._iter) 23 | 24 | # update policy 25 | if self._replay_memory.size > self._warmup_transitions() and self._iter % self._delay_pi == 0: 26 | if self._train_policy_only_on_own_states: 27 | policy_training_states = input_states[~is_expert] 28 | else: 29 | policy_training_states = input_states 30 | action_new, log_prob = self.policy.compute_action_and_log_prob_t(policy_training_states) 31 | loss = self._actor_loss(policy_training_states, action_new, log_prob) 32 | self._optimize_actor_parameters(loss) 33 | grads = [] 34 | for param in self.policy._approximator.model.network.parameters(): 35 | grads.append(param.grad.view(-1)) 36 | grads = torch.cat(grads) 37 | norm = grads.norm(dim=0, p=2) 38 | if self._iter % self._logging_iter == 0: 39 | self.sw_add_scalar('Gradients/Norm2 Gradient Q wrt. Pi-parameters', norm, 40 | self._iter) 41 | self.sw_add_scalar('Actor/Loss', loss, self._iter) 42 | _, log_prob = self.policy.compute_action_and_log_prob_t(input_states) 43 | self.sw_add_scalar('Actor/Entropy Expert States', torch.mean(-log_prob[is_expert]).detach().item(), 44 | self._iter) 45 | self.sw_add_scalar('Actor/Entropy Policy States', torch.mean(-log_prob[~is_expert]).detach().item(), 46 | self._iter) 47 | _, logsigma = self.policy.get_mu_log_sigma(input_states[~is_expert]) 48 | ent_gauss = self.policy.entropy_from_logsigma(logsigma) 49 | e_lb = self.policy.get_e_lb() 50 | self.sw_add_scalar('Actor/Entropy from Gaussian Policy States', torch.mean(ent_gauss).detach().item(), 51 | self._iter) 52 | self.sw_add_scalar('Actor/Entropy Lower Bound', e_lb, self._iter) 53 | _, logsigma = self.policy.get_mu_log_sigma(input_states[is_expert]) 54 | ent_gauss = self.policy.entropy_from_logsigma(logsigma) 55 | self.sw_add_scalar('Actor/Entropy from Gaussian Expert States', torch.mean(ent_gauss).detach().item(), 56 | self._iter) 57 | if self._learnable_alpha: 58 | self._update_alpha(log_prob.detach()) 59 | 60 | if self._iter % self._delay_Q == 0: 61 | self._update_target(self._critic_approximator, 62 | self._target_critic_approximator) 63 | 64 | def _lossQ(self, obs, act, next_obs, absorbing, is_expert): 65 | """ 66 | This function overrides the iq-loss and replaces it with the sqil loss. 67 | """ 68 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 69 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 70 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 71 | if not self._use_target: 72 | next_v = self.getV(next_obs) 73 | else: 74 | with torch.no_grad(): 75 | next_v = self.get_targetV(next_obs).detach() 76 | 77 | y = (1 - torch.unsqueeze(absorbing, 1)) * gamma.detach() * next_v 78 | 79 | # expert part of loss 80 | loss_Q = 0.5 * torch.mean(torch.square(current_Q[is_expert] - (self._R_max + y[is_expert]))) 81 | 82 | # plcy part of loss 83 | if self._plcy_loss_mode == "value": 84 | V = self.getV(obs) 85 | loss_Q += 0.5 * torch.mean(torch.square(V - (self._R_min + y))) 86 | elif self._plcy_loss_mode == "value_plcy": 87 | V = self.getV(obs) 88 | loss_Q += 0.5 * torch.mean(torch.square(V[~is_expert] - (self._R_min + y[~is_expert]))) 89 | elif self._plcy_loss_mode == "plcy": # this is the true sqil objective for the policy. 90 | loss_Q += 0.5 * torch.mean(torch.square(current_Q[~is_expert] - (self._R_min + y[~is_expert]))) 91 | 92 | loss_Q *= self._reg_mult 93 | 94 | if self._iter % self._logging_iter == 0: 95 | reward = (current_Q - y) 96 | self.sw_add_scalar('Action-Value/Q for expert', current_Q[is_expert].mean(), self._iter) 97 | self.sw_add_scalar('Action-Value/Q^2 for expert', torch.square(current_Q[is_expert]).mean(), self._iter) 98 | self.sw_add_scalar('Action-Value/Q for policy', current_Q[~is_expert].mean(), self._iter) 99 | self.sw_add_scalar('Action-Value/Q^2 for policy', torch.square(current_Q[~is_expert]).mean(), self._iter) 100 | self.sw_add_scalar('Action-Value/Reward', reward.mean(), self._iter) 101 | self.sw_add_scalar('Action-Value/Reward_Expert', reward[is_expert].mean(), self._iter) 102 | self.sw_add_scalar('Action-Value/Reward_Policy', reward[~is_expert].mean(), self._iter) 103 | self.sw_add_scalar('Action-Value/R_min', self._R_min, self._iter) 104 | # norm 105 | w = self._critic_approximator.get_weights() 106 | self.sw_add_scalar("Action-Value/Norm of Q net: ",np.linalg.norm(w), self._iter) 107 | self.sw_add_scalar('Targets/expert data', y[is_expert].mean(), self._iter) 108 | self.sw_add_scalar('Targets/policy data', y[~is_expert].mean(), self._iter) 109 | # log mean squared action 110 | self.sw_add_scalar('Actions/mean squared action expert (from data)', torch.square(act[is_expert]).mean(), self._iter) 111 | self.sw_add_scalar('Actions/mean squared action expert (from policy)', np.square(self.policy.draw_action(obs[is_expert])).mean(), self._iter) 112 | self.sw_add_scalar('Actions/mean squared action policy', torch.square(act[~is_expert]).mean(), self._iter) 113 | self.sw_add_scalar('Actions/mean squared action both', torch.square(act).mean(), self._iter) 114 | 115 | # log mean of each action 116 | n_actions = len(act[0]) 117 | for i in range(n_actions): 118 | self.sw_add_scalar('All Actions means/action %d expert' % i, act[is_expert, i].mean(), 119 | self._iter) 120 | self.sw_add_scalar('All Actions means/action %d policy' % i, act[~is_expert, i].mean(), 121 | self._iter) 122 | self.sw_add_scalar('All Actions variances/action %d expert' % i, torch.var(act[is_expert, i]), 123 | self._iter) 124 | self.sw_add_scalar('All Actions variances/action %d policy' % i, torch.var(act[~is_expert, i]), 125 | self._iter) 126 | 127 | self.update_Q_parameters(loss_Q) 128 | 129 | grads = [] 130 | for param in self._critic_approximator.model.network.parameters(): 131 | grads.append(param.grad.view(-1)) 132 | grads = torch.cat(grads) 133 | norm = grads.norm(dim=0, p=2) 134 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. Q-parameters', norm, self._iter) 135 | 136 | return loss_Q 137 | -------------------------------------------------------------------------------- /imitation_lib/imitation/vail_TRPO.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | import numpy as np 6 | 7 | from imitation_lib.imitation import GAIL_TRPO 8 | from imitation_lib.utils import to_float_tensors 9 | 10 | 11 | class VAIL(GAIL_TRPO): 12 | 13 | def __init__(self, **kwargs): 14 | 15 | # call base constructor 16 | super(VAIL, self).__init__(**kwargs) 17 | 18 | def discrim_output(self, *inputs, apply_mask=True): 19 | inputs = self.prepare_discrim_inputs(inputs, apply_mask=apply_mask) 20 | d_out,_ ,_ = self._D(*inputs) 21 | return d_out 22 | 23 | def _discriminator_logging(self, inputs, targets): 24 | super(VAIL, self)._discriminator_logging(inputs, targets) 25 | if self._sw: 26 | # calculate bottleneck loss 27 | loss = deepcopy(self._loss) 28 | d, mu, logvar = to_float_tensors(self._D(*inputs)) 29 | bottleneck_loss = loss.bottleneck_loss(mu, logvar) 30 | self._sw.add_scalar('Bottleneck_Loss', bottleneck_loss, self._iter // 3) 31 | self._sw.add_scalar('Beta', loss._beta, self._iter // 3) 32 | self._sw.add_scalar('Bottleneck_Loss_times_Beta', loss._beta * bottleneck_loss, self._iter // 3) 33 | 34 | -------------------------------------------------------------------------------- /imitation_lib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .math import * 2 | from .networks import * 3 | from .training import * 4 | from .preprocessor import * -------------------------------------------------------------------------------- /imitation_lib/utils/distributions.py: -------------------------------------------------------------------------------- 1 | from torch.distributions import constraints 2 | from torch.distributions.transforms import PowerTransform 3 | from torch.distributions import TransformedDistribution, Gamma 4 | 5 | 6 | 7 | class InverseGamma(TransformedDistribution): 8 | 9 | def __init__(self, concentration, rate, validate_args=None): 10 | base_dist = Gamma(concentration, rate) 11 | super().__init__( 12 | base_dist, 13 | PowerTransform(-base_dist.rate.new_ones(())), 14 | validate_args=validate_args, 15 | ) 16 | 17 | def expand(self, batch_shape, _instance=None): 18 | new = self._get_checked_instance(InverseGamma, _instance) 19 | return super().expand(batch_shape, _instance=new) 20 | 21 | @property 22 | def concentration(self): 23 | return self.base_dist.concentration 24 | 25 | @property 26 | def rate(self): 27 | return self.base_dist.rate 28 | -------------------------------------------------------------------------------- /imitation_lib/utils/math.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | from mushroom_rl.utils.angles import euler_to_quat 6 | 7 | 8 | from mushroom_rl.utils.torch import to_float_tensor 9 | 10 | 11 | class GailDiscriminatorLoss(torch.nn.modules.BCEWithLogitsLoss): 12 | 13 | def __init__(self, entcoeff=1e-3, weight: Optional[torch.Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean', 14 | pos_weight: Optional[torch.Tensor] = None) -> None: 15 | 16 | super(GailDiscriminatorLoss, self).__init__(weight, size_average, reduce, reduction, pos_weight) 17 | 18 | self.sigmoid = torch.nn.Sigmoid() 19 | self.logsigmoid = torch.nn.LogSigmoid() 20 | self.entcoeff = entcoeff 21 | 22 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 23 | # overrides original BCELoss 24 | # from tensorflow max(x, 0) - x * z + log(1 + exp(-abs(x))) 25 | bce_loss = torch.maximum(input, torch.zeros_like(input)) - input * target + torch.log(1 + torch.exp(-torch.abs(input))) 26 | bce_loss = torch.mean(bce_loss) 27 | 28 | bernoulli_ent = self.entcoeff * torch.mean(self.logit_bernoulli_entropy(input)) 29 | return bce_loss - bernoulli_ent 30 | 31 | def logit_bernoulli_entropy(self, logits): 32 | """ 33 | Adapted from: 34 | https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51 35 | """ 36 | return (1. - self.sigmoid(logits)) * logits - self.logsigmoid(logits) 37 | 38 | 39 | class VDBLoss(GailDiscriminatorLoss): 40 | 41 | def __init__(self, info_constraint, lr_beta, use_bernoulli_ent=False, entcoeff=1e-3, weight: Optional[torch.Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean', 42 | pos_weight: Optional[torch.Tensor] = None) -> None: 43 | 44 | # call base constructor 45 | super().__init__(entcoeff, weight, size_average, reduce, reduction, pos_weight) 46 | 47 | self._use_bernoulli_ent = use_bernoulli_ent 48 | self._info_constr = info_constraint 49 | self._lr_beta = lr_beta 50 | self._beta = 0.1 51 | 52 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 53 | logits, mu, logvar = input 54 | 55 | # bottleneck loss 56 | bottleneck_loss = self.bottleneck_loss(mu, logvar) 57 | 58 | # binary cross entropy loss 59 | bce_loss = F.binary_cross_entropy_with_logits(torch.squeeze(logits), torch.squeeze(target), self.weight, 60 | pos_weight=self.pos_weight, 61 | reduction=self.reduction) 62 | 63 | # optional, additional bernoulli entropy (as in gail, but this was not used in the paper) 64 | bernoulli_ent = self.logit_bernoulli_entropy(logits) if self._use_bernoulli_ent else torch.zeros_like(bce_loss) 65 | 66 | # overall vdb loss 67 | vdb_loss = bce_loss + self._beta * bottleneck_loss + bernoulli_ent 68 | 69 | # update beta 70 | self._update_beta(bottleneck_loss) 71 | 72 | return vdb_loss 73 | 74 | def bottleneck_loss(self, mu, logvar): 75 | kld = self.kl_divergence(mu, logvar).mean() 76 | bottleneck_loss = kld - self._info_constr 77 | return bottleneck_loss 78 | 79 | @torch.no_grad() 80 | def _update_beta(self, bottleneck_loss): 81 | self._beta = max(0, self._beta + self._lr_beta * bottleneck_loss) 82 | 83 | @staticmethod 84 | def kl_divergence(mu, logvar): 85 | kl_div = 0.5 * torch.sum(torch.pow(mu, 2) + torch.exp(logvar) - logvar - 1, dim=1) 86 | return kl_div 87 | 88 | 89 | def to_float_tensors(inputs): 90 | """ Takes a list or tuple of of numpy arrays and converts them to a list of torch tensors. If only an array is 91 | provided, it returns a torch tensor.""" 92 | if type(inputs) is not tuple and type(inputs) is not list: 93 | return to_float_tensor(inputs) 94 | else: 95 | out = [] 96 | for elem in inputs: 97 | out.append(to_float_tensor(elem)) 98 | return out 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /imitation_lib/utils/preprocessor.py: -------------------------------------------------------------------------------- 1 | from mushroom_rl.core import Serializable 2 | 3 | 4 | class MaskingPreprocessor(Serializable): 5 | 6 | def __init__(self, mask): 7 | self._mask = mask 8 | self._add_save_attr(_mask='primitive') 9 | 10 | def __call__(self, obs): 11 | masked_obs = obs[self._mask] 12 | return masked_obs 13 | -------------------------------------------------------------------------------- /imitation_lib/utils/training.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | import numpy as np 4 | 5 | from mushroom_rl.utils.dataset import compute_J, parse_dataset 6 | 7 | 8 | class BestAgentSaver: 9 | 10 | def __init__(self, save_path, n_epochs_save=10, save_replay_memory=False): 11 | self.best_curr_agent = None 12 | self.save_path = save_path 13 | self.n_epochs_save = n_epochs_save 14 | self.last_save = 0 15 | self.epoch_counter = 0 16 | self.best_J_since_last_save = -float('inf') 17 | self.save_replay_memory = save_replay_memory 18 | 19 | def save(self, agent, J): 20 | 21 | if self.n_epochs_save != -1: 22 | if J > self.best_J_since_last_save: 23 | self.best_J_since_last_save = J 24 | # if the agent has a replay memory that should not be saved, we can save memory by not copying it, 25 | # i.e., temporarily removing it from the current agent and then giving it back. 26 | mem = None 27 | tmp_store_mem = hasattr(agent, '_replay_memory') and not self.save_replay_memory 28 | if tmp_store_mem: 29 | mem = agent._replay_memory 30 | agent._replay_memory = None 31 | self.best_curr_agent = (deepcopy(agent), J, self.epoch_counter) 32 | if tmp_store_mem: 33 | agent._replay_memory = mem 34 | 35 | if self.last_save + self.n_epochs_save <= self.epoch_counter: 36 | self.save_curr_best_agent() 37 | 38 | self.epoch_counter += 1 39 | 40 | def save_curr_best_agent(self): 41 | 42 | if self.best_curr_agent is not None: 43 | path = os.path.join(self.save_path, 'agent_epoch_%d_J_%f.msh' % (self.best_curr_agent[2], 44 | self.best_curr_agent[1])) 45 | self.best_curr_agent[0].save(path, full_save=True) 46 | self.best_curr_agent = None 47 | self.best_J_since_last_save = -float('inf') 48 | self.last_save = self.epoch_counter 49 | 50 | def save_agent(self, agent, J): 51 | path = os.path.join(self.save_path, 'agent_J_%f.msh' % J) 52 | agent.save(path, full_save=True) 53 | 54 | 55 | def prepare_expert_data(data_path): 56 | dataset = dict() 57 | 58 | # load expert training data 59 | expert_files = np.load(data_path) 60 | dataset["states"] = expert_files["states"] 61 | dataset["actions"] = expert_files["actions"] 62 | dataset["episode_starts"] = expert_files["episode_starts"] 63 | 64 | # maybe we have next action and next next state 65 | try: 66 | dataset["next_actions"] = expert_files["next_actions"] 67 | dataset["next_next_states"] = expert_files["next_next_states"] 68 | except KeyError as e: 69 | print("Did not find next action or next next state.") 70 | 71 | # maybe we have next states and dones in the dataset 72 | try: 73 | dataset["next_states"] = expert_files["next_states"] 74 | dataset["absorbing"] = expert_files["absorbing"] 75 | except KeyError as e: 76 | print("Warning Dataset: %s" % e) 77 | 78 | # maybe we have episode returns, if yes done 79 | try: 80 | dataset["episode_returns"] = expert_files["episode_returns"] 81 | return dataset 82 | except KeyError: 83 | print("Warning Dataset: No episode returns. Falling back to step-based reward.") 84 | 85 | # this has to work 86 | try: 87 | dataset["rewards"] = expert_files["rewards"] 88 | return dataset 89 | except KeyError: 90 | raise KeyError("The dataset has neither an episode nor a step-based reward!") 91 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | requires_list = ["mushroom_rl>=1.10.0", "tensorboard", "experiment-launcher"] 4 | 5 | setup(name='imitation_lib', 6 | version='0.1', 7 | description='Code base of the paper: LS-IQ: Implicit Reward Regularization for Inverse Reinforcement Learning.', 8 | license='MIT', 9 | author="Firas Al-Hafez", 10 | packages=[package for package in find_packages() 11 | if package.startswith('imitation_lib')], 12 | install_requires=requires_list, 13 | zip_safe=False, 14 | ) --------------------------------------------------------------------------------