├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── download_data.sh ├── examples └── 01_LSIQ │ ├── 01_episode │ ├── launcher.py │ └── lsiq_experiments.py │ ├── 02_episode_5 │ ├── launcher.py │ └── lsiq_experiments.py │ ├── 03_episode_10 │ ├── launcher.py │ └── lsiq_experiments.py │ └── 04_episode_25 │ ├── launcher.py │ └── lsiq_experiments.py ├── img └── Divergence_Minimization.gif ├── imitation_lib ├── __init__.py ├── imitation │ ├── __init__.py │ ├── gail_TRPO.py │ ├── iq_sac.py │ ├── iqfo_orig.py │ ├── iqfo_sac.py │ ├── lsiq.py │ ├── lsiq_h.py │ ├── lsiq_hc.py │ ├── lsiqfo.py │ ├── lsiqfo_h.py │ ├── lsiqfo_hc.py │ ├── offline │ │ ├── __init__.py │ │ ├── behavioral_cloning.py │ │ ├── iq_offline.py │ │ ├── lsiq_offline.py │ │ └── lsiq_offline_dm.py │ ├── sqil_sac.py │ └── vail_TRPO.py └── utils │ ├── __init__.py │ ├── action_models.py │ ├── distributions.py │ ├── math.py │ ├── networks.py │ ├── preprocessor.py │ └── training.py └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | .npz filter=lfs diff=lfs merge=lfs -text 2 | *.npz filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # examples 88 | logs/ 89 | 90 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
91 | __pypackages__/
92 |
93 | # Celery stuff
94 | celerybeat-schedule
95 | celerybeat.pid
96 |
97 | # SageMath parsed files
98 | *.sage.py
99 |
100 | # Environments
101 | .env
102 | .venv
103 | env/
104 | venv/
105 | ENV/
106 | env.bak/
107 | venv.bak/
108 |
109 | # Spyder project settings
110 | .spyderproject
111 | .spyproject
112 |
113 | # Rope project settings
114 | .ropeproject
115 |
116 | # mkdocs documentation
117 | /site
118 |
119 | # mypy
120 | .mypy_cache/
121 | .dmypy.json
122 | dmypy.json
123 |
124 | # Pyre type checker
125 | .pyre/
126 |
127 | # expert datasets
128 | *.npz
129 | .idea/
130 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Al-Hafez
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LS-IQ: Implicit Reward Regularization for Inverse Reinforcement Learning
2 | This is the official code base of the paper [*LS-IQ: Implicit Reward Regularization for Inverse Reinforcement Learning*](https://arxiv.org/pdf/2303.00599.pdf),
3 | which was presented at the eleventh International Conference on Learning Representations ([ICLR 2023](https://iclr.cc/Conferences/2023))
4 | in Kigali, Rwanda. Here, we also provide all the baselines for the [LocoMuJoCo](https://github.com/robfiras/loco-mujoco) imitation learning benchmark [*LocoMuJoCo: A Comprehensive Imitation Learning Benchmark for Locomotion*](https://arxiv.org/pdf/2311.02496.pdf) presented at the Robot Learning workshop at [NeurIPS 2023](https://nips.cc/).
5 |
6 | ---
7 | ![Divergence_Minimization](img/Divergence_Minimization.gif)
8 | ## Method
9 | Within this work, we analyze the effect of a squared norm regularizer on the implicit reward function in the inverse reinforcement learning setting.
10 | We build on previous work ([IQ-Learn](https://arxiv.org/pdf/2106.12142.pdf)) and show that this regularizer results in a minimization
11 | of the Chi^2-Divergence between the expert and a mixture distribution. **We show that - unlike previously used divergences - this divergence is bounded
12 | and the resulting reward function is also bounded**.
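This bound is also what the example scripts in this repository use in practice: the Q-function range passed to the LSIQ agent is computed from the regularizer weight `reg_mult` and the discount factor `gamma`, as in `_create_agent` in `examples/01_LSIQ/*/lsiq_experiments.py`:

```python
# calculate the minimum and maximum Q-function (as done in the example scripts)
Q_max = 1.0 / (reg_mult * (1 - gamma))    # +200 for reg_mult=0.5 and gamma=0.99
Q_min = - 1.0 / (reg_mult * (1 - gamma))  # -200 for reg_mult=0.5 and gamma=0.99
```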
An example is given in the picture above, where the target distribution is blue,
13 | the current policy distribution is green, and the mixture is orange. As can be seen, the vanilla Chi^2 divergence can reach very high values - despite the support area being non-zero -
14 | while the divergence on the mixture is bounded. Both optimizations share the same optimal solution.
15 |
16 | Also, this regularizer provides a particularly illuminating perspective: the original **objective can be understood as
17 | squared Bellman error minimization with fixed rewards for the expert and the policy**. This setting can further be used to
18 | stabilize training, as shown in our paper.
19 |
20 | ### Key Advantages
21 | ✅ Simple implementation on top of SAC \
22 | ✅ Bounded objective with bounded reward yields stable and convenient training\
23 | ✅ Retains performance even without expert actions\
24 | ✅ Performs well even when only 1 expert trajectory is given\
25 | ✅ Works in complex and realistic environments such as the Atlas locomotion task\
26 | ✅ Unlike previous methods, no survival bias!
27 |
28 | ---
29 | ## Installation
30 | You can install this repo by cloning it and then running
31 |
32 | ```shell
33 | cd ls-iq
34 | pip install -e .
35 | ```
36 |
37 | ### Download the Datasets [not needed for LocoMuJoCo]
38 | To run the examples and reproduce the results, you have to download the datasets used in our paper. To do so, first install `gdown`:
39 |
40 | ```shell
41 | pip install gdown
42 | ```
43 | Then you can just run the download script:
44 | ```shell
45 | chmod u+x ./download_data.sh
46 | ./download_data.sh
47 | ```
48 |
49 | ---
50 | ## Examples
51 | You can find launcher files in the examples folder to launch all the different versions of LSIQ and to reproduce the main results
52 | of the paper.
53 |
54 | Here is how you run the training of LSIQ with 5 expert trajectories on all MuJoCo Gym tasks:
55 |
56 | ```shell
57 | cd examples/01_LSIQ/02_episode_5/
58 | python launcher.py
59 | ```
60 | To monitor the training, you can use TensorBoard. Once the training is launched, the directory `logs` will be created, which contains
61 | the TensorBoard logging data. Here is how you run TensorBoard:
62 |
63 | ```shell
64 | tensorboard --logdir logs
65 | ```
66 |
67 |
68 | Some experiments, such as the Atlas locomotion task, were conducted on environments that are not yet
69 | available in Mushroom-RL, but they will be available soon! Once the environments are part of Mushroom-RL, the experiment files will be added here.
70 | Follow Mushroom-RL on Twitter [@Mushroom_RL](https://twitter.com/Mushroom_RL) to get notified as soon as the
71 | new environment package is available!
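If you prefer to start a single run directly from Python instead of going through `experiment_launcher`, the following sketch (not part of the repo) mirrors the Hopper-v3 configuration of `examples/01_LSIQ/02_episode_5/launcher.py`. Run it from inside that folder after downloading the datasets; all arguments not listed keep the defaults of the `experiment` function:

```python
# Minimal sketch of a single LSIQ training run (Hopper-v3, 5 expert trajectories).
from lsiq_experiments import experiment

experiment(env_id="Hopper-v3",
           expert_data_path="../../00_Datasets/5_episodes/expert_dataset_Hopper-v3_3348.59_5_SAC.npz",
           n_epochs=1500,
           n_steps_per_epoch=1000,
           n_eval_episodes=10,
           n_steps_per_fit=1,
           gamma=0.99,
           plcy_loss_mode="value",
           regularizer_mode="plcy",
           loss_mode_exp="fix",
           Q_exp_loss="MSE",
           reg_mult=0.5,
           init_alpha=1e-3,
           learnable_alpha=False,
           use_target=True,
           tau=0.005,
           results_dir="./logs",
           seed=0)
```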
72 | 73 | --- 74 | ## Citation 75 | ``` 76 | @inproceedings{alhafez2023, 77 | title={LS-IQ: Implicit Reward Regularization for Inverse Reinforcement Learning}, 78 | author={Firas Al-Hafez and Davide Tateo and Oleg Arenz and Guoping Zhao and Jan Peters}, 79 | booktitle={Eleventh International Conference on Learning Representations (ICLR)}, 80 | year={2023}, 81 | url={https://openreview.net/pdf?id=o3Q4m8jg4BR}} 82 | ``` 83 | -------------------------------------------------------------------------------- /download_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd examples 4 | gdown --folder https://drive.google.com/drive/folders/1I246M9aPzW1rAqyRC1hcqEeNno0X4usi?usp=share_link 5 | 6 | -------------------------------------------------------------------------------- /examples/01_LSIQ/01_episode/launcher.py: -------------------------------------------------------------------------------- 1 | import os 2 | from itertools import product 3 | from experiment_launcher import Launcher 4 | 5 | from experiment_launcher.utils import bool_local_cluster 6 | 7 | if __name__ == '__main__': 8 | LOCAL = bool_local_cluster() 9 | TEST = False 10 | USE_CUDA = False 11 | 12 | JOBLIB_PARALLEL_JOBS = 1 # or os.cpu_count() to use all cores 13 | N_SEEDS = 5 14 | 15 | launcher = Launcher(exp_name='lsiq_1', 16 | python_file='lsiq_experiments', 17 | n_exps=N_SEEDS, 18 | joblib_n_jobs=JOBLIB_PARALLEL_JOBS, 19 | n_cores=JOBLIB_PARALLEL_JOBS * 1, 20 | memory_per_core=JOBLIB_PARALLEL_JOBS * 6000, 21 | days=2, 22 | hours=0, 23 | minutes=0, 24 | seconds=0, 25 | use_timestamp=True, 26 | ) 27 | 28 | default_params = dict(n_epochs=150, 29 | n_steps_per_epoch=10000, 30 | n_eval_episodes=10, 31 | n_steps_per_fit=1, 32 | n_epochs_save=-1, 33 | logging_iter=10000, 34 | gamma=0.99, 35 | use_cuda=USE_CUDA, 36 | tau=0.005, 37 | use_target=True, 38 | loss_mode_exp="fix", 39 | regularizer_mode="plcy", 40 | learnable_alpha=False) 41 | 42 | log_std = [(-5, 2)] 43 | envs = ["Ant-v3", 44 | "HalfCheetah-v3", 45 | "Hopper-v3", 46 | "Humanoid-v3", 47 | "Walker2d-v3"] 48 | path_to_datasets = "../../00_Datasets/1_episode/" 49 | expert_data_filenames = ["expert_dataset_Ant-v3_6321.34_1_SAC.npz", 50 | "expert_dataset_HalfCheetah-v3_12312.93_1_SAC.npz", 51 | "expert_dataset_Hopper-v3_3729.74_1_SAC.npz", 52 | "expert_dataset_Humanoid-v3_6335.31_1_SAC.npz", 53 | "expert_dataset_Walker2d-v3_5830.37_1_SAC.npz"] 54 | 55 | expert_data_paths = [path_to_datasets + name for name in expert_data_filenames] 56 | 57 | # Ant 58 | launcher.add_experiment(env_id__=envs[0], expert_data_path=expert_data_paths[0], 59 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 60 | 61 | # HalfCheetah 62 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 63 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 64 | 65 | # Hopper 66 | launcher.add_experiment(env_id__=envs[2], expert_data_path=expert_data_paths[2], 67 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 68 | 69 | # Humanoid 70 | launcher.add_experiment(env_id__=envs[3], expert_data_path=expert_data_paths[3], 71 | plcy_loss_mode__="q_old_policy", init_alpha__=0.1, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 72 | 73 | # Walker2d 74 | launcher.add_experiment(env_id__=envs[4], expert_data_path=expert_data_paths[4], 75 | plcy_loss_mode__="value", 
init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 76 | 77 | launcher.run(LOCAL, TEST) 78 | -------------------------------------------------------------------------------- /examples/01_LSIQ/01_episode/lsiq_experiments.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import perf_counter 3 | from contextlib import contextmanager 4 | 5 | import numpy as np 6 | import torch 7 | import torch.optim as optim 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | from mushroom_rl.core import Core 11 | from mushroom_rl.environments import Gym 12 | from mushroom_rl.utils.dataset import compute_J, compute_episodes_length 13 | from mushroom_rl.core.logger.logger import Logger 14 | 15 | 16 | from imitation_lib.imitation import LSIQ 17 | from imitation_lib.utils import FullyConnectedNetwork 18 | from imitation_lib.utils import prepare_expert_data, BestAgentSaver 19 | 20 | 21 | from experiment_launcher import run_experiment 22 | 23 | 24 | def _create_agent(mdp, expert_data, sw, lr_critic, lr_actor, plcy_loss_mode, 25 | regularizer_mode, use_target, lossQ_type, use_cuda, tau, 26 | learnable_alpha, init_alpha, reg_mult, Q_exp_loss, gamma, 27 | loss_mode_exp, log_std_min, log_std_max, delay_Q, n_fits, 28 | logging_iter): 29 | 30 | # calculate the minimum and maximum Q-function 31 | Q_max = 1.0 / (reg_mult * (1 - gamma)) 32 | Q_min = - 1.0 / (reg_mult * (1 - gamma)) 33 | 34 | # Settings 35 | initial_replay_size = 10000 36 | max_replay_size = 1000000 37 | batch_size = 256 # the real batch size is double the size as an expert batch is going to be added 38 | warmup_transitions = 15000 39 | 40 | lr_alpha = 2e-6 41 | weight_decay_actor = 0.0 42 | weight_decay_critic = 0.0 43 | 44 | target_entropy = -22.0 45 | 46 | # Approximator 47 | actor_input_shape = mdp.info.observation_space.shape 48 | actor_output_shape = (mdp.info.action_space.shape[0]*2,) 49 | actor_params = dict(network=FullyConnectedNetwork, 50 | n_features=[256, 256], 51 | input_shape=actor_input_shape, 52 | output_shape=actor_output_shape, 53 | activations=["relu", "relu", "identity"], 54 | use_cuda=use_cuda) 55 | 56 | actor_optimizer = {'class': optim.Adam, 57 | 'params': {'lr': lr_actor, 'weight_decay': weight_decay_actor}} 58 | 59 | critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) 60 | critic_params = dict(network=FullyConnectedNetwork, 61 | optimizer={'class': optim.Adam, 62 | 'params': {'lr': lr_critic, 'weight_decay': weight_decay_critic}}, 63 | n_features=[256, 256], 64 | input_shape=critic_input_shape, 65 | activations=["relu", "relu", "identity"], 66 | squeeze_out=False, 67 | output_shape=(1,), 68 | use_cuda=use_cuda) 69 | 70 | # create IQfO agent 71 | agent = LSIQ(mdp_info=mdp.info, batch_size=batch_size, initial_replay_size=initial_replay_size, 72 | max_replay_size=max_replay_size, demonstrations=expert_data, sw=sw, use_target=use_target, 73 | warmup_transitions=warmup_transitions, tau=tau, lr_alpha=lr_alpha, actor_params=actor_params, 74 | actor_optimizer=actor_optimizer, critic_params=critic_params, delay_Q=delay_Q, lossQ_type=lossQ_type, 75 | target_entropy=target_entropy, critic_fit_params=None, plcy_loss_mode=plcy_loss_mode, 76 | regularizer_mode=regularizer_mode, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 77 | reg_mult=reg_mult, Q_min=Q_min, Q_max=Q_max, log_std_min=log_std_min, log_std_max=log_std_max, 78 | loss_mode_exp=loss_mode_exp, Q_exp_loss=Q_exp_loss, n_fits=n_fits, logging_iter=logging_iter) 
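    # warmup_transitions (set above) gates the actor update: as in the IQ-style agents of
    # imitation_lib, the policy is only trained once the replay buffer holds more than that
    # many environment transitions, while the critic is trained from the very first fit.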
79 | 80 | return agent 81 | 82 | 83 | def experiment(env_id: str = "HalfCheetah-v2", 84 | n_epochs: int = 500, 85 | n_steps_per_epoch: int = 10000, 86 | n_steps_per_fit: int = 1, 87 | n_eval_episodes: int = 50, 88 | n_epochs_save: int = 100, 89 | logging_iter: int = 100, 90 | expert_data_path: str = None, 91 | use_cuda: bool = False, 92 | lr_critic: float = 3e-4, 93 | lr_actor: float = 3e-5, 94 | results_dir: str = "./logs", 95 | plcy_loss_mode: str = "value", 96 | regularizer_mode: str = "exp_and_plcy", 97 | reg_mult: float = 0.5, 98 | Q_exp_loss: str = "MSE", 99 | n_fits: int = 1, 100 | loss_mode_exp: str = "fix", 101 | log_std_min: float = -5.0, 102 | log_std_max: float = 2.0, 103 | learnable_alpha: bool = False, 104 | use_target: bool = True, 105 | init_alpha: float = 0.001, 106 | tau: float = 0.005, 107 | delay_Q: int = 1, 108 | lossQ_type: str = "sqil_like", 109 | gamma: float = 0.99, 110 | horizon: int = 1000, 111 | seed: int = 0): 112 | 113 | np.random.seed(seed) 114 | torch.random.manual_seed(seed) 115 | 116 | logger_stoch = Logger(results_dir=results_dir, log_name="stochastic_logging", seed=seed, append=True) 117 | logger_deter = Logger(results_dir=results_dir, log_name="deterministic_logging", seed=seed, append=True) 118 | 119 | results_dir = os.path.join(results_dir, str(seed)) 120 | 121 | env_params = dict(name=env_id, horizon=horizon, gamma=gamma) 122 | 123 | mdp = Gym(**env_params) 124 | 125 | # load expert data 126 | expert_data = prepare_expert_data(data_path=expert_data_path) 127 | 128 | # logging stuff 129 | tb_writer = SummaryWriter(log_dir=results_dir) 130 | agent_saver = BestAgentSaver(save_path=results_dir, n_epochs_save=n_epochs_save) 131 | 132 | # create agent and core 133 | agent = _create_agent(mdp, expert_data, sw=tb_writer, lr_critic=lr_critic, lr_actor=lr_actor, 134 | plcy_loss_mode=plcy_loss_mode, regularizer_mode=regularizer_mode, 135 | use_cuda=use_cuda, use_target=use_target, lossQ_type=lossQ_type, 136 | delay_Q=delay_Q, tau=tau, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 137 | reg_mult=reg_mult, gamma=gamma, Q_exp_loss=Q_exp_loss, 138 | loss_mode_exp=loss_mode_exp, log_std_min=log_std_min, 139 | n_fits=n_fits, log_std_max=log_std_max, logging_iter=logging_iter) 140 | 141 | core = Core(agent, mdp) 142 | 143 | # iqfo train loop 144 | for epoch in range(n_epochs): 145 | with catchtime() as t: 146 | # training 147 | core.learn(n_steps=n_steps_per_epoch, n_steps_per_fit=n_steps_per_fit, quiet=True) 148 | print('Epoch %d | Time %fs ' % (epoch + 1, float(t()))) 149 | 150 | # evaluate with deterministic policy 151 | agent.policy.use_mean = True 152 | dataset = core.evaluate(n_episodes=n_eval_episodes) 153 | R_mean = np.mean(compute_J(dataset)) 154 | J_mean = np.mean(compute_J(dataset, gamma=gamma)) 155 | L = np.mean(compute_episodes_length(dataset)) 156 | logger_deter.log_numpy(Epoch=epoch, R_mean=R_mean, J_mean=J_mean, L=L) 157 | tb_writer.add_scalar("Eval_R-deterministic", R_mean, epoch) 158 | tb_writer.add_scalar("Eval_J-deterministic", J_mean, epoch) 159 | tb_writer.add_scalar("Eval_L-deterministic", L, epoch) 160 | agent.policy.use_mean = False 161 | 162 | # evaluate with stochastic policy 163 | dataset = core.evaluate(n_episodes=n_eval_episodes) 164 | R_mean_stoch = np.mean(compute_J(dataset)) 165 | J_mean_stoch = np.mean(compute_J(dataset, gamma=gamma)) 166 | L = np.mean(compute_episodes_length(dataset)) 167 | logger_stoch.log_numpy(Epoch=epoch, R_mean=R_mean_stoch, J_mean=J_mean_stoch, L=L) 168 | tb_writer.add_scalar("Eval_R-stochastic", 
R_mean_stoch, epoch) 169 | tb_writer.add_scalar("Eval_J-stochastic", J_mean_stoch, epoch) 170 | tb_writer.add_scalar("Eval_L-stochastic", L, epoch) 171 | 172 | print("R_mean (deter): %f | R_mean (stoch): %f" % (R_mean, R_mean_stoch)) 173 | 174 | # save agent if needed 175 | agent_saver.save(core.agent, J_mean) 176 | 177 | agent_saver.save_curr_best_agent() 178 | print("Finished.") 179 | 180 | @contextmanager 181 | def catchtime() -> float: 182 | start = perf_counter() 183 | yield lambda: perf_counter() - start 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | # Leave unchanged 189 | run_experiment(experiment) 190 | -------------------------------------------------------------------------------- /examples/01_LSIQ/02_episode_5/launcher.py: -------------------------------------------------------------------------------- 1 | from experiment_launcher import Launcher 2 | 3 | from experiment_launcher.utils import bool_local_cluster 4 | 5 | if __name__ == '__main__': 6 | LOCAL = bool_local_cluster() 7 | TEST = False 8 | USE_CUDA = False 9 | 10 | JOBLIB_PARALLEL_JOBS = 1 # or os.cpu_count() to use all cores 11 | N_SEEDS = 5 12 | 13 | launcher = Launcher(exp_name='lsiq_5', 14 | python_file='lsiq_experiments', 15 | n_exps=N_SEEDS, 16 | joblib_n_jobs=JOBLIB_PARALLEL_JOBS, 17 | n_cores=JOBLIB_PARALLEL_JOBS * 1, 18 | memory_per_core=JOBLIB_PARALLEL_JOBS * 6000, 19 | days=2, 20 | hours=0, 21 | minutes=0, 22 | seconds=0, 23 | use_timestamp=True, 24 | ) 25 | 26 | default_params = dict(n_epochs=1500, 27 | n_steps_per_epoch=1000, 28 | n_eval_episodes=10, 29 | n_steps_per_fit=1, 30 | n_epochs_save=-1, 31 | logging_iter=10000, 32 | gamma=0.99, 33 | use_cuda=USE_CUDA, 34 | tau=0.005, 35 | use_target=True, 36 | loss_mode_exp="fix", 37 | regularizer_mode="plcy", 38 | learnable_alpha=False) 39 | 40 | log_std = [(-5, 2)] 41 | envs = ["Ant-v3", 42 | "HalfCheetah-v3", 43 | "Hopper-v3", 44 | "Humanoid-v3", 45 | "Walker2d-v3"] 46 | path_to_datasets = "../../00_Datasets/5_episodes/" 47 | expert_data_filenames = ["expert_dataset_Ant-v3_6424.22_5_SAC.npz", 48 | "expert_dataset_HalfCheetah-v3_12543.01_5_SAC.npz", 49 | "expert_dataset_Hopper-v3_3348.59_5_SAC.npz", 50 | "expert_dataset_Humanoid-v3_6321.39_5_SAC.npz", 51 | "expert_dataset_Walker2d-v3_5854.7_5_SAC.npz"] 52 | 53 | expert_data_paths = [path_to_datasets + name for name in expert_data_filenames] 54 | 55 | # Ant 56 | launcher.add_experiment(env_id__=envs[0], expert_data_path=expert_data_paths[0], 57 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 58 | 59 | # HalfCheetah 60 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 61 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 62 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 63 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=10.0, **default_params) 64 | 65 | # Hopper 66 | launcher.add_experiment(env_id__=envs[2], expert_data_path=expert_data_paths[2], 67 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 68 | 69 | # Humanoid 70 | launcher.add_experiment(env_id__=envs[3], expert_data_path=expert_data_paths[3], 71 | plcy_loss_mode__="value", init_alpha__=0.1, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 72 | 73 | # Walker2d 74 | launcher.add_experiment(env_id__=envs[4], expert_data_path=expert_data_paths[4], 75 | plcy_loss_mode__="value", 
init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 76 | 77 | 78 | launcher.run(LOCAL, TEST) 79 | -------------------------------------------------------------------------------- /examples/01_LSIQ/02_episode_5/lsiq_experiments.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import perf_counter 3 | from contextlib import contextmanager 4 | 5 | import numpy as np 6 | import torch 7 | import torch.optim as optim 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | from mushroom_rl.core import Core 11 | from mushroom_rl.environments import Gym 12 | from mushroom_rl.utils.dataset import compute_J, compute_episodes_length 13 | from mushroom_rl.core.logger.logger import Logger 14 | 15 | 16 | from imitation_lib.imitation import LSIQ 17 | from imitation_lib.utils import FullyConnectedNetwork 18 | from imitation_lib.utils import prepare_expert_data, BestAgentSaver 19 | 20 | 21 | from experiment_launcher import run_experiment 22 | 23 | 24 | def _create_agent(mdp, expert_data, sw, lr_critic, lr_actor, plcy_loss_mode, 25 | regularizer_mode, use_target, lossQ_type, use_cuda, tau, 26 | learnable_alpha, init_alpha, reg_mult, Q_exp_loss, gamma, 27 | loss_mode_exp, log_std_min, log_std_max, delay_Q, n_fits, 28 | logging_iter): 29 | 30 | # calculate the minimum and maximum Q-function 31 | Q_max = 1.0 / (reg_mult * (1 - gamma)) 32 | Q_min = - 1.0 / (reg_mult * (1 - gamma)) 33 | 34 | # Settings 35 | initial_replay_size = 10000 36 | max_replay_size = 1000000 37 | batch_size = 256 # the real batch size is double the size as an expert batch is going to be added 38 | warmup_transitions = 15000 39 | 40 | lr_alpha = 2e-6 41 | weight_decay_actor = 0.0 42 | weight_decay_critic = 0.0 43 | 44 | target_entropy = -22.0 45 | 46 | # Approximator 47 | actor_input_shape = mdp.info.observation_space.shape 48 | actor_output_shape = (mdp.info.action_space.shape[0]*2,) 49 | actor_params = dict(network=FullyConnectedNetwork, 50 | n_features=[256, 256], 51 | input_shape=actor_input_shape, 52 | output_shape=actor_output_shape, 53 | activations=["relu", "relu", "identity"], 54 | use_cuda=use_cuda) 55 | 56 | actor_optimizer = {'class': optim.Adam, 57 | 'params': {'lr': lr_actor, 'weight_decay': weight_decay_actor}} 58 | 59 | critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) 60 | critic_params = dict(network=FullyConnectedNetwork, 61 | optimizer={'class': optim.Adam, 62 | 'params': {'lr': lr_critic, 'weight_decay': weight_decay_critic}}, 63 | n_features=[256, 256], 64 | input_shape=critic_input_shape, 65 | activations=["relu", "relu", "identity"], 66 | squeeze_out=False, 67 | output_shape=(1,), 68 | use_cuda=use_cuda) 69 | 70 | # create IQfO agent 71 | agent = LSIQ(mdp_info=mdp.info, batch_size=batch_size, initial_replay_size=initial_replay_size, 72 | max_replay_size=max_replay_size, demonstrations=expert_data, sw=sw, use_target=use_target, 73 | warmup_transitions=warmup_transitions, tau=tau, lr_alpha=lr_alpha, actor_params=actor_params, 74 | actor_optimizer=actor_optimizer, critic_params=critic_params, delay_Q=delay_Q, lossQ_type=lossQ_type, 75 | target_entropy=target_entropy, critic_fit_params=None, plcy_loss_mode=plcy_loss_mode, 76 | regularizer_mode=regularizer_mode, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 77 | reg_mult=reg_mult, Q_min=Q_min, Q_max=Q_max, log_std_min=log_std_min, log_std_max=log_std_max, 78 | loss_mode_exp=loss_mode_exp, Q_exp_loss=Q_exp_loss, n_fits=n_fits, 
logging_iter=logging_iter) 79 | 80 | return agent 81 | 82 | 83 | def experiment(env_id: str = "HalfCheetah-v2", 84 | n_epochs: int = 500, 85 | n_steps_per_epoch: int = 10000, 86 | n_steps_per_fit: int = 1, 87 | n_eval_episodes: int = 50, 88 | n_epochs_save: int = 100, 89 | logging_iter: int = 100, 90 | expert_data_path: str = None, 91 | use_cuda: bool = False, 92 | lr_critic: float = 3e-4, 93 | lr_actor: float = 3e-5, 94 | results_dir: str = "./logs", 95 | plcy_loss_mode: str = "value", 96 | regularizer_mode: str = "exp_and_plcy", 97 | reg_mult: float = 0.5, 98 | Q_exp_loss: str = "MSE", 99 | n_fits: int = 1, 100 | loss_mode_exp: str = "fix", 101 | log_std_min: float = -5.0, 102 | log_std_max: float = 2.0, 103 | learnable_alpha: bool = False, 104 | use_target: bool = True, 105 | init_alpha: float = 0.001, 106 | tau: float = 0.005, 107 | delay_Q: int = 1, 108 | lossQ_type: str = "sqil_like", 109 | gamma: float = 0.99, 110 | horizon: int = 1000, 111 | seed: int = 0): 112 | 113 | np.random.seed(seed) 114 | torch.random.manual_seed(seed) 115 | 116 | logger_stoch = Logger(results_dir=results_dir, log_name="stochastic_logging", seed=seed, append=True) 117 | logger_deter = Logger(results_dir=results_dir, log_name="deterministic_logging", seed=seed, append=True) 118 | 119 | results_dir = os.path.join(results_dir, str(seed)) 120 | 121 | env_params = dict(name=env_id, horizon=horizon, gamma=gamma) 122 | 123 | mdp = Gym(**env_params) 124 | 125 | # load expert data 126 | expert_data = prepare_expert_data(data_path=expert_data_path) 127 | 128 | # logging stuff 129 | tb_writer = SummaryWriter(log_dir=results_dir) 130 | agent_saver = BestAgentSaver(save_path=results_dir, n_epochs_save=n_epochs_save) 131 | 132 | # create agent and core 133 | agent = _create_agent(mdp, expert_data, sw=tb_writer, lr_critic=lr_critic, lr_actor=lr_actor, 134 | plcy_loss_mode=plcy_loss_mode, regularizer_mode=regularizer_mode, 135 | use_cuda=use_cuda, use_target=use_target, lossQ_type=lossQ_type, 136 | delay_Q=delay_Q, tau=tau, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 137 | reg_mult=reg_mult, gamma=gamma, Q_exp_loss=Q_exp_loss, 138 | loss_mode_exp=loss_mode_exp, log_std_min=log_std_min, 139 | n_fits=n_fits, log_std_max=log_std_max, logging_iter=logging_iter) 140 | 141 | core = Core(agent, mdp) 142 | 143 | # iqfo train loop 144 | for epoch in range(n_epochs): 145 | with catchtime() as t: 146 | # training 147 | core.learn(n_steps=n_steps_per_epoch, n_steps_per_fit=n_steps_per_fit, quiet=True) 148 | print('Epoch %d | Time %fs ' % (epoch + 1, float(t()))) 149 | 150 | # evaluate with deterministic policy 151 | agent.policy.use_mean = True 152 | dataset = core.evaluate(n_episodes=n_eval_episodes) 153 | R_mean = np.mean(compute_J(dataset)) 154 | J_mean = np.mean(compute_J(dataset, gamma=gamma)) 155 | L = np.mean(compute_episodes_length(dataset)) 156 | logger_deter.log_numpy(Epoch=epoch, R_mean=R_mean, J_mean=J_mean, L=L) 157 | tb_writer.add_scalar("Eval_R-deterministic", R_mean, epoch) 158 | tb_writer.add_scalar("Eval_J-deterministic", J_mean, epoch) 159 | tb_writer.add_scalar("Eval_L-deterministic", L, epoch) 160 | agent.policy.use_mean = False 161 | 162 | # evaluate with stochastic policy 163 | dataset = core.evaluate(n_episodes=n_eval_episodes) 164 | R_mean_stoch = np.mean(compute_J(dataset)) 165 | J_mean_stoch = np.mean(compute_J(dataset, gamma=gamma)) 166 | L = np.mean(compute_episodes_length(dataset)) 167 | logger_stoch.log_numpy(Epoch=epoch, R_mean=R_mean_stoch, J_mean=J_mean_stoch, L=L) 168 | 
tb_writer.add_scalar("Eval_R-stochastic", R_mean_stoch, epoch) 169 | tb_writer.add_scalar("Eval_J-stochastic", J_mean_stoch, epoch) 170 | tb_writer.add_scalar("Eval_L-stochastic", L, epoch) 171 | 172 | print("R_mean (deter): %f | R_mean (stoch): %f" % (R_mean, R_mean_stoch)) 173 | 174 | # save agent if needed 175 | agent_saver.save(core.agent, J_mean) 176 | 177 | agent_saver.save_curr_best_agent() 178 | print("Finished.") 179 | 180 | @contextmanager 181 | def catchtime() -> float: 182 | start = perf_counter() 183 | yield lambda: perf_counter() - start 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | # Leave unchanged 189 | run_experiment(experiment) 190 | -------------------------------------------------------------------------------- /examples/01_LSIQ/03_episode_10/launcher.py: -------------------------------------------------------------------------------- 1 | from experiment_launcher import Launcher 2 | 3 | from experiment_launcher.utils import bool_local_cluster 4 | 5 | if __name__ == '__main__': 6 | LOCAL = bool_local_cluster() 7 | TEST = False 8 | USE_CUDA = False 9 | 10 | JOBLIB_PARALLEL_JOBS = 1 # or os.cpu_count() to use all cores 11 | N_SEEDS = 5 12 | 13 | launcher = Launcher(exp_name='lsiq_10', 14 | python_file='lsiq_experiments', 15 | n_exps=N_SEEDS, 16 | joblib_n_jobs=JOBLIB_PARALLEL_JOBS, 17 | n_cores=JOBLIB_PARALLEL_JOBS * 1, 18 | memory_per_core=JOBLIB_PARALLEL_JOBS * 6000, 19 | days=2, 20 | hours=0, 21 | minutes=0, 22 | seconds=0, 23 | use_timestamp=True, 24 | ) 25 | 26 | default_params = dict(n_epochs=150, 27 | n_steps_per_epoch=10000, 28 | n_eval_episodes=10, 29 | n_steps_per_fit=1, 30 | n_epochs_save=-1, 31 | logging_iter=10000, 32 | gamma=0.99, 33 | use_cuda=USE_CUDA, 34 | tau=0.005, 35 | use_target=True, 36 | loss_mode_exp="fix", 37 | regularizer_mode="plcy", 38 | learnable_alpha=False) 39 | 40 | log_std = [(-5, 2)] 41 | envs = ["Ant-v3", 42 | "HalfCheetah-v3", 43 | "Hopper-v3", 44 | "Humanoid-v3", 45 | "Walker2d-v3"] 46 | path_to_datasets = "../../00_Datasets/10_episodes/" 47 | expert_data_filenames = ["expert_dataset_Ant-v3_6421.34_10_SAC.npz", 48 | "expert_dataset_HalfCheetah-v3_12360.31_10_SAC.npz", 49 | "expert_dataset_Hopper-v3_3549.94_10_SAC.npz", 50 | "expert_dataset_Humanoid-v3_6346.43_10_SAC.npz", 51 | "expert_dataset_Walker2d-v3_5852.24_10_SAC.npz"] 52 | 53 | expert_data_paths = [path_to_datasets + name for name in expert_data_filenames] 54 | 55 | # Ant 56 | launcher.add_experiment(env_id__=envs[0], expert_data_path=expert_data_paths[0], 57 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 58 | 59 | 60 | # HalfCheetah 61 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 62 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 63 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 64 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=10.0, **default_params) 65 | 66 | # Hopper 67 | launcher.add_experiment(env_id__=envs[2], expert_data_path=expert_data_paths[2], 68 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 69 | 70 | # Humanoid 71 | launcher.add_experiment(env_id__=envs[3], expert_data_path=expert_data_paths[3], 72 | plcy_loss_mode__="value", init_alpha__=0.1, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 73 | 74 | # Walker2d 75 | launcher.add_experiment(env_id__=envs[4], 
expert_data_path=expert_data_paths[4], 76 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 77 | 78 | launcher.run(LOCAL, TEST) 79 | -------------------------------------------------------------------------------- /examples/01_LSIQ/03_episode_10/lsiq_experiments.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import perf_counter 3 | from contextlib import contextmanager 4 | 5 | import numpy as np 6 | import torch 7 | import torch.optim as optim 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | from mushroom_rl.core import Core 11 | from mushroom_rl.environments import Gym 12 | from mushroom_rl.utils.dataset import compute_J, compute_episodes_length 13 | from mushroom_rl.core.logger.logger import Logger 14 | 15 | 16 | from imitation_lib.imitation import LSIQ 17 | from imitation_lib.utils import FullyConnectedNetwork 18 | from imitation_lib.utils import prepare_expert_data, BestAgentSaver 19 | 20 | 21 | from experiment_launcher import run_experiment 22 | 23 | 24 | def _create_agent(mdp, expert_data, sw, lr_critic, lr_actor, plcy_loss_mode, 25 | regularizer_mode, use_target, lossQ_type, use_cuda, tau, 26 | learnable_alpha, init_alpha, reg_mult, Q_exp_loss, gamma, 27 | loss_mode_exp, log_std_min, log_std_max, delay_Q, n_fits, 28 | logging_iter): 29 | 30 | # calculate the minimum and maximum Q-function 31 | Q_max = 1.0 / (reg_mult * (1 - gamma)) 32 | Q_min = - 1.0 / (reg_mult * (1 - gamma)) 33 | 34 | # Settings 35 | initial_replay_size = 10000 36 | max_replay_size = 1000000 37 | batch_size = 256 # the real batch size is double the size as an expert batch is going to be added 38 | warmup_transitions = 15000 39 | 40 | lr_alpha = 2e-6 41 | weight_decay_actor = 0.0 42 | weight_decay_critic = 0.0 43 | 44 | target_entropy = -22.0 45 | 46 | # Approximator 47 | actor_input_shape = mdp.info.observation_space.shape 48 | actor_output_shape = (mdp.info.action_space.shape[0]*2,) 49 | actor_params = dict(network=FullyConnectedNetwork, 50 | n_features=[256, 256], 51 | input_shape=actor_input_shape, 52 | output_shape=actor_output_shape, 53 | activations=["relu", "relu", "identity"], 54 | use_cuda=use_cuda) 55 | 56 | actor_optimizer = {'class': optim.Adam, 57 | 'params': {'lr': lr_actor, 'weight_decay': weight_decay_actor}} 58 | 59 | critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) 60 | critic_params = dict(network=FullyConnectedNetwork, 61 | optimizer={'class': optim.Adam, 62 | 'params': {'lr': lr_critic, 'weight_decay': weight_decay_critic}}, 63 | n_features=[256, 256], 64 | input_shape=critic_input_shape, 65 | activations=["relu", "relu", "identity"], 66 | squeeze_out=False, 67 | output_shape=(1,), 68 | use_cuda=use_cuda) 69 | 70 | # create IQfO agent 71 | agent = LSIQ(mdp_info=mdp.info, batch_size=batch_size, initial_replay_size=initial_replay_size, 72 | max_replay_size=max_replay_size, demonstrations=expert_data, sw=sw, use_target=use_target, 73 | warmup_transitions=warmup_transitions, tau=tau, lr_alpha=lr_alpha, actor_params=actor_params, 74 | actor_optimizer=actor_optimizer, critic_params=critic_params, delay_Q=delay_Q, lossQ_type=lossQ_type, 75 | target_entropy=target_entropy, critic_fit_params=None, plcy_loss_mode=plcy_loss_mode, 76 | regularizer_mode=regularizer_mode, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 77 | reg_mult=reg_mult, Q_min=Q_min, Q_max=Q_max, log_std_min=log_std_min, log_std_max=log_std_max, 78 | 
loss_mode_exp=loss_mode_exp, Q_exp_loss=Q_exp_loss, n_fits=n_fits, logging_iter=logging_iter) 79 | 80 | return agent 81 | 82 | 83 | def experiment(env_id: str = "HalfCheetah-v2", 84 | n_epochs: int = 500, 85 | n_steps_per_epoch: int = 10000, 86 | n_steps_per_fit: int = 1, 87 | n_eval_episodes: int = 50, 88 | n_epochs_save: int = 100, 89 | logging_iter: int = 100, 90 | expert_data_path: str = None, 91 | use_cuda: bool = False, 92 | lr_critic: float = 3e-4, 93 | lr_actor: float = 3e-5, 94 | results_dir: str = "./logs", 95 | plcy_loss_mode: str = "value", 96 | regularizer_mode: str = "exp_and_plcy", 97 | reg_mult: float = 0.5, 98 | Q_exp_loss: str = "MSE", 99 | n_fits: int = 1, 100 | loss_mode_exp: str = "fix", 101 | log_std_min: float = -5.0, 102 | log_std_max: float = 2.0, 103 | learnable_alpha: bool = False, 104 | use_target: bool = True, 105 | init_alpha: float = 0.001, 106 | tau: float = 0.005, 107 | delay_Q: int = 1, 108 | lossQ_type: str = "sqil_like", 109 | gamma: float = 0.99, 110 | horizon: int = 1000, 111 | seed: int = 0): 112 | 113 | np.random.seed(seed) 114 | torch.random.manual_seed(seed) 115 | 116 | logger_stoch = Logger(results_dir=results_dir, log_name="stochastic_logging", seed=seed, append=True) 117 | logger_deter = Logger(results_dir=results_dir, log_name="deterministic_logging", seed=seed, append=True) 118 | 119 | results_dir = os.path.join(results_dir, str(seed)) 120 | 121 | env_params = dict(name=env_id, horizon=horizon, gamma=gamma) 122 | 123 | mdp = Gym(**env_params) 124 | 125 | # load expert data 126 | expert_data = prepare_expert_data(data_path=expert_data_path) 127 | 128 | # logging stuff 129 | tb_writer = SummaryWriter(log_dir=results_dir) 130 | agent_saver = BestAgentSaver(save_path=results_dir, n_epochs_save=n_epochs_save) 131 | 132 | # create agent and core 133 | agent = _create_agent(mdp, expert_data, sw=tb_writer, lr_critic=lr_critic, lr_actor=lr_actor, 134 | plcy_loss_mode=plcy_loss_mode, regularizer_mode=regularizer_mode, 135 | use_cuda=use_cuda, use_target=use_target, lossQ_type=lossQ_type, 136 | delay_Q=delay_Q, tau=tau, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 137 | reg_mult=reg_mult, gamma=gamma, Q_exp_loss=Q_exp_loss, 138 | loss_mode_exp=loss_mode_exp, log_std_min=log_std_min, 139 | n_fits=n_fits, log_std_max=log_std_max, logging_iter=logging_iter) 140 | 141 | core = Core(agent, mdp) 142 | 143 | # iqfo train loop 144 | for epoch in range(n_epochs): 145 | with catchtime() as t: 146 | # training 147 | core.learn(n_steps=n_steps_per_epoch, n_steps_per_fit=n_steps_per_fit, quiet=True) 148 | print('Epoch %d | Time %fs ' % (epoch + 1, float(t()))) 149 | 150 | # evaluate with deterministic policy 151 | agent.policy.use_mean = True 152 | dataset = core.evaluate(n_episodes=n_eval_episodes) 153 | R_mean = np.mean(compute_J(dataset)) 154 | J_mean = np.mean(compute_J(dataset, gamma=gamma)) 155 | L = np.mean(compute_episodes_length(dataset)) 156 | logger_deter.log_numpy(Epoch=epoch, R_mean=R_mean, J_mean=J_mean, L=L) 157 | tb_writer.add_scalar("Eval_R-deterministic", R_mean, epoch) 158 | tb_writer.add_scalar("Eval_J-deterministic", J_mean, epoch) 159 | tb_writer.add_scalar("Eval_L-deterministic", L, epoch) 160 | agent.policy.use_mean = False 161 | 162 | # evaluate with stochastic policy 163 | dataset = core.evaluate(n_episodes=n_eval_episodes) 164 | R_mean_stoch = np.mean(compute_J(dataset)) 165 | J_mean_stoch = np.mean(compute_J(dataset, gamma=gamma)) 166 | L = np.mean(compute_episodes_length(dataset)) 167 | logger_stoch.log_numpy(Epoch=epoch, 
R_mean=R_mean_stoch, J_mean=J_mean_stoch, L=L) 168 | tb_writer.add_scalar("Eval_R-stochastic", R_mean_stoch, epoch) 169 | tb_writer.add_scalar("Eval_J-stochastic", J_mean_stoch, epoch) 170 | tb_writer.add_scalar("Eval_L-stochastic", L, epoch) 171 | 172 | print("R_mean (deter): %f | R_mean (stoch): %f" % (R_mean, R_mean_stoch)) 173 | 174 | # save agent if needed 175 | agent_saver.save(core.agent, J_mean) 176 | 177 | agent_saver.save_curr_best_agent() 178 | print("Finished.") 179 | 180 | @contextmanager 181 | def catchtime() -> float: 182 | start = perf_counter() 183 | yield lambda: perf_counter() - start 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | # Leave unchanged 189 | run_experiment(experiment) 190 | -------------------------------------------------------------------------------- /examples/01_LSIQ/04_episode_25/launcher.py: -------------------------------------------------------------------------------- 1 | from experiment_launcher import Launcher 2 | 3 | from experiment_launcher.utils import bool_local_cluster 4 | 5 | if __name__ == '__main__': 6 | LOCAL = bool_local_cluster() 7 | TEST = False 8 | USE_CUDA = False 9 | 10 | JOBLIB_PARALLEL_JOBS = 1 # or os.cpu_count() to use all cores 11 | N_SEEDS = 5 12 | 13 | launcher = Launcher(exp_name='lsiq_25', 14 | python_file='lsiq_experiments', 15 | n_exps=N_SEEDS, 16 | joblib_n_jobs=JOBLIB_PARALLEL_JOBS, 17 | n_cores=JOBLIB_PARALLEL_JOBS * 1, 18 | memory_per_core=JOBLIB_PARALLEL_JOBS * 6000, 19 | days=2, 20 | hours=0, 21 | minutes=0, 22 | seconds=0, 23 | use_timestamp=True, 24 | ) 25 | 26 | default_params = dict(n_epochs=150, 27 | n_steps_per_epoch=10000, 28 | n_eval_episodes=10, 29 | n_steps_per_fit=1, 30 | n_epochs_save=-1, 31 | logging_iter=10000, 32 | gamma=0.99, 33 | use_cuda=USE_CUDA, 34 | tau=0.005, 35 | use_target=True, 36 | loss_mode_exp="fix", 37 | regularizer_mode="plcy", 38 | learnable_alpha=False) 39 | 40 | log_std = [(-5, 2)] 41 | envs = ["Ant-v3", 42 | "HalfCheetah-v3", 43 | "Hopper-v3", 44 | "Humanoid-v3", 45 | "Walker2d-v3"] 46 | path_to_datasets = "../../00_Datasets/25_episodes/" 47 | expert_data_filenames = ["expert_dataset_Ant-v3_6399.04_25_SAC.npz", 48 | "expert_dataset_HalfCheetah-v3_12328.78_25_SAC.npz", 49 | "expert_dataset_Hopper-v3_3299.81_25_SAC.npz", 50 | "expert_dataset_Humanoid-v3_6273.29_25_SAC.npz", 51 | "expert_dataset_Walker2d-v3_5841.73_25_SAC.npz"] 52 | 53 | expert_data_paths = [path_to_datasets + name for name in expert_data_filenames] 54 | 55 | # Ant 56 | launcher.add_experiment(env_id__=envs[0], expert_data_path=expert_data_paths[0], 57 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 58 | 59 | # HalfCheetah 60 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 61 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 62 | launcher.add_experiment(env_id__=envs[1], expert_data_path=expert_data_paths[1], 63 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=10.0, **default_params) 64 | 65 | # Hopper 66 | launcher.add_experiment(env_id__=envs[2], expert_data_path=expert_data_paths[2], 67 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 68 | 69 | # Humanoid 70 | launcher.add_experiment(env_id__=envs[3], expert_data_path=expert_data_paths[3], 71 | plcy_loss_mode__="value", init_alpha__=0.1, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 72 | 73 | # Walker2d 74 | 
launcher.add_experiment(env_id__=envs[4], expert_data_path=expert_data_paths[4], 75 | plcy_loss_mode__="value", init_alpha__=1e-3, Q_exp_loss__="MSE", reg_mult__=0.5, **default_params) 76 | 77 | 78 | launcher.run(LOCAL, TEST) 79 | -------------------------------------------------------------------------------- /examples/01_LSIQ/04_episode_25/lsiq_experiments.py: -------------------------------------------------------------------------------- 1 | import os 2 | from time import perf_counter 3 | from contextlib import contextmanager 4 | 5 | import numpy as np 6 | import torch 7 | import torch.optim as optim 8 | from torch.utils.tensorboard import SummaryWriter 9 | 10 | from mushroom_rl.core import Core 11 | from mushroom_rl.environments import Gym 12 | from mushroom_rl.utils.dataset import compute_J, compute_episodes_length 13 | from mushroom_rl.core.logger.logger import Logger 14 | 15 | 16 | from imitation_lib.imitation import LSIQ 17 | from imitation_lib.utils import FullyConnectedNetwork 18 | from imitation_lib.utils import prepare_expert_data, BestAgentSaver 19 | 20 | 21 | from experiment_launcher import run_experiment 22 | 23 | 24 | def _create_agent(mdp, expert_data, sw, lr_critic, lr_actor, plcy_loss_mode, 25 | regularizer_mode, use_target, lossQ_type, use_cuda, tau, 26 | learnable_alpha, init_alpha, reg_mult, Q_exp_loss, gamma, 27 | loss_mode_exp, log_std_min, log_std_max, delay_Q, n_fits, 28 | logging_iter): 29 | 30 | # calculate the minimum and maximum Q-function 31 | Q_max = 1.0 / (reg_mult * (1 - gamma)) 32 | Q_min = - 1.0 / (reg_mult * (1 - gamma)) 33 | 34 | # Settings 35 | initial_replay_size = 10000 36 | max_replay_size = 1000000 37 | batch_size = 256 # the real batch size is double the size as an expert batch is going to be added 38 | warmup_transitions = 15000 39 | 40 | lr_alpha = 2e-6 41 | weight_decay_actor = 0.0 42 | weight_decay_critic = 0.0 43 | 44 | target_entropy = -22.0 45 | 46 | # Approximator 47 | actor_input_shape = mdp.info.observation_space.shape 48 | actor_output_shape = (mdp.info.action_space.shape[0]*2,) 49 | actor_params = dict(network=FullyConnectedNetwork, 50 | n_features=[256, 256], 51 | input_shape=actor_input_shape, 52 | output_shape=actor_output_shape, 53 | activations=["relu", "relu", "identity"], 54 | use_cuda=use_cuda) 55 | 56 | actor_optimizer = {'class': optim.Adam, 57 | 'params': {'lr': lr_actor, 'weight_decay': weight_decay_actor}} 58 | 59 | critic_input_shape = (actor_input_shape[0] + mdp.info.action_space.shape[0],) 60 | critic_params = dict(network=FullyConnectedNetwork, 61 | optimizer={'class': optim.Adam, 62 | 'params': {'lr': lr_critic, 'weight_decay': weight_decay_critic}}, 63 | n_features=[256, 256], 64 | input_shape=critic_input_shape, 65 | activations=["relu", "relu", "identity"], 66 | squeeze_out=False, 67 | output_shape=(1,), 68 | use_cuda=use_cuda) 69 | 70 | # create IQfO agent 71 | agent = LSIQ(mdp_info=mdp.info, batch_size=batch_size, initial_replay_size=initial_replay_size, 72 | max_replay_size=max_replay_size, demonstrations=expert_data, sw=sw, use_target=use_target, 73 | warmup_transitions=warmup_transitions, tau=tau, lr_alpha=lr_alpha, actor_params=actor_params, 74 | actor_optimizer=actor_optimizer, critic_params=critic_params, delay_Q=delay_Q, lossQ_type=lossQ_type, 75 | target_entropy=target_entropy, critic_fit_params=None, plcy_loss_mode=plcy_loss_mode, 76 | regularizer_mode=regularizer_mode, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 77 | reg_mult=reg_mult, Q_min=Q_min, Q_max=Q_max, log_std_min=log_std_min, 
log_std_max=log_std_max, 78 | loss_mode_exp=loss_mode_exp, Q_exp_loss=Q_exp_loss, n_fits=n_fits, logging_iter=logging_iter) 79 | 80 | return agent 81 | 82 | 83 | def experiment(env_id: str = "HalfCheetah-v2", 84 | n_epochs: int = 500, 85 | n_steps_per_epoch: int = 10000, 86 | n_steps_per_fit: int = 1, 87 | n_eval_episodes: int = 50, 88 | n_epochs_save: int = 100, 89 | logging_iter: int = 100, 90 | expert_data_path: str = None, 91 | use_cuda: bool = False, 92 | lr_critic: float = 3e-4, 93 | lr_actor: float = 3e-5, 94 | results_dir: str = "./logs", 95 | plcy_loss_mode: str = "value", 96 | regularizer_mode: str = "exp_and_plcy", 97 | reg_mult: float = 0.5, 98 | Q_exp_loss: str = "MSE", 99 | n_fits: int = 1, 100 | loss_mode_exp: str = "fix", 101 | log_std_min: float = -5.0, 102 | log_std_max: float = 2.0, 103 | learnable_alpha: bool = False, 104 | use_target: bool = True, 105 | init_alpha: float = 0.001, 106 | tau: float = 0.005, 107 | delay_Q: int = 1, 108 | lossQ_type: str = "sqil_like", 109 | gamma: float = 0.99, 110 | horizon: int = 1000, 111 | seed: int = 0): 112 | 113 | np.random.seed(seed) 114 | torch.random.manual_seed(seed) 115 | 116 | logger_stoch = Logger(results_dir=results_dir, log_name="stochastic_logging", seed=seed, append=True) 117 | logger_deter = Logger(results_dir=results_dir, log_name="deterministic_logging", seed=seed, append=True) 118 | 119 | results_dir = os.path.join(results_dir, str(seed)) 120 | 121 | env_params = dict(name=env_id, horizon=horizon, gamma=gamma) 122 | 123 | mdp = Gym(**env_params) 124 | 125 | # load expert data 126 | expert_data = prepare_expert_data(data_path=expert_data_path) 127 | 128 | # logging stuff 129 | tb_writer = SummaryWriter(log_dir=results_dir) 130 | agent_saver = BestAgentSaver(save_path=results_dir, n_epochs_save=n_epochs_save) 131 | 132 | # create agent and core 133 | agent = _create_agent(mdp, expert_data, sw=tb_writer, lr_critic=lr_critic, lr_actor=lr_actor, 134 | plcy_loss_mode=plcy_loss_mode, regularizer_mode=regularizer_mode, 135 | use_cuda=use_cuda, use_target=use_target, lossQ_type=lossQ_type, 136 | delay_Q=delay_Q, tau=tau, learnable_alpha=learnable_alpha, init_alpha=init_alpha, 137 | reg_mult=reg_mult, gamma=gamma, Q_exp_loss=Q_exp_loss, 138 | loss_mode_exp=loss_mode_exp, log_std_min=log_std_min, 139 | n_fits=n_fits, log_std_max=log_std_max, logging_iter=logging_iter) 140 | 141 | core = Core(agent, mdp) 142 | 143 | # iqfo train loop 144 | for epoch in range(n_epochs): 145 | with catchtime() as t: 146 | # training 147 | core.learn(n_steps=n_steps_per_epoch, n_steps_per_fit=n_steps_per_fit, quiet=True) 148 | print('Epoch %d | Time %fs ' % (epoch + 1, float(t()))) 149 | 150 | # evaluate with deterministic policy 151 | agent.policy.use_mean = True 152 | dataset = core.evaluate(n_episodes=n_eval_episodes) 153 | R_mean = np.mean(compute_J(dataset)) 154 | J_mean = np.mean(compute_J(dataset, gamma=gamma)) 155 | L = np.mean(compute_episodes_length(dataset)) 156 | logger_deter.log_numpy(Epoch=epoch, R_mean=R_mean, J_mean=J_mean, L=L) 157 | tb_writer.add_scalar("Eval_R-deterministic", R_mean, epoch) 158 | tb_writer.add_scalar("Eval_J-deterministic", J_mean, epoch) 159 | tb_writer.add_scalar("Eval_L-deterministic", L, epoch) 160 | agent.policy.use_mean = False 161 | 162 | # evaluate with stochastic policy 163 | dataset = core.evaluate(n_episodes=n_eval_episodes) 164 | R_mean_stoch = np.mean(compute_J(dataset)) 165 | J_mean_stoch = np.mean(compute_J(dataset, gamma=gamma)) 166 | L = np.mean(compute_episodes_length(dataset)) 167 | 
logger_stoch.log_numpy(Epoch=epoch, R_mean=R_mean_stoch, J_mean=J_mean_stoch, L=L) 168 | tb_writer.add_scalar("Eval_R-stochastic", R_mean_stoch, epoch) 169 | tb_writer.add_scalar("Eval_J-stochastic", J_mean_stoch, epoch) 170 | tb_writer.add_scalar("Eval_L-stochastic", L, epoch) 171 | 172 | print("R_mean (deter): %f | R_mean (stoch): %f" % (R_mean, R_mean_stoch)) 173 | 174 | # save agent if needed 175 | agent_saver.save(core.agent, J_mean) 176 | 177 | agent_saver.save_curr_best_agent() 178 | print("Finished.") 179 | 180 | @contextmanager 181 | def catchtime() -> float: 182 | start = perf_counter() 183 | yield lambda: perf_counter() - start 184 | 185 | 186 | if __name__ == "__main__": 187 | 188 | # Leave unchanged 189 | run_experiment(experiment) 190 | -------------------------------------------------------------------------------- /img/Divergence_Minimization.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robfiras/ls-iq/b097abd97f5e51c16d583bc9805cb40fb8e2ac01/img/Divergence_Minimization.gif -------------------------------------------------------------------------------- /imitation_lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robfiras/ls-iq/b097abd97f5e51c16d583bc9805cb40fb8e2ac01/imitation_lib/__init__.py -------------------------------------------------------------------------------- /imitation_lib/imitation/__init__.py: -------------------------------------------------------------------------------- 1 | from .gail_TRPO import GAIL as GAIL_TRPO 2 | from .vail_TRPO import VAIL as VAIL_TRPO 3 | from .iq_sac import IQ_SAC 4 | from .iqfo_orig import IQfO_ORIG 5 | from .sqil_sac import SQIL 6 | 7 | from .lsiq import LSIQ 8 | from .lsiq_h import LSIQ_H 9 | from .lsiq_hc import LSIQ_HC 10 | 11 | from .iqfo_sac import IQfO_SAC 12 | from .lsiqfo import LSIQfO 13 | from .lsiqfo_h import LSIQfO_H 14 | from .lsiqfo_hc import LSIQfO_HC 15 | 16 | 17 | from .offline import IQ_Offline, LSIQ_Offline, LSIQ_Offline_DM, BehavioralCloning 18 | __all__ = ['GAIL_TRPO', 'VAIL_TRPO', 'IQ_SAC', 'IQfO_SAC', 'IQfO_ORIG', 19 | 'LSIQ', 'SQIL', 'LSIQfO', 'LSIQ_H','LSIQ_HC', 'LSIQfO_HC', 20 | 'LSIQfO_H', "IQ_Offline", "LSIQ_Offline", "LSIQ_Offline_DM", "BehavioralCloning"] 21 | -------------------------------------------------------------------------------- /imitation_lib/imitation/iqfo_orig.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from mushroom_rl.approximators import Regressor 5 | from mushroom_rl.approximators.parametric import TorchApproximator 6 | from imitation_lib.imitation.iq_sac import IQ_SAC 7 | from mushroom_rl.utils.minibatches import minibatch_generator 8 | from mushroom_rl.utils.torch import to_float_tensor 9 | 10 | 11 | class IQfO_ORIG(IQ_SAC): 12 | 13 | 14 | def fit(self, dataset): 15 | 16 | # add to replay memory 17 | self._replay_memory.add(dataset) 18 | 19 | if self._replay_memory.initialized: 20 | 21 | # sample batch from policy replay buffer 22 | state, action, reward, next_state, absorbing, _ = \ 23 | self._replay_memory.get(self._batch_size()) 24 | 25 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 26 | demo_obs, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 27 | self._demonstrations["states"], 28 | self._demonstrations["next_states"], 29 | 
self._demonstrations["absorbing"])) 30 | 31 | # the action by the expert is predicted by the policy 32 | with torch.no_grad(): 33 | demo_act, _ = self.policy.compute_action_and_log_prob_t(demo_obs) 34 | demo_act = demo_act.detach().numpy() 35 | 36 | # prepare data for IQ update 37 | input_states = to_float_tensor(np.concatenate([state, demo_obs.astype(np.float32)])) 38 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 39 | input_n_states = to_float_tensor(np.concatenate([next_state, demo_nobs.astype(np.float32)])) 40 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 41 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 42 | torch.ones(len(state), dtype=torch.bool)]) 43 | # make IQ update 44 | loss1, loss2, chi2_loss = self._lossQ(input_states, input_actions, input_n_states, input_absorbing, 45 | is_expert) 46 | self._sw.add_scalar('IQ-Loss/Loss1', loss1, self._iter) 47 | self._sw.add_scalar('IQ-Loss/Loss2', loss2, self._iter) 48 | self._sw.add_scalar('IQ-Loss/Chi2 Loss', chi2_loss, self._iter) 49 | self._sw.add_scalar('IQ-Loss/Alpha', self._alpha, self._iter) 50 | 51 | # update policy 52 | if self._replay_memory.size > self._warmup_transitions() and self._iter % self._delay_pi == 0: 53 | action_new, log_prob = self.policy.compute_action_and_log_prob_t(input_states) 54 | loss = self._actor_loss(input_states, action_new, log_prob) 55 | self._optimize_actor_parameters(loss) 56 | grads = [] 57 | for param in self.policy._approximator.model.network.parameters(): 58 | grads.append(param.grad.view(-1)) 59 | grads = torch.cat(grads) 60 | norm = grads.norm(dim=0, p=2) 61 | self._sw.add_scalar('Gradients/Norm2 Gradient Q wrt. Pi-parameters', norm, 62 | self._iter) 63 | self._sw.add_scalar('Actor/Loss', loss, self._iter) 64 | self._sw.add_scalar('Actor/Entropy', torch.mean(-log_prob).detach().item(), self._iter) 65 | if self._learnable_alpha: 66 | self._update_alpha(log_prob.detach()) 67 | 68 | self._update_target(self._critic_approximator, 69 | self._target_critic_approximator) 70 | 71 | self._iter += 1 72 | 73 | -------------------------------------------------------------------------------- /imitation_lib/imitation/iqfo_sac.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from copy import deepcopy 5 | 6 | from mushroom_rl.core import Serializable 7 | from mushroom_rl.approximators import Regressor 8 | from mushroom_rl.approximators.parametric import TorchApproximator 9 | from imitation_lib.imitation.iq_sac import IQ_SAC 10 | from mushroom_rl.utils.minibatches import minibatch_generator 11 | from mushroom_rl.utils.torch import to_float_tensor 12 | from mushroom_rl.utils.parameters import to_parameter 13 | from imitation_lib.utils.action_models import GaussianInvActionModel, LearnableVarGaussianInvActionModel,\ 14 | GCPActionModel, KLGCPActionModel, KLGaussianInvActionModel 15 | 16 | from imitation_lib.utils.distributions import InverseGamma 17 | 18 | 19 | class IQfO_SAC(IQ_SAC): 20 | 21 | def __init__(self, action_model, action_model_params, action_model_fit_params=None, action_model_noise_std=0.0, 22 | action_model_noise_clip=None, add_noise_to_obs=False, ext_normalizer_action_model=None, 23 | interpolate_expert_states=False, interpolation_coef=1.0, **kwargs): 24 | 25 | super().__init__(**kwargs) 26 | 27 | if action_model == GaussianInvActionModel or action_model == 
GCPActionModel \ 28 | or action_model == KLGCPActionModel or action_model == KLGaussianInvActionModel: 29 | action_model_params.setdefault("min_a", self.mdp_info.action_space.low) 30 | action_model_params.setdefault("max_a", self.mdp_info.action_space.high) 31 | action_model_params.setdefault("use_cuda", self._use_cuda) 32 | elif action_model == LearnableVarGaussianInvActionModel: 33 | action_model_params.setdefault("use_cuda", self._use_cuda) 34 | 35 | # setup the action model 36 | self._action_model = action_model(**action_model_params, demonstration=self._demonstrations) 37 | 38 | self._action_model_fit_params = dict(fits_per_step=1, init_epochs=0, )\ 39 | if action_model_fit_params is None else action_model_fit_params 40 | self._action_model_initialized = True if self._action_model_fit_params["init_epochs"] > 0 else False 41 | self._action_model_batch_size = action_model_params["batch_size"] 42 | 43 | self._action_model_noise_std = action_model_noise_std 44 | self._action_model_noise_clip = action_model_noise_clip 45 | self.ext_normalizer_action_model = ext_normalizer_action_model 46 | self._add_noise_to_obs = add_noise_to_obs 47 | self._interpolate_expert_states = interpolate_expert_states 48 | self._interpolation_coef = interpolation_coef 49 | 50 | self._add_save_attr( 51 | _action_model='mushroom', 52 | _action_model_fit_params='pickle', 53 | _action_model_noise_std='primitive', 54 | _action_model_noise_clip='primitive', 55 | ext_normalizer_action_model='pickle', 56 | _add_noise_to_obs='primitive' 57 | ) 58 | 59 | def fit(self, dataset): 60 | 61 | # add to replay memory 62 | self._replay_memory.add(dataset) 63 | 64 | if self._replay_memory.initialized: 65 | 66 | # train the action model 67 | if not self._action_model_initialized: 68 | self.train_action_model(init=True) 69 | self._action_model_initialized = True 70 | else: 71 | self.train_action_model() 72 | 73 | # sample batch from policy replay buffer 74 | state, action, reward, next_state, absorbing, _ = \ 75 | self._replay_memory.get(self._batch_size()) 76 | 77 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 78 | demo_obs, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 79 | self._demonstrations["states"], 80 | self._demonstrations["next_states"], 81 | self._demonstrations["absorbing"])) 82 | 83 | # predict the actions for our expert dataset 84 | demo_obs_act = demo_obs.astype(np.float32)[:, self._state_mask] 85 | demo_nobs_act = demo_nobs.astype(np.float32)[:, self._state_mask] 86 | demo_act = self._action_model.draw_action(to_float_tensor(demo_obs_act), 87 | to_float_tensor(demo_nobs_act)) 88 | 89 | if self._add_noise_to_obs: 90 | assert self.ext_normalizer_action_model is not None, "Normalizer is needed to be defined." 
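# Annotation (descriptive, not in the original source): the expert observations below are mapped
# into the normalizer's space, perturbed with clipped Gaussian noise, and mapped back afterwards,
# presumably so that a single noise scale is meaningful across all observation dimensions.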
91 | 92 | demo_obs = self.ext_normalizer_action_model(demo_obs) 93 | demo_nobs = self.ext_normalizer_action_model(demo_nobs) 94 | demo_obs += self._get_noise(demo_obs) 95 | demo_nobs += self._get_noise(demo_nobs) 96 | demo_obs = self.ext_normalizer_action_model.inv(demo_obs) 97 | demo_nobs = self.ext_normalizer_action_model.inv(demo_nobs) 98 | 99 | # make interpolation if needed 100 | if self._interpolate_expert_states: 101 | demo_obs = self.interpolate(demo_obs[:, self._state_mask], state[:, self._state_mask], 102 | mixing_coef=self._interpolation_coef) 103 | demo_act = self.interpolate(demo_act, action, 104 | mixing_coef=self._interpolation_coef) 105 | 106 | # prepare data for IQ update 107 | input_states = to_float_tensor(np.concatenate([state, 108 | demo_obs.astype(np.float32)[:, self._state_mask]])) 109 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 110 | input_n_states = to_float_tensor(np.concatenate([next_state, 111 | demo_nobs.astype(np.float32)[:, self._state_mask]])) 112 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 113 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 114 | torch.ones(len(state), dtype=torch.bool)]) 115 | 116 | # make IQ update 117 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 118 | 119 | self._iter += 1 120 | self.policy.iter += 1 121 | 122 | def _get_noise(self, x): 123 | noise = np.random.normal(loc=0.0, scale=self._action_model_noise_std, 124 | size=np.size(x)).reshape(x.shape) 125 | noise = np.clip(noise, -self._action_model_noise_clip, self._action_model_noise_clip) \ 126 | if self._action_model_noise_clip is not None else noise 127 | return noise 128 | 129 | def interpolate(self, expert_data, policy_data, mixing_coef=None): 130 | interpolated = mixing_coef * expert_data + (1 - mixing_coef) * policy_data 131 | return interpolated 132 | 133 | def train_action_model(self, init=False): 134 | 135 | if init and self._action_model_fit_params["init_epochs"] > 0: 136 | n_epochs = self._action_model_fit_params["init_epochs"] 137 | # initialize the model 138 | state, action, _, next_state, _, _ = self._replay_memory.get(self._replay_memory.size) 139 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 140 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 141 | state_train = state[0:int(len(state)*0.9), :] 142 | state_val = state[int(len(state)*0.9):, :] 143 | next_state_train = next_state[0:int(len(next_state)*0.9), :] 144 | next_state_val = next_state[int(len(next_state)*0.9):, :] 145 | action_train = action[0:int(len(next_state)*0.9), :] 146 | action_val = action[int(len(next_state)*0.9):, :] 147 | state_nstate_train = np.concatenate([state_train, next_state_train], axis=1) 148 | state_nstate_val = np.concatenate([state_val, next_state_val], axis=1) 149 | 150 | # make eval before training 151 | action_pred = self._action_model(state_nstate_val) 152 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 153 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 154 | print("Action Model Validation Loss before training: ", loss) 155 | action_pred = self._action_model(state_nstate_train) 156 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 157 | print("Action Model Training Loss before training: ", loss) 158 | w = 
self._action_model.get_weights() 159 | norm = np.linalg.norm(w) 160 | self.sw_add_scalar("Action-Model/Norm", norm, self._iter) 161 | 162 | # make training 163 | self._action_model.fit(state_nstate_train, action_train, n_epochs=n_epochs) 164 | 165 | # make eval after training 166 | action_pred = self._action_model(state_nstate_val) 167 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 168 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 169 | print("Action Model Validation Loss After training: ", loss) 170 | action_pred = self._action_model(state_nstate_train) 171 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 172 | print("Action Model Validation Loss After training: ", loss) 173 | 174 | else: 175 | state_nstates = [] 176 | actions = [] 177 | for i in range(self._action_model_fit_params["fits_per_step"]): 178 | # sample batch from policy replay buffer 179 | state, action, reward, next_state, absorbing, _ = \ 180 | self._replay_memory.get(self._action_model_batch_size) 181 | 182 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 183 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 184 | self._action_model.fit(state, next_state, action) 185 | 186 | state_nstates.append([state, next_state]) 187 | actions.append(action) 188 | 189 | if self._iter % self._logging_iter == 0: 190 | 191 | # sample batch from policy replay buffer 192 | states, actions, rewards, next_states, absorbings, _ = \ 193 | self._replay_memory.get(self._action_model_batch_size) 194 | 195 | # we need to check if we have a dataset with expert actions available or not 196 | try: 197 | exp_states, exp_next_states, exp_actions = next( 198 | minibatch_generator(self._action_model_batch_size, 199 | self._demonstrations["states"], 200 | self._demonstrations["next_states"], 201 | self._demonstrations["actions"])) 202 | except KeyError: 203 | exp_states, exp_next_states = next(minibatch_generator(self._action_model_batch_size, 204 | self._demonstrations["states"], 205 | self._demonstrations["next_states"])) 206 | exp_actions = None 207 | 208 | # log mse 209 | action_pred = self._action_model(states[:, self._state_mask], next_states[:, self._state_mask]) 210 | mse = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(actions)) 211 | self.sw_add_scalar('Action-Model/Loss Policy', mse, self._iter) 212 | if exp_actions is not None: 213 | action_pred_exp = self._action_model(exp_states[:, self._state_mask], 214 | exp_next_states[:, self._state_mask]) 215 | mse_exp = F.mse_loss(to_float_tensor(action_pred_exp), to_float_tensor(exp_actions)) 216 | self.sw_add_scalar('Action-Model/Loss Exp', mse_exp, self._iter) 217 | 218 | # log entropy 219 | ent_plcy = self._action_model.entropy(states[:, self._state_mask], 220 | next_states[:, self._state_mask]) 221 | ent_exp = self._action_model.entropy(exp_states[:, self._state_mask], 222 | exp_next_states[:, self._state_mask]) 223 | self.sw_add_scalar('Action-Model/Entropy Plcy', ent_plcy, 224 | self._iter) 225 | self.sw_add_scalar('Action-Model/Entropy Exp', ent_exp, 226 | self._iter) 227 | 228 | # log mu, lam, alpha, beta 229 | if type(self._action_model) == GCPActionModel or type(self._action_model) == KLGCPActionModel: 230 | mu, lam, alpha, beta = self._action_model.get_prior_params(states[:, self._state_mask], 231 | next_states[:, self._state_mask]) 232 | self.sw_add_scalar('Action-Model/Mu', 
np.mean(mu.detach().cpu().numpy()), self._iter) 233 | self.sw_add_scalar('Action-Model/Lambda', np.mean(lam.detach().cpu().numpy()), self._iter) 234 | self.sw_add_scalar('Action-Model/Lambda Counter', self._action_model.lam_counter, self._iter) 235 | self.sw_add_scalar('Action-Model/Alpha', np.mean(alpha.detach().cpu().numpy()), self._iter) 236 | self.sw_add_scalar('Action-Model/Beta', np.mean(beta.detach().cpu().numpy()), self._iter) 237 | self.sw_add_scalar('Action-Model/Var', 238 | np.mean(self._action_model.get_corrected_pred_var(lam, 239 | alpha, 240 | beta).detach().cpu().numpy()), 241 | self._iter) 242 | mu_exp, lam_exp, alpha_exp, beta_exp = \ 243 | self._action_model.get_prior_params(exp_states[:, self._state_mask], 244 | exp_next_states[:, self._state_mask]) 245 | self.sw_add_scalar('Action-Model/Mu Exp', np.mean(mu_exp.detach().cpu().numpy()), self._iter) 246 | self.sw_add_scalar('Action-Model/Lambda Exp', np.mean(lam_exp.detach().cpu().numpy()), self._iter) 247 | self.sw_add_scalar('Action-Model/Alpha Exp', np.mean(alpha_exp.detach().cpu().numpy()), self._iter) 248 | self.sw_add_scalar('Action-Model/Beta Exp', np.mean(beta_exp.detach().cpu().numpy()), self._iter) 249 | self.sw_add_scalar('Action-Model/Var Exp', 250 | np.mean(self._action_model.get_corrected_pred_var(lam_exp, 251 | alpha_exp, 252 | beta_exp).detach().cpu().numpy()), 253 | self._iter) 254 | elif type(self._action_model) == GaussianInvActionModel or \ 255 | type(self._action_model) == KLGaussianInvActionModel: 256 | mu, log_sigma = self._action_model.get_mu_log_sigma(state[:, self._state_mask], 257 | next_state[:, self._state_mask]) 258 | mu_exp, log_sigma_exp = self._action_model.get_mu_log_sigma(exp_states.astype(np.float32)[:, self._state_mask], 259 | exp_next_states.astype(np.float32)[:, self._state_mask]) 260 | 261 | self._sw.add_scalar('Action-Model/Std Exp', torch.mean(torch.exp(log_sigma_exp)), self._iter) 262 | self._sw.add_scalar('Action-Model/Std', torch.mean(torch.exp(log_sigma)), self._iter) 263 | self._sw.add_scalar('Action-Model/Mu Exp', torch.mean(mu_exp), self._iter) 264 | self._sw.add_scalar('Action-Model/Mu', torch.mean(mu), self._iter) 265 | 266 | # log norm 267 | #w = self._action_model.get_weights() 268 | #norm = np.linalg.norm(w) 269 | #self.sw_add_scalar("Action-Model/Norm", norm, self._iter) 270 | -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from .iq_sac import IQ_SAC 4 | import torch.nn.functional as F 5 | 6 | from mushroom_rl.utils.torch import to_float_tensor 7 | 8 | 9 | class LSIQ(IQ_SAC): 10 | 11 | def __init__(self, Q_max=1.0, Q_min =-1.0, loss_mode_exp="fix", Q_exp_loss=None, 12 | treat_absorbing_states=False, target_clipping=True, lossQ_type="iq_like", **kwargs): 13 | 14 | # call parent 15 | super(LSIQ, self).__init__(**kwargs) 16 | 17 | self._Q_max = Q_max 18 | self._Q_min = Q_min 19 | self._loss_mode_exp = loss_mode_exp # or bootstrap 20 | self._Q_exp_loss = Q_exp_loss 21 | self._treat_absorbing_states = treat_absorbing_states 22 | self._target_clipping = target_clipping 23 | self._lossQ_type = lossQ_type 24 | 25 | def _lossQ(self, obs, act, next_obs, absorbing, is_expert): 26 | if self._lossQ_type == "sqil_like": 27 | return self._lossQ_sqil_like(obs, act, next_obs, absorbing, is_expert) 28 | elif self._lossQ_type == "iq_like": 29 | return self._lossQ_iq_like(obs, act, next_obs, 
absorbing, is_expert) 30 | else: 31 | raise ValueError("Unsupported lossQ type %s" % self._lossQ_type) 32 | 33 | def _lossQ_iq_like(self, obs, act, next_obs, absorbing, is_expert): 34 | 35 | # 1st expert term of loss 36 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 37 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 38 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 39 | if not self._use_target: 40 | next_v = self.getV(next_obs) 41 | else: 42 | with torch.no_grad(): 43 | next_v = self.get_targetV(next_obs).detach() 44 | absorbing = torch.unsqueeze(absorbing, 1) 45 | 46 | if self._target_clipping: 47 | y = (1 - absorbing) * gamma.detach() * torch.clip(next_v, self._Q_min, self._Q_max) 48 | else: 49 | y = (1 - absorbing) * gamma.detach() * next_v 50 | 51 | reward = (current_Q - y) 52 | exp_reward = reward[is_expert] 53 | 54 | if self._loss_mode_exp == "bootstrap": 55 | loss_term1 = - exp_reward.mean() 56 | elif self._loss_mode_exp == "fix": 57 | if self._Q_exp_loss == "MSE": 58 | loss_term1 = F.mse_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 59 | elif self._Q_exp_loss == "Huber": 60 | loss_term1 = F.huber_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 61 | elif self._Q_exp_loss is None: 62 | raise ValueError("If you choose loss_mode_exp == fix, you have to specify Q_exp_loss. Setting it to" 63 | "None is not valid.") 64 | else: 65 | raise ValueError( 66 | "Choosen Q_exp_loss %s is not supported. Choose either MSE or Huber." % self._Q_exp_loss) 67 | 68 | # do the logging 69 | self.logging_loss(current_Q, y, reward, is_expert, obs, act, absorbing) 70 | 71 | # 2nd policy term for our loss 72 | V = self.getV(obs) 73 | value = (V - y) 74 | self.sw_add_scalar('V for policy on all states', V.mean(), self._iter) 75 | value_loss = value 76 | if self._plcy_loss_mode == "value": 77 | loss_term2 = value_loss.mean() 78 | elif self._plcy_loss_mode == "value_expert": 79 | value_loss_exp = value_loss[is_expert] 80 | loss_term2 = value_loss_exp.mean() 81 | elif self._plcy_loss_mode == "value_policy": 82 | value_loss_plcy = value_loss[~is_expert] 83 | loss_term2 = value_loss_plcy.mean() 84 | elif self._plcy_loss_mode == "q_old_policy": 85 | reward_plcy = reward[~is_expert] 86 | loss_term2 = reward_plcy.mean() 87 | elif self._plcy_loss_mode == "value_q_old_policy": 88 | reward_plcy = reward[~is_expert] 89 | loss_term2 = reward_plcy.mean() + value_loss.mean() 90 | elif self._plcy_loss_mode == "v0": 91 | value_loss_v0 = (1-gamma.detach()) * self.getV(obs[is_expert]) 92 | loss_term2 = value_loss_v0.mean() 93 | elif self._plcy_loss_mode == "off": 94 | loss_term2 = 0.0 95 | else: 96 | raise ValueError("Undefined policy loss mode: %s" % self._plcy_loss_mode) 97 | 98 | # regularize 99 | chi2_loss = self.regularizer_loss(absorbing, reward, gamma, is_expert, treat_absorbing_states=self._treat_absorbing_states) 100 | 101 | loss_Q = loss_term1 + loss_term2 + chi2_loss 102 | self.update_Q_parameters(loss_Q) 103 | 104 | if self._iter % self._logging_iter == 0: 105 | grads = [] 106 | for param in self._critic_approximator.model.network.parameters(): 107 | grads.append(param.grad.view(-1)) 108 | grads = torch.cat(grads) 109 | norm = grads.norm(dim=0, p=2) 110 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. 
Q-parameters', norm, self._iter) 111 | 112 | return loss_term1, loss_term2, chi2_loss 113 | 114 | def _lossQ_sqil_like(self, obs, act, next_obs, absorbing, is_expert): 115 | 116 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 117 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 118 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 119 | if not self._use_target: 120 | next_v = self.getV(next_obs) 121 | else: 122 | with torch.no_grad(): 123 | next_v = self.get_targetV(next_obs).detach() 124 | absorbing = torch.unsqueeze(absorbing, 1) 125 | if self._target_clipping: 126 | y = (1 - absorbing) * gamma.detach() * torch.clip(next_v, self._Q_min, self._Q_max) 127 | else: 128 | y = (1 - absorbing) * gamma.detach() * next_v 129 | 130 | # define the rewards 131 | if self._treat_absorbing_states: 132 | r_max = (1 - absorbing) * ((1 / self._reg_mult)) \ 133 | + absorbing * (1 / (1 - gamma.detach())) * ((1 / self._reg_mult)) 134 | r_min = (1 - absorbing) * (-(1 / self._reg_mult))\ 135 | + absorbing * (1 / (1 - gamma.detach())) * (-(1 / self._reg_mult)) 136 | else: 137 | r_max = torch.ones_like(absorbing) * ((1 / self._reg_mult)) 138 | r_min = torch.ones_like(absorbing) * (-(1 / self._reg_mult)) 139 | 140 | r_max = r_max[is_expert] 141 | r_min = r_min[~is_expert] 142 | 143 | # expert part 144 | if self._loss_mode_exp == "bootstrap": 145 | if self._Q_exp_loss == "MSE": 146 | loss_term1 = torch.mean(torch.square(current_Q[is_expert] - (r_max + y[is_expert]))) 147 | elif self._Q_exp_loss == "Huber": 148 | loss_term1 = F.huber_loss(current_Q[is_expert], (r_max + y[is_expert])) 149 | else: 150 | raise ValueError("Unknown loss.") 151 | elif self._loss_mode_exp == "fix": 152 | if self._Q_exp_loss == "MSE": 153 | loss_term1 = F.mse_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 154 | elif self._Q_exp_loss == "Huber": 155 | loss_term1 = F.huber_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 156 | else: 157 | raise ValueError("Unknown loss.") 158 | else: 159 | raise ValueError("Unknown expert loss mode.") 160 | 161 | # policy part 162 | if self._plcy_loss_mode == "value": 163 | value = self.getV(obs) 164 | target = y 165 | r_min = torch.concat([r_min, torch.ones_like(r_min) * (-(1 / self._reg_mult))]) 166 | elif self._plcy_loss_mode == "value_plcy": 167 | value = self.getV(obs[~is_expert]) 168 | target = y[~is_expert] 169 | elif self._plcy_loss_mode == "q_old_policy": 170 | value = current_Q[~is_expert] 171 | target = y[~is_expert] 172 | 173 | if self._Q_exp_loss == "MSE": 174 | loss_term2 = torch.mean(torch.square(value - (r_min + target))) 175 | elif self._Q_exp_loss == "Huber": 176 | loss_term2 = F.huber_loss(value, (r_min + target)) 177 | else: 178 | raise ValueError("Unknown loss.") 179 | 180 | # do the logging 181 | reward = (current_Q - y) 182 | self.logging_loss(current_Q, y, reward, is_expert, obs, act, absorbing) 183 | 184 | loss_Q = loss_term1 + loss_term2 185 | self.update_Q_parameters(loss_Q) 186 | 187 | grads = [] 188 | for param in self._critic_approximator.model.network.parameters(): 189 | grads.append(param.grad.view(-1)) 190 | grads = torch.cat(grads) 191 | norm = grads.norm(dim=0, p=2) 192 | if self._iter % self._logging_iter == 0: 193 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. 
Q-parameters', norm, self._iter) 194 | 195 | return loss_term1, loss_term2, 0.0 -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiq_h.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import torch 3 | import numpy as np 4 | from .lsiq import LSIQ 5 | import torch.nn.functional as F 6 | from mushroom_rl.approximators import Regressor 7 | from mushroom_rl.approximators.parametric import TorchApproximator 8 | 9 | from mushroom_rl.utils.torch import to_float_tensor 10 | 11 | 12 | class LSIQ_H(LSIQ): 13 | 14 | def __init__(self, H_params=None, clip_expert_entropy_to_policy_max=True , 15 | max_H_policy_tau_down = 1e-4, max_H_policy_tau_up = 1e-2, **kwargs): 16 | 17 | # call parent 18 | super().__init__(**kwargs) 19 | 20 | # define the H function with the target 21 | target_H_params = deepcopy(H_params) 22 | self._H_approximator = Regressor(TorchApproximator, 23 | **H_params) 24 | self._target_H_approximator = Regressor(TorchApproximator, 25 | **target_H_params) 26 | self._clip_expert_entropy_to_policy_max = clip_expert_entropy_to_policy_max 27 | self._max_H_policy = None 28 | self._max_H_policy_tau_down = max_H_policy_tau_down 29 | self._max_H_policy_tau_up = max_H_policy_tau_up 30 | 31 | # define the optimizer for the H function 32 | net_params = self._H_approximator.model.network.parameters() 33 | self._H_optimizer = H_params["optimizer"]["class"](net_params, **H_params["optimizer"]["params"]) 34 | 35 | def _lossQ_iq_like(self, obs, act, next_obs, absorbing, is_expert): 36 | 37 | # update Q according to lsiq_update 38 | loss_term1, loss_term2, chi2_loss = super()._lossQ_iq_like(obs, act, next_obs, absorbing, is_expert) 39 | 40 | # update the H function 41 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 42 | self.update_H_function(obs, act, next_obs, absorbing, gamma.detach(), is_expert) 43 | 44 | return loss_term1, loss_term2, chi2_loss 45 | 46 | def _lossQ_sqil_like(self, obs, act, next_obs, absorbing, is_expert): 47 | 48 | # update Q according to lsiq_update 49 | loss_term1, loss_term2, chi2_loss = super(LSIQ_H, self)._lossQ_sqil_like(obs, act, next_obs, absorbing, is_expert) 50 | 51 | # update the H function 52 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 53 | self.update_H_function(obs, act, next_obs, absorbing, gamma.detach(), is_expert) 54 | 55 | return loss_term1, loss_term2, chi2_loss 56 | 57 | def update_H_function(self, obs, action, next_obs, absorbing, gamma, is_expert): 58 | H = self._H_approximator(obs, action, output_tensor=True) 59 | with torch.no_grad(): 60 | next_action, log_pi = self.policy.compute_action_and_log_prob_t(next_obs) 61 | 62 | # restrict the target H of the expert to the maximum one of the policy 63 | neg_log_pi = -log_pi 64 | if self._clip_expert_entropy_to_policy_max: 65 | if self._max_H_policy is None: 66 | self._max_H_policy = torch.max(neg_log_pi[~is_expert]) 67 | else: 68 | curr_max_H_policy = torch.max(neg_log_pi[~is_expert]) 69 | if curr_max_H_policy > self._max_H_policy: 70 | self._max_H_policy = (1 - self._max_H_policy_tau_up) * self._max_H_policy + \ 71 | self._max_H_policy_tau_up * curr_max_H_policy 72 | else: 73 | self._max_H_policy = (1 - self._max_H_policy_tau_down) * self._max_H_policy + \ 74 | self._max_H_policy_tau_down * curr_max_H_policy 75 | neg_log_pi[is_expert] = 
torch.clip(neg_log_pi[is_expert], self._max_H_policy, 100000) 76 | 77 | next_H = (self._target_H_approximator(next_obs, next_action, output_tensor=True).detach() + 78 | self._alpha.detach() * torch.unsqueeze(neg_log_pi, 1)) 79 | target_H = (1 - absorbing) * gamma * next_H 80 | 81 | # clip the target for numerical stability 82 | target_H = torch.clip(target_H, -10000, 1000) 83 | loss_H = F.mse_loss(H, target_H) 84 | 85 | self._H_optimizer.zero_grad() 86 | loss_H.backward() 87 | self._H_optimizer.step() 88 | 89 | H = H.detach().cpu().numpy() 90 | log_pi = log_pi.detach().cpu().numpy() 91 | 92 | # do some additional logging 93 | if self._iter % self._logging_iter == 0: 94 | self.sw_add_scalar('H function/Loss', loss_H, self._iter) 95 | self.sw_add_scalar('H function/H', np.mean(H), self._iter) 96 | self.sw_add_scalar('H function/H plcy', np.mean(H[~is_expert]), self._iter) 97 | self.sw_add_scalar('H function/H expert', np.mean(H[is_expert]), self._iter) 98 | self.sw_add_scalar('H function/H_step', np.mean(-log_pi), self._iter) 99 | self.sw_add_scalar('H function/H_step plcy', np.mean(-log_pi[~is_expert]), self._iter) 100 | self.sw_add_scalar('H function/H_step expert', np.mean(-log_pi[is_expert]), self._iter) 101 | 102 | return loss_H, H, log_pi 103 | 104 | def _actor_loss(self, state, action_new, log_prob): 105 | q = self._critic_approximator(state, action_new, output_tensor=True) 106 | H = self._H_approximator(state, action_new, output_tensor=True) 107 | soft_q = q + H 108 | return (self._alpha.detach() * log_prob - soft_q).mean() 109 | 110 | def getV(self, obs): 111 | with torch.no_grad(): 112 | action, _ = self.policy.compute_action_and_log_prob_t(obs) 113 | current_V = self._critic_approximator(obs, action.detach().cpu().numpy(), output_tensor=True) 114 | return current_V 115 | 116 | def get_targetV(self, obs): 117 | with torch.no_grad(): 118 | action, _ = self.policy.compute_action_and_log_prob_t(obs) 119 | target_V = self._target_critic_approximator(obs, action.detach().cpu().numpy(), output_tensor=True) 120 | return target_V 121 | 122 | def _update_all_targets(self): 123 | self._update_target(self._critic_approximator, 124 | self._target_critic_approximator) 125 | self._update_target(self._H_approximator, 126 | self._target_H_approximator) 127 | -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiq_hc.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import torch 3 | import numpy as np 4 | from .lsiq_h import LSIQ_H 5 | import torch.nn.functional as F 6 | from mushroom_rl.utils.parameters import to_parameter 7 | from mushroom_rl.approximators import Regressor 8 | from mushroom_rl.approximators.parametric import TorchApproximator 9 | 10 | from mushroom_rl.utils.torch import to_float_tensor 11 | 12 | 13 | class LSIQ_HC(LSIQ_H): 14 | 15 | def __init__(self, H_tau, H_loss_mode="Huber", **kwargs): 16 | 17 | # call parent 18 | super().__init__(**kwargs) 19 | 20 | self._H_tau = to_parameter(H_tau) 21 | self._H_loss_mode = H_loss_mode # either MSE or Huber 22 | 23 | def update_H_function(self, obs, action, next_obs, absorbing, gamma, is_expert): 24 | 25 | # calculate the squared reward of the current Q 26 | H = self._H_approximator(obs, action, output_tensor=True) 27 | with torch.no_grad(): 28 | next_action, log_pi = self.policy.compute_action_and_log_prob_t(next_obs) 29 | Q_plcy = self._target_critic_approximator(obs, action, output_tensor=True) 30 | V_plcy = 
self.get_targetV(obs) 31 | y = (1 - absorbing) * gamma.detach() * torch.clip(V_plcy, self._Q_min, 32 | self._Q_max) 33 | 34 | reward_non_abs = torch.square(torch.clip(Q_plcy - y, -1/self._reg_mult, 1/self._reg_mult)).detach() 35 | reward_abs = torch.square(torch.clip(Q_plcy - y, self._Q_min, self._Q_max)).detach() 36 | 37 | squared_reg_reward_plcy = (1 - absorbing) * self._reg_mult * reward_non_abs \ 38 | + absorbing * (1.0 - gamma.detach()) * self._reg_mult * reward_abs 39 | 40 | # restrict the target H of the expert to the maximum one of the policy 41 | neg_log_pi = -log_pi 42 | if self._clip_expert_entropy_to_policy_max: 43 | if self._max_H_policy is None: 44 | self._max_H_policy = torch.max(neg_log_pi[~is_expert]) 45 | else: 46 | curr_max_H_policy = torch.max(neg_log_pi[~is_expert]) 47 | if curr_max_H_policy > self._max_H_policy: 48 | self._max_H_policy = (1 - self._max_H_policy_tau_up) * self._max_H_policy + \ 49 | self._max_H_policy_tau_up * curr_max_H_policy 50 | else: 51 | self._max_H_policy = (1 - self._max_H_policy_tau_down) * self._max_H_policy + \ 52 | self._max_H_policy_tau_down * curr_max_H_policy 53 | neg_log_pi[is_expert] = torch.clip(neg_log_pi[is_expert], self._max_H_policy, 100000) 54 | 55 | # calculate the target for the HC-function 56 | next_H = (self._target_H_approximator(next_obs, next_action, output_tensor=True).detach() + 57 | self._alpha.detach() * torch.unsqueeze(neg_log_pi, 1)) 58 | target_H = squared_reg_reward_plcy + (1 - absorbing) * gamma * next_H 59 | 60 | # clip the target for numerical stability 61 | Q2_max = (1.0/self._reg_mult)**2 / (1 - gamma.detach()) 62 | target_H = torch.clip(target_H, -1000, Q2_max+100) 63 | 64 | if self._H_loss_mode == "Huber": 65 | loss_H = F.huber_loss(H, target_H) 66 | elif self._H_loss_mode == "MSE": 67 | loss_H = F.mse_loss(H, target_H) 68 | else: 69 | raise ValueError("Unsupported H_loss %s" % self._H_loss_mode) 70 | 71 | self._H_optimizer.zero_grad() 72 | loss_H.backward() 73 | self._H_optimizer.step() 74 | 75 | H = H.detach().cpu().numpy() 76 | log_pi = log_pi.detach().cpu().numpy() 77 | 78 | # do some additional logging 79 | if self._iter % self._logging_iter == 0: 80 | self.sw_add_scalar('H function/Loss', loss_H, self._iter) 81 | self.sw_add_scalar('H function/H', np.mean(H), self._iter) 82 | self.sw_add_scalar('H function/H plcy', np.mean(H[~is_expert]), self._iter) 83 | self.sw_add_scalar('H function/H expert', np.mean(H[is_expert]), self._iter) 84 | self.sw_add_scalar('H function/H_step', np.mean(-log_pi), self._iter) 85 | self.sw_add_scalar('H function/H_step plcy', np.mean(-log_pi[~is_expert]), self._iter) 86 | self.sw_add_scalar('H function/H_step expert', np.mean(-log_pi[is_expert]), self._iter) 87 | 88 | return loss_H, H, log_pi 89 | 90 | def _update_all_targets(self): 91 | self._update_target(self._critic_approximator, 92 | self._target_critic_approximator) 93 | self._update_target_H(self._H_approximator, 94 | self._target_H_approximator) 95 | 96 | def _update_target_H(self, online, target): 97 | for i in range(len(target)): 98 | weights = self._H_tau() * online[i].get_weights() 99 | weights += (1 - self._H_tau.get_value()) * target[i].get_weights() 100 | target[i].set_weights(weights) 101 | -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiqfo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from copy import deepcopy 5 | 6 | from 
mushroom_rl.core import Serializable 7 | from mushroom_rl.approximators import Regressor 8 | from mushroom_rl.approximators.parametric import TorchApproximator 9 | from imitation_lib.imitation.lsiq import LSIQ 10 | from mushroom_rl.utils.minibatches import minibatch_generator 11 | from mushroom_rl.utils.torch import to_float_tensor 12 | from mushroom_rl.utils.parameters import to_parameter 13 | from imitation_lib.utils.action_models import GaussianInvActionModel, LearnableVarGaussianInvActionModel,\ 14 | GCPActionModel, KLGCPActionModel, KLGaussianInvActionModel 15 | 16 | from imitation_lib.utils.distributions import InverseGamma 17 | 18 | 19 | class LSIQfO(LSIQ): 20 | 21 | def __init__(self, action_model, action_model_params, action_model_fit_params=None, action_model_noise_std=0.0, 22 | action_model_noise_clip=None, add_noise_to_obs=False, ext_normalizer_action_model=None, 23 | interpolate_expert_states=False, interpolation_coef=1.0, **kwargs): 24 | 25 | super().__init__(**kwargs) 26 | 27 | if action_model == GaussianInvActionModel or action_model == GCPActionModel \ 28 | or action_model == KLGCPActionModel or action_model == KLGaussianInvActionModel: 29 | action_model_params.setdefault("min_a", self.mdp_info.action_space.low) 30 | action_model_params.setdefault("max_a", self.mdp_info.action_space.high) 31 | action_model_params.setdefault("use_cuda", self._use_cuda) 32 | elif action_model == LearnableVarGaussianInvActionModel: 33 | action_model_params.setdefault("use_cuda", self._use_cuda) 34 | 35 | # setup the action model 36 | self._action_model = action_model(**action_model_params, demonstration=self._demonstrations) 37 | 38 | self._action_model_fit_params = dict(fits_per_step=1, init_epochs=0, )\ 39 | if action_model_fit_params is None else action_model_fit_params 40 | self._action_model_initialized = True if self._action_model_fit_params["init_epochs"] > 0 else False 41 | self._action_model_batch_size = action_model_params["batch_size"] 42 | 43 | self._action_model_noise_std = action_model_noise_std 44 | self._action_model_noise_clip = action_model_noise_clip 45 | self.ext_normalizer_action_model = ext_normalizer_action_model 46 | self._add_noise_to_obs = add_noise_to_obs 47 | self._interpolate_expert_states = interpolate_expert_states 48 | self._interpolation_coef = interpolation_coef 49 | 50 | self._add_save_attr( 51 | _action_model='mushroom', 52 | _action_model_fit_params='pickle', 53 | _action_model_noise_std='primitive', 54 | _action_model_noise_clip='primitive', 55 | ext_normalizer_action_model='pickle', 56 | _add_noise_to_obs='primitive' 57 | ) 58 | 59 | def fit(self, dataset): 60 | 61 | # add to replay memory 62 | self._replay_memory.add(dataset) 63 | 64 | if self._replay_memory.initialized: 65 | 66 | # train the action model 67 | if not self._action_model_initialized: 68 | self.train_action_model(init=True) 69 | self._action_model_initialized = True 70 | else: 71 | self.train_action_model() 72 | 73 | # sample batch from policy replay buffer 74 | state, action, reward, next_state, absorbing, _ = \ 75 | self._replay_memory.get(self._batch_size()) 76 | 77 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 78 | demo_obs, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 79 | self._demonstrations["states"], 80 | self._demonstrations["next_states"], 81 | self._demonstrations["absorbing"])) 82 | 83 | # predict the actions for our expert dataset 84 | demo_obs_act = demo_obs.astype(np.float32)[:, self._state_mask] 
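# Annotation (descriptive, not in the original source): the state mask selects the observation
# dimensions the inverse-dynamics action model operates on; the model then fills in the expert
# actions that are missing in the observation-only demonstrations, and the predictions are
# clipped to the action range a few lines below.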
85 | demo_nobs_act = demo_nobs.astype(np.float32)[:, self._state_mask] 86 | demo_act = self._action_model.draw_action(to_float_tensor(demo_obs_act), 87 | to_float_tensor(demo_nobs_act)) 88 | 89 | # clip predicted action to action range 90 | demo_act = np.clip(demo_act, self.mdp_info.action_space.low, self.mdp_info.action_space.high) 91 | 92 | if self._add_noise_to_obs: 93 | assert self.ext_normalizer_action_model is not None, "Normalizer is needed to be defined." 94 | 95 | demo_obs = self.ext_normalizer_action_model(demo_obs) 96 | demo_nobs = self.ext_normalizer_action_model(demo_nobs) 97 | demo_obs += self._get_noise(demo_obs) 98 | demo_nobs += self._get_noise(demo_nobs) 99 | demo_obs = self.ext_normalizer_action_model.inv(demo_obs) 100 | demo_nobs = self.ext_normalizer_action_model.inv(demo_nobs) 101 | 102 | # make interpolation if needed 103 | if self._interpolate_expert_states: 104 | demo_obs = self.interpolate(demo_obs[:, self._state_mask], state[:, self._state_mask], 105 | mixing_coef=self._interpolation_coef) 106 | demo_act = self.interpolate(demo_act, action, 107 | mixing_coef=self._interpolation_coef) 108 | 109 | # prepare data for IQ update 110 | input_states = to_float_tensor(np.concatenate([state, 111 | demo_obs.astype(np.float32)[:, self._state_mask]])) 112 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 113 | input_n_states = to_float_tensor(np.concatenate([next_state, 114 | demo_nobs.astype(np.float32)[:, self._state_mask]])) 115 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 116 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 117 | torch.ones(len(state), dtype=torch.bool)]) 118 | 119 | # make IQ update 120 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 121 | 122 | self._iter += 1 123 | self.policy.iter += 1 124 | 125 | def _get_noise(self, x): 126 | noise = np.random.normal(loc=0.0, scale=self._action_model_noise_std, 127 | size=np.size(x)).reshape(x.shape) 128 | noise = np.clip(noise, -self._action_model_noise_clip, self._action_model_noise_clip) \ 129 | if self._action_model_noise_clip is not None else noise 130 | return noise 131 | 132 | def interpolate(self, expert_data, policy_data, mixing_coef=None): 133 | interpolated = mixing_coef * expert_data + (1 - mixing_coef) * policy_data 134 | return interpolated 135 | 136 | def train_action_model(self, init=False): 137 | 138 | if init and self._action_model_fit_params["init_epochs"] > 0: 139 | n_epochs = self._action_model_fit_params["init_epochs"] 140 | # initialize the model 141 | state, action, _, next_state, _, _ = self._replay_memory.get(self._replay_memory.size) 142 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 143 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 144 | state_train = state[0:int(len(state)*0.9), :] 145 | state_val = state[int(len(state)*0.9):, :] 146 | next_state_train = next_state[0:int(len(next_state)*0.9), :] 147 | next_state_val = next_state[int(len(next_state)*0.9):, :] 148 | action_train = action[0:int(len(next_state)*0.9), :] 149 | action_val = action[int(len(next_state)*0.9):, :] 150 | state_nstate_train = np.concatenate([state_train, next_state_train], axis=1) 151 | state_nstate_val = np.concatenate([state_val, next_state_val], axis=1) 152 | 153 | # make eval before training 154 | action_pred = 
self._action_model(state_nstate_val) 155 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 156 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 157 | print("Action Model Validation Loss before training: ", loss) 158 | action_pred = self._action_model(state_nstate_train) 159 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 160 | print("Action Model Training Loss before training: ", loss) 161 | w = self._action_model.get_weights() 162 | norm = np.linalg.norm(w) 163 | self.sw_add_scalar("Action-Model/Norm", norm, self._iter) 164 | 165 | # make training 166 | self._action_model.fit(state_nstate_train, action_train, n_epochs=n_epochs) 167 | 168 | # make eval after training 169 | action_pred = self._action_model(state_nstate_val) 170 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 171 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 172 | print("Action Model Validation Loss After training: ", loss) 173 | action_pred = self._action_model(state_nstate_train) 174 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 175 | print("Action Model Validation Loss After training: ", loss) 176 | 177 | else: 178 | state_nstates = [] 179 | actions = [] 180 | for i in range(self._action_model_fit_params["fits_per_step"]): 181 | # sample batch from policy replay buffer 182 | state, action, reward, next_state, absorbing, _ = \ 183 | self._replay_memory.get(self._action_model_batch_size) 184 | 185 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 186 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 187 | self._action_model.fit(state, next_state, action) 188 | 189 | state_nstates.append([state, next_state]) 190 | actions.append(action) 191 | 192 | if self._iter % self._logging_iter == 0: 193 | 194 | # sample batch from policy replay buffer 195 | states, actions, rewards, next_states, absorbings, _ = \ 196 | self._replay_memory.get(self._action_model_batch_size) 197 | 198 | # we need to check if we have a dataset with expert actions available or not 199 | try: 200 | exp_states, exp_next_states, exp_actions = next( 201 | minibatch_generator(self._action_model_batch_size, 202 | self._demonstrations["states"], 203 | self._demonstrations["next_states"], 204 | self._demonstrations["actions"])) 205 | except KeyError: 206 | exp_states, exp_next_states = next(minibatch_generator(self._action_model_batch_size, 207 | self._demonstrations["states"], 208 | self._demonstrations["next_states"])) 209 | exp_actions = None 210 | 211 | # log mse 212 | action_pred = self._action_model(states[:, self._state_mask], next_states[:, self._state_mask]) 213 | mse = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(actions)) 214 | self.sw_add_scalar('Action-Model/Loss Policy', mse, self._iter) 215 | if exp_actions is not None: 216 | action_pred_exp = self._action_model(exp_states[:, self._state_mask], 217 | exp_next_states[:, self._state_mask]) 218 | mse_exp = F.mse_loss(to_float_tensor(action_pred_exp), to_float_tensor(exp_actions)) 219 | self.sw_add_scalar('Action-Model/Loss Exp', mse_exp, self._iter) 220 | 221 | # log entropy 222 | ent_plcy = self._action_model.entropy(states[:, self._state_mask], 223 | next_states[:, self._state_mask]) 224 | ent_exp = self._action_model.entropy(exp_states[:, self._state_mask], 225 | exp_next_states[:, self._state_mask]) 226 | 
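# Annotation (descriptive, not in the original source): the entropy of the predicted action
# distribution on policy and expert transitions is logged below; it can be read as a rough
# proxy for the action model's uncertainty on each data source.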
self.sw_add_scalar('Action-Model/Entropy Plcy', ent_plcy, 227 | self._iter) 228 | self.sw_add_scalar('Action-Model/Entropy Exp', ent_exp, 229 | self._iter) 230 | 231 | # log mu, lam, alpha, beta 232 | if type(self._action_model) == GCPActionModel or type(self._action_model) == KLGCPActionModel: 233 | mu, lam, alpha, beta = self._action_model.get_prior_params(states[:, self._state_mask], 234 | next_states[:, self._state_mask]) 235 | self.sw_add_scalar('Action-Model/Mu', np.mean(mu.detach().cpu().numpy()), self._iter) 236 | self.sw_add_scalar('Action-Model/Lambda', np.mean(lam.detach().cpu().numpy()), self._iter) 237 | self.sw_add_scalar('Action-Model/Lambda Counter', self._action_model.lam_counter, self._iter) 238 | self.sw_add_scalar('Action-Model/Alpha', np.mean(alpha.detach().cpu().numpy()), self._iter) 239 | self.sw_add_scalar('Action-Model/Beta', np.mean(beta.detach().cpu().numpy()), self._iter) 240 | self.sw_add_scalar('Action-Model/Var', 241 | np.mean(self._action_model.get_corrected_pred_var(lam, 242 | alpha, 243 | beta).detach().cpu().numpy()), 244 | self._iter) 245 | mu_exp, lam_exp, alpha_exp, beta_exp = \ 246 | self._action_model.get_prior_params(exp_states[:, self._state_mask], 247 | exp_next_states[:, self._state_mask]) 248 | self.sw_add_scalar('Action-Model/Mu Exp', np.mean(mu_exp.detach().cpu().numpy()), self._iter) 249 | self.sw_add_scalar('Action-Model/Lambda Exp', np.mean(lam_exp.detach().cpu().numpy()), self._iter) 250 | self.sw_add_scalar('Action-Model/Alpha Exp', np.mean(alpha_exp.detach().cpu().numpy()), self._iter) 251 | self.sw_add_scalar('Action-Model/Beta Exp', np.mean(beta_exp.detach().cpu().numpy()), self._iter) 252 | self.sw_add_scalar('Action-Model/Var Exp', 253 | np.mean(self._action_model.get_corrected_pred_var(lam_exp, 254 | alpha_exp, 255 | beta_exp).detach().cpu().numpy()), 256 | self._iter) 257 | elif type(self._action_model) == GaussianInvActionModel or \ 258 | type(self._action_model) == KLGaussianInvActionModel: 259 | mu, log_sigma = self._action_model.get_mu_log_sigma(state[:, self._state_mask], 260 | next_state[:, self._state_mask]) 261 | mu_exp, log_sigma_exp = self._action_model.get_mu_log_sigma(exp_states.astype(np.float32)[:, self._state_mask], 262 | exp_next_states.astype(np.float32)[:, self._state_mask]) 263 | 264 | self._sw.add_scalar('Action-Model/Std Exp', torch.mean(torch.exp(log_sigma_exp)), self._iter) 265 | self._sw.add_scalar('Action-Model/Std', torch.mean(torch.exp(log_sigma)), self._iter) 266 | self._sw.add_scalar('Action-Model/Mu Exp', torch.mean(mu_exp), self._iter) 267 | self._sw.add_scalar('Action-Model/Mu', torch.mean(mu), self._iter) 268 | -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiqfo_h.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | from imitation_lib.imitation.lsiq_h import LSIQ_H 6 | from mushroom_rl.utils.minibatches import minibatch_generator 7 | from mushroom_rl.utils.torch import to_float_tensor 8 | from imitation_lib.utils.action_models import GaussianInvActionModel, LearnableVarGaussianInvActionModel,\ 9 | GCPActionModel, KLGCPActionModel, KLGaussianInvActionModel 10 | 11 | 12 | class LSIQfO_H(LSIQ_H): 13 | 14 | def __init__(self, action_model, action_model_params, action_model_fit_params=None, action_model_noise_std=0.0, 15 | action_model_noise_clip=None, add_noise_to_obs=False, ext_normalizer_action_model=None, 16 | 
interpolate_expert_states=False, interpolation_coef=1.0, **kwargs): 17 | 18 | super().__init__(**kwargs) 19 | 20 | if action_model == GaussianInvActionModel or action_model == GCPActionModel \ 21 | or action_model == KLGCPActionModel or action_model == KLGaussianInvActionModel: 22 | action_model_params.setdefault("min_a", self.mdp_info.action_space.low) 23 | action_model_params.setdefault("max_a", self.mdp_info.action_space.high) 24 | action_model_params.setdefault("use_cuda", self._use_cuda) 25 | elif action_model == LearnableVarGaussianInvActionModel: 26 | action_model_params.setdefault("use_cuda", self._use_cuda) 27 | 28 | # setup the action model 29 | self._action_model = action_model(**action_model_params, demonstration=self._demonstrations) 30 | 31 | self._action_model_fit_params = dict(fits_per_step=1, init_epochs=0, )\ 32 | if action_model_fit_params is None else action_model_fit_params 33 | self._action_model_initialized = True if self._action_model_fit_params["init_epochs"] > 0 else False 34 | self._action_model_batch_size = action_model_params["batch_size"] 35 | 36 | self._action_model_noise_std = action_model_noise_std 37 | self._action_model_noise_clip = action_model_noise_clip 38 | self.ext_normalizer_action_model = ext_normalizer_action_model 39 | self._add_noise_to_obs = add_noise_to_obs 40 | self._interpolate_expert_states = interpolate_expert_states 41 | self._interpolation_coef = interpolation_coef 42 | 43 | self._add_save_attr( 44 | _action_model='mushroom', 45 | _action_model_fit_params='pickle', 46 | _action_model_noise_std='primitive', 47 | _action_model_noise_clip='primitive', 48 | ext_normalizer_action_model='pickle', 49 | _add_noise_to_obs='primitive' 50 | ) 51 | 52 | def fit(self, dataset): 53 | 54 | # add to replay memory 55 | self._replay_memory.add(dataset) 56 | 57 | if self._replay_memory.initialized: 58 | 59 | # train the action model 60 | if not self._action_model_initialized: 61 | self.train_action_model(init=True) 62 | self._action_model_initialized = True 63 | else: 64 | self.train_action_model() 65 | 66 | # sample batch from policy replay buffer 67 | state, action, reward, next_state, absorbing, _ = \ 68 | self._replay_memory.get(self._batch_size()) 69 | 70 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 71 | demo_obs, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 72 | self._demonstrations["states"], 73 | self._demonstrations["next_states"], 74 | self._demonstrations["absorbing"])) 75 | 76 | # predict the actions for our expert dataset 77 | demo_obs_act = demo_obs.astype(np.float32)[:, self._state_mask] 78 | demo_nobs_act = demo_nobs.astype(np.float32)[:, self._state_mask] 79 | demo_act = self._action_model.draw_action(to_float_tensor(demo_obs_act), 80 | to_float_tensor(demo_nobs_act)) 81 | 82 | if self._add_noise_to_obs: 83 | assert self.ext_normalizer_action_model is not None, "Normalizer is needed to be defined." 
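# Annotation (descriptive, not in the original source): this block mirrors LSIQfO.fit; note that,
# unlike LSIQfO, the predicted expert actions are not clipped to the action range in this variant.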
84 | 85 | demo_obs = self.ext_normalizer_action_model(demo_obs) 86 | demo_nobs = self.ext_normalizer_action_model(demo_nobs) 87 | demo_obs += self._get_noise(demo_obs) 88 | demo_nobs += self._get_noise(demo_nobs) 89 | demo_obs = self.ext_normalizer_action_model.inv(demo_obs) 90 | demo_nobs = self.ext_normalizer_action_model.inv(demo_nobs) 91 | 92 | # make interpolation if needed 93 | if self._interpolate_expert_states: 94 | demo_obs = self.interpolate(demo_obs[:, self._state_mask], state[:, self._state_mask], 95 | mixing_coef=self._interpolation_coef) 96 | demo_act = self.interpolate(demo_act, action, 97 | mixing_coef=self._interpolation_coef) 98 | 99 | # prepare data for IQ update 100 | input_states = to_float_tensor(np.concatenate([state, 101 | demo_obs.astype(np.float32)[:, self._state_mask]])) 102 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 103 | input_n_states = to_float_tensor(np.concatenate([next_state, 104 | demo_nobs.astype(np.float32)[:, self._state_mask]])) 105 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 106 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 107 | torch.ones(len(state), dtype=torch.bool)]) 108 | 109 | # make IQ update 110 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 111 | 112 | self._iter += 1 113 | self.policy.iter += 1 114 | 115 | def _get_noise(self, x): 116 | noise = np.random.normal(loc=0.0, scale=self._action_model_noise_std, 117 | size=np.size(x)).reshape(x.shape) 118 | noise = np.clip(noise, -self._action_model_noise_clip, self._action_model_noise_clip) \ 119 | if self._action_model_noise_clip is not None else noise 120 | return noise 121 | 122 | def interpolate(self, expert_data, policy_data, mixing_coef=None): 123 | interpolated = mixing_coef * expert_data + (1 - mixing_coef) * policy_data 124 | return interpolated 125 | 126 | def train_action_model(self, init=False): 127 | 128 | if init and self._action_model_fit_params["init_epochs"] > 0: 129 | n_epochs = self._action_model_fit_params["init_epochs"] 130 | # initialize the model 131 | state, action, _, next_state, _, _ = self._replay_memory.get(self._replay_memory.size) 132 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 133 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 134 | state_train = state[0:int(len(state)*0.9), :] 135 | state_val = state[int(len(state)*0.9):, :] 136 | next_state_train = next_state[0:int(len(next_state)*0.9), :] 137 | next_state_val = next_state[int(len(next_state)*0.9):, :] 138 | action_train = action[0:int(len(next_state)*0.9), :] 139 | action_val = action[int(len(next_state)*0.9):, :] 140 | state_nstate_train = np.concatenate([state_train, next_state_train], axis=1) 141 | state_nstate_val = np.concatenate([state_val, next_state_val], axis=1) 142 | 143 | # make eval before training 144 | action_pred = self._action_model(state_nstate_val) 145 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 146 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 147 | print("Action Model Validation Loss before training: ", loss) 148 | action_pred = self._action_model(state_nstate_train) 149 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 150 | print("Action Model Training Loss before training: ", loss) 151 | w = self._action_model.get_weights() 152 
| norm = np.linalg.norm(w) 153 | self.sw_add_scalar("Action-Model/Norm", norm, self._iter) 154 | 155 | # make training 156 | self._action_model.fit(state_nstate_train, action_train, n_epochs=n_epochs) 157 | 158 | # make eval after training 159 | action_pred = self._action_model(state_nstate_val) 160 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 161 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 162 | print("Action Model Validation Loss After training: ", loss) 163 | action_pred = self._action_model(state_nstate_train) 164 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 165 | print("Action Model Validation Loss After training: ", loss) 166 | 167 | else: 168 | state_nstates = [] 169 | actions = [] 170 | for i in range(self._action_model_fit_params["fits_per_step"]): 171 | # sample batch from policy replay buffer 172 | state, action, reward, next_state, absorbing, _ = \ 173 | self._replay_memory.get(self._action_model_batch_size) 174 | 175 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 176 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 177 | self._action_model.fit(state, next_state, action) 178 | 179 | state_nstates.append([state, next_state]) 180 | actions.append(action) 181 | 182 | if self._iter % self._logging_iter == 0: 183 | 184 | # sample batch from policy replay buffer 185 | states, actions, rewards, next_states, absorbings, _ = \ 186 | self._replay_memory.get(self._action_model_batch_size) 187 | 188 | # we need to check if we have a dataset with expert actions available or not 189 | try: 190 | exp_states, exp_next_states, exp_actions = next( 191 | minibatch_generator(self._action_model_batch_size, 192 | self._demonstrations["states"], 193 | self._demonstrations["next_states"], 194 | self._demonstrations["actions"])) 195 | except KeyError: 196 | exp_states, exp_next_states = next(minibatch_generator(self._action_model_batch_size, 197 | self._demonstrations["states"], 198 | self._demonstrations["next_states"])) 199 | exp_actions = None 200 | 201 | # log mse 202 | action_pred = self._action_model(states[:, self._state_mask], next_states[:, self._state_mask]) 203 | mse = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(actions)) 204 | self.sw_add_scalar('Action-Model/Loss Policy', mse, self._iter) 205 | if exp_actions is not None: 206 | action_pred_exp = self._action_model(exp_states[:, self._state_mask], 207 | exp_next_states[:, self._state_mask]) 208 | mse_exp = F.mse_loss(to_float_tensor(action_pred_exp), to_float_tensor(exp_actions)) 209 | self.sw_add_scalar('Action-Model/Loss Exp', mse_exp, self._iter) 210 | 211 | # log entropy 212 | ent_plcy = self._action_model.entropy(states[:, self._state_mask], 213 | next_states[:, self._state_mask]) 214 | ent_exp = self._action_model.entropy(exp_states[:, self._state_mask], 215 | exp_next_states[:, self._state_mask]) 216 | self.sw_add_scalar('Action-Model/Entropy Plcy', ent_plcy, 217 | self._iter) 218 | self.sw_add_scalar('Action-Model/Entropy Exp', ent_exp, 219 | self._iter) 220 | 221 | # log mu, lam, alpha, beta 222 | if type(self._action_model) == GCPActionModel or type(self._action_model) == KLGCPActionModel: 223 | mu, lam, alpha, beta = self._action_model.get_prior_params(states[:, self._state_mask], 224 | next_states[:, self._state_mask]) 225 | self.sw_add_scalar('Action-Model/Mu', np.mean(mu.detach().cpu().numpy()), self._iter) 226 | 
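# Annotation (descriptive, not in the original source): the batch means of the remaining prior
# parameters (lambda, alpha, beta) and the corrected predictive variance derived from them are
# logged next, for both policy and expert transitions.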
self.sw_add_scalar('Action-Model/Lambda', np.mean(lam.detach().cpu().numpy()), self._iter) 227 | self.sw_add_scalar('Action-Model/Lambda Counter', self._action_model.lam_counter, self._iter) 228 | self.sw_add_scalar('Action-Model/Alpha', np.mean(alpha.detach().cpu().numpy()), self._iter) 229 | self.sw_add_scalar('Action-Model/Beta', np.mean(beta.detach().cpu().numpy()), self._iter) 230 | self.sw_add_scalar('Action-Model/Var', 231 | np.mean(self._action_model.get_corrected_pred_var(lam, 232 | alpha, 233 | beta).detach().cpu().numpy()), 234 | self._iter) 235 | mu_exp, lam_exp, alpha_exp, beta_exp = \ 236 | self._action_model.get_prior_params(exp_states[:, self._state_mask], 237 | exp_next_states[:, self._state_mask]) 238 | self.sw_add_scalar('Action-Model/Mu Exp', np.mean(mu_exp.detach().cpu().numpy()), self._iter) 239 | self.sw_add_scalar('Action-Model/Lambda Exp', np.mean(lam_exp.detach().cpu().numpy()), self._iter) 240 | self.sw_add_scalar('Action-Model/Alpha Exp', np.mean(alpha_exp.detach().cpu().numpy()), self._iter) 241 | self.sw_add_scalar('Action-Model/Beta Exp', np.mean(beta_exp.detach().cpu().numpy()), self._iter) 242 | self.sw_add_scalar('Action-Model/Var Exp', 243 | np.mean(self._action_model.get_corrected_pred_var(lam_exp, 244 | alpha_exp, 245 | beta_exp).detach().cpu().numpy()), 246 | self._iter) 247 | elif type(self._action_model) == GaussianInvActionModel or \ 248 | type(self._action_model) == KLGaussianInvActionModel: 249 | mu, log_sigma = self._action_model.get_mu_log_sigma(state[:, self._state_mask], 250 | next_state[:, self._state_mask]) 251 | mu_exp, log_sigma_exp = self._action_model.get_mu_log_sigma(exp_states.astype(np.float32)[:, self._state_mask], 252 | exp_next_states.astype(np.float32)[:, self._state_mask]) 253 | 254 | self._sw.add_scalar('Action-Model/Std Exp', torch.mean(torch.exp(log_sigma_exp)), self._iter) 255 | self._sw.add_scalar('Action-Model/Std', torch.mean(torch.exp(log_sigma)), self._iter) 256 | self._sw.add_scalar('Action-Model/Mu Exp', torch.mean(mu_exp), self._iter) 257 | self._sw.add_scalar('Action-Model/Mu', torch.mean(mu), self._iter) 258 | -------------------------------------------------------------------------------- /imitation_lib/imitation/lsiqfo_hc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | 5 | from imitation_lib.imitation.lsiq_hc import LSIQ_HC 6 | from mushroom_rl.utils.minibatches import minibatch_generator 7 | from mushroom_rl.utils.torch import to_float_tensor 8 | from imitation_lib.utils.action_models import GaussianInvActionModel, LearnableVarGaussianInvActionModel,\ 9 | GCPActionModel, KLGCPActionModel, KLGaussianInvActionModel 10 | 11 | 12 | class LSIQfO_HC(LSIQ_HC): 13 | 14 | def __init__(self, action_model, action_model_params, action_model_fit_params=None, action_model_noise_std=0.0, 15 | action_model_noise_clip=None, add_noise_to_obs=False, ext_normalizer_action_model=None, 16 | interpolate_expert_states=False, interpolation_coef=1.0, **kwargs): 17 | 18 | super().__init__(**kwargs) 19 | 20 | if action_model == GaussianInvActionModel or action_model == GCPActionModel \ 21 | or action_model == KLGCPActionModel or action_model == KLGaussianInvActionModel: 22 | action_model_params.setdefault("min_a", self.mdp_info.action_space.low) 23 | action_model_params.setdefault("max_a", self.mdp_info.action_space.high) 24 | action_model_params.setdefault("use_cuda", self._use_cuda) 25 | elif action_model == 
LearnableVarGaussianInvActionModel: 26 | action_model_params.setdefault("use_cuda", self._use_cuda) 27 | 28 | # setup the action model 29 | self._action_model = action_model(**action_model_params, demonstration=self._demonstrations) 30 | 31 | self._action_model_fit_params = dict(fits_per_step=1, init_epochs=0, )\ 32 | if action_model_fit_params is None else action_model_fit_params 33 | self._action_model_initialized = True if self._action_model_fit_params["init_epochs"] > 0 else False 34 | self._action_model_batch_size = action_model_params["batch_size"] 35 | 36 | self._action_model_noise_std = action_model_noise_std 37 | self._action_model_noise_clip = action_model_noise_clip 38 | self.ext_normalizer_action_model = ext_normalizer_action_model 39 | self._add_noise_to_obs = add_noise_to_obs 40 | self._interpolate_expert_states = interpolate_expert_states 41 | self._interpolation_coef = interpolation_coef 42 | 43 | self._add_save_attr( 44 | _action_model='mushroom', 45 | _action_model_fit_params='pickle', 46 | _action_model_noise_std='primitive', 47 | _action_model_noise_clip='primitive', 48 | ext_normalizer_action_model='pickle', 49 | _add_noise_to_obs='primitive' 50 | ) 51 | 52 | def fit(self, dataset): 53 | 54 | # add to replay memory 55 | self._replay_memory.add(dataset) 56 | 57 | if self._replay_memory.initialized: 58 | 59 | # train the action model 60 | if not self._action_model_initialized: 61 | self.train_action_model(init=True) 62 | self._action_model_initialized = True 63 | else: 64 | self.train_action_model() 65 | 66 | # sample batch from policy replay buffer 67 | state, action, reward, next_state, absorbing, _ = \ 68 | self._replay_memory.get(self._batch_size()) 69 | 70 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 71 | demo_obs, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 72 | self._demonstrations["states"], 73 | self._demonstrations["next_states"], 74 | self._demonstrations["absorbing"])) 75 | 76 | # predict the actions for our expert dataset 77 | demo_obs_act = demo_obs.astype(np.float32)[:, self._state_mask] 78 | demo_nobs_act = demo_nobs.astype(np.float32)[:, self._state_mask] 79 | demo_act = self._action_model.draw_action(to_float_tensor(demo_obs_act), 80 | to_float_tensor(demo_nobs_act)) 81 | 82 | if self._add_noise_to_obs: 83 | assert self.ext_normalizer_action_model is not None, "Normalizer is needed to be defined." 
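                # This observation-only ("fO") variant works with state-only demonstrations: the inverse
                # action model is queried on expert (s, s') pairs above to fill in the missing expert
                # actions, so the standard LS-IQ update further below can be applied to the combined batch.
                # When noise is enabled, the expert observations are first normalized, perturbed, and then
                # mapped back to the original scale, presumably so that a single noise std is comparable
                # across state dimensions with very different magnitudes.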
84 | 85 | demo_obs = self.ext_normalizer_action_model(demo_obs) 86 | demo_nobs = self.ext_normalizer_action_model(demo_nobs) 87 | demo_obs += self._get_noise(demo_obs) 88 | demo_nobs += self._get_noise(demo_nobs) 89 | demo_obs = self.ext_normalizer_action_model.inv(demo_obs) 90 | demo_nobs = self.ext_normalizer_action_model.inv(demo_nobs) 91 | 92 | # make interpolation if needed 93 | if self._interpolate_expert_states: 94 | demo_obs = self.interpolate(demo_obs[:, self._state_mask], state[:, self._state_mask], 95 | mixing_coef=self._interpolation_coef) 96 | demo_act = self.interpolate(demo_act, action, 97 | mixing_coef=self._interpolation_coef) 98 | 99 | # prepare data for IQ update 100 | input_states = to_float_tensor(np.concatenate([state, 101 | demo_obs.astype(np.float32)[:, self._state_mask]])) 102 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 103 | input_n_states = to_float_tensor(np.concatenate([next_state, 104 | demo_nobs.astype(np.float32)[:, self._state_mask]])) 105 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 106 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 107 | torch.ones(len(state), dtype=torch.bool)]) 108 | 109 | # make IQ update 110 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 111 | 112 | self._iter += 1 113 | self.policy.iter += 1 114 | 115 | def _get_noise(self, x): 116 | noise = np.random.normal(loc=0.0, scale=self._action_model_noise_std, 117 | size=np.size(x)).reshape(x.shape) 118 | noise = np.clip(noise, -self._action_model_noise_clip, self._action_model_noise_clip) \ 119 | if self._action_model_noise_clip is not None else noise 120 | return noise 121 | 122 | def interpolate(self, expert_data, policy_data, mixing_coef=None): 123 | interpolated = mixing_coef * expert_data + (1 - mixing_coef) * policy_data 124 | return interpolated 125 | 126 | def train_action_model(self, init=False): 127 | 128 | if init and self._action_model_fit_params["init_epochs"] > 0: 129 | n_epochs = self._action_model_fit_params["init_epochs"] 130 | # initialize the model 131 | state, action, _, next_state, _, _ = self._replay_memory.get(self._replay_memory.size) 132 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 133 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 134 | state_train = state[0:int(len(state)*0.9), :] 135 | state_val = state[int(len(state)*0.9):, :] 136 | next_state_train = next_state[0:int(len(next_state)*0.9), :] 137 | next_state_val = next_state[int(len(next_state)*0.9):, :] 138 | action_train = action[0:int(len(next_state)*0.9), :] 139 | action_val = action[int(len(next_state)*0.9):, :] 140 | state_nstate_train = np.concatenate([state_train, next_state_train], axis=1) 141 | state_nstate_val = np.concatenate([state_val, next_state_val], axis=1) 142 | 143 | # make eval before training 144 | action_pred = self._action_model(state_nstate_val) 145 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 146 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 147 | print("Action Model Validation Loss before training: ", loss) 148 | action_pred = self._action_model(state_nstate_train) 149 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 150 | print("Action Model Training Loss before training: ", loss) 151 | w = self._action_model.get_weights() 152 
| norm = np.linalg.norm(w) 153 | self.sw_add_scalar("Action-Model/Norm", norm, self._iter) 154 | 155 | # make training 156 | self._action_model.fit(state_nstate_train, action_train, n_epochs=n_epochs) 157 | 158 | # make eval after training 159 | action_pred = self._action_model(state_nstate_val) 160 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_val)) 161 | self._sw.add_scalar('Action-Model/Loss', loss, self._iter) 162 | print("Action Model Validation Loss After training: ", loss) 163 | action_pred = self._action_model(state_nstate_train) 164 | loss = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(action_train)) 165 | print("Action Model Validation Loss After training: ", loss) 166 | 167 | else: 168 | state_nstates = [] 169 | actions = [] 170 | for i in range(self._action_model_fit_params["fits_per_step"]): 171 | # sample batch from policy replay buffer 172 | state, action, reward, next_state, absorbing, _ = \ 173 | self._replay_memory.get(self._action_model_batch_size) 174 | 175 | state = self.ext_normalizer_action_model(state) if self.ext_normalizer_action_model else state 176 | next_state = self.ext_normalizer_action_model(next_state) if self.ext_normalizer_action_model else next_state 177 | self._action_model.fit(state, next_state, action) 178 | 179 | state_nstates.append([state, next_state]) 180 | actions.append(action) 181 | 182 | if self._iter % self._logging_iter == 0: 183 | 184 | # sample batch from policy replay buffer 185 | states, actions, rewards, next_states, absorbings, _ = \ 186 | self._replay_memory.get(self._action_model_batch_size) 187 | 188 | # we need to check if we have a dataset with expert actions available or not 189 | try: 190 | exp_states, exp_next_states, exp_actions = next( 191 | minibatch_generator(self._action_model_batch_size, 192 | self._demonstrations["states"], 193 | self._demonstrations["next_states"], 194 | self._demonstrations["actions"])) 195 | except KeyError: 196 | exp_states, exp_next_states = next(minibatch_generator(self._action_model_batch_size, 197 | self._demonstrations["states"], 198 | self._demonstrations["next_states"])) 199 | exp_actions = None 200 | 201 | # log mse 202 | action_pred = self._action_model(states[:, self._state_mask], next_states[:, self._state_mask]) 203 | mse = F.mse_loss(to_float_tensor(action_pred), to_float_tensor(actions)) 204 | self.sw_add_scalar('Action-Model/Loss Policy', mse, self._iter) 205 | if exp_actions is not None: 206 | action_pred_exp = self._action_model(exp_states[:, self._state_mask], 207 | exp_next_states[:, self._state_mask]) 208 | mse_exp = F.mse_loss(to_float_tensor(action_pred_exp), to_float_tensor(exp_actions)) 209 | self.sw_add_scalar('Action-Model/Loss Exp', mse_exp, self._iter) 210 | 211 | # log entropy 212 | ent_plcy = self._action_model.entropy(states[:, self._state_mask], 213 | next_states[:, self._state_mask]) 214 | ent_exp = self._action_model.entropy(exp_states[:, self._state_mask], 215 | exp_next_states[:, self._state_mask]) 216 | self.sw_add_scalar('Action-Model/Entropy Plcy', ent_plcy, 217 | self._iter) 218 | self.sw_add_scalar('Action-Model/Entropy Exp', ent_exp, 219 | self._iter) 220 | 221 | # log mu, lam, alpha, beta 222 | if type(self._action_model) == GCPActionModel or type(self._action_model) == KLGCPActionModel: 223 | mu, lam, alpha, beta = self._action_model.get_prior_params(states[:, self._state_mask], 224 | next_states[:, self._state_mask]) 225 | self.sw_add_scalar('Action-Model/Mu', np.mean(mu.detach().cpu().numpy()), self._iter) 226 | 
self.sw_add_scalar('Action-Model/Lambda', np.mean(lam.detach().cpu().numpy()), self._iter) 227 | self.sw_add_scalar('Action-Model/Lambda Counter', self._action_model.lam_counter, self._iter) 228 | self.sw_add_scalar('Action-Model/Alpha', np.mean(alpha.detach().cpu().numpy()), self._iter) 229 | self.sw_add_scalar('Action-Model/Beta', np.mean(beta.detach().cpu().numpy()), self._iter) 230 | self.sw_add_scalar('Action-Model/Var', 231 | np.mean(self._action_model.get_corrected_pred_var(lam, 232 | alpha, 233 | beta).detach().cpu().numpy()), 234 | self._iter) 235 | mu_exp, lam_exp, alpha_exp, beta_exp = \ 236 | self._action_model.get_prior_params(exp_states[:, self._state_mask], 237 | exp_next_states[:, self._state_mask]) 238 | self.sw_add_scalar('Action-Model/Mu Exp', np.mean(mu_exp.detach().cpu().numpy()), self._iter) 239 | self.sw_add_scalar('Action-Model/Lambda Exp', np.mean(lam_exp.detach().cpu().numpy()), self._iter) 240 | self.sw_add_scalar('Action-Model/Alpha Exp', np.mean(alpha_exp.detach().cpu().numpy()), self._iter) 241 | self.sw_add_scalar('Action-Model/Beta Exp', np.mean(beta_exp.detach().cpu().numpy()), self._iter) 242 | self.sw_add_scalar('Action-Model/Var Exp', 243 | np.mean(self._action_model.get_corrected_pred_var(lam_exp, 244 | alpha_exp, 245 | beta_exp).detach().cpu().numpy()), 246 | self._iter) 247 | elif type(self._action_model) == GaussianInvActionModel or \ 248 | type(self._action_model) == KLGaussianInvActionModel: 249 | mu, log_sigma = self._action_model.get_mu_log_sigma(state[:, self._state_mask], 250 | next_state[:, self._state_mask]) 251 | mu_exp, log_sigma_exp = self._action_model.get_mu_log_sigma(exp_states.astype(np.float32)[:, self._state_mask], 252 | exp_next_states.astype(np.float32)[:, self._state_mask]) 253 | 254 | self._sw.add_scalar('Action-Model/Std Exp', torch.mean(torch.exp(log_sigma_exp)), self._iter) 255 | self._sw.add_scalar('Action-Model/Std', torch.mean(torch.exp(log_sigma)), self._iter) 256 | self._sw.add_scalar('Action-Model/Mu Exp', torch.mean(mu_exp), self._iter) 257 | self._sw.add_scalar('Action-Model/Mu', torch.mean(mu), self._iter) 258 | -------------------------------------------------------------------------------- /imitation_lib/imitation/offline/__init__.py: -------------------------------------------------------------------------------- 1 | from .iq_offline import IQ_Offline 2 | from .lsiq_offline import LSIQ_Offline 3 | from .lsiq_offline_dm import LSIQ_Offline_DM 4 | from .behavioral_cloning import BehavioralCloning -------------------------------------------------------------------------------- /imitation_lib/imitation/offline/behavioral_cloning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.nn import GaussianNLLLoss 4 | import torch.nn.functional as F 5 | from mushroom_rl.core import Agent 6 | from mushroom_rl.approximators import Regressor 7 | from mushroom_rl.utils.torch import to_float_tensor 8 | from mushroom_rl.utils.minibatches import minibatch_generator 9 | from mushroom_rl.approximators.parametric import TorchApproximator 10 | from imitation_lib.imitation.iq_sac import IQ_Learn_Policy 11 | 12 | 13 | class BehavioralCloning(Agent): 14 | 15 | def __init__(self, mdp_info, actor_params, actor_optimizer, demonstrations, log_std_min=-20, 16 | log_std_max=2, use_cuda=False, logging_iter=1, batch_size=32, sw=None): 17 | 18 | actor_approximator = Regressor(TorchApproximator, 19 | **actor_params) 20 | policy = IQ_Learn_Policy(actor_approximator, 
21 | mdp_info.action_space.low, 22 | mdp_info.action_space.high, 23 | log_std_min, 24 | log_std_max) 25 | 26 | policy_parameters = actor_approximator.model.network.parameters() 27 | 28 | self._demonstrations = demonstrations 29 | self._optimizer = actor_optimizer['class'](policy_parameters, **actor_optimizer['params']) 30 | self._actor_loss = GaussianNLLLoss() 31 | self._use_cuda = use_cuda 32 | self._iter = 0 33 | self._batch_size = batch_size 34 | self._logging_iter = logging_iter 35 | 36 | if sw: 37 | self._sw = sw 38 | setattr(self._sw, '__deepcopy__', lambda self: None) # dont need to be copyable, causes pickle error otherwise 39 | 40 | super(BehavioralCloning, self).__init__(mdp_info, policy) 41 | 42 | def fit(self, dataset): 43 | raise AttributeError("This is a behavior cloning algorithms, which is meant to run offline. It is not supposed" 44 | "to use the fit function. Use the fit_offline function instead.") 45 | 46 | def fit_offline(self, n_steps): 47 | 48 | for i in range(n_steps): 49 | 50 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 51 | demo_obs, demo_act, demo_nobs, demo_absorbing = next(minibatch_generator(self._batch_size, 52 | self._demonstrations["states"], 53 | self._demonstrations["actions"], 54 | self._demonstrations["next_states"], 55 | self._demonstrations["absorbing"])) 56 | 57 | # prepare tensors 58 | states = to_float_tensor(demo_obs, self._use_cuda) \ 59 | if self._use_cuda else to_float_tensor(demo_obs) 60 | target_actions = to_float_tensor(demo_act, self._use_cuda) \ 61 | if self._use_cuda else to_float_tensor(demo_act) 62 | 63 | # do unsquashing of target actions 64 | central, delta = self.policy.get_central_delta() 65 | target_actions = torch.clip((target_actions - central) / delta, -1.0 + 1e-7, 1.0 - 1e-7) 66 | target_actions = torch.arctanh(target_actions) 67 | 68 | # predict mu and log_sigma 69 | mu, log_sigma = self.policy.get_mu_log_sigma(states) 70 | 71 | # calculate loss and do an optimizer step 72 | loss = self._actor_loss(input=mu, target=target_actions, var=torch.square(log_sigma.exp())) 73 | self._optimizer.zero_grad() 74 | loss.backward() 75 | self._optimizer.step() 76 | 77 | # make some logging 78 | self.logging(states, target_actions, loss, mu, log_sigma) 79 | 80 | self._iter += 1 81 | 82 | def logging(self, states, target_actions, loss, mu, log_sigma): 83 | # log some useful information 84 | if self._iter % self._logging_iter == 0: 85 | self.sw_add_scalar("GaussianNLLLoss", np.mean(loss.detach().cpu().numpy())) 86 | 87 | gauss_ent = self.policy.entropy_from_logsigma(log_sigma) 88 | self.sw_add_scalar("Squashed Gaussian Entropy", np.mean(gauss_ent.detach().cpu().numpy())) 89 | act, log_prob = self.policy.compute_action_and_log_prob(states) 90 | squashed_gauss_ent = -np.mean(log_prob) 91 | self.sw_add_scalar("Squashed Gaussian Entropy (Empirical)", squashed_gauss_ent) 92 | 93 | mse_loss = F.mse_loss(mu, target_actions) 94 | self.sw_add_scalar("MSELoss (between mean & target actions)", np.mean(mse_loss.detach().cpu().numpy())) 95 | 96 | def sw_add_scalar(self, name, val): 97 | if self._iter % self._logging_iter == 0: 98 | self._sw.add_scalar(name, val, self._iter) 99 | -------------------------------------------------------------------------------- /imitation_lib/imitation/offline/iq_offline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from mushroom_rl.utils.minibatches import minibatch_generator 4 | from 
mushroom_rl.utils.torch import to_float_tensor 5 | from imitation_lib.imitation.iq_sac import IQ_SAC 6 | 7 | 8 | class IQ_Offline(IQ_SAC): 9 | 10 | def __init__(self, **kwargs): 11 | 12 | if "regularizer_mode" in kwargs.keys(): 13 | if kwargs["regularizer_mode"] != "exp": 14 | raise ValueError("This is the offline implementation of IQ, which expects the regularizer to take only" 15 | "samples from the expert.") 16 | else: 17 | kwargs["regularizer_mode"] = "exp" 18 | if "plcy_loss_mode" in kwargs.keys(): 19 | if kwargs["plcy_loss_mode"] != "v0": 20 | raise ValueError("This is the offline implementation of IQ, which expects: plcy_loss_mode=\"v0\".") 21 | else: 22 | kwargs["plcy_loss_mode"] = "v0" 23 | 24 | super(IQ_Offline, self).__init__(**kwargs) 25 | 26 | def fit(self, dataset): 27 | raise AttributeError("This is the offline implementation of IQ, it is not supposed to use the fit function. " 28 | "Use the fit_offline function instead.") 29 | 30 | def fit_offline(self, n_steps): 31 | 32 | for i in range(n_steps): 33 | 34 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 35 | assert self._act_mask.size > 0, "IQ-Learn needs demo actions!" 36 | demo_obs, demo_act, demo_nobs, demo_absorbing = next(minibatch_generator(self._batch_size(), 37 | self._demonstrations["states"], 38 | self._demonstrations["actions"], 39 | self._demonstrations["next_states"], 40 | self._demonstrations["absorbing"])) 41 | 42 | # prepare data for IQ update 43 | input_states = to_float_tensor(demo_obs.astype(np.float32)[:, self._state_mask]) 44 | input_actions = to_float_tensor(demo_act.astype(np.float32)) 45 | input_n_states = to_float_tensor(demo_nobs.astype(np.float32)[:, self._state_mask]) 46 | input_absorbing = to_float_tensor(demo_absorbing.astype(np.float32)) 47 | is_expert = torch.ones(len(demo_obs), dtype=torch.bool) 48 | 49 | # make IQ update 50 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 51 | 52 | self._iter += 1 53 | self.policy.iter += 1 54 | 55 | def _lossQ(self, obs, act, next_obs, absorbing, is_expert): 56 | """ 57 | Main contribution of the IQ-learn paper. 
This function is based on the repository of the paper: 58 | https://github.com/Div99/IQ-Learn 59 | """ 60 | # Calculate 1st term of loss: -E_(ρ_expert)[Q(s, a) - γV(s')] 61 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 62 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 63 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 64 | if not self._use_target: 65 | next_v = self.getV(next_obs) 66 | else: 67 | with torch.no_grad(): 68 | next_v = self.get_targetV(next_obs).detach() 69 | 70 | y = (1 - torch.unsqueeze(absorbing, 1)) * gamma.detach() * self._Q_Q_multiplier * next_v 71 | 72 | reward = (self._Q_Q_multiplier * current_Q - y) 73 | exp_reward = reward[is_expert] 74 | loss_term1 = -exp_reward.mean() 75 | 76 | # do the logging 77 | self.logging_loss(current_Q, y, reward, is_expert, obs, act, absorbing) 78 | 79 | # 2nd term for our loss (use expert and policy states): E_(ρ)[Q(s,a) - γV(s')] 80 | V = self._Q_Q_multiplier * self.getV(obs) 81 | value = (V - y) 82 | self.sw_add_scalar('V for policy on all states', self._Q_Q_multiplier * V.mean(), self._iter) 83 | value_loss = value 84 | if self._plcy_loss_mode == "value": 85 | loss_term2 = value_loss.mean() 86 | elif self._plcy_loss_mode == "value_expert": 87 | value_loss_exp = value_loss[is_expert] 88 | loss_term2 = value_loss_exp.mean() 89 | elif self._plcy_loss_mode == "value_policy": 90 | value_loss_plcy = value_loss[~is_expert] 91 | loss_term2 = value_loss_plcy.mean() 92 | elif self._plcy_loss_mode == "q_old_policy": 93 | reward = (current_Q - y) 94 | reward_plcy = reward[~is_expert] 95 | loss_term2 = reward_plcy.mean() 96 | elif self._plcy_loss_mode == "v0": 97 | value_loss_v0 = (1 - gamma.detach()) * self.getV(obs[is_expert]) 98 | loss_term2 = value_loss_v0.mean() 99 | else: 100 | raise ValueError("Undefined policy loss mode: %s" % self._plcy_loss_mode) 101 | 102 | # regularize 103 | if not self._use_target: 104 | next_v = self.getV(next_obs) 105 | else: 106 | with torch.no_grad(): 107 | next_v = self.get_targetV(next_obs).detach() 108 | # WARNING: TURNED OFF absorbing in regularization TODO: check if this works, if not go back 109 | # y = (1 - torch.unsqueeze(absorbing, 1)) * gamma.detach() * self._Q_Q_multiplier * next_v 110 | abs_mult = 1.0 if self._reg_no_absorbing else (1 - torch.unsqueeze(absorbing, 1)) 111 | y = abs_mult * gamma.detach() * self._Q_Q_multiplier * next_v 112 | current_Q = self._Q_Q_multiplier * self._critic_approximator(obs, act, output_tensor=True) 113 | if self._turn_off_reg_absorbing: 114 | reward = (1 - torch.unsqueeze(absorbing, 1)) * (current_Q - y) 115 | else: 116 | reward = current_Q - y 117 | 118 | reg_multiplier = (1.0 / (1 - gamma.detach())) if self._normalized_val_func else 1.0 119 | if self._regularizer_mode == "exp_and_plcy": 120 | chi2_loss = torch.tensor(self._reg_mult) * reg_multiplier * (torch.square(reward)).mean() 121 | elif self._regularizer_mode == "exp": 122 | chi2_loss = torch.tensor(self._reg_mult) * reg_multiplier * (torch.square(reward[is_expert])).mean() 123 | elif self._regularizer_mode == "plcy": 124 | chi2_loss = torch.tensor(self._reg_mult) * reg_multiplier * (torch.square(reward[~is_expert])).mean() 125 | elif self._regularizer_mode == "value": 126 | V = self._Q_Q_multiplier * self.getV(obs) 127 | value = (V - y) 128 | chi2_loss = torch.tensor(self._reg_mult) * reg_multiplier * (torch.square(value)).mean() 129 | elif self._regularizer_mode == "exp_and_plcy_and_value": 130 
| V = self._Q_Q_multiplier * self.getV(obs[is_expert]) 131 | value = (V - y[is_expert]) 132 | reward = torch.concat([reward, value]) 133 | chi2_loss = torch.tensor(self._reg_mult) * reg_multiplier * (torch.square(reward)).mean() 134 | elif self._regularizer_mode == "off": 135 | chi2_loss = 0.0 136 | else: 137 | raise ValueError("Undefined regularizer mode %s." % (self._regularizer_mode)) 138 | 139 | # Add Q penalty TODO: maybe remove, since it did not work that great 140 | if self._use_Q_regularizer: 141 | loss_Q_pen = self._Q_reg_mult * torch.mean( 142 | torch.square(current_Q - torch.ones_like(current_Q) * self._Q_reg_target)) 143 | else: 144 | loss_Q_pen = 0.0 145 | 146 | # add gradient penalty if needed 147 | if self._gp_lambda > 0: 148 | with torch.no_grad(): 149 | act_plcy, _ = self.policy.compute_action_and_log_prob_t(obs[is_expert]) 150 | loss_gp = self._gradient_penalty(obs[is_expert], act[is_expert], 151 | obs[is_expert], act_plcy, self._gp_lambda) 152 | else: 153 | loss_gp = 0.0 154 | 155 | loss_Q = loss_term1 + loss_term2 + chi2_loss + loss_Q_pen + loss_gp 156 | self.update_Q_parameters(loss_Q) 157 | 158 | grads = [] 159 | for param in self._critic_approximator.model.network.parameters(): 160 | grads.append(param.grad.view(-1)) 161 | grads = torch.cat(grads) 162 | norm = grads.norm(dim=0, p=2) 163 | if self._iter % self._logging_iter == 0: 164 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. Q-parameters', norm, self._iter) 165 | 166 | return loss_term1, loss_term2, chi2_loss 167 | 168 | def iq_update(self, input_states, input_actions, input_n_states, input_absorbing, is_expert): 169 | 170 | # update Q function 171 | if self._iter % self._delay_Q == 0: 172 | self.update_Q_function(input_states, input_actions, input_n_states, input_absorbing, is_expert) 173 | 174 | # update policy 175 | if self._iter % self._delay_pi == 0: 176 | self.update_policy(input_states, is_expert) 177 | 178 | if self._iter % self._delay_Q == 0: 179 | self._update_target(self._critic_approximator, 180 | self._target_critic_approximator) 181 | 182 | def logging_loss(self, current_Q, y, reward, is_expert, obs, act, absorbing): 183 | 184 | if self._iter % self._logging_iter == 0: 185 | self.sw_add_scalar('Action-Value/Q for expert', self._Q_Q_multiplier * current_Q[is_expert].mean(), self._iter) 186 | self.sw_add_scalar('Action-Value/Q^2 for expert', self._Q_Q_multiplier * torch.square(current_Q[is_expert]).mean(), self._iter) 187 | self.sw_add_scalar('Action-Value/Reward_Expert', reward[is_expert].mean(), self._iter) 188 | 189 | Q_exp = current_Q[is_expert] 190 | Q_plcy = current_Q[~is_expert] 191 | abs_exp = absorbing[is_expert].bool() 192 | abs_plcy = absorbing[~is_expert].bool() 193 | self.sw_add_scalar('Action-Value/Q Absorbing state exp', torch.mean(Q_exp[abs_exp]), self._iter) 194 | self.sw_add_scalar('Action-Value/Q Absorbing state plcy', torch.mean(Q_plcy[abs_plcy]), self._iter) 195 | 196 | # norm 197 | w = self._critic_approximator.get_weights() 198 | self.sw_add_scalar("Action-Value/Norm of Q net: ",np.linalg.norm(w), self._iter) 199 | self.sw_add_scalar('Targets/expert data', y[is_expert].mean(), self._iter) 200 | # log mean squared action 201 | self.sw_add_scalar('Actions/mean squared action expert (from data)', torch.square(act[is_expert]).mean(), self._iter) 202 | self.sw_add_scalar('Actions/mean squared action expert (from policy)', np.square(self.policy.draw_action(obs[is_expert])).mean(), self._iter) 203 | 204 | # log mean of each action 205 | n_actions = len(act[0]) 206 | for i in 
range(n_actions): 207 | self.sw_add_scalar('All Actions means/action %d expert' % i, act[is_expert, i].mean(), 208 | self._iter) 209 | self.sw_add_scalar('All Actions variances/action %d expert' % i, torch.var(act[is_expert, i]), 210 | self._iter) 211 | self.sw_add_scalar('All Actions mins/action %d expert' % i, torch.min(act[is_expert, i]), 212 | self._iter) 213 | self.sw_add_scalar('All Actions mins/action %d expert' % i, torch.min(act[is_expert, i]), 214 | self._iter) 215 | self.sw_add_scalar('All Actions maxs/action %d expert' % i, torch.max(act[is_expert, i]), 216 | self._iter) 217 | -------------------------------------------------------------------------------- /imitation_lib/imitation/offline/lsiq_offline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.functional import F 3 | import numpy as np 4 | from mushroom_rl.utils.minibatches import minibatch_generator 5 | from mushroom_rl.utils.torch import to_float_tensor 6 | from imitation_lib.imitation.lsiq import LSIQ 7 | 8 | 9 | class LSIQ_Offline(LSIQ): 10 | 11 | def __init__(self, loss_mode_exp="fix", regularizer_mode="off", **kwargs): 12 | 13 | if "plcy_loss_mode" in kwargs.keys(): 14 | if kwargs["plcy_loss_mode"] != "v0": 15 | raise ValueError("This is the offline implementation of IQ, which expects: plcy_loss_mode=\"v0\".") 16 | else: 17 | kwargs["plcy_loss_mode"] = "v0" 18 | 19 | super().__init__(loss_mode_exp=loss_mode_exp, regularizer_mode=regularizer_mode, **kwargs) 20 | 21 | def fit(self, dataset): 22 | raise AttributeError("This is the offline implementation of IQ, it is not supposed to use the fit function. " 23 | "Use the fit_offline function instead.") 24 | 25 | def fit_offline(self, n_steps): 26 | 27 | for i in range(n_steps): 28 | 29 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 30 | assert self._act_mask.size > 0, "IQ-Learn needs demo actions!" 
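            # Offline setting: there is no interaction data, so the whole batch is drawn from the expert
            # demonstrations and is_expert is all True below. Consistently, the constructor forces
            # plcy_loss_mode="v0", i.e. the policy term of the IQ objective takes the telescoped
            # initial-state form from IQ-Learn, (1 - gamma) * E[V(s_0)], evaluated here on expert states
            # instead of an expectation over on-policy samples.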
31 | demo_obs, demo_act, demo_nobs, demo_absorbing = next(minibatch_generator(self._batch_size(), 32 | self._demonstrations["states"], 33 | self._demonstrations["actions"], 34 | self._demonstrations["next_states"], 35 | self._demonstrations["absorbing"])) 36 | 37 | # prepare data for IQ update 38 | input_states = to_float_tensor(demo_obs.astype(np.float32)[:, self._state_mask], self._use_cuda) 39 | input_actions = to_float_tensor(demo_act.astype(np.float32), self._use_cuda) 40 | input_n_states = to_float_tensor(demo_nobs.astype(np.float32)[:, self._state_mask], self._use_cuda) 41 | input_absorbing = to_float_tensor(demo_absorbing.astype(np.float32), self._use_cuda) 42 | is_expert = torch.ones(len(demo_obs), dtype=torch.bool).cuda() if self._use_cuda else torch.ones(len(demo_obs), dtype=torch.bool) 43 | 44 | # make IQ update 45 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 46 | 47 | self._iter += 1 48 | self.policy.iter += 1 49 | 50 | def _lossQ(self, obs, act, next_obs, absorbing, is_expert): 51 | 52 | # Calculate 1st term of loss: -E_(ρ_expert)[Q(s, a) - γV(s')] 53 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 54 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 55 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 56 | if not self._use_target: 57 | next_v = self.getV(next_obs) 58 | else: 59 | with torch.no_grad(): 60 | next_v = self.get_targetV(next_obs).detach() 61 | absorbing = torch.unsqueeze(absorbing, 1) 62 | y = (1 - absorbing) * gamma.detach() * self._Q_Q_multiplier * torch.clip(next_v, self._Q_min, self._Q_max) 63 | 64 | reward = (self._Q_Q_multiplier*current_Q - y) 65 | exp_reward = reward[is_expert] 66 | 67 | if self._loss_mode_exp == "bootstrap": 68 | loss_term1 = - exp_reward.mean() 69 | elif self._loss_mode_exp == "fix": 70 | if self._Q_exp_loss == "MSE": 71 | loss_term1 = F.mse_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 72 | elif self._Q_exp_loss == "Huber": 73 | loss_term1 = F.huber_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 74 | elif self._Q_exp_loss is None: 75 | raise ValueError("If you choose loss_mode_exp == fix, you have to specify Q_exp_loss. Setting it to" 76 | "None is not valid.") 77 | else: 78 | raise ValueError( 79 | "Choosen Q_exp_loss %s is not supported. Choose either MSE or Huber." 
% self._Q_exp_loss) 80 | 81 | # do the logging 82 | self.logging_loss(current_Q, y, reward, is_expert, obs, act, absorbing) 83 | 84 | # 2nd term for our loss (use expert and policy states): E_(ρ)[Q(s,a) - γV(s')] 85 | V = self._Q_Q_multiplier * self.getV(obs) 86 | value = (V - y) 87 | self.sw_add_scalar('V for policy on all states', self._Q_Q_multiplier * V.mean(), self._iter) 88 | value_loss = value 89 | if self._plcy_loss_mode == "value": 90 | loss_term2 = value_loss.mean() 91 | elif self._plcy_loss_mode == "value_expert": 92 | value_loss_exp = value_loss[is_expert] 93 | loss_term2 = value_loss_exp.mean() 94 | elif self._plcy_loss_mode == "value_policy": 95 | value_loss_plcy = value_loss[~is_expert] 96 | loss_term2 = value_loss_plcy.mean() 97 | elif self._plcy_loss_mode == "q_old_policy": 98 | reward_plcy = reward[~is_expert] 99 | loss_term2 = reward_plcy.mean() 100 | elif self._plcy_loss_mode == "value_q_old_policy": 101 | reward_plcy = reward[~is_expert] 102 | loss_term2 = reward_plcy.mean() + value_loss.mean() 103 | elif self._plcy_loss_mode == "v0": 104 | #value_loss_v0 = (1-gamma.detach()) * self.getV(obs[is_expert]) 105 | value_loss_v0 = (1-gamma.detach()) * self.getV(obs[is_expert]) 106 | loss_term2 = value_loss_v0.mean() 107 | elif self._plcy_loss_mode == "off": 108 | loss_term2 = 0.0 109 | else: 110 | raise ValueError("Undefined policy loss mode: %s" % self._plcy_loss_mode) 111 | 112 | # regularize 113 | if self._regularizer_mode == "exp_and_plcy": 114 | chi2_loss = ((1 - absorbing) * torch.tensor(self._reg_mult) * torch.square(reward) 115 | + self._abs_mult * absorbing * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 116 | * (torch.square(reward))).mean() 117 | elif self._regularizer_mode == "exp": 118 | chi2_loss = ((1 - absorbing[is_expert]) * torch.tensor(self._reg_mult) * torch.square(reward[is_expert]) 119 | + self._abs_mult * absorbing[is_expert] * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 120 | * (torch.square(reward[is_expert]))).mean() 121 | elif self._regularizer_mode == "plcy": 122 | chi2_loss = ((1 - absorbing[~is_expert]) * torch.tensor(self._reg_mult) * torch.square(reward[~is_expert]) 123 | + self._abs_mult * absorbing[~is_expert] * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 124 | * (torch.square(reward[~is_expert]))).mean() 125 | elif self._regularizer_mode == "value": 126 | V = self._Q_Q_multiplier * self.getV(obs) 127 | value = (V - y) 128 | chi2_loss = torch.tensor(self._reg_mult) * (torch.square(value)).mean() 129 | elif self._regularizer_mode == "exp_and_plcy_and_value": 130 | V = self._Q_Q_multiplier * self.getV(obs[is_expert]) 131 | value = (V - y[is_expert]) 132 | reward = torch.concat([reward, value]) 133 | chi2_loss = torch.tensor(self._reg_mult) * (torch.square(reward)).mean() 134 | elif self._regularizer_mode == "off": 135 | chi2_loss = 0.0 136 | else: 137 | raise ValueError("Undefined regularizer mode %s." 
% (self._regularizer_mode)) 138 | 139 | # add gradient penalty if needed 140 | if self._gp_lambda > 0: 141 | with torch.no_grad(): 142 | act_plcy, _ = self.policy.compute_action_and_log_prob_t(obs[is_expert]) 143 | loss_gp = self._gradient_penalty(obs[is_expert], act[is_expert], 144 | obs[is_expert], act_plcy, self._gp_lambda) 145 | else: 146 | loss_gp = 0.0 147 | 148 | loss_Q = loss_term1 + loss_term2 + chi2_loss + loss_gp 149 | self.update_Q_parameters(loss_Q) 150 | 151 | grads = [] 152 | for param in self._critic_approximator.model.network.parameters(): 153 | grads.append(param.grad.view(-1)) 154 | grads = torch.cat(grads) 155 | norm = grads.norm(dim=0, p=2) 156 | if self._iter % self._logging_iter == 0: 157 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. Q-parameters', norm, self._iter) 158 | 159 | return loss_term1, loss_term2, chi2_loss 160 | 161 | def iq_update(self, input_states, input_actions, input_n_states, input_absorbing, is_expert): 162 | 163 | # update Q function 164 | if self._iter % self._delay_Q == 0: 165 | self.update_Q_function(input_states, input_actions, input_n_states, input_absorbing, is_expert) 166 | 167 | # update policy 168 | if self._iter % self._delay_pi == 0: 169 | self.update_policy(input_states, input_actions, is_expert) 170 | 171 | if self._iter % self._delay_Q == 0: 172 | self._update_target(self._critic_approximator, 173 | self._target_critic_approximator) 174 | 175 | def update_policy(self, input_states, input_actions, is_expert): 176 | 177 | if self._train_policy_only_on_own_states: 178 | policy_training_states = input_states[~is_expert] 179 | else: 180 | policy_training_states = input_states 181 | action_new, log_prob = self.policy.compute_action_and_log_prob_t(policy_training_states) 182 | loss = self._actor_loss(policy_training_states, action_new, log_prob) 183 | 184 | self._optimize_actor_parameters(loss) 185 | grads = [] 186 | for param in self.policy._approximator.model.network.parameters(): 187 | grads.append(param.grad.view(-1)) 188 | grads = torch.cat(grads) 189 | norm = grads.norm(dim=0, p=2) 190 | if self._iter % self._logging_iter == 0: 191 | self.sw_add_scalar('Gradients/Norm2 Gradient Q wrt. 
Pi-parameters', norm, 192 | self._iter) 193 | self.sw_add_scalar('Actor/Loss', loss, self._iter) 194 | _, log_prob = self.policy.compute_action_and_log_prob_t(input_states) 195 | self.sw_add_scalar('Actor/Entropy Expert States', torch.mean(-log_prob[is_expert]).detach().item(), 196 | self._iter) 197 | self.sw_add_scalar('Actor/Entropy Policy States', torch.mean(-log_prob[~is_expert]).detach().item(), 198 | self._iter) 199 | _, logsigma = self.policy.get_mu_log_sigma(input_states[~is_expert]) 200 | ent_gauss = self.policy.entropy_from_logsigma(logsigma) 201 | e_lb = self.policy.get_e_lb() 202 | self.sw_add_scalar('Actor/Entropy from Gaussian Policy States', torch.mean(ent_gauss).detach().item(), 203 | self._iter) 204 | self.sw_add_scalar('Actor/Entropy Lower Bound', e_lb, self._iter) 205 | _, logsigma = self.policy.get_mu_log_sigma(input_states[is_expert]) 206 | ent_gauss = self.policy.entropy_from_logsigma(logsigma) 207 | self.sw_add_scalar('Actor/Entropy from Gaussian Expert States', torch.mean(ent_gauss).detach().item(), 208 | self._iter) 209 | if self._learnable_alpha: 210 | self._update_alpha(log_prob.detach()) 211 | 212 | def logging_loss(self, current_Q, y, reward, is_expert, obs, act, absorbing): 213 | 214 | if self._iter % self._logging_iter == 0: 215 | self.sw_add_scalar('Action-Value/Q for expert', self._Q_Q_multiplier * current_Q[is_expert].mean(), self._iter) 216 | self.sw_add_scalar('Action-Value/Q^2 for expert', self._Q_Q_multiplier * torch.square(current_Q[is_expert]).mean(), self._iter) 217 | self.sw_add_scalar('Action-Value/Reward_Expert', reward[is_expert].mean(), self._iter) 218 | 219 | Q_exp = current_Q[is_expert] 220 | Q_plcy = current_Q[~is_expert] 221 | abs_exp = absorbing[is_expert].bool() 222 | abs_plcy = absorbing[~is_expert].bool() 223 | self.sw_add_scalar('Action-Value/Q Absorbing state exp', torch.mean(Q_exp[abs_exp]), self._iter) 224 | self.sw_add_scalar('Action-Value/Q Absorbing state plcy', torch.mean(Q_plcy[abs_plcy]), self._iter) 225 | 226 | # norm 227 | w = self._critic_approximator.get_weights() 228 | self.sw_add_scalar("Action-Value/Norm of Q net: ",np.linalg.norm(w), self._iter) 229 | self.sw_add_scalar('Targets/expert data', y[is_expert].mean(), self._iter) 230 | # log mean squared action 231 | self.sw_add_scalar('Actions/mean squared action expert (from data)', torch.square(act[is_expert]).mean(), self._iter) 232 | self.sw_add_scalar('Actions/mean squared action expert (from policy)', np.square(self.policy.draw_action(obs[is_expert])).mean(), self._iter) 233 | 234 | # log mean of each action 235 | n_actions = len(act[0]) 236 | for i in range(n_actions): 237 | self.sw_add_scalar('All Actions means/action %d expert' % i, act[is_expert, i].mean(), 238 | self._iter) 239 | self.sw_add_scalar('All Actions variances/action %d expert' % i, torch.var(act[is_expert, i]), 240 | self._iter) 241 | self.sw_add_scalar('All Actions mins/action %d expert' % i, torch.min(act[is_expert, i]), 242 | self._iter) 243 | self.sw_add_scalar('All Actions mins/action %d expert' % i, torch.min(act[is_expert, i]), 244 | self._iter) 245 | self.sw_add_scalar('All Actions maxs/action %d expert' % i, torch.max(act[is_expert, i]), 246 | self._iter) 247 | -------------------------------------------------------------------------------- /imitation_lib/imitation/offline/lsiq_offline_dm.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import torch 3 | from torch.functional import F 4 | import numpy as np 5 | from 
mushroom_rl.utils.minibatches import minibatch_generator 6 | from mushroom_rl.utils.torch import to_float_tensor 7 | from mushroom_rl.approximators import Regressor 8 | from mushroom_rl.approximators.parametric import TorchApproximator 9 | from imitation_lib.imitation.lsiq import LSIQ 10 | 11 | 12 | class LSIQ_Offline_DM(LSIQ): 13 | 14 | def __init__(self, dynamics_model_params, dynamics_model_init_epochs=250, 15 | random_demonstrations=None, loss_mode_exp="fix", regularizer_mode="off", **kwargs): 16 | 17 | super().__init__(loss_mode_exp=loss_mode_exp, regularizer_mode=regularizer_mode, **kwargs) 18 | 19 | self._dynamics_model = Regressor(TorchApproximator, **dynamics_model_params) 20 | self._dynamics_model_init_epochs = dynamics_model_init_epochs 21 | self._dynamics_model_initialized = False 22 | 23 | expert_demonstrations = deepcopy(kwargs["demonstrations"]) 24 | if random_demonstrations is not None: 25 | self._dynamics_model_training_data = dict() 26 | for key, value in expert_demonstrations.items(): 27 | if key != "episode_starts": 28 | self._dynamics_model_training_data[key] = np.concatenate([value, random_demonstrations[key]]) 29 | self.add_dataset_to_replay_memory(random_demonstrations) 30 | else: 31 | self._dynamics_model_training_data = expert_demonstrations 32 | 33 | low, high = self.mdp_info.observation_space.low.copy(),\ 34 | self.mdp_info.observation_space.high.copy() 35 | self.norm_act_mean = (high + low) / 2.0 36 | self.norm_act_delta = (high - low) / 2.0 37 | 38 | self._state = None 39 | self._idx_state = 0 40 | self._max_traj_len = 200 41 | 42 | def fit(self, dataset): 43 | raise AttributeError("This is the offline implementation of IQ, it is not supposed to use the fit function. " 44 | "Use the fit_offline function instead.") 45 | 46 | def fit_offline(self, n_steps): 47 | 48 | if not self._dynamics_model_initialized: 49 | self.fit_dynamics_model(self._dynamics_model_init_epochs) 50 | self.predict_trajectories_and_add_to_replay_buffer(100, 100) 51 | self._dynamics_model_initialized = True 52 | #else: 53 | # self.fit_dynamics_model(1) 54 | 55 | 56 | for i in range(n_steps): 57 | 58 | self.add_next_step_to_buffer() 59 | 60 | # sample batch from policy replay buffer 61 | state, action, reward, next_state, absorbing, _ = \ 62 | self._replay_memory.get(self._batch_size()) 63 | 64 | # sample batch of same size from expert replay buffer and concatenate with samples from own policy 65 | assert self._act_mask.size > 0, "IQ-Learn needs demo actions!" 
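            # Unlike LSIQ_Offline, this "DM" variant does have a policy half in the batch: the replay
            # memory sampled above is filled with synthetic transitions rolled out under the learned
            # dynamics model (see fit_dynamics_model / add_next_step_to_buffer below), roughly
            # s' ~ unnormalize(dynamics_model(concat(s, a))) with actions drawn from the current policy.
            # is_expert therefore marks only the demonstration half of the batch.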
66 | demo_obs, demo_act, demo_nobs, demo_absorbing = next(minibatch_generator(state.shape[0], 67 | self._demonstrations["states"], 68 | self._demonstrations["actions"], 69 | self._demonstrations[ 70 | "next_states"], 71 | self._demonstrations["absorbing"])) 72 | 73 | # prepare data for IQ update 74 | input_states = to_float_tensor(np.concatenate([state, demo_obs.astype(np.float32)[:, self._state_mask]])) 75 | input_actions = to_float_tensor(np.concatenate([action, demo_act.astype(np.float32)])) 76 | input_n_states = to_float_tensor(np.concatenate([next_state, 77 | demo_nobs.astype(np.float32)[:, self._state_mask]])) 78 | input_absorbing = to_float_tensor(np.concatenate([absorbing, demo_absorbing.astype(np.float32)])) 79 | is_expert = torch.concat([torch.zeros(len(state), dtype=torch.bool), 80 | torch.ones(len(state), dtype=torch.bool)]) 81 | 82 | # make IQ update 83 | self.iq_update(input_states, input_actions, input_n_states, input_absorbing, is_expert) 84 | 85 | self._iter += 1 86 | self.policy.iter += 1 87 | 88 | def _lossQ(self, obs, act, next_obs, absorbing, is_expert): 89 | 90 | # Calculate 1st term of loss: -E_(ρ_expert)[Q(s, a) - γV(s')] 91 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 92 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 93 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 94 | if not self._use_target: 95 | next_v = self.getV(next_obs) 96 | else: 97 | with torch.no_grad(): 98 | next_v = self.get_targetV(next_obs).detach() 99 | absorbing = torch.unsqueeze(absorbing, 1) 100 | y = (1 - absorbing) * gamma.detach() * self._Q_Q_multiplier * torch.clip(next_v, self._Q_min, self._Q_max) 101 | 102 | reward = (self._Q_Q_multiplier*current_Q - y) 103 | exp_reward = reward[is_expert] 104 | 105 | #if self._loss_mode_exp == "bootstrap": # todo: remove this was just for testing 106 | # loss_term1 = F.mse_loss(current_Q[is_expert], 107 | # torch.ones_like(current_Q[is_expert]) * (1.0/self._reg_mult) + gamma.detach() * current_Q[is_expert].detach().cpu()) 108 | if self._loss_mode_exp == "bootstrap": 109 | loss_term1 = - exp_reward.mean() 110 | elif self._loss_mode_exp == "fix": 111 | if self._Q_exp_loss == "MSE": 112 | loss_term1 = F.mse_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 113 | elif self._Q_exp_loss == "Huber": 114 | loss_term1 = F.huber_loss(current_Q[is_expert], torch.ones_like(current_Q[is_expert]) * self._Q_max) 115 | elif self._Q_exp_loss is None: 116 | raise ValueError("If you choose loss_mode_exp == fix, you have to specify Q_exp_loss. Setting it to" 117 | "None is not valid.") 118 | else: 119 | raise ValueError( 120 | "Choosen Q_exp_loss %s is not supported. Choose either MSE or Huber." 
% self._Q_exp_loss) 121 | 122 | # do the logging 123 | self.logging_loss(current_Q, y, reward, is_expert, obs, act, absorbing) 124 | 125 | # 2nd term for our loss (use expert and policy states): E_(ρ)[Q(s,a) - γV(s')] 126 | V = self._Q_Q_multiplier * self.getV(obs) 127 | value = (V - y) 128 | self.sw_add_scalar('V for policy on all states', self._Q_Q_multiplier * V.mean(), self._iter) 129 | value_loss = value 130 | if self._plcy_loss_mode == "value": 131 | loss_term2 = value_loss.mean() 132 | elif self._plcy_loss_mode == "value_expert": 133 | value_loss_exp = value_loss[is_expert] 134 | loss_term2 = value_loss_exp.mean() 135 | elif self._plcy_loss_mode == "value_policy": 136 | value_loss_plcy = value_loss[~is_expert] 137 | loss_term2 = value_loss_plcy.mean() 138 | elif self._plcy_loss_mode == "q_old_policy": 139 | reward_plcy = reward[~is_expert] 140 | loss_term2 = reward_plcy.mean() 141 | elif self._plcy_loss_mode == "value_q_old_policy": 142 | reward_plcy = reward[~is_expert] 143 | loss_term2 = reward_plcy.mean() + value_loss.mean() 144 | elif self._plcy_loss_mode == "v0": 145 | value_loss_v0 = (1-gamma.detach()) * self.getV(obs[is_expert]) 146 | loss_term2 = value_loss_v0.mean() 147 | elif self._plcy_loss_mode == "off": 148 | loss_term2 = 0.0 149 | else: 150 | raise ValueError("Undefined policy loss mode: %s" % self._plcy_loss_mode) 151 | 152 | # regularize 153 | if self._regularizer_mode == "exp_and_plcy": 154 | chi2_loss = ((1 - absorbing) * torch.tensor(self._reg_mult) * torch.square(reward) 155 | + self._abs_mult * absorbing * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 156 | * (torch.square(reward))).mean() 157 | elif self._regularizer_mode == "exp": 158 | chi2_loss = ((1 - absorbing[is_expert]) * torch.tensor(self._reg_mult) * torch.square(reward[is_expert]) 159 | + self._abs_mult * absorbing[is_expert] * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 160 | * (torch.square(reward[is_expert]))).mean() 161 | elif self._regularizer_mode == "plcy": 162 | chi2_loss = ((1 - absorbing[~is_expert]) * torch.tensor(self._reg_mult) * torch.square(reward[~is_expert]) 163 | + self._abs_mult * absorbing[~is_expert] * (1.0 - gamma.detach()) * torch.tensor(self._reg_mult) 164 | * (torch.square(reward[~is_expert]))).mean() 165 | elif self._regularizer_mode == "value": 166 | V = self._Q_Q_multiplier * self.getV(obs) 167 | value = (V - y) 168 | chi2_loss = torch.tensor(self._reg_mult) * (torch.square(value)).mean() 169 | elif self._regularizer_mode == "exp_and_plcy_and_value": 170 | V = self._Q_Q_multiplier * self.getV(obs[is_expert]) 171 | value = (V - y[is_expert]) 172 | reward = torch.concat([reward, value]) 173 | chi2_loss = torch.tensor(self._reg_mult) * (torch.square(reward)).mean() 174 | elif self._regularizer_mode == "off": 175 | chi2_loss = 0.0 176 | else: 177 | raise ValueError("Undefined regularizer mode %s." 
% (self._regularizer_mode)) 178 | 179 | # add gradient penalty if needed 180 | if self._gp_lambda > 0: 181 | with torch.no_grad(): 182 | act_plcy, _ = self.policy.compute_action_and_log_prob_t(obs[is_expert]) 183 | loss_gp = self._gradient_penalty(obs[is_expert], act[is_expert], 184 | obs[is_expert], act_plcy, self._gp_lambda) 185 | else: 186 | loss_gp = 0.0 187 | 188 | loss_Q = loss_term1 + loss_term2 + chi2_loss + loss_gp 189 | self.update_Q_parameters(loss_Q) 190 | 191 | grads = [] 192 | for param in self._critic_approximator.model.network.parameters(): 193 | grads.append(param.grad.view(-1)) 194 | grads = torch.cat(grads) 195 | norm = grads.norm(dim=0, p=2) 196 | if self._iter % self._logging_iter == 0: 197 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. Q-parameters', norm, self._iter) 198 | 199 | return loss_term1, loss_term2, chi2_loss 200 | 201 | def iq_update(self, input_states, input_actions, input_n_states, input_absorbing, is_expert): 202 | 203 | # update Q function 204 | if self._iter % self._delay_Q == 0: 205 | self.update_Q_function(input_states, input_actions, input_n_states, input_absorbing, is_expert) 206 | 207 | # update policy 208 | if self._iter % self._delay_pi == 0: 209 | self.update_policy(input_states, is_expert) 210 | 211 | if self._iter % self._delay_Q == 0: 212 | self._update_target(self._critic_approximator, 213 | self._target_critic_approximator) 214 | 215 | def fit_dynamics_model(self, n_epochs=1): 216 | 217 | states = self._dynamics_model_training_data["states"] 218 | actions = self._dynamics_model_training_data["actions"] 219 | inputs = np.concatenate([states, actions], axis=1) 220 | targets = self._dynamics_model_training_data["next_states"] 221 | 222 | # normalize targets 223 | targets = (targets - self.norm_act_mean) / self.norm_act_delta 224 | 225 | self._dynamics_model.fit(inputs, targets, n_epochs=n_epochs) 226 | 227 | preds = self._dynamics_model.predict(inputs) 228 | loss = F.mse_loss(to_float_tensor(preds), to_float_tensor(targets)) 229 | self.sw_add_scalar("Forward_DM/Loss", torch.mean(loss), self._iter) 230 | print("Loss", torch.mean(loss).detach().cpu().numpy()) 231 | 232 | def add_next_step_to_buffer(self): 233 | 234 | if self._idx_state >= self._max_traj_len or self._state is None: 235 | init_state_idx = np.random.randint(len(self._dynamics_model_training_data["states"])*0.8) 236 | self._state = self._dynamics_model_training_data["states"][init_state_idx] 237 | self._idx_state = 0 238 | 239 | action = self.policy.draw_action(self._state) 240 | action = np.clip(action, self.mdp_info.action_space.low, self.mdp_info.action_space.high) 241 | inputs = np.concatenate([self._state, action]) 242 | next_state = self._dynamics_model.predict(inputs) 243 | 244 | # unnormalize next state 245 | next_state = (next_state * self.norm_act_delta) + self.norm_act_mean 246 | 247 | self._replay_memory.add([[self._state, action, 0.0, next_state, 0, 0]]) 248 | 249 | self._state = next_state 250 | self._idx_state += 1 251 | 252 | 253 | def predict_trajectories_and_add_to_replay_buffer(self, n_trajec, trajec_len): 254 | 255 | for i in range(n_trajec): 256 | # get initial state 257 | init_state_idx = np.random.randint(len(self._dynamics_model_training_data["states"])*0.8) 258 | state = self._dynamics_model_training_data["states"][init_state_idx] 259 | for j in range(trajec_len): 260 | action = self.policy.draw_action(state) 261 | action = np.clip(action, self.mdp_info.action_space.low, self.mdp_info.action_space.high) 262 | inputs = np.concatenate([state, 
action]) 263 | next_state = self._dynamics_model.predict(inputs) 264 | 265 | # unnormalize next state 266 | next_state = (next_state * self.norm_act_delta) + self.norm_act_mean 267 | 268 | self._replay_memory.add([[state, action, 0.0, next_state, 0, 0]]) 269 | 270 | state = next_state 271 | 272 | def add_dataset_to_replay_memory(self, dataset): 273 | 274 | for i in range(len(dataset["states"])): 275 | self._replay_memory.add([[dataset["states"][i], dataset["actions"][i], dataset["rewards"][i], 276 | dataset["next_states"][i], dataset["absorbing"][i], dataset["last"][i]]]) 277 | -------------------------------------------------------------------------------- /imitation_lib/imitation/sqil_sac.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from .iq_sac import IQ_SAC 4 | from mushroom_rl.utils.torch import to_float_tensor 5 | 6 | 7 | class SQIL(IQ_SAC): 8 | 9 | def __init__(self, R_min=0.0, R_max=1.0, plcy_loss_mode="plcy", **kwargs): 10 | 11 | super(SQIL, self).__init__(plcy_loss_mode=plcy_loss_mode, **kwargs) 12 | 13 | self._R_min = R_min 14 | self._R_max = R_max 15 | 16 | def iq_update(self, input_states, input_actions, input_n_states, input_absorbing, is_expert): 17 | """ This function overrides the respective function from iq. It makes only slight changes. """ 18 | if self._iter % self._delay_Q == 0: 19 | lossQ = self._lossQ(input_states, input_actions, input_n_states, input_absorbing, 20 | is_expert) 21 | if self._iter % self._logging_iter == 0: 22 | self.sw_add_scalar('IQ-Loss/LossQ', lossQ, self._iter) 23 | 24 | # update policy 25 | if self._replay_memory.size > self._warmup_transitions() and self._iter % self._delay_pi == 0: 26 | if self._train_policy_only_on_own_states: 27 | policy_training_states = input_states[~is_expert] 28 | else: 29 | policy_training_states = input_states 30 | action_new, log_prob = self.policy.compute_action_and_log_prob_t(policy_training_states) 31 | loss = self._actor_loss(policy_training_states, action_new, log_prob) 32 | self._optimize_actor_parameters(loss) 33 | grads = [] 34 | for param in self.policy._approximator.model.network.parameters(): 35 | grads.append(param.grad.view(-1)) 36 | grads = torch.cat(grads) 37 | norm = grads.norm(dim=0, p=2) 38 | if self._iter % self._logging_iter == 0: 39 | self.sw_add_scalar('Gradients/Norm2 Gradient Q wrt. 
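            # SQIL (Reddy et al., 2019) reduces imitation to RL with constant rewards: expert transitions
            # receive r = R_max (1.0 by default) and the agent's own transitions receive r = R_min
            # (0.0 by default). The loss is then a plain squared soft-Bellman error,
            #     0.5 * E_expert[(Q(s,a) - (R_max + gamma * V(s')))^2]
            #   + 0.5 * E_policy[(Q(s,a) - (R_min + gamma * V(s')))^2],
            # which is what the expert term above and the branch below implement.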
Pi-parameters', norm, 40 | self._iter) 41 | self.sw_add_scalar('Actor/Loss', loss, self._iter) 42 | _, log_prob = self.policy.compute_action_and_log_prob_t(input_states) 43 | self.sw_add_scalar('Actor/Entropy Expert States', torch.mean(-log_prob[is_expert]).detach().item(), 44 | self._iter) 45 | self.sw_add_scalar('Actor/Entropy Policy States', torch.mean(-log_prob[~is_expert]).detach().item(), 46 | self._iter) 47 | _, logsigma = self.policy.get_mu_log_sigma(input_states[~is_expert]) 48 | ent_gauss = self.policy.entropy_from_logsigma(logsigma) 49 | e_lb = self.policy.get_e_lb() 50 | self.sw_add_scalar('Actor/Entropy from Gaussian Policy States', torch.mean(ent_gauss).detach().item(), 51 | self._iter) 52 | self.sw_add_scalar('Actor/Entropy Lower Bound', e_lb, self._iter) 53 | _, logsigma = self.policy.get_mu_log_sigma(input_states[is_expert]) 54 | ent_gauss = self.policy.entropy_from_logsigma(logsigma) 55 | self.sw_add_scalar('Actor/Entropy from Gaussian Expert States', torch.mean(ent_gauss).detach().item(), 56 | self._iter) 57 | if self._learnable_alpha: 58 | self._update_alpha(log_prob.detach()) 59 | 60 | if self._iter % self._delay_Q == 0: 61 | self._update_target(self._critic_approximator, 62 | self._target_critic_approximator) 63 | 64 | def _lossQ(self, obs, act, next_obs, absorbing, is_expert): 65 | """ 66 | This function overrides the iq-loss and replaces it with the sqil loss. 67 | """ 68 | gamma = to_float_tensor(self.mdp_info.gamma).cuda() if self._use_cuda else to_float_tensor(self.mdp_info.gamma) 69 | absorbing = torch.tensor(absorbing).cuda() if self._use_cuda else absorbing 70 | current_Q = self._critic_approximator(obs, act, output_tensor=True) 71 | if not self._use_target: 72 | next_v = self.getV(next_obs) 73 | else: 74 | with torch.no_grad(): 75 | next_v = self.get_targetV(next_obs).detach() 76 | 77 | y = (1 - torch.unsqueeze(absorbing, 1)) * gamma.detach() * next_v 78 | 79 | # expert part of loss 80 | loss_Q = 0.5 * torch.mean(torch.square(current_Q[is_expert] - (self._R_max + y[is_expert]))) 81 | 82 | # plcy part of loss 83 | if self._plcy_loss_mode == "value": 84 | V = self.getV(obs) 85 | loss_Q += 0.5 * torch.mean(torch.square(V - (self._R_min + y))) 86 | elif self._plcy_loss_mode == "value_plcy": 87 | V = self.getV(obs) 88 | loss_Q += 0.5 * torch.mean(torch.square(V[~is_expert] - (self._R_min + y[~is_expert]))) 89 | elif self._plcy_loss_mode == "plcy": # this is the true sqil objective for the policy. 
90 | loss_Q += 0.5 * torch.mean(torch.square(current_Q[~is_expert] - (self._R_min + y[~is_expert]))) 91 | 92 | loss_Q *= self._reg_mult 93 | 94 | if self._iter % self._logging_iter == 0: 95 | reward = (current_Q - y) 96 | self.sw_add_scalar('Action-Value/Q for expert', current_Q[is_expert].mean(), self._iter) 97 | self.sw_add_scalar('Action-Value/Q^2 for expert', torch.square(current_Q[is_expert]).mean(), self._iter) 98 | self.sw_add_scalar('Action-Value/Q for policy', current_Q[~is_expert].mean(), self._iter) 99 | self.sw_add_scalar('Action-Value/Q^2 for policy', torch.square(current_Q[~is_expert]).mean(), self._iter) 100 | self.sw_add_scalar('Action-Value/Reward', reward.mean(), self._iter) 101 | self.sw_add_scalar('Action-Value/Reward_Expert', reward[is_expert].mean(), self._iter) 102 | self.sw_add_scalar('Action-Value/Reward_Policy', reward[~is_expert].mean(), self._iter) 103 | self.sw_add_scalar('Action-Value/R_min', self._R_min, self._iter) 104 | # norm 105 | w = self._critic_approximator.get_weights() 106 | self.sw_add_scalar("Action-Value/Norm of Q net: ",np.linalg.norm(w), self._iter) 107 | self.sw_add_scalar('Targets/expert data', y[is_expert].mean(), self._iter) 108 | self.sw_add_scalar('Targets/policy data', y[~is_expert].mean(), self._iter) 109 | # log mean squared action 110 | self.sw_add_scalar('Actions/mean squared action expert (from data)', torch.square(act[is_expert]).mean(), self._iter) 111 | self.sw_add_scalar('Actions/mean squared action expert (from policy)', np.square(self.policy.draw_action(obs[is_expert])).mean(), self._iter) 112 | self.sw_add_scalar('Actions/mean squared action policy', torch.square(act[~is_expert]).mean(), self._iter) 113 | self.sw_add_scalar('Actions/mean squared action both', torch.square(act).mean(), self._iter) 114 | 115 | # log mean of each action 116 | n_actions = len(act[0]) 117 | for i in range(n_actions): 118 | self.sw_add_scalar('All Actions means/action %d expert' % i, act[is_expert, i].mean(), 119 | self._iter) 120 | self.sw_add_scalar('All Actions means/action %d policy' % i, act[~is_expert, i].mean(), 121 | self._iter) 122 | self.sw_add_scalar('All Actions variances/action %d expert' % i, torch.var(act[is_expert, i]), 123 | self._iter) 124 | self.sw_add_scalar('All Actions variances/action %d policy' % i, torch.var(act[~is_expert, i]), 125 | self._iter) 126 | 127 | self.update_Q_parameters(loss_Q) 128 | 129 | grads = [] 130 | for param in self._critic_approximator.model.network.parameters(): 131 | grads.append(param.grad.view(-1)) 132 | grads = torch.cat(grads) 133 | norm = grads.norm(dim=0, p=2) 134 | self.sw_add_scalar('Gradients/Norm2 Gradient LossQ wrt. 
Q-parameters', norm, self._iter) 135 | 136 | return loss_Q 137 | -------------------------------------------------------------------------------- /imitation_lib/imitation/vail_TRPO.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | import numpy as np 6 | 7 | from imitation_lib.imitation import GAIL_TRPO 8 | from imitation_lib.utils import to_float_tensors 9 | 10 | 11 | class VAIL(GAIL_TRPO): 12 | 13 | def __init__(self, **kwargs): 14 | 15 | # call base constructor 16 | super(VAIL, self).__init__(**kwargs) 17 | 18 | def discrim_output(self, *inputs, apply_mask=True): 19 | inputs = self.prepare_discrim_inputs(inputs, apply_mask=apply_mask) 20 | d_out,_ ,_ = self._D(*inputs) 21 | return d_out 22 | 23 | def _discriminator_logging(self, inputs, targets): 24 | super(VAIL, self)._discriminator_logging(inputs, targets) 25 | if self._sw: 26 | # calculate bottleneck loss 27 | loss = deepcopy(self._loss) 28 | d, mu, logvar = to_float_tensors(self._D(*inputs)) 29 | bottleneck_loss = loss.bottleneck_loss(mu, logvar) 30 | self._sw.add_scalar('Bottleneck_Loss', bottleneck_loss, self._iter // 3) 31 | self._sw.add_scalar('Beta', loss._beta, self._iter // 3) 32 | self._sw.add_scalar('Bottleneck_Loss_times_Beta', loss._beta * bottleneck_loss, self._iter // 3) 33 | 34 | -------------------------------------------------------------------------------- /imitation_lib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .math import * 2 | from .networks import * 3 | from .training import * 4 | from .preprocessor import * -------------------------------------------------------------------------------- /imitation_lib/utils/distributions.py: -------------------------------------------------------------------------------- 1 | from torch.distributions import constraints 2 | from torch.distributions.transforms import PowerTransform 3 | from torch.distributions import TransformedDistribution, Gamma 4 | 5 | 6 | 7 | class InverseGamma(TransformedDistribution): 8 | 9 | def __init__(self, concentration, rate, validate_args=None): 10 | base_dist = Gamma(concentration, rate) 11 | super().__init__( 12 | base_dist, 13 | PowerTransform(-base_dist.rate.new_ones(())), 14 | validate_args=validate_args, 15 | ) 16 | 17 | def expand(self, batch_shape, _instance=None): 18 | new = self._get_checked_instance(InverseGamma, _instance) 19 | return super().expand(batch_shape, _instance=new) 20 | 21 | @property 22 | def concentration(self): 23 | return self.base_dist.concentration 24 | 25 | @property 26 | def rate(self): 27 | return self.base_dist.rate 28 | -------------------------------------------------------------------------------- /imitation_lib/utils/math.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import numpy as np 3 | import torch 4 | import torch.nn.functional as F 5 | from mushroom_rl.utils.angles import euler_to_quat 6 | 7 | 8 | from mushroom_rl.utils.torch import to_float_tensor 9 | 10 | 11 | class GailDiscriminatorLoss(torch.nn.modules.BCEWithLogitsLoss): 12 | 13 | def __init__(self, entcoeff=1e-3, weight: Optional[torch.Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean', 14 | pos_weight: Optional[torch.Tensor] = None) -> None: 15 | 16 | super(GailDiscriminatorLoss, self).__init__(weight, size_average, reduce, reduction, pos_weight) 17 | 18 | 
self.sigmoid = torch.nn.Sigmoid() 19 | self.logsigmoid = torch.nn.LogSigmoid() 20 | self.entcoeff = entcoeff 21 | 22 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 23 | # overrides original BCELoss 24 | # from tensorflow max(x, 0) - x * z + log(1 + exp(-abs(x))) 25 | bce_loss = torch.maximum(input, torch.zeros_like(input)) - input * target + torch.log(1 + torch.exp(-torch.abs(input))) 26 | bce_loss = torch.mean(bce_loss) 27 | 28 | bernoulli_ent = self.entcoeff * torch.mean(self.logit_bernoulli_entropy(input)) 29 | return bce_loss - bernoulli_ent 30 | 31 | def logit_bernoulli_entropy(self, logits): 32 | """ 33 | Adapted from: 34 | https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51 35 | """ 36 | return (1. - self.sigmoid(logits)) * logits - self.logsigmoid(logits) 37 | 38 | 39 | class VDBLoss(GailDiscriminatorLoss): 40 | 41 | def __init__(self, info_constraint, lr_beta, use_bernoulli_ent=False, entcoeff=1e-3, weight: Optional[torch.Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean', 42 | pos_weight: Optional[torch.Tensor] = None) -> None: 43 | 44 | # call base constructor 45 | super().__init__(entcoeff, weight, size_average, reduce, reduction, pos_weight) 46 | 47 | self._use_bernoulli_ent = use_bernoulli_ent 48 | self._info_constr = info_constraint 49 | self._lr_beta = lr_beta 50 | self._beta = 0.1 51 | 52 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 53 | logits, mu, logvar = input 54 | 55 | # bottleneck loss 56 | bottleneck_loss = self.bottleneck_loss(mu, logvar) 57 | 58 | # binary cross entropy loss 59 | bce_loss = F.binary_cross_entropy_with_logits(torch.squeeze(logits), torch.squeeze(target), self.weight, 60 | pos_weight=self.pos_weight, 61 | reduction=self.reduction) 62 | 63 | # optional, additional bernoulli entropy (as in gail, but this was not used in the paper) 64 | bernoulli_ent = self.logit_bernoulli_entropy(logits) if self._use_bernoulli_ent else torch.zeros_like(bce_loss) 65 | 66 | # overall vdb loss 67 | vdb_loss = bce_loss + self._beta * bottleneck_loss + bernoulli_ent 68 | 69 | # update beta 70 | self._update_beta(bottleneck_loss) 71 | 72 | return vdb_loss 73 | 74 | def bottleneck_loss(self, mu, logvar): 75 | kld = self.kl_divergence(mu, logvar).mean() 76 | bottleneck_loss = kld - self._info_constr 77 | return bottleneck_loss 78 | 79 | @torch.no_grad() 80 | def _update_beta(self, bottleneck_loss): 81 | self._beta = max(0, self._beta + self._lr_beta * bottleneck_loss) 82 | 83 | @staticmethod 84 | def kl_divergence(mu, logvar): 85 | kl_div = 0.5 * torch.sum(torch.pow(mu, 2) + torch.exp(logvar) - logvar - 1, dim=1) 86 | return kl_div 87 | 88 | 89 | def to_float_tensors(inputs): 90 | """ Takes a list or tuple of of numpy arrays and converts them to a list of torch tensors. 
If only an array is 91 | provided, it returns a torch tensor.""" 92 | if type(inputs) is not tuple and type(inputs) is not list: 93 | return to_float_tensor(inputs) 94 | else: 95 | out = [] 96 | for elem in inputs: 97 | out.append(to_float_tensor(elem)) 98 | return out 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /imitation_lib/utils/preprocessor.py: -------------------------------------------------------------------------------- 1 | from mushroom_rl.core import Serializable 2 | 3 | 4 | class MaskingPreprocessor(Serializable): 5 | 6 | def __init__(self, mask): 7 | self._mask = mask 8 | self._add_save_attr(_mask='primitive') 9 | 10 | def __call__(self, obs): 11 | masked_obs = obs[self._mask] 12 | return masked_obs 13 | -------------------------------------------------------------------------------- /imitation_lib/utils/training.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | import numpy as np 4 | 5 | from mushroom_rl.utils.dataset import compute_J, parse_dataset 6 | 7 | 8 | class BestAgentSaver: 9 | 10 | def __init__(self, save_path, n_epochs_save=10, save_replay_memory=False): 11 | self.best_curr_agent = None 12 | self.save_path = save_path 13 | self.n_epochs_save = n_epochs_save 14 | self.last_save = 0 15 | self.epoch_counter = 0 16 | self.best_J_since_last_save = -float('inf') 17 | self.save_replay_memory = save_replay_memory 18 | 19 | def save(self, agent, J): 20 | 21 | if self.n_epochs_save != -1: 22 | if J > self.best_J_since_last_save: 23 | self.best_J_since_last_save = J 24 | # if the agent has a replay memory that should not be saved, we can save memory by not copying it, 25 | # i.e., temporarily removing it from the current agent and then giving it back. 
26 | mem = None 27 | tmp_store_mem = hasattr(agent, '_replay_memory') and not self.save_replay_memory 28 | if tmp_store_mem: 29 | mem = agent._replay_memory 30 | agent._replay_memory = None 31 | self.best_curr_agent = (deepcopy(agent), J, self.epoch_counter) 32 | if tmp_store_mem: 33 | agent._replay_memory = mem 34 | 35 | if self.last_save + self.n_epochs_save <= self.epoch_counter: 36 | self.save_curr_best_agent() 37 | 38 | self.epoch_counter += 1 39 | 40 | def save_curr_best_agent(self): 41 | 42 | if self.best_curr_agent is not None: 43 | path = os.path.join(self.save_path, 'agent_epoch_%d_J_%f.msh' % (self.best_curr_agent[2], 44 | self.best_curr_agent[1])) 45 | self.best_curr_agent[0].save(path, full_save=True) 46 | self.best_curr_agent = None 47 | self.best_J_since_last_save = -float('inf') 48 | self.last_save = self.epoch_counter 49 | 50 | def save_agent(self, agent, J): 51 | path = os.path.join(self.save_path, 'agent_J_%f.msh' % J) 52 | agent.save(path, full_save=True) 53 | 54 | 55 | def prepare_expert_data(data_path): 56 | dataset = dict() 57 | 58 | # load expert training data 59 | expert_files = np.load(data_path) 60 | dataset["states"] = expert_files["states"] 61 | dataset["actions"] = expert_files["actions"] 62 | dataset["episode_starts"] = expert_files["episode_starts"] 63 | 64 | # maybe we have next action and next next state 65 | try: 66 | dataset["next_actions"] = expert_files["next_actions"] 67 | dataset["next_next_states"] = expert_files["next_next_states"] 68 | except KeyError as e: 69 | print("Did not find next action or next next state.") 70 | 71 | # maybe we have next states and dones in the dataset 72 | try: 73 | dataset["next_states"] = expert_files["next_states"] 74 | dataset["absorbing"] = expert_files["absorbing"] 75 | except KeyError as e: 76 | print("Warning Dataset: %s" % e) 77 | 78 | # maybe we have episode returns, if yes done 79 | try: 80 | dataset["episode_returns"] = expert_files["episode_returns"] 81 | return dataset 82 | except KeyError: 83 | print("Warning Dataset: No episode returns. Falling back to step-based reward.") 84 | 85 | # this has to work 86 | try: 87 | dataset["rewards"] = expert_files["rewards"] 88 | return dataset 89 | except KeyError: 90 | raise KeyError("The dataset has neither an episode nor a step-based reward!") 91 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | requires_list = ["mushroom_rl>=1.10.0", "tensorboard", "experiment-launcher"] 4 | 5 | setup(name='imitation_lib', 6 | version='0.1', 7 | description='Code base of the paper: LS-IQ: Implicit Reward Regularization for Inverse Reinforcement Learning.', 8 | license='MIT', 9 | author="Firas Al-Hafez", 10 | packages=[package for package in find_packages() 11 | if package.startswith('imitation_lib')], 12 | install_requires=requires_list, 13 | zip_safe=False, 14 | ) --------------------------------------------------------------------------------
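Two minimal usage sketches for the utilities listed above. Both use made-up file names, shapes, and hyperparameters; they only illustrate the call signatures, not recommended settings.

`prepare_expert_data` (in `imitation_lib/utils/training.py`) expects an `.npz` file containing at least `states`, `actions`, and `episode_starts`, plus either step rewards or episode returns. The toy file written below stands in for a real expert dataset:

```python
import numpy as np
from imitation_lib.utils import prepare_expert_data

# write a tiny stand-in expert file; only the keys matter, the values are random
np.savez("toy_expert.npz",
         states=np.random.randn(5, 3).astype(np.float32),
         actions=np.random.randn(5, 2).astype(np.float32),
         episode_starts=np.array([1, 0, 0, 0, 0], dtype=bool),
         next_states=np.random.randn(5, 3).astype(np.float32),
         absorbing=np.zeros(5, dtype=bool),
         rewards=np.zeros(5, dtype=np.float32))

dataset = prepare_expert_data("toy_expert.npz")
# optional keys (next_actions, next_next_states, episode_returns) are skipped with a warning
print(sorted(dataset.keys()))
# ['absorbing', 'actions', 'episode_starts', 'next_states', 'rewards', 'states']
```

The `VDBLoss` in `imitation_lib/utils/math.py` takes the variational discriminator output as a `(logits, mu, logvar)` tuple and keeps a running `beta` that is adapted against the information constraint; the tensors below are random placeholders:

```python
import torch
from imitation_lib.utils import VDBLoss

loss_fn = VDBLoss(info_constraint=0.5, lr_beta=1e-5)

# fake discriminator outputs: logits plus the encoder mean / log-variance
logits = torch.randn(8, 1)
mu, logvar = torch.randn(8, 4), torch.randn(8, 4)
targets = torch.cat([torch.ones(4, 1), torch.zeros(4, 1)])  # 1 = expert, 0 = policy

loss = loss_fn((logits, mu, logvar), targets)
print(loss.item(), loss_fn._beta)  # beta grows when the KL term exceeds info_constraint
```

VAIL logs this same `_beta` during training (the `Beta` scalar in `_discriminator_logging` of `vail_TRPO.py`).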