├── .gitignore ├── LICENSE ├── README.md ├── examples ├── model_tune.py ├── train_d4rl.py ├── train_task.py └── train_tune.py ├── offlinerl ├── __init__.py ├── algo │ ├── __init__.py │ ├── base.py │ ├── dynamics_model │ │ ├── __init__.py │ │ └── bc_model.py │ ├── modelbase │ │ ├── __init__.py │ │ ├── bremen.py │ │ ├── combo.py │ │ ├── maple.py │ │ ├── maple_new.py │ │ ├── mobile.py │ │ ├── model_base.py │ │ ├── moose.py │ │ ├── mopo.py │ │ └── rambo.py │ ├── modelfree │ │ ├── __init__.py │ │ ├── bc.py │ │ ├── bcq.py │ │ ├── bcqd.py │ │ ├── cql.py │ │ ├── crr.py │ │ ├── edac.py │ │ ├── mcq.py │ │ ├── plas.py │ │ ├── prdc.py │ │ └── td3bc.py │ └── online │ │ ├── __init__.py │ │ └── bremen.py ├── config │ ├── __init__.py │ └── algo │ │ ├── __init__.py │ │ ├── bc_config.py │ │ ├── bc_model_config.py │ │ ├── bcq_config.py │ │ ├── bcqd_config.py │ │ ├── bremen_config.py │ │ ├── combo_config.py │ │ ├── cql_config.py │ │ ├── crr_config.py │ │ ├── edac_config.py │ │ ├── maple_config.py │ │ ├── maple_config_new.py │ │ ├── mcq_config.py │ │ ├── mobile_config.py │ │ ├── moose_config.py │ │ ├── mopo_config.py │ │ ├── plas_config.py │ │ ├── prdc_config.py │ │ ├── rambo_config.py │ │ └── td3bc_config.py ├── data │ ├── __init__.py │ ├── d4rl.py │ └── neorl.py ├── evaluation │ ├── __init__.py │ ├── d4rl.py │ ├── fqe.py │ ├── gym.py │ └── neorl.py ├── outside_utils │ ├── buffer │ │ ├── __init__.py │ │ └── buffer.py │ ├── dynamics │ │ ├── __init__.py │ │ ├── base_dynamics.py │ │ ├── ensemble_dynamics.py │ │ ├── mujoco_oracle_dynamics.py │ │ └── rnn_dynamics.py │ ├── modules │ │ ├── __init__.py │ │ ├── actor_module.py │ │ ├── critic_module.py │ │ ├── dist_module.py │ │ ├── dynamics_module.py │ │ └── ensemble_critic_module.py │ ├── nets │ │ ├── __init__.py │ │ ├── ensemble_linear.py │ │ ├── mlp.py │ │ ├── rnn.py │ │ └── vae.py │ └── utils │ │ ├── __init__.py │ │ ├── logger.py │ │ ├── scaler.py │ │ └── termination_fns.py └── utils │ ├── __init__.py │ ├── config.py │ ├── data.py │ ├── env.py │ ├── exp.py │ ├── flexible_replay_pool.py │ ├── function.py │ ├── io.py │ ├── loader.py │ ├── logger.py │ ├── net │ ├── __init__.py │ ├── bcq_net.py │ ├── common.py │ ├── continuous.py │ ├── maple_actor.py │ ├── mlas.py │ ├── model │ │ ├── __init__.py │ │ ├── ensemble.py │ │ ├── maple_critic.py │ │ └── new_ensemble.py │ ├── model_GRU.py │ ├── moose.py │ ├── tanhpolicy.py │ ├── terminal_check.py │ └── vae.py │ ├── replay_pool.py │ └── simple_replay_pool.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | 7 | # define 8 | test/ 9 | .aim* 10 | offlinerl_tmp/ 11 | 12 | # C extensions 13 | *.so 14 | 15 | # .idea folder 16 | .idea/ 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | cover/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | *.out 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | # For a library or package, you might want to ignore these files since the code is 97 | # intended to run in multiple environments; otherwise, check them in: 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 108 | __pypackages__/ 109 | 110 | # Celery stuff 111 | celerybeat-schedule 112 | celerybeat.pid 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # Environments 118 | .env 119 | .venv 120 | venv/ 121 | ENV/ 122 | env.bak/ 123 | venv.bak/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | 143 | # pytype static type analyzer 144 | .pytype/ 145 | 146 | # customize 147 | log/ 148 | MUJOCO_LOG.TXT 149 | *.pth 150 | .vscode/ 151 | .DS_Store 152 | *.zip 153 | *.pstats 154 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OfflineRL 2 | 3 | OfflineRL is a repository for Offline RL (batch reinforcement learning or offline reinforcement learning). 4 | 5 | ## Re-implemented Algorithms 6 | ### Model-free methods 7 | - **CRR**: Wang, Ziyu, et al. “Critic Regularized Regression.” Advances in Neural Information Processing Systems, vol. 33, 2020, pp. 7768–7778. [paper](https://arxiv.org/abs/2006.15134) 8 | - **CQL**: Kumar, Aviral, et al. “Conservative Q-Learning for Offline Reinforcement Learning.” Advances in Neural Information Processing Systems, vol. 33, 2020. [paper](https://arxiv.org/abs/2006.04779) [code](https://github.com/aviralkumar2907/CQL) 9 | - **PLAS**: Zhou, Wenxuan, et al. “PLAS: Latent Action Space for Offline Reinforcement Learning.” ArXiv Preprint ArXiv:2011.07213, 2020. 10 | [website](https://sites.google.com/view/latent-policy) [paper](https://arxiv.org/abs/2011.07213) [code](https://github.com/Wenxuan-Zhou/PLAS) 11 | - **BCQ**: Fujimoto, Scott, et al. “Off-Policy Deep Reinforcement Learning without Exploration.” International Conference on Machine Learning, 2018, pp. 2052–2062. 
[paper](https://arxiv.org/abs/1812.02900) [code](https://github.com/sfujim/BCQ) 12 | - **EDAC**: An, Gaon, et al. "Uncertainty-based offline reinforcement learning with diversified q-ensemble." Advances in neural information processing systems 34 (2021): 7436-7447. [paper](https://arxiv.org/abs/2110.01548) [code](https://github.com/snu-mllab/EDAC) 13 | - **MCQ**: Lyu, Jiafei, et al. "Mildly conservative q-learning for offline reinforcement learning." Advances in Neural Information Processing Systems 35 (2022): 1711-1724. [paper](https://arxiv.org/abs/2206.04745) [code](https://github.com/dmksjfl/MCQ) 14 | - **TD3BC**: Fujimoto, Scott, and Shixiang Shane Gu. "A minimalist approach to offline reinforcement learning." Advances in neural information processing systems 34 (2021): 20132-20145. [paper](https://arxiv.org/abs/2106.06860) [code](https://github.com/sfujim/TD3_BC) 15 | - **PRDC**: Ran, Yuhang, et al. “Policy Regularization with Dataset Constraint for Offline Reinforcement Learning.” International Conference on Machine Learning, 2023, pp. 28701-28717. [paper](https://arxiv.org/abs/2306.06569) [code](https://github.com/LAMDA-RL/PRDC) 16 | ### Model-based methods 17 | - **BREMEN**: Matsushima, Tatsuya, et al. “Deployment-Efficient Reinforcement Learning via Model-Based Offline Optimization.” International Conference on Learning Representations, 2021. [paper](https://openreview.net/forum?id=3hGNqpI4WS) [code](https://github.com/matsuolab/BREMEN) 18 | - **COMBO**: Yu, Tianhe, et al. "COMBO: Conservative Offline Model-Based Policy Optimization." arXiv preprint arXiv:2102.08363 (2021). [paper](https://arxiv.org/abs/2102.08363) 19 | - **MOPO**: Yu, Tianhe, et al. “MOPO: Model-Based Offline Policy Optimization.” Advances in Neural Information Processing Systems, vol. 33, 2020. [paper](https://papers.nips.cc/paper/2020/hash/a322852ce0df73e204b7e67cbbef0d0a-Abstract.html) [code](https://github.com/tianheyu927/mopo) 20 | - **MAPLE**: Xiong-Hui Chen, et al. "MAPLE: Offline Model-based Adaptable Policy Learning". Advances in Neural Information Processing Systems, vol. 34, 2021. [paper](https://proceedings.neurips.cc/paper/2021/hash/470e7a4f017a5476afb7eeb3f8b96f9b-Abstract.html) [code](https://github.com/xionghuichen/MAPLE) 21 | - **MOBILE**: Yihao Sun, et al. "Model-Bellman Inconsistency for Model-based Offline Reinforcement Learning". Proceedings of the 40th International Conference on Machine Learning, PMLR 202:33177-33194, 2023. [paper](https://proceedings.mlr.press/v202/sun23q.html) [code](https://github.com/yihaosun1124/mobile) 22 | - **RAMBO**: Rigter, Marc, Bruno Lacerda, and Nick Hawes. "Rambo-rl: Robust adversarial model-based offline reinforcement learning." Advances in neural information processing systems 35 (2022): 16082-16097. [paper](https://arxiv.org/abs/2204.12581) [code](https://github.com/marc-rigter/rambo) 23 | 24 | ## Install Datasets 25 | ### NeoRL 26 | 27 | ```shell 28 | git clone https://github.com/Polixir/neorl.git 29 | cd neorl 30 | pip install -e . 31 | ``` 32 | 33 | For more details on use, please see [neorl](https://github.com/Polixir/neorl). 34 | 35 | ### D4RL (Optional) 36 | ```shell 37 | pip install git+https://github.com/rail-berkeley/d4rl@master#egg=d4rl 38 | ``` 39 | 40 | For more details on use, please see [d4rl](https://github.com/rail-berkeley/d4rl). 41 | 42 | ## Install offlinerl 43 | 44 | ```shell 45 | pip install -e . 
46 | ``` 47 | 48 | ## Example 49 | 50 | ```shell 51 | # Train on the HalfCheetah-v3-L-9 task with the default parameters of the cql algorithm 52 | python examples/train_task.py --algo_name=cql --exp_name=halfcheetah --task HalfCheetah-v3 --task_data_type low --task_train_num 100 53 | 54 | # Train on the SafetyHalfCheetah task with the default parameters of the mcq algorithm 55 | python examples/train_task.py --algo_name=mcq --exp_name=SafetyHalfCheetah --task SafetyHalfCheetah 56 | 57 | # Parameter search in the default parameter space with the cql algorithm on the HalfCheetah-v3-L-9 task 58 | python examples/train_tune.py --algo_name=cql --exp_name=halfcheetah --task HalfCheetah-v3 --task_data_type low --task_train_num 100 59 | 60 | # Parameter search in the default parameter space with the mcq algorithm on the SafetyHalfCheetah task 61 | # python examples/train_tune.py --algo_name=mcq --exp_name=SafetyHalfCheetah --task SafetyHalfCheetah 62 | 63 | # Train on the D4RL halfcheetah-medium task with the default parameters of the cql algorithm (D4RL needs to be installed) 64 | python examples/train_d4rl.py --algo_name=cql --exp_name=d4rl-halfcheetah-medium-cql --task d4rl-halfcheetah-medium-v0 65 | ``` 66 | 67 | **Parameters:** 68 | 69 | - **algo_name**: Algorithm name. Supported algorithms include bc, bcq, cql, plas, mopo and the other entries of `algo_dict` in `offlinerl/algo/__init__.py`. 70 | - **exp_name**: Experiment name, used to organize results for visualization in Aim. 71 | - **task**: Task name. See [neorl](https://github.com/Polixir/neorl/wiki/Tasks) for details. 72 | - **task_data_type**: Data quality level. For each task, [neorl](https://github.com/Polixir/neorl) collects data with low-, medium-, and high-level policies. 73 | - **task_train_num**: Number of training trajectories. For each task, neorl provides up to 10000 training trajectories. 74 | 75 | 76 | 77 | ## View experimental results 78 | We use **Aim** to store and visualize results. Aim is an experiment logger that makes it easy to manage thousands of experiments. For more details, see [aim](https://github.com/aimhubio/aim). 79 | 80 | To visualize the results in this repository: 81 | ```shell 82 | cd offlinerl_tmp 83 | aim up 84 | ``` 85 | Then you can view the results at http://127.0.0.1:43800.
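The commands above can also be reproduced programmatically. Below is a minimal sketch that mirrors `examples/train_task.py`; the algorithm and task names are only examples, and any installed NeoRL task can be substituted.

```python
# Minimal programmatic training sketch (mirrors examples/train_task.py).
# Assumes neorl and this package are installed; the algo/task values are examples.
from offlinerl.algo import algo_select
from offlinerl.data import load_data_from_neorl
from offlinerl.evaluation import OnlineCallBackFunction

kwargs = {"algo_name": "cql", "exp_name": "halfcheetah",
          "task": "HalfCheetah-v3", "task_data_type": "low", "task_train_num": 100}

# Resolve the algorithm module and its default config, then load the offline data.
algo_init_fn, algo_trainer_obj, algo_config = algo_select(kwargs)
train_buffer, val_buffer = load_data_from_neorl(
    algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"])

# Build the trainer and the online evaluation callback, then train.
algo_trainer = algo_trainer_obj(algo_init_fn(algo_config), algo_config)
callback = OnlineCallBackFunction()
callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer, task=algo_config["task"])
algo_trainer.train(train_buffer, None, callback_fn=callback)
```

As in the scripts, metrics are tracked with Aim and policies are saved under the run's `models` directory (see `offlinerl/algo/base.py`).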
86 | 87 | 88 | ## Model-based Running Example 89 | 90 | ```python 91 | # Tune and save the transition models 92 | python examples/model_tune.py --algo_name bc_model --exp_name neorl-RandomFrictionHopper-model --task RandomFrictionHopper 93 | ``` 94 | 95 | ```python 96 | # Training MOPO and load the best transition model 97 | python examples/train_task.py --algo_name mopo --exp_name neorl-safecheetah-mopo-new --task SafetyHalfCheetah --dynamics_path best_run_id 98 | 99 | # Training COMBO and load the best transition model 100 | python examples/train_task.py --algo_name combo --exp_name neorl-safecheetah-combo-new --task SafetyHalfCheetah --dynamics_path best_run_id 101 | 102 | # Training RAMBO and load the best transition model 103 | python examples/train_task.py --algo_name rambo --exp_name neorl-safecheetah-rambo-new --task SafetyHalfCheetah --dynamics_path best_run_id 104 | 105 | # Training MOBILE and load the best transition model 106 | python examples/train_task.py --algo_name mobile --exp_name neorl-safecheetah-mobile-new --task SafetyHalfCheetah --dynamics_path best_run_id 107 | ``` 108 | -------------------------------------------------------------------------------- /examples/model_tune.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import random 3 | from ray import tune 4 | 5 | from offlinerl.algo import algo_select 6 | from offlinerl.data import load_data_from_neorl 7 | from offlinerl.evaluation import get_defalut_callback, ModelCallBackFunction 8 | 9 | def training_function(config): 10 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(config["kwargs"]) 11 | train_buffer, val_buffer = load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"]) 12 | algo_config.update(config) 13 | algo_config["device"] = "cuda" 14 | algo_init = algo_init_fn(algo_config) 15 | algo_trainer = algo_trainer_obj(algo_init, algo_config) 16 | 17 | callback = ModelCallBackFunction() 18 | callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer, task=algo_config["task"]) 19 | 20 | score = algo_trainer.train(train_buffer, None, callback_fn=callback) 21 | 22 | # return score 23 | return 0 24 | 25 | 26 | def run_algo(**kwargs): 27 | config = {} 28 | config["kwargs"] = kwargs 29 | config["kwargs"]['seed'] = random.randint(0, 1000000) 30 | _, _, algo_config = algo_select(kwargs) 31 | # Prepare Dataset 32 | load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"]) 33 | grid_tune = algo_config["grid_tune"] 34 | for k,v in grid_tune.items(): 35 | config[k] = tune.grid_search(v) 36 | 37 | analysis = tune.run( 38 | training_function, 39 | config=config, 40 | resources_per_trial={"gpu": 0.5}, 41 | ) 42 | 43 | 44 | if __name__ == "__main__": 45 | fire.Fire(run_algo) 46 | -------------------------------------------------------------------------------- /examples/train_d4rl.py: -------------------------------------------------------------------------------- 1 | import fire 2 | 3 | from offlinerl.algo import algo_select 4 | from offlinerl.data.d4rl import load_d4rl_buffer 5 | from offlinerl.evaluation import OnlineCallBackFunction 6 | 7 | 8 | def run_algo(**kwargs): 9 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(kwargs) 10 | train_buffer = load_d4rl_buffer(algo_config["task"]) 11 | algo_init = algo_init_fn(algo_config) 12 | algo_trainer = algo_trainer_obj(algo_init, algo_config) 13 | callback = OnlineCallBackFunction() 14 | 
callback.initialize(train_buffer=train_buffer, val_buffer=None, 15 | task=algo_config["task"], number_of_runs=algo_config.get("eval_episodes",100)) 16 | 17 | algo_trainer.train(train_buffer, None, callback_fn=callback) 18 | 19 | if __name__ == "__main__": 20 | fire.Fire(run_algo) 21 | -------------------------------------------------------------------------------- /examples/train_task.py: -------------------------------------------------------------------------------- 1 | import fire 2 | 3 | from offlinerl.algo import algo_select 4 | from offlinerl.data import load_data_from_neorl 5 | from offlinerl.evaluation import get_defalut_callback, OnlineCallBackFunction 6 | 7 | 8 | def run_algo(**kwargs): 9 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(kwargs) 10 | train_buffer, val_buffer = load_data_from_neorl(algo_config["task"], 11 | algo_config["task_data_type"], algo_config["task_train_num"]) 12 | algo_config['data_name'] = "neorl2-" + algo_config["task"] 13 | algo_init = algo_init_fn(algo_config) 14 | algo_trainer = algo_trainer_obj(algo_init, algo_config) 15 | callback = OnlineCallBackFunction() 16 | callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer, 17 | task=algo_config["task"], number_of_runs=algo_config.get("eval_episodes",100)) 18 | 19 | algo_trainer.train(train_buffer, None, callback_fn=callback) 20 | 21 | if __name__ == "__main__": 22 | fire.Fire(run_algo) 23 | -------------------------------------------------------------------------------- /examples/train_tune.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import random 3 | from ray import tune 4 | 5 | from offlinerl.algo import algo_select 6 | from offlinerl.data import load_data_from_neorl 7 | from offlinerl.evaluation import get_defalut_callback, OnlineCallBackFunction 8 | 9 | def training_function(config): 10 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(config["kwargs"]) 11 | train_buffer, val_buffer = load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"]) 12 | algo_config.update(config) 13 | algo_config["device"] = "cuda" 14 | algo_init = algo_init_fn(algo_config) 15 | algo_trainer = algo_trainer_obj(algo_init, algo_config) 16 | 17 | callback = OnlineCallBackFunction() 18 | callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer, task=algo_config["task"]) 19 | 20 | score = algo_trainer.train(train_buffer, None, callback_fn=callback) 21 | 22 | # return score 23 | return 0 24 | 25 | 26 | def run_algo(**kwargs): 27 | config = {} 28 | config["kwargs"] = kwargs 29 | config["kwargs"]['seed'] = random.randint(0, 1000000) 30 | _, _, algo_config = algo_select(kwargs) 31 | # Prepare Dataset 32 | load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"]) 33 | grid_tune = algo_config["grid_tune"] 34 | for k,v in grid_tune.items(): 35 | config[k] = tune.grid_search(v) 36 | 37 | analysis = tune.run( 38 | training_function, 39 | config=config, 40 | resources_per_trial={"gpu": 0.333333}, 41 | ) 42 | 43 | 44 | if __name__ == "__main__": 45 | fire.Fire(run_algo) -------------------------------------------------------------------------------- /offlinerl/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from loguru import logger 3 | 4 | from offlinerl import algo, data, evaluation, utils, config 5 | 6 | logger_config = { 7 | "handlers": [ 8 | {"sink": sys.stdout, 9 | 
"colorize" : True, 10 | #"format" : "{time} {message}", 11 | "format" : "{time:YYYY-MM-DD at HH:mm:ss.SSS} | {level} | {message}", 12 | "enqueue" : True, 13 | "backtrace" : True, 14 | "diagnose" : True, 15 | }, 16 | ], 17 | 18 | } 19 | logger.configure(**logger_config) 20 | 21 | #logger.disable("offlinerl") 22 | logger.enable("offlinerl") 23 | 24 | __version__ = "0.0.1" 25 | 26 | __all__ = [ 27 | "algo", 28 | "data", 29 | "evaluation", 30 | "utils", 31 | "config", 32 | ] -------------------------------------------------------------------------------- /offlinerl/algo/__init__.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | import warnings 3 | 4 | warnings.filterwarnings('ignore') 5 | 6 | 7 | from offlinerl.config.algo import edac_config, mcq_config, cql_config, plas_config, mopo_config, moose_config, bcqd_config, bcq_config, bc_config, crr_config, combo_config, bremen_config, maple_config, mobile_config, rambo_config, td3bc_config, bc_model_config, maple_config_new,prdc_config 8 | from offlinerl.utils.config import parse_config 9 | from offlinerl.algo.modelfree import cql, plas, bcqd, bcq, bc, crr, edac, mcq, td3bc, prdc 10 | from offlinerl.algo.modelbase import mopo, moose, combo, bremen, maple, mobile, rambo, maple_new 11 | from offlinerl.algo.dynamics_model import bc_model 12 | 13 | algo_dict = { 14 | 'edac' : {"algo" : edac, "config" : edac_config}, 15 | 'bc' : {"algo" : bc, "config" : bc_config}, 16 | 'bcq' : {"algo" : bcq, "config" : bcq_config}, 17 | 'mcq' : {"algo" : mcq, "config" : mcq_config}, 18 | 'bcqd' : {"algo" : bcqd, "config" : bcqd_config}, 19 | 'combo' : {"algo" : combo, "config" : combo_config}, 20 | "cql" : {"algo" : cql, "config" : cql_config}, 21 | "crr" : {"algo" : crr, "config" : crr_config}, 22 | "plas" : {"algo" : plas, "config" : plas_config}, 23 | "prdc" : {"algo" : prdc, "config" : prdc_config}, 24 | 'moose' : {"algo" : moose, "config" : moose_config}, 25 | 'mopo': {"algo" : mopo, "config": mopo_config}, 26 | 'bremen' : {"algo" : bremen, "config" : bremen_config}, 27 | 'maple': {'algo':maple , 'config':maple_config}, 28 | 'mobile': {'algo':mobile , 'config':mobile_config}, 29 | 'rambo': {'algo':rambo , 'config':rambo_config}, 30 | 'td3bc': {'algo':td3bc , 'config':td3bc_config}, 31 | 'bc_model': {'algo':bc_model , 'config':bc_model_config}, 32 | 'maple_new': {'algo':maple_new , 'config':maple_config_new}, 33 | } 34 | 35 | def algo_select(command_args, algo_config_module=None): 36 | algo_name = command_args["algo_name"] 37 | logger.info('Use {} algorithm!', algo_name) 38 | assert algo_name in algo_dict.keys() 39 | algo = algo_dict[algo_name]["algo"] 40 | 41 | if algo_config_module is None: 42 | algo_config_module = algo_dict[algo_name]["config"] 43 | algo_config = parse_config(algo_config_module) 44 | algo_config.update(command_args) 45 | 46 | algo_init = algo.algo_init 47 | algo_trainer = algo.AlgoTrainer 48 | 49 | return algo_init, algo_trainer, algo_config 50 | 51 | -------------------------------------------------------------------------------- /offlinerl/algo/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | import json 4 | from abc import ABC, abstractmethod 5 | 6 | import torch 7 | from collections import OrderedDict 8 | from loguru import logger 9 | from offlinerl.utils.exp import init_exp_run 10 | from offlinerl.utils.io import create_dir 11 | from offlinerl.utils.logger import log_path 12 | 13 | 14 | import 
time 15 | import random 16 | 17 | class BaseAlgo(ABC): 18 | def __init__(self, args): 19 | logger.info('Init AlgoTrainer') 20 | if "exp_name" not in args.keys(): 21 | exp_name = str(uuid.uuid1()).replace("-","") 22 | else: 23 | exp_name = args["exp_name"] 24 | 25 | if "aim_path" in args.keys(): 26 | if os.path.exists(args["aim_path"]): 27 | time.sleep(random.randint(1, 5)) 28 | repo = args["aim_path"] 29 | else: 30 | os.makedirs(args["aim_path"]) 31 | repo = args["aim_path"] 32 | else: 33 | repo = None 34 | 35 | self.repo = repo 36 | 37 | try: 38 | self.exp_run = init_exp_run(repo = repo, experiment_name = exp_name) 39 | except: 40 | time.sleep(random.randint(1, 5)) 41 | self.exp_run = init_exp_run(repo = repo, experiment_name = exp_name) 42 | 43 | if self.exp_run.repo is not None: # a naive fix of aim exp_logger.repo is None 44 | self.index_path = self.exp_run.repo.path 45 | else: 46 | repo = os.path.join(log_path(),"./.aim") 47 | if not os.path.exists(repo): 48 | logger.info('{} dir is not exist, create {}',repo, repo) 49 | os.system(str("cd " + os.path.join(repo,"../") + "&& aim init")) 50 | self.index_path = repo 51 | 52 | print(f'self.index_path/{self.index_path}') 53 | self.models_save_dir = os.path.join(self.index_path, "models") 54 | self.metric_logs = OrderedDict() 55 | self.metric_logs_path = os.path.join(self.index_path, "metric_logs.json") 56 | create_dir(self.models_save_dir) 57 | 58 | # self.exp_run.set_params(args, name='hparams') 59 | self.exp_run['hparams'] = args 60 | 61 | def log_res(self, epoch, result): 62 | logger.info('Epoch : {}', epoch) 63 | for k,v in result.items(): 64 | logger.info('{} : {}',k, v) 65 | self.exp_run.track(v, name=k.split(" ")[0], epoch=epoch,) 66 | 67 | self.metric_logs[str(epoch)] = result 68 | with open(self.metric_logs_path,"w") as f: 69 | json.dump(self.metric_logs,f) 70 | 71 | self.run_id = self.exp_run.name.split( )[-1] 72 | tmp_dir = os.path.join(self.models_save_dir, self.run_id) 73 | if not os.path.exists(tmp_dir): 74 | os.makedirs(tmp_dir) 75 | # self.save_model(os.path.join(tmp_dir, str(epoch) + ".pt")) 76 | self.save_model(os.path.join(tmp_dir, "policy.pt")) 77 | 78 | self.report_result = result 79 | self.report_result["hparams"] = self.exp_run['hparams'] 80 | self.report_result["model_path"] = os.path.join(tmp_dir, "policy.pt") 81 | 82 | 83 | @abstractmethod 84 | def train(self, 85 | history_buffer, 86 | eval_fn=None,): 87 | pass 88 | 89 | def _sync_weight(self, net_target, net, soft_target_tau = 5e-3): 90 | for o, n in zip(net_target.parameters(), net.parameters()): 91 | o.data.copy_(o.data * (1.0 - soft_target_tau) + n.data * soft_target_tau) 92 | 93 | @abstractmethod 94 | def get_policy(self,): 95 | pass 96 | 97 | #@abstractmethod 98 | def save_model(self, model_path): 99 | torch.save(self.get_policy(), model_path) 100 | 101 | #@abstractmethod 102 | def load_model(self, model_path): 103 | model = torch.load(model_path) 104 | 105 | return model -------------------------------------------------------------------------------- /offlinerl/algo/dynamics_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/dynamics_model/__init__.py -------------------------------------------------------------------------------- /offlinerl/algo/modelbase/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/modelbase/__init__.py -------------------------------------------------------------------------------- /offlinerl/algo/modelbase/mopo.py: -------------------------------------------------------------------------------- 1 | # MOPO: Model-based Offline Policy Optimization 2 | # https://arxiv.org/abs/2005.13239 3 | # https://github.com/tianheyu927/mopo 4 | import os 5 | import torch 6 | import numpy as np 7 | from copy import deepcopy 8 | from loguru import logger 9 | from collections import deque 10 | from typing import Dict 11 | 12 | from offlinerl.algo.modelbase.model_base import algo_init, ModelBasedAlgoTrainer 13 | 14 | 15 | class AlgoTrainer(ModelBasedAlgoTrainer): 16 | def __init__(self, algo_init, args): 17 | super(AlgoTrainer, self).__init__(algo_init, args) 18 | 19 | self.fake_buffer_size = self.args["model_retain_epochs"] * self.args["rollout_batch_size"] * self.args["horizon"] 20 | self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.actor_optim, args['max_epoch']) 21 | 22 | def policy_learn(self, batch: Dict): 23 | real_batch, fake_batch = batch["real"], batch["fake"] 24 | mix_batch = {k: torch.cat([real_batch[k], fake_batch[k]], 0) for k in real_batch.keys()} 25 | 26 | obss, actions, next_obss, rewards, terminals = mix_batch["observations"], mix_batch["actions"], \ 27 | mix_batch["next_observations"], mix_batch["rewards"], mix_batch["terminals"] 28 | 29 | # update critic 30 | q1, q2 = self.critic1(obss, actions), self.critic2(obss, actions) 31 | with torch.no_grad(): 32 | next_actions, next_log_probs = self.actforward(next_obss) 33 | next_q = torch.min( 34 | self.target_critic1(next_obss, next_actions), self.target_critic2(next_obss, next_actions) 35 | ) - self._alpha * next_log_probs 36 | target_q = rewards + self._gamma * (1 - terminals) * next_q 37 | 38 | critic1_loss = ((q1 - target_q).pow(2)).mean() 39 | self.critic1_optim.zero_grad() 40 | critic1_loss.backward() 41 | self.critic1_optim.step() 42 | 43 | critic2_loss = ((q2 - target_q).pow(2)).mean() 44 | self.critic2_optim.zero_grad() 45 | critic2_loss.backward() 46 | self.critic2_optim.step() 47 | 48 | # update actor 49 | a, log_probs = self.actforward(obss) 50 | q1a, q2a = self.critic1(obss, a), self.critic2(obss, a) 51 | 52 | actor_loss = - torch.min(q1a, q2a).mean() + self._alpha * log_probs.mean() 53 | self.actor_optim.zero_grad() 54 | actor_loss.backward() 55 | self.actor_optim.step() 56 | 57 | if self._is_auto_alpha: 58 | log_probs = log_probs.detach() + self._target_entropy 59 | alpha_loss = -(self._log_alpha * log_probs).mean() 60 | self.alpha_optim.zero_grad() 61 | alpha_loss.backward() 62 | self.alpha_optim.step() 63 | self._alpha = torch.clamp(self._log_alpha.detach().exp(), 0.0, 1.0) 64 | 65 | self._sync_weight() 66 | 67 | result = { 68 | "loss/actor": actor_loss.item(), 69 | "loss/critic1": critic1_loss.item(), 70 | "loss/critic2": critic2_loss.item(), 71 | } 72 | 73 | if self._is_auto_alpha: 74 | result["loss/alpha"] = alpha_loss.item() 75 | result["alpha"] = self._alpha.item() 76 | 77 | return result 78 | -------------------------------------------------------------------------------- /offlinerl/algo/modelfree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/modelfree/__init__.py 
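A note on the critic target used in `AlgoTrainer.policy_learn` of `mopo.py` above: the target bootstraps from the minimum of the two target critics minus the entropy term `alpha * next_log_probs`, with terminal transitions masked out. A toy, self-contained sketch of just that computation, using made-up tensor values rather than the repository's networks:

```python
# Toy illustration of the SAC-style critic target computed in policy_learn above:
# target_q = r + gamma * (1 - done) * (min(Q1'(s', a'), Q2'(s', a')) - alpha * log pi(a'|s'))
import torch

gamma, alpha = 0.99, 0.2
rewards   = torch.tensor([[1.0], [0.5]])
terminals = torch.tensor([[0.0], [1.0]])            # 1 marks an episode end
next_q1   = torch.tensor([[10.0], [8.0]])           # stand-in for target_critic1(s', a')
next_q2   = torch.tensor([[9.5], [8.4]])            # stand-in for target_critic2(s', a')
next_log_probs = torch.tensor([[-1.2], [-0.8]])     # log pi(a'|s') from the actor

next_q   = torch.min(next_q1, next_q2) - alpha * next_log_probs
target_q = rewards + gamma * (1 - terminals) * next_q
print(target_q)  # terminal transitions keep only the immediate reward
```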
-------------------------------------------------------------------------------- /offlinerl/algo/modelfree/bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from copy import deepcopy 3 | from loguru import logger 4 | 5 | from offlinerl.algo.base import BaseAlgo 6 | from offlinerl.utils.net.continuous import GaussianActor 7 | from offlinerl.utils.exp import setup_seed 8 | 9 | 10 | def algo_init(args): 11 | logger.info('Run algo_init function') 12 | 13 | setup_seed(args['seed']) 14 | 15 | if args["obs_shape"] and args["action_shape"]: 16 | obs_shape, action_shape = args["obs_shape"], args["action_shape"] 17 | max_action = args["max_action"] 18 | elif "task" in args.keys(): 19 | from offlinerl.utils.env import get_env_shape, get_env_action_range 20 | obs_shape, action_shape = get_env_shape(args['task']) 21 | max_action, _ = get_env_action_range(args["task"]) 22 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape 23 | else: 24 | raise NotImplementedError 25 | 26 | actor = GaussianActor(obs_shape, action_shape, args['actor_features'], args['actor_layers']).to(args['device']) 27 | actor_optim = torch.optim.Adam(actor.parameters(), lr=args['actor_lr']) 28 | 29 | return { 30 | "actor" : {"net" : actor, "opt" : actor_optim}, 31 | } 32 | 33 | 34 | class AlgoTrainer(BaseAlgo): 35 | def __init__(self, algo_init, args): 36 | super(AlgoTrainer, self).__init__(args) 37 | self.args = args 38 | 39 | self.actor = algo_init['actor']['net'] 40 | self.actor_optim = algo_init['actor']['opt'] 41 | 42 | self.batch_size = self.args['batch_size'] 43 | self.device = self.args['device'] 44 | 45 | self.best_actor = deepcopy(self.actor) 46 | self.best_loss = float('inf') 47 | 48 | def train(self, train_buffer, val_buffer, callback_fn): 49 | if val_buffer == None: 50 | from offlinerl.utils.data import SampleBatch 51 | ori_buffer = deepcopy(train_buffer) 52 | sep_len = int(len(ori_buffer)*0.1) 53 | val_buffer = SampleBatch(ori_buffer[-sep_len:]) 54 | train_buffer = SampleBatch(ori_buffer[:-sep_len]) 55 | # breakpoint() 56 | for epoch in range(self.args['max_epoch']): 57 | for i in range(self.args['steps_per_epoch']): 58 | batch_data = train_buffer.sample(self.batch_size) 59 | batch_data.to_torch(device=self.device) 60 | obs = batch_data['obs'] 61 | action = batch_data['act'] 62 | 63 | action_dist = self.actor(obs) 64 | # loss = - action_dist.log_prob(action).mean() 65 | loss = ((action_dist.mode - action) ** 2).mean() 66 | 67 | self.actor_optim.zero_grad() 68 | loss.backward() 69 | self.actor_optim.step() 70 | 71 | with torch.no_grad(): 72 | val_loss = 0 73 | for i in range(len(val_buffer) // self.batch_size + (len(val_buffer) % self.batch_size > 0)): 74 | batch_data = val_buffer[i*self.batch_size:(i+1)*self.batch_size] 75 | batch_data.to_torch(device=self.device) 76 | obs = batch_data['obs'] 77 | action = batch_data['act'] 78 | 79 | action_dist = self.actor(obs) 80 | val_loss += ((action_dist.mean - action) ** 2).mean().item() 81 | 82 | if val_loss < self.best_loss: 83 | self.best_loss = val_loss 84 | self.best_actor.load_state_dict(self.actor.state_dict()) 85 | 86 | res = callback_fn(self.get_policy()) 87 | res['loss'] = val_loss 88 | self.log_res(epoch, res) 89 | 90 | return self.report_result 91 | 92 | def get_policy(self): 93 | return self.best_actor 94 | -------------------------------------------------------------------------------- /offlinerl/algo/modelfree/bcqd.py: 
-------------------------------------------------------------------------------- 1 | #Discrete Batch-Constrained deep Q-Learning (BCQ) 2 | import copy 3 | 4 | import torch 5 | import numpy as np 6 | from torch import nn 7 | from torch import optim 8 | import torch.nn.functional as F 9 | from loguru import logger 10 | 11 | from offlinerl.algo.base import BaseAlgo 12 | from offlinerl.utils.net.bcq_net import Conv_Q, FC_Q 13 | from offlinerl.utils.exp import setup_seed 14 | 15 | 16 | def algo_init(args): 17 | logger.info('Run algo_init function') 18 | 19 | setup_seed(args['seed']) 20 | 21 | if args["obs_shape"] and args["action_shape"]: 22 | obs_shape, action_shape = args["obs_shape"], args["action_shape"] 23 | elif "task" in args.keys(): 24 | from offlinerl.utils.env import get_env_shape 25 | obs_shape, action_shape = get_env_shape(args['task']) 26 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape 27 | else: 28 | raise NotImplementedError 29 | 30 | if isinstance(args["obs_shape"], int): 31 | state_dim = ( 32 | 4, 33 | 84, 34 | 84 35 | ) 36 | 37 | critic = Conv_Q(state_dim[0], args["action_shape"]).to(args['device']) 38 | else: 39 | critic = FC_Q(np.prod(args["obs_shape"]), args["action_shape"]).to(args['device']) 40 | 41 | critic_opt = optim.Adam(critic.parameters(), **args["optimizer_parameters"]) 42 | 43 | 44 | nets = { 45 | "critic" : {"net" : critic, "opt" : critic_opt}, 46 | 47 | } 48 | 49 | return nets 50 | 51 | 52 | class AlgoTrainer(BaseAlgo): 53 | def __init__(self, algo_init, args): 54 | super(AlgoTrainer, self).__init__(args) 55 | self.args = args 56 | 57 | self.Q = algo_init["critic"]["net"] 58 | self.Q_target = copy.deepcopy(self.Q) 59 | self.Q_optimizer = algo_init["critic"]["opt"] 60 | 61 | self.discount = self.args["discount"] 62 | 63 | # Target update rule 64 | self.maybe_update_target = self.polyak_target_update if self.args["polyak_target_update"] else self.copy_target_update 65 | self.target_update_frequency = self.args["target_update_frequency"] 66 | self.tau = self.args["tau"] 67 | 68 | # Decay for eps 69 | self.initial_eps = self.args["initial_eps"] 70 | self.end_eps = self.args["end_eps"] 71 | self.slope = (self.end_eps - self.initial_eps) / self.args["eps_decay_period"] 72 | 73 | # Evaluation hyper-parameters 74 | self.state_shape = (-1,) + self.args["obs_shape"] if isinstance(self.args["obs_shape"], int) else (-1, self.args["obs_shape"]) 75 | self.eval_eps = self.args["eval_eps"] 76 | self.num_actions = self.args["action_shape"] 77 | 78 | # Threshold for "unlikely" actions 79 | self.threshold = self.args["BCQ_threshold"] 80 | 81 | # Number of training iterations 82 | self.iterations = 0 83 | 84 | def train(self, train_buffer, val_buffer, callback_fn): 85 | training_iters = 0 86 | while training_iters < self.args["max_timesteps"]: 87 | 88 | # Sample replay buffer 89 | batch = train_buffer.sample(self.args["batch_size"]) 90 | batch = batch.to_torch(dtype=torch.float32, device=self.args["device"]) 91 | reward = batch.rew 92 | done = batch.done 93 | state = batch.obs 94 | action = batch.act.to(torch.int64) 95 | next_state = batch.obs_next 96 | 97 | # Compute the target Q value 98 | with torch.no_grad(): 99 | q, imt, i = self.Q(next_state) 100 | imt = imt.exp() 101 | imt = (imt/imt.max(1, keepdim=True)[0] > self.threshold).float() 102 | 103 | # Use large negative number to mask actions from argmax 104 | next_action = (imt * q + (1 - imt) * -1e8).argmax(1, keepdim=True) 105 | 106 | q, imt, i = self.Q_target(next_state) 107 | target_Q = reward + done * 
self.discount * q.gather(1, next_action).reshape(-1, 1) 108 | 109 | # Get current Q estimate 110 | current_Q, imt, i = self.Q(state) 111 | 112 | current_Q = current_Q.gather(1, action) 113 | 114 | # Compute Q loss 115 | q_loss = F.smooth_l1_loss(current_Q, target_Q) 116 | i_loss = F.nll_loss(imt, action.reshape(-1)) 117 | 118 | Q_loss = q_loss + i_loss + 1e-2 * i.pow(2).mean() 119 | 120 | # Optimize the Q 121 | self.Q_optimizer.zero_grad() 122 | Q_loss.backward() 123 | self.Q_optimizer.step() 124 | 125 | # Update target network by polyak or full copy every X iterations. 126 | self.maybe_update_target() 127 | training_iters += 1 128 | #print(training_iters ,self.args["eval_freq"]) 129 | if training_iters % self.args["eval_freq"] == 0: 130 | res = callback_fn(self.get_policy()) 131 | 132 | self.log_res(training_iters // self.args["eval_freq"], res) 133 | 134 | return self.report_result 135 | 136 | 137 | def polyak_target_update(self): 138 | for param, target_param in zip(self.Q.parameters(), self.Q_target.parameters()): 139 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) 140 | 141 | 142 | def copy_target_update(self): 143 | if self.iterations % self.target_update_frequency == 0: 144 | self.Q_target.load_state_dict(self.Q.state_dict()) 145 | 146 | def save(self, filename): 147 | torch.save(self.Q.state_dict(), filename + "_Q") 148 | torch.save(self.Q_optimizer.state_dict(), filename + "_optimizer") 149 | 150 | 151 | def load(self, filename): 152 | self.Q.load_state_dict(torch.load(filename + "_Q")) 153 | self.Q_target = copy.deepcopy(self.Q) 154 | self.Q_optimizer.load_state_dict(torch.load(filename + "_optimizer")) 155 | 156 | def get_policy(self,): 157 | return self.Q 158 | 159 | def save_model(self): 160 | pass -------------------------------------------------------------------------------- /offlinerl/algo/modelfree/crr.py: -------------------------------------------------------------------------------- 1 | # Critic regularized regression 2 | # Paper: https://arxiv.org/abs/2006.15134 3 | 4 | import torch 5 | from copy import deepcopy 6 | from loguru import logger 7 | 8 | from offlinerl.algo.base import BaseAlgo 9 | from offlinerl.utils.net.common import Net 10 | from offlinerl.utils.net.continuous import DistributionalCritic 11 | from offlinerl.utils.net.tanhpolicy import TanhGaussianPolicy 12 | from offlinerl.utils.exp import setup_seed 13 | 14 | def algo_init(args): 15 | logger.info('Run algo_init function') 16 | 17 | setup_seed(args['seed']) 18 | 19 | if args["obs_shape"] and args["action_shape"]: 20 | obs_shape, action_shape = args["obs_shape"], args["action_shape"] 21 | max_action = args["max_action"] 22 | elif "task" in args.keys(): 23 | from offlinerl.utils.env import get_env_shape, get_env_action_range 24 | obs_shape, action_shape = get_env_shape(args['task']) 25 | max_action, _ = get_env_action_range(args["task"]) 26 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape 27 | else: 28 | raise NotImplementedError 29 | 30 | net_a = Net(layer_num=args['hidden_layers'], 31 | state_shape=obs_shape, 32 | hidden_layer_size=args['hidden_features']) 33 | 34 | actor = TanhGaussianPolicy(preprocess_net=net_a, 35 | action_shape=action_shape, 36 | hidden_layer_size=args['hidden_features'], 37 | conditioned_sigma=True).to(args['device']) 38 | 39 | actor_optim = torch.optim.Adam(actor.parameters(), lr=args['lr']) 40 | 41 | critic = DistributionalCritic(obs_shape, action_shape, args['atoms'], 42 | args['hidden_features'], args['hidden_layers'], 
43 | None, None).to(args['device']) 44 | critic_optim = torch.optim.Adam(critic.parameters(), lr=args['lr']) 45 | 46 | return { 47 | "actor" : {"net" : actor, "opt" : actor_optim}, 48 | "critic" : {"net" : critic, "opt" : critic_optim}, 49 | } 50 | 51 | 52 | class AlgoTrainer(BaseAlgo): 53 | def __init__(self, algo_init, args): 54 | super(AlgoTrainer, self).__init__(args) 55 | self.args = args 56 | 57 | self.actor = algo_init['actor']['net'] 58 | self.actor_target = deepcopy(self.actor) 59 | self.actor_target.requires_grad_(False) 60 | self.actor_optim = algo_init['actor']['opt'] 61 | 62 | self.critic = algo_init['critic']['net'] 63 | self.critic_target = deepcopy(self.critic) 64 | self.critic_target.requires_grad_(False) 65 | self.critic_optim = algo_init['critic']['opt'] 66 | 67 | self.batch_size = self.args['batch_size'] 68 | self.gamma = self.args['gamma'] 69 | self.beta = self.args['beta'] 70 | self.m = self.args['advantage_samples'] 71 | self.advantage_mode = self.args['advantage_mode'] 72 | self.weight_mode = self.args['weight_mode'] 73 | self.device = self.args['device'] 74 | 75 | def train(self, train_buffer, val_buffer, callback_fn): 76 | rewards = train_buffer['rew'] 77 | self.critic.set_interval(rewards.min() / (1 - self.gamma), rewards.max() / (1 - self.gamma)) 78 | self.critic_target.set_interval(rewards.min() / (1 - self.gamma), rewards.max() / (1 - self.gamma)) 79 | for epoch in range(self.args['max_epoch']): 80 | for i in range(self.args['steps_per_epoch']): 81 | batch_data = train_buffer.sample(self.batch_size) 82 | batch_data.to_torch(device=self.device) 83 | obs = batch_data['obs'] 84 | action = batch_data['act'] 85 | next_obs = batch_data['obs_next'] 86 | reward = batch_data['rew'] 87 | done = batch_data['done'].float() 88 | 89 | # update critic 90 | p = self.critic(obs, action) 91 | next_action = self.actor_target.get_action(next_obs) 92 | target_p = self.critic_target.get_target(next_obs, next_action, reward, self.gamma * (1 - done)) 93 | critic_loss = - (target_p * torch.log(p + 1e-8)).mean() 94 | 95 | self.critic_optim.zero_grad() 96 | critic_loss.backward() 97 | self.critic_optim.step() 98 | 99 | # update actor 100 | action_dist = self.actor(obs) 101 | log_prob = action_dist.log_prob(action) 102 | actions = torch.stack([action_dist.sample() for _ in range(self.m)], dim=0) 103 | repeat_obs = torch.repeat_interleave(obs.unsqueeze(0), self.m, 0) 104 | _, values = self.critic(repeat_obs, actions, with_q=True) 105 | _, value = self.critic(obs, action, with_q=True) 106 | 107 | if self.advantage_mode == 'mean': 108 | advantage = value - values.mean(dim=0) 109 | elif self.advantage_mode == 'max': 110 | advantage = value - values.max(dim=0)[0] 111 | 112 | if self.weight_mode == 'exp': 113 | weight = torch.exp(advantage / self.beta) 114 | elif self.weight_mode == 'binary': 115 | weight = (advantage > 0).float() 116 | 117 | weight = torch.clamp_max(weight, 20).detach() 118 | actor_loss = - torch.mean(weight * log_prob) 119 | 120 | self.actor_optim.zero_grad() 121 | actor_loss.backward() 122 | self.actor_optim.step() 123 | 124 | if i % self.args['update_frequency']: 125 | self._sync_weight(self.critic_target, self.critic, 1.0) 126 | self._sync_weight(self.actor_target, self.actor, 1.0) 127 | print("actor_loss: ", actor_loss.item()) 128 | res = callback_fn(self.get_policy()) 129 | 130 | self.log_res(epoch, res) 131 | 132 | return self.report_result 133 | 134 | def get_policy(self): 135 | return self.actor 
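To make the weighting scheme in `crr.py` above concrete: the advantage compares the critic's value for the dataset action against the values of `advantage_samples` actions drawn from the current policy, and the `exp` / `binary` weight modes turn that advantage into a regression weight clamped at 20. A toy sketch with made-up Q-values:

```python
# Toy illustration of the CRR weights computed in AlgoTrainer.train above.
import torch

beta = 1.0
value  = torch.tensor([2.0])                  # Q(s, a) for the dataset action (made-up)
values = torch.tensor([1.0, 1.5, 3.0, 0.5])   # Q(s, a_j) for m sampled policy actions (made-up)

advantage_mean = value - values.mean()        # 'mean' advantage mode -> 0.5
advantage_max  = value - values.max()         # 'max' advantage mode  -> -1.0

weight_exp    = torch.clamp_max(torch.exp(advantage_mean / beta), 20)  # 'exp' mode
weight_binary = (advantage_mean > 0).float()                           # 'binary' mode
print(weight_exp.item(), weight_binary.item())  # ~1.65 and 1.0
```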
-------------------------------------------------------------------------------- /offlinerl/algo/modelfree/td3bc.py: -------------------------------------------------------------------------------- 1 | # A Minimalist Approach to Offline Reinforcement Learning 2 | # https://arxiv.org/pdf/2106.06860 3 | # https://github.com/sfujim/TD3_BC 4 | import torch 5 | from copy import deepcopy 6 | from loguru import logger 7 | from torch.functional import F 8 | 9 | from offlinerl.algo.base import BaseAlgo 10 | from offlinerl.utils.net.common import MLP,Net 11 | from offlinerl.utils.net.tanhpolicy import TanhGaussianPolicy 12 | from offlinerl.utils.exp import setup_seed 13 | 14 | 15 | def algo_init(args): 16 | logger.info('Run algo_init function') 17 | setup_seed(args['seed']) 18 | if args["obs_shape"] and args["action_shape"]: 19 | obs_shape, action_shape = args["obs_shape"], args["action_shape"] 20 | max_action = args["max_action"] 21 | elif "task" in args.keys(): 22 | from offlinerl.utils.env import get_env_shape, get_env_action_range 23 | obs_shape, action_shape = get_env_shape(args['task']) 24 | max_action, _ = get_env_action_range(args["task"]) 25 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape 26 | else: 27 | raise NotImplementedError 28 | 29 | net_a = Net(layer_num = args['actor_layers'], 30 | state_shape = obs_shape, 31 | hidden_layer_size = args['actor_features']) 32 | 33 | actor = TanhGaussianPolicy(preprocess_net = net_a, 34 | action_shape = action_shape, 35 | hidden_layer_size = args['actor_features'], 36 | conditioned_sigma = True, 37 | ).to(args['device']) 38 | 39 | actor_optim = torch.optim.Adam(actor.parameters(), lr=args['actor_lr']) 40 | 41 | critic_1 = MLP(obs_shape + action_shape, 1, args['value_features'], args['value_layers'], hidden_activation='relu').to(args['device']) 42 | critic_2 = MLP(obs_shape + action_shape, 1, args['value_features'], args['value_layers'], hidden_activation='relu').to(args['device']) 43 | critic_1_optim = torch.optim.Adam([*critic_1.parameters()], lr=args['critic_lr']) 44 | critic_2_optim = torch.optim.Adam([*critic_2.parameters()], lr=args['critic_lr']) 45 | 46 | nets = { 47 | "actor" : {"net" : actor, "opt" : actor_optim}, 48 | "critic" : {"net" : [critic_1, critic_2], "opt" : [critic_1_optim,critic_2_optim]}, 49 | 50 | } 51 | 52 | return nets 53 | 54 | 55 | class AlgoTrainer(BaseAlgo): 56 | def __init__(self, algo_init, args): 57 | super(AlgoTrainer, self).__init__(args) 58 | self.args = args 59 | 60 | self.actor = algo_init['actor']['net'] 61 | self.actor_optim = algo_init['actor']['opt'] 62 | 63 | self.critic_1, self.critic_2 = algo_init['critic']['net'] 64 | self.target_critic_1 = deepcopy(self.critic_1) 65 | self.target_critic_2 = deepcopy(self.critic_2) 66 | self.critic_1_optim = algo_init['critic']['opt'][0] 67 | self.critic_2_optim = algo_init['critic']['opt'][1] 68 | 69 | self.alpha = self.args['alpha'] 70 | self.policy_noise = self.args['policy_noise'] 71 | self.noise_clip = self.args['noise_clip'] 72 | self.policy_freq = self.args['policy_freq'] 73 | self.discount = self.args['discount'] 74 | 75 | self.batch_size = self.args['batch_size'] 76 | self.device = self.args['device'] 77 | self.max_action = 1 78 | 79 | 80 | def forward(self, obs, reparameterize=True, return_log_prob=True): 81 | log_prob = None 82 | tanh_normal = self.actor(obs,reparameterize=reparameterize,) 83 | if return_log_prob: 84 | if reparameterize is True: 85 | action, pre_tanh_value = tanh_normal.rsample( 86 | return_pretanh_value=True 87 | ) 88 | else: 
89 | action, pre_tanh_value = tanh_normal.sample( 90 | return_pretanh_value=True 91 | ) 92 | log_prob = tanh_normal.log_prob( 93 | action, 94 | pre_tanh_value=pre_tanh_value 95 | ) 96 | log_prob = log_prob.sum(dim=1, keepdim=True) 97 | else: 98 | if reparameterize is True: 99 | action = tanh_normal.rsample() 100 | else: 101 | action = tanh_normal.sample() 102 | return action, log_prob 103 | 104 | def train(self, train_buffer, val_buffer, callback_fn): 105 | # train_buffer 106 | obs_mean = train_buffer["obs"].mean(0) 107 | obs_std = train_buffer["obs"].std(0) + 1e-3 108 | obs_mean = torch.as_tensor(obs_mean, dtype=torch.float32) 109 | obs_std = torch.as_tensor(obs_std, dtype=torch.float32) 110 | self.actor.preprocess.s_mean = obs_mean 111 | self.actor.preprocess.s_std = obs_std 112 | 113 | self.target_actor = deepcopy(self.actor) 114 | 115 | for epoch in range(self.args['max_epoch']): 116 | for i in range(self.args['steps_per_epoch']): 117 | batch_data = train_buffer.sample(self.batch_size) 118 | batch_data.to_torch(device=self.device) 119 | 120 | obs = batch_data['obs'] 121 | action = batch_data['act'] 122 | next_obs = batch_data['obs_next'] 123 | reward = batch_data['rew'] 124 | done = batch_data['done'].float() 125 | 126 | with torch.no_grad(): 127 | noise = (torch.randn_like(action) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip) 128 | next_action = (self.target_actor(next_obs).mode + noise).clamp(-self.max_action, self.max_action) 129 | next_obs_action = torch.cat([next_obs, next_action], dim=-1) 130 | target_q = torch.min( 131 | self.target_critic_1(next_obs_action), self.target_critic_2(next_obs_action) 132 | )*self.discount*(1-done) + reward 133 | 134 | obs_action = torch.cat([obs, action], dim=-1) 135 | current_q1, current_q2 = self.critic_1(obs_action), self.critic_2(obs_action) 136 | critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q) 137 | 138 | # Optimize the critic 139 | self.critic_1_optim.zero_grad() 140 | self.critic_2_optim.zero_grad() 141 | critic_loss.backward() 142 | self.critic_1_optim.step() 143 | self.critic_2_optim.step() 144 | 145 | 146 | if i % self.policy_freq == 0: 147 | pi = self.actor(obs).mode 148 | q = self.critic_1(torch.cat([obs, pi], dim=-1)) 149 | lmbda = self.alpha / q.abs().mean().detach() 150 | actor_loss = -lmbda * q.mean() + F.mse_loss(pi, action) 151 | 152 | self.actor_optim.zero_grad() 153 | actor_loss.backward() 154 | self.actor_optim.step() 155 | 156 | self._sync_weight(self.target_actor, self.actor, soft_target_tau=self.args['soft_target_tau']) 157 | self._sync_weight(self.target_critic_1, self.critic_1, soft_target_tau=self.args['soft_target_tau']) 158 | self._sync_weight(self.target_critic_2, self.critic_2, soft_target_tau=self.args['soft_target_tau']) 159 | 160 | res = callback_fn(self.get_policy()) 161 | 162 | res.update({ 163 | "actor_loss" : actor_loss.item(), 164 | "critic_loss" : critic_loss.item(), 165 | "lmbda" : lmbda.item(), 166 | "q" : q.mean().item(), 167 | }) 168 | 169 | 170 | self.log_res(epoch, res) 171 | 172 | return self.report_result 173 | 174 | def get_model(self): 175 | return self.actor 176 | 177 | def get_policy(self): 178 | return self.actor -------------------------------------------------------------------------------- /offlinerl/algo/online/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/online/__init__.py 
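The actor update in `td3bc.py` above is the core of TD3+BC: the Q-value term is rescaled by `lmbda = alpha / mean(|Q|)` so that it stays on a scale comparable to the behaviour-cloning MSE term. A toy sketch of that loss with made-up tensors (`alpha = 2.5` is the default from the original TD3+BC paper, not necessarily this repository's config):

```python
# Toy illustration of the TD3+BC actor loss used above:
# loss = -lmbda * Q(s, pi(s)) + MSE(pi(s), a),  lmbda = alpha / mean(|Q|)
import torch
import torch.nn.functional as F

alpha = 2.5                                       # paper default (assumption here)
q  = torch.tensor([[120.0], [80.0]])              # stand-in for critic_1(s, pi(s))
pi = torch.tensor([[0.2, -0.1], [0.5, 0.3]])      # policy actions (made-up)
a  = torch.tensor([[0.25, -0.05], [0.4, 0.35]])   # dataset actions (made-up)

lmbda = alpha / q.abs().mean().detach()           # normalizes the Q term's magnitude
actor_loss = -lmbda * q.mean() + F.mse_loss(pi, a)
print(lmbda.item(), actor_loss.item())
```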
-------------------------------------------------------------------------------- /offlinerl/algo/online/bremen.py: -------------------------------------------------------------------------------- 1 | # Deployment-Efficient Reinforcement Learning via Model-Based Offline Optimization 2 | # https://arxiv.org/abs/2006.03647 3 | # https://github.com/matsuolab/BREMEN 4 | 5 | # TODO -------------------------------------------------------------------------------- /offlinerl/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/config/__init__.py -------------------------------------------------------------------------------- /offlinerl/config/algo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/config/algo/__init__.py -------------------------------------------------------------------------------- /offlinerl/config/algo/bc_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | actor_features = 256 16 | actor_layers = 2 17 | 18 | batch_size = 256 19 | steps_per_epoch = 1000 20 | max_epoch = 1000 21 | 22 | actor_lr = 1e-3 23 | 24 | #tune 25 | params_tune = { 26 | "actor_lr" : {"type" : "continuous", "value": [1e-4, 1e-3]}, 27 | } 28 | 29 | #tune 30 | grid_tune = { 31 | "actor_lr" : [1e-4, 5e-4, 1e-3], 32 | "actor_layers" : [2,3], 33 | } 34 | 35 | -------------------------------------------------------------------------------- /offlinerl/config/algo/bc_model_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # model save path 16 | dynamics_path = None 17 | dynamics_save_path = None 18 | 19 | # transition model train 20 | transition_init_num = 7 21 | transition_select_num = 5 22 | val_ratio = 0.2 23 | max_epochs_since_update = 10 24 | transition_max_epochs = None 25 | 26 | # trick config 27 | normalize_obs = False 28 | transition_scaler = True 29 | 30 | # transition config 31 | transition_batch_size = 256 32 | transition_lr = 1e-3 33 | logvar_loss_coef = 0.01 34 | dynamics_hidden_dims = [200, 200, 200, 200] 35 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 36 | 37 | #tune 38 | params_tune = { 39 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]}, 40 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 41 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 42 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 43 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 44 | } 45 | 46 | #tune 47 | grid_tune = { 48 | "transition_scaler" : [True, False], 49 | "transition_lr" : [1e-3, 3e-4], 50 | 
"logvar_loss_coef" : [0.01, 1e-3], 51 | } 52 | -------------------------------------------------------------------------------- /offlinerl/config/algo/bcq_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | vae_features = 750 16 | vae_layers = 2 17 | jitter_features = 400 18 | jitter_layers = 2 19 | value_features = 400 20 | value_layers = 2 21 | phi = 0.05 22 | lam = 0.75 23 | 24 | batch_size = 100 25 | steps_per_epoch = 5000 26 | max_epoch = 200 27 | 28 | vae_lr = 1e-3 29 | jitter_lr = 3e-4 30 | critic_lr = 3e-4 31 | gamma = 0.99 32 | soft_target_tau = 5e-3 33 | 34 | #tune 35 | params_tune = { 36 | "phi" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 37 | "lam" : {"type" : "continuous", "value": [0, 1]}, 38 | } 39 | 40 | #tune 41 | grid_tune = { 42 | "phi" : [0.05, 0.1, 0.2, 0.5], 43 | } 44 | -------------------------------------------------------------------------------- /offlinerl/config/algo/bcqd_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | 16 | max_timesteps = 1e6 17 | eval_freq = 1e3 18 | 19 | optimizer_parameters = { 20 | "lr": 3e-4, 21 | } 22 | 23 | BCQ_threshold = 0.3 24 | 25 | discount = 0.99 26 | tau = 0.005 27 | polyak_target_update = True 28 | target_update_frequency=1 29 | start_timesteps = 1e3 30 | initial_eps = 0.1 31 | end_eps = 0.1 32 | eps_decay_period = 1 33 | eval_eps = 0.001 34 | buffer_size = 1e6 35 | batch_size = 256 36 | train_freq = 1 -------------------------------------------------------------------------------- /offlinerl/config/algo/bremen_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 10 | obs_shape = None 11 | act_shape = None 12 | 13 | dynamics_path = None 14 | behavior_path = None 15 | 16 | transition_hidden_size = 256 17 | transition_hidden_layers = 4 18 | transition_init_num = 7 19 | transition_select_num = 5 20 | 21 | actor_hidden_size = 256 22 | actor_hidden_layers = 2 23 | value_hidden_size = 256 24 | value_hidden_layers = 2 25 | 26 | transition_batch_size = 256 27 | data_collection_per_epoch = 50000 28 | max_epoch = 250 29 | trpo_steps_per_epoch = 25 30 | 31 | bc_batch_size = 256 32 | bc_init = True 33 | 34 | transition_lr = 1e-3 35 | bc_lr = 1e-3 36 | value_lr = 3e-4 37 | 38 | cg_iters = 10 39 | damping_coeff = 0.1 40 | backtrack_iters = 10 41 | backtrack_coeff = 0.8 42 | train_v_iters = 50 43 | trpo_step_size = 0.01 44 | explore_mode = 'sample' 45 | static_noise = 0.1 46 | 47 | horizon = 250 48 | gamma = 0.99 49 | lam = 0.95 50 | 51 | #tune 52 | params_tune = { 53 | "horizon" : {"type" : "discrete", 
"value": [250, 500, 1000]} 54 | } 55 | 56 | #tune 57 | grid_tune = { 58 | 'horizon' : [250, 1000], 59 | # 'trpo_step_size' : [0.01, 0.05], 60 | 'explore_mode' : ['sample', 'static'], 61 | } 62 | -------------------------------------------------------------------------------- /offlinerl/config/algo/combo_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # model save path 16 | dynamics_path = None 17 | dynamics_save_path = None 18 | 19 | # transition model train 20 | transition_init_num = 7 21 | transition_select_num = 5 22 | val_ratio = 0.2 23 | max_epochs_since_update = 5 24 | transition_max_epochs = None 25 | 26 | # trick config 27 | trainsition_clip = True 28 | normalize_obs = False 29 | transition_scaler = True 30 | policy_scaler = False 31 | 32 | # transition config 33 | transition_batch_size = 256 34 | transition_lr = 1e-3 35 | logvar_loss_coef = 0.01 36 | dynamics_hidden_dims = [200, 200, 200, 200] 37 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 38 | 39 | # alpha config 40 | learnable_alpha = True 41 | alpha_lr = 1e-4 42 | alpha = 0.2 43 | 44 | # train config 45 | horizon = 1 46 | real_data_ratio = 0.5 47 | max_epoch = 1000 48 | steps_per_epoch = 1000 49 | rollout_freq = 1000 50 | rollout_batch_size = 5e+4 51 | 52 | # policy config 53 | hidden_dims = [256, 256, 256] 54 | policy_batch_size = 256 55 | actor_lr = 1e-4 56 | 57 | # critic config 58 | critic_lr = 3e-4 59 | discount = 0.99 60 | soft_target_tau = 5e-3 61 | target_entropy = None 62 | 63 | # others 64 | val_frequency = 10 65 | eval_episodes = 10 66 | model_retain_epochs = 5 67 | 68 | # combo config 69 | cql_weight = 2.5 70 | temperatue = 1.0 71 | max_q_backup = False 72 | deterministic_backup = True 73 | with_lagrange = False 74 | lagrange_threshold = 10.0 75 | cql_alpha_lr = 3e-4 76 | num_repeat_actions = 10 77 | uniform_rollout = False 78 | rho_s = "mix" # choose from ["model", "mix"] 79 | 80 | #tune 81 | params_tune = { 82 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]}, 83 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 84 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 85 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 86 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 87 | } 88 | 89 | #tune 90 | grid_tune = { 91 | "horizon" : [1, 5], 92 | "cql_weight" : [2.5, 3.5, 5], 93 | "rho_s": ["model", "mix"], 94 | } 95 | -------------------------------------------------------------------------------- /offlinerl/config/algo/cql_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | max_epoch = 1000 16 | steps_per_epoch = 1000 17 | policy_bc_steps = 40000 18 | 19 | batch_size = 256 20 | hidden_layer_size = 256 21 | layer_num = 2 22 | actor_lr=1E-4 23 | critic_lr=3E-4 24 | reward_scale=1 25 | 
use_automatic_entropy_tuning=True 26 | target_entropy = None 27 | discount = 0.99 28 | soft_target_tau=5e-3 29 | 30 | # min Q 31 | explore=1.0 32 | temp=1.0 33 | min_q_version=3 34 | min_q_weight=5.0 35 | # lagrange 36 | with_lagrange=False 37 | lagrange_thresh=2.0 38 | 39 | # extra params 40 | num_random=10 41 | type_q_backup= "min" 42 | q_backup_lmbda = 0.75 43 | deterministic_backup=False 44 | 45 | discrete = False 46 | 47 | #tune 48 | params_tune = { 49 | "actor_lr" : {"type" : "discrete", "value":[1e-4, 3e-4]}, 50 | "min_q_version" : {"type" : "discrete", "value":[2, 3]}, 51 | "min_q_weight" : {"type": "discrete", "value":[5, 10]}, 52 | "lagrange_thresh" : {"type": "discrete", "value":[-1, 2, 5, 10]}, 53 | "type_q_backup" : {"type": "discrete", "value":["max", "none"]}, 54 | } 55 | 56 | #tune 57 | grid_tune = { 58 | #"actor_lr" : [1e-4, 3e-4], 59 | "min_q_version" : [2, 3], 60 | "min_q_weight" : [5, 10], 61 | "lagrange_thresh" : [-1, 2, 5, 10], 62 | # "type_q_backup" : ["min", "none"], 63 | } 64 | -------------------------------------------------------------------------------- /offlinerl/config/algo/crr_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | hidden_features = 256 16 | hidden_layers = 2 17 | atoms = 21 18 | 19 | advantage_mode = 'mean' 20 | weight_mode = 'exp' 21 | advantage_samples = 4 22 | beta = 1.0 23 | gamma = 0.99 24 | 25 | batch_size = 1024 26 | steps_per_epoch = 1000 27 | max_epoch = 200 28 | 29 | lr = 1e-4 30 | update_frequency = 100 31 | 32 | #tune 33 | params_tune = { 34 | "beta" : {"type" : "continuous", "value": [0.0, 10.0]}, 35 | } 36 | 37 | #tune 38 | grid_tune = { 39 | "advantage_mode" : ['mean', 'max'], 40 | "weight_mode" : ['exp', 'binary'], 41 | } 42 | -------------------------------------------------------------------------------- /offlinerl/config/algo/edac_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | # from datetime import datetime 4 | 5 | task = "Hopper-v3" 6 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 7 | 8 | 9 | # parser.add_argument("--algo-name", type=str, default="edac") 10 | # parser.add_argument("--task", type=str, default="SafetyHalfCheetah") 11 | obs_shape = None 12 | act_shape = None 13 | 14 | seed = 42 15 | actor_lr=1e-4 16 | critic_lr=3e-4 17 | task_train_num = 99 18 | task_data_type = 'high' 19 | # hidden_dims=[256, 256, 256] 20 | hidden_layer_size = 256 21 | layer_num = 2 22 | gamma=0.99 23 | tau=0.005 24 | alpha=0.2 25 | auto_alpha=True 26 | 27 | target_entropy = None 28 | alpha_lr =1e-4 29 | num_critics = 50 30 | 31 | max_q_backup = False 32 | deterministic_backup=False 33 | 34 | eta=1.0 35 | normalize_reward=False 36 | 37 | epoch=3000 38 | step_per_epoch=1000 39 | 40 | eval_episodes=100 41 | batch_size=256 42 | 43 | #tune 44 | params_tune = { 45 | "num_critics" : {"type" : "discrete", "value":[10,50]}, 46 | "eta" : {"type" : "discrete", "value":[1, 5]}, 47 | } 48 | 49 | grid_tune = { 50 | "num_critics" : [10, 50], 51 | "eta" : [1, 5], 52 | } 53 | 54 | 55 | # task_data_type = "low" 56 | # task_train_num = 
99 57 | 58 | # seed = 42 59 | 60 | # device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 61 | # obs_shape = None 62 | # act_shape = None 63 | # max_action = None 64 | 65 | # max_epoch = 300 66 | # steps_per_epoch = 1000 67 | # policy_bc_steps = 40000 68 | 69 | # batch_size = 256 70 | # hidden_layer_size = 256 71 | # layer_num = 2 72 | # actor_lr=1E-4 73 | # critic_lr=3E-4 74 | # reward_scale=1 75 | # use_automatic_entropy_tuning=True 76 | # target_entropy = None 77 | # discount = 0.99 78 | # soft_target_tau=5e-3 79 | 80 | # # min Q 81 | # explore=1.0 82 | # temp=1.0 83 | # min_q_version=3 84 | # min_q_weight=5.0 85 | 86 | # # lagrange 87 | # with_lagrange=False 88 | # lagrange_thresh=2.0 89 | 90 | # # extra params 91 | # num_random=10 92 | # type_q_backup= "min" 93 | # q_backup_lmbda = 0.75 94 | # deterministic_backup=False 95 | 96 | # discrete = False 97 | 98 | #tune 99 | # params_tune = { 100 | # "actor_lr" : {"type" : "discrete", "value":[1e-4, 3e-4]}, 101 | # "min_q_version" : {"type" : "discrete", "value":[2, 3]}, 102 | # "min_q_weight" : {"type": "discrete", "value":[5, 10]}, 103 | # "lagrange_thresh" : {"type": "discrete", "value":[-1, 2, 5, 10]}, 104 | # "type_q_backup" : {"type": "discrete", "value":["max", "none"]}, 105 | # } 106 | 107 | # #tune 108 | # grid_tune = { 109 | # #"actor_lr" : [1e-4, 3e-4], 110 | # "min_q_version" : [2, 3], 111 | # "min_q_weight" : [5, 10], 112 | # "lagrange_thresh" : [-1, 2, 5, 10], 113 | # # "type_q_backup" : ["min", "none"], 114 | # } 115 | -------------------------------------------------------------------------------- /offlinerl/config/algo/maple_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | # device = 'cuda:0' 12 | obs_shape = None 13 | act_shape = None 14 | max_action = None 15 | # new parameters based on mopo 16 | lstm_hidden_unit = 128 17 | Guassain_hidden_sizes = (256,256) 18 | value_hidden_sizes=(256,256) 19 | hidden_sizes=(16,) 20 | model_pool_size = 250000 21 | rollout_batch_size = 50000 22 | handle_per_round = 400 23 | out_train_epoch = 1000 24 | in_train_epoch = 1000 25 | 26 | train_batch_size = 256 # train policy num of trajectories 27 | 28 | number_runs_eval = 40 # evaluation epochs in mujoco 29 | 30 | #------------- 31 | dynamics_path = None 32 | dynamics_save_path = None 33 | only_dynamics = False 34 | 35 | hidden_layer_size = 256 36 | hidden_layers = 2 37 | transition_layers = 4 38 | 39 | transition_init_num = 20 40 | transition_select_num = 14 41 | # by selecting a number smaller than rollout_batch_size, you can protect the model rollout from OOM error 42 | mini_forward_size = -1 43 | 44 | real_data_ratio = 0.05 45 | 46 | transition_batch_size = 256 47 | policy_batch_size = 256 48 | data_collection_per_epoch = 50e3 49 | steps_per_epoch = 1000 50 | max_epoch = 1000 51 | 52 | 53 | eval_episodes = 100 54 | 55 | learnable_alpha = True 56 | uncertainty_mode = 'aleatoric' 57 | transition_lr = 1e-3 58 | actor_lr = 3e-4 59 | critic_lr = 3e-4 60 | discount = 0.99 61 | soft_target_tau = 5e-3 62 | 63 | horizon = 10 64 | lam = 0.25 65 | 66 | penalty_clip = 20 67 | mode = 'normalize' # 'normalize', 'local', 'noRes' 68 | 69 | #tune 70 | params_tune = { 71 | "buffer_size" : {"type" : "discrete", "value": [1e6, 
2e6]}, 72 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 73 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 74 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 75 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 76 | } 77 | 78 | #tune 79 | grid_tune = { 80 | "horizon" : [1, 5], 81 | "lam" : [0.5, 1, 2, 5], 82 | "uncertainty_mode" : ['aleatoric', 'disagreement'], 83 | } 84 | -------------------------------------------------------------------------------- /offlinerl/config/algo/maple_config_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # transition model train 16 | transition_init_num = 20 17 | transition_select_num = 14 18 | val_ratio = 0.2 19 | max_epochs_since_update = 5 20 | transition_max_epochs = None 21 | 22 | # trick config 23 | trainsition_clip = False 24 | normalize_obs = False # should set to False 25 | transition_scaler = True 26 | 27 | # transition config 28 | transition_batch_size = 256 29 | transition_lr = 1e-3 30 | logvar_loss_coef = 0.01 31 | dynamics_hidden_dims = [200, 200, 200, 200] 32 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 33 | 34 | # new parameters based on mopo 35 | lstm_hidden_unit = 128 36 | Guassain_hidden_sizes = (256,256) 37 | value_hidden_sizes=(256,256) 38 | hidden_sizes=(16,) 39 | model_pool_size = 250000 40 | rollout_batch_size = 50000 41 | handle_per_round = 400 42 | out_train_epoch = 1000 43 | in_train_epoch = 1000 44 | 45 | train_batch_size = 256 # train policy num of trajectories 46 | 47 | number_runs_eval = 40 # evaluation epochs in mujoco 48 | 49 | #------------- 50 | dynamics_path = None 51 | dynamics_save_path = None 52 | only_dynamics = False 53 | 54 | hidden_layer_size = 256 55 | hidden_layers = 2 56 | 57 | real_data_ratio = 0.05 58 | 59 | policy_batch_size = 256 60 | data_collection_per_epoch = 50e3 61 | steps_per_epoch = 1000 62 | max_epoch = 1000 63 | 64 | eval_episodes = 100 65 | 66 | # alpha config 67 | learnable_alpha = True 68 | alpha_lr = 1e-4 69 | alpha = 0.2 70 | target_entropy = None 71 | 72 | uncertainty_mode = 'aleatoric' 73 | actor_lr = 3e-4 74 | critic_lr = 3e-4 75 | discount = 0.99 76 | soft_target_tau = 5e-3 77 | 78 | horizon = 10 79 | penalty_coef = 0.25 80 | 81 | penalty_clip = 20 82 | 83 | #tune 84 | params_tune = { 85 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]}, 86 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 87 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 88 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 89 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 90 | } 91 | 92 | #tune 93 | grid_tune = { 94 | "horizon" : [1, 5], 95 | "lam" : [0.5, 1, 2, 5], 96 | "uncertainty_mode" : ['aleatoric', 'disagreement'], 97 | } 98 | -------------------------------------------------------------------------------- /offlinerl/config/algo/mcq_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device 
= 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | vae_features = 750 16 | vae_layers = 2 17 | actor_features = 400 18 | actor_layers = 2 19 | value_features = 400 20 | value_layers = 2 21 | lam = 0.95 22 | 23 | alpha = 0.2 24 | auto_alpha = True 25 | target_entropy = None 26 | 27 | batch_size = 256 28 | steps_per_epoch = 1000 29 | max_epoch = 1000 30 | 31 | vae_lr = 1e-3 32 | actor_lr = 3e-4 33 | critic_lr = 3e-4 34 | alpha_lr = 3e-4 35 | gamma = 0.99 36 | soft_target_tau = 5e-3 37 | 38 | num_sampled_actions = 10 39 | eval_episodes = 100 40 | 41 | #tune 42 | params_tune = { 43 | "lam" : {"type" : "continuous", "value": [0.3, 0.95]}, 44 | } 45 | 46 | #tune 47 | grid_tune = { 48 | "lam" : [0.3,0.4,0.5, 0.6, 0.7, 0.8, 0.9, 0.95], 49 | "auto_alpha" : [True, False], 50 | } 51 | -------------------------------------------------------------------------------- /offlinerl/config/algo/mobile_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # model save path 16 | dynamics_path = None 17 | dynamics_save_path = None 18 | 19 | # transition model train 20 | transition_init_num = 7 21 | transition_select_num = 5 22 | val_ratio = 0.2 23 | max_epochs_since_update = 5 24 | transition_max_epochs = None 25 | 26 | # trick config 27 | trainsition_clip = True 28 | normalize_obs = False 29 | transition_scaler = True 30 | policy_scaler = False 31 | 32 | # transition config 33 | transition_batch_size = 256 34 | transition_lr = 1e-3 35 | logvar_loss_coef = 0.01 36 | dynamics_hidden_dims = [200, 200, 200, 200] 37 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 38 | 39 | # alpha config 40 | learnable_alpha = True 41 | alpha_lr = 1e-4 42 | alpha = 0.2 43 | 44 | # train config 45 | horizon = 5 46 | real_data_ratio = 0.05 47 | max_epoch = 3000 48 | steps_per_epoch = 1000 49 | rollout_freq = 1000 50 | rollout_batch_size = 5e+4 51 | 52 | # policy config 53 | hidden_dims = [256, 256] 54 | policy_batch_size = 256 55 | actor_lr = 1e-4 56 | 57 | # critic config 58 | critic_lr = 3e-4 59 | discount = 0.99 60 | soft_target_tau = 5e-3 61 | target_entropy = None 62 | 63 | # others 64 | val_frequency = 10 65 | eval_episodes = 10 66 | model_retain_epochs = 5 67 | 68 | # mobile config 69 | num_q_ensemble = 2 70 | penalty_coef = 3.5 71 | num_samples = 10 72 | 73 | #tune 74 | params_tune = { 75 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]}, 76 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 77 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 78 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 79 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 80 | } 81 | 82 | #tune 83 | grid_tune = { 84 | "horizon" : [1, 5], 85 | "penalty_coef" : [0.5, 1.5, 2.5, 3.5], 86 | "real_data_ratio" :[0.05], 87 | } 88 | -------------------------------------------------------------------------------- /offlinerl/config/algo/moose_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = 
"Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | vae_iterations = 500000 16 | vae_hidden_size = 750 17 | vae_batch_size = 100 18 | vae_kl_weight = 0.5 19 | #vae_pretrain_model = "/tmp/vae_499999.pkl" 20 | 21 | 22 | latent = False 23 | layer_num = 3 24 | actor_batch_size = 100 25 | hidden_layer_size = 256 26 | actor_iterations = 500000 27 | vae_lr = 1e-4 28 | actor_lr = 1e-4 29 | critic_lr = 1e-3 30 | soft_target_tau = 0.005 31 | lmbda = 0.75 32 | discount = 0.99 33 | 34 | max_latent_action = 2 35 | phi = 0.05 36 | 37 | #tune 38 | params_tune = { 39 | "vae_iterations" : {"type" : "continuous", "value":[50000, 100000, 500000,]}, 40 | "actor_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]}, 41 | "vae_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]}, 42 | "lmbda" :{"type": "discrete", "value":[0.0, 0.25, 0.5, 0.75, 1.0]}, 43 | } 44 | -------------------------------------------------------------------------------- /offlinerl/config/algo/mopo_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # model save path 16 | dynamics_path = None 17 | dynamics_save_path = None 18 | 19 | # transition model train 20 | transition_init_num = 7 21 | transition_select_num = 5 22 | val_ratio = 0.2 23 | max_epochs_since_update = 5 24 | transition_max_epochs = None 25 | 26 | # trick config 27 | trainsition_clip = False 28 | normalize_obs = False 29 | transition_scaler = True 30 | policy_scaler = False 31 | 32 | # transition config 33 | transition_batch_size = 256 34 | transition_lr = 1e-3 35 | logvar_loss_coef = 0.01 36 | dynamics_hidden_dims = [200, 200, 200, 200] 37 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 38 | 39 | # alpha config 40 | learnable_alpha = True 41 | alpha_lr = 1e-4 42 | alpha = 0.2 43 | target_entropy = None 44 | 45 | # train config 46 | horizon = 1 47 | real_data_ratio = 0.05 48 | max_epoch = 3000 49 | steps_per_epoch = 1000 50 | rollout_freq = 1000 51 | rollout_batch_size = 5e+4 52 | 53 | # policy config 54 | hidden_dims = [256, 256] 55 | policy_batch_size = 256 56 | actor_lr = 1e-4 57 | 58 | # critic config 59 | critic_lr = 3e-4 60 | discount = 0.99 61 | soft_target_tau = 5e-3 62 | 63 | # others 64 | model_retain_epochs = 5 65 | 66 | # mopo config 67 | uncertainty_mode = 'aleatoric' 68 | penalty_coef = 1 69 | 70 | #tune 71 | params_tune = { 72 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]}, 73 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 74 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 75 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 76 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 77 | } 78 | 79 | #tune 80 | grid_tune = { 81 | "horizon" : [1, 5], 82 | "penalty_coef" : [0.5, 1, 2, 5], 83 | "uncertainty_mode" : ['aleatoric', 'disagreement'], 84 | } 85 | -------------------------------------------------------------------------------- /offlinerl/config/algo/plas_config.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | vae_iterations = 500000 16 | vae_hidden_size = 750 17 | vae_batch_size = 100 18 | vae_kl_weight = 0.5 19 | 20 | latent = True 21 | layer_num = 2 22 | actor_batch_size = 100 23 | hidden_layer_size = 256 24 | actor_iterations = 500000 25 | vae_lr = 1e-4 26 | actor_lr = 1e-4 27 | critic_lr = 1e-3 28 | soft_target_tau = 0.005 29 | lmbda = 0.75 30 | discount = 0.99 31 | 32 | max_latent_action = 2 33 | phi = 0.05 34 | 35 | #tune 36 | params_tune = { 37 | "vae_iterations" : {"type" : "discrete", "value":[50000, 100000, 500000,]}, 38 | "actor_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]}, 39 | "vae_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]}, 40 | "actor_batch_size" : {"type": "discrete", "value":[128, 256, 512]}, 41 | "latent" : {"type": "discrete", "value":[True, False]}, 42 | "lmbda" :{"type": "discrete", "value":[0.65, 0.75, 0.85]}, 43 | } 44 | 45 | #tune 46 | grid_tune = { 47 | "phi" : [0, 0.05, 0.1, 0.2, 0.4], 48 | } 49 | -------------------------------------------------------------------------------- /offlinerl/config/algo/prdc_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 10 | 11 | 12 | steps_per_epoch = 1000 13 | max_epoch = 1000 14 | batch_size = 256 15 | state_dim = None 16 | action_dim = None 17 | alpha = 2.5 18 | beta = 2.0 19 | k = 1 20 | policy_freq = 2 21 | noise_clip = 0.5 22 | policy_noise = 2 23 | discount = 0.99 24 | tau = 0.005 25 | expl_noise = 0.1 26 | critic_lr = 3e-4 27 | actor_lr = 3e-4 28 | max_action = 1.0 29 | 30 | 31 | 32 | #tune 33 | grid_tune = { 34 | "alpha" : [2.5, 7.5, 20.0, 40.0], 35 | "beta" : [2.0, 7.5, 15.0], 36 | } -------------------------------------------------------------------------------- /offlinerl/config/algo/rambo_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Simglucose" 5 | task_data_type = "medium" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # model save path 16 | policy_bc_path = None 17 | policy_bc_save_path = None 18 | dynamics_path = None 19 | dynamics_save_path = None 20 | 21 | # transition model train 22 | transition_init_num = 7 23 | transition_select_num = 5 24 | val_ratio = 0.2 25 | max_epochs_since_update = 5 26 | transition_max_epochs = None 27 | 28 | # trick config 29 | trainsition_clip = True 30 | normalize_obs = False 31 | transition_scaler = True 32 | policy_scaler = True 33 | 34 | # transition config 35 | transition_batch_size = 256 36 | transition_lr = 1e-3 # 3e-4 37 | logvar_loss_coef = 0.01 # 1e-3 38 | dynamics_hidden_dims = [200, 200, 200, 200] 39 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 40 | 41 
| # alpha config 42 | learnable_alpha = True 43 | alpha_lr = 1e-4 44 | alpha = 0.2 45 | 46 | # train config 47 | horizon = 5 48 | real_data_ratio = 0.5 49 | max_epoch = 2000 50 | steps_per_epoch = 1000 51 | rollout_freq = 250 52 | rollout_batch_size = 5e+4 53 | 54 | # policy config 55 | hidden_dims = [256, 256] 56 | policy_batch_size = 256 57 | actor_lr = 1e-4 58 | 59 | # critic config 60 | critic_lr = 3e-4 61 | discount = 0.99 62 | soft_target_tau = 5e-3 63 | target_entropy = None 64 | 65 | # others 66 | val_frequency = 10 67 | eval_episodes = 10 68 | model_retain_epochs = 5 69 | 70 | # rambo config 71 | policy_bc_epoch = 50 72 | policy_bc_batch_size = 256 73 | policy_bc_lr = 1e-4 74 | 75 | transition_adv_lr = 3e-4 76 | dynamics_update_freq = 1000 77 | adv_train_steps = 1000 78 | adv_rollout_batch_size = 256 79 | adv_rollout_length = 5 80 | include_ent_in_adv = False 81 | adv_weight = 3e-4 82 | 83 | #tune 84 | params_tune = { 85 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 86 | "horizon" : {"type" : "discrete", "value": [1, 2, 5]}, 87 | "adv_weight" : {"type" : "discrete", "value": [0, 3e-4]}, 88 | } 89 | 90 | #tune 91 | grid_tune = { 92 | "horizon" : [1, 5], 93 | "transition_adv_lr" : [1e-3, 3e-4], 94 | "adv_weight" : [0, 1e-3, 3e-4], 95 | } -------------------------------------------------------------------------------- /offlinerl/config/algo/td3bc_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 10 | obs_shape = None 11 | act_shape = None 12 | max_action = None 13 | 14 | 15 | actor_features = 256 16 | actor_layers = 2 17 | value_features = 256 18 | value_layers = 2 19 | 20 | alpha = 2.5 21 | policy_noise = 0.2 22 | noise_clip = 0.5 23 | policy_freq = 2 24 | 25 | 26 | batch_size = 256 27 | steps_per_epoch = 1000 28 | max_epoch = 1000 29 | 30 | 31 | actor_lr = 3e-4 32 | critic_lr = 3e-4 33 | alpha_lr = 3e-4 34 | discount = 0.99 35 | soft_target_tau = 5e-3 36 | 37 | num_sampled_actions = 10 38 | eval_episodes = 100 39 | 40 | #tune 41 | grid_tune = { 42 | "alpha" : [0.05, 0.1, 0.2], 43 | "policy_noise" : [0.5, 1.5, 2.5], 44 | } 45 | -------------------------------------------------------------------------------- /offlinerl/data/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import random 4 | import numpy as np 5 | from loguru import logger 6 | 7 | from offlinerl.utils.logger import log_path 8 | from offlinerl.utils.io import create_dir, download_helper, read_json 9 | 10 | from offlinerl.data.neorl import load_neorl_buffer 11 | 12 | dataset_dir = os.path.join(log_path(),"./offlinerl_datasets") 13 | create_dir(dataset_dir) 14 | 15 | def load_data_from_neorl2_util(task): 16 | 17 | import neorl2 18 | import gymnasium as gym 19 | 20 | env = neorl2.make(task) 21 | if 'fusion' in task.lower(): 22 | train_data, val_data = env.get_dataset(traj_num=20) 23 | else: 24 | train_data, val_data = env.get_dataset() 25 | 26 | return train_data, val_data 27 | 28 | def load_data_from_neorl2(task): 29 | train_data, val_data = load_data_from_neorl2_util(task) 30 | train_buffer = load_neorl_buffer({ 31 | 'obs': train_data["obs"].astype(np.float32), 32 | 'action': train_data["action"].astype(np.float32), 33 | 
'next_obs': train_data["next_obs"].astype(np.float32), 34 | 'reward': train_data["reward"].astype(np.float32).reshape(-1, 1), 35 | 'done': np.bool_(train_data["done"]).reshape(-1, 1), 36 | }) 37 | 38 | val_buffer = load_neorl_buffer({ 39 | 'obs': val_data["obs"].astype(np.float32), 40 | 'action': val_data["action"].astype(np.float32), 41 | 'next_obs': val_data["next_obs"].astype(np.float32), 42 | 'reward': val_data["reward"].astype(np.float32).reshape(-1, 1), 43 | 'done': np.bool_(val_data["done"]).reshape(-1, 1), 44 | }) 45 | 46 | return train_buffer, val_buffer 47 | 48 | def load_data_from_neorl(task, task_data_type = "low", task_train_num = 99): 49 | try: 50 | import neorl 51 | env = neorl.make(task) 52 | train_data, val_data = env.get_dataset(data_type = task_data_type, train_num = task_train_num) 53 | train_buffer, val_buffer = load_neorl_buffer(train_data), load_neorl_buffer(val_data) 54 | logger.info(f"Load task data from neorl. -> {task}") 55 | except: 56 | train_buffer, val_buffer = load_data_from_neorl2(task) 57 | logger.info(f"Load task data from neorl2. -> {task}") 58 | return train_buffer, val_buffer -------------------------------------------------------------------------------- /offlinerl/data/d4rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from d4rl import gym_mujoco 5 | import gym 6 | import d4rl 7 | import numpy as np 8 | from loguru import logger 9 | 10 | from offlinerl.utils.data import SampleBatch 11 | 12 | def load_d4rl_buffer(task): 13 | env = gym.make(task[5:]) 14 | dataset = d4rl.qlearning_dataset(env) 15 | 16 | buffer = SampleBatch( 17 | obs=dataset['observations'], 18 | obs_next=dataset['next_observations'], 19 | act=dataset['actions'], 20 | rew=np.expand_dims(np.squeeze(dataset['rewards']), 1), 21 | done=np.expand_dims(np.squeeze(dataset['terminals']), 1), 22 | ) 23 | 24 | logger.info('obs shape: {}', buffer.obs.shape) 25 | logger.info('obs_next shape: {}', buffer.obs_next.shape) 26 | logger.info('act shape: {}', buffer.act.shape) 27 | logger.info('rew shape: {}', buffer.rew.shape) 28 | logger.info('done shape: {}', buffer.done.shape) 29 | logger.info('Episode reward: {}', buffer.rew.sum() /np.sum(buffer.done) ) 30 | logger.info('Number of terminals on: {}', np.sum(buffer.done)) 31 | return buffer 32 | -------------------------------------------------------------------------------- /offlinerl/data/neorl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from loguru import logger 3 | 4 | from offlinerl.utils.data import SampleBatch, get_scaler 5 | from offlinerl.utils.data import BufferDataset, BufferDataloader 6 | 7 | def load_neorl_buffer(data): 8 | buffer = SampleBatch( 9 | obs = data["obs"], 10 | obs_next = data["next_obs"], 11 | act = data["action"], 12 | rew = data["reward"], 13 | done = data["done"], 14 | ) 15 | 16 | logger.info('obs shape: {}', buffer.obs.shape) 17 | logger.info('obs_next shape: {}', buffer.obs_next.shape) 18 | logger.info('act shape: {}', buffer.act.shape) 19 | logger.info('rew shape: {}', buffer.rew.shape) 20 | logger.info('done shape: {}', buffer.done.shape) 21 | logger.info('Episode reward: {}', buffer.rew.sum() /np.sum(buffer.done) ) 22 | logger.info('Number of terminals on: {}', np.sum(buffer.done)) 23 | 24 | """ 25 | rew_scaler = get_scaler(buffer.rew) 26 | buffer.rew = rew_scaler.transform(buffer.rew) 27 | buffer.rew = buffer.rew * 0.01 28 | buffer.done[buffer.rew < 
np.sort(buffer.rew.reshape(-1))[int(len(buffer)*0.01)]] = 1 29 | 30 | buffer = BufferDataset(buffer) 31 | buffer = BufferDataloader(buffer, batch_size=1, collate_fn=lambda x: x[0], num_workers=8) 32 | """ 33 | 34 | return buffer 35 | -------------------------------------------------------------------------------- /offlinerl/evaluation/d4rl.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import d4rl 3 | import torch 4 | import numpy as np 5 | from tqdm import tqdm 6 | from collections import OrderedDict 7 | from d4rl.infos import REF_MIN_SCORE, REF_MAX_SCORE 8 | 9 | from offlinerl.utils.env import get_env 10 | 11 | 12 | def d4rl_score(task, rew_mean, len_mean): 13 | score = (rew_mean - REF_MIN_SCORE[task]) / (REF_MAX_SCORE[task] - REF_MIN_SCORE[task]) * 100 14 | 15 | return score 16 | 17 | 18 | def d4rl_eval_fn(task, eval_episodes=100): 19 | env = get_env(task) 20 | 21 | def d4rl_eval(policy): 22 | episode_rewards = [] 23 | episode_lengths = [] 24 | for _ in range(eval_episodes): 25 | state, done = env.reset(), False 26 | rewards = 0 27 | lengths = 0 28 | while not done: 29 | state = state[np.newaxis] 30 | action = policy.get_action(state) 31 | state, reward, done, _ = env.step(action) 32 | rewards += reward 33 | lengths += 1 34 | 35 | episode_rewards.append(rewards) 36 | episode_lengths.append(lengths) 37 | 38 | 39 | rew_mean = np.mean(episode_rewards) 40 | len_mean = np.mean(episode_lengths) 41 | 42 | score = d4rl_score(task, rew_mean, len_mean) 43 | 44 | res = OrderedDict() 45 | res["Reward_Mean"] = rew_mean 46 | res["Length_Mean"] = len_mean 47 | res["D4rl_Score"] = score 48 | 49 | return res 50 | 51 | return d4rl_eval -------------------------------------------------------------------------------- /offlinerl/evaluation/fqe.py: -------------------------------------------------------------------------------- 1 | # https://arxiv.org/abs/2007.09055 2 | # Hyperparameter Selection for Offline Reinforcement Learning 3 | from copy import deepcopy 4 | import torch 5 | from tqdm import tqdm 6 | 7 | from offlinerl.utils.net.common import MLP 8 | from offlinerl.utils.net.continuous import DistributionalCritic 9 | 10 | class FQE: 11 | # https://arxiv.org/abs/2007.09055 12 | # Hyperparameter Selection for Offline Reinforcement Learning 13 | def __init__(self, 14 | policy, 15 | buffer, 16 | q_hidden_features=1024, 17 | q_hidden_layers=4, 18 | device="cuda" if torch.cuda.is_available() else "cpu" 19 | ): 20 | self.policy = policy 21 | self.buffer = buffer 22 | self.critic_hidden_features = q_hidden_features 23 | self.critic_hidden_layers = q_hidden_layers 24 | self._device = device 25 | 26 | def train_estimator(self, 27 | init_critic=None, 28 | discount=0.99, 29 | target_update_period=100, 30 | critic_lr=1e-4, 31 | num_steps=250000, 32 | polyak=0.0, 33 | batch_size=256, 34 | verbose=False): 35 | 36 | min_reward = self.buffer.rew.min() 37 | max_reward = self.buffer.rew.max() 38 | 39 | max_value = (1.2 * max_reward + 0.8 * min_reward) / (1 - discount) 40 | min_value = (1.2 * min_reward + 0.8 * max_reward) / (1 - discount) 41 | 42 | data = self.buffer.sample(batch_size) 43 | input_dim = data.obs.shape[-1] + data.act.shape[-1] 44 | critic = MLP(input_dim, 1, self.critic_hidden_features, self.critic_hidden_layers).to(self._device) 45 | if init_critic is not None: critic.load_state_dict(init_critic.state_dict()) 46 | critic_optimizer = torch.optim.Adam(critic.parameters(), lr=critic_lr) 47 | target_critic = deepcopy(critic).to(self._device) 48 | 
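        # NOTE: FQE fits a Q-function for the given policy from its own
        # bootstrapped targets. The target critic created just above is frozen
        # below and refreshed every `target_update_period` steps by the polyak
        # update further down (polyak=0.0, the default, makes that a hard copy);
        # targets are clamped to [min_value, max_value] in the training loop to
        # prevent value explosion.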
target_critic.requires_grad_(False) 49 | 50 | if verbose: 51 | counter = tqdm(total=num_steps) 52 | 53 | print('Training Fqe...') 54 | for t in range(num_steps): 55 | batch = self.buffer.sample(batch_size) 56 | data = batch.to_torch(dtype=torch.float32, device=self._device) 57 | r = data.rew 58 | terminals = data.done 59 | o1 = data.obs 60 | a1 = data.act 61 | 62 | o2 = data.obs_next 63 | a2 = self.policy.get_action(o2) 64 | q_target = target_critic(torch.cat((o2, a2), -1)).detach() 65 | current_discount = discount * (1 - terminals) 66 | backup = r + current_discount * q_target 67 | backup = torch.clamp(backup, min_value, max_value) # prevent explosion 68 | 69 | q = critic(torch.cat((o1, a1), -1)) 70 | critic_loss = ((q - backup) ** 2).mean() 71 | 72 | critic_optimizer.zero_grad() 73 | critic_loss.backward() 74 | critic_optimizer.step() 75 | 76 | if t % target_update_period == 0: 77 | with torch.no_grad(): 78 | for p, p_targ in zip(critic.parameters(), target_critic.parameters()): 79 | p_targ.data.mul_(polyak) 80 | p_targ.data.add_((1 - polyak) * p.data) 81 | 82 | if verbose: 83 | counter.update(1) 84 | 85 | return critic -------------------------------------------------------------------------------- /offlinerl/evaluation/gym.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import OrderedDict 4 | 5 | from offlinerl.utils.env import get_env 6 | 7 | def gym_policy_eval(task, eval_episodes=100): 8 | env = get_env(task) 9 | 10 | def policy_eval(policy): 11 | episode_rewards = [] 12 | episode_lengths = [] 13 | for _ in range(eval_episodes): 14 | state, done = env.reset(), False 15 | rewards = 0 16 | lengths = 0 17 | while not done: 18 | state = state[np.newaxis] 19 | action = policy.get_action(state).reshape(-1) 20 | state, reward, done, _ = env.step(action) 21 | rewards += reward 22 | lengths += 1 23 | 24 | episode_rewards.append(rewards) 25 | episode_lengths.append(lengths) 26 | 27 | 28 | rew_mean = np.mean(episode_rewards) 29 | len_mean = np.mean(episode_lengths) 30 | 31 | 32 | res = OrderedDict() 33 | res["Reward_Mean"] = rew_mean 34 | res["Length_Mean"] = len_mean 35 | 36 | return res 37 | 38 | return policy_eval 39 | 40 | 41 | def gym_env_eval(task, eval_episodes=100): 42 | env = get_env(task) 43 | 44 | def env_eval(policy, obs_scaler=None, act_scaler=None): 45 | env_mae = [] 46 | for _ in range(eval_episodes): 47 | state, done = env.reset(), False 48 | rewards = 0 49 | lengths = 0 50 | while not done: 51 | state = state[np.newaxis] 52 | action = env.action_space.sample() 53 | 54 | obs = state.reshape(1,-1) 55 | act = action.reshape(1,-1) 56 | if obs_scaler is not None: 57 | obs = obs_scaler.transform(obs) 58 | if act_scaler is not None: 59 | act = act_scaler.transform(act) 60 | 61 | policy_state = policy.get_action(np.concatenate([obs,act], axis=1)) 62 | 63 | if obs_scaler is not None: 64 | policy_state = obs_scaler.inverse_transform(policy_state) 65 | 66 | state, reward, done, _ = env.step(action) 67 | 68 | env_mae.append(np.mean(np.abs(policy_state -state))) 69 | 70 | env_mae = np.mean(env_mae) 71 | 72 | 73 | res = OrderedDict() 74 | res["Env_Mae"] = env_mae 75 | 76 | return res 77 | 78 | return env_eval -------------------------------------------------------------------------------- /offlinerl/evaluation/neorl.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import ray 3 | from copy import deepcopy 4 | import numpy as np 5 | from 
collections import OrderedDict 6 | 7 | from offlinerl.utils.env import get_env 8 | from multiprocessing import Pool 9 | 10 | 11 | #@ray.remote(num_gpus=0.1) 12 | def test_one_trail(env, policy): 13 | # env = deepcopy(env) 14 | # policy = deepcopy(policy) 15 | 16 | state, done = env.reset(), False 17 | if isinstance(state, tuple): 18 | state = state[0] 19 | rewards = 0 20 | lengths = 0 21 | while not done: 22 | state = state[np.newaxis] 23 | action = policy.get_action(state).reshape(-1) 24 | result = env.step(action) 25 | if len(result) == 4: 26 | state, reward, done, _ = result 27 | else: 28 | state, reward, done, timeout,_ = result 29 | done = done or timeout 30 | rewards += reward 31 | lengths += 1 32 | 33 | return (rewards, lengths) 34 | 35 | def test_one_trail_sp_local(env, policy): 36 | # env = deepcopy(env) 37 | # policy = deepcopy(policy) 38 | 39 | state, done = env.reset(), False 40 | rewards = 0 41 | lengths = 0 42 | obs_dim = env.observation_space.shape[0] 43 | act_dim = env.action_space.shape[0] 44 | 45 | while not done: 46 | state = state.reshape(-1, obs_dim) 47 | action = policy.get_action(state).reshape(-1, act_dim) 48 | # print("actions: ", action[0:3,]) 49 | state, reward, done, _ = env.step(action) 50 | rewards += reward 51 | lengths += 1 52 | 53 | return (rewards, lengths) 54 | 55 | def test_on_real_env(policy, env, number_of_runs=100): 56 | rewards = [] 57 | episode_lengths = [] 58 | policy = deepcopy(policy) 59 | policy.eval() 60 | 61 | if (not hasattr(env.spec, "id")) and ("sp" in env._name or "sales" in env._name): 62 | results = [test_one_trail_sp_local(env, policy) for _ in range(number_of_runs)] 63 | else: 64 | pool = Pool(processes=10) 65 | results = [pool.apply_async(test_one_trail, args=(env, policy)) for _ in range(number_of_runs)] 66 | results = [result.get() for result in results] 67 | pool.close() 68 | pool.join() 69 | 70 | policy.train() 71 | 72 | rewards = [result[0] for result in results] 73 | episode_lengths = [result[1] for result in results] 74 | 75 | rew_mean = np.mean(rewards) 76 | rew_std = np.std(rewards) 77 | len_mean = np.mean(episode_lengths) 78 | 79 | 80 | res = OrderedDict() 81 | res["Reward_Mean_Env"] = rew_mean 82 | res["Reward_Std_Env"] = rew_std 83 | res["Length_Mean_Env"] = len_mean 84 | res["Length_Std_Env"] = np.std(episode_lengths) 85 | 86 | return res 87 | -------------------------------------------------------------------------------- /offlinerl/outside_utils/buffer/__init__.py: -------------------------------------------------------------------------------- 1 | from offlinerl.outside_utils.buffer.buffer import ReplayBuffer 2 | 3 | 4 | __all__ = [ 5 | "ReplayBuffer" 6 | ] -------------------------------------------------------------------------------- /offlinerl/outside_utils/buffer/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from typing import Optional, Union, Tuple, Dict 5 | 6 | 7 | class ReplayBuffer: 8 | def __init__( 9 | self, 10 | buffer_size: int, 11 | obs_shape: Tuple, 12 | obs_dtype: np.dtype, 13 | action_dim: int, 14 | action_dtype: np.dtype, 15 | device: str = "cpu" 16 | ) -> None: 17 | self._max_size = buffer_size 18 | self.obs_shape = obs_shape 19 | self.obs_dtype = obs_dtype 20 | self.action_dim = action_dim 21 | self.action_dtype = action_dtype 22 | 23 | self._ptr = 0 24 | self._size = 0 25 | self.observations = np.zeros((self._max_size,) + self.obs_shape, dtype=obs_dtype) 26 | self.next_observations = 
np.zeros((self._max_size,) + self.obs_shape, dtype=obs_dtype) 27 | self.actions = np.zeros((self._max_size, self.action_dim), dtype=action_dtype) 28 | self.rewards = np.zeros((self._max_size, 1), dtype=np.float32) 29 | self.terminals = np.zeros((self._max_size, 1), dtype=np.float32) 30 | 31 | self.device = torch.device(device) 32 | 33 | def add( 34 | self, 35 | obs: np.ndarray, 36 | next_obs: np.ndarray, 37 | action: np.ndarray, 38 | reward: np.ndarray, 39 | terminal: np.ndarray 40 | ) -> None: 41 | # Copy to avoid modification by reference 42 | self.observations[self._ptr] = np.array(obs).copy() 43 | self.next_observations[self._ptr] = np.array(next_obs).copy() 44 | self.actions[self._ptr] = np.array(action).copy() 45 | self.rewards[self._ptr] = np.array(reward).copy() 46 | self.terminals[self._ptr] = np.array(terminal).copy() 47 | 48 | self._ptr = (self._ptr + 1) % self._max_size 49 | self._size = min(self._size + 1, self._max_size) 50 | 51 | def add_batch( 52 | self, 53 | obss: np.ndarray, 54 | next_obss: np.ndarray, 55 | actions: np.ndarray, 56 | rewards: np.ndarray, 57 | terminals: np.ndarray 58 | ) -> None: 59 | batch_size = len(obss) 60 | indexes = np.arange(self._ptr, self._ptr + batch_size) % self._max_size 61 | 62 | self.observations[indexes] = np.array(obss).copy() 63 | self.next_observations[indexes] = np.array(next_obss).copy() 64 | self.actions[indexes] = np.array(actions).copy() 65 | self.rewards[indexes] = np.array(rewards).copy() 66 | self.terminals[indexes] = np.array(terminals).copy() 67 | 68 | self._ptr = (self._ptr + batch_size) % self._max_size 69 | self._size = min(self._size + batch_size, self._max_size) 70 | 71 | def load_dataset(self, dataset: Dict[str, np.ndarray]) -> None: 72 | observations = np.array(dataset["obs"], dtype=self.obs_dtype) 73 | next_observations = np.array(dataset["obs_next"], dtype=self.obs_dtype) 74 | actions = np.array(dataset["act"], dtype=self.action_dtype) 75 | rewards = np.array(dataset["rew"], dtype=np.float32).reshape(-1, 1) 76 | terminals = np.array(dataset["done"], dtype=np.float32).reshape(-1, 1) 77 | 78 | self.observations = observations 79 | self.next_observations = next_observations 80 | self.actions = actions 81 | self.rewards = rewards 82 | self.terminals = terminals 83 | 84 | self._ptr = len(observations) 85 | self._size = len(observations) 86 | 87 | def normalize_obs(self, eps: float = 1e-3, inplace : bool = True) -> Tuple[np.ndarray, np.ndarray]: 88 | mean = self.observations.mean(0, keepdims=True) 89 | std = self.observations.std(0, keepdims=True) + eps 90 | if inplace: 91 | self.observations = (self.observations - mean) / std 92 | self.next_observations = (self.next_observations - mean) / std 93 | obs_mean, obs_std = mean, std 94 | return obs_mean, obs_std 95 | 96 | def sample(self, batch_size: int) -> Dict[str, torch.Tensor]: 97 | 98 | batch_indexes = np.random.randint(0, self._size, size=batch_size) 99 | 100 | return { 101 | "observations": torch.tensor(self.observations[batch_indexes]).to(self.device), 102 | "actions": torch.tensor(self.actions[batch_indexes]).to(self.device), 103 | "next_observations": torch.tensor(self.next_observations[batch_indexes]).to(self.device), 104 | "terminals": torch.tensor(self.terminals[batch_indexes]).to(self.device), 105 | "rewards": torch.tensor(self.rewards[batch_indexes]).to(self.device) 106 | } 107 | 108 | def sample_all(self) -> Dict[str, np.ndarray]: 109 | return { 110 | "observations": self.observations[:self._size].copy(), 111 | "actions": self.actions[:self._size].copy(), 112 | 
"next_observations": self.next_observations[:self._size].copy(), 113 | "terminals": self.terminals[:self._size].copy(), 114 | "rewards": self.rewards[:self._size].copy() 115 | } -------------------------------------------------------------------------------- /offlinerl/outside_utils/dynamics/__init__.py: -------------------------------------------------------------------------------- 1 | from offlinerl.outside_utils.dynamics.base_dynamics import BaseDynamics 2 | from offlinerl.outside_utils.dynamics.ensemble_dynamics import EnsembleDynamics 3 | from offlinerl.outside_utils.dynamics.rnn_dynamics import RNNDynamics 4 | from offlinerl.outside_utils.dynamics.mujoco_oracle_dynamics import MujocoOracleDynamics 5 | 6 | 7 | __all__ = [ 8 | "BaseDynamics", 9 | "EnsembleDynamics", 10 | "RNNDynamics", 11 | "MujocoOracleDynamics" 12 | ] -------------------------------------------------------------------------------- /offlinerl/outside_utils/dynamics/base_dynamics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | from typing import Callable, List, Tuple, Dict 6 | 7 | 8 | class BaseDynamics(object): 9 | def __init__( 10 | self, 11 | model: nn.Module, 12 | optim: torch.optim.Optimizer 13 | ) -> None: 14 | super().__init__() 15 | self.model = model 16 | self.optim = optim 17 | 18 | def step( 19 | self, 20 | obs: np.ndarray, 21 | action: np.ndarray 22 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]: 23 | raise NotImplementedError 24 | -------------------------------------------------------------------------------- /offlinerl/outside_utils/dynamics/ensemble_dynamics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | 6 | from typing import Callable, List, Tuple, Dict, Optional 7 | from offlinerl.outside_utils.dynamics import BaseDynamics 8 | from offlinerl.outside_utils.utils.scaler import StandardScaler 9 | from offlinerl.outside_utils.utils.logger import Logger 10 | import warnings 11 | 12 | 13 | class EnsembleDynamics(BaseDynamics): 14 | def __init__( 15 | self, 16 | model: nn.Module, 17 | optim: torch.optim.Optimizer, 18 | scaler: StandardScaler, 19 | terminal_fn: Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray], 20 | penalty_coef: float = 0.0, 21 | uncertainty_mode: str = "aleatoric", 22 | data_range: tuple = None, 23 | ) -> None: 24 | super().__init__(model, optim) 25 | self.scaler = scaler 26 | self.terminal_fn = terminal_fn 27 | self._penalty_coef = penalty_coef 28 | self._uncertainty_mode = uncertainty_mode 29 | self.obs_min, self.obs_max, self.rew_min, self.rew_max = data_range 30 | 31 | @ torch.no_grad() 32 | def step( 33 | self, 34 | obs: np.ndarray, 35 | action: np.ndarray, 36 | transition_scaler: bool = True, 37 | transition_clip: bool = False, 38 | clip_penalty: bool = False, 39 | max_penalty: float = 0, 40 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]: 41 | "imagine single forward step" 42 | obs_act = np.concatenate([obs, action], axis=-1) 43 | if transition_scaler: 44 | obs_act = self.scaler.transform(obs_act) 45 | mean, logvar = self.model(obs_act) 46 | mean = mean.cpu().numpy() 47 | logvar = logvar.cpu().numpy() 48 | mean[..., :-1] += obs 49 | std = np.sqrt(np.exp(logvar)) 50 | 51 | ensemble_samples = (mean + np.random.normal(size=mean.shape) * std).astype(np.float32) 52 | 53 | # choose one model from ensemble 54 | num_models, batch_size, _ 
= ensemble_samples.shape 55 | model_idxs = self.model.random_elite_idxs(batch_size) 56 | samples = ensemble_samples[model_idxs, np.arange(batch_size)] 57 | 58 | next_obs = samples[..., :-1] 59 | reward = samples[..., -1:] 60 | terminal = self.terminal_fn(obs, action, next_obs) 61 | if transition_clip: 62 | next_obs = np.clip(next_obs, self.obs_min, self.obs_max) 63 | reward = np.clip(reward, self.rew_min, self.rew_max) 64 | 65 | info = {} 66 | info["raw_reward"] = reward 67 | 68 | if self._penalty_coef > 0.0: 69 | norm_mean = mean 70 | norm_std = std 71 | if self._uncertainty_mode == "aleatoric": 72 | penalty = np.amax(np.linalg.norm(norm_std, axis=2), axis=0) 73 | elif self._uncertainty_mode == "pairwise-diff": 74 | next_obses_mean = norm_mean[..., :-1] 75 | next_obs_mean = np.mean(next_obses_mean, axis=0) 76 | diff = next_obses_mean - next_obs_mean 77 | penalty = np.amax(np.linalg.norm(diff, axis=2), axis=0) 78 | elif self._uncertainty_mode == "ensemble_std": 79 | next_obses_mean = norm_mean[..., :-1] 80 | penalty = np.sqrt(next_obses_mean.var(0).mean(1)) 81 | else: 82 | warnings.warn("Invalid uncertainty mode. No penalty applied!!!") 83 | penalty = np.zeros_like(reward).mean(1) 84 | 85 | penalty = np.expand_dims(penalty, 1).astype(np.float32) 86 | if clip_penalty: 87 | penalty = np.clip(penalty, a_max=max_penalty) 88 | assert penalty.shape == reward.shape 89 | reward = reward - self._penalty_coef * penalty 90 | info["penalty"] = penalty 91 | 92 | return next_obs, reward, np.bool_(terminal), info 93 | 94 | @ torch.no_grad() 95 | def sample_next_obss( 96 | self, 97 | obs: torch.Tensor, 98 | action: torch.Tensor, 99 | num_samples: int, 100 | transition_scaler: bool = True, 101 | transition_clip: bool = False, 102 | ) -> torch.Tensor: 103 | obs_act = torch.cat([obs, action], dim=-1) 104 | if transition_scaler: 105 | obs_act = self.scaler.transform_tensor(obs_act) 106 | mean, logvar = self.model(obs_act) 107 | mean[..., :-1] += obs 108 | std = torch.sqrt(torch.exp(logvar)) 109 | 110 | mean = mean[self.model.elites.data.cpu().numpy()] 111 | std = std[self.model.elites.data.cpu().numpy()] 112 | 113 | samples = torch.stack([mean + torch.randn_like(std) * std for i in range(num_samples)], 0) 114 | next_obss = samples[..., :-1] 115 | if transition_clip: 116 | obs_min = torch.as_tensor(self.obs_min).to(next_obss.device) 117 | obs_max = torch.as_tensor(self.obs_max).to(next_obss.device) 118 | next_obss = torch.clamp(next_obss, obs_min, obs_max) 119 | return next_obss 120 | 121 | def format_samples_for_training(self, data: Dict) -> Tuple[np.ndarray, np.ndarray]: 122 | obss = data["observations"] 123 | actions = data["actions"] 124 | next_obss = data["next_observations"] 125 | rewards = data["rewards"] 126 | delta_obss = next_obss - obss 127 | inputs = np.concatenate((obss, actions), axis=-1) 128 | targets = np.concatenate((delta_obss, rewards), axis=-1) 129 | return inputs, targets 130 | 131 | def select_elites(self, metrics: List) -> List[int]: 132 | pairs = [(metric, index) for metric, index in zip(metrics, range(len(metrics)))] 133 | pairs = sorted(pairs, key=lambda x: x[0]) 134 | elites = [pairs[i][1] for i in range(self.model.num_elites)] 135 | return elites 136 | 137 | def save(self, save_path: str) -> None: 138 | torch.save(self.model.state_dict(), os.path.join(save_path, "dynamics.pth")) 139 | self.scaler.save_scaler(save_path) 140 | 141 | def load(self, load_path: str) -> None: 142 | self.model.load_state_dict(torch.load(os.path.join(load_path, "dynamics.pth"), 
map_location=self.model.device)) 143 | self.scaler.load_scaler(load_path) 144 | -------------------------------------------------------------------------------- /offlinerl/outside_utils/dynamics/mujoco_oracle_dynamics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym.envs.mujoco import mujoco_env 4 | from typing import Callable, List, Tuple, Dict 5 | 6 | 7 | class MujocoOracleDynamics(object): 8 | def __init__(self, env: mujoco_env.MujocoEnv) -> None: 9 | self.env = env 10 | 11 | def _set_state_from_obs(self, obs:np.ndarray) -> None: 12 | if len(obs) == (self.env.model.nq + self.env.model.nv - 1): 13 | xpos = np.zeros(1) 14 | obs = np.concatenate([xpos, obs]) 15 | qpos = obs[:self.env.model.nq] 16 | qvel = obs[self.env.model.nq:] 17 | self.env._elapsed_steps = 0 18 | self.env.set_state(qpos, qvel) 19 | 20 | def step( 21 | self, 22 | obs: np.ndarray, 23 | action: np.ndarray 24 | ) -> Tuple[np.ndarray, float, bool, Dict]: 25 | if (len(obs.shape) > 1) or (len(action.shape) > 1): 26 | raise ValueError 27 | self._set_state_from_obs(obs) 28 | next_obs, reward, terminal, info = self.env.step(action) 29 | return next_obs, reward, terminal, info -------------------------------------------------------------------------------- /offlinerl/outside_utils/dynamics/rnn_dynamics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | 6 | from typing import Callable, List, Tuple, Dict 7 | from torch.utils.data.dataloader import DataLoader 8 | from offlinerl.outside_utils.dynamics import BaseDynamics 9 | from offlinerl.outside_utils.utils.scaler import StandardScaler 10 | from offlinerl.outside_utils.utils.logger import Logger 11 | 12 | 13 | class RNNDynamics(BaseDynamics): 14 | def __init__( 15 | self, 16 | model: nn.Module, 17 | optim: torch.optim.Optimizer, 18 | scaler: StandardScaler, 19 | terminal_fn: Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray], 20 | ) -> None: 21 | super().__init__(model, optim) 22 | self.scaler = scaler 23 | self.terminal_fn = terminal_fn 24 | 25 | @ torch.no_grad() 26 | def step( 27 | self, 28 | obss: np.ndarray, 29 | actions: np.ndarray 30 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]: 31 | "imagine single forward step" 32 | inputs = np.concatenate([obss, actions], axis=-1) 33 | inputs = self.scaler.transform(inputs) 34 | preds, _ = self.model(inputs) 35 | # get last timestep pred 36 | preds = preds[:, -1] 37 | next_obss = preds[..., :-1].cpu().numpy() + obss[:, -1] 38 | rewards = preds[..., -1:].cpu().numpy() 39 | 40 | terminals = self.terminal_fn(obss[:, -1], actions[:, -1], next_obss) 41 | info = {} 42 | 43 | return next_obss, rewards, terminals, info 44 | 45 | def train(self, data: Dict, batch_size: int, max_iters: int, logger: Logger) -> None: 46 | self.model.train() 47 | loader = DataLoader(data, shuffle=True, batch_size=batch_size) 48 | for iter in range(max_iters): 49 | for batch in loader: 50 | train_loss = self.learn(batch) 51 | logger.logkv_mean("loss/model", train_loss) 52 | 53 | logger.set_timestep(iter) 54 | logger.dumpkvs(exclude=["policy_training_progress"]) 55 | self.save(logger.model_dir) 56 | self.model.eval() 57 | 58 | def learn(self, batch) -> float: 59 | inputs, targets, masks = batch 60 | preds, _ = self.model.forward(inputs) 61 | 62 | loss = (((preds - targets) ** 2).mean(-1) * masks).mean() 63 | 64 | self.optim.zero_grad() 65 | loss.backward() 66 | 
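        # NOTE: the per-timestep squared error above is weighted by `masks` so
        # that padded steps of variable-length sequences do not contribute to
        # the loss; the gradients from loss.backward() are applied by the
        # optimizer step below.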
self.optim.step() 67 | 68 | return loss.item() 69 | 70 | def save(self, save_path: str) -> None: 71 | torch.save(self.model.state_dict(), os.path.join(save_path, "dynamics.pth")) 72 | self.scaler.save_scaler(save_path) 73 | 74 | def load(self, load_path: str) -> None: 75 | self.model.load_state_dict(torch.load(os.path.join(load_path, "dynamics.pth"), map_location=self.model.device)) 76 | self.scaler.load_scaler(load_path) -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from offlinerl.outside_utils.modules.actor_module import Actor, ActorProb 2 | from offlinerl.outside_utils.modules.critic_module import Critic 3 | from offlinerl.outside_utils.modules.ensemble_critic_module import EnsembleCritic 4 | from offlinerl.outside_utils.modules.dist_module import DiagGaussian, TanhDiagGaussian 5 | from offlinerl.outside_utils.modules.dynamics_module import EnsembleDynamicsModel 6 | 7 | 8 | __all__ = [ 9 | "Actor", 10 | "ActorProb", 11 | "Critic", 12 | "EnsembleCritic", 13 | "DiagGaussian", 14 | "TanhDiagGaussian", 15 | "EnsembleDynamicsModel" 16 | ] -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/actor_module.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | from typing import Union, Optional 6 | 7 | 8 | # for SAC 9 | class ActorProb(nn.Module): 10 | def __init__( 11 | self, 12 | backbone: nn.Module, 13 | dist_net: nn.Module, 14 | device: str = "cpu" 15 | ) -> None: 16 | super().__init__() 17 | 18 | self.device = torch.device(device) 19 | self.backbone = backbone.to(device) 20 | self.dist_net = dist_net.to(device) 21 | self.scaler = None 22 | 23 | def set_scaler(self, scaler): 24 | self.scaler = scaler 25 | 26 | def forward(self, obs: Union[np.ndarray, torch.Tensor]) -> torch.distributions.Normal: 27 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32) 28 | logits = self.backbone(obs) 29 | dist = self.dist_net(logits) 30 | return dist 31 | 32 | def get_action(self, obs: Union[np.ndarray, torch.Tensor]) -> torch.Tensor: 33 | if self.scaler is not None: 34 | obs = self.scaler.transform(obs) 35 | dist = self.forward(obs) 36 | action, _ = dist.mode() 37 | return action.detach().cpu().numpy() 38 | 39 | def to(self, device: str) -> None: 40 | self.device = torch.device(device) 41 | self.backbone.to(device) 42 | self.dist_net.to(device) 43 | return self 44 | 45 | 46 | # for TD3 47 | class Actor(nn.Module): 48 | def __init__( 49 | self, 50 | backbone: nn.Module, 51 | action_dim: int, 52 | max_action: float = 1.0, 53 | device: str = "cpu" 54 | ) -> None: 55 | super().__init__() 56 | 57 | self.device = torch.device(device) 58 | self.backbone = backbone.to(device) 59 | latent_dim = getattr(backbone, "output_dim") 60 | output_dim = action_dim 61 | self.last = nn.Linear(latent_dim, output_dim).to(device) 62 | self._max = max_action 63 | 64 | def forward(self, obs: Union[np.ndarray, torch.Tensor]) -> torch.Tensor: 65 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32) 66 | logits = self.backbone(obs) 67 | actions = self._max * torch.tanh(self.last(logits)) 68 | return actions -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/critic_module.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | from typing import Union, Optional 6 | 7 | 8 | class Critic(nn.Module): 9 | def __init__(self, backbone: nn.Module, device: str = "cpu") -> None: 10 | super().__init__() 11 | 12 | self.device = torch.device(device) 13 | self.backbone = backbone.to(device) 14 | latent_dim = getattr(backbone, "output_dim") 15 | self.last = nn.Linear(latent_dim, 1).to(device) 16 | 17 | def forward( 18 | self, 19 | obs: Union[np.ndarray, torch.Tensor], 20 | actions: Optional[Union[np.ndarray, torch.Tensor]] = None 21 | ) -> torch.Tensor: 22 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32) 23 | if actions is not None: 24 | actions = torch.as_tensor(actions, device=self.device, dtype=torch.float32).flatten(1) 25 | obs = torch.cat([obs, actions], dim=1) 26 | logits = self.backbone(obs) 27 | values = self.last(logits) 28 | return values -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/dist_module.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class NormalWrapper(torch.distributions.Normal): 7 | def log_prob(self, actions): 8 | return super().log_prob(actions).sum(-1, keepdim=True) 9 | 10 | def entropy(self): 11 | return super().entropy().sum(-1) 12 | 13 | def mode(self): 14 | return self.mean 15 | 16 | 17 | class TanhNormalWrapper(torch.distributions.Normal): 18 | def log_prob(self, action, raw_action=None): 19 | if raw_action is None: 20 | raw_action = self.arctanh(action) 21 | log_prob = super().log_prob(raw_action).sum(-1, keepdim=True) 22 | eps = 1e-6 23 | log_prob = log_prob - torch.log((1 - action.pow(2)) + eps).sum(-1, keepdim=True) 24 | return log_prob 25 | 26 | def mode(self): 27 | raw_action = self.mean 28 | action = torch.tanh(self.mean) 29 | return action, raw_action 30 | 31 | def arctanh(self, x): 32 | one_plus_x = (1 + x).clamp(min=1e-6) 33 | one_minus_x = (1 - x).clamp(min=1e-6) 34 | return 0.5 * torch.log(one_plus_x / one_minus_x) 35 | 36 | def rsample(self): 37 | raw_action = super().rsample() 38 | action = torch.tanh(raw_action) 39 | return action, raw_action 40 | 41 | 42 | class DiagGaussian(nn.Module): 43 | def __init__( 44 | self, 45 | latent_dim, 46 | output_dim, 47 | unbounded=False, 48 | conditioned_sigma=False, 49 | max_mu=1.0, 50 | sigma_min=-5.0, 51 | sigma_max=2.0 52 | ): 53 | super().__init__() 54 | self.mu = nn.Linear(latent_dim, output_dim) 55 | self._c_sigma = conditioned_sigma 56 | if conditioned_sigma: 57 | self.sigma = nn.Linear(latent_dim, output_dim) 58 | else: 59 | self.sigma_param = nn.Parameter(torch.zeros(output_dim, 1)) 60 | self._unbounded = unbounded 61 | self._max = max_mu 62 | self._sigma_min = sigma_min 63 | self._sigma_max = sigma_max 64 | 65 | def forward(self, logits): 66 | mu = self.mu(logits) 67 | if not self._unbounded: 68 | mu = self._max * torch.tanh(mu) 69 | if self._c_sigma: 70 | sigma = torch.clamp(self.sigma(logits), min=self._sigma_min, max=self._sigma_max).exp() 71 | else: 72 | shape = [1] * len(mu.shape) 73 | shape[1] = -1 74 | sigma = (self.sigma_param.view(shape) + torch.zeros_like(mu)).exp() 75 | return NormalWrapper(mu, sigma) 76 | 77 | 78 | class TanhDiagGaussian(DiagGaussian): 79 | def __init__( 80 | self, 81 | latent_dim, 82 | output_dim, 83 | unbounded=False, 
84 | conditioned_sigma=False, 85 | max_mu=1.0, 86 | sigma_min=-5.0, 87 | sigma_max=2.0 88 | ): 89 | super().__init__( 90 | latent_dim=latent_dim, 91 | output_dim=output_dim, 92 | unbounded=unbounded, 93 | conditioned_sigma=conditioned_sigma, 94 | max_mu=max_mu, 95 | sigma_min=sigma_min, 96 | sigma_max=sigma_max 97 | ) 98 | 99 | def forward(self, logits): 100 | mu = self.mu(logits) 101 | if not self._unbounded: 102 | mu = self._max * torch.tanh(mu) 103 | if self._c_sigma: 104 | sigma = torch.clamp(self.sigma(logits), min=self._sigma_min, max=self._sigma_max).exp() 105 | else: 106 | shape = [1] * len(mu.shape) 107 | shape[1] = -1 108 | sigma = (self.sigma_param.view(shape) + torch.zeros_like(mu)).exp() 109 | return TanhNormalWrapper(mu, sigma) 110 | -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/dynamics_module.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | from typing import Dict, List, Union, Tuple, Optional 6 | from offlinerl.outside_utils.nets import EnsembleLinear 7 | 8 | 9 | class Swish(nn.Module): 10 | def __init__(self) -> None: 11 | super(Swish, self).__init__() 12 | 13 | def forward(self, x: torch.Tensor) -> torch.Tensor: 14 | x = x * torch.sigmoid(x) 15 | return x 16 | 17 | 18 | def soft_clamp( 19 | x : torch.Tensor, 20 | _min: Optional[torch.Tensor] = None, 21 | _max: Optional[torch.Tensor] = None 22 | ) -> torch.Tensor: 23 | # clamp tensor values while mataining the gradient 24 | if _max is not None: 25 | x = _max - F.softplus(_max - x) 26 | if _min is not None: 27 | x = _min + F.softplus(x - _min) 28 | return x 29 | 30 | 31 | class EnsembleDynamicsModel(nn.Module): 32 | def __init__( 33 | self, 34 | obs_dim: int, 35 | action_dim: int, 36 | hidden_dims: Union[List[int], Tuple[int]], 37 | num_ensemble: int = 7, 38 | num_elites: int = 5, 39 | activation: nn.Module = Swish, 40 | weight_decays: Optional[Union[List[float], Tuple[float]]] = None, 41 | with_reward: bool = True, 42 | device: str = "cpu" 43 | ) -> None: 44 | super().__init__() 45 | 46 | self.num_ensemble = num_ensemble 47 | self.num_elites = num_elites 48 | self._with_reward = with_reward 49 | self.device = torch.device(device) 50 | 51 | self.activation = activation() 52 | 53 | assert len(weight_decays) == (len(hidden_dims) + 1) 54 | 55 | module_list = [] 56 | hidden_dims = [obs_dim+action_dim] + list(hidden_dims) 57 | if weight_decays is None: 58 | weight_decays = [0.0] * (len(hidden_dims) + 1) 59 | for in_dim, out_dim, weight_decay in zip(hidden_dims[:-1], hidden_dims[1:], weight_decays[:-1]): 60 | module_list.append(EnsembleLinear(in_dim, out_dim, num_ensemble, weight_decay)) 61 | self.backbones = nn.ModuleList(module_list) 62 | 63 | self.output_layer = EnsembleLinear( 64 | hidden_dims[-1], 65 | 2 * (obs_dim + self._with_reward), 66 | num_ensemble, 67 | weight_decays[-1] 68 | ) 69 | 70 | self.register_parameter( 71 | "max_logvar", 72 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * 0.5, requires_grad=True) 73 | ) 74 | self.register_parameter( 75 | "min_logvar", 76 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * -10, requires_grad=True) 77 | ) 78 | 79 | self.register_parameter( 80 | "elites", 81 | nn.Parameter(torch.tensor(list(range(0, self.num_elites))), requires_grad=False) 82 | ) 83 | 84 | self.to(self.device) 85 | 86 | def forward(self, obs_action: np.ndarray) -> Tuple[torch.Tensor, 
torch.Tensor]: 87 | if isinstance(obs_action, np.ndarray): 88 | obs_action = torch.as_tensor(obs_action, dtype=torch.float32).to(self.device) 89 | output = obs_action 90 | for layer in self.backbones: 91 | output = self.activation(layer(output)) 92 | mean, logvar = torch.chunk(self.output_layer(output), 2, dim=-1) 93 | logvar = soft_clamp(logvar, self.min_logvar, self.max_logvar) 94 | return mean, logvar 95 | 96 | def load_save(self) -> None: 97 | for layer in self.backbones: 98 | layer.load_save() 99 | self.output_layer.load_save() 100 | 101 | def update_save(self, indexes: List[int]) -> None: 102 | for layer in self.backbones: 103 | layer.update_save(indexes) 104 | self.output_layer.update_save(indexes) 105 | 106 | def get_decay_loss(self) -> torch.Tensor: 107 | decay_loss = 0 108 | for layer in self.backbones: 109 | decay_loss += layer.get_decay_loss() 110 | decay_loss += self.output_layer.get_decay_loss() 111 | return decay_loss 112 | 113 | def set_elites(self, indexes: List[int]) -> None: 114 | assert len(indexes) <= self.num_ensemble and max(indexes) < self.num_ensemble 115 | self.register_parameter('elites', nn.Parameter(torch.tensor(indexes), requires_grad=False)) 116 | 117 | def random_elite_idxs(self, batch_size: int) -> np.ndarray: 118 | idxs = np.random.choice(self.elites.data.cpu().numpy(), size=batch_size) 119 | return idxs -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/ensemble_critic_module.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | from typing import Union, Optional, List, Tuple 6 | 7 | from offlinerl.outside_utils.nets import EnsembleLinear 8 | 9 | 10 | class EnsembleCritic(nn.Module): 11 | def __init__( 12 | self, 13 | obs_dim: int, 14 | action_dim: int, 15 | hidden_dims: Union[List[int], Tuple[int]], 16 | activation: nn.Module = nn.ReLU, 17 | num_ensemble: int = 10, 18 | device: str = "cpu" 19 | ) -> None: 20 | super().__init__() 21 | input_dim = obs_dim + action_dim 22 | hidden_dims = [input_dim] + list(hidden_dims) 23 | model = [] 24 | for in_dim, out_dim in zip(hidden_dims[:-1], hidden_dims[1:]): 25 | model += [EnsembleLinear(in_dim, out_dim, num_ensemble), activation()] 26 | model.append(EnsembleLinear(hidden_dims[-1], 1, num_ensemble)) 27 | self.model = nn.Sequential(*model) 28 | 29 | self.device = torch.device(device) 30 | self.model = self.model.to(device) 31 | self._num_ensemble = num_ensemble 32 | 33 | def forward( 34 | self, 35 | obs: Union[np.ndarray, torch.Tensor], 36 | actions: Optional[Union[np.ndarray, torch.Tensor]] = None 37 | ) -> torch.Tensor: 38 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32) 39 | if actions is not None: 40 | actions = torch.as_tensor(actions, device=self.device, dtype=torch.float32) 41 | obs = torch.cat([obs, actions], dim=-1) 42 | values = self.model(obs) 43 | # values: [num_ensemble, batch_size, 1] 44 | return values -------------------------------------------------------------------------------- /offlinerl/outside_utils/nets/__init__.py: -------------------------------------------------------------------------------- 1 | from offlinerl.outside_utils.nets.mlp import MLP 2 | from offlinerl.outside_utils.nets.vae import VAE 3 | from offlinerl.outside_utils.nets.ensemble_linear import EnsembleLinear 4 | from offlinerl.outside_utils.nets.rnn import RNNModel 5 | 6 | 7 | __all__ = [ 8 | "MLP", 9 | 
"VAE", 10 | "EnsembleLinear", 11 | "RNNModel" 12 | ] -------------------------------------------------------------------------------- /offlinerl/outside_utils/nets/ensemble_linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | from typing import Dict, List, Union, Tuple, Optional 6 | 7 | 8 | class EnsembleLinear(nn.Module): 9 | def __init__( 10 | self, 11 | input_dim: int, 12 | output_dim: int, 13 | num_ensemble: int, 14 | weight_decay: float = 0.0 15 | ) -> None: 16 | super().__init__() 17 | 18 | self.num_ensemble = num_ensemble 19 | 20 | self.register_parameter("weight", nn.Parameter(torch.zeros(num_ensemble, input_dim, output_dim))) 21 | self.register_parameter("bias", nn.Parameter(torch.zeros(num_ensemble, 1, output_dim))) 22 | 23 | nn.init.trunc_normal_(self.weight, std=1/(2*input_dim**0.5)) 24 | 25 | self.register_parameter("saved_weight", nn.Parameter(self.weight.detach().clone())) 26 | self.register_parameter("saved_bias", nn.Parameter(self.bias.detach().clone())) 27 | 28 | self.weight_decay = weight_decay 29 | 30 | def forward(self, x: torch.Tensor) -> torch.Tensor: 31 | weight = self.weight 32 | bias = self.bias 33 | 34 | if len(x.shape) == 2: 35 | x = torch.einsum('ij,bjk->bik', x, weight) 36 | else: 37 | x = torch.einsum('bij,bjk->bik', x, weight) 38 | 39 | x = x + bias 40 | 41 | return x 42 | 43 | def load_save(self) -> None: 44 | self.weight.data.copy_(self.saved_weight.data) 45 | self.bias.data.copy_(self.saved_bias.data) 46 | 47 | def update_save(self, indexes: List[int]) -> None: 48 | self.saved_weight.data[indexes] = self.weight.data[indexes] 49 | self.saved_bias.data[indexes] = self.bias.data[indexes] 50 | 51 | def get_decay_loss(self) -> torch.Tensor: 52 | decay_loss = self.weight_decay * (0.5*((self.weight**2).sum())) 53 | return decay_loss -------------------------------------------------------------------------------- /offlinerl/outside_utils/nets/mlp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | from torch.nn import functional as F 6 | from typing import Dict, List, Union, Tuple, Optional 7 | 8 | 9 | class MLP(nn.Module): 10 | def __init__( 11 | self, 12 | input_dim: int, 13 | hidden_dims: Union[List[int], Tuple[int]], 14 | output_dim: Optional[int] = None, 15 | activation: nn.Module = nn.ReLU, 16 | dropout_rate: Optional[float] = None 17 | ) -> None: 18 | super().__init__() 19 | hidden_dims = [input_dim] + list(hidden_dims) 20 | model = [] 21 | for in_dim, out_dim in zip(hidden_dims[:-1], hidden_dims[1:]): 22 | model += [nn.Linear(in_dim, out_dim), activation()] 23 | if dropout_rate is not None: 24 | model += [nn.Dropout(p=dropout_rate)] 25 | 26 | self.output_dim = hidden_dims[-1] 27 | if output_dim is not None: 28 | model += [nn.Linear(hidden_dims[-1], output_dim)] 29 | self.output_dim = output_dim 30 | self.model = nn.Sequential(*model) 31 | 32 | def forward(self, x: torch.Tensor) -> torch.Tensor: 33 | return self.model(x) -------------------------------------------------------------------------------- /offlinerl/outside_utils/nets/rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | 5 | 6 | class Swish(nn.Module): 7 | def __init__(self): 8 | super(Swish, self).__init__() 9 | 10 | def 
forward(self, x): 11 | x = x * torch.sigmoid(x) 12 | return x 13 | 14 | 15 | def soft_clamp(x : torch.Tensor, _min=None, _max=None): 16 | # clamp tensor values while mataining the gradient 17 | if _max is not None: 18 | x = _max - F.softplus(_max - x) 19 | if _min is not None: 20 | x = _min + F.softplus(x - _min) 21 | return x 22 | 23 | 24 | class ResBlock(nn.Module): 25 | def __init__( 26 | self, 27 | input_dim, 28 | output_dim, 29 | activation=Swish(), 30 | layer_norm=True, 31 | with_residual=True, 32 | dropout=0.1 33 | ): 34 | super().__init__() 35 | 36 | self.linear = nn.Linear(input_dim, output_dim) 37 | self.activation = activation 38 | self.layer_norm = nn.LayerNorm(output_dim) if layer_norm else None 39 | self.dropout = nn.Dropout(dropout) if dropout else None 40 | self.with_residual = with_residual 41 | 42 | def forward(self, x): 43 | y = self.activation(self.linear(x)) 44 | if self.dropout is not None: 45 | y = self.dropout(y) 46 | if self.with_residual: 47 | y = x + y 48 | if self.layer_norm is not None: 49 | y = self.layer_norm(y) 50 | return y 51 | 52 | 53 | class RNNModel(nn.Module): 54 | def __init__( 55 | self, 56 | input_dim, 57 | output_dim, 58 | hidden_dims=[200, 200, 200, 200], 59 | rnn_num_layers=3, 60 | dropout_rate=0.1, 61 | device="cpu" 62 | ): 63 | super().__init__() 64 | self.input_dim = input_dim 65 | self.hidden_dims = hidden_dims 66 | self.output_dim = output_dim 67 | self.device = torch.device(device) 68 | 69 | self.activation = Swish() 70 | self.rnn_layer = nn.GRU( 71 | input_size=input_dim, 72 | hidden_size=hidden_dims[0], 73 | num_layers=rnn_num_layers, 74 | batch_first=True 75 | ) 76 | module_list = [] 77 | self.input_layer = ResBlock(input_dim, hidden_dims[0], dropout=dropout_rate, with_residual=False) 78 | dims = list(hidden_dims) 79 | for in_dim, out_dim in zip(dims[:-1], dims[1:]): 80 | module_list.append(ResBlock(in_dim, out_dim, dropout=dropout_rate)) 81 | self.backbones = nn.ModuleList(module_list) 82 | self.merge_layer = nn.Linear(dims[0] + dims[-1], hidden_dims[0]) 83 | self.output_layer = nn.Linear(hidden_dims[-1], output_dim) 84 | 85 | self.to(self.device) 86 | 87 | def forward(self, input, h_state=None): 88 | batch_size, num_timesteps, _ = input.shape 89 | input = torch.as_tensor(input, dtype=torch.float32).to(self.device) 90 | rnn_output, h_state = self.rnn_layer(input, h_state) 91 | rnn_output = rnn_output.reshape(-1, self.hidden_dims[0]) 92 | input = input.view(-1, self.input_dim) 93 | output = self.input_layer(input) 94 | output = torch.cat([output, rnn_output], dim=-1) 95 | output = self.activation(self.merge_layer(output)) 96 | for layer in self.backbones: 97 | output = layer(output) 98 | output = self.output_layer(output) 99 | output = output.view(batch_size, num_timesteps, -1) 100 | return output, h_state 101 | 102 | 103 | if __name__ == "__main__": 104 | model = RNNModel(14, 12) 105 | x = torch.randn(64, 20, 14) 106 | y, _ = model(x) 107 | print(y.shape) -------------------------------------------------------------------------------- /offlinerl/outside_utils/nets/vae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | from typing import Dict, List, Union, Tuple, Optional 5 | 6 | 7 | # Vanilla Variational Auto-Encoder 8 | class VAE(nn.Module): 9 | def __init__( 10 | self, 11 | input_dim: int, 12 | output_dim: int, 13 | hidden_dim: int, 14 | latent_dim: int, 15 | max_action: Union[int, float], 16 | device: str = "cpu" 
17 | ) -> None: 18 | super(VAE, self).__init__() 19 | self.e1 = nn.Linear(input_dim + output_dim, hidden_dim) 20 | self.e2 = nn.Linear(hidden_dim, hidden_dim) 21 | 22 | self.mean = nn.Linear(hidden_dim, latent_dim) 23 | self.log_std = nn.Linear(hidden_dim, latent_dim) 24 | 25 | self.d1 = nn.Linear(input_dim + latent_dim, hidden_dim) 26 | self.d2 = nn.Linear(hidden_dim, hidden_dim) 27 | self.d3 = nn.Linear(hidden_dim, output_dim) 28 | 29 | self.max_action = max_action 30 | self.latent_dim = latent_dim 31 | self.device = torch.device(device) 32 | 33 | self.to(device=self.device) 34 | 35 | 36 | def forward( 37 | self, 38 | obs: torch.Tensor, 39 | action: torch.Tensor 40 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 41 | z = F.relu(self.e1(torch.cat([obs, action], 1))) 42 | z = F.relu(self.e2(z)) 43 | 44 | mean = self.mean(z) 45 | # Clamped for numerical stability 46 | log_std = self.log_std(z).clamp(-4, 15) 47 | std = torch.exp(log_std) 48 | z = mean + std * torch.randn_like(std) 49 | 50 | u = self.decode(obs, z) 51 | 52 | return u, mean, std 53 | 54 | def decode(self, obs: torch.Tensor, z: Optional[torch.Tensor] = None) -> torch.Tensor: 55 | # When sampling from the VAE, the latent vector is clipped to [-0.5, 0.5] 56 | if z is None: 57 | z = torch.randn((obs.shape[0], self.latent_dim)).to(self.device).clamp(-0.5,0.5) 58 | 59 | a = F.relu(self.d1(torch.cat([obs, z], 1))) 60 | a = F.relu(self.d2(a)) 61 | return self.max_action * torch.tanh(self.d3(a)) -------------------------------------------------------------------------------- /offlinerl/outside_utils/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/outside_utils/utils/__init__.py -------------------------------------------------------------------------------- /offlinerl/outside_utils/utils/scaler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os.path as path 3 | import torch 4 | 5 | 6 | class StandardScaler(object): 7 | def __init__(self, mu=None, std=None): 8 | self.mu = mu 9 | self.std = std 10 | 11 | def fit(self, data): 12 | """Runs two ops, one for assigning the mean of the data to the internal mean, and 13 | another for assigning the standard deviation of the data to the internal standard deviation. 14 | This function must be called within a 'with .as_default()' block. 15 | 16 | Arguments: 17 | data (np.ndarray): A numpy array containing the input 18 | 19 | Returns: None. 20 | """ 21 | self.mu = np.mean(data, axis=0, keepdims=True) 22 | self.std = np.std(data, axis=0, keepdims=True) 23 | self.std[self.std < 1e-12] = 1.0 24 | 25 | def transform(self, data): 26 | """Transforms the input matrix data using the parameters of this scaler. 27 | 28 | Arguments: 29 | data (np.array): A numpy array containing the points to be transformed. 30 | 31 | Returns: (np.array) The transformed dataset. 32 | """ 33 | if isinstance(data, torch.Tensor): 34 | data = data.cpu().numpy() 35 | return (data - self.mu) / self.std 36 | 37 | def inverse_transform(self, data): 38 | """Undoes the transformation performed by this scaler. 39 | 40 | Arguments: 41 | data (np.array): A numpy array containing the points to be transformed. 42 | 43 | Returns: (np.array) The transformed dataset. 
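        Example (illustrative round trip with assumed toy data):
            >>> scaler = StandardScaler()
            >>> scaler.fit(np.array([[0.0, 2.0], [2.0, 6.0]]))   # mu=[1, 4], std=[1, 2]
            >>> x = scaler.transform(np.array([[1.0, 4.0]]))     # -> [[0., 0.]]
            >>> scaler.inverse_transform(x)                      # -> [[1., 4.]]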
44 | """ 45 | if isinstance(data, torch.Tensor): 46 | data = data.cpu().numpy() 47 | return self.std * data + self.mu 48 | 49 | def save_scaler(self, save_path, surfix=""): 50 | mu_path = path.join(save_path, surfix+"mu.npy") 51 | std_path = path.join(save_path, surfix+"std.npy") 52 | np.save(mu_path, self.mu) 53 | np.save(std_path, self.std) 54 | 55 | def load_scaler(self, load_path, surfix=""): 56 | mu_path = path.join(load_path, surfix+"mu.npy") 57 | std_path = path.join(load_path, surfix+"std.npy") 58 | self.mu = np.load(mu_path) 59 | self.std = np.load(std_path) 60 | 61 | def transform_tensor(self, data: torch.Tensor): 62 | device = data.device 63 | data = self.transform(data.cpu().numpy()) 64 | data = torch.tensor(data, device=device) 65 | return data 66 | 67 | def inverse_transform_to_array(self, data: torch.Tensor): 68 | device = data.device 69 | data = self.inverse_transform(data.cpu().numpy()) 70 | # data = torch.tensor(data, device=device) 71 | return data -------------------------------------------------------------------------------- /offlinerl/outside_utils/utils/termination_fns.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def obs_unnormalization(termination_fn, obs_mean, obs_std): 4 | def thunk(obs, act, next_obs): 5 | obs = obs*obs_std + obs_mean 6 | next_obs = next_obs*obs_std + obs_mean 7 | return termination_fn(obs, act, next_obs) 8 | return thunk 9 | 10 | def termination_fn_halfcheetah(obs, act, next_obs): 11 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 12 | 13 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) 14 | done = ~not_done 15 | done = done[:, None] 16 | return done 17 | 18 | def termination_fn_hopper(obs, act, next_obs): 19 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 20 | 21 | height = next_obs[:, 0] 22 | angle = next_obs[:, 1] 23 | not_done = np.isfinite(next_obs).all(axis=-1) \ 24 | * np.abs(next_obs[:,1:] < 100).all(axis=-1) \ 25 | * (height > .7) \ 26 | * (np.abs(angle) < .2) 27 | 28 | done = ~not_done 29 | done = done[:,None] 30 | return done 31 | 32 | def termination_fn_halfcheetahveljump(obs, act, next_obs): 33 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 34 | 35 | done = np.array([False]).repeat(len(obs)) 36 | done = done[:,None] 37 | return done 38 | 39 | def termination_fn_antangle(obs, act, next_obs): 40 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 41 | 42 | x = next_obs[:, 0] 43 | not_done = np.isfinite(next_obs).all(axis=-1) \ 44 | * (x >= 0.2) \ 45 | * (x <= 1.0) 46 | 47 | done = ~not_done 48 | done = done[:,None] 49 | return done 50 | 51 | def termination_fn_ant(obs, act, next_obs): 52 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 53 | 54 | x = next_obs[:, 0] 55 | not_done = np.isfinite(next_obs).all(axis=-1) \ 56 | * (x >= 0.2) \ 57 | * (x <= 1.0) 58 | 59 | done = ~not_done 60 | done = done[:,None] 61 | return done 62 | 63 | def termination_fn_walker2d(obs, act, next_obs): 64 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 65 | 66 | height = next_obs[:, 0] 67 | angle = next_obs[:, 1] 68 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) \ 69 | * (height > 0.8) \ 70 | * (height < 2.0) \ 71 | * (angle > -1.0) \ 72 | * (angle < 1.0) 73 | done = ~not_done 74 | done = done[:,None] 75 | return done 76 | 77 | def termination_fn_point2denv(obs, act, 
next_obs): 78 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 79 | 80 | done = np.array([False]).repeat(len(obs)) 81 | done = done[:,None] 82 | return done 83 | 84 | def termination_fn_point2dwallenv(obs, act, next_obs): 85 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 86 | 87 | done = np.array([False]).repeat(len(obs)) 88 | done = done[:,None] 89 | return done 90 | 91 | def termination_fn_pendulum(obs, act, next_obs): 92 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 93 | 94 | done = np.zeros((len(obs), 1)) 95 | return done 96 | 97 | def termination_fn_humanoid(obs, act, next_obs): 98 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 99 | 100 | z = next_obs[:,0] 101 | done = (z < 1.0) + (z > 2.0) 102 | 103 | done = done[:,None] 104 | return done 105 | 106 | def termination_fn_pen(obs, act, next_obs): 107 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 108 | 109 | obj_pos = next_obs[:, 24:27] 110 | done = obj_pos[:, 2] < 0.075 111 | 112 | done = done[:,None] 113 | return done 114 | 115 | def terminaltion_fn_door(obs, act, next_obs): 116 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 117 | 118 | done = np.array([False] * obs.shape[0]) 119 | 120 | done = done[:, None] 121 | return done 122 | 123 | def get_termination_fn(task): 124 | if 'halfcheetahvel' in task: 125 | return termination_fn_halfcheetahveljump 126 | elif 'halfcheetah' in task: 127 | return termination_fn_halfcheetah 128 | elif 'hopper' in task: 129 | return termination_fn_hopper 130 | elif 'antangle' in task: 131 | return termination_fn_antangle 132 | elif 'ant' in task: 133 | return termination_fn_ant 134 | elif 'walker2d' in task: 135 | return termination_fn_walker2d 136 | elif 'point2denv' in task: 137 | return termination_fn_point2denv 138 | elif 'point2dwallenv' in task: 139 | return termination_fn_point2dwallenv 140 | elif 'pendulum' in task: 141 | return termination_fn_pendulum 142 | elif 'humanoid' in task: 143 | return termination_fn_humanoid 144 | elif 'pen' in task: 145 | return termination_fn_pen 146 | elif 'door' in task: 147 | return terminaltion_fn_door 148 | else: 149 | raise np.zeros 150 | -------------------------------------------------------------------------------- /offlinerl/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/utils/__init__.py -------------------------------------------------------------------------------- /offlinerl/utils/config.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | 4 | del_attr = ["function", "module"] 5 | 6 | def parse_config(cfg_module): 7 | args = [ i for i in dir(cfg_module) if not i.startswith("__")] 8 | 9 | config = OrderedDict() 10 | for arg in args: 11 | k = arg 12 | v = getattr(cfg_module, arg) 13 | if type(v).__name__ in del_attr and k != "device": 14 | continue 15 | else: 16 | config[k] = v 17 | 18 | 19 | return config 20 | -------------------------------------------------------------------------------- /offlinerl/utils/env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from typing import Tuple 4 | 5 | def create_terminal_function(): 6 | return terminal_function 7 | 8 | def get_env(task : str) -> gym.Env: 9 | try: 10 | if 
task in ['Pipeline', 'Simglucose', 'RocketRecovery', 'RandomFrictionHopper', 'DMSD', 'Fusion', 'Salespromotion', 'SafetyHalfCheetah']: 11 | import neorl2 12 | import gymnasium as gym 13 | env = gym.make(task) 14 | elif task.startswith("HalfCheetah-v3"): 15 | import neorl 16 | env = neorl.make("HalfCheetah-v3") 17 | elif task.startswith("Hopper-v3"): 18 | import neorl 19 | env = neorl.make("Hopper-v3") 20 | elif task.startswith("Walker2d-v3"): 21 | import neorl 22 | env = neorl.make("Walker2d-v3") 23 | elif task.startswith('d4rl'): 24 | import gym 25 | import d4rl 26 | from d4rl import gym_mujoco 27 | env = gym.make(task[5:]) 28 | # hack to add terminal function 29 | if 'hopper' in task: 30 | def terminal_function(data : dict): 31 | obs = data["obs"] 32 | action = data["action"] 33 | obs_next = data["next_obs"] 34 | 35 | singel_done = False 36 | if len(obs.shape) == 1: 37 | singel_done = True 38 | obs = obs.reshape(1, -1) 39 | if len(action.shape) == 1: 40 | action = action.reshape(1, -1) 41 | if len(obs_next.shape) == 1: 42 | obs_next = obs_next.reshape(1, -1) 43 | 44 | if isinstance(obs, np.ndarray): 45 | array_type = np 46 | else: 47 | import torch 48 | array_type = torch 49 | 50 | z = obs_next[:, 0:1] 51 | angle = obs_next[:, 1:2] 52 | states = obs_next[:, 1:] 53 | 54 | min_state, max_state = (-100.0, 100.0) 55 | min_z, max_z = (0.7, float('inf')) 56 | min_angle, max_angle = (-0.2, 0.2) 57 | 58 | healthy_state = array_type.all(array_type.logical_and(min_state < states, states < max_state), axis=-1, keepdim=True) 59 | healthy_z = array_type.logical_and(min_z < z, z < max_z) 60 | healthy_angle = array_type.logical_and(min_angle < angle, angle < max_angle) 61 | 62 | is_healthy = array_type.logical_and(array_type.logical_and(healthy_state, healthy_z), healthy_angle) 63 | 64 | done = array_type.logical_not(is_healthy) 65 | 66 | if singel_done: 67 | done = done 68 | else: 69 | done = done.reshape(-1, 1) 70 | return done 71 | 72 | # env.get_done_func = lambda: terminal_function 73 | env.get_done_func = create_terminal_function 74 | elif 'walker' in task: 75 | def terminal_function(data : dict): 76 | 77 | obs = data["obs"] 78 | action = data["action"] 79 | obs_next = data["next_obs"] 80 | 81 | singel_done = False 82 | if len(obs.shape) == 1: 83 | singel_done = True 84 | obs = obs.reshape(1, -1) 85 | if len(action.shape) == 1: 86 | action = action.reshape(1, -1) 87 | if len(obs_next.shape) == 1: 88 | obs_next = obs_next.reshape(1, -1) 89 | 90 | if isinstance(obs, np.ndarray): 91 | array_type = np 92 | else: 93 | import torch 94 | array_type = torch 95 | 96 | min_z, max_z = (0.8, 2.0) 97 | min_angle, max_angle = (-1.0, 1.0) 98 | min_state, max_state = (-100.0, 100.0) 99 | 100 | z = obs_next[:, 0:1] 101 | angle = obs_next[:, 1:2] 102 | state = obs_next[:, 2:] 103 | 104 | healthy_state = array_type.all(array_type.logical_and(min_state < state, state < max_state), axis=-1, keepdim=True) 105 | healthy_z = array_type.logical_and(min_z < z, z < max_z) 106 | healthy_angle = array_type.logical_and(min_angle < angle, angle < max_angle) 107 | is_healthy = array_type.logical_and(array_type.logical_and(healthy_state, healthy_z), healthy_angle) 108 | done = array_type.logical_not(is_healthy) 109 | 110 | if singel_done: 111 | done = done 112 | else: 113 | done = done.reshape(-1, 1) 114 | 115 | return done 116 | 117 | # env.get_done_func = lambda: terminal_function 118 | env.get_done_func = create_terminal_function 119 | else: 120 | task_name = task.strip().split("-")[0] 121 | env = neorl.make(task_name) 122 | 
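        # The d4rl hopper/walker branches above attach a batched terminal
        # function so that model-based algorithms can recompute `done` for
        # imagined transitions. A hedged sketch of the expected call (array
        # shapes are illustrative assumptions):
        #   done = terminal_function({"obs": obs,             # [N, obs_dim]
        #                             "action": act,          # [N, act_dim]
        #                             "next_obs": next_obs})  # -> bool array [N, 1]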
except: 123 | raise NotImplementedError 124 | 125 | return env 126 | 127 | def get_env_shape(task : str) -> Tuple[int, int]: 128 | env = get_env(task) 129 | obs_dim = env.observation_space.shape 130 | action_space = env.action_space 131 | 132 | if len(obs_dim) == 1: 133 | obs_dim = obs_dim[0] 134 | 135 | if hasattr(env.action_space, 'n'): 136 | act_dim = env.action_space.n 137 | else: 138 | act_dim = action_space.shape[0] 139 | 140 | return obs_dim, act_dim 141 | 142 | def get_env_obs_act_spaces(task : str): 143 | env = get_env(task) 144 | obs_space = env.observation_space 145 | act_space = env.action_space 146 | return obs_space, act_space 147 | 148 | def get_env_action_range(task : str) -> Tuple[float, float]: 149 | env = get_env(task) 150 | act_max = float(env.action_space.high[0]) 151 | act_min = float(env.action_space.low[0]) 152 | 153 | return act_max, act_min 154 | 155 | def get_env_state_range(task : str) -> Tuple[float, float]: 156 | env = get_env(task) 157 | obs_max = float(env.observation_space.high[0]) 158 | obs_min = float(env.observation_space.low[0]) 159 | 160 | return obs_max, obs_min -------------------------------------------------------------------------------- /offlinerl/utils/exp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | import random 4 | 5 | 6 | import torch 7 | import numpy as np 8 | from aim import Run 9 | from loguru import logger 10 | 11 | from offlinerl.utils.logger import log_path 12 | 13 | 14 | def setup_seed(seed=1024): 15 | torch.manual_seed(seed) 16 | torch.cuda.manual_seed_all(seed) 17 | np.random.seed(seed) 18 | random.seed(seed) 19 | torch.backends.cudnn.deterministic = True 20 | 21 | def select_free_cuda(): 22 | # 获取可用的 GPU 数量 23 | num_gpus = torch.cuda.device_count() 24 | 25 | if num_gpus == 0: 26 | print("No GPU available.") 27 | return None 28 | 29 | # 遍历所有 GPU,选择利用率最低的 GPU 30 | min_memory_usage = float('inf') 31 | selected_gpu_id = None 32 | 33 | for gpu_id in range(num_gpus): 34 | torch.cuda.set_device(gpu_id) 35 | gpu_memory_usage = torch.cuda.max_memory_allocated() / 1024**3 # in GB 36 | # 选择利用率最低的 GPU 37 | if gpu_memory_usage < min_memory_usage: 38 | min_memory_usage = gpu_memory_usage 39 | selected_gpu_id = gpu_id 40 | 41 | return selected_gpu_id 42 | 43 | def set_free_device_fn(): 44 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 45 | 46 | return device 47 | 48 | 49 | def init_exp_run(repo=None, experiment_name=None, flush_frequency=1): 50 | if repo is None: 51 | repo = os.path.join(log_path(),"./.aim") 52 | if not os.path.exists(repo): 53 | print(f'=====repo:{repo}') 54 | logger.info('{} dir is not exist, create {}',repo, repo) 55 | os.system(str("cd " + os.path.join(repo,"../") + "&& aim init")) 56 | else: 57 | repo = os.path.join(repo,"./.aim") 58 | if not os.path.exists(repo): 59 | print(f'=====repo:{repo}') 60 | logger.info('{} dir is not exist, create {}',repo, repo) 61 | os.system(str("cd " + os.path.join(repo,"../") + "&& aim init")) 62 | run = Run( 63 | repo=repo, 64 | experiment=experiment_name 65 | ) 66 | 67 | return run -------------------------------------------------------------------------------- /offlinerl/utils/flexible_replay_pool.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import pickle 3 | 4 | import numpy as np 5 | 6 | from .replay_pool import ReplayPool 7 | 8 | 9 | class FlexibleReplayPool(ReplayPool): 10 | def __init__(self, max_size, fields_attrs, 
obs_filter=False, modify_rew=False): 11 | super(FlexibleReplayPool, self).__init__() 12 | 13 | max_size = int(max_size) 14 | self._max_size = max_size 15 | 16 | self.fields = {} 17 | self.fields_attrs = {} 18 | 19 | self.add_fields(fields_attrs) 20 | 21 | self.obs_filter = obs_filter 22 | self.modify_rew = modify_rew 23 | 24 | self._pointer = 0 25 | self._size = 0 26 | self._samples_since_save = 0 27 | 28 | @property 29 | def size(self): 30 | return self._size 31 | 32 | @property 33 | def field_names(self): 34 | return list(self.fields.keys()) 35 | 36 | def add_fields(self, fields_attrs): 37 | self.fields_attrs.update(fields_attrs) 38 | 39 | for field_name, field_attrs in fields_attrs.items(): 40 | field_shape = (self._max_size, *field_attrs['shape']) 41 | initializer = field_attrs.get('initializer', np.zeros) 42 | self.fields[field_name] = initializer( 43 | field_shape, dtype=field_attrs['dtype']) 44 | 45 | def _advance(self, count=1): 46 | self._pointer = (self._pointer + count) % self._max_size 47 | self._size = min(self._size + count, self._max_size) 48 | self._samples_since_save += count 49 | 50 | def add_sample(self, sample): 51 | samples = { 52 | key: value[None, ...] 53 | for key, value in sample.items() 54 | } 55 | self.add_samples(samples) 56 | 57 | def add_samples(self, samples): 58 | # if 'infos' not in samples: 59 | # samples['infos'] = {} 60 | field_names = list(samples.keys()) 61 | num_samples = samples[field_names[0]].shape[0] 62 | index = np.arange( 63 | self._pointer, self._pointer + num_samples) % self._max_size 64 | for field_name in self.field_names: 65 | # print(field_name) 66 | default_value = ( 67 | self.fields_attrs[field_name].get('default_value', 0.0)) 68 | values = samples.get(field_name, default_value) 69 | if field_name not in samples.keys() and 'infos' in samples and field_name in samples['infos'][0].keys(): 70 | values = np.expand_dims(np.array([samples['infos'][i].get(field_name, default_value) for i in range(num_samples)]), axis=1) 71 | try: 72 | assert values.shape[0] == num_samples, f'value shape: {values.shape[0]}, expected: {num_samples}' 73 | if isinstance(values[0], dict): 74 | values = np.stack([np.concatenate([ 75 | value[key] 76 | for key in value.keys() 77 | ], axis=-1) for value in values]) 78 | self.fields[field_name][index] = values 79 | except Exception as e: 80 | import traceback 81 | traceback.print_exc(limit=10) 82 | print('[ DEBUG ] errors occurs: {}'.format(e)) 83 | 84 | import pdb; pdb.set_trace() 85 | self._advance(num_samples) 86 | 87 | def restore_samples(self, samples): 88 | num_samples = samples[list(samples.keys())[0]].shape[0] 89 | index = np.arange( 90 | 0, num_samples) % self._max_size 91 | for key, values in samples.items(): 92 | assert key in self.field_names 93 | self.fields[key][index] = values 94 | 95 | def random_indices(self, batch_size): 96 | if self._size == 0: return np.arange(0, 0) 97 | return np.random.randint(0, self._size, batch_size) 98 | 99 | def random_batch(self, batch_size, field_name_filter=None, **kwargs): 100 | random_indices = self.random_indices(batch_size) 101 | return self.batch_by_indices( 102 | random_indices, field_name_filter=field_name_filter, **kwargs) 103 | 104 | def last_n_batch(self, last_n, field_name_filter=None, **kwargs): 105 | last_n_indices = np.arange( 106 | self._pointer - min(self.size, last_n), self._pointer 107 | ) % self._max_size 108 | return self.batch_by_indices( 109 | last_n_indices, field_name_filter=field_name_filter, **kwargs) 110 | 111 | def filter_fields(self, field_names, 
field_name_filter): 112 | if isinstance(field_name_filter, str): 113 | field_name_filter = [field_name_filter] 114 | 115 | if isinstance(field_name_filter, (list, tuple)): 116 | field_name_list = field_name_filter 117 | 118 | def filter_fn(field_name): 119 | return field_name in field_name_list 120 | 121 | else: 122 | filter_fn = field_name_filter 123 | 124 | filtered_field_names = [ 125 | field_name for field_name in field_names 126 | if filter_fn(field_name) 127 | ] 128 | 129 | return filtered_field_names 130 | 131 | def batch_by_indices(self, indices, field_name_filter=None): 132 | if np.any(indices % self._max_size > self.size): 133 | raise ValueError( 134 | "Tried to retrieve batch with indices greater than current" 135 | " size") 136 | 137 | field_names = self.field_names 138 | if field_name_filter is not None: 139 | field_names = self.filter_fields( 140 | field_names, field_name_filter) 141 | 142 | return { 143 | field_name: self.fields[field_name][indices] 144 | for field_name in field_names 145 | } 146 | 147 | def save_latest_experience(self, pickle_path): 148 | latest_samples = self.last_n_batch(self._samples_since_save) 149 | 150 | with gzip.open(pickle_path, 'wb') as f: 151 | pickle.dump(latest_samples, f) 152 | 153 | self._samples_since_save = 0 154 | 155 | def load_experience(self, experience_path): 156 | with gzip.open(experience_path, 'rb') as f: 157 | latest_samples = pickle.load(f) 158 | 159 | key = list(latest_samples.keys())[0] 160 | num_samples = latest_samples[key].shape[0] 161 | for field_name, data in latest_samples.items(): 162 | assert data.shape[0] == num_samples, data.shape 163 | 164 | self.add_samples(latest_samples) 165 | self._samples_since_save = 0 166 | 167 | def return_all_samples(self): 168 | return { 169 | field_name: self.fields[field_name][:self.size] 170 | for field_name in self.field_names 171 | } 172 | 173 | def __getstate__(self): 174 | state = self.__dict__.copy() 175 | state['fields'] = { 176 | field_name: self.fields[field_name][:self.size] 177 | for field_name in self.field_names 178 | } 179 | 180 | return state 181 | 182 | def __setstate__(self, state): 183 | if state['_size'] < state['_max_size']: 184 | pad_size = state['_max_size'] - state['_size'] 185 | for field_name in state['fields'].keys(): 186 | field_shape = state['fields_attrs'][field_name]['shape'] 187 | state['fields'][field_name] = np.concatenate(( 188 | state['fields'][field_name], 189 | np.zeros((pad_size, *field_shape)) 190 | ), axis=0) 191 | 192 | self.__dict__ = state 193 | -------------------------------------------------------------------------------- /offlinerl/utils/function.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.functional import F 3 | 4 | def soft_clamp(x : torch.Tensor, _min=None, _max=None): 5 | # clamp tensor values while mataining the gradient 6 | if _max is not None: 7 | x = _max - F.softplus(_max - x) 8 | if _min is not None: 9 | x = _min + F.softplus(x - _min) 10 | return x -------------------------------------------------------------------------------- /offlinerl/utils/io.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import urllib 5 | import urllib.request 6 | from tqdm import tqdm 7 | 8 | def read_json(file_path): 9 | with open(file_path, 'r') as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def load_pkl(file_path): 16 | assert os.path.exists(file_path) 17 | with 
open(file_path, 'rb') as handle: 18 | data = pickle.load(handle) 19 | 20 | return data 21 | 22 | def save_pkl(data, file_path): 23 | with open(file_path, 'wb') as handle: 24 | pickle.dump(data, handle) 25 | 26 | 27 | def del_dir(dir_path): 28 | os.removedirs(dir_path) 29 | 30 | def create_dir(dir_path, cover=False): 31 | if cover or not os.path.exists(dir_path): 32 | if cover and os.path.exists(dir_path): 33 | os.removedirs(dir_path) 34 | os.makedirs(dir_path) 35 | 36 | 37 | def save_video(video_array, video_save_path): 38 | import cv2 39 | fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v') 40 | output_movie = cv2.VideoWriter(video_save_path, fourcc, 10, (640, 360)) 41 | 42 | for frame in video_array: 43 | output_movie.write(frame) 44 | 45 | out.release() 46 | cv2.destroyAllWindows() 47 | 48 | def download_helper(url, filename): 49 | 'Download file from given url. Modified from `torchvision.dataset.utils`' 50 | def gen_bar_updater(): 51 | pbar = tqdm(total=None) 52 | 53 | def bar_update(count, block_size, total_size): 54 | if pbar.total is None and total_size: 55 | pbar.total = total_size 56 | progress_bytes = count * block_size 57 | pbar.update(progress_bytes - pbar.n) 58 | 59 | return bar_update 60 | 61 | try: 62 | print('Downloading ' + url + ' to ' + filename) 63 | urllib.request.urlretrieve( 64 | url, filename, 65 | reporthook=gen_bar_updater() 66 | ) 67 | 68 | return True 69 | except (urllib.error.URLError, IOError) as e: 70 | if url[:5] == 'https': 71 | url = url.replace('https:', 'http:') 72 | print('Failed download. Trying https -> http instead.' 73 | ' Downloading ' + url + ' to ' + filename) 74 | urllib.request.urlretrieve( 75 | url, filename, 76 | reporthook=gen_bar_updater() 77 | ) 78 | 79 | return True 80 | else: 81 | raise e -------------------------------------------------------------------------------- /offlinerl/utils/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | import aim 4 | 5 | from offlinerl.utils.io import create_dir 6 | 7 | def log_path(): 8 | import offlinerl 9 | log_path = os.path.abspath(os.path.join(offlinerl.__file__,"../../","offlinerl_tmp")) 10 | 11 | create_dir(log_path) 12 | 13 | return log_path 14 | 15 | """ 16 | class exp_logger(): 17 | def __init__(self, experiment_name=None,flush_frequency=1): 18 | print("experiment_name:",experiment_name) 19 | self.aim_logger = aim.Session(experiment=experiment_name, flush_frequency=flush_frequency) 20 | 21 | def log_hparams(self, hparams_dict): 22 | self.aim_logger.set_params(hparams_dict, name='hparams') 23 | """ -------------------------------------------------------------------------------- /offlinerl/utils/net/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/utils/net/__init__.py -------------------------------------------------------------------------------- /offlinerl/utils/net/bcq_net.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from offlinerl.utils.net.common import BasePolicy 8 | 9 | 10 | # Used for Atari 11 | class Conv_Q(nn.Module): 12 | def __init__(self, frames, num_actions): 13 | super(Conv_Q, self).__init__() 14 | self.c1 = nn.Conv2d(frames, 32, kernel_size=8, stride=4) 15 | self.c2 = nn.Conv2d(32, 64, 
kernel_size=4, stride=2) 16 | self.c3 = nn.Conv2d(64, 64, kernel_size=3, stride=1) 17 | 18 | self.q1 = nn.Linear(3136, 512) 19 | self.q2 = nn.Linear(512, 16) 20 | self.q3 = nn.Linear(16, num_actions) 21 | 22 | self.i1 = nn.Linear(3136, 512) 23 | self.i2 = nn.Linear(512, 16) 24 | self.i3 = nn.Linear(16, num_actions) 25 | 26 | 27 | def forward(self, state): 28 | c = F.relu(self.c1(state)) 29 | c = F.relu(self.c2(c)) 30 | c = F.relu(self.c3(c)) 31 | 32 | q = F.relu(self.q1(c.reshape(-1, 3136))) 33 | q = F.relu(self.q2(q)) 34 | q = self.q3(q) 35 | 36 | i = F.relu(self.i1(c.reshape(-1, 3136))) 37 | i = F.relu(self.i2(i)) 38 | i = self.i3(i) 39 | return q, F.log_softmax(i, dim=1), i 40 | 41 | def encode(self, state): 42 | with torch.no_grad(): 43 | c = F.relu(self.c1(state)) 44 | c = F.relu(self.c2(c)) 45 | c = F.relu(self.c3(c)) 46 | 47 | q = F.relu(self.q1(c.reshape(-1, 3136))) 48 | q = F.relu(self.q2(q)) 49 | 50 | i = F.relu(self.i1(c.reshape(-1, 3136))) 51 | i = F.relu(self.i2(i)) 52 | return i 53 | 54 | 55 | 56 | # Used for Box2D / Toy problems 57 | class FC_Q(nn.Module, BasePolicy): 58 | def __init__(self, state_dim, num_actions): 59 | super(FC_Q, self).__init__() 60 | self.q1 = nn.Linear(state_dim, 256) 61 | self.q2 = nn.Linear(256, 256) 62 | self.q3 = nn.Linear(256, num_actions) 63 | 64 | self.i1 = nn.Linear(state_dim, 256) 65 | self.i2 = nn.Linear(256, 256) 66 | self.i3 = nn.Linear(256, num_actions) 67 | 68 | 69 | def forward(self, state): 70 | q = F.relu(self.q1(state)) 71 | q = F.relu(self.q2(q)) 72 | 73 | i = F.relu(self.i1(state)) 74 | i = F.relu(self.i2(i)) 75 | i = F.relu(self.i3(i)) 76 | return self.q3(q), F.log_softmax(i, dim=1), i 77 | 78 | def policy_infer(self, obs): 79 | 80 | q, imt, i = self(obs) 81 | imt = imt.exp() 82 | imt = (imt/imt.max(1, keepdim=True)[0] > 0.3).float() 83 | # Use large negative number to mask actions from argmax 84 | 85 | return (imt * q + (1. 
- imt) * -1e8).argmax(1) 86 | 87 | -------------------------------------------------------------------------------- /offlinerl/utils/net/maple_actor.py: -------------------------------------------------------------------------------- 1 | import torch.nn 2 | import torch.nn as nn 3 | import numpy as np 4 | import torch.nn.functional as F 5 | from offlinerl.utils.net.common import miniblock 6 | 7 | 8 | class Maple_actor(nn.Module): 9 | def __init__(self, obs_dim, action_dim, deterministic=False, hidden_sizes=(16,), Guassain_hidden_sizes=(256,256), max_traj_len=5, LOG_MAX_STD=2, LOG_MIN_STD=-20, EPS=1e-8, lstm_hidden_unit=128): 10 | super(Maple_actor,self).__init__() 11 | self.obs_dim = obs_dim 12 | self.deterministic = deterministic 13 | self.act_dim = action_dim 14 | self.hidden_sizes = list(hidden_sizes).copy() 15 | self.Guassain_hidden_sizes = list(Guassain_hidden_sizes).copy() 16 | self.max_traj_len = max_traj_len 17 | self.LOG_MAX_STD = LOG_MAX_STD 18 | self.LOG_MIN_STD = LOG_MIN_STD 19 | self.EPS = EPS 20 | self.lstm_hidden_unit = lstm_hidden_unit 21 | self.mlp = miniblock(lstm_hidden_unit, hidden_sizes[0], None, relu=False) 22 | if len(hidden_sizes) >= 2: 23 | for i in range(1,len(hidden_sizes)): 24 | self.mlp += miniblock(hidden_sizes[i-1], hidden_sizes[i], None) 25 | self.mlp = nn.Sequential(*self.mlp) 26 | self.Guassain_input_dim = self.hidden_sizes[-1] + self.obs_dim 27 | self.Guassain_mlp = miniblock(self.Guassain_input_dim, self.Guassain_hidden_sizes[0], None) 28 | if len(Guassain_hidden_sizes)>=2: 29 | for i in range(1,len(Guassain_hidden_sizes)): 30 | self.Guassain_mlp += miniblock(Guassain_hidden_sizes[i-1], Guassain_hidden_sizes[i], None) 31 | self.Guassain_mlp = nn.Sequential(*self.Guassain_mlp) 32 | self.Guassain_mu_mlp = [nn.Linear(self.Guassain_hidden_sizes[-1], action_dim)] 33 | self.Guassain_logstd_mlp = [nn.Linear(self.Guassain_hidden_sizes[-1], action_dim)] 34 | self.Guassain_mu_mlp = nn.Sequential(*self.Guassain_mu_mlp) 35 | self.Guassain_logstd_mlp = nn.Sequential(*self.Guassain_logstd_mlp) 36 | def gaussian_likelihood(self,x, mu, log_std): 37 | pre_sum = -0.5 * (((x - mu) / (torch.exp(log_std) + self.EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) 38 | return torch.sum(pre_sum, dim=-1) 39 | 40 | def forward(self, hidden_policy, obs): 41 | policy_out = self.mlp(hidden_policy) 42 | policy_z = torch.cat([policy_out, obs], dim=-1) 43 | out = self.Guassain_mlp(policy_z) 44 | mu = self.Guassain_mu_mlp(out) 45 | log_std = self.Guassain_logstd_mlp(out) 46 | log_std = torch.clip(log_std, self.LOG_MIN_STD, self.LOG_MAX_STD) 47 | std = torch.exp(log_std) 48 | acts = torch.distributions.Normal(torch.zeros_like(mu),torch.ones_like(std)).sample()*std + mu 49 | log_p_acts = self.gaussian_likelihood(acts, mu, log_std) 50 | mu, acts, log_p_acts = self.apply_squashing_func(mu, acts, log_p_acts) 51 | return mu, acts, log_p_acts, std 52 | 53 | def apply_squashing_func(self, mu, pi, logp_pi): 54 | logp_pi -= torch.sum(2 * (np.log(2) - pi - F.softplus(-2 * pi)), dim=-1) 55 | # Squash those unbounded actions! 
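        # The logp_pi adjustment above is the tanh change-of-variables correction:
        # 2*(log 2 - a - softplus(-2a)) is a numerically stable rewrite of
        # log(1 - tanh(a)^2), so the log-density stays consistent after squashing.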
56 | mu = torch.tanh(mu) 57 | pi = torch.tanh(pi) 58 | return mu, pi, logp_pi 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /offlinerl/utils/net/mlas.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from offlinerl.utils.net.common import BasePolicy 6 | 7 | class VAE(nn.Module, BasePolicy): 8 | def __init__(self, 9 | state_dim, 10 | action_dim, 11 | latent_dim, 12 | max_action, 13 | hidden_size=750): 14 | super(VAE, self).__init__() 15 | 16 | self.e1 = nn.Linear(state_dim + action_dim, hidden_size) 17 | self.e2 = nn.Linear(hidden_size, hidden_size) 18 | 19 | self.mean = nn.Linear(hidden_size, latent_dim) 20 | self.log_std = nn.Linear(hidden_size, latent_dim) 21 | 22 | self.d1 = nn.Linear(state_dim + latent_dim, hidden_size) 23 | self.d2 = nn.Linear(hidden_size, hidden_size) 24 | self.d3 = nn.Linear(hidden_size, action_dim) 25 | 26 | self.max_action = max_action 27 | self.latent_dim = latent_dim 28 | 29 | self._actor = None 30 | 31 | def forward(self, state, action): 32 | z = F.relu(self.e1(torch.cat([state, action], 1))) 33 | z = F.relu(self.e2(z)) 34 | 35 | mean = self.mean(z) 36 | # Clamped for numerical stability 37 | log_std = self.log_std(z).clamp(-4, 15) 38 | std = torch.exp(log_std) 39 | z = mean + std * torch.randn_like(std) 40 | 41 | u = self.decode(state, z) 42 | 43 | return u, mean, std 44 | 45 | def decode(self, state, z=None, clip=None, raw=False): 46 | # When sampling from the VAE, the latent vector is clipped to [-0.5, 0.5] 47 | if z is None: 48 | z = torch.randn((state.shape[0], self.latent_dim)).to(state.device) 49 | if clip is not None: 50 | z = z.clamp(-clip, clip) 51 | 52 | a = F.relu(self.d1(torch.cat([state, z], 1))) 53 | a = F.relu(self.d2(a)) 54 | a = self.d3(a) 55 | if raw: 56 | return a 57 | return self.max_action * torch.tanh(a) 58 | 59 | def policy_infer(self, obs): 60 | return self.decode(obs, z=self._actor(obs)[0]) 61 | 62 | class ActorPerturbation(nn.Module, BasePolicy): 63 | def __init__(self, state_dim, action_dim, latent_action_dim, max_action, max_latent_action=2, phi=0.05): 64 | super(ActorPerturbation, self).__init__() 65 | 66 | self.hidden_size = (400, 300, 400, 300) 67 | 68 | self.l1 = nn.Linear(state_dim, self.hidden_size[0]) 69 | self.l2 = nn.Linear(self.hidden_size[0], self.hidden_size[1]) 70 | self.l3 = nn.Linear(self.hidden_size[1], latent_action_dim) 71 | 72 | self.l4 = nn.Linear(state_dim + action_dim, self.hidden_size[2]) 73 | self.l5 = nn.Linear(self.hidden_size[2], self.hidden_size[3]) 74 | self.l6 = nn.Linear(self.hidden_size[3], action_dim) 75 | 76 | self.max_latent_action = max_latent_action 77 | self.max_action = max_action 78 | self.phi = phi 79 | 80 | self.vae = None 81 | 82 | def forward(self, state, decoder): 83 | a = F.relu(self.l1(state)) 84 | a = F.relu(self.l2(a)) 85 | latent_action = self.max_latent_action * torch.tanh(self.l3(a)) 86 | 87 | mid_action = decoder(state, z=latent_action) 88 | 89 | a = F.relu(self.l4(torch.cat([state, mid_action], 1))) 90 | a = F.relu(self.l5(a)) 91 | a = self.phi * torch.tanh(self.l6(a)) 92 | final_action = (a + mid_action).clamp(-self.max_action, self.max_action) 93 | return latent_action, mid_action, final_action 94 | 95 | def policy_infer(self, obs): 96 | 97 | return self(obs, self.vae.decode)[-1] -------------------------------------------------------------------------------- 
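A minimal inference-time sketch of how the latent-action pieces above fit together (dimensions and wiring are illustrative assumptions in the spirit of PLAS, not a training recipe):

import torch
from offlinerl.utils.net.mlas import VAE, ActorPerturbation

state_dim, action_dim, latent_dim = 17, 6, 12        # assumed dimensions
vae = VAE(state_dim, action_dim, latent_dim, max_action=1.0)
actor = ActorPerturbation(state_dim, action_dim, latent_dim, max_action=1.0)
actor.vae = vae                                      # policy_infer decodes through the VAE

state = torch.randn(32, state_dim)
latent, mid_action, final_action = actor(state, vae.decode)
action = actor.policy_infer(state)                   # equivalent to final_action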
/offlinerl/utils/net/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/utils/net/model/__init__.py -------------------------------------------------------------------------------- /offlinerl/utils/net/model/ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from offlinerl.utils.function import soft_clamp 5 | from offlinerl.utils.net.common import Swish 6 | 7 | class EnsembleLinear(torch.nn.Module): 8 | def __init__(self, in_features, out_features, ensemble_size=7): 9 | super().__init__() 10 | 11 | self.ensemble_size = ensemble_size 12 | 13 | self.register_parameter('weight', torch.nn.Parameter(torch.zeros(ensemble_size, in_features, out_features))) 14 | self.register_parameter('bias', torch.nn.Parameter(torch.zeros(ensemble_size, 1, out_features))) 15 | 16 | torch.nn.init.trunc_normal_(self.weight, std=1/(2*in_features**0.5)) 17 | 18 | self.register_parameter('saved_weight', torch.nn.Parameter(self.weight.detach().clone())) 19 | self.register_parameter('saved_bias', torch.nn.Parameter(self.bias.detach().clone())) 20 | 21 | self.select = list(range(0, self.ensemble_size)) 22 | 23 | def forward(self, x): 24 | weight = self.weight[self.select] 25 | bias = self.bias[self.select] 26 | 27 | if len(x.shape) == 2: 28 | x = torch.einsum('ij,bjk->bik', x, weight) 29 | else: 30 | x = torch.einsum('bij,bjk->bik', x, weight) 31 | 32 | x = x + bias 33 | 34 | return x 35 | 36 | def set_select(self, indexes): 37 | assert len(indexes) <= self.ensemble_size and max(indexes) < self.ensemble_size 38 | self.select = indexes 39 | self.weight.data[indexes] = self.saved_weight.data[indexes] 40 | self.bias.data[indexes] = self.saved_bias.data[indexes] 41 | 42 | def update_save(self, indexes): 43 | self.saved_weight.data[indexes] = self.weight.data[indexes] 44 | self.saved_bias.data[indexes] = self.bias.data[indexes] 45 | 46 | class EnsembleTransition(torch.nn.Module): 47 | def __init__(self, obs_dim, action_dim, hidden_features, hidden_layers, ensemble_size=7, mode='local', with_reward=True): 48 | super().__init__() 49 | self.obs_dim = obs_dim 50 | self.mode = mode 51 | self.with_reward = with_reward 52 | self.ensemble_size = ensemble_size 53 | 54 | self.activation = Swish() 55 | 56 | module_list = [] 57 | for i in range(hidden_layers): 58 | if i == 0: 59 | module_list.append(EnsembleLinear(obs_dim + action_dim, hidden_features, ensemble_size)) 60 | else: 61 | module_list.append(EnsembleLinear(hidden_features, hidden_features, ensemble_size)) 62 | self.backbones = torch.nn.ModuleList(module_list) 63 | 64 | self.output_layer = EnsembleLinear(hidden_features, 2 * (obs_dim + self.with_reward), ensemble_size) 65 | self.obs_mean = None 66 | self.obs_std = None 67 | self.register_parameter('max_logstd', torch.nn.Parameter(torch.ones(obs_dim + self.with_reward) * 1, requires_grad=True)) 68 | self.register_parameter('min_logstd', torch.nn.Parameter(torch.ones(obs_dim + self.with_reward) * -5, requires_grad=True)) 69 | 70 | def update_self(self, obs): 71 | self.obs_mean = obs.mean(dim=0) 72 | self.obs_std = obs.std(dim=0) 73 | 74 | def forward(self, obs_action): 75 | # Normalization for obs. If 'normalize', no residual. 
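        # Note: update_self(obs) must have been called for 'normalize' mode to
        # have statistics; if obs_mean is still None, the input passes through
        # to the backbone unchanged.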
76 | # use 'dims' to make forward work both when training and evaluating 77 | dims = len(obs_action.shape) - 2 # dim == 0: eval, dim == 1: train 78 | if self.obs_mean is not None: 79 | if dims == 1: 80 | obs_mean = self.obs_mean.unsqueeze(0).expand(obs_action.shape[0], -1).to(obs_action.device) 81 | obs_std = self.obs_std.unsqueeze(0).expand(obs_action.shape[0], -1).to(obs_action.device) 82 | else: 83 | obs_mean = self.obs_mean.to(obs_action.device) 84 | obs_std = self.obs_std.to(obs_action.device) 85 | 86 | if self.mode == 'normalize': 87 | batch_size = obs_action.shape[dims] 88 | obs, action = torch.split(obs_action, [self.obs_dim, obs_action.shape[-1] - self.obs_dim], dim=-1) 89 | if dims == 1: 90 | obs = obs - obs_mean.unsqueeze(dims).expand(-1, batch_size, -1) 91 | obs = obs / (obs_std.unsqueeze(dims).expand(-1, batch_size, -1) + 1e-8) 92 | else: 93 | obs = obs - obs_mean.unsqueeze(dims).expand(batch_size, -1) 94 | obs = obs / (obs_std.unsqueeze(dims).expand(batch_size, -1) + 1e-8) 95 | output = torch.cat([obs, action], dim=-1) 96 | else: 97 | output = obs_action 98 | else: 99 | output = obs_action 100 | 101 | for layer in self.backbones: 102 | output = self.activation(layer(output)) 103 | mu, logstd = torch.chunk(self.output_layer(output), 2, dim=-1) 104 | logstd = soft_clamp(logstd, self.min_logstd, self.max_logstd) 105 | # 'local': with residual 106 | if self.mode == 'local' or self.mode == 'normalize': 107 | if self.with_reward: 108 | obs, reward = torch.split(mu, [self.obs_dim, 1], dim=-1) 109 | obs = obs + obs_action[..., :self.obs_dim] 110 | mu = torch.cat([obs, reward], dim=-1) 111 | else: 112 | mu = mu + obs_action[..., :self.obs_dim] 113 | return torch.distributions.Normal(mu, torch.exp(logstd)) 114 | 115 | def set_select(self, indexes): 116 | self.elites = indexes 117 | for layer in self.backbones: 118 | layer.set_select(indexes) 119 | self.output_layer.set_select(indexes) 120 | 121 | def update_save(self, indexes): 122 | for layer in self.backbones: 123 | layer.update_save(indexes) 124 | self.output_layer.update_save(indexes) 125 | 126 | def random_elite_idxs(self, batch_size: int) -> np.ndarray: 127 | idxs = np.random.choice(len(self.elites), size=batch_size) 128 | return idxs -------------------------------------------------------------------------------- /offlinerl/utils/net/model/maple_critic.py: -------------------------------------------------------------------------------- 1 | import torch.nn 2 | import torch.nn as nn 3 | import numpy as np 4 | import torch.nn.functional as F 5 | from offlinerl.utils.net.common import miniblock 6 | 7 | 8 | class Maple_critic(nn.Module): 9 | def __init__(self, obs_dim, action_dim,deterministic=False,hidden_sizes=(16,),value_hidden_sizes=(256,256),lstm_hidden_unit=128): 10 | super(Maple_critic,self).__init__() 11 | self.obs_dim = obs_dim 12 | self.action_dim = action_dim 13 | self.deterministic = deterministic 14 | self.hidden_sizes = list(hidden_sizes).copy() 15 | self.value_hidden_sizes = list(value_hidden_sizes).copy() 16 | self.lstm_hidden_unit = lstm_hidden_unit 17 | self.mlp = miniblock(self.lstm_hidden_unit, self.hidden_sizes[0], None, relu=False) 18 | if len(self.hidden_sizes) >= 2: 19 | for i in range(1,len(self.hidden_sizes)): 20 | self.mlp += miniblock(self.hidden_sizes[i-1], self.hidden_sizes[i], None) 21 | self.mlp = nn.Sequential(*self.mlp) 22 | self.vfs = miniblock(self.hidden_sizes[-1]+self.obs_dim+self.action_dim, self.value_hidden_sizes[0],None) 23 | if len(self.value_hidden_sizes)>=2: 24 | for i in range(1, 
len(self.value_hidden_sizes)): 25 | self.vfs += miniblock(self.value_hidden_sizes[i-1], self.value_hidden_sizes[i], None) 26 | self.vfs += [nn.Linear(self.value_hidden_sizes[-1], 1)] 27 | self.vfs = nn.Sequential(*self.vfs) 28 | 29 | def forward(self, value_hidden, actions, obs): 30 | out = self.mlp(value_hidden) 31 | out = torch.cat([out, obs, actions], dim=-1) 32 | out = self.vfs(out) 33 | return out 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /offlinerl/utils/net/model/new_ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import os.path as path 5 | from torch.nn import functional as F 6 | from typing import Dict, List, Union, Tuple, Optional 7 | 8 | 9 | class EnsembleLinear(nn.Module): 10 | def __init__( 11 | self, 12 | input_dim: int, 13 | output_dim: int, 14 | num_ensemble: int, 15 | weight_decay: float = 0.0 16 | ) -> None: 17 | super().__init__() 18 | 19 | self.num_ensemble = num_ensemble 20 | 21 | self.register_parameter("weight", nn.Parameter(torch.zeros(num_ensemble, input_dim, output_dim))) 22 | self.register_parameter("bias", nn.Parameter(torch.zeros(num_ensemble, 1, output_dim))) 23 | 24 | nn.init.trunc_normal_(self.weight, std=1/(2*input_dim**0.5)) 25 | 26 | self.register_parameter("saved_weight", nn.Parameter(self.weight.detach().clone())) 27 | self.register_parameter("saved_bias", nn.Parameter(self.bias.detach().clone())) 28 | 29 | self.weight_decay = weight_decay 30 | 31 | def forward(self, x: torch.Tensor) -> torch.Tensor: 32 | weight = self.weight 33 | bias = self.bias 34 | 35 | if len(x.shape) == 2: 36 | x = torch.einsum('ij,bjk->bik', x, weight) 37 | else: 38 | x = torch.einsum('bij,bjk->bik', x, weight) 39 | 40 | x = x + bias 41 | 42 | return x 43 | 44 | def load_save(self) -> None: 45 | self.weight.data.copy_(self.saved_weight.data) 46 | self.bias.data.copy_(self.saved_bias.data) 47 | 48 | def update_save(self, indexes: List[int]) -> None: 49 | self.saved_weight.data[indexes] = self.weight.data[indexes] 50 | self.saved_bias.data[indexes] = self.bias.data[indexes] 51 | 52 | def get_decay_loss(self) -> torch.Tensor: 53 | decay_loss = self.weight_decay * (0.5*((self.weight**2).sum())) 54 | return decay_loss 55 | 56 | 57 | class Swish(nn.Module): 58 | def __init__(self) -> None: 59 | super(Swish, self).__init__() 60 | 61 | def forward(self, x: torch.Tensor) -> torch.Tensor: 62 | x = x * torch.sigmoid(x) 63 | return x 64 | 65 | 66 | def soft_clamp( 67 | x : torch.Tensor, 68 | _min: Optional[torch.Tensor] = None, 69 | _max: Optional[torch.Tensor] = None 70 | ) -> torch.Tensor: 71 | # clamp tensor values while mataining the gradient 72 | if _max is not None: 73 | x = _max - F.softplus(_max - x) 74 | if _min is not None: 75 | x = _min + F.softplus(x - _min) 76 | return x 77 | 78 | 79 | class EnsembleTransition(nn.Module): 80 | def __init__( 81 | self, 82 | obs_dim: int, 83 | action_dim: int, 84 | hidden_dims: Union[List[int], Tuple[int]], 85 | num_ensemble: int = 7, 86 | num_elites: int = 5, 87 | activation: nn.Module = Swish, 88 | weight_decays: Optional[Union[List[float], Tuple[float]]] = None, 89 | with_reward: bool = True, 90 | device: str = "cpu" 91 | ) -> None: 92 | super().__init__() 93 | 94 | self.num_ensemble = num_ensemble 95 | self.num_elites = num_elites 96 | self._with_reward = with_reward 
97 | self.device = torch.device(device) 98 | 99 | self.activation = activation() 100 | 101 | if weight_decays is None: 102 | weight_decays = [0.0] * (len(hidden_dims) + 1) 103 | assert len(weight_decays) == (len(hidden_dims) + 1) 104 | 105 | module_list = [] 106 | hidden_dims = [obs_dim+action_dim] + list(hidden_dims) 107 | for in_dim, out_dim, weight_decay in zip(hidden_dims[:-1], hidden_dims[1:], weight_decays[:-1]): 108 | module_list.append(EnsembleLinear(in_dim, out_dim, num_ensemble, weight_decay)) 109 | self.backbones = nn.ModuleList(module_list) 110 | 111 | self.output_layer = EnsembleLinear( 112 | hidden_dims[-1], 113 | 2 * (obs_dim + self._with_reward), 114 | num_ensemble, 115 | weight_decays[-1] 116 | ) 117 | 118 | self.register_parameter( 119 | "max_logvar", 120 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * 0.5, requires_grad=True) 121 | ) 122 | self.register_parameter( 123 | "min_logvar", 124 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * -10, requires_grad=True) 125 | ) 126 | 127 | self.register_parameter( 128 | "elites", 129 | nn.Parameter(torch.tensor(list(range(0, self.num_elites))), requires_grad=False) 130 | ) 131 | 132 | self.to(self.device) 133 | 134 | def forward(self, obs_action: np.ndarray) -> Tuple[torch.Tensor, torch.Tensor]: 135 | obs_action = torch.as_tensor(obs_action, dtype=torch.float32).to(self.device) 136 | output = obs_action 137 | for layer in self.backbones: 138 | output = self.activation(layer(output)) 139 | mean, logvar = torch.chunk(self.output_layer(output), 2, dim=-1) 140 | logvar = soft_clamp(logvar, self.min_logvar, self.max_logvar) 141 | return mean, logvar 142 | 143 | def load_save(self) -> None: 144 | for layer in self.backbones: 145 | layer.load_save() 146 | self.output_layer.load_save() 147 | 148 | def update_save(self, indexes: List[int]) -> None: 149 | for layer in self.backbones: 150 | layer.update_save(indexes) 151 | self.output_layer.update_save(indexes) 152 | 153 | def get_decay_loss(self) -> torch.Tensor: 154 | decay_loss = 0 155 | for layer in self.backbones: 156 | decay_loss += layer.get_decay_loss() 157 | decay_loss += self.output_layer.get_decay_loss() 158 | return decay_loss 159 | 160 | def set_elites(self, indexes: List[int]) -> None: 161 | assert len(indexes) <= self.num_ensemble and max(indexes) < self.num_ensemble 162 | self.register_parameter('elites', nn.Parameter(torch.tensor(indexes), requires_grad=False)) 163 | 164 | def random_elite_idxs(self, batch_size: int) -> np.ndarray: 165 | idxs = np.random.choice(self.elites.data.cpu().numpy(), size=batch_size) 166 | return idxs 167 | 168 | 169 | class StandardScaler(object): 170 | def __init__(self, mu=None, std=None): 171 | self.mu = mu 172 | self.std = std 173 | 174 | def fit(self, data): 175 | """Computes and stores the per-feature mean and standard deviation of the data. 176 | Standard deviations below 1e-12 are clamped to 1 to avoid division by zero 177 | when transforming. 178 | 179 | Arguments: 180 | data (np.ndarray): A numpy array containing the input 181 | 182 | Returns: None. 183 | """ 184 | self.mu = np.mean(data, axis=0, keepdims=True) 185 | self.std = np.std(data, axis=0, keepdims=True) 186 | self.std[self.std < 1e-12] = 1.0 187 | 188 | def transform(self, data): 189 | """Transforms the input matrix data using the parameters of this scaler. 190 | 191 | Arguments: 192 | data (np.array): A numpy array containing the points to be transformed.
193 | 194 | Returns: (np.array) The transformed dataset. 195 | """ 196 | return (data - self.mu) / self.std 197 | 198 | def inverse_transform(self, data): 199 | """Undoes the transformation performed by this scaler. 200 | 201 | Arguments: 202 | data (np.array): A numpy array containing the points to be transformed. 203 | 204 | Returns: (np.array) The transformed dataset. 205 | """ 206 | return self.std * data + self.mu 207 | 208 | def save_scaler(self, save_path): 209 | mu_path = path.join(save_path, "mu.npy") 210 | std_path = path.join(save_path, "std.npy") 211 | np.save(mu_path, self.mu) 212 | np.save(std_path, self.std) 213 | 214 | def load_scaler(self, load_path): 215 | mu_path = path.join(load_path, "mu.npy") 216 | std_path = path.join(load_path, "std.npy") 217 | self.mu = np.load(mu_path) 218 | self.std = np.load(std_path) 219 | 220 | def transform_tensor(self, data: torch.Tensor): 221 | device = data.device 222 | data = self.transform(data.cpu().numpy()) 223 | data = torch.tensor(data, device=device) 224 | return data -------------------------------------------------------------------------------- /offlinerl/utils/net/model_GRU.py: -------------------------------------------------------------------------------- 1 | import torch.nn 2 | import torch.nn as nn 3 | import numpy as np 4 | import torch.nn.functional as F 5 | from offlinerl.utils.net.common import miniblock 6 | 7 | class GRU_Model(nn.Module): 8 | def __init__(self, obs_dim, action_dim,device=None, lstm_hidden_units=128): 9 | super(GRU_Model, self).__init__() 10 | self.obs_dim = obs_dim 11 | self.action_dim = action_dim 12 | self.device = device 13 | self.lstm_hidden_units = lstm_hidden_units 14 | self.GRU = nn.GRU(self.obs_dim + self.action_dim, lstm_hidden_units, batch_first=True) 15 | def forward(self, obs, last_acts, pre_hidden, lens): 16 | sta_acs = torch.cat([obs, last_acts], dim=-1) 17 | packed = torch.nn.utils.rnn.pack_padded_sequence(sta_acs,lens,batch_first=True, enforce_sorted=False) 18 | if len(pre_hidden.shape) == 2: 19 | pre_hidden = torch.unsqueeze(pre_hidden, dim=0) 20 | output,_ = self.GRU(packed, pre_hidden) 21 | output,_ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True) 22 | return output 23 | def get_hidden(self, obs, last_actions, lens): 24 | pre_hidden = torch.zeros((1,len(lens),self.lstm_hidden_units)).to(self.device) 25 | return self(obs, last_actions, pre_hidden,lens) 26 | -------------------------------------------------------------------------------- /offlinerl/utils/net/moose.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from offlinerl.utils.net.common import BasePolicy 6 | 7 | class VAE(nn.Module, BasePolicy): 8 | def __init__(self, 9 | state_dim, 10 | action_dim, 11 | latent_dim, 12 | max_action, 13 | hidden_size=750): 14 | super(VAE, self).__init__() 15 | 16 | self.e1 = nn.Linear(state_dim + action_dim, hidden_size) 17 | self.e2 = nn.Linear(hidden_size, hidden_size) 18 | 19 | self.mean = nn.Linear(hidden_size, latent_dim) 20 | self.log_std = nn.Linear(hidden_size, latent_dim) 21 | 22 | self.d1 = nn.Linear(latent_dim, hidden_size) 23 | self.d2 = nn.Linear(hidden_size, hidden_size) 24 | self.d3 = nn.Linear(hidden_size, state_dim + action_dim) 25 | 26 | self.max_action = max_action 27 | self.latent_dim = latent_dim 28 | 29 | self._actor = None 30 | 31 | def forward(self, state, action): 32 | z = F.relu(self.e1(torch.cat([state, action], 1))) 33 | z = 
F.relu(self.e2(z)) 34 | 35 | mean = self.mean(z) 36 | # Clamped for numerical stability 37 | log_std = self.log_std(z).clamp(-4, 15) 38 | std = torch.exp(log_std) 39 | z = mean + std * torch.randn_like(std) 40 | 41 | u = self.decode(z) 42 | 43 | return u, mean, std 44 | 45 | def decode(self, z): 46 | a = F.relu(self.d1(z)) 47 | a = F.relu(self.d2(a)) 48 | a = self.d3(a) 49 | return a 50 | 51 | 52 | def policy_infer(self, obs): 53 | return self.decode(obs) -------------------------------------------------------------------------------- /offlinerl/utils/net/tanhpolicy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch import nn as nn 4 | from torch.nn import functional as F 5 | from torch.distributions import Distribution, Normal 6 | 7 | from offlinerl.utils.net.common import BasePolicy 8 | from offlinerl.utils.net.continuous import ActorProb 9 | 10 | 11 | class TanhNormal(Distribution): 12 | """ 13 | Represent distribution of X where 14 | X = tanh(Z) 15 | Z ~ N(mean, std) 16 | 17 | Note: this is not very numerically stable. 18 | """ 19 | def __init__(self, normal_mean, normal_std, max_action=1, min_action=-1, epsilon=1e-6): 20 | """ 21 | :param normal_mean: Mean of the normal distribution 22 | :param normal_std: Std of the normal distribution 23 | :param epsilon: Numerical stability epsilon when computing log-prob. 24 | """ 25 | self.normal_mean = normal_mean 26 | self.normal_std = normal_std 27 | self.normal = Normal(normal_mean, normal_std) 28 | self.epsilon = epsilon 29 | self.max_action = max_action 30 | self.min_action = min_action 31 | 32 | def sample_n(self, n, return_pre_tanh_value=False): 33 | z = self.normal.sample_n(n) 34 | if return_pre_tanh_value: 35 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2, z 36 | else: 37 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2 38 | 39 | def atanh(self,x): 40 | one_plus_x = (1 + x).clamp(min=1e-6) 41 | one_minus_x = (1 - x).clamp(min=1e-6) 42 | return 0.5 * torch.log(one_plus_x / one_minus_x) 43 | 44 | @property 45 | def mode(self): 46 | return ((self.max_action-self.min_action)/2) * torch.tanh(self.normal_mean) + (self.max_action+self.min_action)/2 47 | 48 | def log_prob(self, value, pre_tanh_value=None): 49 | """ 50 | 51 | :param value: some value, x 52 | :param pre_tanh_value: arctanh(x) 53 | :return: 54 | """ 55 | unscaled_value = (2*value - (self.max_action+self.min_action))/(self.max_action - self.min_action) # assume the actual actions have been transformed 56 | if pre_tanh_value is None: 57 | pre_tanh_value = self.atanh(unscaled_value) # get the raw Gaussian distribution output 58 | 59 | # ==== previous calculation of tanh log_prob ===== 60 | # self.normal.log_prob(pre_tanh_value) - torch.log( 61 | # 1 - value * value + self.epsilon 62 | # ) 63 | # previous calculation of tanhGaussian log_prob is OK when the action is in (-1,1). To be more general, we need the following revision 64 | 65 | action_scale = (self.max_action-self.min_action)/2.0 66 | squashed_action = unscaled_value 67 | log_prob = self.normal.log_prob(pre_tanh_value) - torch.log(action_scale * (1 - squashed_action.pow(2)) + self.epsilon) 68 | return log_prob 69 | 70 | def sample(self, return_pretanh_value=False): 71 | """ 72 | Gradients will and should *not* pass through this operation. 73 | 74 | See https://github.com/pytorch/pytorch/issues/4620 for discussion. 
75 | """ 76 | z = self.normal.sample().detach() 77 | 78 | if return_pretanh_value: 79 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2, z 80 | else: 81 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2 82 | 83 | def rsample(self, return_pretanh_value=False): 84 | """ 85 | Sampling in the reparameterization case. 86 | """ 87 | z = ( 88 | self.normal_mean + 89 | self.normal_std * 90 | Normal( 91 | torch.zeros(self.normal_mean.size(), device=self.normal_mean.device), 92 | torch.ones(self.normal_std.size(), device=self.normal_mean.device) 93 | ).sample() 94 | ) 95 | z.requires_grad_() 96 | 97 | if return_pretanh_value: 98 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2, z 99 | else: 100 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2 101 | 102 | 103 | class TanhGaussianPolicy(ActorProb, BasePolicy): 104 | LOG_SIG_MAX = 2 105 | LOG_SIG_MIN = -5 106 | MEAN_MIN = -9.0 107 | MEAN_MAX = 9.0 108 | 109 | def atanh(self,x): 110 | one_plus_x = (1 + x).clamp(min=1e-6) 111 | one_minus_x = (1 - x).clamp(min=1e-6) 112 | return 0.5*torch.log(one_plus_x/ one_minus_x) 113 | 114 | def log_prob(self, obs, actions): 115 | raw_actions = self.atanh(actions) 116 | logits, h = self.preprocess(obs) 117 | 118 | mean = self.mu(logits) 119 | mean = torch.clamp(mean, self.MEAN_MIN, self.MEAN_MAX) 120 | if self._c_sigma: 121 | log_std = torch.clamp( 122 | self.sigma(logits), min=self.LOG_SIG_MIN, max=self.LOG_SIG_MAX 123 | ) 124 | std = log_std.exp() 125 | else: 126 | shape = [1] * len(mean.shape) 127 | shape[1] = -1 128 | log_std = (self.sigma.view(shape) + torch.zeros_like(mean)) 129 | std = log_std.exp() 130 | 131 | tanh_normal = TanhNormal(mean, std) 132 | log_prob = tanh_normal.log_prob(value=actions, pre_tanh_value=raw_actions) 133 | return log_prob.sum(-1) 134 | 135 | def forward( 136 | self, 137 | obs, 138 | state=None, 139 | infor={}, 140 | reparameterize=True, 141 | ): 142 | """ 143 | :param obs: Observation 144 | :param deterministic: If True, do not sample 145 | :param return_log_prob: If True, return a sample and its log probability 146 | """ 147 | logits, h = self.preprocess(obs, state) 148 | mean = self.mu(logits) 149 | 150 | if self._c_sigma: 151 | log_std = torch.clamp( 152 | self.sigma(logits), min=self.LOG_SIG_MIN, max=self.LOG_SIG_MAX 153 | ) 154 | std = log_std.exp() 155 | else: 156 | shape = [1] * len(mean.shape) 157 | shape[1] = -1 158 | log_std = (self.sigma.view(shape) + torch.zeros_like(mean)) 159 | std = log_std.exp() 160 | 161 | return TanhNormal(mean, std, max_action=self._max, min_action=-self._max) 162 | 163 | def policy_infer(self, obs): 164 | return self(obs).mode 165 | -------------------------------------------------------------------------------- /offlinerl/utils/net/terminal_check.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def termination_fn_halfcheetah(obs, act, next_obs): 5 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 6 | 7 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) 8 | done = ~not_done 9 | done = done[:, None] 10 | return done 11 | 12 | def termination_fn_hopper(obs, act, next_obs): 13 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 14 | 15 | height = next_obs[:, 0] 16 | angle = next_obs[:, 1] 17 | not_done = 
np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) * \ 18 | np.isfinite(next_obs).all(axis=-1) \ 19 | * np.abs(next_obs[:,1:] < 100).all(axis=-1) \ 20 | * (height > .7) \ 21 | * (np.abs(angle) < .2) 22 | 23 | done = ~not_done 24 | done = done[:,None] 25 | return done 26 | 27 | def termination_fn_halfcheetahveljump(obs, act, next_obs): 28 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 29 | 30 | done = np.array([False]).repeat(len(obs)) 31 | done = done[:,None] 32 | return done 33 | 34 | def termination_fn_antangle(obs, act, next_obs): 35 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 36 | 37 | x = next_obs[:, 0] 38 | not_done = np.isfinite(next_obs).all(axis=-1) \ 39 | * (x >= 0.2) \ 40 | * (x <= 1.0) 41 | 42 | done = ~not_done 43 | done = done[:,None] 44 | return done 45 | 46 | def termination_fn_ant(obs, act, next_obs): 47 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 48 | 49 | x = next_obs[:, 0] 50 | not_done = np.isfinite(next_obs).all(axis=-1) \ 51 | * (x >= 0.2) \ 52 | * (x <= 1.0) 53 | 54 | done = ~not_done 55 | done = done[:,None] 56 | return done 57 | 58 | def termination_fn_walker2d(obs, act, next_obs): 59 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 60 | 61 | height = next_obs[:, 0] 62 | angle = next_obs[:, 1] 63 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) \ 64 | * (height > 0.8) \ 65 | * (height < 2.0) \ 66 | * (angle > -1.0) \ 67 | * (angle < 1.0) 68 | done = ~not_done 69 | done = done[:,None] 70 | return done 71 | 72 | def termination_fn_point2denv(obs, act, next_obs): 73 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 74 | 75 | done = np.array([False]).repeat(len(obs)) 76 | done = done[:,None] 77 | return done 78 | 79 | def termination_fn_point2dwallenv(obs, act, next_obs): 80 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 81 | 82 | done = np.array([False]).repeat(len(obs)) 83 | done = done[:,None] 84 | return done 85 | 86 | def termination_fn_pendulum(obs, act, next_obs): 87 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 88 | 89 | done = np.zeros((len(obs), 1)) 90 | return done 91 | 92 | def termination_fn_humanoid(obs, act, next_obs): 93 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 94 | 95 | z = next_obs[:,0] 96 | done = (z < 1.0) + (z > 2.0) 97 | 98 | done = done[:,None] 99 | return done 100 | 101 | def termination_fn_pen(obs, act, next_obs): 102 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 103 | 104 | obj_pos = next_obs[:, 24:27] 105 | done = obj_pos[:, 2] < 0.075 106 | 107 | done = done[:,None] 108 | return done 109 | 110 | def terminaltion_fn_door(obs, act, next_obs): 111 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 112 | 113 | done = np.array([False] * obs.shape[0]) 114 | 115 | done = done[:, None] 116 | return done 117 | 118 | def is_terminal(obs,act, next_obs,task): 119 | if 'halfcheetahvel' in task: 120 | return termination_fn_halfcheetahveljump(obs, act, next_obs) 121 | elif 'halfcheetah' in task: 122 | return termination_fn_halfcheetah(obs, act, next_obs) 123 | elif 'hopper' in task: 124 | return termination_fn_hopper(obs,act,next_obs) 125 | elif 'antangle' in task: 126 | return termination_fn_antangle(obs,act,next_obs) 127 | elif 'ant' in task: 128 | return termination_fn_ant(obs, act, next_obs) 129 | elif 'walker2d' in task: 130 | return 
termination_fn_walker2d(obs, act, next_obs) 131 | elif 'point2denv' in task: 132 | return termination_fn_point2denv(obs, act, next_obs) 133 | elif 'point2dwallenv' in task: 134 | return termination_fn_point2dwallenv(obs,act, next_obs) 135 | elif 'pendulum' in task: 136 | return termination_fn_pendulum(obs,act,next_obs) 137 | elif 'humanoid' in task: 138 | return termination_fn_humanoid(obs, act, next_obs) 139 | 140 | def get_termination_fn(task): 141 | if 'halfcheetahvel' in task: 142 | return termination_fn_halfcheetahveljump 143 | elif 'halfcheetah' in task: 144 | return termination_fn_halfcheetah 145 | elif 'hopper' in task: 146 | return termination_fn_hopper 147 | elif 'antangle' in task: 148 | return termination_fn_antangle 149 | elif 'ant' in task: 150 | return termination_fn_ant 151 | elif 'walker2d' in task: 152 | return termination_fn_walker2d 153 | elif 'point2denv' in task: 154 | return termination_fn_point2denv 155 | elif 'point2dwallenv' in task: 156 | return termination_fn_point2dwallenv 157 | elif 'pendulum' in task: 158 | return termination_fn_pendulum 159 | elif 'humanoid' in task: 160 | return termination_fn_humanoid 161 | elif 'pen' in task: 162 | return termination_fn_pen 163 | elif 'door' in task: 164 | return terminaltion_fn_door 165 | elif task in ['Pipeline', 'DMSD', 'Fusion', 'Salespromotion', 'SafetyHalfCheetah']: 166 | def terminaltion_fn(obs, act, next_obs): 167 | data = { 168 | "obs" : obs, 169 | "action" : act, 170 | "next_obs" : next_obs, 171 | } 172 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 173 | 174 | done = np.zeros((len(obs), 1)) 175 | return done 176 | return terminaltion_fn 177 | elif 'RandomFrictionHopper' in task: 178 | from neorl2.envs.terminated.randomfrictionhopper_terminated import get_terminated 179 | 180 | def terminaltion_fn(obs, act, next_obs): 181 | data = { 182 | "obs" : obs, 183 | "action" : act, 184 | "next_obs" : next_obs, 185 | } 186 | return np.bool_(get_terminated(data)) 187 | 188 | return terminaltion_fn 189 | elif 'Simglucose' in task: 190 | from neorl2.envs.terminated.simglucose_terminated import get_terminated 191 | 192 | def terminaltion_fn(obs, act, next_obs): 193 | data = { 194 | "obs" : obs, 195 | "action" : act, 196 | "next_obs" : next_obs, 197 | } 198 | return np.bool_(get_terminated(data)) 199 | 200 | return terminaltion_fn 201 | elif 'RocketRecovery' in task: 202 | from neorl2.envs.terminated.rocketrecovery_terminated import get_terminated 203 | 204 | def terminaltion_fn(obs, act, next_obs): 205 | data = { 206 | "obs" : obs, 207 | "action" : act, 208 | "next_obs" : next_obs, 209 | } 210 | return np.bool_(get_terminated(data)) 211 | 212 | return terminaltion_fn 213 | 214 | else: 215 | raise NotImplementedError(f"Task {task} not implemented") -------------------------------------------------------------------------------- /offlinerl/utils/net/vae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from offlinerl.utils.net.common import BasePolicy 6 | 7 | class VAE(nn.Module, BasePolicy): 8 | def __init__(self, 9 | state_dim, 10 | action_dim, 11 | latent_dim, 12 | max_action, 13 | hidden_size=750): 14 | super(VAE, self).__init__() 15 | 16 | self.e1 = nn.Linear(state_dim + action_dim, hidden_size) 17 | self.e2 = nn.Linear(hidden_size, hidden_size) 18 | 19 | self.mean = nn.Linear(hidden_size, latent_dim) 20 | self.log_std = nn.Linear(hidden_size, latent_dim) 21 | 22 | self.d1 = 
nn.Linear(state_dim + latent_dim, hidden_size) 23 | self.d2 = nn.Linear(hidden_size, hidden_size) 24 | self.d3 = nn.Linear(hidden_size, action_dim) 25 | 26 | self.max_action = max_action 27 | self.latent_dim = latent_dim 28 | 29 | self._actor = None 30 | 31 | def forward(self, state, action): 32 | z = F.relu(self.e1(torch.cat([state, action], 1))) 33 | z = F.relu(self.e2(z)) 34 | 35 | mean = self.mean(z) 36 | # Clamped for numerical stability 37 | log_std = self.log_std(z).clamp(-4, 15) 38 | std = torch.exp(log_std) 39 | z = mean + std * torch.randn_like(std) 40 | 41 | u = self.decode(state, z) 42 | 43 | return u, mean, std 44 | 45 | def decode(self, state, z=None, clip=None, raw=False): 46 | # When sampling from the VAE, the latent vector is clipped to [-0.5, 0.5] 47 | if z is None: 48 | z = torch.randn((state.shape[0], self.latent_dim)).to(state.device) 49 | if clip is not None: 50 | z = z.clamp(-clip, clip) 51 | 52 | a = F.relu(self.d1(torch.cat([state, z], 1))) 53 | a = F.relu(self.d2(a)) 54 | a = self.d3(a) 55 | if raw: 56 | return a 57 | return self.max_action * torch.tanh(a) 58 | 59 | def policy_infer(self, obs): 60 | return self.decode(obs, z=self._actor(obs)[0]) 61 | 62 | class ActorPerturbation(nn.Module, BasePolicy): 63 | def __init__(self, state_dim, action_dim, latent_action_dim, max_action, max_latent_action=2, phi=0.05): 64 | super(ActorPerturbation, self).__init__() 65 | 66 | self.hidden_size = (400, 300, 400, 300) 67 | 68 | self.l1 = nn.Linear(state_dim, self.hidden_size[0]) 69 | self.l2 = nn.Linear(self.hidden_size[0], self.hidden_size[1]) 70 | self.l3 = nn.Linear(self.hidden_size[1], latent_action_dim) 71 | 72 | self.l4 = nn.Linear(state_dim + action_dim, self.hidden_size[2]) 73 | self.l5 = nn.Linear(self.hidden_size[2], self.hidden_size[3]) 74 | self.l6 = nn.Linear(self.hidden_size[3], action_dim) 75 | 76 | self.max_latent_action = max_latent_action 77 | self.max_action = max_action 78 | self.phi = phi 79 | 80 | self.vae = None 81 | 82 | def forward(self, state, decoder): 83 | a = F.relu(self.l1(state)) 84 | a = F.relu(self.l2(a)) 85 | latent_action = self.max_latent_action * torch.tanh(self.l3(a)) 86 | 87 | mid_action = decoder(state, z=latent_action) 88 | 89 | a = F.relu(self.l4(torch.cat([state, mid_action], 1))) 90 | a = F.relu(self.l5(a)) 91 | a = self.phi * torch.tanh(self.l6(a)) 92 | final_action = (a + mid_action).clamp(-self.max_action, self.max_action) 93 | return latent_action, mid_action, final_action 94 | 95 | def policy_infer(self, obs): 96 | 97 | return self(obs, self.vae.decode)[-1] -------------------------------------------------------------------------------- /offlinerl/utils/replay_pool.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class ReplayPool(object): 5 | """A class used to save and replay data.""" 6 | 7 | @abc.abstractmethod 8 | def add_sample(self, sample): 9 | """Add a transition tuple.""" 10 | pass 11 | 12 | @abc.abstractmethod 13 | def terminate_episode(self): 14 | """Clean up pool after episode termination.""" 15 | pass 16 | 17 | @property 18 | @abc.abstractmethod 19 | def size(self, **kwargs): 20 | pass 21 | 22 | def add_path(self, path): 23 | """Add a rollout to the replay pool. 24 | 25 | This default implementation naively goes through every step, but you 26 | may want to optimize this. 27 | 28 | NOTE: You should NOT call "terminate_episode" after calling add_path. 29 | It's assumed that this function handles the episode termination. 
30 | 31 | :param path: Dict like one outputted by railrl.samplers.util.rollout 32 | """ 33 | self.add_samples(path) 34 | self.terminate_episode() 35 | 36 | @abc.abstractmethod 37 | def random_batch(self, batch_size): 38 | """Return a random batch of size `batch_size`.""" 39 | pass 40 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from setuptools import setup 5 | from setuptools import find_packages 6 | 7 | def get_version() -> str: 8 | # https://packaging.python.org/guides/single-sourcing-package-version/ 9 | init = open(os.path.join("offlinerl", "__init__.py"), "r").read().split() 10 | return init[init.index("__version__") + 2][1:-1] 11 | 12 | setup( 13 | name='offlinerl', 14 | description="A Library for Offline RL(Batch RL)", 15 | url="https://agit.ai/Polixir/OfflineRL", 16 | version=get_version(), 17 | packages=find_packages(), 18 | author="SongyiGao", 19 | author_email="songyigao@gmail.com", 20 | python_requires=">=3.7", 21 | install_requires=[ 22 | "aim", 23 | "fire", 24 | "loguru", 25 | "gym", 26 | "scikit-learn", 27 | "gtimer", 28 | "numpy", 29 | "ray==2.9", 30 | "aioredis==1.3.1", 31 | "aiohttp==3.7.4", 32 | ], 33 | 34 | ) 35 | --------------------------------------------------------------------------------
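The termination functions collected in offlinerl/utils/net/terminal_check.py are used by the model-based algorithms to decide which imagined transitions end an episode when rolling out a learned dynamics model. After an editable install of the package (pip install -e .), they can be exercised directly; a minimal sketch follows, where the task name and array shapes are illustrative assumptions.

import numpy as np
from offlinerl.utils.net.terminal_check import get_termination_fn

# 'hopper-medium-v2' is matched by the 'hopper' substring; 11 and 3 are Hopper's obs/act sizes
term_fn = get_termination_fn("hopper-medium-v2")

obs = np.random.randn(8, 11)
act = np.random.uniform(-1.0, 1.0, size=(8, 3))
next_obs = np.random.randn(8, 11)

done = term_fn(obs, act, next_obs)  # boolean array of shape (batch, 1)
print(done.shape)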