├── .gitignore ├── LICENSE ├── README.md ├── examples ├── model_tune.py ├── train_d4rl.py ├── train_task.py └── train_tune.py ├── offlinerl ├── __init__.py ├── algo │ ├── __init__.py │ ├── base.py │ ├── dynamics_model │ │ ├── __init__.py │ │ └── bc_model.py │ ├── modelbase │ │ ├── __init__.py │ │ ├── bremen.py │ │ ├── combo.py │ │ ├── maple.py │ │ ├── maple_new.py │ │ ├── mobile.py │ │ ├── model_base.py │ │ ├── moose.py │ │ ├── mopo.py │ │ └── rambo.py │ ├── modelfree │ │ ├── __init__.py │ │ ├── bc.py │ │ ├── bcq.py │ │ ├── bcqd.py │ │ ├── cql.py │ │ ├── crr.py │ │ ├── edac.py │ │ ├── mcq.py │ │ ├── plas.py │ │ ├── prdc.py │ │ └── td3bc.py │ └── online │ │ ├── __init__.py │ │ └── bremen.py ├── config │ ├── __init__.py │ └── algo │ │ ├── __init__.py │ │ ├── bc_config.py │ │ ├── bc_model_config.py │ │ ├── bcq_config.py │ │ ├── bcqd_config.py │ │ ├── bremen_config.py │ │ ├── combo_config.py │ │ ├── cql_config.py │ │ ├── crr_config.py │ │ ├── edac_config.py │ │ ├── maple_config.py │ │ ├── maple_config_new.py │ │ ├── mcq_config.py │ │ ├── mobile_config.py │ │ ├── moose_config.py │ │ ├── mopo_config.py │ │ ├── plas_config.py │ │ ├── prdc_config.py │ │ ├── rambo_config.py │ │ └── td3bc_config.py ├── data │ ├── __init__.py │ ├── d4rl.py │ └── neorl.py ├── evaluation │ ├── __init__.py │ ├── d4rl.py │ ├── fqe.py │ ├── gym.py │ └── neorl.py ├── outside_utils │ ├── buffer │ │ ├── __init__.py │ │ └── buffer.py │ ├── dynamics │ │ ├── __init__.py │ │ ├── base_dynamics.py │ │ ├── ensemble_dynamics.py │ │ ├── mujoco_oracle_dynamics.py │ │ └── rnn_dynamics.py │ ├── modules │ │ ├── __init__.py │ │ ├── actor_module.py │ │ ├── critic_module.py │ │ ├── dist_module.py │ │ ├── dynamics_module.py │ │ └── ensemble_critic_module.py │ ├── nets │ │ ├── __init__.py │ │ ├── ensemble_linear.py │ │ ├── mlp.py │ │ ├── rnn.py │ │ └── vae.py │ └── utils │ │ ├── __init__.py │ │ ├── logger.py │ │ ├── scaler.py │ │ └── termination_fns.py └── utils │ ├── __init__.py │ ├── config.py │ ├── data.py │ ├── env.py │ ├── exp.py │ ├── flexible_replay_pool.py │ ├── function.py │ ├── io.py │ ├── loader.py │ ├── logger.py │ ├── net │ ├── __init__.py │ ├── bcq_net.py │ ├── common.py │ ├── continuous.py │ ├── maple_actor.py │ ├── mlas.py │ ├── model │ │ ├── __init__.py │ │ ├── ensemble.py │ │ ├── maple_critic.py │ │ └── new_ensemble.py │ ├── model_GRU.py │ ├── moose.py │ ├── tanhpolicy.py │ ├── terminal_check.py │ └── vae.py │ ├── replay_pool.py │ └── simple_replay_pool.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | 7 | # define 8 | test/ 9 | .aim* 10 | offlinerl_tmp/ 11 | 12 | # C extensions 13 | *.so 14 | 15 | # .idea folder 16 | .idea/ 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | cover/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | *.out 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | # For a library or package, you might want to ignore these files since the code is 97 | # intended to run in multiple environments; otherwise, check them in: 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 108 | __pypackages__/ 109 | 110 | # Celery stuff 111 | celerybeat-schedule 112 | celerybeat.pid 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # Environments 118 | .env 119 | .venv 120 | venv/ 121 | ENV/ 122 | env.bak/ 123 | venv.bak/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | 143 | # pytype static type analyzer 144 | .pytype/ 145 | 146 | # customize 147 | log/ 148 | MUJOCO_LOG.TXT 149 | *.pth 150 | .vscode/ 151 | .DS_Store 152 | *.zip 153 | *.pstats 154 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OfflineRL 2 | 3 | OfflineRL is a repository for Offline RL (batch reinforcement learning or offline reinforcement learning). 4 | 5 | ## Re-implemented Algorithms 6 | ### Model-free methods 7 | - **CRR**: Wang, Ziyu, et al. “Critic Regularized Regression.” Advances in Neural Information Processing Systems, vol. 33, 2020, pp. 7768–7778. [paper](https://arxiv.org/abs/2006.15134) 8 | - **CQL**: Kumar, Aviral, et al. “Conservative Q-Learning for Offline Reinforcement Learning.” Advances in Neural Information Processing Systems, vol. 33, 2020. [paper](https://arxiv.org/abs/2006.04779) [code](https://github.com/aviralkumar2907/CQL) 9 | - **PLAS**: Zhou, Wenxuan, et al. “PLAS: Latent Action Space for Offline Reinforcement Learning.” ArXiv Preprint ArXiv:2011.07213, 2020. 10 | [website](https://sites.google.com/view/latent-policy) [paper](https://arxiv.org/abs/2011.07213) [code](https://github.com/Wenxuan-Zhou/PLAS) 11 | - **BCQ**: Fujimoto, Scott, et al. “Off-Policy Deep Reinforcement Learning without Exploration.” International Conference on Machine Learning, 2018, pp. 2052–2062. 
[paper](https://arxiv.org/abs/1812.02900) [code](https://github.com/sfujim/BCQ) 12 | - **EDAC**: An, Gaon, et al. "Uncertainty-based offline reinforcement learning with diversified q-ensemble." Advances in neural information processing systems 34 (2021): 7436-7447. [paper](https://arxiv.org/abs/2110.01548) [code](https://github.com/snu-mllab/EDAC) 13 | - **MCQ**: Lyu, Jiafei, et al. "Mildly conservative q-learning for offline reinforcement learning." Advances in Neural Information Processing Systems 35 (2022): 1711-1724. [paper](https://arxiv.org/abs/2206.04745) [code](https://github.com/dmksjfl/MCQ) 14 | - **TD3BC**: Fujimoto, Scott, and Shixiang Shane Gu. "A minimalist approach to offline reinforcement learning." Advances in neural information processing systems 34 (2021): 20132-20145. [paper](https://arxiv.org/abs/2106.06860) [code](https://github.com/sfujim/TD3_BC) 15 | - **PRDC**: Ran, Yuhang, et al. “Policy Regularization with Dataset Constraint for Offline Reinforcement Learning.” International Conference on Machine Learning, 2023, pp. 28701-28717. [paper](https://arxiv.org/abs/2306.06569) [code](https://github.com/LAMDA-RL/PRDC) 16 | ### Model-based methods 17 | - **BREMEN**: Matsushima, Tatsuya, et al. “Deployment-Efficient Reinforcement Learning via Model-Based Offline Optimization.” International Conference on Learning Representations, 2021. [paper](https://openreview.net/forum?id=3hGNqpI4WS) [code](https://github.com/matsuolab/BREMEN) 18 | - **COMBO**: Yu, Tianhe, et al. "COMBO: Conservative Offline Model-Based Policy Optimization." arXiv preprint arXiv:2102.08363 (2021). [paper](https://arxiv.org/abs/2102.08363) 19 | - **MOPO**: Yu, Tianhe, et al. “MOPO: Model-Based Offline Policy Optimization.” Advances in Neural Information Processing Systems, vol. 33, 2020. [paper](https://papers.nips.cc/paper/2020/hash/a322852ce0df73e204b7e67cbbef0d0a-Abstract.html) [code](https://github.com/tianheyu927/mopo) 20 | - **MAPLE**: Xiong-Hui Chen, et al. "MAPLE: Offline Model-based Adaptable Policy Learning". Advances in Neural Information Processing Systems, vol. 34, 2021. [paper](https://proceedings.neurips.cc/paper/2021/hash/470e7a4f017a5476afb7eeb3f8b96f9b-Abstract.html) [code](https://github.com/xionghuichen/MAPLE) 21 | - **MOBILE**: Yihao Sun, et al. "Model-Bellman Inconsistency for Model-based Offline Reinforcement Learning". Proceedings of the 40th International Conference on Machine Learning, PMLR 202:33177-33194, 2023. [paper](https://proceedings.mlr.press/v202/sun23q.html) [code](https://github.com/yihaosun1124/mobile) 22 | - **RAMBO**: Rigter, Marc, Bruno Lacerda, and Nick Hawes. "Rambo-rl: Robust adversarial model-based offline reinforcement learning." Advances in neural information processing systems 35 (2022): 16082-16097. [paper](https://arxiv.org/abs/2204.12581) [code](https://github.com/marc-rigter/rambo) 23 | 24 | ## Install Datasets 25 | ### NeoRL 26 | 27 | ```shell 28 | git clone https://github.com/Polixir/neorl.git 29 | cd neorl 30 | pip install -e . 31 | ``` 32 | 33 | For more details on use, please see [neorl](https://github.com/Polixir/neorl). 34 | 35 | ### D4RL (Optional) 36 | ```shell 37 | pip install git+https://github.com/rail-berkeley/d4rl@master#egg=d4rl 38 | ``` 39 | 40 | For more details on use, please see [d4rl](https://github.com/rail-berkeley/d4rl). 41 | 42 | ## Install offlinerl 43 | 44 | ```shell 45 | pip install -e . 
46 | ``` 47 | 48 | ## Example 49 | 50 | ```shell 51 | # Train on the HalfCheetah-v3-L-9 task with the default parameters of the cql algorithm 52 | python examples/train_task.py --algo_name=cql --exp_name=halfcheetah --task HalfCheetah-v3 --task_data_type low --task_train_num 100 53 | 54 | # Train on the SafetyHalfCheetah task with the default parameters of the mcq algorithm 55 | python examples/train_task.py --algo_name=mcq --exp_name=SafetyHalfCheetah --task SafetyHalfCheetah 56 | 57 | # Parameter search in the default parameter space with the cql algorithm on the HalfCheetah-v3-L-9 task 58 | python examples/train_tune.py --algo_name=cql --exp_name=halfcheetah --task HalfCheetah-v3 --task_data_type low --task_train_num 100 59 | 60 | # Parameter search in the default parameter space with the mcq algorithm on the SafetyHalfCheetah task 61 | # python examples/train_tune.py --algo_name=mcq --exp_name=SafetyHalfCheetah --task SafetyHalfCheetah 62 | 63 | # Train on the D4RL halfcheetah-medium task with the default parameters of the cql algorithm (D4RL needs to be installed) 64 | python examples/train_d4rl.py --algo_name=cql --exp_name=d4rl-halfcheetah-medium-cql --task d4rl-halfcheetah-medium-v0 65 | ``` 66 | 67 | **Parameters:** 68 | 69 | - **algo_name**: Algorithm name. Supported algorithms include bc, bcq, cql, plas, mopo and the other entries of `algo_dict` in `offlinerl/algo/__init__.py`. 70 | - **exp_name**: Experiment name, used to organize results for visualization in Aim. 71 | - **task**: Task name. See [neorl](https://github.com/Polixir/neorl/wiki/Tasks) for details. 72 | - **task_data_type**: Data quality level. For each task, [neorl](https://github.com/Polixir/neorl) collects data with low-, medium-, and high-level policies. 73 | - **task_train_num**: Number of training trajectories. For each task, neorl provides up to 10000 training trajectories. 74 | 75 | 76 | 77 | ## View experimental results 78 | We use **Aim** to store and visualize results. Aim is an experiment logger that makes it easy to manage thousands of experiments. For more details, see [aim](https://github.com/aimhubio/aim). 79 | 80 | To visualize the results in this repository: 81 | ```shell 82 | cd offlinerl_tmp 83 | aim up 84 | ``` 85 | Then you can view the results at http://127.0.0.1:43800.
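The commands above can also be reproduced programmatically. Below is a minimal sketch that mirrors `examples/train_task.py`; the algorithm and task names are only examples, and any installed NeoRL task can be substituted.

```python
# Minimal programmatic training sketch (mirrors examples/train_task.py).
# Assumes neorl and this package are installed; the algo/task values are examples.
from offlinerl.algo import algo_select
from offlinerl.data import load_data_from_neorl
from offlinerl.evaluation import OnlineCallBackFunction

kwargs = {"algo_name": "cql", "exp_name": "halfcheetah",
          "task": "HalfCheetah-v3", "task_data_type": "low", "task_train_num": 100}

# Resolve the algorithm module and its default config, then load the offline data.
algo_init_fn, algo_trainer_obj, algo_config = algo_select(kwargs)
train_buffer, val_buffer = load_data_from_neorl(
    algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"])

# Build the trainer and the online evaluation callback, then train.
algo_trainer = algo_trainer_obj(algo_init_fn(algo_config), algo_config)
callback = OnlineCallBackFunction()
callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer, task=algo_config["task"])
algo_trainer.train(train_buffer, None, callback_fn=callback)
```

As in the scripts, metrics are tracked with Aim and policies are saved under the run's `models` directory (see `offlinerl/algo/base.py`).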
86 | 87 | 88 | ## Model-based Running Example 89 | 90 | ```python 91 | # Tune and save the transition models 92 | python examples/model_tune.py --algo_name bc_model --exp_name neorl-RandomFrictionHopper-model --task RandomFrictionHopper 93 | ``` 94 | 95 | ```python 96 | # Training MOPO and load the best transition model 97 | python examples/train_task.py --algo_name mopo --exp_name neorl-safecheetah-mopo-new --task SafetyHalfCheetah --dynamics_path best_run_id 98 | 99 | # Training COMBO and load the best transition model 100 | python examples/train_task.py --algo_name combo --exp_name neorl-safecheetah-combo-new --task SafetyHalfCheetah --dynamics_path best_run_id 101 | 102 | # Training RAMBO and load the best transition model 103 | python examples/train_task.py --algo_name rambo --exp_name neorl-safecheetah-rambo-new --task SafetyHalfCheetah --dynamics_path best_run_id 104 | 105 | # Training MOBILE and load the best transition model 106 | python examples/train_task.py --algo_name mobile --exp_name neorl-safecheetah-mobile-new --task SafetyHalfCheetah --dynamics_path best_run_id 107 | ``` 108 | -------------------------------------------------------------------------------- /examples/model_tune.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import random 3 | from ray import tune 4 | 5 | from offlinerl.algo import algo_select 6 | from offlinerl.data import load_data_from_neorl 7 | from offlinerl.evaluation import get_defalut_callback, ModelCallBackFunction 8 | 9 | def training_function(config): 10 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(config["kwargs"]) 11 | train_buffer, val_buffer = load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"]) 12 | algo_config.update(config) 13 | algo_config["device"] = "cuda" 14 | algo_init = algo_init_fn(algo_config) 15 | algo_trainer = algo_trainer_obj(algo_init, algo_config) 16 | 17 | callback = ModelCallBackFunction() 18 | callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer, task=algo_config["task"]) 19 | 20 | score = algo_trainer.train(train_buffer, None, callback_fn=callback) 21 | 22 | # return score 23 | return 0 24 | 25 | 26 | def run_algo(**kwargs): 27 | config = {} 28 | config["kwargs"] = kwargs 29 | config["kwargs"]['seed'] = random.randint(0, 1000000) 30 | _, _, algo_config = algo_select(kwargs) 31 | # Prepare Dataset 32 | load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"]) 33 | grid_tune = algo_config["grid_tune"] 34 | for k,v in grid_tune.items(): 35 | config[k] = tune.grid_search(v) 36 | 37 | analysis = tune.run( 38 | training_function, 39 | config=config, 40 | resources_per_trial={"gpu": 0.5}, 41 | ) 42 | 43 | 44 | if __name__ == "__main__": 45 | fire.Fire(run_algo) 46 | -------------------------------------------------------------------------------- /examples/train_d4rl.py: -------------------------------------------------------------------------------- 1 | import fire 2 | 3 | from offlinerl.algo import algo_select 4 | from offlinerl.data.d4rl import load_d4rl_buffer 5 | from offlinerl.evaluation import OnlineCallBackFunction 6 | 7 | 8 | def run_algo(**kwargs): 9 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(kwargs) 10 | train_buffer = load_d4rl_buffer(algo_config["task"]) 11 | algo_init = algo_init_fn(algo_config) 12 | algo_trainer = algo_trainer_obj(algo_init, algo_config) 13 | callback = OnlineCallBackFunction() 14 | 
callback.initialize(train_buffer=train_buffer, val_buffer=None, 15 | task=algo_config["task"], number_of_runs=algo_config.get("eval_episodes",100)) 16 | 17 | algo_trainer.train(train_buffer, None, callback_fn=callback) 18 | 19 | if __name__ == "__main__": 20 | fire.Fire(run_algo) 21 | -------------------------------------------------------------------------------- /examples/train_task.py: -------------------------------------------------------------------------------- 1 | import fire 2 | 3 | from offlinerl.algo import algo_select 4 | from offlinerl.data import load_data_from_neorl 5 | from offlinerl.evaluation import get_defalut_callback, OnlineCallBackFunction 6 | 7 | 8 | def run_algo(**kwargs): 9 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(kwargs) 10 | train_buffer, val_buffer = load_data_from_neorl(algo_config["task"], 11 | algo_config["task_data_type"], algo_config["task_train_num"]) 12 | algo_config['data_name'] = "neorl2-" + algo_config["task"] 13 | algo_init = algo_init_fn(algo_config) 14 | algo_trainer = algo_trainer_obj(algo_init, algo_config) 15 | callback = OnlineCallBackFunction() 16 | callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer, 17 | task=algo_config["task"], number_of_runs=algo_config.get("eval_episodes",100)) 18 | 19 | algo_trainer.train(train_buffer, None, callback_fn=callback) 20 | 21 | if __name__ == "__main__": 22 | fire.Fire(run_algo) 23 | -------------------------------------------------------------------------------- /examples/train_tune.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import random 3 | from ray import tune 4 | 5 | from offlinerl.algo import algo_select 6 | from offlinerl.data import load_data_from_neorl 7 | from offlinerl.evaluation import get_defalut_callback, OnlineCallBackFunction 8 | 9 | def training_function(config): 10 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(config["kwargs"]) 11 | train_buffer, val_buffer = load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"]) 12 | algo_config.update(config) 13 | algo_config["device"] = "cuda" 14 | algo_init = algo_init_fn(algo_config) 15 | algo_trainer = algo_trainer_obj(algo_init, algo_config) 16 | 17 | callback = OnlineCallBackFunction() 18 | callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer, task=algo_config["task"]) 19 | 20 | score = algo_trainer.train(train_buffer, None, callback_fn=callback) 21 | 22 | # return score 23 | return 0 24 | 25 | 26 | def run_algo(**kwargs): 27 | config = {} 28 | config["kwargs"] = kwargs 29 | config["kwargs"]['seed'] = random.randint(0, 1000000) 30 | _, _, algo_config = algo_select(kwargs) 31 | # Prepare Dataset 32 | load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"]) 33 | grid_tune = algo_config["grid_tune"] 34 | for k,v in grid_tune.items(): 35 | config[k] = tune.grid_search(v) 36 | 37 | analysis = tune.run( 38 | training_function, 39 | config=config, 40 | resources_per_trial={"gpu": 0.333333}, 41 | ) 42 | 43 | 44 | if __name__ == "__main__": 45 | fire.Fire(run_algo) -------------------------------------------------------------------------------- /offlinerl/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from loguru import logger 3 | 4 | from offlinerl import algo, data, evaluation, utils, config 5 | 6 | logger_config = { 7 | "handlers": [ 8 | {"sink": sys.stdout, 9 | 
"colorize" : True, 10 | #"format" : "{time} {message}", 11 | "format" : "{time:YYYY-MM-DD at HH:mm:ss.SSS} | {level} | {message}", 12 | "enqueue" : True, 13 | "backtrace" : True, 14 | "diagnose" : True, 15 | }, 16 | ], 17 | 18 | } 19 | logger.configure(**logger_config) 20 | 21 | #logger.disable("offlinerl") 22 | logger.enable("offlinerl") 23 | 24 | __version__ = "0.0.1" 25 | 26 | __all__ = [ 27 | "algo", 28 | "data", 29 | "evaluation", 30 | "utils", 31 | "config", 32 | ] -------------------------------------------------------------------------------- /offlinerl/algo/__init__.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | import warnings 3 | 4 | warnings.filterwarnings('ignore') 5 | 6 | 7 | from offlinerl.config.algo import edac_config, mcq_config, cql_config, plas_config, mopo_config, moose_config, bcqd_config, bcq_config, bc_config, crr_config, combo_config, bremen_config, maple_config, mobile_config, rambo_config, td3bc_config, bc_model_config, maple_config_new,prdc_config 8 | from offlinerl.utils.config import parse_config 9 | from offlinerl.algo.modelfree import cql, plas, bcqd, bcq, bc, crr, edac, mcq, td3bc, prdc 10 | from offlinerl.algo.modelbase import mopo, moose, combo, bremen, maple, mobile, rambo, maple_new 11 | from offlinerl.algo.dynamics_model import bc_model 12 | 13 | algo_dict = { 14 | 'edac' : {"algo" : edac, "config" : edac_config}, 15 | 'bc' : {"algo" : bc, "config" : bc_config}, 16 | 'bcq' : {"algo" : bcq, "config" : bcq_config}, 17 | 'mcq' : {"algo" : mcq, "config" : mcq_config}, 18 | 'bcqd' : {"algo" : bcqd, "config" : bcqd_config}, 19 | 'combo' : {"algo" : combo, "config" : combo_config}, 20 | "cql" : {"algo" : cql, "config" : cql_config}, 21 | "crr" : {"algo" : crr, "config" : crr_config}, 22 | "plas" : {"algo" : plas, "config" : plas_config}, 23 | "prdc" : {"algo" : prdc, "config" : prdc_config}, 24 | 'moose' : {"algo" : moose, "config" : moose_config}, 25 | 'mopo': {"algo" : mopo, "config": mopo_config}, 26 | 'bremen' : {"algo" : bremen, "config" : bremen_config}, 27 | 'maple': {'algo':maple , 'config':maple_config}, 28 | 'mobile': {'algo':mobile , 'config':mobile_config}, 29 | 'rambo': {'algo':rambo , 'config':rambo_config}, 30 | 'td3bc': {'algo':td3bc , 'config':td3bc_config}, 31 | 'bc_model': {'algo':bc_model , 'config':bc_model_config}, 32 | 'maple_new': {'algo':maple_new , 'config':maple_config_new}, 33 | } 34 | 35 | def algo_select(command_args, algo_config_module=None): 36 | algo_name = command_args["algo_name"] 37 | logger.info('Use {} algorithm!', algo_name) 38 | assert algo_name in algo_dict.keys() 39 | algo = algo_dict[algo_name]["algo"] 40 | 41 | if algo_config_module is None: 42 | algo_config_module = algo_dict[algo_name]["config"] 43 | algo_config = parse_config(algo_config_module) 44 | algo_config.update(command_args) 45 | 46 | algo_init = algo.algo_init 47 | algo_trainer = algo.AlgoTrainer 48 | 49 | return algo_init, algo_trainer, algo_config 50 | 51 | -------------------------------------------------------------------------------- /offlinerl/algo/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | import json 4 | from abc import ABC, abstractmethod 5 | 6 | import torch 7 | from collections import OrderedDict 8 | from loguru import logger 9 | from offlinerl.utils.exp import init_exp_run 10 | from offlinerl.utils.io import create_dir 11 | from offlinerl.utils.logger import log_path 12 | 13 | 14 | import 
time 15 | import random 16 | 17 | class BaseAlgo(ABC): 18 | def __init__(self, args): 19 | logger.info('Init AlgoTrainer') 20 | if "exp_name" not in args.keys(): 21 | exp_name = str(uuid.uuid1()).replace("-","") 22 | else: 23 | exp_name = args["exp_name"] 24 | 25 | if "aim_path" in args.keys(): 26 | if os.path.exists(args["aim_path"]): 27 | time.sleep(random.randint(1, 5)) 28 | repo = args["aim_path"] 29 | else: 30 | os.makedirs(args["aim_path"]) 31 | repo = args["aim_path"] 32 | else: 33 | repo = None 34 | 35 | self.repo = repo 36 | 37 | try: 38 | self.exp_run = init_exp_run(repo = repo, experiment_name = exp_name) 39 | except: 40 | time.sleep(random.randint(1, 5)) 41 | self.exp_run = init_exp_run(repo = repo, experiment_name = exp_name) 42 | 43 | if self.exp_run.repo is not None: # a naive fix of aim exp_logger.repo is None 44 | self.index_path = self.exp_run.repo.path 45 | else: 46 | repo = os.path.join(log_path(),"./.aim") 47 | if not os.path.exists(repo): 48 | logger.info('{} dir is not exist, create {}',repo, repo) 49 | os.system(str("cd " + os.path.join(repo,"../") + "&& aim init")) 50 | self.index_path = repo 51 | 52 | print(f'self.index_path/{self.index_path}') 53 | self.models_save_dir = os.path.join(self.index_path, "models") 54 | self.metric_logs = OrderedDict() 55 | self.metric_logs_path = os.path.join(self.index_path, "metric_logs.json") 56 | create_dir(self.models_save_dir) 57 | 58 | # self.exp_run.set_params(args, name='hparams') 59 | self.exp_run['hparams'] = args 60 | 61 | def log_res(self, epoch, result): 62 | logger.info('Epoch : {}', epoch) 63 | for k,v in result.items(): 64 | logger.info('{} : {}',k, v) 65 | self.exp_run.track(v, name=k.split(" ")[0], epoch=epoch,) 66 | 67 | self.metric_logs[str(epoch)] = result 68 | with open(self.metric_logs_path,"w") as f: 69 | json.dump(self.metric_logs,f) 70 | 71 | self.run_id = self.exp_run.name.split( )[-1] 72 | tmp_dir = os.path.join(self.models_save_dir, self.run_id) 73 | if not os.path.exists(tmp_dir): 74 | os.makedirs(tmp_dir) 75 | # self.save_model(os.path.join(tmp_dir, str(epoch) + ".pt")) 76 | self.save_model(os.path.join(tmp_dir, "policy.pt")) 77 | 78 | self.report_result = result 79 | self.report_result["hparams"] = self.exp_run['hparams'] 80 | self.report_result["model_path"] = os.path.join(tmp_dir, "policy.pt") 81 | 82 | 83 | @abstractmethod 84 | def train(self, 85 | history_buffer, 86 | eval_fn=None,): 87 | pass 88 | 89 | def _sync_weight(self, net_target, net, soft_target_tau = 5e-3): 90 | for o, n in zip(net_target.parameters(), net.parameters()): 91 | o.data.copy_(o.data * (1.0 - soft_target_tau) + n.data * soft_target_tau) 92 | 93 | @abstractmethod 94 | def get_policy(self,): 95 | pass 96 | 97 | #@abstractmethod 98 | def save_model(self, model_path): 99 | torch.save(self.get_policy(), model_path) 100 | 101 | #@abstractmethod 102 | def load_model(self, model_path): 103 | model = torch.load(model_path) 104 | 105 | return model -------------------------------------------------------------------------------- /offlinerl/algo/dynamics_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/dynamics_model/__init__.py -------------------------------------------------------------------------------- /offlinerl/algo/modelbase/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/modelbase/__init__.py -------------------------------------------------------------------------------- /offlinerl/algo/modelbase/mopo.py: -------------------------------------------------------------------------------- 1 | # MOPO: Model-based Offline Policy Optimization 2 | # https://arxiv.org/abs/2005.13239 3 | # https://github.com/tianheyu927/mopo 4 | import os 5 | import torch 6 | import numpy as np 7 | from copy import deepcopy 8 | from loguru import logger 9 | from collections import deque 10 | from typing import Dict 11 | 12 | from offlinerl.algo.modelbase.model_base import algo_init, ModelBasedAlgoTrainer 13 | 14 | 15 | class AlgoTrainer(ModelBasedAlgoTrainer): 16 | def __init__(self, algo_init, args): 17 | super(AlgoTrainer, self).__init__(algo_init, args) 18 | 19 | self.fake_buffer_size = self.args["model_retain_epochs"] * self.args["rollout_batch_size"] * self.args["horizon"] 20 | self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.actor_optim, args['max_epoch']) 21 | 22 | def policy_learn(self, batch: Dict): 23 | real_batch, fake_batch = batch["real"], batch["fake"] 24 | mix_batch = {k: torch.cat([real_batch[k], fake_batch[k]], 0) for k in real_batch.keys()} 25 | 26 | obss, actions, next_obss, rewards, terminals = mix_batch["observations"], mix_batch["actions"], \ 27 | mix_batch["next_observations"], mix_batch["rewards"], mix_batch["terminals"] 28 | 29 | # update critic 30 | q1, q2 = self.critic1(obss, actions), self.critic2(obss, actions) 31 | with torch.no_grad(): 32 | next_actions, next_log_probs = self.actforward(next_obss) 33 | next_q = torch.min( 34 | self.target_critic1(next_obss, next_actions), self.target_critic2(next_obss, next_actions) 35 | ) - self._alpha * next_log_probs 36 | target_q = rewards + self._gamma * (1 - terminals) * next_q 37 | 38 | critic1_loss = ((q1 - target_q).pow(2)).mean() 39 | self.critic1_optim.zero_grad() 40 | critic1_loss.backward() 41 | self.critic1_optim.step() 42 | 43 | critic2_loss = ((q2 - target_q).pow(2)).mean() 44 | self.critic2_optim.zero_grad() 45 | critic2_loss.backward() 46 | self.critic2_optim.step() 47 | 48 | # update actor 49 | a, log_probs = self.actforward(obss) 50 | q1a, q2a = self.critic1(obss, a), self.critic2(obss, a) 51 | 52 | actor_loss = - torch.min(q1a, q2a).mean() + self._alpha * log_probs.mean() 53 | self.actor_optim.zero_grad() 54 | actor_loss.backward() 55 | self.actor_optim.step() 56 | 57 | if self._is_auto_alpha: 58 | log_probs = log_probs.detach() + self._target_entropy 59 | alpha_loss = -(self._log_alpha * log_probs).mean() 60 | self.alpha_optim.zero_grad() 61 | alpha_loss.backward() 62 | self.alpha_optim.step() 63 | self._alpha = torch.clamp(self._log_alpha.detach().exp(), 0.0, 1.0) 64 | 65 | self._sync_weight() 66 | 67 | result = { 68 | "loss/actor": actor_loss.item(), 69 | "loss/critic1": critic1_loss.item(), 70 | "loss/critic2": critic2_loss.item(), 71 | } 72 | 73 | if self._is_auto_alpha: 74 | result["loss/alpha"] = alpha_loss.item() 75 | result["alpha"] = self._alpha.item() 76 | 77 | return result 78 | -------------------------------------------------------------------------------- /offlinerl/algo/modelfree/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/modelfree/__init__.py 
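A note on the critic target used in `AlgoTrainer.policy_learn` of `mopo.py` above: the target bootstraps from the minimum of the two target critics minus the entropy term `alpha * next_log_probs`, with terminal transitions masked out. A toy, self-contained sketch of just that computation, using made-up tensor values rather than the repository's networks:

```python
# Toy illustration of the SAC-style critic target computed in policy_learn above:
# target_q = r + gamma * (1 - done) * (min(Q1'(s', a'), Q2'(s', a')) - alpha * log pi(a'|s'))
import torch

gamma, alpha = 0.99, 0.2
rewards   = torch.tensor([[1.0], [0.5]])
terminals = torch.tensor([[0.0], [1.0]])            # 1 marks an episode end
next_q1   = torch.tensor([[10.0], [8.0]])           # stand-in for target_critic1(s', a')
next_q2   = torch.tensor([[9.5], [8.4]])            # stand-in for target_critic2(s', a')
next_log_probs = torch.tensor([[-1.2], [-0.8]])     # log pi(a'|s') from the actor

next_q   = torch.min(next_q1, next_q2) - alpha * next_log_probs
target_q = rewards + gamma * (1 - terminals) * next_q
print(target_q)  # terminal transitions keep only the immediate reward
```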
-------------------------------------------------------------------------------- /offlinerl/algo/modelfree/bc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from copy import deepcopy 3 | from loguru import logger 4 | 5 | from offlinerl.algo.base import BaseAlgo 6 | from offlinerl.utils.net.continuous import GaussianActor 7 | from offlinerl.utils.exp import setup_seed 8 | 9 | 10 | def algo_init(args): 11 | logger.info('Run algo_init function') 12 | 13 | setup_seed(args['seed']) 14 | 15 | if args["obs_shape"] and args["action_shape"]: 16 | obs_shape, action_shape = args["obs_shape"], args["action_shape"] 17 | max_action = args["max_action"] 18 | elif "task" in args.keys(): 19 | from offlinerl.utils.env import get_env_shape, get_env_action_range 20 | obs_shape, action_shape = get_env_shape(args['task']) 21 | max_action, _ = get_env_action_range(args["task"]) 22 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape 23 | else: 24 | raise NotImplementedError 25 | 26 | actor = GaussianActor(obs_shape, action_shape, args['actor_features'], args['actor_layers']).to(args['device']) 27 | actor_optim = torch.optim.Adam(actor.parameters(), lr=args['actor_lr']) 28 | 29 | return { 30 | "actor" : {"net" : actor, "opt" : actor_optim}, 31 | } 32 | 33 | 34 | class AlgoTrainer(BaseAlgo): 35 | def __init__(self, algo_init, args): 36 | super(AlgoTrainer, self).__init__(args) 37 | self.args = args 38 | 39 | self.actor = algo_init['actor']['net'] 40 | self.actor_optim = algo_init['actor']['opt'] 41 | 42 | self.batch_size = self.args['batch_size'] 43 | self.device = self.args['device'] 44 | 45 | self.best_actor = deepcopy(self.actor) 46 | self.best_loss = float('inf') 47 | 48 | def train(self, train_buffer, val_buffer, callback_fn): 49 | if val_buffer == None: 50 | from offlinerl.utils.data import SampleBatch 51 | ori_buffer = deepcopy(train_buffer) 52 | sep_len = int(len(ori_buffer)*0.1) 53 | val_buffer = SampleBatch(ori_buffer[-sep_len:]) 54 | train_buffer = SampleBatch(ori_buffer[:-sep_len]) 55 | # breakpoint() 56 | for epoch in range(self.args['max_epoch']): 57 | for i in range(self.args['steps_per_epoch']): 58 | batch_data = train_buffer.sample(self.batch_size) 59 | batch_data.to_torch(device=self.device) 60 | obs = batch_data['obs'] 61 | action = batch_data['act'] 62 | 63 | action_dist = self.actor(obs) 64 | # loss = - action_dist.log_prob(action).mean() 65 | loss = ((action_dist.mode - action) ** 2).mean() 66 | 67 | self.actor_optim.zero_grad() 68 | loss.backward() 69 | self.actor_optim.step() 70 | 71 | with torch.no_grad(): 72 | val_loss = 0 73 | for i in range(len(val_buffer) // self.batch_size + (len(val_buffer) % self.batch_size > 0)): 74 | batch_data = val_buffer[i*self.batch_size:(i+1)*self.batch_size] 75 | batch_data.to_torch(device=self.device) 76 | obs = batch_data['obs'] 77 | action = batch_data['act'] 78 | 79 | action_dist = self.actor(obs) 80 | val_loss += ((action_dist.mean - action) ** 2).mean().item() 81 | 82 | if val_loss < self.best_loss: 83 | self.best_loss = val_loss 84 | self.best_actor.load_state_dict(self.actor.state_dict()) 85 | 86 | res = callback_fn(self.get_policy()) 87 | res['loss'] = val_loss 88 | self.log_res(epoch, res) 89 | 90 | return self.report_result 91 | 92 | def get_policy(self): 93 | return self.best_actor 94 | -------------------------------------------------------------------------------- /offlinerl/algo/modelfree/bcqd.py: 
-------------------------------------------------------------------------------- 1 | #Discrete Batch-Constrained deep Q-Learning (BCQ) 2 | import copy 3 | 4 | import torch 5 | import numpy as np 6 | from torch import nn 7 | from torch import optim 8 | import torch.nn.functional as F 9 | from loguru import logger 10 | 11 | from offlinerl.algo.base import BaseAlgo 12 | from offlinerl.utils.net.bcq_net import Conv_Q, FC_Q 13 | from offlinerl.utils.exp import setup_seed 14 | 15 | 16 | def algo_init(args): 17 | logger.info('Run algo_init function') 18 | 19 | setup_seed(args['seed']) 20 | 21 | if args["obs_shape"] and args["action_shape"]: 22 | obs_shape, action_shape = args["obs_shape"], args["action_shape"] 23 | elif "task" in args.keys(): 24 | from offlinerl.utils.env import get_env_shape 25 | obs_shape, action_shape = get_env_shape(args['task']) 26 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape 27 | else: 28 | raise NotImplementedError 29 | 30 | if isinstance(args["obs_shape"], int): 31 | state_dim = ( 32 | 4, 33 | 84, 34 | 84 35 | ) 36 | 37 | critic = Conv_Q(state_dim[0], args["action_shape"]).to(args['device']) 38 | else: 39 | critic = FC_Q(np.prod(args["obs_shape"]), args["action_shape"]).to(args['device']) 40 | 41 | critic_opt = optim.Adam(critic.parameters(), **args["optimizer_parameters"]) 42 | 43 | 44 | nets = { 45 | "critic" : {"net" : critic, "opt" : critic_opt}, 46 | 47 | } 48 | 49 | return nets 50 | 51 | 52 | class AlgoTrainer(BaseAlgo): 53 | def __init__(self, algo_init, args): 54 | super(AlgoTrainer, self).__init__(args) 55 | self.args = args 56 | 57 | self.Q = algo_init["critic"]["net"] 58 | self.Q_target = copy.deepcopy(self.Q) 59 | self.Q_optimizer = algo_init["critic"]["opt"] 60 | 61 | self.discount = self.args["discount"] 62 | 63 | # Target update rule 64 | self.maybe_update_target = self.polyak_target_update if self.args["polyak_target_update"] else self.copy_target_update 65 | self.target_update_frequency = self.args["target_update_frequency"] 66 | self.tau = self.args["tau"] 67 | 68 | # Decay for eps 69 | self.initial_eps = self.args["initial_eps"] 70 | self.end_eps = self.args["end_eps"] 71 | self.slope = (self.end_eps - self.initial_eps) / self.args["eps_decay_period"] 72 | 73 | # Evaluation hyper-parameters 74 | self.state_shape = (-1,) + self.args["obs_shape"] if isinstance(self.args["obs_shape"], int) else (-1, self.args["obs_shape"]) 75 | self.eval_eps = self.args["eval_eps"] 76 | self.num_actions = self.args["action_shape"] 77 | 78 | # Threshold for "unlikely" actions 79 | self.threshold = self.args["BCQ_threshold"] 80 | 81 | # Number of training iterations 82 | self.iterations = 0 83 | 84 | def train(self, train_buffer, val_buffer, callback_fn): 85 | training_iters = 0 86 | while training_iters < self.args["max_timesteps"]: 87 | 88 | # Sample replay buffer 89 | batch = train_buffer.sample(self.args["batch_size"]) 90 | batch = batch.to_torch(dtype=torch.float32, device=self.args["device"]) 91 | reward = batch.rew 92 | done = batch.done 93 | state = batch.obs 94 | action = batch.act.to(torch.int64) 95 | next_state = batch.obs_next 96 | 97 | # Compute the target Q value 98 | with torch.no_grad(): 99 | q, imt, i = self.Q(next_state) 100 | imt = imt.exp() 101 | imt = (imt/imt.max(1, keepdim=True)[0] > self.threshold).float() 102 | 103 | # Use large negative number to mask actions from argmax 104 | next_action = (imt * q + (1 - imt) * -1e8).argmax(1, keepdim=True) 105 | 106 | q, imt, i = self.Q_target(next_state) 107 | target_Q = reward + done * 
self.discount * q.gather(1, next_action).reshape(-1, 1) 108 | 109 | # Get current Q estimate 110 | current_Q, imt, i = self.Q(state) 111 | 112 | current_Q = current_Q.gather(1, action) 113 | 114 | # Compute Q loss 115 | q_loss = F.smooth_l1_loss(current_Q, target_Q) 116 | i_loss = F.nll_loss(imt, action.reshape(-1)) 117 | 118 | Q_loss = q_loss + i_loss + 1e-2 * i.pow(2).mean() 119 | 120 | # Optimize the Q 121 | self.Q_optimizer.zero_grad() 122 | Q_loss.backward() 123 | self.Q_optimizer.step() 124 | 125 | # Update target network by polyak or full copy every X iterations. 126 | self.maybe_update_target() 127 | training_iters += 1 128 | #print(training_iters ,self.args["eval_freq"]) 129 | if training_iters % self.args["eval_freq"] == 0: 130 | res = callback_fn(self.get_policy()) 131 | 132 | self.log_res(training_iters // self.args["eval_freq"], res) 133 | 134 | return self.report_result 135 | 136 | 137 | def polyak_target_update(self): 138 | for param, target_param in zip(self.Q.parameters(), self.Q_target.parameters()): 139 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) 140 | 141 | 142 | def copy_target_update(self): 143 | if self.iterations % self.target_update_frequency == 0: 144 | self.Q_target.load_state_dict(self.Q.state_dict()) 145 | 146 | def save(self, filename): 147 | torch.save(self.Q.state_dict(), filename + "_Q") 148 | torch.save(self.Q_optimizer.state_dict(), filename + "_optimizer") 149 | 150 | 151 | def load(self, filename): 152 | self.Q.load_state_dict(torch.load(filename + "_Q")) 153 | self.Q_target = copy.deepcopy(self.Q) 154 | self.Q_optimizer.load_state_dict(torch.load(filename + "_optimizer")) 155 | 156 | def get_policy(self,): 157 | return self.Q 158 | 159 | def save_model(self): 160 | pass -------------------------------------------------------------------------------- /offlinerl/algo/modelfree/crr.py: -------------------------------------------------------------------------------- 1 | # Critic regularized regression 2 | # Paper: https://arxiv.org/abs/2006.15134 3 | 4 | import torch 5 | from copy import deepcopy 6 | from loguru import logger 7 | 8 | from offlinerl.algo.base import BaseAlgo 9 | from offlinerl.utils.net.common import Net 10 | from offlinerl.utils.net.continuous import DistributionalCritic 11 | from offlinerl.utils.net.tanhpolicy import TanhGaussianPolicy 12 | from offlinerl.utils.exp import setup_seed 13 | 14 | def algo_init(args): 15 | logger.info('Run algo_init function') 16 | 17 | setup_seed(args['seed']) 18 | 19 | if args["obs_shape"] and args["action_shape"]: 20 | obs_shape, action_shape = args["obs_shape"], args["action_shape"] 21 | max_action = args["max_action"] 22 | elif "task" in args.keys(): 23 | from offlinerl.utils.env import get_env_shape, get_env_action_range 24 | obs_shape, action_shape = get_env_shape(args['task']) 25 | max_action, _ = get_env_action_range(args["task"]) 26 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape 27 | else: 28 | raise NotImplementedError 29 | 30 | net_a = Net(layer_num=args['hidden_layers'], 31 | state_shape=obs_shape, 32 | hidden_layer_size=args['hidden_features']) 33 | 34 | actor = TanhGaussianPolicy(preprocess_net=net_a, 35 | action_shape=action_shape, 36 | hidden_layer_size=args['hidden_features'], 37 | conditioned_sigma=True).to(args['device']) 38 | 39 | actor_optim = torch.optim.Adam(actor.parameters(), lr=args['lr']) 40 | 41 | critic = DistributionalCritic(obs_shape, action_shape, args['atoms'], 42 | args['hidden_features'], args['hidden_layers'], 
43 | None, None).to(args['device']) 44 | critic_optim = torch.optim.Adam(critic.parameters(), lr=args['lr']) 45 | 46 | return { 47 | "actor" : {"net" : actor, "opt" : actor_optim}, 48 | "critic" : {"net" : critic, "opt" : critic_optim}, 49 | } 50 | 51 | 52 | class AlgoTrainer(BaseAlgo): 53 | def __init__(self, algo_init, args): 54 | super(AlgoTrainer, self).__init__(args) 55 | self.args = args 56 | 57 | self.actor = algo_init['actor']['net'] 58 | self.actor_target = deepcopy(self.actor) 59 | self.actor_target.requires_grad_(False) 60 | self.actor_optim = algo_init['actor']['opt'] 61 | 62 | self.critic = algo_init['critic']['net'] 63 | self.critic_target = deepcopy(self.critic) 64 | self.critic_target.requires_grad_(False) 65 | self.critic_optim = algo_init['critic']['opt'] 66 | 67 | self.batch_size = self.args['batch_size'] 68 | self.gamma = self.args['gamma'] 69 | self.beta = self.args['beta'] 70 | self.m = self.args['advantage_samples'] 71 | self.advantage_mode = self.args['advantage_mode'] 72 | self.weight_mode = self.args['weight_mode'] 73 | self.device = self.args['device'] 74 | 75 | def train(self, train_buffer, val_buffer, callback_fn): 76 | rewards = train_buffer['rew'] 77 | self.critic.set_interval(rewards.min() / (1 - self.gamma), rewards.max() / (1 - self.gamma)) 78 | self.critic_target.set_interval(rewards.min() / (1 - self.gamma), rewards.max() / (1 - self.gamma)) 79 | for epoch in range(self.args['max_epoch']): 80 | for i in range(self.args['steps_per_epoch']): 81 | batch_data = train_buffer.sample(self.batch_size) 82 | batch_data.to_torch(device=self.device) 83 | obs = batch_data['obs'] 84 | action = batch_data['act'] 85 | next_obs = batch_data['obs_next'] 86 | reward = batch_data['rew'] 87 | done = batch_data['done'].float() 88 | 89 | # update critic 90 | p = self.critic(obs, action) 91 | next_action = self.actor_target.get_action(next_obs) 92 | target_p = self.critic_target.get_target(next_obs, next_action, reward, self.gamma * (1 - done)) 93 | critic_loss = - (target_p * torch.log(p + 1e-8)).mean() 94 | 95 | self.critic_optim.zero_grad() 96 | critic_loss.backward() 97 | self.critic_optim.step() 98 | 99 | # update actor 100 | action_dist = self.actor(obs) 101 | log_prob = action_dist.log_prob(action) 102 | actions = torch.stack([action_dist.sample() for _ in range(self.m)], dim=0) 103 | repeat_obs = torch.repeat_interleave(obs.unsqueeze(0), self.m, 0) 104 | _, values = self.critic(repeat_obs, actions, with_q=True) 105 | _, value = self.critic(obs, action, with_q=True) 106 | 107 | if self.advantage_mode == 'mean': 108 | advantage = value - values.mean(dim=0) 109 | elif self.advantage_mode == 'max': 110 | advantage = value - values.max(dim=0)[0] 111 | 112 | if self.weight_mode == 'exp': 113 | weight = torch.exp(advantage / self.beta) 114 | elif self.weight_mode == 'binary': 115 | weight = (advantage > 0).float() 116 | 117 | weight = torch.clamp_max(weight, 20).detach() 118 | actor_loss = - torch.mean(weight * log_prob) 119 | 120 | self.actor_optim.zero_grad() 121 | actor_loss.backward() 122 | self.actor_optim.step() 123 | 124 | if i % self.args['update_frequency']: 125 | self._sync_weight(self.critic_target, self.critic, 1.0) 126 | self._sync_weight(self.actor_target, self.actor, 1.0) 127 | print("actor_loss: ", actor_loss.item()) 128 | res = callback_fn(self.get_policy()) 129 | 130 | self.log_res(epoch, res) 131 | 132 | return self.report_result 133 | 134 | def get_policy(self): 135 | return self.actor 
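To make the weighting scheme in `crr.py` above concrete: the advantage compares the critic's value for the dataset action against the values of `advantage_samples` actions drawn from the current policy, and the `exp` / `binary` weight modes turn that advantage into a regression weight clamped at 20. A toy sketch with made-up Q-values:

```python
# Toy illustration of the CRR weights computed in AlgoTrainer.train above.
import torch

beta = 1.0
value  = torch.tensor([2.0])                  # Q(s, a) for the dataset action (made-up)
values = torch.tensor([1.0, 1.5, 3.0, 0.5])   # Q(s, a_j) for m sampled policy actions (made-up)

advantage_mean = value - values.mean()        # 'mean' advantage mode -> 0.5
advantage_max  = value - values.max()         # 'max' advantage mode  -> -1.0

weight_exp    = torch.clamp_max(torch.exp(advantage_mean / beta), 20)  # 'exp' mode
weight_binary = (advantage_mean > 0).float()                           # 'binary' mode
print(weight_exp.item(), weight_binary.item())  # ~1.65 and 1.0
```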
-------------------------------------------------------------------------------- /offlinerl/algo/modelfree/td3bc.py: -------------------------------------------------------------------------------- 1 | # A Minimalist Approach to Offline Reinforcement Learning 2 | # https://arxiv.org/pdf/2106.06860 3 | # https://github.com/sfujim/TD3_BC 4 | import torch 5 | from copy import deepcopy 6 | from loguru import logger 7 | from torch.functional import F 8 | 9 | from offlinerl.algo.base import BaseAlgo 10 | from offlinerl.utils.net.common import MLP,Net 11 | from offlinerl.utils.net.tanhpolicy import TanhGaussianPolicy 12 | from offlinerl.utils.exp import setup_seed 13 | 14 | 15 | def algo_init(args): 16 | logger.info('Run algo_init function') 17 | setup_seed(args['seed']) 18 | if args["obs_shape"] and args["action_shape"]: 19 | obs_shape, action_shape = args["obs_shape"], args["action_shape"] 20 | max_action = args["max_action"] 21 | elif "task" in args.keys(): 22 | from offlinerl.utils.env import get_env_shape, get_env_action_range 23 | obs_shape, action_shape = get_env_shape(args['task']) 24 | max_action, _ = get_env_action_range(args["task"]) 25 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape 26 | else: 27 | raise NotImplementedError 28 | 29 | net_a = Net(layer_num = args['actor_layers'], 30 | state_shape = obs_shape, 31 | hidden_layer_size = args['actor_features']) 32 | 33 | actor = TanhGaussianPolicy(preprocess_net = net_a, 34 | action_shape = action_shape, 35 | hidden_layer_size = args['actor_features'], 36 | conditioned_sigma = True, 37 | ).to(args['device']) 38 | 39 | actor_optim = torch.optim.Adam(actor.parameters(), lr=args['actor_lr']) 40 | 41 | critic_1 = MLP(obs_shape + action_shape, 1, args['value_features'], args['value_layers'], hidden_activation='relu').to(args['device']) 42 | critic_2 = MLP(obs_shape + action_shape, 1, args['value_features'], args['value_layers'], hidden_activation='relu').to(args['device']) 43 | critic_1_optim = torch.optim.Adam([*critic_1.parameters()], lr=args['critic_lr']) 44 | critic_2_optim = torch.optim.Adam([*critic_2.parameters()], lr=args['critic_lr']) 45 | 46 | nets = { 47 | "actor" : {"net" : actor, "opt" : actor_optim}, 48 | "critic" : {"net" : [critic_1, critic_2], "opt" : [critic_1_optim,critic_2_optim]}, 49 | 50 | } 51 | 52 | return nets 53 | 54 | 55 | class AlgoTrainer(BaseAlgo): 56 | def __init__(self, algo_init, args): 57 | super(AlgoTrainer, self).__init__(args) 58 | self.args = args 59 | 60 | self.actor = algo_init['actor']['net'] 61 | self.actor_optim = algo_init['actor']['opt'] 62 | 63 | self.critic_1, self.critic_2 = algo_init['critic']['net'] 64 | self.target_critic_1 = deepcopy(self.critic_1) 65 | self.target_critic_2 = deepcopy(self.critic_2) 66 | self.critic_1_optim = algo_init['critic']['opt'][0] 67 | self.critic_2_optim = algo_init['critic']['opt'][1] 68 | 69 | self.alpha = self.args['alpha'] 70 | self.policy_noise = self.args['policy_noise'] 71 | self.noise_clip = self.args['noise_clip'] 72 | self.policy_freq = self.args['policy_freq'] 73 | self.discount = self.args['discount'] 74 | 75 | self.batch_size = self.args['batch_size'] 76 | self.device = self.args['device'] 77 | self.max_action = 1 78 | 79 | 80 | def forward(self, obs, reparameterize=True, return_log_prob=True): 81 | log_prob = None 82 | tanh_normal = self.actor(obs,reparameterize=reparameterize,) 83 | if return_log_prob: 84 | if reparameterize is True: 85 | action, pre_tanh_value = tanh_normal.rsample( 86 | return_pretanh_value=True 87 | ) 88 | else: 
89 | action, pre_tanh_value = tanh_normal.sample( 90 | return_pretanh_value=True 91 | ) 92 | log_prob = tanh_normal.log_prob( 93 | action, 94 | pre_tanh_value=pre_tanh_value 95 | ) 96 | log_prob = log_prob.sum(dim=1, keepdim=True) 97 | else: 98 | if reparameterize is True: 99 | action = tanh_normal.rsample() 100 | else: 101 | action = tanh_normal.sample() 102 | return action, log_prob 103 | 104 | def train(self, train_buffer, val_buffer, callback_fn): 105 | # train_buffer 106 | obs_mean = train_buffer["obs"].mean(0) 107 | obs_std = train_buffer["obs"].std(0) + 1e-3 108 | obs_mean = torch.as_tensor(obs_mean, dtype=torch.float32) 109 | obs_std = torch.as_tensor(obs_std, dtype=torch.float32) 110 | self.actor.preprocess.s_mean = obs_mean 111 | self.actor.preprocess.s_std = obs_std 112 | 113 | self.target_actor = deepcopy(self.actor) 114 | 115 | for epoch in range(self.args['max_epoch']): 116 | for i in range(self.args['steps_per_epoch']): 117 | batch_data = train_buffer.sample(self.batch_size) 118 | batch_data.to_torch(device=self.device) 119 | 120 | obs = batch_data['obs'] 121 | action = batch_data['act'] 122 | next_obs = batch_data['obs_next'] 123 | reward = batch_data['rew'] 124 | done = batch_data['done'].float() 125 | 126 | with torch.no_grad(): 127 | noise = (torch.randn_like(action) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip) 128 | next_action = (self.target_actor(next_obs).mode + noise).clamp(-self.max_action, self.max_action) 129 | next_obs_action = torch.cat([next_obs, next_action], dim=-1) 130 | target_q = torch.min( 131 | self.target_critic_1(next_obs_action), self.target_critic_2(next_obs_action) 132 | )*self.discount*(1-done) + reward 133 | 134 | obs_action = torch.cat([obs, action], dim=-1) 135 | current_q1, current_q2 = self.critic_1(obs_action), self.critic_2(obs_action) 136 | critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q) 137 | 138 | # Optimize the critic 139 | self.critic_1_optim.zero_grad() 140 | self.critic_2_optim.zero_grad() 141 | critic_loss.backward() 142 | self.critic_1_optim.step() 143 | self.critic_2_optim.step() 144 | 145 | 146 | if i % self.policy_freq == 0: 147 | pi = self.actor(obs).mode 148 | q = self.critic_1(torch.cat([obs, pi], dim=-1)) 149 | lmbda = self.alpha / q.abs().mean().detach() 150 | actor_loss = -lmbda * q.mean() + F.mse_loss(pi, action) 151 | 152 | self.actor_optim.zero_grad() 153 | actor_loss.backward() 154 | self.actor_optim.step() 155 | 156 | self._sync_weight(self.target_actor, self.actor, soft_target_tau=self.args['soft_target_tau']) 157 | self._sync_weight(self.target_critic_1, self.critic_1, soft_target_tau=self.args['soft_target_tau']) 158 | self._sync_weight(self.target_critic_2, self.critic_2, soft_target_tau=self.args['soft_target_tau']) 159 | 160 | res = callback_fn(self.get_policy()) 161 | 162 | res.update({ 163 | "actor_loss" : actor_loss.item(), 164 | "critic_loss" : critic_loss.item(), 165 | "lmbda" : lmbda.item(), 166 | "q" : q.mean().item(), 167 | }) 168 | 169 | 170 | self.log_res(epoch, res) 171 | 172 | return self.report_result 173 | 174 | def get_model(self): 175 | return self.actor 176 | 177 | def get_policy(self): 178 | return self.actor -------------------------------------------------------------------------------- /offlinerl/algo/online/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/online/__init__.py 
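The actor update in `td3bc.py` above is the core of TD3+BC: the Q-value term is rescaled by `lmbda = alpha / mean(|Q|)` so that it stays on a scale comparable to the behaviour-cloning MSE term. A toy sketch of that loss with made-up tensors (`alpha = 2.5` is the default from the original TD3+BC paper, not necessarily this repository's config):

```python
# Toy illustration of the TD3+BC actor loss used above:
# loss = -lmbda * Q(s, pi(s)) + MSE(pi(s), a),  lmbda = alpha / mean(|Q|)
import torch
import torch.nn.functional as F

alpha = 2.5                                       # paper default (assumption here)
q  = torch.tensor([[120.0], [80.0]])              # stand-in for critic_1(s, pi(s))
pi = torch.tensor([[0.2, -0.1], [0.5, 0.3]])      # policy actions (made-up)
a  = torch.tensor([[0.25, -0.05], [0.4, 0.35]])   # dataset actions (made-up)

lmbda = alpha / q.abs().mean().detach()           # normalizes the Q term's magnitude
actor_loss = -lmbda * q.mean() + F.mse_loss(pi, a)
print(lmbda.item(), actor_loss.item())
```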
-------------------------------------------------------------------------------- /offlinerl/algo/online/bremen.py: -------------------------------------------------------------------------------- 1 | # Deployment-Efficient Reinforcement Learning via Model-Based Offline Optimization 2 | # https://arxiv.org/abs/2006.03647 3 | # https://github.com/matsuolab/BREMEN 4 | 5 | # TODO -------------------------------------------------------------------------------- /offlinerl/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/config/__init__.py -------------------------------------------------------------------------------- /offlinerl/config/algo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/config/algo/__init__.py -------------------------------------------------------------------------------- /offlinerl/config/algo/bc_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | actor_features = 256 16 | actor_layers = 2 17 | 18 | batch_size = 256 19 | steps_per_epoch = 1000 20 | max_epoch = 1000 21 | 22 | actor_lr = 1e-3 23 | 24 | #tune 25 | params_tune = { 26 | "actor_lr" : {"type" : "continuous", "value": [1e-4, 1e-3]}, 27 | } 28 | 29 | #tune 30 | grid_tune = { 31 | "actor_lr" : [1e-4, 5e-4, 1e-3], 32 | "actor_layers" : [2,3], 33 | } 34 | 35 | -------------------------------------------------------------------------------- /offlinerl/config/algo/bc_model_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # model save path 16 | dynamics_path = None 17 | dynamics_save_path = None 18 | 19 | # transition model train 20 | transition_init_num = 7 21 | transition_select_num = 5 22 | val_ratio = 0.2 23 | max_epochs_since_update = 10 24 | transition_max_epochs = None 25 | 26 | # trick config 27 | normalize_obs = False 28 | transition_scaler = True 29 | 30 | # transition config 31 | transition_batch_size = 256 32 | transition_lr = 1e-3 33 | logvar_loss_coef = 0.01 34 | dynamics_hidden_dims = [200, 200, 200, 200] 35 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 36 | 37 | #tune 38 | params_tune = { 39 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]}, 40 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 41 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 42 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 43 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 44 | } 45 | 46 | #tune 47 | grid_tune = { 48 | "transition_scaler" : [True, False], 49 | "transition_lr" : [1e-3, 3e-4], 50 | 
"logvar_loss_coef" : [0.01, 1e-3], 51 | } 52 | -------------------------------------------------------------------------------- /offlinerl/config/algo/bcq_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | vae_features = 750 16 | vae_layers = 2 17 | jitter_features = 400 18 | jitter_layers = 2 19 | value_features = 400 20 | value_layers = 2 21 | phi = 0.05 22 | lam = 0.75 23 | 24 | batch_size = 100 25 | steps_per_epoch = 5000 26 | max_epoch = 200 27 | 28 | vae_lr = 1e-3 29 | jitter_lr = 3e-4 30 | critic_lr = 3e-4 31 | gamma = 0.99 32 | soft_target_tau = 5e-3 33 | 34 | #tune 35 | params_tune = { 36 | "phi" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 37 | "lam" : {"type" : "continuous", "value": [0, 1]}, 38 | } 39 | 40 | #tune 41 | grid_tune = { 42 | "phi" : [0.05, 0.1, 0.2, 0.5], 43 | } 44 | -------------------------------------------------------------------------------- /offlinerl/config/algo/bcqd_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | 16 | max_timesteps = 1e6 17 | eval_freq = 1e3 18 | 19 | optimizer_parameters = { 20 | "lr": 3e-4, 21 | } 22 | 23 | BCQ_threshold = 0.3 24 | 25 | discount = 0.99 26 | tau = 0.005 27 | polyak_target_update = True 28 | target_update_frequency=1 29 | start_timesteps = 1e3 30 | initial_eps = 0.1 31 | end_eps = 0.1 32 | eps_decay_period = 1 33 | eval_eps = 0.001 34 | buffer_size = 1e6 35 | batch_size = 256 36 | train_freq = 1 -------------------------------------------------------------------------------- /offlinerl/config/algo/bremen_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 10 | obs_shape = None 11 | act_shape = None 12 | 13 | dynamics_path = None 14 | behavior_path = None 15 | 16 | transition_hidden_size = 256 17 | transition_hidden_layers = 4 18 | transition_init_num = 7 19 | transition_select_num = 5 20 | 21 | actor_hidden_size = 256 22 | actor_hidden_layers = 2 23 | value_hidden_size = 256 24 | value_hidden_layers = 2 25 | 26 | transition_batch_size = 256 27 | data_collection_per_epoch = 50000 28 | max_epoch = 250 29 | trpo_steps_per_epoch = 25 30 | 31 | bc_batch_size = 256 32 | bc_init = True 33 | 34 | transition_lr = 1e-3 35 | bc_lr = 1e-3 36 | value_lr = 3e-4 37 | 38 | cg_iters = 10 39 | damping_coeff = 0.1 40 | backtrack_iters = 10 41 | backtrack_coeff = 0.8 42 | train_v_iters = 50 43 | trpo_step_size = 0.01 44 | explore_mode = 'sample' 45 | static_noise = 0.1 46 | 47 | horizon = 250 48 | gamma = 0.99 49 | lam = 0.95 50 | 51 | #tune 52 | params_tune = { 53 | "horizon" : {"type" : "discrete", 
"value": [250, 500, 1000]} 54 | } 55 | 56 | #tune 57 | grid_tune = { 58 | 'horizon' : [250, 1000], 59 | # 'trpo_step_size' : [0.01, 0.05], 60 | 'explore_mode' : ['sample', 'static'], 61 | } 62 | -------------------------------------------------------------------------------- /offlinerl/config/algo/combo_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # model save path 16 | dynamics_path = None 17 | dynamics_save_path = None 18 | 19 | # transition model train 20 | transition_init_num = 7 21 | transition_select_num = 5 22 | val_ratio = 0.2 23 | max_epochs_since_update = 5 24 | transition_max_epochs = None 25 | 26 | # trick config 27 | trainsition_clip = True 28 | normalize_obs = False 29 | transition_scaler = True 30 | policy_scaler = False 31 | 32 | # transition config 33 | transition_batch_size = 256 34 | transition_lr = 1e-3 35 | logvar_loss_coef = 0.01 36 | dynamics_hidden_dims = [200, 200, 200, 200] 37 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 38 | 39 | # alpha config 40 | learnable_alpha = True 41 | alpha_lr = 1e-4 42 | alpha = 0.2 43 | 44 | # train config 45 | horizon = 1 46 | real_data_ratio = 0.5 47 | max_epoch = 1000 48 | steps_per_epoch = 1000 49 | rollout_freq = 1000 50 | rollout_batch_size = 5e+4 51 | 52 | # policy config 53 | hidden_dims = [256, 256, 256] 54 | policy_batch_size = 256 55 | actor_lr = 1e-4 56 | 57 | # critic config 58 | critic_lr = 3e-4 59 | discount = 0.99 60 | soft_target_tau = 5e-3 61 | target_entropy = None 62 | 63 | # others 64 | val_frequency = 10 65 | eval_episodes = 10 66 | model_retain_epochs = 5 67 | 68 | # combo config 69 | cql_weight = 2.5 70 | temperatue = 1.0 71 | max_q_backup = False 72 | deterministic_backup = True 73 | with_lagrange = False 74 | lagrange_threshold = 10.0 75 | cql_alpha_lr = 3e-4 76 | num_repeat_actions = 10 77 | uniform_rollout = False 78 | rho_s = "mix" # choose from ["model", "mix"] 79 | 80 | #tune 81 | params_tune = { 82 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]}, 83 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 84 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 85 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 86 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 87 | } 88 | 89 | #tune 90 | grid_tune = { 91 | "horizon" : [1, 5], 92 | "cql_weight" : [2.5, 3.5, 5], 93 | "rho_s": ["model", "mix"], 94 | } 95 | -------------------------------------------------------------------------------- /offlinerl/config/algo/cql_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | max_epoch = 1000 16 | steps_per_epoch = 1000 17 | policy_bc_steps = 40000 18 | 19 | batch_size = 256 20 | hidden_layer_size = 256 21 | layer_num = 2 22 | actor_lr=1E-4 23 | critic_lr=3E-4 24 | reward_scale=1 25 | 
use_automatic_entropy_tuning=True 26 | target_entropy = None 27 | discount = 0.99 28 | soft_target_tau=5e-3 29 | 30 | # min Q 31 | explore=1.0 32 | temp=1.0 33 | min_q_version=3 34 | min_q_weight=5.0 35 | # lagrange 36 | with_lagrange=False 37 | lagrange_thresh=2.0 38 | 39 | # extra params 40 | num_random=10 41 | type_q_backup= "min" 42 | q_backup_lmbda = 0.75 43 | deterministic_backup=False 44 | 45 | discrete = False 46 | 47 | #tune 48 | params_tune = { 49 | "actor_lr" : {"type" : "discrete", "value":[1e-4, 3e-4]}, 50 | "min_q_version" : {"type" : "discrete", "value":[2, 3]}, 51 | "min_q_weight" : {"type": "discrete", "value":[5, 10]}, 52 | "lagrange_thresh" : {"type": "discrete", "value":[-1, 2, 5, 10]}, 53 | "type_q_backup" : {"type": "discrete", "value":["max", "none"]}, 54 | } 55 | 56 | #tune 57 | grid_tune = { 58 | #"actor_lr" : [1e-4, 3e-4], 59 | "min_q_version" : [2, 3], 60 | "min_q_weight" : [5, 10], 61 | "lagrange_thresh" : [-1, 2, 5, 10], 62 | # "type_q_backup" : ["min", "none"], 63 | } 64 | -------------------------------------------------------------------------------- /offlinerl/config/algo/crr_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | hidden_features = 256 16 | hidden_layers = 2 17 | atoms = 21 18 | 19 | advantage_mode = 'mean' 20 | weight_mode = 'exp' 21 | advantage_samples = 4 22 | beta = 1.0 23 | gamma = 0.99 24 | 25 | batch_size = 1024 26 | steps_per_epoch = 1000 27 | max_epoch = 200 28 | 29 | lr = 1e-4 30 | update_frequency = 100 31 | 32 | #tune 33 | params_tune = { 34 | "beta" : {"type" : "continuous", "value": [0.0, 10.0]}, 35 | } 36 | 37 | #tune 38 | grid_tune = { 39 | "advantage_mode" : ['mean', 'max'], 40 | "weight_mode" : ['exp', 'binary'], 41 | } 42 | -------------------------------------------------------------------------------- /offlinerl/config/algo/edac_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | # from datetime import datetime 4 | 5 | task = "Hopper-v3" 6 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 7 | 8 | 9 | # parser.add_argument("--algo-name", type=str, default="edac") 10 | # parser.add_argument("--task", type=str, default="SafetyHalfCheetah") 11 | obs_shape = None 12 | act_shape = None 13 | 14 | seed = 42 15 | actor_lr=1e-4 16 | critic_lr=3e-4 17 | task_train_num = 99 18 | task_data_type = 'high' 19 | # hidden_dims=[256, 256, 256] 20 | hidden_layer_size = 256 21 | layer_num = 2 22 | gamma=0.99 23 | tau=0.005 24 | alpha=0.2 25 | auto_alpha=True 26 | 27 | target_entropy = None 28 | alpha_lr =1e-4 29 | num_critics = 50 30 | 31 | max_q_backup = False 32 | deterministic_backup=False 33 | 34 | eta=1.0 35 | normalize_reward=False 36 | 37 | epoch=3000 38 | step_per_epoch=1000 39 | 40 | eval_episodes=100 41 | batch_size=256 42 | 43 | #tune 44 | params_tune = { 45 | "num_critics" : {"type" : "discrete", "value":[10,50]}, 46 | "eta" : {"type" : "discrete", "value":[1, 5]}, 47 | } 48 | 49 | grid_tune = { 50 | "num_critics" : [10, 50], 51 | "eta" : [1, 5], 52 | } 53 | 54 | 55 | # task_data_type = "low" 56 | # task_train_num = 
99 57 | 58 | # seed = 42 59 | 60 | # device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 61 | # obs_shape = None 62 | # act_shape = None 63 | # max_action = None 64 | 65 | # max_epoch = 300 66 | # steps_per_epoch = 1000 67 | # policy_bc_steps = 40000 68 | 69 | # batch_size = 256 70 | # hidden_layer_size = 256 71 | # layer_num = 2 72 | # actor_lr=1E-4 73 | # critic_lr=3E-4 74 | # reward_scale=1 75 | # use_automatic_entropy_tuning=True 76 | # target_entropy = None 77 | # discount = 0.99 78 | # soft_target_tau=5e-3 79 | 80 | # # min Q 81 | # explore=1.0 82 | # temp=1.0 83 | # min_q_version=3 84 | # min_q_weight=5.0 85 | 86 | # # lagrange 87 | # with_lagrange=False 88 | # lagrange_thresh=2.0 89 | 90 | # # extra params 91 | # num_random=10 92 | # type_q_backup= "min" 93 | # q_backup_lmbda = 0.75 94 | # deterministic_backup=False 95 | 96 | # discrete = False 97 | 98 | #tune 99 | # params_tune = { 100 | # "actor_lr" : {"type" : "discrete", "value":[1e-4, 3e-4]}, 101 | # "min_q_version" : {"type" : "discrete", "value":[2, 3]}, 102 | # "min_q_weight" : {"type": "discrete", "value":[5, 10]}, 103 | # "lagrange_thresh" : {"type": "discrete", "value":[-1, 2, 5, 10]}, 104 | # "type_q_backup" : {"type": "discrete", "value":["max", "none"]}, 105 | # } 106 | 107 | # #tune 108 | # grid_tune = { 109 | # #"actor_lr" : [1e-4, 3e-4], 110 | # "min_q_version" : [2, 3], 111 | # "min_q_weight" : [5, 10], 112 | # "lagrange_thresh" : [-1, 2, 5, 10], 113 | # # "type_q_backup" : ["min", "none"], 114 | # } 115 | -------------------------------------------------------------------------------- /offlinerl/config/algo/maple_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | # device = 'cuda:0' 12 | obs_shape = None 13 | act_shape = None 14 | max_action = None 15 | # new parameters based on mopo 16 | lstm_hidden_unit = 128 17 | Guassain_hidden_sizes = (256,256) 18 | value_hidden_sizes=(256,256) 19 | hidden_sizes=(16,) 20 | model_pool_size = 250000 21 | rollout_batch_size = 50000 22 | handle_per_round = 400 23 | out_train_epoch = 1000 24 | in_train_epoch = 1000 25 | 26 | train_batch_size = 256 # train policy num of trajectories 27 | 28 | number_runs_eval = 40 # evaluation epochs in mujoco 29 | 30 | #------------- 31 | dynamics_path = None 32 | dynamics_save_path = None 33 | only_dynamics = False 34 | 35 | hidden_layer_size = 256 36 | hidden_layers = 2 37 | transition_layers = 4 38 | 39 | transition_init_num = 20 40 | transition_select_num = 14 41 | # by selecting a number smaller than rollout_batch_size, you can protect the model rollout from OOM error 42 | mini_forward_size = -1 43 | 44 | real_data_ratio = 0.05 45 | 46 | transition_batch_size = 256 47 | policy_batch_size = 256 48 | data_collection_per_epoch = 50e3 49 | steps_per_epoch = 1000 50 | max_epoch = 1000 51 | 52 | 53 | eval_episodes = 100 54 | 55 | learnable_alpha = True 56 | uncertainty_mode = 'aleatoric' 57 | transition_lr = 1e-3 58 | actor_lr = 3e-4 59 | critic_lr = 3e-4 60 | discount = 0.99 61 | soft_target_tau = 5e-3 62 | 63 | horizon = 10 64 | lam = 0.25 65 | 66 | penalty_clip = 20 67 | mode = 'normalize' # 'normalize', 'local', 'noRes' 68 | 69 | #tune 70 | params_tune = { 71 | "buffer_size" : {"type" : "discrete", "value": [1e6, 
2e6]}, 72 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 73 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 74 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 75 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 76 | } 77 | 78 | #tune 79 | grid_tune = { 80 | "horizon" : [1, 5], 81 | "lam" : [0.5, 1, 2, 5], 82 | "uncertainty_mode" : ['aleatoric', 'disagreement'], 83 | } 84 | -------------------------------------------------------------------------------- /offlinerl/config/algo/maple_config_new.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # transition model train 16 | transition_init_num = 20 17 | transition_select_num = 14 18 | val_ratio = 0.2 19 | max_epochs_since_update = 5 20 | transition_max_epochs = None 21 | 22 | # trick config 23 | trainsition_clip = False 24 | normalize_obs = False # should set to False 25 | transition_scaler = True 26 | 27 | # transition config 28 | transition_batch_size = 256 29 | transition_lr = 1e-3 30 | logvar_loss_coef = 0.01 31 | dynamics_hidden_dims = [200, 200, 200, 200] 32 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 33 | 34 | # new parameters based on mopo 35 | lstm_hidden_unit = 128 36 | Guassain_hidden_sizes = (256,256) 37 | value_hidden_sizes=(256,256) 38 | hidden_sizes=(16,) 39 | model_pool_size = 250000 40 | rollout_batch_size = 50000 41 | handle_per_round = 400 42 | out_train_epoch = 1000 43 | in_train_epoch = 1000 44 | 45 | train_batch_size = 256 # train policy num of trajectories 46 | 47 | number_runs_eval = 40 # evaluation epochs in mujoco 48 | 49 | #------------- 50 | dynamics_path = None 51 | dynamics_save_path = None 52 | only_dynamics = False 53 | 54 | hidden_layer_size = 256 55 | hidden_layers = 2 56 | 57 | real_data_ratio = 0.05 58 | 59 | policy_batch_size = 256 60 | data_collection_per_epoch = 50e3 61 | steps_per_epoch = 1000 62 | max_epoch = 1000 63 | 64 | eval_episodes = 100 65 | 66 | # alpha config 67 | learnable_alpha = True 68 | alpha_lr = 1e-4 69 | alpha = 0.2 70 | target_entropy = None 71 | 72 | uncertainty_mode = 'aleatoric' 73 | actor_lr = 3e-4 74 | critic_lr = 3e-4 75 | discount = 0.99 76 | soft_target_tau = 5e-3 77 | 78 | horizon = 10 79 | penalty_coef = 0.25 80 | 81 | penalty_clip = 20 82 | 83 | #tune 84 | params_tune = { 85 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]}, 86 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 87 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 88 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 89 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 90 | } 91 | 92 | #tune 93 | grid_tune = { 94 | "horizon" : [1, 5], 95 | "lam" : [0.5, 1, 2, 5], 96 | "uncertainty_mode" : ['aleatoric', 'disagreement'], 97 | } 98 | -------------------------------------------------------------------------------- /offlinerl/config/algo/mcq_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device 
= 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | vae_features = 750 16 | vae_layers = 2 17 | actor_features = 400 18 | actor_layers = 2 19 | value_features = 400 20 | value_layers = 2 21 | lam = 0.95 22 | 23 | alpha = 0.2 24 | auto_alpha = True 25 | target_entropy = None 26 | 27 | batch_size = 256 28 | steps_per_epoch = 1000 29 | max_epoch = 1000 30 | 31 | vae_lr = 1e-3 32 | actor_lr = 3e-4 33 | critic_lr = 3e-4 34 | alpha_lr = 3e-4 35 | gamma = 0.99 36 | soft_target_tau = 5e-3 37 | 38 | num_sampled_actions = 10 39 | eval_episodes = 100 40 | 41 | #tune 42 | params_tune = { 43 | "lam" : {"type" : "continuous", "value": [0.3, 0.95]}, 44 | } 45 | 46 | #tune 47 | grid_tune = { 48 | "lam" : [0.3,0.4,0.5, 0.6, 0.7, 0.8, 0.9, 0.95], 49 | "auto_alpha" : [True, False], 50 | } 51 | -------------------------------------------------------------------------------- /offlinerl/config/algo/mobile_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # model save path 16 | dynamics_path = None 17 | dynamics_save_path = None 18 | 19 | # transition model train 20 | transition_init_num = 7 21 | transition_select_num = 5 22 | val_ratio = 0.2 23 | max_epochs_since_update = 5 24 | transition_max_epochs = None 25 | 26 | # trick config 27 | trainsition_clip = True 28 | normalize_obs = False 29 | transition_scaler = True 30 | policy_scaler = False 31 | 32 | # transition config 33 | transition_batch_size = 256 34 | transition_lr = 1e-3 35 | logvar_loss_coef = 0.01 36 | dynamics_hidden_dims = [200, 200, 200, 200] 37 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 38 | 39 | # alpha config 40 | learnable_alpha = True 41 | alpha_lr = 1e-4 42 | alpha = 0.2 43 | 44 | # train config 45 | horizon = 5 46 | real_data_ratio = 0.05 47 | max_epoch = 3000 48 | steps_per_epoch = 1000 49 | rollout_freq = 1000 50 | rollout_batch_size = 5e+4 51 | 52 | # policy config 53 | hidden_dims = [256, 256] 54 | policy_batch_size = 256 55 | actor_lr = 1e-4 56 | 57 | # critic config 58 | critic_lr = 3e-4 59 | discount = 0.99 60 | soft_target_tau = 5e-3 61 | target_entropy = None 62 | 63 | # others 64 | val_frequency = 10 65 | eval_episodes = 10 66 | model_retain_epochs = 5 67 | 68 | # mobile config 69 | num_q_ensemble = 2 70 | penalty_coef = 3.5 71 | num_samples = 10 72 | 73 | #tune 74 | params_tune = { 75 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]}, 76 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 77 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 78 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 79 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 80 | } 81 | 82 | #tune 83 | grid_tune = { 84 | "horizon" : [1, 5], 85 | "penalty_coef" : [0.5, 1.5, 2.5, 3.5], 86 | "real_data_ratio" :[0.05], 87 | } 88 | -------------------------------------------------------------------------------- /offlinerl/config/algo/moose_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = 
"Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | vae_iterations = 500000 16 | vae_hidden_size = 750 17 | vae_batch_size = 100 18 | vae_kl_weight = 0.5 19 | #vae_pretrain_model = "/tmp/vae_499999.pkl" 20 | 21 | 22 | latent = False 23 | layer_num = 3 24 | actor_batch_size = 100 25 | hidden_layer_size = 256 26 | actor_iterations = 500000 27 | vae_lr = 1e-4 28 | actor_lr = 1e-4 29 | critic_lr = 1e-3 30 | soft_target_tau = 0.005 31 | lmbda = 0.75 32 | discount = 0.99 33 | 34 | max_latent_action = 2 35 | phi = 0.05 36 | 37 | #tune 38 | params_tune = { 39 | "vae_iterations" : {"type" : "continuous", "value":[50000, 100000, 500000,]}, 40 | "actor_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]}, 41 | "vae_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]}, 42 | "lmbda" :{"type": "discrete", "value":[0.0, 0.25, 0.5, 0.75, 1.0]}, 43 | } 44 | -------------------------------------------------------------------------------- /offlinerl/config/algo/mopo_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # model save path 16 | dynamics_path = None 17 | dynamics_save_path = None 18 | 19 | # transition model train 20 | transition_init_num = 7 21 | transition_select_num = 5 22 | val_ratio = 0.2 23 | max_epochs_since_update = 5 24 | transition_max_epochs = None 25 | 26 | # trick config 27 | trainsition_clip = False 28 | normalize_obs = False 29 | transition_scaler = True 30 | policy_scaler = False 31 | 32 | # transition config 33 | transition_batch_size = 256 34 | transition_lr = 1e-3 35 | logvar_loss_coef = 0.01 36 | dynamics_hidden_dims = [200, 200, 200, 200] 37 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 38 | 39 | # alpha config 40 | learnable_alpha = True 41 | alpha_lr = 1e-4 42 | alpha = 0.2 43 | target_entropy = None 44 | 45 | # train config 46 | horizon = 1 47 | real_data_ratio = 0.05 48 | max_epoch = 3000 49 | steps_per_epoch = 1000 50 | rollout_freq = 1000 51 | rollout_batch_size = 5e+4 52 | 53 | # policy config 54 | hidden_dims = [256, 256] 55 | policy_batch_size = 256 56 | actor_lr = 1e-4 57 | 58 | # critic config 59 | critic_lr = 3e-4 60 | discount = 0.99 61 | soft_target_tau = 5e-3 62 | 63 | # others 64 | model_retain_epochs = 5 65 | 66 | # mopo config 67 | uncertainty_mode = 'aleatoric' 68 | penalty_coef = 1 69 | 70 | #tune 71 | params_tune = { 72 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]}, 73 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 74 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]}, 75 | "lam" : {"type" : "continuous", "value": [0.1, 10]}, 76 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]}, 77 | } 78 | 79 | #tune 80 | grid_tune = { 81 | "horizon" : [1, 5], 82 | "penalty_coef" : [0.5, 1, 2, 5], 83 | "uncertainty_mode" : ['aleatoric', 'disagreement'], 84 | } 85 | -------------------------------------------------------------------------------- /offlinerl/config/algo/plas_config.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | vae_iterations = 500000 16 | vae_hidden_size = 750 17 | vae_batch_size = 100 18 | vae_kl_weight = 0.5 19 | 20 | latent = True 21 | layer_num = 2 22 | actor_batch_size = 100 23 | hidden_layer_size = 256 24 | actor_iterations = 500000 25 | vae_lr = 1e-4 26 | actor_lr = 1e-4 27 | critic_lr = 1e-3 28 | soft_target_tau = 0.005 29 | lmbda = 0.75 30 | discount = 0.99 31 | 32 | max_latent_action = 2 33 | phi = 0.05 34 | 35 | #tune 36 | params_tune = { 37 | "vae_iterations" : {"type" : "discrete", "value":[50000, 100000, 500000,]}, 38 | "actor_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]}, 39 | "vae_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]}, 40 | "actor_batch_size" : {"type": "discrete", "value":[128, 256, 512]}, 41 | "latent" : {"type": "discrete", "value":[True, False]}, 42 | "lmbda" :{"type": "discrete", "value":[0.65, 0.75, 0.85]}, 43 | } 44 | 45 | #tune 46 | grid_tune = { 47 | "phi" : [0, 0.05, 0.1, 0.2, 0.4], 48 | } 49 | -------------------------------------------------------------------------------- /offlinerl/config/algo/prdc_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 10 | 11 | 12 | steps_per_epoch = 1000 13 | max_epoch = 1000 14 | batch_size = 256 15 | state_dim = None 16 | action_dim = None 17 | alpha = 2.5 18 | beta = 2.0 19 | k = 1 20 | policy_freq = 2 21 | noise_clip = 0.5 22 | policy_noise = 2 23 | discount = 0.99 24 | tau = 0.005 25 | expl_noise = 0.1 26 | critic_lr = 3e-4 27 | actor_lr = 3e-4 28 | max_action = 1.0 29 | 30 | 31 | 32 | #tune 33 | grid_tune = { 34 | "alpha" : [2.5, 7.5, 20.0, 40.0], 35 | "beta" : [2.0, 7.5, 15.0], 36 | } -------------------------------------------------------------------------------- /offlinerl/config/algo/rambo_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Simglucose" 5 | task_data_type = "medium" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | 10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 11 | obs_shape = None 12 | act_shape = None 13 | max_action = None 14 | 15 | # model save path 16 | policy_bc_path = None 17 | policy_bc_save_path = None 18 | dynamics_path = None 19 | dynamics_save_path = None 20 | 21 | # transition model train 22 | transition_init_num = 7 23 | transition_select_num = 5 24 | val_ratio = 0.2 25 | max_epochs_since_update = 5 26 | transition_max_epochs = None 27 | 28 | # trick config 29 | trainsition_clip = True 30 | normalize_obs = False 31 | transition_scaler = True 32 | policy_scaler = True 33 | 34 | # transition config 35 | transition_batch_size = 256 36 | transition_lr = 1e-3 # 3e-4 37 | logvar_loss_coef = 0.01 # 1e-3 38 | dynamics_hidden_dims = [200, 200, 200, 200] 39 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4] 40 | 41 
| # alpha config 42 | learnable_alpha = True 43 | alpha_lr = 1e-4 44 | alpha = 0.2 45 | 46 | # train config 47 | horizon = 5 48 | real_data_ratio = 0.5 49 | max_epoch = 2000 50 | steps_per_epoch = 1000 51 | rollout_freq = 250 52 | rollout_batch_size = 5e+4 53 | 54 | # policy config 55 | hidden_dims = [256, 256] 56 | policy_batch_size = 256 57 | actor_lr = 1e-4 58 | 59 | # critic config 60 | critic_lr = 3e-4 61 | discount = 0.99 62 | soft_target_tau = 5e-3 63 | target_entropy = None 64 | 65 | # others 66 | val_frequency = 10 67 | eval_episodes = 10 68 | model_retain_epochs = 5 69 | 70 | # rambo config 71 | policy_bc_epoch = 50 72 | policy_bc_batch_size = 256 73 | policy_bc_lr = 1e-4 74 | 75 | transition_adv_lr = 3e-4 76 | dynamics_update_freq = 1000 77 | adv_train_steps = 1000 78 | adv_rollout_batch_size = 256 79 | adv_rollout_length = 5 80 | include_ent_in_adv = False 81 | adv_weight = 3e-4 82 | 83 | #tune 84 | params_tune = { 85 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]}, 86 | "horizon" : {"type" : "discrete", "value": [1, 2, 5]}, 87 | "adv_weight" : {"type" : "discrete", "value": [0, 3e-4]}, 88 | } 89 | 90 | #tune 91 | grid_tune = { 92 | "horizon" : [1, 5], 93 | "transition_adv_lr" : [1e-3, 3e-4], 94 | "adv_weight" : [0, 1e-3, 3e-4], 95 | } -------------------------------------------------------------------------------- /offlinerl/config/algo/td3bc_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from offlinerl.utils.exp import select_free_cuda 3 | 4 | task = "Hopper-v3" 5 | task_data_type = "low" 6 | task_train_num = 99 7 | 8 | seed = 42 9 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 10 | obs_shape = None 11 | act_shape = None 12 | max_action = None 13 | 14 | 15 | actor_features = 256 16 | actor_layers = 2 17 | value_features = 256 18 | value_layers = 2 19 | 20 | alpha = 2.5 21 | policy_noise = 0.2 22 | noise_clip = 0.5 23 | policy_freq = 2 24 | 25 | 26 | batch_size = 256 27 | steps_per_epoch = 1000 28 | max_epoch = 1000 29 | 30 | 31 | actor_lr = 3e-4 32 | critic_lr = 3e-4 33 | alpha_lr = 3e-4 34 | discount = 0.99 35 | soft_target_tau = 5e-3 36 | 37 | num_sampled_actions = 10 38 | eval_episodes = 100 39 | 40 | #tune 41 | grid_tune = { 42 | "alpha" : [0.05, 0.1, 0.2], 43 | "policy_noise" : [0.5, 1.5, 2.5], 44 | } 45 | -------------------------------------------------------------------------------- /offlinerl/data/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import random 4 | import numpy as np 5 | from loguru import logger 6 | 7 | from offlinerl.utils.logger import log_path 8 | from offlinerl.utils.io import create_dir, download_helper, read_json 9 | 10 | from offlinerl.data.neorl import load_neorl_buffer 11 | 12 | dataset_dir = os.path.join(log_path(),"./offlinerl_datasets") 13 | create_dir(dataset_dir) 14 | 15 | def load_data_from_neorl2_util(task): 16 | 17 | import neorl2 18 | import gymnasium as gym 19 | 20 | env = neorl2.make(task) 21 | if 'fusion' in task.lower(): 22 | train_data, val_data = env.get_dataset(traj_num=20) 23 | else: 24 | train_data, val_data = env.get_dataset() 25 | 26 | return train_data, val_data 27 | 28 | def load_data_from_neorl2(task): 29 | train_data, val_data = load_data_from_neorl2_util(task) 30 | train_buffer = load_neorl_buffer({ 31 | 'obs': train_data["obs"].astype(np.float32), 32 | 'action': train_data["action"].astype(np.float32), 33 | 
'next_obs': train_data["next_obs"].astype(np.float32), 34 | 'reward': train_data["reward"].astype(np.float32).reshape(-1, 1), 35 | 'done': np.bool_(train_data["done"]).reshape(-1, 1), 36 | }) 37 | 38 | val_buffer = load_neorl_buffer({ 39 | 'obs': val_data["obs"].astype(np.float32), 40 | 'action': val_data["action"].astype(np.float32), 41 | 'next_obs': val_data["next_obs"].astype(np.float32), 42 | 'reward': val_data["reward"].astype(np.float32).reshape(-1, 1), 43 | 'done': np.bool_(val_data["done"]).reshape(-1, 1), 44 | }) 45 | 46 | return train_buffer, val_buffer 47 | 48 | def load_data_from_neorl(task, task_data_type = "low", task_train_num = 99): 49 | try: 50 | import neorl 51 | env = neorl.make(task) 52 | train_data, val_data = env.get_dataset(data_type = task_data_type, train_num = task_train_num) 53 | train_buffer, val_buffer = load_neorl_buffer(train_data), load_neorl_buffer(val_data) 54 | logger.info(f"Load task data from neorl. -> {task}") 55 | except: 56 | train_buffer, val_buffer = load_data_from_neorl2(task) 57 | logger.info(f"Load task data from neorl2. -> {task}") 58 | return train_buffer, val_buffer -------------------------------------------------------------------------------- /offlinerl/data/d4rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | from d4rl import gym_mujoco 5 | import gym 6 | import d4rl 7 | import numpy as np 8 | from loguru import logger 9 | 10 | from offlinerl.utils.data import SampleBatch 11 | 12 | def load_d4rl_buffer(task): 13 | env = gym.make(task[5:]) 14 | dataset = d4rl.qlearning_dataset(env) 15 | 16 | buffer = SampleBatch( 17 | obs=dataset['observations'], 18 | obs_next=dataset['next_observations'], 19 | act=dataset['actions'], 20 | rew=np.expand_dims(np.squeeze(dataset['rewards']), 1), 21 | done=np.expand_dims(np.squeeze(dataset['terminals']), 1), 22 | ) 23 | 24 | logger.info('obs shape: {}', buffer.obs.shape) 25 | logger.info('obs_next shape: {}', buffer.obs_next.shape) 26 | logger.info('act shape: {}', buffer.act.shape) 27 | logger.info('rew shape: {}', buffer.rew.shape) 28 | logger.info('done shape: {}', buffer.done.shape) 29 | logger.info('Episode reward: {}', buffer.rew.sum() /np.sum(buffer.done) ) 30 | logger.info('Number of terminals on: {}', np.sum(buffer.done)) 31 | return buffer 32 | -------------------------------------------------------------------------------- /offlinerl/data/neorl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from loguru import logger 3 | 4 | from offlinerl.utils.data import SampleBatch, get_scaler 5 | from offlinerl.utils.data import BufferDataset, BufferDataloader 6 | 7 | def load_neorl_buffer(data): 8 | buffer = SampleBatch( 9 | obs = data["obs"], 10 | obs_next = data["next_obs"], 11 | act = data["action"], 12 | rew = data["reward"], 13 | done = data["done"], 14 | ) 15 | 16 | logger.info('obs shape: {}', buffer.obs.shape) 17 | logger.info('obs_next shape: {}', buffer.obs_next.shape) 18 | logger.info('act shape: {}', buffer.act.shape) 19 | logger.info('rew shape: {}', buffer.rew.shape) 20 | logger.info('done shape: {}', buffer.done.shape) 21 | logger.info('Episode reward: {}', buffer.rew.sum() /np.sum(buffer.done) ) 22 | logger.info('Number of terminals on: {}', np.sum(buffer.done)) 23 | 24 | """ 25 | rew_scaler = get_scaler(buffer.rew) 26 | buffer.rew = rew_scaler.transform(buffer.rew) 27 | buffer.rew = buffer.rew * 0.01 28 | buffer.done[buffer.rew < 
np.sort(buffer.rew.reshape(-1))[int(len(buffer)*0.01)]] = 1 29 | 30 | buffer = BufferDataset(buffer) 31 | buffer = BufferDataloader(buffer, batch_size=1, collate_fn=lambda x: x[0], num_workers=8) 32 | """ 33 | 34 | return buffer 35 | -------------------------------------------------------------------------------- /offlinerl/evaluation/d4rl.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import d4rl 3 | import torch 4 | import numpy as np 5 | from tqdm import tqdm 6 | from collections import OrderedDict 7 | from d4rl.infos import REF_MIN_SCORE, REF_MAX_SCORE 8 | 9 | from offlinerl.utils.env import get_env 10 | 11 | 12 | def d4rl_score(task, rew_mean, len_mean): 13 | score = (rew_mean - REF_MIN_SCORE[task]) / (REF_MAX_SCORE[task] - REF_MIN_SCORE[task]) * 100 14 | 15 | return score 16 | 17 | 18 | def d4rl_eval_fn(task, eval_episodes=100): 19 | env = get_env(task) 20 | 21 | def d4rl_eval(policy): 22 | episode_rewards = [] 23 | episode_lengths = [] 24 | for _ in range(eval_episodes): 25 | state, done = env.reset(), False 26 | rewards = 0 27 | lengths = 0 28 | while not done: 29 | state = state[np.newaxis] 30 | action = policy.get_action(state) 31 | state, reward, done, _ = env.step(action) 32 | rewards += reward 33 | lengths += 1 34 | 35 | episode_rewards.append(rewards) 36 | episode_lengths.append(lengths) 37 | 38 | 39 | rew_mean = np.mean(episode_rewards) 40 | len_mean = np.mean(episode_lengths) 41 | 42 | score = d4rl_score(task, rew_mean, len_mean) 43 | 44 | res = OrderedDict() 45 | res["Reward_Mean"] = rew_mean 46 | res["Length_Mean"] = len_mean 47 | res["D4rl_Score"] = score 48 | 49 | return res 50 | 51 | return d4rl_eval -------------------------------------------------------------------------------- /offlinerl/evaluation/fqe.py: -------------------------------------------------------------------------------- 1 | # https://arxiv.org/abs/2007.09055 2 | # Hyperparameter Selection for Offline Reinforcement Learning 3 | from copy import deepcopy 4 | import torch 5 | from tqdm import tqdm 6 | 7 | from offlinerl.utils.net.common import MLP 8 | from offlinerl.utils.net.continuous import DistributionalCritic 9 | 10 | class FQE: 11 | # https://arxiv.org/abs/2007.09055 12 | # Hyperparameter Selection for Offline Reinforcement Learning 13 | def __init__(self, 14 | policy, 15 | buffer, 16 | q_hidden_features=1024, 17 | q_hidden_layers=4, 18 | device="cuda" if torch.cuda.is_available() else "cpu" 19 | ): 20 | self.policy = policy 21 | self.buffer = buffer 22 | self.critic_hidden_features = q_hidden_features 23 | self.critic_hidden_layers = q_hidden_layers 24 | self._device = device 25 | 26 | def train_estimator(self, 27 | init_critic=None, 28 | discount=0.99, 29 | target_update_period=100, 30 | critic_lr=1e-4, 31 | num_steps=250000, 32 | polyak=0.0, 33 | batch_size=256, 34 | verbose=False): 35 | 36 | min_reward = self.buffer.rew.min() 37 | max_reward = self.buffer.rew.max() 38 | 39 | max_value = (1.2 * max_reward + 0.8 * min_reward) / (1 - discount) 40 | min_value = (1.2 * min_reward + 0.8 * max_reward) / (1 - discount) 41 | 42 | data = self.buffer.sample(batch_size) 43 | input_dim = data.obs.shape[-1] + data.act.shape[-1] 44 | critic = MLP(input_dim, 1, self.critic_hidden_features, self.critic_hidden_layers).to(self._device) 45 | if init_critic is not None: critic.load_state_dict(init_critic.state_dict()) 46 | critic_optimizer = torch.optim.Adam(critic.parameters(), lr=critic_lr) 47 | target_critic = deepcopy(critic).to(self._device) 48 | 
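        # NOTE: FQE fits a Q-function for the given policy from its own
        # bootstrapped targets. The target critic created just above is frozen
        # below and refreshed every `target_update_period` steps by the polyak
        # update further down (polyak=0.0, the default, makes that a hard copy);
        # targets are clamped to [min_value, max_value] in the training loop to
        # prevent value explosion.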
target_critic.requires_grad_(False) 49 | 50 | if verbose: 51 | counter = tqdm(total=num_steps) 52 | 53 | print('Training Fqe...') 54 | for t in range(num_steps): 55 | batch = self.buffer.sample(batch_size) 56 | data = batch.to_torch(dtype=torch.float32, device=self._device) 57 | r = data.rew 58 | terminals = data.done 59 | o1 = data.obs 60 | a1 = data.act 61 | 62 | o2 = data.obs_next 63 | a2 = self.policy.get_action(o2) 64 | q_target = target_critic(torch.cat((o2, a2), -1)).detach() 65 | current_discount = discount * (1 - terminals) 66 | backup = r + current_discount * q_target 67 | backup = torch.clamp(backup, min_value, max_value) # prevent explosion 68 | 69 | q = critic(torch.cat((o1, a1), -1)) 70 | critic_loss = ((q - backup) ** 2).mean() 71 | 72 | critic_optimizer.zero_grad() 73 | critic_loss.backward() 74 | critic_optimizer.step() 75 | 76 | if t % target_update_period == 0: 77 | with torch.no_grad(): 78 | for p, p_targ in zip(critic.parameters(), target_critic.parameters()): 79 | p_targ.data.mul_(polyak) 80 | p_targ.data.add_((1 - polyak) * p.data) 81 | 82 | if verbose: 83 | counter.update(1) 84 | 85 | return critic -------------------------------------------------------------------------------- /offlinerl/evaluation/gym.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from collections import OrderedDict 4 | 5 | from offlinerl.utils.env import get_env 6 | 7 | def gym_policy_eval(task, eval_episodes=100): 8 | env = get_env(task) 9 | 10 | def policy_eval(policy): 11 | episode_rewards = [] 12 | episode_lengths = [] 13 | for _ in range(eval_episodes): 14 | state, done = env.reset(), False 15 | rewards = 0 16 | lengths = 0 17 | while not done: 18 | state = state[np.newaxis] 19 | action = policy.get_action(state).reshape(-1) 20 | state, reward, done, _ = env.step(action) 21 | rewards += reward 22 | lengths += 1 23 | 24 | episode_rewards.append(rewards) 25 | episode_lengths.append(lengths) 26 | 27 | 28 | rew_mean = np.mean(episode_rewards) 29 | len_mean = np.mean(episode_lengths) 30 | 31 | 32 | res = OrderedDict() 33 | res["Reward_Mean"] = rew_mean 34 | res["Length_Mean"] = len_mean 35 | 36 | return res 37 | 38 | return policy_eval 39 | 40 | 41 | def gym_env_eval(task, eval_episodes=100): 42 | env = get_env(task) 43 | 44 | def env_eval(policy, obs_scaler=None, act_scaler=None): 45 | env_mae = [] 46 | for _ in range(eval_episodes): 47 | state, done = env.reset(), False 48 | rewards = 0 49 | lengths = 0 50 | while not done: 51 | state = state[np.newaxis] 52 | action = env.action_space.sample() 53 | 54 | obs = state.reshape(1,-1) 55 | act = action.reshape(1,-1) 56 | if obs_scaler is not None: 57 | obs = obs_scaler.transform(obs) 58 | if act_scaler is not None: 59 | act = act_scaler.transform(act) 60 | 61 | policy_state = policy.get_action(np.concatenate([obs,act], axis=1)) 62 | 63 | if obs_scaler is not None: 64 | policy_state = obs_scaler.inverse_transform(policy_state) 65 | 66 | state, reward, done, _ = env.step(action) 67 | 68 | env_mae.append(np.mean(np.abs(policy_state -state))) 69 | 70 | env_mae = np.mean(env_mae) 71 | 72 | 73 | res = OrderedDict() 74 | res["Env_Mae"] = env_mae 75 | 76 | return res 77 | 78 | return env_eval -------------------------------------------------------------------------------- /offlinerl/evaluation/neorl.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import ray 3 | from copy import deepcopy 4 | import numpy as np 5 | from 
collections import OrderedDict 6 | 7 | from offlinerl.utils.env import get_env 8 | from multiprocessing import Pool 9 | 10 | 11 | #@ray.remote(num_gpus=0.1) 12 | def test_one_trail(env, policy): 13 | # env = deepcopy(env) 14 | # policy = deepcopy(policy) 15 | 16 | state, done = env.reset(), False 17 | if isinstance(state, tuple): 18 | state = state[0] 19 | rewards = 0 20 | lengths = 0 21 | while not done: 22 | state = state[np.newaxis] 23 | action = policy.get_action(state).reshape(-1) 24 | result = env.step(action) 25 | if len(result) == 4: 26 | state, reward, done, _ = result 27 | else: 28 | state, reward, done, timeout,_ = result 29 | done = done or timeout 30 | rewards += reward 31 | lengths += 1 32 | 33 | return (rewards, lengths) 34 | 35 | def test_one_trail_sp_local(env, policy): 36 | # env = deepcopy(env) 37 | # policy = deepcopy(policy) 38 | 39 | state, done = env.reset(), False 40 | rewards = 0 41 | lengths = 0 42 | obs_dim = env.observation_space.shape[0] 43 | act_dim = env.action_space.shape[0] 44 | 45 | while not done: 46 | state = state.reshape(-1, obs_dim) 47 | action = policy.get_action(state).reshape(-1, act_dim) 48 | # print("actions: ", action[0:3,]) 49 | state, reward, done, _ = env.step(action) 50 | rewards += reward 51 | lengths += 1 52 | 53 | return (rewards, lengths) 54 | 55 | def test_on_real_env(policy, env, number_of_runs=100): 56 | rewards = [] 57 | episode_lengths = [] 58 | policy = deepcopy(policy) 59 | policy.eval() 60 | 61 | if (not hasattr(env.spec, "id")) and ("sp" in env._name or "sales" in env._name): 62 | results = [test_one_trail_sp_local(env, policy) for _ in range(number_of_runs)] 63 | else: 64 | pool = Pool(processes=10) 65 | results = [pool.apply_async(test_one_trail, args=(env, policy)) for _ in range(number_of_runs)] 66 | results = [result.get() for result in results] 67 | pool.close() 68 | pool.join() 69 | 70 | policy.train() 71 | 72 | rewards = [result[0] for result in results] 73 | episode_lengths = [result[1] for result in results] 74 | 75 | rew_mean = np.mean(rewards) 76 | rew_std = np.std(rewards) 77 | len_mean = np.mean(episode_lengths) 78 | 79 | 80 | res = OrderedDict() 81 | res["Reward_Mean_Env"] = rew_mean 82 | res["Reward_Std_Env"] = rew_std 83 | res["Length_Mean_Env"] = len_mean 84 | res["Length_Std_Env"] = np.std(episode_lengths) 85 | 86 | return res 87 | -------------------------------------------------------------------------------- /offlinerl/outside_utils/buffer/__init__.py: -------------------------------------------------------------------------------- 1 | from offlinerl.outside_utils.buffer.buffer import ReplayBuffer 2 | 3 | 4 | __all__ = [ 5 | "ReplayBuffer" 6 | ] -------------------------------------------------------------------------------- /offlinerl/outside_utils/buffer/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from typing import Optional, Union, Tuple, Dict 5 | 6 | 7 | class ReplayBuffer: 8 | def __init__( 9 | self, 10 | buffer_size: int, 11 | obs_shape: Tuple, 12 | obs_dtype: np.dtype, 13 | action_dim: int, 14 | action_dtype: np.dtype, 15 | device: str = "cpu" 16 | ) -> None: 17 | self._max_size = buffer_size 18 | self.obs_shape = obs_shape 19 | self.obs_dtype = obs_dtype 20 | self.action_dim = action_dim 21 | self.action_dtype = action_dtype 22 | 23 | self._ptr = 0 24 | self._size = 0 25 | self.observations = np.zeros((self._max_size,) + self.obs_shape, dtype=obs_dtype) 26 | self.next_observations = 
np.zeros((self._max_size,) + self.obs_shape, dtype=obs_dtype) 27 | self.actions = np.zeros((self._max_size, self.action_dim), dtype=action_dtype) 28 | self.rewards = np.zeros((self._max_size, 1), dtype=np.float32) 29 | self.terminals = np.zeros((self._max_size, 1), dtype=np.float32) 30 | 31 | self.device = torch.device(device) 32 | 33 | def add( 34 | self, 35 | obs: np.ndarray, 36 | next_obs: np.ndarray, 37 | action: np.ndarray, 38 | reward: np.ndarray, 39 | terminal: np.ndarray 40 | ) -> None: 41 | # Copy to avoid modification by reference 42 | self.observations[self._ptr] = np.array(obs).copy() 43 | self.next_observations[self._ptr] = np.array(next_obs).copy() 44 | self.actions[self._ptr] = np.array(action).copy() 45 | self.rewards[self._ptr] = np.array(reward).copy() 46 | self.terminals[self._ptr] = np.array(terminal).copy() 47 | 48 | self._ptr = (self._ptr + 1) % self._max_size 49 | self._size = min(self._size + 1, self._max_size) 50 | 51 | def add_batch( 52 | self, 53 | obss: np.ndarray, 54 | next_obss: np.ndarray, 55 | actions: np.ndarray, 56 | rewards: np.ndarray, 57 | terminals: np.ndarray 58 | ) -> None: 59 | batch_size = len(obss) 60 | indexes = np.arange(self._ptr, self._ptr + batch_size) % self._max_size 61 | 62 | self.observations[indexes] = np.array(obss).copy() 63 | self.next_observations[indexes] = np.array(next_obss).copy() 64 | self.actions[indexes] = np.array(actions).copy() 65 | self.rewards[indexes] = np.array(rewards).copy() 66 | self.terminals[indexes] = np.array(terminals).copy() 67 | 68 | self._ptr = (self._ptr + batch_size) % self._max_size 69 | self._size = min(self._size + batch_size, self._max_size) 70 | 71 | def load_dataset(self, dataset: Dict[str, np.ndarray]) -> None: 72 | observations = np.array(dataset["obs"], dtype=self.obs_dtype) 73 | next_observations = np.array(dataset["obs_next"], dtype=self.obs_dtype) 74 | actions = np.array(dataset["act"], dtype=self.action_dtype) 75 | rewards = np.array(dataset["rew"], dtype=np.float32).reshape(-1, 1) 76 | terminals = np.array(dataset["done"], dtype=np.float32).reshape(-1, 1) 77 | 78 | self.observations = observations 79 | self.next_observations = next_observations 80 | self.actions = actions 81 | self.rewards = rewards 82 | self.terminals = terminals 83 | 84 | self._ptr = len(observations) 85 | self._size = len(observations) 86 | 87 | def normalize_obs(self, eps: float = 1e-3, inplace : bool = True) -> Tuple[np.ndarray, np.ndarray]: 88 | mean = self.observations.mean(0, keepdims=True) 89 | std = self.observations.std(0, keepdims=True) + eps 90 | if inplace: 91 | self.observations = (self.observations - mean) / std 92 | self.next_observations = (self.next_observations - mean) / std 93 | obs_mean, obs_std = mean, std 94 | return obs_mean, obs_std 95 | 96 | def sample(self, batch_size: int) -> Dict[str, torch.Tensor]: 97 | 98 | batch_indexes = np.random.randint(0, self._size, size=batch_size) 99 | 100 | return { 101 | "observations": torch.tensor(self.observations[batch_indexes]).to(self.device), 102 | "actions": torch.tensor(self.actions[batch_indexes]).to(self.device), 103 | "next_observations": torch.tensor(self.next_observations[batch_indexes]).to(self.device), 104 | "terminals": torch.tensor(self.terminals[batch_indexes]).to(self.device), 105 | "rewards": torch.tensor(self.rewards[batch_indexes]).to(self.device) 106 | } 107 | 108 | def sample_all(self) -> Dict[str, np.ndarray]: 109 | return { 110 | "observations": self.observations[:self._size].copy(), 111 | "actions": self.actions[:self._size].copy(), 112 | 
"next_observations": self.next_observations[:self._size].copy(), 113 | "terminals": self.terminals[:self._size].copy(), 114 | "rewards": self.rewards[:self._size].copy() 115 | } -------------------------------------------------------------------------------- /offlinerl/outside_utils/dynamics/__init__.py: -------------------------------------------------------------------------------- 1 | from offlinerl.outside_utils.dynamics.base_dynamics import BaseDynamics 2 | from offlinerl.outside_utils.dynamics.ensemble_dynamics import EnsembleDynamics 3 | from offlinerl.outside_utils.dynamics.rnn_dynamics import RNNDynamics 4 | from offlinerl.outside_utils.dynamics.mujoco_oracle_dynamics import MujocoOracleDynamics 5 | 6 | 7 | __all__ = [ 8 | "BaseDynamics", 9 | "EnsembleDynamics", 10 | "RNNDynamics", 11 | "MujocoOracleDynamics" 12 | ] -------------------------------------------------------------------------------- /offlinerl/outside_utils/dynamics/base_dynamics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | from typing import Callable, List, Tuple, Dict 6 | 7 | 8 | class BaseDynamics(object): 9 | def __init__( 10 | self, 11 | model: nn.Module, 12 | optim: torch.optim.Optimizer 13 | ) -> None: 14 | super().__init__() 15 | self.model = model 16 | self.optim = optim 17 | 18 | def step( 19 | self, 20 | obs: np.ndarray, 21 | action: np.ndarray 22 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]: 23 | raise NotImplementedError 24 | -------------------------------------------------------------------------------- /offlinerl/outside_utils/dynamics/ensemble_dynamics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | 6 | from typing import Callable, List, Tuple, Dict, Optional 7 | from offlinerl.outside_utils.dynamics import BaseDynamics 8 | from offlinerl.outside_utils.utils.scaler import StandardScaler 9 | from offlinerl.outside_utils.utils.logger import Logger 10 | import warnings 11 | 12 | 13 | class EnsembleDynamics(BaseDynamics): 14 | def __init__( 15 | self, 16 | model: nn.Module, 17 | optim: torch.optim.Optimizer, 18 | scaler: StandardScaler, 19 | terminal_fn: Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray], 20 | penalty_coef: float = 0.0, 21 | uncertainty_mode: str = "aleatoric", 22 | data_range: tuple = None, 23 | ) -> None: 24 | super().__init__(model, optim) 25 | self.scaler = scaler 26 | self.terminal_fn = terminal_fn 27 | self._penalty_coef = penalty_coef 28 | self._uncertainty_mode = uncertainty_mode 29 | self.obs_min, self.obs_max, self.rew_min, self.rew_max = data_range 30 | 31 | @ torch.no_grad() 32 | def step( 33 | self, 34 | obs: np.ndarray, 35 | action: np.ndarray, 36 | transition_scaler: bool = True, 37 | transition_clip: bool = False, 38 | clip_penalty: bool = False, 39 | max_penalty: float = 0, 40 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]: 41 | "imagine single forward step" 42 | obs_act = np.concatenate([obs, action], axis=-1) 43 | if transition_scaler: 44 | obs_act = self.scaler.transform(obs_act) 45 | mean, logvar = self.model(obs_act) 46 | mean = mean.cpu().numpy() 47 | logvar = logvar.cpu().numpy() 48 | mean[..., :-1] += obs 49 | std = np.sqrt(np.exp(logvar)) 50 | 51 | ensemble_samples = (mean + np.random.normal(size=mean.shape) * std).astype(np.float32) 52 | 53 | # choose one model from ensemble 54 | num_models, batch_size, _ 
= ensemble_samples.shape 55 | model_idxs = self.model.random_elite_idxs(batch_size) 56 | samples = ensemble_samples[model_idxs, np.arange(batch_size)] 57 | 58 | next_obs = samples[..., :-1] 59 | reward = samples[..., -1:] 60 | terminal = self.terminal_fn(obs, action, next_obs) 61 | if transition_clip: 62 | next_obs = np.clip(next_obs, self.obs_min, self.obs_max) 63 | reward = np.clip(reward, self.rew_min, self.rew_max) 64 | 65 | info = {} 66 | info["raw_reward"] = reward 67 | 68 | if self._penalty_coef > 0.0: 69 | norm_mean = mean 70 | norm_std = std 71 | if self._uncertainty_mode == "aleatoric": 72 | penalty = np.amax(np.linalg.norm(norm_std, axis=2), axis=0) 73 | elif self._uncertainty_mode == "pairwise-diff": 74 | next_obses_mean = norm_mean[..., :-1] 75 | next_obs_mean = np.mean(next_obses_mean, axis=0) 76 | diff = next_obses_mean - next_obs_mean 77 | penalty = np.amax(np.linalg.norm(diff, axis=2), axis=0) 78 | elif self._uncertainty_mode == "ensemble_std": 79 | next_obses_mean = norm_mean[..., :-1] 80 | penalty = np.sqrt(next_obses_mean.var(0).mean(1)) 81 | else: 82 | warnings.warn("Invalid uncertainty mode. No penalty applied!!!") 83 | penalty = np.zeros_like(reward).mean(1) 84 | 85 | penalty = np.expand_dims(penalty, 1).astype(np.float32) 86 | if clip_penalty: 87 | penalty = np.clip(penalty, a_max=max_penalty) 88 | assert penalty.shape == reward.shape 89 | reward = reward - self._penalty_coef * penalty 90 | info["penalty"] = penalty 91 | 92 | return next_obs, reward, np.bool_(terminal), info 93 | 94 | @ torch.no_grad() 95 | def sample_next_obss( 96 | self, 97 | obs: torch.Tensor, 98 | action: torch.Tensor, 99 | num_samples: int, 100 | transition_scaler: bool = True, 101 | transition_clip: bool = False, 102 | ) -> torch.Tensor: 103 | obs_act = torch.cat([obs, action], dim=-1) 104 | if transition_scaler: 105 | obs_act = self.scaler.transform_tensor(obs_act) 106 | mean, logvar = self.model(obs_act) 107 | mean[..., :-1] += obs 108 | std = torch.sqrt(torch.exp(logvar)) 109 | 110 | mean = mean[self.model.elites.data.cpu().numpy()] 111 | std = std[self.model.elites.data.cpu().numpy()] 112 | 113 | samples = torch.stack([mean + torch.randn_like(std) * std for i in range(num_samples)], 0) 114 | next_obss = samples[..., :-1] 115 | if transition_clip: 116 | obs_min = torch.as_tensor(self.obs_min).to(next_obss.device) 117 | obs_max = torch.as_tensor(self.obs_max).to(next_obss.device) 118 | next_obss = torch.clamp(next_obss, obs_min, obs_max) 119 | return next_obss 120 | 121 | def format_samples_for_training(self, data: Dict) -> Tuple[np.ndarray, np.ndarray]: 122 | obss = data["observations"] 123 | actions = data["actions"] 124 | next_obss = data["next_observations"] 125 | rewards = data["rewards"] 126 | delta_obss = next_obss - obss 127 | inputs = np.concatenate((obss, actions), axis=-1) 128 | targets = np.concatenate((delta_obss, rewards), axis=-1) 129 | return inputs, targets 130 | 131 | def select_elites(self, metrics: List) -> List[int]: 132 | pairs = [(metric, index) for metric, index in zip(metrics, range(len(metrics)))] 133 | pairs = sorted(pairs, key=lambda x: x[0]) 134 | elites = [pairs[i][1] for i in range(self.model.num_elites)] 135 | return elites 136 | 137 | def save(self, save_path: str) -> None: 138 | torch.save(self.model.state_dict(), os.path.join(save_path, "dynamics.pth")) 139 | self.scaler.save_scaler(save_path) 140 | 141 | def load(self, load_path: str) -> None: 142 | self.model.load_state_dict(torch.load(os.path.join(load_path, "dynamics.pth"), 
map_location=self.model.device)) 143 | self.scaler.load_scaler(load_path) 144 | -------------------------------------------------------------------------------- /offlinerl/outside_utils/dynamics/mujoco_oracle_dynamics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym.envs.mujoco import mujoco_env 4 | from typing import Callable, List, Tuple, Dict 5 | 6 | 7 | class MujocoOracleDynamics(object): 8 | def __init__(self, env: mujoco_env.MujocoEnv) -> None: 9 | self.env = env 10 | 11 | def _set_state_from_obs(self, obs:np.ndarray) -> None: 12 | if len(obs) == (self.env.model.nq + self.env.model.nv - 1): 13 | xpos = np.zeros(1) 14 | obs = np.concatenate([xpos, obs]) 15 | qpos = obs[:self.env.model.nq] 16 | qvel = obs[self.env.model.nq:] 17 | self.env._elapsed_steps = 0 18 | self.env.set_state(qpos, qvel) 19 | 20 | def step( 21 | self, 22 | obs: np.ndarray, 23 | action: np.ndarray 24 | ) -> Tuple[np.ndarray, float, bool, Dict]: 25 | if (len(obs.shape) > 1) or (len(action.shape) > 1): 26 | raise ValueError 27 | self._set_state_from_obs(obs) 28 | next_obs, reward, terminal, info = self.env.step(action) 29 | return next_obs, reward, terminal, info -------------------------------------------------------------------------------- /offlinerl/outside_utils/dynamics/rnn_dynamics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | 6 | from typing import Callable, List, Tuple, Dict 7 | from torch.utils.data.dataloader import DataLoader 8 | from offlinerl.outside_utils.dynamics import BaseDynamics 9 | from offlinerl.outside_utils.utils.scaler import StandardScaler 10 | from offlinerl.outside_utils.utils.logger import Logger 11 | 12 | 13 | class RNNDynamics(BaseDynamics): 14 | def __init__( 15 | self, 16 | model: nn.Module, 17 | optim: torch.optim.Optimizer, 18 | scaler: StandardScaler, 19 | terminal_fn: Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray], 20 | ) -> None: 21 | super().__init__(model, optim) 22 | self.scaler = scaler 23 | self.terminal_fn = terminal_fn 24 | 25 | @ torch.no_grad() 26 | def step( 27 | self, 28 | obss: np.ndarray, 29 | actions: np.ndarray 30 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]: 31 | "imagine single forward step" 32 | inputs = np.concatenate([obss, actions], axis=-1) 33 | inputs = self.scaler.transform(inputs) 34 | preds, _ = self.model(inputs) 35 | # get last timestep pred 36 | preds = preds[:, -1] 37 | next_obss = preds[..., :-1].cpu().numpy() + obss[:, -1] 38 | rewards = preds[..., -1:].cpu().numpy() 39 | 40 | terminals = self.terminal_fn(obss[:, -1], actions[:, -1], next_obss) 41 | info = {} 42 | 43 | return next_obss, rewards, terminals, info 44 | 45 | def train(self, data: Dict, batch_size: int, max_iters: int, logger: Logger) -> None: 46 | self.model.train() 47 | loader = DataLoader(data, shuffle=True, batch_size=batch_size) 48 | for iter in range(max_iters): 49 | for batch in loader: 50 | train_loss = self.learn(batch) 51 | logger.logkv_mean("loss/model", train_loss) 52 | 53 | logger.set_timestep(iter) 54 | logger.dumpkvs(exclude=["policy_training_progress"]) 55 | self.save(logger.model_dir) 56 | self.model.eval() 57 | 58 | def learn(self, batch) -> float: 59 | inputs, targets, masks = batch 60 | preds, _ = self.model.forward(inputs) 61 | 62 | loss = (((preds - targets) ** 2).mean(-1) * masks).mean() 63 | 64 | self.optim.zero_grad() 65 | loss.backward() 66 | 
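        # NOTE: the per-timestep squared error above is weighted by `masks` so
        # that padded steps of variable-length sequences do not contribute to
        # the loss; the gradients from loss.backward() are applied by the
        # optimizer step below.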
self.optim.step() 67 | 68 | return loss.item() 69 | 70 | def save(self, save_path: str) -> None: 71 | torch.save(self.model.state_dict(), os.path.join(save_path, "dynamics.pth")) 72 | self.scaler.save_scaler(save_path) 73 | 74 | def load(self, load_path: str) -> None: 75 | self.model.load_state_dict(torch.load(os.path.join(load_path, "dynamics.pth"), map_location=self.model.device)) 76 | self.scaler.load_scaler(load_path) -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from offlinerl.outside_utils.modules.actor_module import Actor, ActorProb 2 | from offlinerl.outside_utils.modules.critic_module import Critic 3 | from offlinerl.outside_utils.modules.ensemble_critic_module import EnsembleCritic 4 | from offlinerl.outside_utils.modules.dist_module import DiagGaussian, TanhDiagGaussian 5 | from offlinerl.outside_utils.modules.dynamics_module import EnsembleDynamicsModel 6 | 7 | 8 | __all__ = [ 9 | "Actor", 10 | "ActorProb", 11 | "Critic", 12 | "EnsembleCritic", 13 | "DiagGaussian", 14 | "TanhDiagGaussian", 15 | "EnsembleDynamicsModel" 16 | ] -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/actor_module.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | from typing import Union, Optional 6 | 7 | 8 | # for SAC 9 | class ActorProb(nn.Module): 10 | def __init__( 11 | self, 12 | backbone: nn.Module, 13 | dist_net: nn.Module, 14 | device: str = "cpu" 15 | ) -> None: 16 | super().__init__() 17 | 18 | self.device = torch.device(device) 19 | self.backbone = backbone.to(device) 20 | self.dist_net = dist_net.to(device) 21 | self.scaler = None 22 | 23 | def set_scaler(self, scaler): 24 | self.scaler = scaler 25 | 26 | def forward(self, obs: Union[np.ndarray, torch.Tensor]) -> torch.distributions.Normal: 27 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32) 28 | logits = self.backbone(obs) 29 | dist = self.dist_net(logits) 30 | return dist 31 | 32 | def get_action(self, obs: Union[np.ndarray, torch.Tensor]) -> torch.Tensor: 33 | if self.scaler is not None: 34 | obs = self.scaler.transform(obs) 35 | dist = self.forward(obs) 36 | action, _ = dist.mode() 37 | return action.detach().cpu().numpy() 38 | 39 | def to(self, device: str) -> None: 40 | self.device = torch.device(device) 41 | self.backbone.to(device) 42 | self.dist_net.to(device) 43 | return self 44 | 45 | 46 | # for TD3 47 | class Actor(nn.Module): 48 | def __init__( 49 | self, 50 | backbone: nn.Module, 51 | action_dim: int, 52 | max_action: float = 1.0, 53 | device: str = "cpu" 54 | ) -> None: 55 | super().__init__() 56 | 57 | self.device = torch.device(device) 58 | self.backbone = backbone.to(device) 59 | latent_dim = getattr(backbone, "output_dim") 60 | output_dim = action_dim 61 | self.last = nn.Linear(latent_dim, output_dim).to(device) 62 | self._max = max_action 63 | 64 | def forward(self, obs: Union[np.ndarray, torch.Tensor]) -> torch.Tensor: 65 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32) 66 | logits = self.backbone(obs) 67 | actions = self._max * torch.tanh(self.last(logits)) 68 | return actions -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/critic_module.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | from typing import Union, Optional 6 | 7 | 8 | class Critic(nn.Module): 9 | def __init__(self, backbone: nn.Module, device: str = "cpu") -> None: 10 | super().__init__() 11 | 12 | self.device = torch.device(device) 13 | self.backbone = backbone.to(device) 14 | latent_dim = getattr(backbone, "output_dim") 15 | self.last = nn.Linear(latent_dim, 1).to(device) 16 | 17 | def forward( 18 | self, 19 | obs: Union[np.ndarray, torch.Tensor], 20 | actions: Optional[Union[np.ndarray, torch.Tensor]] = None 21 | ) -> torch.Tensor: 22 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32) 23 | if actions is not None: 24 | actions = torch.as_tensor(actions, device=self.device, dtype=torch.float32).flatten(1) 25 | obs = torch.cat([obs, actions], dim=1) 26 | logits = self.backbone(obs) 27 | values = self.last(logits) 28 | return values -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/dist_module.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class NormalWrapper(torch.distributions.Normal): 7 | def log_prob(self, actions): 8 | return super().log_prob(actions).sum(-1, keepdim=True) 9 | 10 | def entropy(self): 11 | return super().entropy().sum(-1) 12 | 13 | def mode(self): 14 | return self.mean 15 | 16 | 17 | class TanhNormalWrapper(torch.distributions.Normal): 18 | def log_prob(self, action, raw_action=None): 19 | if raw_action is None: 20 | raw_action = self.arctanh(action) 21 | log_prob = super().log_prob(raw_action).sum(-1, keepdim=True) 22 | eps = 1e-6 23 | log_prob = log_prob - torch.log((1 - action.pow(2)) + eps).sum(-1, keepdim=True) 24 | return log_prob 25 | 26 | def mode(self): 27 | raw_action = self.mean 28 | action = torch.tanh(self.mean) 29 | return action, raw_action 30 | 31 | def arctanh(self, x): 32 | one_plus_x = (1 + x).clamp(min=1e-6) 33 | one_minus_x = (1 - x).clamp(min=1e-6) 34 | return 0.5 * torch.log(one_plus_x / one_minus_x) 35 | 36 | def rsample(self): 37 | raw_action = super().rsample() 38 | action = torch.tanh(raw_action) 39 | return action, raw_action 40 | 41 | 42 | class DiagGaussian(nn.Module): 43 | def __init__( 44 | self, 45 | latent_dim, 46 | output_dim, 47 | unbounded=False, 48 | conditioned_sigma=False, 49 | max_mu=1.0, 50 | sigma_min=-5.0, 51 | sigma_max=2.0 52 | ): 53 | super().__init__() 54 | self.mu = nn.Linear(latent_dim, output_dim) 55 | self._c_sigma = conditioned_sigma 56 | if conditioned_sigma: 57 | self.sigma = nn.Linear(latent_dim, output_dim) 58 | else: 59 | self.sigma_param = nn.Parameter(torch.zeros(output_dim, 1)) 60 | self._unbounded = unbounded 61 | self._max = max_mu 62 | self._sigma_min = sigma_min 63 | self._sigma_max = sigma_max 64 | 65 | def forward(self, logits): 66 | mu = self.mu(logits) 67 | if not self._unbounded: 68 | mu = self._max * torch.tanh(mu) 69 | if self._c_sigma: 70 | sigma = torch.clamp(self.sigma(logits), min=self._sigma_min, max=self._sigma_max).exp() 71 | else: 72 | shape = [1] * len(mu.shape) 73 | shape[1] = -1 74 | sigma = (self.sigma_param.view(shape) + torch.zeros_like(mu)).exp() 75 | return NormalWrapper(mu, sigma) 76 | 77 | 78 | class TanhDiagGaussian(DiagGaussian): 79 | def __init__( 80 | self, 81 | latent_dim, 82 | output_dim, 83 | unbounded=False, 
84 | conditioned_sigma=False, 85 | max_mu=1.0, 86 | sigma_min=-5.0, 87 | sigma_max=2.0 88 | ): 89 | super().__init__( 90 | latent_dim=latent_dim, 91 | output_dim=output_dim, 92 | unbounded=unbounded, 93 | conditioned_sigma=conditioned_sigma, 94 | max_mu=max_mu, 95 | sigma_min=sigma_min, 96 | sigma_max=sigma_max 97 | ) 98 | 99 | def forward(self, logits): 100 | mu = self.mu(logits) 101 | if not self._unbounded: 102 | mu = self._max * torch.tanh(mu) 103 | if self._c_sigma: 104 | sigma = torch.clamp(self.sigma(logits), min=self._sigma_min, max=self._sigma_max).exp() 105 | else: 106 | shape = [1] * len(mu.shape) 107 | shape[1] = -1 108 | sigma = (self.sigma_param.view(shape) + torch.zeros_like(mu)).exp() 109 | return TanhNormalWrapper(mu, sigma) 110 | -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/dynamics_module.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | from typing import Dict, List, Union, Tuple, Optional 6 | from offlinerl.outside_utils.nets import EnsembleLinear 7 | 8 | 9 | class Swish(nn.Module): 10 | def __init__(self) -> None: 11 | super(Swish, self).__init__() 12 | 13 | def forward(self, x: torch.Tensor) -> torch.Tensor: 14 | x = x * torch.sigmoid(x) 15 | return x 16 | 17 | 18 | def soft_clamp( 19 | x : torch.Tensor, 20 | _min: Optional[torch.Tensor] = None, 21 | _max: Optional[torch.Tensor] = None 22 | ) -> torch.Tensor: 23 | # clamp tensor values while mataining the gradient 24 | if _max is not None: 25 | x = _max - F.softplus(_max - x) 26 | if _min is not None: 27 | x = _min + F.softplus(x - _min) 28 | return x 29 | 30 | 31 | class EnsembleDynamicsModel(nn.Module): 32 | def __init__( 33 | self, 34 | obs_dim: int, 35 | action_dim: int, 36 | hidden_dims: Union[List[int], Tuple[int]], 37 | num_ensemble: int = 7, 38 | num_elites: int = 5, 39 | activation: nn.Module = Swish, 40 | weight_decays: Optional[Union[List[float], Tuple[float]]] = None, 41 | with_reward: bool = True, 42 | device: str = "cpu" 43 | ) -> None: 44 | super().__init__() 45 | 46 | self.num_ensemble = num_ensemble 47 | self.num_elites = num_elites 48 | self._with_reward = with_reward 49 | self.device = torch.device(device) 50 | 51 | self.activation = activation() 52 | 53 | assert len(weight_decays) == (len(hidden_dims) + 1) 54 | 55 | module_list = [] 56 | hidden_dims = [obs_dim+action_dim] + list(hidden_dims) 57 | if weight_decays is None: 58 | weight_decays = [0.0] * (len(hidden_dims) + 1) 59 | for in_dim, out_dim, weight_decay in zip(hidden_dims[:-1], hidden_dims[1:], weight_decays[:-1]): 60 | module_list.append(EnsembleLinear(in_dim, out_dim, num_ensemble, weight_decay)) 61 | self.backbones = nn.ModuleList(module_list) 62 | 63 | self.output_layer = EnsembleLinear( 64 | hidden_dims[-1], 65 | 2 * (obs_dim + self._with_reward), 66 | num_ensemble, 67 | weight_decays[-1] 68 | ) 69 | 70 | self.register_parameter( 71 | "max_logvar", 72 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * 0.5, requires_grad=True) 73 | ) 74 | self.register_parameter( 75 | "min_logvar", 76 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * -10, requires_grad=True) 77 | ) 78 | 79 | self.register_parameter( 80 | "elites", 81 | nn.Parameter(torch.tensor(list(range(0, self.num_elites))), requires_grad=False) 82 | ) 83 | 84 | self.to(self.device) 85 | 86 | def forward(self, obs_action: np.ndarray) -> Tuple[torch.Tensor, 
torch.Tensor]: 87 | if isinstance(obs_action, np.ndarray): 88 | obs_action = torch.as_tensor(obs_action, dtype=torch.float32).to(self.device) 89 | output = obs_action 90 | for layer in self.backbones: 91 | output = self.activation(layer(output)) 92 | mean, logvar = torch.chunk(self.output_layer(output), 2, dim=-1) 93 | logvar = soft_clamp(logvar, self.min_logvar, self.max_logvar) 94 | return mean, logvar 95 | 96 | def load_save(self) -> None: 97 | for layer in self.backbones: 98 | layer.load_save() 99 | self.output_layer.load_save() 100 | 101 | def update_save(self, indexes: List[int]) -> None: 102 | for layer in self.backbones: 103 | layer.update_save(indexes) 104 | self.output_layer.update_save(indexes) 105 | 106 | def get_decay_loss(self) -> torch.Tensor: 107 | decay_loss = 0 108 | for layer in self.backbones: 109 | decay_loss += layer.get_decay_loss() 110 | decay_loss += self.output_layer.get_decay_loss() 111 | return decay_loss 112 | 113 | def set_elites(self, indexes: List[int]) -> None: 114 | assert len(indexes) <= self.num_ensemble and max(indexes) < self.num_ensemble 115 | self.register_parameter('elites', nn.Parameter(torch.tensor(indexes), requires_grad=False)) 116 | 117 | def random_elite_idxs(self, batch_size: int) -> np.ndarray: 118 | idxs = np.random.choice(self.elites.data.cpu().numpy(), size=batch_size) 119 | return idxs -------------------------------------------------------------------------------- /offlinerl/outside_utils/modules/ensemble_critic_module.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | from typing import Union, Optional, List, Tuple 6 | 7 | from offlinerl.outside_utils.nets import EnsembleLinear 8 | 9 | 10 | class EnsembleCritic(nn.Module): 11 | def __init__( 12 | self, 13 | obs_dim: int, 14 | action_dim: int, 15 | hidden_dims: Union[List[int], Tuple[int]], 16 | activation: nn.Module = nn.ReLU, 17 | num_ensemble: int = 10, 18 | device: str = "cpu" 19 | ) -> None: 20 | super().__init__() 21 | input_dim = obs_dim + action_dim 22 | hidden_dims = [input_dim] + list(hidden_dims) 23 | model = [] 24 | for in_dim, out_dim in zip(hidden_dims[:-1], hidden_dims[1:]): 25 | model += [EnsembleLinear(in_dim, out_dim, num_ensemble), activation()] 26 | model.append(EnsembleLinear(hidden_dims[-1], 1, num_ensemble)) 27 | self.model = nn.Sequential(*model) 28 | 29 | self.device = torch.device(device) 30 | self.model = self.model.to(device) 31 | self._num_ensemble = num_ensemble 32 | 33 | def forward( 34 | self, 35 | obs: Union[np.ndarray, torch.Tensor], 36 | actions: Optional[Union[np.ndarray, torch.Tensor]] = None 37 | ) -> torch.Tensor: 38 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32) 39 | if actions is not None: 40 | actions = torch.as_tensor(actions, device=self.device, dtype=torch.float32) 41 | obs = torch.cat([obs, actions], dim=-1) 42 | values = self.model(obs) 43 | # values: [num_ensemble, batch_size, 1] 44 | return values -------------------------------------------------------------------------------- /offlinerl/outside_utils/nets/__init__.py: -------------------------------------------------------------------------------- 1 | from offlinerl.outside_utils.nets.mlp import MLP 2 | from offlinerl.outside_utils.nets.vae import VAE 3 | from offlinerl.outside_utils.nets.ensemble_linear import EnsembleLinear 4 | from offlinerl.outside_utils.nets.rnn import RNNModel 5 | 6 | 7 | __all__ = [ 8 | "MLP", 9 | 
"VAE", 10 | "EnsembleLinear", 11 | "RNNModel" 12 | ] -------------------------------------------------------------------------------- /offlinerl/outside_utils/nets/ensemble_linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn import functional as F 5 | from typing import Dict, List, Union, Tuple, Optional 6 | 7 | 8 | class EnsembleLinear(nn.Module): 9 | def __init__( 10 | self, 11 | input_dim: int, 12 | output_dim: int, 13 | num_ensemble: int, 14 | weight_decay: float = 0.0 15 | ) -> None: 16 | super().__init__() 17 | 18 | self.num_ensemble = num_ensemble 19 | 20 | self.register_parameter("weight", nn.Parameter(torch.zeros(num_ensemble, input_dim, output_dim))) 21 | self.register_parameter("bias", nn.Parameter(torch.zeros(num_ensemble, 1, output_dim))) 22 | 23 | nn.init.trunc_normal_(self.weight, std=1/(2*input_dim**0.5)) 24 | 25 | self.register_parameter("saved_weight", nn.Parameter(self.weight.detach().clone())) 26 | self.register_parameter("saved_bias", nn.Parameter(self.bias.detach().clone())) 27 | 28 | self.weight_decay = weight_decay 29 | 30 | def forward(self, x: torch.Tensor) -> torch.Tensor: 31 | weight = self.weight 32 | bias = self.bias 33 | 34 | if len(x.shape) == 2: 35 | x = torch.einsum('ij,bjk->bik', x, weight) 36 | else: 37 | x = torch.einsum('bij,bjk->bik', x, weight) 38 | 39 | x = x + bias 40 | 41 | return x 42 | 43 | def load_save(self) -> None: 44 | self.weight.data.copy_(self.saved_weight.data) 45 | self.bias.data.copy_(self.saved_bias.data) 46 | 47 | def update_save(self, indexes: List[int]) -> None: 48 | self.saved_weight.data[indexes] = self.weight.data[indexes] 49 | self.saved_bias.data[indexes] = self.bias.data[indexes] 50 | 51 | def get_decay_loss(self) -> torch.Tensor: 52 | decay_loss = self.weight_decay * (0.5*((self.weight**2).sum())) 53 | return decay_loss -------------------------------------------------------------------------------- /offlinerl/outside_utils/nets/mlp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | from torch.nn import functional as F 6 | from typing import Dict, List, Union, Tuple, Optional 7 | 8 | 9 | class MLP(nn.Module): 10 | def __init__( 11 | self, 12 | input_dim: int, 13 | hidden_dims: Union[List[int], Tuple[int]], 14 | output_dim: Optional[int] = None, 15 | activation: nn.Module = nn.ReLU, 16 | dropout_rate: Optional[float] = None 17 | ) -> None: 18 | super().__init__() 19 | hidden_dims = [input_dim] + list(hidden_dims) 20 | model = [] 21 | for in_dim, out_dim in zip(hidden_dims[:-1], hidden_dims[1:]): 22 | model += [nn.Linear(in_dim, out_dim), activation()] 23 | if dropout_rate is not None: 24 | model += [nn.Dropout(p=dropout_rate)] 25 | 26 | self.output_dim = hidden_dims[-1] 27 | if output_dim is not None: 28 | model += [nn.Linear(hidden_dims[-1], output_dim)] 29 | self.output_dim = output_dim 30 | self.model = nn.Sequential(*model) 31 | 32 | def forward(self, x: torch.Tensor) -> torch.Tensor: 33 | return self.model(x) -------------------------------------------------------------------------------- /offlinerl/outside_utils/nets/rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | 5 | 6 | class Swish(nn.Module): 7 | def __init__(self): 8 | super(Swish, self).__init__() 9 | 10 | def 
forward(self, x): 11 | x = x * torch.sigmoid(x) 12 | return x 13 | 14 | 15 | def soft_clamp(x : torch.Tensor, _min=None, _max=None): 16 | # clamp tensor values while mataining the gradient 17 | if _max is not None: 18 | x = _max - F.softplus(_max - x) 19 | if _min is not None: 20 | x = _min + F.softplus(x - _min) 21 | return x 22 | 23 | 24 | class ResBlock(nn.Module): 25 | def __init__( 26 | self, 27 | input_dim, 28 | output_dim, 29 | activation=Swish(), 30 | layer_norm=True, 31 | with_residual=True, 32 | dropout=0.1 33 | ): 34 | super().__init__() 35 | 36 | self.linear = nn.Linear(input_dim, output_dim) 37 | self.activation = activation 38 | self.layer_norm = nn.LayerNorm(output_dim) if layer_norm else None 39 | self.dropout = nn.Dropout(dropout) if dropout else None 40 | self.with_residual = with_residual 41 | 42 | def forward(self, x): 43 | y = self.activation(self.linear(x)) 44 | if self.dropout is not None: 45 | y = self.dropout(y) 46 | if self.with_residual: 47 | y = x + y 48 | if self.layer_norm is not None: 49 | y = self.layer_norm(y) 50 | return y 51 | 52 | 53 | class RNNModel(nn.Module): 54 | def __init__( 55 | self, 56 | input_dim, 57 | output_dim, 58 | hidden_dims=[200, 200, 200, 200], 59 | rnn_num_layers=3, 60 | dropout_rate=0.1, 61 | device="cpu" 62 | ): 63 | super().__init__() 64 | self.input_dim = input_dim 65 | self.hidden_dims = hidden_dims 66 | self.output_dim = output_dim 67 | self.device = torch.device(device) 68 | 69 | self.activation = Swish() 70 | self.rnn_layer = nn.GRU( 71 | input_size=input_dim, 72 | hidden_size=hidden_dims[0], 73 | num_layers=rnn_num_layers, 74 | batch_first=True 75 | ) 76 | module_list = [] 77 | self.input_layer = ResBlock(input_dim, hidden_dims[0], dropout=dropout_rate, with_residual=False) 78 | dims = list(hidden_dims) 79 | for in_dim, out_dim in zip(dims[:-1], dims[1:]): 80 | module_list.append(ResBlock(in_dim, out_dim, dropout=dropout_rate)) 81 | self.backbones = nn.ModuleList(module_list) 82 | self.merge_layer = nn.Linear(dims[0] + dims[-1], hidden_dims[0]) 83 | self.output_layer = nn.Linear(hidden_dims[-1], output_dim) 84 | 85 | self.to(self.device) 86 | 87 | def forward(self, input, h_state=None): 88 | batch_size, num_timesteps, _ = input.shape 89 | input = torch.as_tensor(input, dtype=torch.float32).to(self.device) 90 | rnn_output, h_state = self.rnn_layer(input, h_state) 91 | rnn_output = rnn_output.reshape(-1, self.hidden_dims[0]) 92 | input = input.view(-1, self.input_dim) 93 | output = self.input_layer(input) 94 | output = torch.cat([output, rnn_output], dim=-1) 95 | output = self.activation(self.merge_layer(output)) 96 | for layer in self.backbones: 97 | output = layer(output) 98 | output = self.output_layer(output) 99 | output = output.view(batch_size, num_timesteps, -1) 100 | return output, h_state 101 | 102 | 103 | if __name__ == "__main__": 104 | model = RNNModel(14, 12) 105 | x = torch.randn(64, 20, 14) 106 | y, _ = model(x) 107 | print(y.shape) -------------------------------------------------------------------------------- /offlinerl/outside_utils/nets/vae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional as F 4 | from typing import Dict, List, Union, Tuple, Optional 5 | 6 | 7 | # Vanilla Variational Auto-Encoder 8 | class VAE(nn.Module): 9 | def __init__( 10 | self, 11 | input_dim: int, 12 | output_dim: int, 13 | hidden_dim: int, 14 | latent_dim: int, 15 | max_action: Union[int, float], 16 | device: str = "cpu" 
17 | ) -> None: 18 | super(VAE, self).__init__() 19 | self.e1 = nn.Linear(input_dim + output_dim, hidden_dim) 20 | self.e2 = nn.Linear(hidden_dim, hidden_dim) 21 | 22 | self.mean = nn.Linear(hidden_dim, latent_dim) 23 | self.log_std = nn.Linear(hidden_dim, latent_dim) 24 | 25 | self.d1 = nn.Linear(input_dim + latent_dim, hidden_dim) 26 | self.d2 = nn.Linear(hidden_dim, hidden_dim) 27 | self.d3 = nn.Linear(hidden_dim, output_dim) 28 | 29 | self.max_action = max_action 30 | self.latent_dim = latent_dim 31 | self.device = torch.device(device) 32 | 33 | self.to(device=self.device) 34 | 35 | 36 | def forward( 37 | self, 38 | obs: torch.Tensor, 39 | action: torch.Tensor 40 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: 41 | z = F.relu(self.e1(torch.cat([obs, action], 1))) 42 | z = F.relu(self.e2(z)) 43 | 44 | mean = self.mean(z) 45 | # Clamped for numerical stability 46 | log_std = self.log_std(z).clamp(-4, 15) 47 | std = torch.exp(log_std) 48 | z = mean + std * torch.randn_like(std) 49 | 50 | u = self.decode(obs, z) 51 | 52 | return u, mean, std 53 | 54 | def decode(self, obs: torch.Tensor, z: Optional[torch.Tensor] = None) -> torch.Tensor: 55 | # When sampling from the VAE, the latent vector is clipped to [-0.5, 0.5] 56 | if z is None: 57 | z = torch.randn((obs.shape[0], self.latent_dim)).to(self.device).clamp(-0.5,0.5) 58 | 59 | a = F.relu(self.d1(torch.cat([obs, z], 1))) 60 | a = F.relu(self.d2(a)) 61 | return self.max_action * torch.tanh(self.d3(a)) -------------------------------------------------------------------------------- /offlinerl/outside_utils/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/outside_utils/utils/__init__.py -------------------------------------------------------------------------------- /offlinerl/outside_utils/utils/scaler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os.path as path 3 | import torch 4 | 5 | 6 | class StandardScaler(object): 7 | def __init__(self, mu=None, std=None): 8 | self.mu = mu 9 | self.std = std 10 | 11 | def fit(self, data): 12 | """Runs two ops, one for assigning the mean of the data to the internal mean, and 13 | another for assigning the standard deviation of the data to the internal standard deviation. 14 | This function must be called within a 'with .as_default()' block. 15 | 16 | Arguments: 17 | data (np.ndarray): A numpy array containing the input 18 | 19 | Returns: None. 20 | """ 21 | self.mu = np.mean(data, axis=0, keepdims=True) 22 | self.std = np.std(data, axis=0, keepdims=True) 23 | self.std[self.std < 1e-12] = 1.0 24 | 25 | def transform(self, data): 26 | """Transforms the input matrix data using the parameters of this scaler. 27 | 28 | Arguments: 29 | data (np.array): A numpy array containing the points to be transformed. 30 | 31 | Returns: (np.array) The transformed dataset. 32 | """ 33 | if isinstance(data, torch.Tensor): 34 | data = data.cpu().numpy() 35 | return (data - self.mu) / self.std 36 | 37 | def inverse_transform(self, data): 38 | """Undoes the transformation performed by this scaler. 39 | 40 | Arguments: 41 | data (np.array): A numpy array containing the points to be transformed. 42 | 43 | Returns: (np.array) The transformed dataset. 
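        Example (illustrative round trip with assumed toy data):
            >>> scaler = StandardScaler()
            >>> scaler.fit(np.array([[0.0, 2.0], [2.0, 6.0]]))   # mu=[1, 4], std=[1, 2]
            >>> x = scaler.transform(np.array([[1.0, 4.0]]))     # -> [[0., 0.]]
            >>> scaler.inverse_transform(x)                      # -> [[1., 4.]]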
44 | """ 45 | if isinstance(data, torch.Tensor): 46 | data = data.cpu().numpy() 47 | return self.std * data + self.mu 48 | 49 | def save_scaler(self, save_path, surfix=""): 50 | mu_path = path.join(save_path, surfix+"mu.npy") 51 | std_path = path.join(save_path, surfix+"std.npy") 52 | np.save(mu_path, self.mu) 53 | np.save(std_path, self.std) 54 | 55 | def load_scaler(self, load_path, surfix=""): 56 | mu_path = path.join(load_path, surfix+"mu.npy") 57 | std_path = path.join(load_path, surfix+"std.npy") 58 | self.mu = np.load(mu_path) 59 | self.std = np.load(std_path) 60 | 61 | def transform_tensor(self, data: torch.Tensor): 62 | device = data.device 63 | data = self.transform(data.cpu().numpy()) 64 | data = torch.tensor(data, device=device) 65 | return data 66 | 67 | def inverse_transform_to_array(self, data: torch.Tensor): 68 | device = data.device 69 | data = self.inverse_transform(data.cpu().numpy()) 70 | # data = torch.tensor(data, device=device) 71 | return data -------------------------------------------------------------------------------- /offlinerl/outside_utils/utils/termination_fns.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def obs_unnormalization(termination_fn, obs_mean, obs_std): 4 | def thunk(obs, act, next_obs): 5 | obs = obs*obs_std + obs_mean 6 | next_obs = next_obs*obs_std + obs_mean 7 | return termination_fn(obs, act, next_obs) 8 | return thunk 9 | 10 | def termination_fn_halfcheetah(obs, act, next_obs): 11 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 12 | 13 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) 14 | done = ~not_done 15 | done = done[:, None] 16 | return done 17 | 18 | def termination_fn_hopper(obs, act, next_obs): 19 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 20 | 21 | height = next_obs[:, 0] 22 | angle = next_obs[:, 1] 23 | not_done = np.isfinite(next_obs).all(axis=-1) \ 24 | * np.abs(next_obs[:,1:] < 100).all(axis=-1) \ 25 | * (height > .7) \ 26 | * (np.abs(angle) < .2) 27 | 28 | done = ~not_done 29 | done = done[:,None] 30 | return done 31 | 32 | def termination_fn_halfcheetahveljump(obs, act, next_obs): 33 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 34 | 35 | done = np.array([False]).repeat(len(obs)) 36 | done = done[:,None] 37 | return done 38 | 39 | def termination_fn_antangle(obs, act, next_obs): 40 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 41 | 42 | x = next_obs[:, 0] 43 | not_done = np.isfinite(next_obs).all(axis=-1) \ 44 | * (x >= 0.2) \ 45 | * (x <= 1.0) 46 | 47 | done = ~not_done 48 | done = done[:,None] 49 | return done 50 | 51 | def termination_fn_ant(obs, act, next_obs): 52 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 53 | 54 | x = next_obs[:, 0] 55 | not_done = np.isfinite(next_obs).all(axis=-1) \ 56 | * (x >= 0.2) \ 57 | * (x <= 1.0) 58 | 59 | done = ~not_done 60 | done = done[:,None] 61 | return done 62 | 63 | def termination_fn_walker2d(obs, act, next_obs): 64 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 65 | 66 | height = next_obs[:, 0] 67 | angle = next_obs[:, 1] 68 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) \ 69 | * (height > 0.8) \ 70 | * (height < 2.0) \ 71 | * (angle > -1.0) \ 72 | * (angle < 1.0) 73 | done = ~not_done 74 | done = done[:,None] 75 | return done 76 | 77 | def termination_fn_point2denv(obs, act, 
next_obs): 78 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 79 | 80 | done = np.array([False]).repeat(len(obs)) 81 | done = done[:,None] 82 | return done 83 | 84 | def termination_fn_point2dwallenv(obs, act, next_obs): 85 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 86 | 87 | done = np.array([False]).repeat(len(obs)) 88 | done = done[:,None] 89 | return done 90 | 91 | def termination_fn_pendulum(obs, act, next_obs): 92 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 93 | 94 | done = np.zeros((len(obs), 1)) 95 | return done 96 | 97 | def termination_fn_humanoid(obs, act, next_obs): 98 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 99 | 100 | z = next_obs[:,0] 101 | done = (z < 1.0) + (z > 2.0) 102 | 103 | done = done[:,None] 104 | return done 105 | 106 | def termination_fn_pen(obs, act, next_obs): 107 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 108 | 109 | obj_pos = next_obs[:, 24:27] 110 | done = obj_pos[:, 2] < 0.075 111 | 112 | done = done[:,None] 113 | return done 114 | 115 | def terminaltion_fn_door(obs, act, next_obs): 116 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 117 | 118 | done = np.array([False] * obs.shape[0]) 119 | 120 | done = done[:, None] 121 | return done 122 | 123 | def get_termination_fn(task): 124 | if 'halfcheetahvel' in task: 125 | return termination_fn_halfcheetahveljump 126 | elif 'halfcheetah' in task: 127 | return termination_fn_halfcheetah 128 | elif 'hopper' in task: 129 | return termination_fn_hopper 130 | elif 'antangle' in task: 131 | return termination_fn_antangle 132 | elif 'ant' in task: 133 | return termination_fn_ant 134 | elif 'walker2d' in task: 135 | return termination_fn_walker2d 136 | elif 'point2denv' in task: 137 | return termination_fn_point2denv 138 | elif 'point2dwallenv' in task: 139 | return termination_fn_point2dwallenv 140 | elif 'pendulum' in task: 141 | return termination_fn_pendulum 142 | elif 'humanoid' in task: 143 | return termination_fn_humanoid 144 | elif 'pen' in task: 145 | return termination_fn_pen 146 | elif 'door' in task: 147 | return terminaltion_fn_door 148 | else: 149 | raise np.zeros 150 | -------------------------------------------------------------------------------- /offlinerl/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/utils/__init__.py -------------------------------------------------------------------------------- /offlinerl/utils/config.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | 4 | del_attr = ["function", "module"] 5 | 6 | def parse_config(cfg_module): 7 | args = [ i for i in dir(cfg_module) if not i.startswith("__")] 8 | 9 | config = OrderedDict() 10 | for arg in args: 11 | k = arg 12 | v = getattr(cfg_module, arg) 13 | if type(v).__name__ in del_attr and k != "device": 14 | continue 15 | else: 16 | config[k] = v 17 | 18 | 19 | return config 20 | -------------------------------------------------------------------------------- /offlinerl/utils/env.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from typing import Tuple 4 | 5 | def create_terminal_function(): 6 | return terminal_function 7 | 8 | def get_env(task : str) -> gym.Env: 9 | try: 10 | if 
task in ['Pipeline', 'Simglucose', 'RocketRecovery', 'RandomFrictionHopper', 'DMSD', 'Fusion', 'Salespromotion', 'SafetyHalfCheetah']: 11 | import neorl2 12 | import gymnasium as gym 13 | env = gym.make(task) 14 | elif task.startswith("HalfCheetah-v3"): 15 | import neorl 16 | env = neorl.make("HalfCheetah-v3") 17 | elif task.startswith("Hopper-v3"): 18 | import neorl 19 | env = neorl.make("Hopper-v3") 20 | elif task.startswith("Walker2d-v3"): 21 | import neorl 22 | env = neorl.make("Walker2d-v3") 23 | elif task.startswith('d4rl'): 24 | import gym 25 | import d4rl 26 | from d4rl import gym_mujoco 27 | env = gym.make(task[5:]) 28 | # hack to add terminal function 29 | if 'hopper' in task: 30 | def terminal_function(data : dict): 31 | obs = data["obs"] 32 | action = data["action"] 33 | obs_next = data["next_obs"] 34 | 35 | singel_done = False 36 | if len(obs.shape) == 1: 37 | singel_done = True 38 | obs = obs.reshape(1, -1) 39 | if len(action.shape) == 1: 40 | action = action.reshape(1, -1) 41 | if len(obs_next.shape) == 1: 42 | obs_next = obs_next.reshape(1, -1) 43 | 44 | if isinstance(obs, np.ndarray): 45 | array_type = np 46 | else: 47 | import torch 48 | array_type = torch 49 | 50 | z = obs_next[:, 0:1] 51 | angle = obs_next[:, 1:2] 52 | states = obs_next[:, 1:] 53 | 54 | min_state, max_state = (-100.0, 100.0) 55 | min_z, max_z = (0.7, float('inf')) 56 | min_angle, max_angle = (-0.2, 0.2) 57 | 58 | healthy_state = array_type.all(array_type.logical_and(min_state < states, states < max_state), axis=-1, keepdim=True) 59 | healthy_z = array_type.logical_and(min_z < z, z < max_z) 60 | healthy_angle = array_type.logical_and(min_angle < angle, angle < max_angle) 61 | 62 | is_healthy = array_type.logical_and(array_type.logical_and(healthy_state, healthy_z), healthy_angle) 63 | 64 | done = array_type.logical_not(is_healthy) 65 | 66 | if singel_done: 67 | done = done 68 | else: 69 | done = done.reshape(-1, 1) 70 | return done 71 | 72 | # env.get_done_func = lambda: terminal_function 73 | env.get_done_func = create_terminal_function 74 | elif 'walker' in task: 75 | def terminal_function(data : dict): 76 | 77 | obs = data["obs"] 78 | action = data["action"] 79 | obs_next = data["next_obs"] 80 | 81 | singel_done = False 82 | if len(obs.shape) == 1: 83 | singel_done = True 84 | obs = obs.reshape(1, -1) 85 | if len(action.shape) == 1: 86 | action = action.reshape(1, -1) 87 | if len(obs_next.shape) == 1: 88 | obs_next = obs_next.reshape(1, -1) 89 | 90 | if isinstance(obs, np.ndarray): 91 | array_type = np 92 | else: 93 | import torch 94 | array_type = torch 95 | 96 | min_z, max_z = (0.8, 2.0) 97 | min_angle, max_angle = (-1.0, 1.0) 98 | min_state, max_state = (-100.0, 100.0) 99 | 100 | z = obs_next[:, 0:1] 101 | angle = obs_next[:, 1:2] 102 | state = obs_next[:, 2:] 103 | 104 | healthy_state = array_type.all(array_type.logical_and(min_state < state, state < max_state), axis=-1, keepdim=True) 105 | healthy_z = array_type.logical_and(min_z < z, z < max_z) 106 | healthy_angle = array_type.logical_and(min_angle < angle, angle < max_angle) 107 | is_healthy = array_type.logical_and(array_type.logical_and(healthy_state, healthy_z), healthy_angle) 108 | done = array_type.logical_not(is_healthy) 109 | 110 | if singel_done: 111 | done = done 112 | else: 113 | done = done.reshape(-1, 1) 114 | 115 | return done 116 | 117 | # env.get_done_func = lambda: terminal_function 118 | env.get_done_func = create_terminal_function 119 | else: 120 | task_name = task.strip().split("-")[0] 121 | env = neorl.make(task_name) 122 | 
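        # The d4rl hopper/walker branches above attach a batched terminal
        # function so that model-based algorithms can recompute `done` for
        # imagined transitions. A hedged sketch of the expected call (array
        # shapes are illustrative assumptions):
        #   done = terminal_function({"obs": obs,             # [N, obs_dim]
        #                             "action": act,          # [N, act_dim]
        #                             "next_obs": next_obs})  # -> bool array [N, 1]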
except: 123 | raise NotImplementedError 124 | 125 | return env 126 | 127 | def get_env_shape(task : str) -> Tuple[int, int]: 128 | env = get_env(task) 129 | obs_dim = env.observation_space.shape 130 | action_space = env.action_space 131 | 132 | if len(obs_dim) == 1: 133 | obs_dim = obs_dim[0] 134 | 135 | if hasattr(env.action_space, 'n'): 136 | act_dim = env.action_space.n 137 | else: 138 | act_dim = action_space.shape[0] 139 | 140 | return obs_dim, act_dim 141 | 142 | def get_env_obs_act_spaces(task : str): 143 | env = get_env(task) 144 | obs_space = env.observation_space 145 | act_space = env.action_space 146 | return obs_space, act_space 147 | 148 | def get_env_action_range(task : str) -> Tuple[float, float]: 149 | env = get_env(task) 150 | act_max = float(env.action_space.high[0]) 151 | act_min = float(env.action_space.low[0]) 152 | 153 | return act_max, act_min 154 | 155 | def get_env_state_range(task : str) -> Tuple[float, float]: 156 | env = get_env(task) 157 | obs_max = float(env.observation_space.high[0]) 158 | obs_min = float(env.observation_space.low[0]) 159 | 160 | return obs_max, obs_min -------------------------------------------------------------------------------- /offlinerl/utils/exp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | import random 4 | 5 | 6 | import torch 7 | import numpy as np 8 | from aim import Run 9 | from loguru import logger 10 | 11 | from offlinerl.utils.logger import log_path 12 | 13 | 14 | def setup_seed(seed=1024): 15 | torch.manual_seed(seed) 16 | torch.cuda.manual_seed_all(seed) 17 | np.random.seed(seed) 18 | random.seed(seed) 19 | torch.backends.cudnn.deterministic = True 20 | 21 | def select_free_cuda(): 22 | # 获取可用的 GPU 数量 23 | num_gpus = torch.cuda.device_count() 24 | 25 | if num_gpus == 0: 26 | print("No GPU available.") 27 | return None 28 | 29 | # 遍历所有 GPU,选择利用率最低的 GPU 30 | min_memory_usage = float('inf') 31 | selected_gpu_id = None 32 | 33 | for gpu_id in range(num_gpus): 34 | torch.cuda.set_device(gpu_id) 35 | gpu_memory_usage = torch.cuda.max_memory_allocated() / 1024**3 # in GB 36 | # 选择利用率最低的 GPU 37 | if gpu_memory_usage < min_memory_usage: 38 | min_memory_usage = gpu_memory_usage 39 | selected_gpu_id = gpu_id 40 | 41 | return selected_gpu_id 42 | 43 | def set_free_device_fn(): 44 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu' 45 | 46 | return device 47 | 48 | 49 | def init_exp_run(repo=None, experiment_name=None, flush_frequency=1): 50 | if repo is None: 51 | repo = os.path.join(log_path(),"./.aim") 52 | if not os.path.exists(repo): 53 | print(f'=====repo:{repo}') 54 | logger.info('{} dir is not exist, create {}',repo, repo) 55 | os.system(str("cd " + os.path.join(repo,"../") + "&& aim init")) 56 | else: 57 | repo = os.path.join(repo,"./.aim") 58 | if not os.path.exists(repo): 59 | print(f'=====repo:{repo}') 60 | logger.info('{} dir is not exist, create {}',repo, repo) 61 | os.system(str("cd " + os.path.join(repo,"../") + "&& aim init")) 62 | run = Run( 63 | repo=repo, 64 | experiment=experiment_name 65 | ) 66 | 67 | return run -------------------------------------------------------------------------------- /offlinerl/utils/flexible_replay_pool.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import pickle 3 | 4 | import numpy as np 5 | 6 | from .replay_pool import ReplayPool 7 | 8 | 9 | class FlexibleReplayPool(ReplayPool): 10 | def __init__(self, max_size, fields_attrs, 
obs_filter=False, modify_rew=False): 11 | super(FlexibleReplayPool, self).__init__() 12 | 13 | max_size = int(max_size) 14 | self._max_size = max_size 15 | 16 | self.fields = {} 17 | self.fields_attrs = {} 18 | 19 | self.add_fields(fields_attrs) 20 | 21 | self.obs_filter = obs_filter 22 | self.modify_rew = modify_rew 23 | 24 | self._pointer = 0 25 | self._size = 0 26 | self._samples_since_save = 0 27 | 28 | @property 29 | def size(self): 30 | return self._size 31 | 32 | @property 33 | def field_names(self): 34 | return list(self.fields.keys()) 35 | 36 | def add_fields(self, fields_attrs): 37 | self.fields_attrs.update(fields_attrs) 38 | 39 | for field_name, field_attrs in fields_attrs.items(): 40 | field_shape = (self._max_size, *field_attrs['shape']) 41 | initializer = field_attrs.get('initializer', np.zeros) 42 | self.fields[field_name] = initializer( 43 | field_shape, dtype=field_attrs['dtype']) 44 | 45 | def _advance(self, count=1): 46 | self._pointer = (self._pointer + count) % self._max_size 47 | self._size = min(self._size + count, self._max_size) 48 | self._samples_since_save += count 49 | 50 | def add_sample(self, sample): 51 | samples = { 52 | key: value[None, ...] 53 | for key, value in sample.items() 54 | } 55 | self.add_samples(samples) 56 | 57 | def add_samples(self, samples): 58 | # if 'infos' not in samples: 59 | # samples['infos'] = {} 60 | field_names = list(samples.keys()) 61 | num_samples = samples[field_names[0]].shape[0] 62 | index = np.arange( 63 | self._pointer, self._pointer + num_samples) % self._max_size 64 | for field_name in self.field_names: 65 | # print(field_name) 66 | default_value = ( 67 | self.fields_attrs[field_name].get('default_value', 0.0)) 68 | values = samples.get(field_name, default_value) 69 | if field_name not in samples.keys() and 'infos' in samples and field_name in samples['infos'][0].keys(): 70 | values = np.expand_dims(np.array([samples['infos'][i].get(field_name, default_value) for i in range(num_samples)]), axis=1) 71 | try: 72 | assert values.shape[0] == num_samples, f'value shape: {values.shape[0]}, expected: {num_samples}' 73 | if isinstance(values[0], dict): 74 | values = np.stack([np.concatenate([ 75 | value[key] 76 | for key in value.keys() 77 | ], axis=-1) for value in values]) 78 | self.fields[field_name][index] = values 79 | except Exception as e: 80 | import traceback 81 | traceback.print_exc(limit=10) 82 | print('[ DEBUG ] errors occurs: {}'.format(e)) 83 | 84 | import pdb; pdb.set_trace() 85 | self._advance(num_samples) 86 | 87 | def restore_samples(self, samples): 88 | num_samples = samples[list(samples.keys())[0]].shape[0] 89 | index = np.arange( 90 | 0, num_samples) % self._max_size 91 | for key, values in samples.items(): 92 | assert key in self.field_names 93 | self.fields[key][index] = values 94 | 95 | def random_indices(self, batch_size): 96 | if self._size == 0: return np.arange(0, 0) 97 | return np.random.randint(0, self._size, batch_size) 98 | 99 | def random_batch(self, batch_size, field_name_filter=None, **kwargs): 100 | random_indices = self.random_indices(batch_size) 101 | return self.batch_by_indices( 102 | random_indices, field_name_filter=field_name_filter, **kwargs) 103 | 104 | def last_n_batch(self, last_n, field_name_filter=None, **kwargs): 105 | last_n_indices = np.arange( 106 | self._pointer - min(self.size, last_n), self._pointer 107 | ) % self._max_size 108 | return self.batch_by_indices( 109 | last_n_indices, field_name_filter=field_name_filter, **kwargs) 110 | 111 | def filter_fields(self, field_names, 
field_name_filter): 112 | if isinstance(field_name_filter, str): 113 | field_name_filter = [field_name_filter] 114 | 115 | if isinstance(field_name_filter, (list, tuple)): 116 | field_name_list = field_name_filter 117 | 118 | def filter_fn(field_name): 119 | return field_name in field_name_list 120 | 121 | else: 122 | filter_fn = field_name_filter 123 | 124 | filtered_field_names = [ 125 | field_name for field_name in field_names 126 | if filter_fn(field_name) 127 | ] 128 | 129 | return filtered_field_names 130 | 131 | def batch_by_indices(self, indices, field_name_filter=None): 132 | if np.any(indices % self._max_size > self.size): 133 | raise ValueError( 134 | "Tried to retrieve batch with indices greater than current" 135 | " size") 136 | 137 | field_names = self.field_names 138 | if field_name_filter is not None: 139 | field_names = self.filter_fields( 140 | field_names, field_name_filter) 141 | 142 | return { 143 | field_name: self.fields[field_name][indices] 144 | for field_name in field_names 145 | } 146 | 147 | def save_latest_experience(self, pickle_path): 148 | latest_samples = self.last_n_batch(self._samples_since_save) 149 | 150 | with gzip.open(pickle_path, 'wb') as f: 151 | pickle.dump(latest_samples, f) 152 | 153 | self._samples_since_save = 0 154 | 155 | def load_experience(self, experience_path): 156 | with gzip.open(experience_path, 'rb') as f: 157 | latest_samples = pickle.load(f) 158 | 159 | key = list(latest_samples.keys())[0] 160 | num_samples = latest_samples[key].shape[0] 161 | for field_name, data in latest_samples.items(): 162 | assert data.shape[0] == num_samples, data.shape 163 | 164 | self.add_samples(latest_samples) 165 | self._samples_since_save = 0 166 | 167 | def return_all_samples(self): 168 | return { 169 | field_name: self.fields[field_name][:self.size] 170 | for field_name in self.field_names 171 | } 172 | 173 | def __getstate__(self): 174 | state = self.__dict__.copy() 175 | state['fields'] = { 176 | field_name: self.fields[field_name][:self.size] 177 | for field_name in self.field_names 178 | } 179 | 180 | return state 181 | 182 | def __setstate__(self, state): 183 | if state['_size'] < state['_max_size']: 184 | pad_size = state['_max_size'] - state['_size'] 185 | for field_name in state['fields'].keys(): 186 | field_shape = state['fields_attrs'][field_name]['shape'] 187 | state['fields'][field_name] = np.concatenate(( 188 | state['fields'][field_name], 189 | np.zeros((pad_size, *field_shape)) 190 | ), axis=0) 191 | 192 | self.__dict__ = state 193 | -------------------------------------------------------------------------------- /offlinerl/utils/function.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.functional import F 3 | 4 | def soft_clamp(x : torch.Tensor, _min=None, _max=None): 5 | # clamp tensor values while mataining the gradient 6 | if _max is not None: 7 | x = _max - F.softplus(_max - x) 8 | if _min is not None: 9 | x = _min + F.softplus(x - _min) 10 | return x -------------------------------------------------------------------------------- /offlinerl/utils/io.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import urllib 5 | import urllib.request 6 | from tqdm import tqdm 7 | 8 | def read_json(file_path): 9 | with open(file_path, 'r') as f: 10 | data = json.load(f) 11 | 12 | return data 13 | 14 | 15 | def load_pkl(file_path): 16 | assert os.path.exists(file_path) 17 | with 
open(file_path, 'rb') as handle: 18 | data = pickle.load(handle) 19 | 20 | return data 21 | 22 | def save_pkl(data, file_path): 23 | with open(file_path, 'wb') as handle: 24 | pickle.dump(data, handle) 25 | 26 | 27 | def del_dir(dir_path): 28 | os.removedirs(dir_path) 29 | 30 | def create_dir(dir_path, cover=False): 31 | if cover or not os.path.exists(dir_path): 32 | if cover and os.path.exists(dir_path): 33 | os.removedirs(dir_path) 34 | os.makedirs(dir_path) 35 | 36 | 37 | def save_video(video_array, video_save_path): 38 | import cv2 39 | fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v') 40 | output_movie = cv2.VideoWriter(video_save_path, fourcc, 10, (640, 360)) 41 | 42 | for frame in video_array: 43 | output_movie.write(frame) 44 | 45 | out.release() 46 | cv2.destroyAllWindows() 47 | 48 | def download_helper(url, filename): 49 | 'Download file from given url. Modified from `torchvision.dataset.utils`' 50 | def gen_bar_updater(): 51 | pbar = tqdm(total=None) 52 | 53 | def bar_update(count, block_size, total_size): 54 | if pbar.total is None and total_size: 55 | pbar.total = total_size 56 | progress_bytes = count * block_size 57 | pbar.update(progress_bytes - pbar.n) 58 | 59 | return bar_update 60 | 61 | try: 62 | print('Downloading ' + url + ' to ' + filename) 63 | urllib.request.urlretrieve( 64 | url, filename, 65 | reporthook=gen_bar_updater() 66 | ) 67 | 68 | return True 69 | except (urllib.error.URLError, IOError) as e: 70 | if url[:5] == 'https': 71 | url = url.replace('https:', 'http:') 72 | print('Failed download. Trying https -> http instead.' 73 | ' Downloading ' + url + ' to ' + filename) 74 | urllib.request.urlretrieve( 75 | url, filename, 76 | reporthook=gen_bar_updater() 77 | ) 78 | 79 | return True 80 | else: 81 | raise e -------------------------------------------------------------------------------- /offlinerl/utils/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | import aim 4 | 5 | from offlinerl.utils.io import create_dir 6 | 7 | def log_path(): 8 | import offlinerl 9 | log_path = os.path.abspath(os.path.join(offlinerl.__file__,"../../","offlinerl_tmp")) 10 | 11 | create_dir(log_path) 12 | 13 | return log_path 14 | 15 | """ 16 | class exp_logger(): 17 | def __init__(self, experiment_name=None,flush_frequency=1): 18 | print("experiment_name:",experiment_name) 19 | self.aim_logger = aim.Session(experiment=experiment_name, flush_frequency=flush_frequency) 20 | 21 | def log_hparams(self, hparams_dict): 22 | self.aim_logger.set_params(hparams_dict, name='hparams') 23 | """ -------------------------------------------------------------------------------- /offlinerl/utils/net/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/utils/net/__init__.py -------------------------------------------------------------------------------- /offlinerl/utils/net/bcq_net.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | from offlinerl.utils.net.common import BasePolicy 8 | 9 | 10 | # Used for Atari 11 | class Conv_Q(nn.Module): 12 | def __init__(self, frames, num_actions): 13 | super(Conv_Q, self).__init__() 14 | self.c1 = nn.Conv2d(frames, 32, kernel_size=8, stride=4) 15 | self.c2 = nn.Conv2d(32, 64, 
kernel_size=4, stride=2) 16 | self.c3 = nn.Conv2d(64, 64, kernel_size=3, stride=1) 17 | 18 | self.q1 = nn.Linear(3136, 512) 19 | self.q2 = nn.Linear(512, 16) 20 | self.q3 = nn.Linear(16, num_actions) 21 | 22 | self.i1 = nn.Linear(3136, 512) 23 | self.i2 = nn.Linear(512, 16) 24 | self.i3 = nn.Linear(16, num_actions) 25 | 26 | 27 | def forward(self, state): 28 | c = F.relu(self.c1(state)) 29 | c = F.relu(self.c2(c)) 30 | c = F.relu(self.c3(c)) 31 | 32 | q = F.relu(self.q1(c.reshape(-1, 3136))) 33 | q = F.relu(self.q2(q)) 34 | q = self.q3(q) 35 | 36 | i = F.relu(self.i1(c.reshape(-1, 3136))) 37 | i = F.relu(self.i2(i)) 38 | i = self.i3(i) 39 | return q, F.log_softmax(i, dim=1), i 40 | 41 | def encode(self, state): 42 | with torch.no_grad(): 43 | c = F.relu(self.c1(state)) 44 | c = F.relu(self.c2(c)) 45 | c = F.relu(self.c3(c)) 46 | 47 | q = F.relu(self.q1(c.reshape(-1, 3136))) 48 | q = F.relu(self.q2(q)) 49 | 50 | i = F.relu(self.i1(c.reshape(-1, 3136))) 51 | i = F.relu(self.i2(i)) 52 | return i 53 | 54 | 55 | 56 | # Used for Box2D / Toy problems 57 | class FC_Q(nn.Module, BasePolicy): 58 | def __init__(self, state_dim, num_actions): 59 | super(FC_Q, self).__init__() 60 | self.q1 = nn.Linear(state_dim, 256) 61 | self.q2 = nn.Linear(256, 256) 62 | self.q3 = nn.Linear(256, num_actions) 63 | 64 | self.i1 = nn.Linear(state_dim, 256) 65 | self.i2 = nn.Linear(256, 256) 66 | self.i3 = nn.Linear(256, num_actions) 67 | 68 | 69 | def forward(self, state): 70 | q = F.relu(self.q1(state)) 71 | q = F.relu(self.q2(q)) 72 | 73 | i = F.relu(self.i1(state)) 74 | i = F.relu(self.i2(i)) 75 | i = F.relu(self.i3(i)) 76 | return self.q3(q), F.log_softmax(i, dim=1), i 77 | 78 | def policy_infer(self, obs): 79 | 80 | q, imt, i = self(obs) 81 | imt = imt.exp() 82 | imt = (imt/imt.max(1, keepdim=True)[0] > 0.3).float() 83 | # Use large negative number to mask actions from argmax 84 | 85 | return (imt * q + (1. 
- imt) * -1e8).argmax(1) 86 | 87 | -------------------------------------------------------------------------------- /offlinerl/utils/net/maple_actor.py: -------------------------------------------------------------------------------- 1 | import torch.nn 2 | import torch.nn as nn 3 | import numpy as np 4 | import torch.nn.functional as F 5 | from offlinerl.utils.net.common import miniblock 6 | 7 | 8 | class Maple_actor(nn.Module): 9 | def __init__(self, obs_dim, action_dim, deterministic=False, hidden_sizes=(16,), Guassain_hidden_sizes=(256,256), max_traj_len=5, LOG_MAX_STD=2, LOG_MIN_STD=-20, EPS=1e-8, lstm_hidden_unit=128): 10 | super(Maple_actor,self).__init__() 11 | self.obs_dim = obs_dim 12 | self.deterministic = deterministic 13 | self.act_dim = action_dim 14 | self.hidden_sizes = list(hidden_sizes).copy() 15 | self.Guassain_hidden_sizes = list(Guassain_hidden_sizes).copy() 16 | self.max_traj_len = max_traj_len 17 | self.LOG_MAX_STD = LOG_MAX_STD 18 | self.LOG_MIN_STD = LOG_MIN_STD 19 | self.EPS = EPS 20 | self.lstm_hidden_unit = lstm_hidden_unit 21 | self.mlp = miniblock(lstm_hidden_unit, hidden_sizes[0], None, relu=False) 22 | if len(hidden_sizes) >= 2: 23 | for i in range(1,len(hidden_sizes)): 24 | self.mlp += miniblock(hidden_sizes[i-1], hidden_sizes[i], None) 25 | self.mlp = nn.Sequential(*self.mlp) 26 | self.Guassain_input_dim = self.hidden_sizes[-1] + self.obs_dim 27 | self.Guassain_mlp = miniblock(self.Guassain_input_dim, self.Guassain_hidden_sizes[0], None) 28 | if len(Guassain_hidden_sizes)>=2: 29 | for i in range(1,len(Guassain_hidden_sizes)): 30 | self.Guassain_mlp += miniblock(Guassain_hidden_sizes[i-1], Guassain_hidden_sizes[i], None) 31 | self.Guassain_mlp = nn.Sequential(*self.Guassain_mlp) 32 | self.Guassain_mu_mlp = [nn.Linear(self.Guassain_hidden_sizes[-1], action_dim)] 33 | self.Guassain_logstd_mlp = [nn.Linear(self.Guassain_hidden_sizes[-1], action_dim)] 34 | self.Guassain_mu_mlp = nn.Sequential(*self.Guassain_mu_mlp) 35 | self.Guassain_logstd_mlp = nn.Sequential(*self.Guassain_logstd_mlp) 36 | def gaussian_likelihood(self,x, mu, log_std): 37 | pre_sum = -0.5 * (((x - mu) / (torch.exp(log_std) + self.EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi)) 38 | return torch.sum(pre_sum, dim=-1) 39 | 40 | def forward(self, hidden_policy, obs): 41 | policy_out = self.mlp(hidden_policy) 42 | policy_z = torch.cat([policy_out, obs], dim=-1) 43 | out = self.Guassain_mlp(policy_z) 44 | mu = self.Guassain_mu_mlp(out) 45 | log_std = self.Guassain_logstd_mlp(out) 46 | log_std = torch.clip(log_std, self.LOG_MIN_STD, self.LOG_MAX_STD) 47 | std = torch.exp(log_std) 48 | acts = torch.distributions.Normal(torch.zeros_like(mu),torch.ones_like(std)).sample()*std + mu 49 | log_p_acts = self.gaussian_likelihood(acts, mu, log_std) 50 | mu, acts, log_p_acts = self.apply_squashing_func(mu, acts, log_p_acts) 51 | return mu, acts, log_p_acts, std 52 | 53 | def apply_squashing_func(self, mu, pi, logp_pi): 54 | logp_pi -= torch.sum(2 * (np.log(2) - pi - F.softplus(-2 * pi)), dim=-1) 55 | # Squash those unbounded actions! 
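        # The logp_pi adjustment above is the tanh change-of-variables correction:
        # 2*(log 2 - a - softplus(-2a)) is a numerically stable rewrite of
        # log(1 - tanh(a)^2), so the log-density stays consistent after squashing.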
56 | mu = torch.tanh(mu) 57 | pi = torch.tanh(pi) 58 | return mu, pi, logp_pi 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /offlinerl/utils/net/mlas.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from offlinerl.utils.net.common import BasePolicy 6 | 7 | class VAE(nn.Module, BasePolicy): 8 | def __init__(self, 9 | state_dim, 10 | action_dim, 11 | latent_dim, 12 | max_action, 13 | hidden_size=750): 14 | super(VAE, self).__init__() 15 | 16 | self.e1 = nn.Linear(state_dim + action_dim, hidden_size) 17 | self.e2 = nn.Linear(hidden_size, hidden_size) 18 | 19 | self.mean = nn.Linear(hidden_size, latent_dim) 20 | self.log_std = nn.Linear(hidden_size, latent_dim) 21 | 22 | self.d1 = nn.Linear(state_dim + latent_dim, hidden_size) 23 | self.d2 = nn.Linear(hidden_size, hidden_size) 24 | self.d3 = nn.Linear(hidden_size, action_dim) 25 | 26 | self.max_action = max_action 27 | self.latent_dim = latent_dim 28 | 29 | self._actor = None 30 | 31 | def forward(self, state, action): 32 | z = F.relu(self.e1(torch.cat([state, action], 1))) 33 | z = F.relu(self.e2(z)) 34 | 35 | mean = self.mean(z) 36 | # Clamped for numerical stability 37 | log_std = self.log_std(z).clamp(-4, 15) 38 | std = torch.exp(log_std) 39 | z = mean + std * torch.randn_like(std) 40 | 41 | u = self.decode(state, z) 42 | 43 | return u, mean, std 44 | 45 | def decode(self, state, z=None, clip=None, raw=False): 46 | # When sampling from the VAE, the latent vector is clipped to [-0.5, 0.5] 47 | if z is None: 48 | z = torch.randn((state.shape[0], self.latent_dim)).to(state.device) 49 | if clip is not None: 50 | z = z.clamp(-clip, clip) 51 | 52 | a = F.relu(self.d1(torch.cat([state, z], 1))) 53 | a = F.relu(self.d2(a)) 54 | a = self.d3(a) 55 | if raw: 56 | return a 57 | return self.max_action * torch.tanh(a) 58 | 59 | def policy_infer(self, obs): 60 | return self.decode(obs, z=self._actor(obs)[0]) 61 | 62 | class ActorPerturbation(nn.Module, BasePolicy): 63 | def __init__(self, state_dim, action_dim, latent_action_dim, max_action, max_latent_action=2, phi=0.05): 64 | super(ActorPerturbation, self).__init__() 65 | 66 | self.hidden_size = (400, 300, 400, 300) 67 | 68 | self.l1 = nn.Linear(state_dim, self.hidden_size[0]) 69 | self.l2 = nn.Linear(self.hidden_size[0], self.hidden_size[1]) 70 | self.l3 = nn.Linear(self.hidden_size[1], latent_action_dim) 71 | 72 | self.l4 = nn.Linear(state_dim + action_dim, self.hidden_size[2]) 73 | self.l5 = nn.Linear(self.hidden_size[2], self.hidden_size[3]) 74 | self.l6 = nn.Linear(self.hidden_size[3], action_dim) 75 | 76 | self.max_latent_action = max_latent_action 77 | self.max_action = max_action 78 | self.phi = phi 79 | 80 | self.vae = None 81 | 82 | def forward(self, state, decoder): 83 | a = F.relu(self.l1(state)) 84 | a = F.relu(self.l2(a)) 85 | latent_action = self.max_latent_action * torch.tanh(self.l3(a)) 86 | 87 | mid_action = decoder(state, z=latent_action) 88 | 89 | a = F.relu(self.l4(torch.cat([state, mid_action], 1))) 90 | a = F.relu(self.l5(a)) 91 | a = self.phi * torch.tanh(self.l6(a)) 92 | final_action = (a + mid_action).clamp(-self.max_action, self.max_action) 93 | return latent_action, mid_action, final_action 94 | 95 | def policy_infer(self, obs): 96 | 97 | return self(obs, self.vae.decode)[-1] -------------------------------------------------------------------------------- 
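A minimal inference-time sketch of how the latent-action pieces above fit together (dimensions and wiring are illustrative assumptions in the spirit of PLAS, not a training recipe):

import torch
from offlinerl.utils.net.mlas import VAE, ActorPerturbation

state_dim, action_dim, latent_dim = 17, 6, 12        # assumed dimensions
vae = VAE(state_dim, action_dim, latent_dim, max_action=1.0)
actor = ActorPerturbation(state_dim, action_dim, latent_dim, max_action=1.0)
actor.vae = vae                                      # policy_infer decodes through the VAE

state = torch.randn(32, state_dim)
latent, mid_action, final_action = actor(state, vae.decode)
action = actor.policy_infer(state)                   # equivalent to final_action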
/offlinerl/utils/net/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/utils/net/model/__init__.py -------------------------------------------------------------------------------- /offlinerl/utils/net/model/ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from offlinerl.utils.function import soft_clamp 5 | from offlinerl.utils.net.common import Swish 6 | 7 | class EnsembleLinear(torch.nn.Module): 8 | def __init__(self, in_features, out_features, ensemble_size=7): 9 | super().__init__() 10 | 11 | self.ensemble_size = ensemble_size 12 | 13 | self.register_parameter('weight', torch.nn.Parameter(torch.zeros(ensemble_size, in_features, out_features))) 14 | self.register_parameter('bias', torch.nn.Parameter(torch.zeros(ensemble_size, 1, out_features))) 15 | 16 | torch.nn.init.trunc_normal_(self.weight, std=1/(2*in_features**0.5)) 17 | 18 | self.register_parameter('saved_weight', torch.nn.Parameter(self.weight.detach().clone())) 19 | self.register_parameter('saved_bias', torch.nn.Parameter(self.bias.detach().clone())) 20 | 21 | self.select = list(range(0, self.ensemble_size)) 22 | 23 | def forward(self, x): 24 | weight = self.weight[self.select] 25 | bias = self.bias[self.select] 26 | 27 | if len(x.shape) == 2: 28 | x = torch.einsum('ij,bjk->bik', x, weight) 29 | else: 30 | x = torch.einsum('bij,bjk->bik', x, weight) 31 | 32 | x = x + bias 33 | 34 | return x 35 | 36 | def set_select(self, indexes): 37 | assert len(indexes) <= self.ensemble_size and max(indexes) < self.ensemble_size 38 | self.select = indexes 39 | self.weight.data[indexes] = self.saved_weight.data[indexes] 40 | self.bias.data[indexes] = self.saved_bias.data[indexes] 41 | 42 | def update_save(self, indexes): 43 | self.saved_weight.data[indexes] = self.weight.data[indexes] 44 | self.saved_bias.data[indexes] = self.bias.data[indexes] 45 | 46 | class EnsembleTransition(torch.nn.Module): 47 | def __init__(self, obs_dim, action_dim, hidden_features, hidden_layers, ensemble_size=7, mode='local', with_reward=True): 48 | super().__init__() 49 | self.obs_dim = obs_dim 50 | self.mode = mode 51 | self.with_reward = with_reward 52 | self.ensemble_size = ensemble_size 53 | 54 | self.activation = Swish() 55 | 56 | module_list = [] 57 | for i in range(hidden_layers): 58 | if i == 0: 59 | module_list.append(EnsembleLinear(obs_dim + action_dim, hidden_features, ensemble_size)) 60 | else: 61 | module_list.append(EnsembleLinear(hidden_features, hidden_features, ensemble_size)) 62 | self.backbones = torch.nn.ModuleList(module_list) 63 | 64 | self.output_layer = EnsembleLinear(hidden_features, 2 * (obs_dim + self.with_reward), ensemble_size) 65 | self.obs_mean = None 66 | self.obs_std = None 67 | self.register_parameter('max_logstd', torch.nn.Parameter(torch.ones(obs_dim + self.with_reward) * 1, requires_grad=True)) 68 | self.register_parameter('min_logstd', torch.nn.Parameter(torch.ones(obs_dim + self.with_reward) * -5, requires_grad=True)) 69 | 70 | def update_self(self, obs): 71 | self.obs_mean = obs.mean(dim=0) 72 | self.obs_std = obs.std(dim=0) 73 | 74 | def forward(self, obs_action): 75 | # Normalization for obs. If 'normalize', no residual. 
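        # Note: update_self(obs) must have been called for 'normalize' mode to
        # have statistics; if obs_mean is still None, the input passes through
        # to the backbone unchanged.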
76 | # use 'dims' to make forward work both when training and evaluating 77 | dims = len(obs_action.shape) - 2 # dim == 0: eval, dim == 1: train 78 | if self.obs_mean is not None: 79 | if dims == 1: 80 | obs_mean = self.obs_mean.unsqueeze(0).expand(obs_action.shape[0], -1).to(obs_action.device) 81 | obs_std = self.obs_std.unsqueeze(0).expand(obs_action.shape[0], -1).to(obs_action.device) 82 | else: 83 | obs_mean = self.obs_mean.to(obs_action.device) 84 | obs_std = self.obs_std.to(obs_action.device) 85 | 86 | if self.mode == 'normalize': 87 | batch_size = obs_action.shape[dims] 88 | obs, action = torch.split(obs_action, [self.obs_dim, obs_action.shape[-1] - self.obs_dim], dim=-1) 89 | if dims == 1: 90 | obs = obs - obs_mean.unsqueeze(dims).expand(-1, batch_size, -1) 91 | obs = obs / (obs_std.unsqueeze(dims).expand(-1, batch_size, -1) + 1e-8) 92 | else: 93 | obs = obs - obs_mean.unsqueeze(dims).expand(batch_size, -1) 94 | obs = obs / (obs_std.unsqueeze(dims).expand(batch_size, -1) + 1e-8) 95 | output = torch.cat([obs, action], dim=-1) 96 | else: 97 | output = obs_action 98 | else: 99 | output = obs_action 100 | 101 | for layer in self.backbones: 102 | output = self.activation(layer(output)) 103 | mu, logstd = torch.chunk(self.output_layer(output), 2, dim=-1) 104 | logstd = soft_clamp(logstd, self.min_logstd, self.max_logstd) 105 | # 'local': with residual 106 | if self.mode == 'local' or self.mode == 'normalize': 107 | if self.with_reward: 108 | obs, reward = torch.split(mu, [self.obs_dim, 1], dim=-1) 109 | obs = obs + obs_action[..., :self.obs_dim] 110 | mu = torch.cat([obs, reward], dim=-1) 111 | else: 112 | mu = mu + obs_action[..., :self.obs_dim] 113 | return torch.distributions.Normal(mu, torch.exp(logstd)) 114 | 115 | def set_select(self, indexes): 116 | self.elites = indexes 117 | for layer in self.backbones: 118 | layer.set_select(indexes) 119 | self.output_layer.set_select(indexes) 120 | 121 | def update_save(self, indexes): 122 | for layer in self.backbones: 123 | layer.update_save(indexes) 124 | self.output_layer.update_save(indexes) 125 | 126 | def random_elite_idxs(self, batch_size: int) -> np.ndarray: 127 | idxs = np.random.choice(len(self.elites), size=batch_size) 128 | return idxs -------------------------------------------------------------------------------- /offlinerl/utils/net/model/maple_critic.py: -------------------------------------------------------------------------------- 1 | import torch.nn 2 | import torch.nn as nn 3 | import numpy as np 4 | import torch.nn.functional as F 5 | from offlinerl.utils.net.common import miniblock 6 | 7 | 8 | class Maple_critic(nn.Module): 9 | def __init__(self, obs_dim, action_dim,deterministic=False,hidden_sizes=(16,),value_hidden_sizes=(256,256),lstm_hidden_unit=128): 10 | super(Maple_critic,self).__init__() 11 | self.obs_dim = obs_dim 12 | self.action_dim = action_dim 13 | self.deterministic = deterministic 14 | self.hidden_sizes = list(hidden_sizes).copy() 15 | self.value_hidden_sizes = list(value_hidden_sizes).copy() 16 | self.lstm_hidden_unit = lstm_hidden_unit 17 | self.mlp = miniblock(self.lstm_hidden_unit, self.hidden_sizes[0], None, relu=False) 18 | if len(self.hidden_sizes) >= 2: 19 | for i in range(1,len(self.hidden_sizes)): 20 | self.mlp += miniblock(self.hidden_sizes[i-1], self.hidden_sizes[i], None) 21 | self.mlp = nn.Sequential(*self.mlp) 22 | self.vfs = miniblock(self.hidden_sizes[-1]+self.obs_dim+self.action_dim, self.value_hidden_sizes[0],None) 23 | if len(self.value_hidden_sizes)>=2: 24 | for i in range(1, 
len(self.value_hidden_sizes)): 25 | self.vfs += miniblock(self.value_hidden_sizes[i-1], self.value_hidden_sizes[i], None) 26 | self.vfs += [nn.Linear(self.value_hidden_sizes[-1], 1)] 27 | self.vfs = nn.Sequential(*self.vfs) 28 | 29 | def forward(self, value_hidden, actions, obs): 30 | out = self.mlp(value_hidden) 31 | out = torch.cat([out, obs, actions], dim=-1) 32 | out = self.vfs(out) 33 | return out 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /offlinerl/utils/net/model/new_ensemble.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import os.path as path 5 | from torch.nn import functional as F 6 | from typing import Dict, List, Union, Tuple, Optional 7 | 8 | 9 | class EnsembleLinear(nn.Module): 10 | def __init__( 11 | self, 12 | input_dim: int, 13 | output_dim: int, 14 | num_ensemble: int, 15 | weight_decay: float = 0.0 16 | ) -> None: 17 | super().__init__() 18 | 19 | self.num_ensemble = num_ensemble 20 | 21 | self.register_parameter("weight", nn.Parameter(torch.zeros(num_ensemble, input_dim, output_dim))) 22 | self.register_parameter("bias", nn.Parameter(torch.zeros(num_ensemble, 1, output_dim))) 23 | 24 | nn.init.trunc_normal_(self.weight, std=1/(2*input_dim**0.5)) 25 | 26 | self.register_parameter("saved_weight", nn.Parameter(self.weight.detach().clone())) 27 | self.register_parameter("saved_bias", nn.Parameter(self.bias.detach().clone())) 28 | 29 | self.weight_decay = weight_decay 30 | 31 | def forward(self, x: torch.Tensor) -> torch.Tensor: 32 | weight = self.weight 33 | bias = self.bias 34 | 35 | if len(x.shape) == 2: 36 | x = torch.einsum('ij,bjk->bik', x, weight) 37 | else: 38 | x = torch.einsum('bij,bjk->bik', x, weight) 39 | 40 | x = x + bias 41 | 42 | return x 43 | 44 | def load_save(self) -> None: 45 | self.weight.data.copy_(self.saved_weight.data) 46 | self.bias.data.copy_(self.saved_bias.data) 47 | 48 | def update_save(self, indexes: List[int]) -> None: 49 | self.saved_weight.data[indexes] = self.weight.data[indexes] 50 | self.saved_bias.data[indexes] = self.bias.data[indexes] 51 | 52 | def get_decay_loss(self) -> torch.Tensor: 53 | decay_loss = self.weight_decay * (0.5*((self.weight**2).sum())) 54 | return decay_loss 55 | 56 | 57 | class Swish(nn.Module): 58 | def __init__(self) -> None: 59 | super(Swish, self).__init__() 60 | 61 | def forward(self, x: torch.Tensor) -> torch.Tensor: 62 | x = x * torch.sigmoid(x) 63 | return x 64 | 65 | 66 | def soft_clamp( 67 | x : torch.Tensor, 68 | _min: Optional[torch.Tensor] = None, 69 | _max: Optional[torch.Tensor] = None 70 | ) -> torch.Tensor: 71 | # clamp tensor values while mataining the gradient 72 | if _max is not None: 73 | x = _max - F.softplus(_max - x) 74 | if _min is not None: 75 | x = _min + F.softplus(x - _min) 76 | return x 77 | 78 | 79 | class EnsembleTransition(nn.Module): 80 | def __init__( 81 | self, 82 | obs_dim: int, 83 | action_dim: int, 84 | hidden_dims: Union[List[int], Tuple[int]], 85 | num_ensemble: int = 7, 86 | num_elites: int = 5, 87 | activation: nn.Module = Swish, 88 | weight_decays: Optional[Union[List[float], Tuple[float]]] = None, 89 | with_reward: bool = True, 90 | device: str = "cpu" 91 | ) -> None: 92 | super().__init__() 93 | 94 | self.num_ensemble = num_ensemble 95 | self.num_elites = num_elites 96 | self._with_reward = with_reward 
97 | self.device = torch.device(device) 98 | 99 | self.activation = activation() 100 | 101 | if weight_decays is None: 102 | weight_decays = [0.0] * (len(hidden_dims) + 1) 103 | assert len(weight_decays) == (len(hidden_dims) + 1) 104 | 105 | module_list = [] 106 | hidden_dims = [obs_dim+action_dim] + list(hidden_dims) 107 | for in_dim, out_dim, weight_decay in zip(hidden_dims[:-1], hidden_dims[1:], weight_decays[:-1]): 108 | module_list.append(EnsembleLinear(in_dim, out_dim, num_ensemble, weight_decay)) 109 | self.backbones = nn.ModuleList(module_list) 110 | 111 | self.output_layer = EnsembleLinear( 112 | hidden_dims[-1], 113 | 2 * (obs_dim + self._with_reward), 114 | num_ensemble, 115 | weight_decays[-1] 116 | ) 117 | 118 | self.register_parameter( 119 | "max_logvar", 120 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * 0.5, requires_grad=True) 121 | ) 122 | self.register_parameter( 123 | "min_logvar", 124 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * -10, requires_grad=True) 125 | ) 126 | 127 | self.register_parameter( 128 | "elites", 129 | nn.Parameter(torch.tensor(list(range(0, self.num_elites))), requires_grad=False) 130 | ) 131 | 132 | self.to(self.device) 133 | 134 | def forward(self, obs_action: np.ndarray) -> Tuple[torch.Tensor, torch.Tensor]: 135 | obs_action = torch.as_tensor(obs_action, dtype=torch.float32).to(self.device) 136 | output = obs_action 137 | for layer in self.backbones: 138 | output = self.activation(layer(output)) 139 | mean, logvar = torch.chunk(self.output_layer(output), 2, dim=-1) 140 | logvar = soft_clamp(logvar, self.min_logvar, self.max_logvar) 141 | return mean, logvar 142 | 143 | def load_save(self) -> None: 144 | for layer in self.backbones: 145 | layer.load_save() 146 | self.output_layer.load_save() 147 | 148 | def update_save(self, indexes: List[int]) -> None: 149 | for layer in self.backbones: 150 | layer.update_save(indexes) 151 | self.output_layer.update_save(indexes) 152 | 153 | def get_decay_loss(self) -> torch.Tensor: 154 | decay_loss = 0 155 | for layer in self.backbones: 156 | decay_loss += layer.get_decay_loss() 157 | decay_loss += self.output_layer.get_decay_loss() 158 | return decay_loss 159 | 160 | def set_elites(self, indexes: List[int]) -> None: 161 | assert len(indexes) <= self.num_ensemble and max(indexes) < self.num_ensemble 162 | self.register_parameter('elites', nn.Parameter(torch.tensor(indexes), requires_grad=False)) 163 | 164 | def random_elite_idxs(self, batch_size: int) -> np.ndarray: 165 | idxs = np.random.choice(self.elites.data.cpu().numpy(), size=batch_size) 166 | return idxs 167 | 168 | 169 | class StandardScaler(object): 170 | def __init__(self, mu=None, std=None): 171 | self.mu = mu 172 | self.std = std 173 | 174 | def fit(self, data): 175 | """Computes and stores the per-feature mean and standard deviation of the data. 176 | Standard deviations below 1e-12 are clamped to 1 to avoid division by zero 177 | when transforming. 178 | 179 | Arguments: 180 | data (np.ndarray): A numpy array containing the input 181 | 182 | Returns: None. 183 | """ 184 | self.mu = np.mean(data, axis=0, keepdims=True) 185 | self.std = np.std(data, axis=0, keepdims=True) 186 | self.std[self.std < 1e-12] = 1.0 187 | 188 | def transform(self, data): 189 | """Transforms the input matrix data using the parameters of this scaler. 190 | 191 | Arguments: 192 | data (np.array): A numpy array containing the points to be transformed.
193 | 194 | Returns: (np.array) The transformed dataset. 195 | """ 196 | return (data - self.mu) / self.std 197 | 198 | def inverse_transform(self, data): 199 | """Undoes the transformation performed by this scaler. 200 | 201 | Arguments: 202 | data (np.array): A numpy array containing the points to be transformed. 203 | 204 | Returns: (np.array) The transformed dataset. 205 | """ 206 | return self.std * data + self.mu 207 | 208 | def save_scaler(self, save_path): 209 | mu_path = path.join(save_path, "mu.npy") 210 | std_path = path.join(save_path, "std.npy") 211 | np.save(mu_path, self.mu) 212 | np.save(std_path, self.std) 213 | 214 | def load_scaler(self, load_path): 215 | mu_path = path.join(load_path, "mu.npy") 216 | std_path = path.join(load_path, "std.npy") 217 | self.mu = np.load(mu_path) 218 | self.std = np.load(std_path) 219 | 220 | def transform_tensor(self, data: torch.Tensor): 221 | device = data.device 222 | data = self.transform(data.cpu().numpy()) 223 | data = torch.tensor(data, device=device) 224 | return data -------------------------------------------------------------------------------- /offlinerl/utils/net/model_GRU.py: -------------------------------------------------------------------------------- 1 | import torch.nn 2 | import torch.nn as nn 3 | import numpy as np 4 | import torch.nn.functional as F 5 | from offlinerl.utils.net.common import miniblock 6 | 7 | class GRU_Model(nn.Module): 8 | def __init__(self, obs_dim, action_dim,device=None, lstm_hidden_units=128): 9 | super(GRU_Model, self).__init__() 10 | self.obs_dim = obs_dim 11 | self.action_dim = action_dim 12 | self.device = device 13 | self.lstm_hidden_units = lstm_hidden_units 14 | self.GRU = nn.GRU(self.obs_dim + self.action_dim, lstm_hidden_units, batch_first=True) 15 | def forward(self, obs, last_acts, pre_hidden, lens): 16 | sta_acs = torch.cat([obs, last_acts], dim=-1) 17 | packed = torch.nn.utils.rnn.pack_padded_sequence(sta_acs,lens,batch_first=True, enforce_sorted=False) 18 | if len(pre_hidden.shape) == 2: 19 | pre_hidden = torch.unsqueeze(pre_hidden, dim=0) 20 | output,_ = self.GRU(packed, pre_hidden) 21 | output,_ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True) 22 | return output 23 | def get_hidden(self, obs, last_actions, lens): 24 | pre_hidden = torch.zeros((1,len(lens),self.lstm_hidden_units)).to(self.device) 25 | return self(obs, last_actions, pre_hidden,lens) 26 | -------------------------------------------------------------------------------- /offlinerl/utils/net/moose.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from offlinerl.utils.net.common import BasePolicy 6 | 7 | class VAE(nn.Module, BasePolicy): 8 | def __init__(self, 9 | state_dim, 10 | action_dim, 11 | latent_dim, 12 | max_action, 13 | hidden_size=750): 14 | super(VAE, self).__init__() 15 | 16 | self.e1 = nn.Linear(state_dim + action_dim, hidden_size) 17 | self.e2 = nn.Linear(hidden_size, hidden_size) 18 | 19 | self.mean = nn.Linear(hidden_size, latent_dim) 20 | self.log_std = nn.Linear(hidden_size, latent_dim) 21 | 22 | self.d1 = nn.Linear(latent_dim, hidden_size) 23 | self.d2 = nn.Linear(hidden_size, hidden_size) 24 | self.d3 = nn.Linear(hidden_size, state_dim + action_dim) 25 | 26 | self.max_action = max_action 27 | self.latent_dim = latent_dim 28 | 29 | self._actor = None 30 | 31 | def forward(self, state, action): 32 | z = F.relu(self.e1(torch.cat([state, action], 1))) 33 | z = 
F.relu(self.e2(z)) 34 | 35 | mean = self.mean(z) 36 | # Clamped for numerical stability 37 | log_std = self.log_std(z).clamp(-4, 15) 38 | std = torch.exp(log_std) 39 | z = mean + std * torch.randn_like(std) 40 | 41 | u = self.decode(z) 42 | 43 | return u, mean, std 44 | 45 | def decode(self, z): 46 | a = F.relu(self.d1(z)) 47 | a = F.relu(self.d2(a)) 48 | a = self.d3(a) 49 | return a 50 | 51 | 52 | def policy_infer(self, obs): 53 | return self.decode(obs) -------------------------------------------------------------------------------- /offlinerl/utils/net/tanhpolicy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch import nn as nn 4 | from torch.nn import functional as F 5 | from torch.distributions import Distribution, Normal 6 | 7 | from offlinerl.utils.net.common import BasePolicy 8 | from offlinerl.utils.net.continuous import ActorProb 9 | 10 | 11 | class TanhNormal(Distribution): 12 | """ 13 | Represent distribution of X where 14 | X = tanh(Z) 15 | Z ~ N(mean, std) 16 | 17 | Note: this is not very numerically stable. 18 | """ 19 | def __init__(self, normal_mean, normal_std, max_action=1, min_action=-1, epsilon=1e-6): 20 | """ 21 | :param normal_mean: Mean of the normal distribution 22 | :param normal_std: Std of the normal distribution 23 | :param epsilon: Numerical stability epsilon when computing log-prob. 24 | """ 25 | self.normal_mean = normal_mean 26 | self.normal_std = normal_std 27 | self.normal = Normal(normal_mean, normal_std) 28 | self.epsilon = epsilon 29 | self.max_action = max_action 30 | self.min_action = min_action 31 | 32 | def sample_n(self, n, return_pre_tanh_value=False): 33 | z = self.normal.sample_n(n) 34 | if return_pre_tanh_value: 35 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2, z 36 | else: 37 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2 38 | 39 | def atanh(self,x): 40 | one_plus_x = (1 + x).clamp(min=1e-6) 41 | one_minus_x = (1 - x).clamp(min=1e-6) 42 | return 0.5 * torch.log(one_plus_x / one_minus_x) 43 | 44 | @property 45 | def mode(self): 46 | return ((self.max_action-self.min_action)/2) * torch.tanh(self.normal_mean) + (self.max_action+self.min_action)/2 47 | 48 | def log_prob(self, value, pre_tanh_value=None): 49 | """ 50 | 51 | :param value: some value, x 52 | :param pre_tanh_value: arctanh(x) 53 | :return: 54 | """ 55 | unscaled_value = (2*value - (self.max_action+self.min_action))/(self.max_action - self.min_action) # assume the actual actions have been transformed 56 | if pre_tanh_value is None: 57 | pre_tanh_value = self.atanh(unscaled_value) # get the raw Gaussian distribution output 58 | 59 | # ==== previous calculation of tanh log_prob ===== 60 | # self.normal.log_prob(pre_tanh_value) - torch.log( 61 | # 1 - value * value + self.epsilon 62 | # ) 63 | # previous calculation of tanhGaussian log_prob is OK when the action is in (-1,1). To be more general, we need the following revision 64 | 65 | action_scale = (self.max_action-self.min_action)/2.0 66 | squashed_action = unscaled_value 67 | log_prob = self.normal.log_prob(pre_tanh_value) - torch.log(action_scale * (1 - squashed_action.pow(2)) + self.epsilon) 68 | return log_prob 69 | 70 | def sample(self, return_pretanh_value=False): 71 | """ 72 | Gradients will and should *not* pass through this operation. 73 | 74 | See https://github.com/pytorch/pytorch/issues/4620 for discussion. 
75 | """ 76 | z = self.normal.sample().detach() 77 | 78 | if return_pretanh_value: 79 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2, z 80 | else: 81 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2 82 | 83 | def rsample(self, return_pretanh_value=False): 84 | """ 85 | Sampling in the reparameterization case. 86 | """ 87 | z = ( 88 | self.normal_mean + 89 | self.normal_std * 90 | Normal( 91 | torch.zeros(self.normal_mean.size(), device=self.normal_mean.device), 92 | torch.ones(self.normal_std.size(), device=self.normal_mean.device) 93 | ).sample() 94 | ) 95 | z.requires_grad_() 96 | 97 | if return_pretanh_value: 98 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2, z 99 | else: 100 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2 101 | 102 | 103 | class TanhGaussianPolicy(ActorProb, BasePolicy): 104 | LOG_SIG_MAX = 2 105 | LOG_SIG_MIN = -5 106 | MEAN_MIN = -9.0 107 | MEAN_MAX = 9.0 108 | 109 | def atanh(self,x): 110 | one_plus_x = (1 + x).clamp(min=1e-6) 111 | one_minus_x = (1 - x).clamp(min=1e-6) 112 | return 0.5*torch.log(one_plus_x/ one_minus_x) 113 | 114 | def log_prob(self, obs, actions): 115 | raw_actions = self.atanh(actions) 116 | logits, h = self.preprocess(obs) 117 | 118 | mean = self.mu(logits) 119 | mean = torch.clamp(mean, self.MEAN_MIN, self.MEAN_MAX) 120 | if self._c_sigma: 121 | log_std = torch.clamp( 122 | self.sigma(logits), min=self.LOG_SIG_MIN, max=self.LOG_SIG_MAX 123 | ) 124 | std = log_std.exp() 125 | else: 126 | shape = [1] * len(mean.shape) 127 | shape[1] = -1 128 | log_std = (self.sigma.view(shape) + torch.zeros_like(mean)) 129 | std = log_std.exp() 130 | 131 | tanh_normal = TanhNormal(mean, std) 132 | log_prob = tanh_normal.log_prob(value=actions, pre_tanh_value=raw_actions) 133 | return log_prob.sum(-1) 134 | 135 | def forward( 136 | self, 137 | obs, 138 | state=None, 139 | infor={}, 140 | reparameterize=True, 141 | ): 142 | """ 143 | :param obs: Observation 144 | :param deterministic: If True, do not sample 145 | :param return_log_prob: If True, return a sample and its log probability 146 | """ 147 | logits, h = self.preprocess(obs, state) 148 | mean = self.mu(logits) 149 | 150 | if self._c_sigma: 151 | log_std = torch.clamp( 152 | self.sigma(logits), min=self.LOG_SIG_MIN, max=self.LOG_SIG_MAX 153 | ) 154 | std = log_std.exp() 155 | else: 156 | shape = [1] * len(mean.shape) 157 | shape[1] = -1 158 | log_std = (self.sigma.view(shape) + torch.zeros_like(mean)) 159 | std = log_std.exp() 160 | 161 | return TanhNormal(mean, std, max_action=self._max, min_action=-self._max) 162 | 163 | def policy_infer(self, obs): 164 | return self(obs).mode 165 | -------------------------------------------------------------------------------- /offlinerl/utils/net/terminal_check.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def termination_fn_halfcheetah(obs, act, next_obs): 5 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 6 | 7 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) 8 | done = ~not_done 9 | done = done[:, None] 10 | return done 11 | 12 | def termination_fn_hopper(obs, act, next_obs): 13 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 14 | 15 | height = next_obs[:, 0] 16 | angle = next_obs[:, 1] 17 | not_done = 
np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) * \ 18 | np.isfinite(next_obs).all(axis=-1) \ 19 | * np.abs(next_obs[:,1:] < 100).all(axis=-1) \ 20 | * (height > .7) \ 21 | * (np.abs(angle) < .2) 22 | 23 | done = ~not_done 24 | done = done[:,None] 25 | return done 26 | 27 | def termination_fn_halfcheetahveljump(obs, act, next_obs): 28 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 29 | 30 | done = np.array([False]).repeat(len(obs)) 31 | done = done[:,None] 32 | return done 33 | 34 | def termination_fn_antangle(obs, act, next_obs): 35 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 36 | 37 | x = next_obs[:, 0] 38 | not_done = np.isfinite(next_obs).all(axis=-1) \ 39 | * (x >= 0.2) \ 40 | * (x <= 1.0) 41 | 42 | done = ~not_done 43 | done = done[:,None] 44 | return done 45 | 46 | def termination_fn_ant(obs, act, next_obs): 47 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 48 | 49 | x = next_obs[:, 0] 50 | not_done = np.isfinite(next_obs).all(axis=-1) \ 51 | * (x >= 0.2) \ 52 | * (x <= 1.0) 53 | 54 | done = ~not_done 55 | done = done[:,None] 56 | return done 57 | 58 | def termination_fn_walker2d(obs, act, next_obs): 59 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 60 | 61 | height = next_obs[:, 0] 62 | angle = next_obs[:, 1] 63 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) \ 64 | * (height > 0.8) \ 65 | * (height < 2.0) \ 66 | * (angle > -1.0) \ 67 | * (angle < 1.0) 68 | done = ~not_done 69 | done = done[:,None] 70 | return done 71 | 72 | def termination_fn_point2denv(obs, act, next_obs): 73 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 74 | 75 | done = np.array([False]).repeat(len(obs)) 76 | done = done[:,None] 77 | return done 78 | 79 | def termination_fn_point2dwallenv(obs, act, next_obs): 80 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 81 | 82 | done = np.array([False]).repeat(len(obs)) 83 | done = done[:,None] 84 | return done 85 | 86 | def termination_fn_pendulum(obs, act, next_obs): 87 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 88 | 89 | done = np.zeros((len(obs), 1)) 90 | return done 91 | 92 | def termination_fn_humanoid(obs, act, next_obs): 93 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 94 | 95 | z = next_obs[:,0] 96 | done = (z < 1.0) + (z > 2.0) 97 | 98 | done = done[:,None] 99 | return done 100 | 101 | def termination_fn_pen(obs, act, next_obs): 102 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 103 | 104 | obj_pos = next_obs[:, 24:27] 105 | done = obj_pos[:, 2] < 0.075 106 | 107 | done = done[:,None] 108 | return done 109 | 110 | def terminaltion_fn_door(obs, act, next_obs): 111 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 112 | 113 | done = np.array([False] * obs.shape[0]) 114 | 115 | done = done[:, None] 116 | return done 117 | 118 | def is_terminal(obs,act, next_obs,task): 119 | if 'halfcheetahvel' in task: 120 | return termination_fn_halfcheetahveljump(obs, act, next_obs) 121 | elif 'halfcheetah' in task: 122 | return termination_fn_halfcheetah(obs, act, next_obs) 123 | elif 'hopper' in task: 124 | return termination_fn_hopper(obs,act,next_obs) 125 | elif 'antangle' in task: 126 | return termination_fn_antangle(obs,act,next_obs) 127 | elif 'ant' in task: 128 | return termination_fn_ant(obs, act, next_obs) 129 | elif 'walker2d' in task: 130 | return 
termination_fn_walker2d(obs, act, next_obs) 131 | elif 'point2denv' in task: 132 | return termination_fn_point2denv(obs, act, next_obs) 133 | elif 'point2dwallenv' in task: 134 | return termination_fn_point2dwallenv(obs,act, next_obs) 135 | elif 'pendulum' in task: 136 | return termination_fn_pendulum(obs,act,next_obs) 137 | elif 'humanoid' in task: 138 | return termination_fn_humanoid(obs, act, next_obs) 139 | 140 | def get_termination_fn(task): 141 | if 'halfcheetahvel' in task: 142 | return termination_fn_halfcheetahveljump 143 | elif 'halfcheetah' in task: 144 | return termination_fn_halfcheetah 145 | elif 'hopper' in task: 146 | return termination_fn_hopper 147 | elif 'antangle' in task: 148 | return termination_fn_antangle 149 | elif 'ant' in task: 150 | return termination_fn_ant 151 | elif 'walker2d' in task: 152 | return termination_fn_walker2d 153 | elif 'point2denv' in task: 154 | return termination_fn_point2denv 155 | elif 'point2dwallenv' in task: 156 | return termination_fn_point2dwallenv 157 | elif 'pendulum' in task: 158 | return termination_fn_pendulum 159 | elif 'humanoid' in task: 160 | return termination_fn_humanoid 161 | elif 'pen' in task: 162 | return termination_fn_pen 163 | elif 'door' in task: 164 | return terminaltion_fn_door 165 | elif task in ['Pipeline', 'DMSD', 'Fusion', 'Salespromotion', 'SafetyHalfCheetah']: 166 | def terminaltion_fn(obs, act, next_obs): 167 | data = { 168 | "obs" : obs, 169 | "action" : act, 170 | "next_obs" : next_obs, 171 | } 172 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 173 | 174 | done = np.zeros((len(obs), 1)) 175 | return done 176 | return terminaltion_fn 177 | elif 'RandomFrictionHopper' in task: 178 | from neorl2.envs.terminated.randomfrictionhopper_terminated import get_terminated 179 | 180 | def terminaltion_fn(obs, act, next_obs): 181 | data = { 182 | "obs" : obs, 183 | "action" : act, 184 | "next_obs" : next_obs, 185 | } 186 | return np.bool_(get_terminated(data)) 187 | 188 | return terminaltion_fn 189 | elif 'Simglucose' in task: 190 | from neorl2.envs.terminated.simglucose_terminated import get_terminated 191 | 192 | def terminaltion_fn(obs, act, next_obs): 193 | data = { 194 | "obs" : obs, 195 | "action" : act, 196 | "next_obs" : next_obs, 197 | } 198 | return np.bool_(get_terminated(data)) 199 | 200 | return terminaltion_fn 201 | elif 'RocketRecovery' in task: 202 | from neorl2.envs.terminated.rocketrecovery_terminated import get_terminated 203 | 204 | def terminaltion_fn(obs, act, next_obs): 205 | data = { 206 | "obs" : obs, 207 | "action" : act, 208 | "next_obs" : next_obs, 209 | } 210 | return np.bool_(get_terminated(data)) 211 | 212 | return terminaltion_fn 213 | 214 | else: 215 | raise NotImplementedError(f"Task {task} not implemented") -------------------------------------------------------------------------------- /offlinerl/utils/net/vae.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from offlinerl.utils.net.common import BasePolicy 6 | 7 | class VAE(nn.Module, BasePolicy): 8 | def __init__(self, 9 | state_dim, 10 | action_dim, 11 | latent_dim, 12 | max_action, 13 | hidden_size=750): 14 | super(VAE, self).__init__() 15 | 16 | self.e1 = nn.Linear(state_dim + action_dim, hidden_size) 17 | self.e2 = nn.Linear(hidden_size, hidden_size) 18 | 19 | self.mean = nn.Linear(hidden_size, latent_dim) 20 | self.log_std = nn.Linear(hidden_size, latent_dim) 21 | 22 | self.d1 = 
nn.Linear(state_dim + latent_dim, hidden_size) 23 | self.d2 = nn.Linear(hidden_size, hidden_size) 24 | self.d3 = nn.Linear(hidden_size, action_dim) 25 | 26 | self.max_action = max_action 27 | self.latent_dim = latent_dim 28 | 29 | self._actor = None 30 | 31 | def forward(self, state, action): 32 | z = F.relu(self.e1(torch.cat([state, action], 1))) 33 | z = F.relu(self.e2(z)) 34 | 35 | mean = self.mean(z) 36 | # Clamped for numerical stability 37 | log_std = self.log_std(z).clamp(-4, 15) 38 | std = torch.exp(log_std) 39 | z = mean + std * torch.randn_like(std) 40 | 41 | u = self.decode(state, z) 42 | 43 | return u, mean, std 44 | 45 | def decode(self, state, z=None, clip=None, raw=False): 46 | # When sampling from the VAE, the latent vector is clipped to [-0.5, 0.5] 47 | if z is None: 48 | z = torch.randn((state.shape[0], self.latent_dim)).to(state.device) 49 | if clip is not None: 50 | z = z.clamp(-clip, clip) 51 | 52 | a = F.relu(self.d1(torch.cat([state, z], 1))) 53 | a = F.relu(self.d2(a)) 54 | a = self.d3(a) 55 | if raw: 56 | return a 57 | return self.max_action * torch.tanh(a) 58 | 59 | def policy_infer(self, obs): 60 | return self.decode(obs, z=self._actor(obs)[0]) 61 | 62 | class ActorPerturbation(nn.Module, BasePolicy): 63 | def __init__(self, state_dim, action_dim, latent_action_dim, max_action, max_latent_action=2, phi=0.05): 64 | super(ActorPerturbation, self).__init__() 65 | 66 | self.hidden_size = (400, 300, 400, 300) 67 | 68 | self.l1 = nn.Linear(state_dim, self.hidden_size[0]) 69 | self.l2 = nn.Linear(self.hidden_size[0], self.hidden_size[1]) 70 | self.l3 = nn.Linear(self.hidden_size[1], latent_action_dim) 71 | 72 | self.l4 = nn.Linear(state_dim + action_dim, self.hidden_size[2]) 73 | self.l5 = nn.Linear(self.hidden_size[2], self.hidden_size[3]) 74 | self.l6 = nn.Linear(self.hidden_size[3], action_dim) 75 | 76 | self.max_latent_action = max_latent_action 77 | self.max_action = max_action 78 | self.phi = phi 79 | 80 | self.vae = None 81 | 82 | def forward(self, state, decoder): 83 | a = F.relu(self.l1(state)) 84 | a = F.relu(self.l2(a)) 85 | latent_action = self.max_latent_action * torch.tanh(self.l3(a)) 86 | 87 | mid_action = decoder(state, z=latent_action) 88 | 89 | a = F.relu(self.l4(torch.cat([state, mid_action], 1))) 90 | a = F.relu(self.l5(a)) 91 | a = self.phi * torch.tanh(self.l6(a)) 92 | final_action = (a + mid_action).clamp(-self.max_action, self.max_action) 93 | return latent_action, mid_action, final_action 94 | 95 | def policy_infer(self, obs): 96 | 97 | return self(obs, self.vae.decode)[-1] -------------------------------------------------------------------------------- /offlinerl/utils/replay_pool.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class ReplayPool(object): 5 | """A class used to save and replay data.""" 6 | 7 | @abc.abstractmethod 8 | def add_sample(self, sample): 9 | """Add a transition tuple.""" 10 | pass 11 | 12 | @abc.abstractmethod 13 | def terminate_episode(self): 14 | """Clean up pool after episode termination.""" 15 | pass 16 | 17 | @property 18 | @abc.abstractmethod 19 | def size(self, **kwargs): 20 | pass 21 | 22 | def add_path(self, path): 23 | """Add a rollout to the replay pool. 24 | 25 | This default implementation naively goes through every step, but you 26 | may want to optimize this. 27 | 28 | NOTE: You should NOT call "terminate_episode" after calling add_path. 29 | It's assumed that this function handles the episode termination. 
30 | 31 | :param path: Dict like one outputted by railrl.samplers.util.rollout 32 | """ 33 | self.add_samples(path) 34 | self.terminate_episode() 35 | 36 | @abc.abstractmethod 37 | def random_batch(self, batch_size): 38 | """Return a random batch of size `batch_size`.""" 39 | pass 40 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os 4 | from setuptools import setup 5 | from setuptools import find_packages 6 | 7 | def get_version() -> str: 8 | # https://packaging.python.org/guides/single-sourcing-package-version/ 9 | init = open(os.path.join("offlinerl", "__init__.py"), "r").read().split() 10 | return init[init.index("__version__") + 2][1:-1] 11 | 12 | setup( 13 | name='offlinerl', 14 | description="A Library for Offline RL(Batch RL)", 15 | url="https://agit.ai/Polixir/OfflineRL", 16 | version=get_version(), 17 | packages=find_packages(), 18 | author="SongyiGao", 19 | author_email="songyigao@gmail.com", 20 | python_requires=">=3.7", 21 | install_requires=[ 22 | "aim", 23 | "fire", 24 | "loguru", 25 | "gym", 26 | "scikit-learn", 27 | "gtimer", 28 | "numpy", 29 | "ray==2.9", 30 | "aioredis==1.3.1", 31 | "aiohttp==3.7.4", 32 | ], 33 | 34 | ) 35 | --------------------------------------------------------------------------------
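The termination functions collected in offlinerl/utils/net/terminal_check.py are used by the model-based algorithms to decide which imagined transitions end an episode when rolling out a learned dynamics model. After an editable install of the package (pip install -e .), they can be exercised directly; a minimal sketch follows, where the task name and array shapes are illustrative assumptions.

import numpy as np
from offlinerl.utils.net.terminal_check import get_termination_fn

# 'hopper-medium-v2' is matched by the 'hopper' substring; 11 and 3 are Hopper's obs/act sizes
term_fn = get_termination_fn("hopper-medium-v2")

obs = np.random.randn(8, 11)
act = np.random.uniform(-1.0, 1.0, size=(8, 3))
next_obs = np.random.randn(8, 11)

done = term_fn(obs, act, next_obs)  # boolean array of shape (batch, 1)
print(done.shape)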