├── .gitignore
├── LICENSE
├── README.md
├── examples
│   ├── model_tune.py
│   ├── train_d4rl.py
│   ├── train_task.py
│   └── train_tune.py
├── offlinerl
│   ├── __init__.py
│   ├── algo
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── dynamics_model
│   │   │   ├── __init__.py
│   │   │   └── bc_model.py
│   │   ├── modelbase
│   │   │   ├── __init__.py
│   │   │   ├── bremen.py
│   │   │   ├── combo.py
│   │   │   ├── maple.py
│   │   │   ├── maple_new.py
│   │   │   ├── mobile.py
│   │   │   ├── model_base.py
│   │   │   ├── moose.py
│   │   │   ├── mopo.py
│   │   │   └── rambo.py
│   │   ├── modelfree
│   │   │   ├── __init__.py
│   │   │   ├── bc.py
│   │   │   ├── bcq.py
│   │   │   ├── bcqd.py
│   │   │   ├── cql.py
│   │   │   ├── crr.py
│   │   │   ├── edac.py
│   │   │   ├── mcq.py
│   │   │   ├── plas.py
│   │   │   ├── prdc.py
│   │   │   └── td3bc.py
│   │   └── online
│   │       ├── __init__.py
│   │       └── bremen.py
│   ├── config
│   │   ├── __init__.py
│   │   └── algo
│   │       ├── __init__.py
│   │       ├── bc_config.py
│   │       ├── bc_model_config.py
│   │       ├── bcq_config.py
│   │       ├── bcqd_config.py
│   │       ├── bremen_config.py
│   │       ├── combo_config.py
│   │       ├── cql_config.py
│   │       ├── crr_config.py
│   │       ├── edac_config.py
│   │       ├── maple_config.py
│   │       ├── maple_config_new.py
│   │       ├── mcq_config.py
│   │       ├── mobile_config.py
│   │       ├── moose_config.py
│   │       ├── mopo_config.py
│   │       ├── plas_config.py
│   │       ├── prdc_config.py
│   │       ├── rambo_config.py
│   │       └── td3bc_config.py
│   ├── data
│   │   ├── __init__.py
│   │   ├── d4rl.py
│   │   └── neorl.py
│   ├── evaluation
│   │   ├── __init__.py
│   │   ├── d4rl.py
│   │   ├── fqe.py
│   │   ├── gym.py
│   │   └── neorl.py
│   ├── outside_utils
│   │   ├── buffer
│   │   │   ├── __init__.py
│   │   │   └── buffer.py
│   │   ├── dynamics
│   │   │   ├── __init__.py
│   │   │   ├── base_dynamics.py
│   │   │   ├── ensemble_dynamics.py
│   │   │   ├── mujoco_oracle_dynamics.py
│   │   │   └── rnn_dynamics.py
│   │   ├── modules
│   │   │   ├── __init__.py
│   │   │   ├── actor_module.py
│   │   │   ├── critic_module.py
│   │   │   ├── dist_module.py
│   │   │   ├── dynamics_module.py
│   │   │   └── ensemble_critic_module.py
│   │   ├── nets
│   │   │   ├── __init__.py
│   │   │   ├── ensemble_linear.py
│   │   │   ├── mlp.py
│   │   │   ├── rnn.py
│   │   │   └── vae.py
│   │   └── utils
│   │       ├── __init__.py
│   │       ├── logger.py
│   │       ├── scaler.py
│   │       └── termination_fns.py
│   └── utils
│       ├── __init__.py
│       ├── config.py
│       ├── data.py
│       ├── env.py
│       ├── exp.py
│       ├── flexible_replay_pool.py
│       ├── function.py
│       ├── io.py
│       ├── loader.py
│       ├── logger.py
│       ├── net
│       │   ├── __init__.py
│       │   ├── bcq_net.py
│       │   ├── common.py
│       │   ├── continuous.py
│       │   ├── maple_actor.py
│       │   ├── mlas.py
│       │   ├── model
│       │   │   ├── __init__.py
│       │   │   ├── ensemble.py
│       │   │   ├── maple_critic.py
│       │   │   └── new_ensemble.py
│       │   ├── model_GRU.py
│       │   ├── moose.py
│       │   ├── tanhpolicy.py
│       │   ├── terminal_check.py
│       │   └── vae.py
│       ├── replay_pool.py
│       └── simple_replay_pool.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 |
7 | # define
8 | test/
9 | .aim*
10 | offlinerl_tmp/
11 |
12 | # C extensions
13 | *.so
14 |
15 | # .idea folder
16 | .idea/
17 |
18 | # Distribution / packaging
19 | .Python
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | wheels/
32 | pip-wheel-metadata/
33 | share/python-wheels/
34 | *.egg-info/
35 | .installed.cfg
36 | *.egg
37 | MANIFEST
38 |
39 | # PyInstaller
40 | # Usually these files are written by a python script from a template
41 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
42 | *.manifest
43 | *.spec
44 |
45 | # Installer logs
46 | pip-log.txt
47 | pip-delete-this-directory.txt
48 |
49 | # Unit test / coverage reports
50 | htmlcov/
51 | .tox/
52 | .nox/
53 | .coverage
54 | .coverage.*
55 | .cache
56 | nosetests.xml
57 | coverage.xml
58 | *.cover
59 | *.py,cover
60 | .hypothesis/
61 | .pytest_cache/
62 | cover/
63 |
64 | # Translations
65 | *.mo
66 | *.pot
67 |
68 | # Django stuff:
69 | *.log
70 | *.out
71 | local_settings.py
72 | db.sqlite3
73 | db.sqlite3-journal
74 |
75 | # Flask stuff:
76 | instance/
77 | .webassets-cache
78 |
79 | # Scrapy stuff:
80 | .scrapy
81 |
82 | # Sphinx documentation
83 | docs/_build/
84 |
85 | # PyBuilder
86 | target/
87 |
88 | # Jupyter Notebook
89 | .ipynb_checkpoints
90 |
91 | # IPython
92 | profile_default/
93 | ipython_config.py
94 |
95 | # pyenv
96 | # For a library or package, you might want to ignore these files since the code is
97 | # intended to run in multiple environments; otherwise, check them in:
98 | # .python-version
99 |
100 | # pipenv
101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
104 | # install all needed dependencies.
105 | #Pipfile.lock
106 |
107 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
108 | __pypackages__/
109 |
110 | # Celery stuff
111 | celerybeat-schedule
112 | celerybeat.pid
113 |
114 | # SageMath parsed files
115 | *.sage.py
116 |
117 | # Environments
118 | .env
119 | .venv
120 | venv/
121 | ENV/
122 | env.bak/
123 | venv.bak/
124 |
125 | # Spyder project settings
126 | .spyderproject
127 | .spyproject
128 |
129 | # Rope project settings
130 | .ropeproject
131 |
132 | # mkdocs documentation
133 | /site
134 |
135 | # mypy
136 | .mypy_cache/
137 | .dmypy.json
138 | dmypy.json
139 |
140 | # Pyre type checker
141 | .pyre/
142 |
143 | # pytype static type analyzer
144 | .pytype/
145 |
146 | # customize
147 | log/
148 | MUJOCO_LOG.TXT
149 | *.pth
150 | .vscode/
151 | .DS_Store
152 | *.zip
153 | *.pstats
154 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OfflineRL
2 |
3 | OfflineRL is a repository of offline reinforcement learning (also known as batch reinforcement learning) algorithms.
4 |
5 | ## Re-implemented Algorithms
6 | ### Model-free methods
7 | - **CRR**: Wang, Ziyu, et al. “Critic Regularized Regression.” Advances in Neural Information Processing Systems, vol. 33, 2020, pp. 7768–7778. [paper](https://arxiv.org/abs/2006.15134)
8 | - **CQL**: Kumar, Aviral, et al. “Conservative Q-Learning for Offline Reinforcement Learning.” Advances in Neural Information Processing Systems, vol. 33, 2020. [paper](https://arxiv.org/abs/2006.04779) [code](https://github.com/aviralkumar2907/CQL)
9 | - **PLAS**: Zhou, Wenxuan, et al. “PLAS: Latent Action Space for Offline Reinforcement Learning.” ArXiv Preprint ArXiv:2011.07213, 2020.
10 | [website](https://sites.google.com/view/latent-policy) [paper](https://arxiv.org/abs/2011.07213) [code](https://github.com/Wenxuan-Zhou/PLAS)
11 | - **BCQ**: Fujimoto, Scott, et al. “Off-Policy Deep Reinforcement Learning without Exploration.” International Conference on Machine Learning, 2018, pp. 2052–2062. [paper](https://arxiv.org/abs/1812.02900) [code](https://github.com/sfujim/BCQ)
12 | - **EDAC**: An, Gaon, et al. "Uncertainty-based offline reinforcement learning with diversified q-ensemble." Advances in neural information processing systems 34 (2021): 7436-7447. [paper](https://arxiv.org/abs/2110.01548) [code](https://github.com/snu-mllab/EDAC)
13 | - **MCQ**: Lyu, Jiafei, et al. "Mildly conservative q-learning for offline reinforcement learning." Advances in Neural Information Processing Systems 35 (2022): 1711-1724. [paper](https://arxiv.org/abs/2206.04745) [code](https://github.com/dmksjfl/MCQ)
14 | - **TD3BC**: Fujimoto, Scott, and Shixiang Shane Gu. "A minimalist approach to offline reinforcement learning." Advances in neural information processing systems 34 (2021): 20132-20145. [paper](https://arxiv.org/abs/2106.06860) [code](https://github.com/sfujim/TD3_BC)
15 | - **PRDC**: Ran, Yuhang, et al. “Policy Regularization with Dataset Constraint for Offline Reinforcement Learning.” International Conference on Machine Learning, 2023, pp. 28701-28717. [paper](https://arxiv.org/abs/2306.06569) [code](https://github.com/LAMDA-RL/PRDC)
16 | ### Model-based methods
17 | - **BREMEN**: Matsushima, Tatsuya, et al. “Deployment-Efficient Reinforcement Learning via Model-Based Offline Optimization.” International Conference on Learning Representations, 2021. [paper](https://openreview.net/forum?id=3hGNqpI4WS) [code](https://github.com/matsuolab/BREMEN)
18 | - **COMBO**: Yu, Tianhe, et al. "COMBO: Conservative Offline Model-Based Policy Optimization." arXiv preprint arXiv:2102.08363 (2021). [paper](https://arxiv.org/abs/2102.08363)
19 | - **MOPO**: Yu, Tianhe, et al. “MOPO: Model-Based Offline Policy Optimization.” Advances in Neural Information Processing Systems, vol. 33, 2020. [paper](https://papers.nips.cc/paper/2020/hash/a322852ce0df73e204b7e67cbbef0d0a-Abstract.html) [code](https://github.com/tianheyu927/mopo)
20 | - **MAPLE**: Xiong-Hui Chen, et al. "MAPLE: Offline Model-based Adaptable Policy Learning". Advances in Neural Information Processing Systems, vol. 34, 2021. [paper](https://proceedings.neurips.cc/paper/2021/hash/470e7a4f017a5476afb7eeb3f8b96f9b-Abstract.html) [code](https://github.com/xionghuichen/MAPLE)
21 | - **MOBILE**: Yihao Sun, et al. "Model-Bellman Inconsistency for Model-based Offline Reinforcement Learning". Proceedings of the 40th International Conference on Machine Learning, PMLR 202:33177-33194, 2023. [paper](https://proceedings.mlr.press/v202/sun23q.html) [code](https://github.com/yihaosun1124/mobile)
22 | - **RAMBO**: Rigter, Marc, Bruno Lacerda, and Nick Hawes. "Rambo-rl: Robust adversarial model-based offline reinforcement learning." Advances in neural information processing systems 35 (2022): 16082-16097. [paper](https://arxiv.org/abs/2204.12581) [code](https://github.com/marc-rigter/rambo)
23 |
24 | ## Install Datasets
25 | ### NeoRL
26 |
27 | ```shell
28 | git clone https://github.com/Polixir/neorl.git
29 | cd neorl
30 | pip install -e .
31 | ```
32 |
33 | For more details on use, please see [neorl](https://github.com/Polixir/neorl).
34 |
35 | ### D4RL (Optional)
36 | ```shell
37 | pip install git+https://github.com/rail-berkeley/d4rl@master#egg=d4rl
38 | ```
39 |
40 | For more details on use, please see [d4rl](https://github.com/rail-berkeley/d4rl).
41 |
42 | ## Install offlinerl
43 |
44 | ```shell
45 | pip install -e .
46 | ```
47 |
48 | ## Example
49 |
50 | ```shell
51 | # Train on the HalfCheetah-v3-L-9 task using the default parameters of the CQL algorithm
52 | python examples/train_task.py --algo_name=cql --exp_name=halfcheetah --task HalfCheetah-v3 --task_data_type low --task_train_num 100
53 |
54 | # Train on the SafetyHalfCheetah task using the default parameters of the MCQ algorithm
55 | python examples/train_task.py --algo_name=mcq --exp_name=SafetyHalfCheetah --task SafetyHalfCheetah
56 |
57 | # Search the default parameter space of the CQL algorithm on the HalfCheetah-v3-L-9 task
58 | python examples/train_tune.py --algo_name=cql --exp_name=halfcheetah --task HalfCheetah-v3 --task_data_type low --task_train_num 100
59 |
60 | # Search the default parameter space of the MCQ algorithm on the SafetyHalfCheetah task
61 | # python examples/train_tune.py --algo_name=mcq --exp_name=SafetyHalfCheetah --task SafetyHalfCheetah
62 |
63 | # Train on the D4RL halfcheetah-medium task using the default parameters of the CQL algorithm (D4RL needs to be installed)
64 | python examples/train_d4rl.py --algo_name=cql --exp_name=d4rl-halfcheetah-medium-cql --task d4rl-halfcheetah-medium-v0
65 | ```
66 |
67 | **Parameters:**
68 |
69 | - **algo_name**: Algorithm name. All of the re-implemented algorithms listed above are available, e.g. bc, bcq, cql, crr, plas, prdc, edac, mcq, td3bc, mopo, combo, moose, bremen, maple, mobile, and rambo.
70 | - **exp_name**: Experiment name, used to organize and visualize runs in Aim.
71 | - **task**: Task name; see [neorl](https://github.com/Polixir/neorl/wiki/Tasks) for details.
72 | - **task_data_type**: Data quality level. For each task, [neorl](https://github.com/Polixir/neorl) provides datasets collected by low-, medium-, and high-level policies.
73 | - **task_train_num**: Number of training trajectories. For each task, neorl provides up to 10000 training trajectories. (A sketch of how these flags are consumed programmatically follows below.)
74 |
75 |
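These flags are forwarded by `fire` to `algo_select`, which merges them into the algorithm's default config. The sketch below mirrors the flow of `examples/train_task.py`; the `number_of_runs=10` value is an illustrative assumption, not a repository default.

```python
# Hedged sketch: programmatic equivalent of `python examples/train_task.py ...`,
# patterned on examples/train_task.py. number_of_runs=10 is illustrative only.
from offlinerl.algo import algo_select
from offlinerl.data import load_data_from_neorl
from offlinerl.evaluation import OnlineCallBackFunction

kwargs = dict(algo_name="cql", exp_name="halfcheetah",
              task="HalfCheetah-v3", task_data_type="low", task_train_num=100)

# Look up the algorithm and its default config, then load the NeoRL dataset.
algo_init_fn, algo_trainer_cls, algo_config = algo_select(kwargs)
train_buffer, val_buffer = load_data_from_neorl(
    algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"])

# Build the trainer and the online evaluation callback, then train.
algo_trainer = algo_trainer_cls(algo_init_fn(algo_config), algo_config)
callback = OnlineCallBackFunction()
callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer,
                    task=algo_config["task"], number_of_runs=10)
algo_trainer.train(train_buffer, None, callback_fn=callback)
```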
76 |
77 | ## View experimental results
78 | We use **Aim** to store and visualize results. Aim is an experiment logger that makes it easy to manage thousands of experiments. For more details, see [aim](https://github.com/aimhubio/aim).
79 |
80 | To visualize results in this repository:
81 | ```shell
82 | cd offlinerl_tmp
83 | aim up
84 | ```
85 | Then you can view the results at http://127.0.0.1:43800.
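In addition to the Aim UI, each run writes its metrics to `metric_logs.json` and checkpoints the latest policy to `models/<run_id>/policy.pt` under the Aim repository directory (see `BaseAlgo.log_res` in `offlinerl/algo/base.py`). A minimal sketch for loading such a checkpoint; the path below is a placeholder and depends on where your Aim repo lives.

```python
# Hedged sketch: load a policy checkpoint saved by BaseAlgo.log_res().
# "<run_id>" and the repo location are placeholders; substitute your own paths.
import torch

policy = torch.load("offlinerl_tmp/.aim/models/<run_id>/policy.pt", map_location="cpu")
policy.eval()  # the checkpoint is the full actor module saved with torch.save
```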
86 |
87 |
88 | ## Model-based Running Example
89 |
90 | ```shell
91 | # Tune and save the transition models
92 | python examples/model_tune.py --algo_name bc_model --exp_name neorl-RandomFrictionHopper-model --task RandomFrictionHopper
93 | ```
94 |
95 | ```shell
96 | # Train MOPO and load the best transition model
97 | python examples/train_task.py --algo_name mopo --exp_name neorl-safecheetah-mopo-new --task SafetyHalfCheetah --dynamics_path best_run_id
98 |
99 | # Train COMBO and load the best transition model
100 | python examples/train_task.py --algo_name combo --exp_name neorl-safecheetah-combo-new --task SafetyHalfCheetah --dynamics_path best_run_id
101 |
102 | # Train RAMBO and load the best transition model
103 | python examples/train_task.py --algo_name rambo --exp_name neorl-safecheetah-rambo-new --task SafetyHalfCheetah --dynamics_path best_run_id
104 |
105 | # Train MOBILE and load the best transition model
106 | python examples/train_task.py --algo_name mobile --exp_name neorl-safecheetah-mobile-new --task SafetyHalfCheetah --dynamics_path best_run_id
107 | ```
108 |
--------------------------------------------------------------------------------
/examples/model_tune.py:
--------------------------------------------------------------------------------
1 | import fire
2 | import random
3 | from ray import tune
4 |
5 | from offlinerl.algo import algo_select
6 | from offlinerl.data import load_data_from_neorl
7 | from offlinerl.evaluation import get_defalut_callback, ModelCallBackFunction
8 |
9 | def training_function(config):
10 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(config["kwargs"])
11 | train_buffer, val_buffer = load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"])
12 | algo_config.update(config)
13 | algo_config["device"] = "cuda"
14 | algo_init = algo_init_fn(algo_config)
15 | algo_trainer = algo_trainer_obj(algo_init, algo_config)
16 |
17 | callback = ModelCallBackFunction()
18 | callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer, task=algo_config["task"])
19 |
20 | score = algo_trainer.train(train_buffer, None, callback_fn=callback)
21 |
22 | # return score
23 | return 0
24 |
25 |
26 | def run_algo(**kwargs):
27 | config = {}
28 | config["kwargs"] = kwargs
29 | config["kwargs"]['seed'] = random.randint(0, 1000000)
30 | _, _, algo_config = algo_select(kwargs)
31 | # Prepare Dataset
32 | load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"])
33 | grid_tune = algo_config["grid_tune"]
34 | for k,v in grid_tune.items():
35 | config[k] = tune.grid_search(v)
36 |
37 | analysis = tune.run(
38 | training_function,
39 | config=config,
40 | resources_per_trial={"gpu": 0.5},
41 | )
42 |
43 |
44 | if __name__ == "__main__":
45 | fire.Fire(run_algo)
46 |
--------------------------------------------------------------------------------
/examples/train_d4rl.py:
--------------------------------------------------------------------------------
1 | import fire
2 |
3 | from offlinerl.algo import algo_select
4 | from offlinerl.data.d4rl import load_d4rl_buffer
5 | from offlinerl.evaluation import OnlineCallBackFunction
6 |
7 |
8 | def run_algo(**kwargs):
9 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(kwargs)
10 | train_buffer = load_d4rl_buffer(algo_config["task"])
11 | algo_init = algo_init_fn(algo_config)
12 | algo_trainer = algo_trainer_obj(algo_init, algo_config)
13 | callback = OnlineCallBackFunction()
14 | callback.initialize(train_buffer=train_buffer, val_buffer=None,
15 | task=algo_config["task"], number_of_runs=algo_config.get("eval_episodes",100))
16 |
17 | algo_trainer.train(train_buffer, None, callback_fn=callback)
18 |
19 | if __name__ == "__main__":
20 | fire.Fire(run_algo)
21 |
--------------------------------------------------------------------------------
/examples/train_task.py:
--------------------------------------------------------------------------------
1 | import fire
2 |
3 | from offlinerl.algo import algo_select
4 | from offlinerl.data import load_data_from_neorl
5 | from offlinerl.evaluation import get_defalut_callback, OnlineCallBackFunction
6 |
7 |
8 | def run_algo(**kwargs):
9 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(kwargs)
10 | train_buffer, val_buffer = load_data_from_neorl(algo_config["task"],
11 | algo_config["task_data_type"], algo_config["task_train_num"])
12 | algo_config['data_name'] = "neorl2-" + algo_config["task"]
13 | algo_init = algo_init_fn(algo_config)
14 | algo_trainer = algo_trainer_obj(algo_init, algo_config)
15 | callback = OnlineCallBackFunction()
16 | callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer,
17 | task=algo_config["task"], number_of_runs=algo_config.get("eval_episodes",100))
18 |
19 | algo_trainer.train(train_buffer, None, callback_fn=callback)
20 |
21 | if __name__ == "__main__":
22 | fire.Fire(run_algo)
23 |
--------------------------------------------------------------------------------
/examples/train_tune.py:
--------------------------------------------------------------------------------
1 | import fire
2 | import random
3 | from ray import tune
4 |
5 | from offlinerl.algo import algo_select
6 | from offlinerl.data import load_data_from_neorl
7 | from offlinerl.evaluation import get_defalut_callback, OnlineCallBackFunction
8 |
9 | def training_function(config):
10 | algo_init_fn, algo_trainer_obj, algo_config = algo_select(config["kwargs"])
11 | train_buffer, val_buffer = load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"])
12 | algo_config.update(config)
13 | algo_config["device"] = "cuda"
14 | algo_init = algo_init_fn(algo_config)
15 | algo_trainer = algo_trainer_obj(algo_init, algo_config)
16 |
17 | callback = OnlineCallBackFunction()
18 | callback.initialize(train_buffer=train_buffer, val_buffer=val_buffer, task=algo_config["task"])
19 |
20 | score = algo_trainer.train(train_buffer, None, callback_fn=callback)
21 |
22 | # return score
23 | return 0
24 |
25 |
26 | def run_algo(**kwargs):
27 | config = {}
28 | config["kwargs"] = kwargs
29 | config["kwargs"]['seed'] = random.randint(0, 1000000)
30 | _, _, algo_config = algo_select(kwargs)
31 | # Prepare Dataset
32 | load_data_from_neorl(algo_config["task"], algo_config["task_data_type"], algo_config["task_train_num"])
33 | grid_tune = algo_config["grid_tune"]
34 | for k,v in grid_tune.items():
35 | config[k] = tune.grid_search(v)
36 |
37 | analysis = tune.run(
38 | training_function,
39 | config=config,
40 | resources_per_trial={"gpu": 0.333333},
41 | )
42 |
43 |
44 | if __name__ == "__main__":
45 | fire.Fire(run_algo)
--------------------------------------------------------------------------------
/offlinerl/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from loguru import logger
3 |
4 | from offlinerl import algo, data, evaluation, utils, config
5 |
6 | logger_config = {
7 | "handlers": [
8 | {"sink": sys.stdout,
9 | "colorize" : True,
10 | #"format" : "{time} {message}",
11 | "format" : "{time:YYYY-MM-DD at HH:mm:ss.SSS} | {level} | {message}",
12 | "enqueue" : True,
13 | "backtrace" : True,
14 | "diagnose" : True,
15 | },
16 | ],
17 |
18 | }
19 | logger.configure(**logger_config)
20 |
21 | #logger.disable("offlinerl")
22 | logger.enable("offlinerl")
23 |
24 | __version__ = "0.0.1"
25 |
26 | __all__ = [
27 | "algo",
28 | "data",
29 | "evaluation",
30 | "utils",
31 | "config",
32 | ]
--------------------------------------------------------------------------------
/offlinerl/algo/__init__.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 | import warnings
3 |
4 | warnings.filterwarnings('ignore')
5 |
6 |
7 | from offlinerl.config.algo import edac_config, mcq_config, cql_config, plas_config, mopo_config, moose_config, bcqd_config, bcq_config, bc_config, crr_config, combo_config, bremen_config, maple_config, mobile_config, rambo_config, td3bc_config, bc_model_config, maple_config_new,prdc_config
8 | from offlinerl.utils.config import parse_config
9 | from offlinerl.algo.modelfree import cql, plas, bcqd, bcq, bc, crr, edac, mcq, td3bc, prdc
10 | from offlinerl.algo.modelbase import mopo, moose, combo, bremen, maple, mobile, rambo, maple_new
11 | from offlinerl.algo.dynamics_model import bc_model
12 |
13 | algo_dict = {
14 | 'edac' : {"algo" : edac, "config" : edac_config},
15 | 'bc' : {"algo" : bc, "config" : bc_config},
16 | 'bcq' : {"algo" : bcq, "config" : bcq_config},
17 | 'mcq' : {"algo" : mcq, "config" : mcq_config},
18 | 'bcqd' : {"algo" : bcqd, "config" : bcqd_config},
19 | 'combo' : {"algo" : combo, "config" : combo_config},
20 | "cql" : {"algo" : cql, "config" : cql_config},
21 | "crr" : {"algo" : crr, "config" : crr_config},
22 | "plas" : {"algo" : plas, "config" : plas_config},
23 | "prdc" : {"algo" : prdc, "config" : prdc_config},
24 | 'moose' : {"algo" : moose, "config" : moose_config},
25 | 'mopo': {"algo" : mopo, "config": mopo_config},
26 | 'bremen' : {"algo" : bremen, "config" : bremen_config},
27 | 'maple': {'algo':maple , 'config':maple_config},
28 | 'mobile': {'algo':mobile , 'config':mobile_config},
29 | 'rambo': {'algo':rambo , 'config':rambo_config},
30 | 'td3bc': {'algo':td3bc , 'config':td3bc_config},
31 | 'bc_model': {'algo':bc_model , 'config':bc_model_config},
32 | 'maple_new': {'algo':maple_new , 'config':maple_config_new},
33 | }
34 |
35 | def algo_select(command_args, algo_config_module=None):
36 | algo_name = command_args["algo_name"]
37 | logger.info('Use {} algorithm!', algo_name)
38 | assert algo_name in algo_dict.keys()
39 | algo = algo_dict[algo_name]["algo"]
40 |
41 | if algo_config_module is None:
42 | algo_config_module = algo_dict[algo_name]["config"]
43 | algo_config = parse_config(algo_config_module)
44 | algo_config.update(command_args)
45 |
46 | algo_init = algo.algo_init
47 | algo_trainer = algo.AlgoTrainer
48 |
49 | return algo_init, algo_trainer, algo_config
50 |
51 |
--------------------------------------------------------------------------------
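`algo_select` simply looks the algorithm up in `algo_dict`, and each registered module must expose an `algo_init(args)` function and an `AlgoTrainer` class (see the last lines of the file above). A hedged sketch of registering an additional algorithm at runtime; `my_project`, `my_algo`, and `my_algo_config` are placeholder names, not part of this repository.

```python
# Hedged sketch: wiring a hypothetical extra algorithm into algo_dict.
# my_project / my_algo / my_algo_config are illustrative placeholders.
from offlinerl.algo import algo_dict, algo_select
from my_project import my_algo          # must define algo_init(args) and AlgoTrainer
from my_project import my_algo_config   # a module of default hyperparameters

# Register the new entry; algo_select will now accept algo_name="my_algo".
algo_dict["my_algo"] = {"algo": my_algo, "config": my_algo_config}

algo_init_fn, trainer_cls, config = algo_select({"algo_name": "my_algo", "task": "Hopper-v3"})
```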
/offlinerl/algo/base.py:
--------------------------------------------------------------------------------
1 | import os
2 | import uuid
3 | import json
4 | from abc import ABC, abstractmethod
5 |
6 | import torch
7 | from collections import OrderedDict
8 | from loguru import logger
9 | from offlinerl.utils.exp import init_exp_run
10 | from offlinerl.utils.io import create_dir
11 | from offlinerl.utils.logger import log_path
12 |
13 |
14 | import time
15 | import random
16 |
17 | class BaseAlgo(ABC):
18 | def __init__(self, args):
19 | logger.info('Init AlgoTrainer')
20 | if "exp_name" not in args.keys():
21 | exp_name = str(uuid.uuid1()).replace("-","")
22 | else:
23 | exp_name = args["exp_name"]
24 |
25 | if "aim_path" in args.keys():
26 | if os.path.exists(args["aim_path"]):
27 | time.sleep(random.randint(1, 5))
28 | repo = args["aim_path"]
29 | else:
30 | os.makedirs(args["aim_path"])
31 | repo = args["aim_path"]
32 | else:
33 | repo = None
34 |
35 | self.repo = repo
36 |
37 | try:
38 | self.exp_run = init_exp_run(repo = repo, experiment_name = exp_name)
39 | except:
40 | time.sleep(random.randint(1, 5))
41 | self.exp_run = init_exp_run(repo = repo, experiment_name = exp_name)
42 |
43 | if self.exp_run.repo is not None: # a naive fix of aim exp_logger.repo is None
44 | self.index_path = self.exp_run.repo.path
45 | else:
46 | repo = os.path.join(log_path(),"./.aim")
47 | if not os.path.exists(repo):
48 | logger.info('{} dir does not exist, creating {}', repo, repo)
49 | os.system(str("cd " + os.path.join(repo,"../") + "&& aim init"))
50 | self.index_path = repo
51 |
52 | print(f'self.index_path/{self.index_path}')
53 | self.models_save_dir = os.path.join(self.index_path, "models")
54 | self.metric_logs = OrderedDict()
55 | self.metric_logs_path = os.path.join(self.index_path, "metric_logs.json")
56 | create_dir(self.models_save_dir)
57 |
58 | # self.exp_run.set_params(args, name='hparams')
59 | self.exp_run['hparams'] = args
60 |
61 | def log_res(self, epoch, result):
62 | logger.info('Epoch : {}', epoch)
63 | for k,v in result.items():
64 | logger.info('{} : {}',k, v)
65 | self.exp_run.track(v, name=k.split(" ")[0], epoch=epoch,)
66 |
67 | self.metric_logs[str(epoch)] = result
68 | with open(self.metric_logs_path,"w") as f:
69 | json.dump(self.metric_logs,f)
70 |
71 | self.run_id = self.exp_run.name.split( )[-1]
72 | tmp_dir = os.path.join(self.models_save_dir, self.run_id)
73 | if not os.path.exists(tmp_dir):
74 | os.makedirs(tmp_dir)
75 | # self.save_model(os.path.join(tmp_dir, str(epoch) + ".pt"))
76 | self.save_model(os.path.join(tmp_dir, "policy.pt"))
77 |
78 | self.report_result = result
79 | self.report_result["hparams"] = self.exp_run['hparams']
80 | self.report_result["model_path"] = os.path.join(tmp_dir, "policy.pt")
81 |
82 |
83 | @abstractmethod
84 | def train(self,
85 | history_buffer,
86 | eval_fn=None,):
87 | pass
88 |
89 | def _sync_weight(self, net_target, net, soft_target_tau = 5e-3):
90 | for o, n in zip(net_target.parameters(), net.parameters()):
91 | o.data.copy_(o.data * (1.0 - soft_target_tau) + n.data * soft_target_tau)
92 |
93 | @abstractmethod
94 | def get_policy(self,):
95 | pass
96 |
97 | #@abstractmethod
98 | def save_model(self, model_path):
99 | torch.save(self.get_policy(), model_path)
100 |
101 | #@abstractmethod
102 | def load_model(self, model_path):
103 | model = torch.load(model_path)
104 |
105 | return model
--------------------------------------------------------------------------------
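Every trainer in the repository subclasses `BaseAlgo` and implements `train` and `get_policy`; `log_res` then handles Aim tracking, `metric_logs.json`, and the `policy.pt` checkpoint. A minimal hedged skeleton of such a subclass; the actor network, loss, and config keys are placeholders patterned on `offlinerl/algo/modelfree/bc.py`.

```python
# Hedged sketch: the minimal surface a new trainer must implement.
# The actor/loss below are placeholders, not an actual algorithm.
from offlinerl.algo.base import BaseAlgo


class AlgoTrainer(BaseAlgo):
    def __init__(self, algo_init, args):
        super().__init__(args)
        self.args = args
        self.actor = algo_init["actor"]["net"]
        self.actor_optim = algo_init["actor"]["opt"]

    def train(self, train_buffer, val_buffer, callback_fn):
        for epoch in range(self.args["max_epoch"]):
            batch = train_buffer.sample(self.args["batch_size"])
            batch.to_torch(device=self.args["device"])
            # Placeholder behavior-cloning style loss.
            loss = ((self.actor(batch["obs"]).mode - batch["act"]) ** 2).mean()
            self.actor_optim.zero_grad()
            loss.backward()
            self.actor_optim.step()

            res = callback_fn(self.get_policy())  # online evaluation callback
            res["loss"] = loss.item()
            self.log_res(epoch, res)              # Aim tracking + policy.pt checkpoint
        return self.report_result

    def get_policy(self):
        return self.actor
```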
/offlinerl/algo/dynamics_model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/dynamics_model/__init__.py
--------------------------------------------------------------------------------
/offlinerl/algo/modelbase/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/modelbase/__init__.py
--------------------------------------------------------------------------------
/offlinerl/algo/modelbase/mopo.py:
--------------------------------------------------------------------------------
1 | # MOPO: Model-based Offline Policy Optimization
2 | # https://arxiv.org/abs/2005.13239
3 | # https://github.com/tianheyu927/mopo
4 | import os
5 | import torch
6 | import numpy as np
7 | from copy import deepcopy
8 | from loguru import logger
9 | from collections import deque
10 | from typing import Dict
11 |
12 | from offlinerl.algo.modelbase.model_base import algo_init, ModelBasedAlgoTrainer
13 |
14 |
15 | class AlgoTrainer(ModelBasedAlgoTrainer):
16 | def __init__(self, algo_init, args):
17 | super(AlgoTrainer, self).__init__(algo_init, args)
18 |
19 | self.fake_buffer_size = self.args["model_retain_epochs"] * self.args["rollout_batch_size"] * self.args["horizon"]
20 | self.lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.actor_optim, args['max_epoch'])
21 |
22 | def policy_learn(self, batch: Dict):
23 | real_batch, fake_batch = batch["real"], batch["fake"]
24 | mix_batch = {k: torch.cat([real_batch[k], fake_batch[k]], 0) for k in real_batch.keys()}
25 |
26 | obss, actions, next_obss, rewards, terminals = mix_batch["observations"], mix_batch["actions"], \
27 | mix_batch["next_observations"], mix_batch["rewards"], mix_batch["terminals"]
28 |
29 | # update critic
30 | q1, q2 = self.critic1(obss, actions), self.critic2(obss, actions)
31 | with torch.no_grad():
32 | next_actions, next_log_probs = self.actforward(next_obss)
33 | next_q = torch.min(
34 | self.target_critic1(next_obss, next_actions), self.target_critic2(next_obss, next_actions)
35 | ) - self._alpha * next_log_probs
36 | target_q = rewards + self._gamma * (1 - terminals) * next_q
37 |
38 | critic1_loss = ((q1 - target_q).pow(2)).mean()
39 | self.critic1_optim.zero_grad()
40 | critic1_loss.backward()
41 | self.critic1_optim.step()
42 |
43 | critic2_loss = ((q2 - target_q).pow(2)).mean()
44 | self.critic2_optim.zero_grad()
45 | critic2_loss.backward()
46 | self.critic2_optim.step()
47 |
48 | # update actor
49 | a, log_probs = self.actforward(obss)
50 | q1a, q2a = self.critic1(obss, a), self.critic2(obss, a)
51 |
52 | actor_loss = - torch.min(q1a, q2a).mean() + self._alpha * log_probs.mean()
53 | self.actor_optim.zero_grad()
54 | actor_loss.backward()
55 | self.actor_optim.step()
56 |
57 | if self._is_auto_alpha:
58 | log_probs = log_probs.detach() + self._target_entropy
59 | alpha_loss = -(self._log_alpha * log_probs).mean()
60 | self.alpha_optim.zero_grad()
61 | alpha_loss.backward()
62 | self.alpha_optim.step()
63 | self._alpha = torch.clamp(self._log_alpha.detach().exp(), 0.0, 1.0)
64 |
65 | self._sync_weight()
66 |
67 | result = {
68 | "loss/actor": actor_loss.item(),
69 | "loss/critic1": critic1_loss.item(),
70 | "loss/critic2": critic2_loss.item(),
71 | }
72 |
73 | if self._is_auto_alpha:
74 | result["loss/alpha"] = alpha_loss.item()
75 | result["alpha"] = self._alpha.item()
76 |
77 | return result
78 |
--------------------------------------------------------------------------------
/offlinerl/algo/modelfree/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/modelfree/__init__.py
--------------------------------------------------------------------------------
/offlinerl/algo/modelfree/bc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from copy import deepcopy
3 | from loguru import logger
4 |
5 | from offlinerl.algo.base import BaseAlgo
6 | from offlinerl.utils.net.continuous import GaussianActor
7 | from offlinerl.utils.exp import setup_seed
8 |
9 |
10 | def algo_init(args):
11 | logger.info('Run algo_init function')
12 |
13 | setup_seed(args['seed'])
14 |
15 | if args["obs_shape"] and args["action_shape"]:
16 | obs_shape, action_shape = args["obs_shape"], args["action_shape"]
17 | max_action = args["max_action"]
18 | elif "task" in args.keys():
19 | from offlinerl.utils.env import get_env_shape, get_env_action_range
20 | obs_shape, action_shape = get_env_shape(args['task'])
21 | max_action, _ = get_env_action_range(args["task"])
22 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape
23 | else:
24 | raise NotImplementedError
25 |
26 | actor = GaussianActor(obs_shape, action_shape, args['actor_features'], args['actor_layers']).to(args['device'])
27 | actor_optim = torch.optim.Adam(actor.parameters(), lr=args['actor_lr'])
28 |
29 | return {
30 | "actor" : {"net" : actor, "opt" : actor_optim},
31 | }
32 |
33 |
34 | class AlgoTrainer(BaseAlgo):
35 | def __init__(self, algo_init, args):
36 | super(AlgoTrainer, self).__init__(args)
37 | self.args = args
38 |
39 | self.actor = algo_init['actor']['net']
40 | self.actor_optim = algo_init['actor']['opt']
41 |
42 | self.batch_size = self.args['batch_size']
43 | self.device = self.args['device']
44 |
45 | self.best_actor = deepcopy(self.actor)
46 | self.best_loss = float('inf')
47 |
48 | def train(self, train_buffer, val_buffer, callback_fn):
49 | if val_buffer is None:
50 | from offlinerl.utils.data import SampleBatch
51 | ori_buffer = deepcopy(train_buffer)
52 | sep_len = int(len(ori_buffer)*0.1)
53 | val_buffer = SampleBatch(ori_buffer[-sep_len:])
54 | train_buffer = SampleBatch(ori_buffer[:-sep_len])
55 | # breakpoint()
56 | for epoch in range(self.args['max_epoch']):
57 | for i in range(self.args['steps_per_epoch']):
58 | batch_data = train_buffer.sample(self.batch_size)
59 | batch_data.to_torch(device=self.device)
60 | obs = batch_data['obs']
61 | action = batch_data['act']
62 |
63 | action_dist = self.actor(obs)
64 | # loss = - action_dist.log_prob(action).mean()
65 | loss = ((action_dist.mode - action) ** 2).mean()
66 |
67 | self.actor_optim.zero_grad()
68 | loss.backward()
69 | self.actor_optim.step()
70 |
71 | with torch.no_grad():
72 | val_loss = 0
73 | for i in range(len(val_buffer) // self.batch_size + (len(val_buffer) % self.batch_size > 0)):
74 | batch_data = val_buffer[i*self.batch_size:(i+1)*self.batch_size]
75 | batch_data.to_torch(device=self.device)
76 | obs = batch_data['obs']
77 | action = batch_data['act']
78 |
79 | action_dist = self.actor(obs)
80 | val_loss += ((action_dist.mean - action) ** 2).mean().item()
81 |
82 | if val_loss < self.best_loss:
83 | self.best_loss = val_loss
84 | self.best_actor.load_state_dict(self.actor.state_dict())
85 |
86 | res = callback_fn(self.get_policy())
87 | res['loss'] = val_loss
88 | self.log_res(epoch, res)
89 |
90 | return self.report_result
91 |
92 | def get_policy(self):
93 | return self.best_actor
94 |
--------------------------------------------------------------------------------
/offlinerl/algo/modelfree/bcqd.py:
--------------------------------------------------------------------------------
1 | #Discrete Batch-Constrained deep Q-Learning (BCQ)
2 | import copy
3 |
4 | import torch
5 | import numpy as np
6 | from torch import nn
7 | from torch import optim
8 | import torch.nn.functional as F
9 | from loguru import logger
10 |
11 | from offlinerl.algo.base import BaseAlgo
12 | from offlinerl.utils.net.bcq_net import Conv_Q, FC_Q
13 | from offlinerl.utils.exp import setup_seed
14 |
15 |
16 | def algo_init(args):
17 | logger.info('Run algo_init function')
18 |
19 | setup_seed(args['seed'])
20 |
21 | if args["obs_shape"] and args["action_shape"]:
22 | obs_shape, action_shape = args["obs_shape"], args["action_shape"]
23 | elif "task" in args.keys():
24 | from offlinerl.utils.env import get_env_shape
25 | obs_shape, action_shape = get_env_shape(args['task'])
26 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape
27 | else:
28 | raise NotImplementedError
29 |
30 | if isinstance(args["obs_shape"], int):
31 | state_dim = (
32 | 4,
33 | 84,
34 | 84
35 | )
36 |
37 | critic = Conv_Q(state_dim[0], args["action_shape"]).to(args['device'])
38 | else:
39 | critic = FC_Q(np.prod(args["obs_shape"]), args["action_shape"]).to(args['device'])
40 |
41 | critic_opt = optim.Adam(critic.parameters(), **args["optimizer_parameters"])
42 |
43 |
44 | nets = {
45 | "critic" : {"net" : critic, "opt" : critic_opt},
46 |
47 | }
48 |
49 | return nets
50 |
51 |
52 | class AlgoTrainer(BaseAlgo):
53 | def __init__(self, algo_init, args):
54 | super(AlgoTrainer, self).__init__(args)
55 | self.args = args
56 |
57 | self.Q = algo_init["critic"]["net"]
58 | self.Q_target = copy.deepcopy(self.Q)
59 | self.Q_optimizer = algo_init["critic"]["opt"]
60 |
61 | self.discount = self.args["discount"]
62 |
63 | # Target update rule
64 | self.maybe_update_target = self.polyak_target_update if self.args["polyak_target_update"] else self.copy_target_update
65 | self.target_update_frequency = self.args["target_update_frequency"]
66 | self.tau = self.args["tau"]
67 |
68 | # Decay for eps
69 | self.initial_eps = self.args["initial_eps"]
70 | self.end_eps = self.args["end_eps"]
71 | self.slope = (self.end_eps - self.initial_eps) / self.args["eps_decay_period"]
72 |
73 | # Evaluation hyper-parameters
74 | self.state_shape = (-1, self.args["obs_shape"]) if isinstance(self.args["obs_shape"], int) else (-1,) + tuple(self.args["obs_shape"])
75 | self.eval_eps = self.args["eval_eps"]
76 | self.num_actions = self.args["action_shape"]
77 |
78 | # Threshold for "unlikely" actions
79 | self.threshold = self.args["BCQ_threshold"]
80 |
81 | # Number of training iterations
82 | self.iterations = 0
83 |
84 | def train(self, train_buffer, val_buffer, callback_fn):
85 | training_iters = 0
86 | while training_iters < self.args["max_timesteps"]:
87 |
88 | # Sample replay buffer
89 | batch = train_buffer.sample(self.args["batch_size"])
90 | batch = batch.to_torch(dtype=torch.float32, device=self.args["device"])
91 | reward = batch.rew
92 | done = batch.done
93 | state = batch.obs
94 | action = batch.act.to(torch.int64)
95 | next_state = batch.obs_next
96 |
97 | # Compute the target Q value
98 | with torch.no_grad():
99 | q, imt, i = self.Q(next_state)
100 | imt = imt.exp()
101 | imt = (imt/imt.max(1, keepdim=True)[0] > self.threshold).float()
102 |
103 | # Use large negative number to mask actions from argmax
104 | next_action = (imt * q + (1 - imt) * -1e8).argmax(1, keepdim=True)
105 |
106 | q, imt, i = self.Q_target(next_state)
107 | target_Q = reward + done * self.discount * q.gather(1, next_action).reshape(-1, 1)
108 |
109 | # Get current Q estimate
110 | current_Q, imt, i = self.Q(state)
111 |
112 | current_Q = current_Q.gather(1, action)
113 |
114 | # Compute Q loss
115 | q_loss = F.smooth_l1_loss(current_Q, target_Q)
116 | i_loss = F.nll_loss(imt, action.reshape(-1))
117 |
118 | Q_loss = q_loss + i_loss + 1e-2 * i.pow(2).mean()
119 |
120 | # Optimize the Q
121 | self.Q_optimizer.zero_grad()
122 | Q_loss.backward()
123 | self.Q_optimizer.step()
124 |
125 | # Update target network by polyak or full copy every X iterations.
126 | self.maybe_update_target()
127 | training_iters += 1
128 | #print(training_iters ,self.args["eval_freq"])
129 | if training_iters % self.args["eval_freq"] == 0:
130 | res = callback_fn(self.get_policy())
131 |
132 | self.log_res(training_iters // self.args["eval_freq"], res)
133 |
134 | return self.report_result
135 |
136 |
137 | def polyak_target_update(self):
138 | for param, target_param in zip(self.Q.parameters(), self.Q_target.parameters()):
139 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
140 |
141 |
142 | def copy_target_update(self):
143 | if self.iterations % self.target_update_frequency == 0:
144 | self.Q_target.load_state_dict(self.Q.state_dict())
145 |
146 | def save(self, filename):
147 | torch.save(self.Q.state_dict(), filename + "_Q")
148 | torch.save(self.Q_optimizer.state_dict(), filename + "_optimizer")
149 |
150 |
151 | def load(self, filename):
152 | self.Q.load_state_dict(torch.load(filename + "_Q"))
153 | self.Q_target = copy.deepcopy(self.Q)
154 | self.Q_optimizer.load_state_dict(torch.load(filename + "_optimizer"))
155 |
156 | def get_policy(self,):
157 | return self.Q
158 |
159 | def save_model(self):
160 | pass
--------------------------------------------------------------------------------
/offlinerl/algo/modelfree/crr.py:
--------------------------------------------------------------------------------
1 | # Critic regularized regression
2 | # Paper: https://arxiv.org/abs/2006.15134
3 |
4 | import torch
5 | from copy import deepcopy
6 | from loguru import logger
7 |
8 | from offlinerl.algo.base import BaseAlgo
9 | from offlinerl.utils.net.common import Net
10 | from offlinerl.utils.net.continuous import DistributionalCritic
11 | from offlinerl.utils.net.tanhpolicy import TanhGaussianPolicy
12 | from offlinerl.utils.exp import setup_seed
13 |
14 | def algo_init(args):
15 | logger.info('Run algo_init function')
16 |
17 | setup_seed(args['seed'])
18 |
19 | if args["obs_shape"] and args["action_shape"]:
20 | obs_shape, action_shape = args["obs_shape"], args["action_shape"]
21 | max_action = args["max_action"]
22 | elif "task" in args.keys():
23 | from offlinerl.utils.env import get_env_shape, get_env_action_range
24 | obs_shape, action_shape = get_env_shape(args['task'])
25 | max_action, _ = get_env_action_range(args["task"])
26 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape
27 | else:
28 | raise NotImplementedError
29 |
30 | net_a = Net(layer_num=args['hidden_layers'],
31 | state_shape=obs_shape,
32 | hidden_layer_size=args['hidden_features'])
33 |
34 | actor = TanhGaussianPolicy(preprocess_net=net_a,
35 | action_shape=action_shape,
36 | hidden_layer_size=args['hidden_features'],
37 | conditioned_sigma=True).to(args['device'])
38 |
39 | actor_optim = torch.optim.Adam(actor.parameters(), lr=args['lr'])
40 |
41 | critic = DistributionalCritic(obs_shape, action_shape, args['atoms'],
42 | args['hidden_features'], args['hidden_layers'],
43 | None, None).to(args['device'])
44 | critic_optim = torch.optim.Adam(critic.parameters(), lr=args['lr'])
45 |
46 | return {
47 | "actor" : {"net" : actor, "opt" : actor_optim},
48 | "critic" : {"net" : critic, "opt" : critic_optim},
49 | }
50 |
51 |
52 | class AlgoTrainer(BaseAlgo):
53 | def __init__(self, algo_init, args):
54 | super(AlgoTrainer, self).__init__(args)
55 | self.args = args
56 |
57 | self.actor = algo_init['actor']['net']
58 | self.actor_target = deepcopy(self.actor)
59 | self.actor_target.requires_grad_(False)
60 | self.actor_optim = algo_init['actor']['opt']
61 |
62 | self.critic = algo_init['critic']['net']
63 | self.critic_target = deepcopy(self.critic)
64 | self.critic_target.requires_grad_(False)
65 | self.critic_optim = algo_init['critic']['opt']
66 |
67 | self.batch_size = self.args['batch_size']
68 | self.gamma = self.args['gamma']
69 | self.beta = self.args['beta']
70 | self.m = self.args['advantage_samples']
71 | self.advantage_mode = self.args['advantage_mode']
72 | self.weight_mode = self.args['weight_mode']
73 | self.device = self.args['device']
74 |
75 | def train(self, train_buffer, val_buffer, callback_fn):
76 | rewards = train_buffer['rew']
77 | self.critic.set_interval(rewards.min() / (1 - self.gamma), rewards.max() / (1 - self.gamma))
78 | self.critic_target.set_interval(rewards.min() / (1 - self.gamma), rewards.max() / (1 - self.gamma))
79 | for epoch in range(self.args['max_epoch']):
80 | for i in range(self.args['steps_per_epoch']):
81 | batch_data = train_buffer.sample(self.batch_size)
82 | batch_data.to_torch(device=self.device)
83 | obs = batch_data['obs']
84 | action = batch_data['act']
85 | next_obs = batch_data['obs_next']
86 | reward = batch_data['rew']
87 | done = batch_data['done'].float()
88 |
89 | # update critic
90 | p = self.critic(obs, action)
91 | next_action = self.actor_target.get_action(next_obs)
92 | target_p = self.critic_target.get_target(next_obs, next_action, reward, self.gamma * (1 - done))
93 | critic_loss = - (target_p * torch.log(p + 1e-8)).mean()
94 |
95 | self.critic_optim.zero_grad()
96 | critic_loss.backward()
97 | self.critic_optim.step()
98 |
99 | # update actor
100 | action_dist = self.actor(obs)
101 | log_prob = action_dist.log_prob(action)
102 | actions = torch.stack([action_dist.sample() for _ in range(self.m)], dim=0)
103 | repeat_obs = torch.repeat_interleave(obs.unsqueeze(0), self.m, 0)
104 | _, values = self.critic(repeat_obs, actions, with_q=True)
105 | _, value = self.critic(obs, action, with_q=True)
106 |
107 | if self.advantage_mode == 'mean':
108 | advantage = value - values.mean(dim=0)
109 | elif self.advantage_mode == 'max':
110 | advantage = value - values.max(dim=0)[0]
111 |
112 | if self.weight_mode == 'exp':
113 | weight = torch.exp(advantage / self.beta)
114 | elif self.weight_mode == 'binary':
115 | weight = (advantage > 0).float()
116 |
117 | weight = torch.clamp_max(weight, 20).detach()
118 | actor_loss = - torch.mean(weight * log_prob)
119 |
120 | self.actor_optim.zero_grad()
121 | actor_loss.backward()
122 | self.actor_optim.step()
123 |
124 | if i % self.args['update_frequency'] == 0:
125 | self._sync_weight(self.critic_target, self.critic, 1.0)
126 | self._sync_weight(self.actor_target, self.actor, 1.0)
127 | print("actor_loss: ", actor_loss.item())
128 | res = callback_fn(self.get_policy())
129 |
130 | self.log_res(epoch, res)
131 |
132 | return self.report_result
133 |
134 | def get_policy(self):
135 | return self.actor
--------------------------------------------------------------------------------
/offlinerl/algo/modelfree/td3bc.py:
--------------------------------------------------------------------------------
1 | # A Minimalist Approach to Offline Reinforcement Learning
2 | # https://arxiv.org/pdf/2106.06860
3 | # https://github.com/sfujim/TD3_BC
4 | import torch
5 | from copy import deepcopy
6 | from loguru import logger
7 | import torch.nn.functional as F
8 |
9 | from offlinerl.algo.base import BaseAlgo
10 | from offlinerl.utils.net.common import MLP,Net
11 | from offlinerl.utils.net.tanhpolicy import TanhGaussianPolicy
12 | from offlinerl.utils.exp import setup_seed
13 |
14 |
15 | def algo_init(args):
16 | logger.info('Run algo_init function')
17 | setup_seed(args['seed'])
18 | if args["obs_shape"] and args["action_shape"]:
19 | obs_shape, action_shape = args["obs_shape"], args["action_shape"]
20 | max_action = args["max_action"]
21 | elif "task" in args.keys():
22 | from offlinerl.utils.env import get_env_shape, get_env_action_range
23 | obs_shape, action_shape = get_env_shape(args['task'])
24 | max_action, _ = get_env_action_range(args["task"])
25 | args["obs_shape"], args["action_shape"] = obs_shape, action_shape
26 | else:
27 | raise NotImplementedError
28 |
29 | net_a = Net(layer_num = args['actor_layers'],
30 | state_shape = obs_shape,
31 | hidden_layer_size = args['actor_features'])
32 |
33 | actor = TanhGaussianPolicy(preprocess_net = net_a,
34 | action_shape = action_shape,
35 | hidden_layer_size = args['actor_features'],
36 | conditioned_sigma = True,
37 | ).to(args['device'])
38 |
39 | actor_optim = torch.optim.Adam(actor.parameters(), lr=args['actor_lr'])
40 |
41 | critic_1 = MLP(obs_shape + action_shape, 1, args['value_features'], args['value_layers'], hidden_activation='relu').to(args['device'])
42 | critic_2 = MLP(obs_shape + action_shape, 1, args['value_features'], args['value_layers'], hidden_activation='relu').to(args['device'])
43 | critic_1_optim = torch.optim.Adam([*critic_1.parameters()], lr=args['critic_lr'])
44 | critic_2_optim = torch.optim.Adam([*critic_2.parameters()], lr=args['critic_lr'])
45 |
46 | nets = {
47 | "actor" : {"net" : actor, "opt" : actor_optim},
48 | "critic" : {"net" : [critic_1, critic_2], "opt" : [critic_1_optim,critic_2_optim]},
49 |
50 | }
51 |
52 | return nets
53 |
54 |
55 | class AlgoTrainer(BaseAlgo):
56 | def __init__(self, algo_init, args):
57 | super(AlgoTrainer, self).__init__(args)
58 | self.args = args
59 |
60 | self.actor = algo_init['actor']['net']
61 | self.actor_optim = algo_init['actor']['opt']
62 |
63 | self.critic_1, self.critic_2 = algo_init['critic']['net']
64 | self.target_critic_1 = deepcopy(self.critic_1)
65 | self.target_critic_2 = deepcopy(self.critic_2)
66 | self.critic_1_optim = algo_init['critic']['opt'][0]
67 | self.critic_2_optim = algo_init['critic']['opt'][1]
68 |
69 | self.alpha = self.args['alpha']
70 | self.policy_noise = self.args['policy_noise']
71 | self.noise_clip = self.args['noise_clip']
72 | self.policy_freq = self.args['policy_freq']
73 | self.discount = self.args['discount']
74 |
75 | self.batch_size = self.args['batch_size']
76 | self.device = self.args['device']
77 | self.max_action = 1
78 |
79 |
80 | def forward(self, obs, reparameterize=True, return_log_prob=True):
81 | log_prob = None
82 | tanh_normal = self.actor(obs,reparameterize=reparameterize,)
83 | if return_log_prob:
84 | if reparameterize is True:
85 | action, pre_tanh_value = tanh_normal.rsample(
86 | return_pretanh_value=True
87 | )
88 | else:
89 | action, pre_tanh_value = tanh_normal.sample(
90 | return_pretanh_value=True
91 | )
92 | log_prob = tanh_normal.log_prob(
93 | action,
94 | pre_tanh_value=pre_tanh_value
95 | )
96 | log_prob = log_prob.sum(dim=1, keepdim=True)
97 | else:
98 | if reparameterize is True:
99 | action = tanh_normal.rsample()
100 | else:
101 | action = tanh_normal.sample()
102 | return action, log_prob
103 |
104 | def train(self, train_buffer, val_buffer, callback_fn):
105 | # train_buffer
106 | obs_mean = train_buffer["obs"].mean(0)
107 | obs_std = train_buffer["obs"].std(0) + 1e-3
108 | obs_mean = torch.as_tensor(obs_mean, dtype=torch.float32)
109 | obs_std = torch.as_tensor(obs_std, dtype=torch.float32)
110 | self.actor.preprocess.s_mean = obs_mean
111 | self.actor.preprocess.s_std = obs_std
112 |
113 | self.target_actor = deepcopy(self.actor)
114 |
115 | for epoch in range(self.args['max_epoch']):
116 | for i in range(self.args['steps_per_epoch']):
117 | batch_data = train_buffer.sample(self.batch_size)
118 | batch_data.to_torch(device=self.device)
119 |
120 | obs = batch_data['obs']
121 | action = batch_data['act']
122 | next_obs = batch_data['obs_next']
123 | reward = batch_data['rew']
124 | done = batch_data['done'].float()
125 |
126 | with torch.no_grad():
127 | noise = (torch.randn_like(action) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip)
128 | next_action = (self.target_actor(next_obs).mode + noise).clamp(-self.max_action, self.max_action)
129 | next_obs_action = torch.cat([next_obs, next_action], dim=-1)
130 | target_q = torch.min(
131 | self.target_critic_1(next_obs_action), self.target_critic_2(next_obs_action)
132 | )*self.discount*(1-done) + reward
133 |
134 | obs_action = torch.cat([obs, action], dim=-1)
135 | current_q1, current_q2 = self.critic_1(obs_action), self.critic_2(obs_action)
136 | critic_loss = F.mse_loss(current_q1, target_q) + F.mse_loss(current_q2, target_q)
137 |
138 | # Optimize the critic
139 | self.critic_1_optim.zero_grad()
140 | self.critic_2_optim.zero_grad()
141 | critic_loss.backward()
142 | self.critic_1_optim.step()
143 | self.critic_2_optim.step()
144 |
145 |
146 | if i % self.policy_freq == 0:
147 | pi = self.actor(obs).mode
148 | q = self.critic_1(torch.cat([obs, pi], dim=-1))
149 | lmbda = self.alpha / q.abs().mean().detach()
150 | actor_loss = -lmbda * q.mean() + F.mse_loss(pi, action)
151 |
152 | self.actor_optim.zero_grad()
153 | actor_loss.backward()
154 | self.actor_optim.step()
155 |
156 | self._sync_weight(self.target_actor, self.actor, soft_target_tau=self.args['soft_target_tau'])
157 | self._sync_weight(self.target_critic_1, self.critic_1, soft_target_tau=self.args['soft_target_tau'])
158 | self._sync_weight(self.target_critic_2, self.critic_2, soft_target_tau=self.args['soft_target_tau'])
159 |
160 | res = callback_fn(self.get_policy())
161 |
162 | res.update({
163 | "actor_loss" : actor_loss.item(),
164 | "critic_loss" : critic_loss.item(),
165 | "lmbda" : lmbda.item(),
166 | "q" : q.mean().item(),
167 | })
168 |
169 |
170 | self.log_res(epoch, res)
171 |
172 | return self.report_result
173 |
174 | def get_model(self):
175 | return self.actor
176 |
177 | def get_policy(self):
178 | return self.actor
--------------------------------------------------------------------------------
/offlinerl/algo/online/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/algo/online/__init__.py
--------------------------------------------------------------------------------
/offlinerl/algo/online/bremen.py:
--------------------------------------------------------------------------------
1 | # Deployment-Efficient Reinforcement Learning via Model-Based Offline Optimization
2 | # https://arxiv.org/abs/2006.03647
3 | # https://github.com/matsuolab/BREMEN
4 |
5 | # TODO
--------------------------------------------------------------------------------
/offlinerl/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/config/__init__.py
--------------------------------------------------------------------------------
/offlinerl/config/algo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/config/algo/__init__.py
--------------------------------------------------------------------------------
/offlinerl/config/algo/bc_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | actor_features = 256
16 | actor_layers = 2
17 |
18 | batch_size = 256
19 | steps_per_epoch = 1000
20 | max_epoch = 1000
21 |
22 | actor_lr = 1e-3
23 |
24 | #tune
25 | params_tune = {
26 | "actor_lr" : {"type" : "continuous", "value": [1e-4, 1e-3]},
27 | }
28 |
29 | #tune
30 | grid_tune = {
31 | "actor_lr" : [1e-4, 5e-4, 1e-3],
32 | "actor_layers" : [2,3],
33 | }
34 |
35 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/bc_model_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | # model save path
16 | dynamics_path = None
17 | dynamics_save_path = None
18 |
19 | # transition model train
20 | transition_init_num = 7
21 | transition_select_num = 5
22 | val_ratio = 0.2
23 | max_epochs_since_update = 10
24 | transition_max_epochs = None
25 |
26 | # trick config
27 | normalize_obs = False
28 | transition_scaler = True
29 |
30 | # transition config
31 | transition_batch_size = 256
32 | transition_lr = 1e-3
33 | logvar_loss_coef = 0.01
34 | dynamics_hidden_dims = [200, 200, 200, 200]
35 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4]
36 |
37 | #tune
38 | params_tune = {
39 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]},
40 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]},
41 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]},
42 | "lam" : {"type" : "continuous", "value": [0.1, 10]},
43 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]},
44 | }
45 |
46 | #tune
47 | grid_tune = {
48 | "transition_scaler" : [True, False],
49 | "transition_lr" : [1e-3, 3e-4],
50 | "logvar_loss_coef" : [0.01, 1e-3],
51 | }
52 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/bcq_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | vae_features = 750
16 | vae_layers = 2
17 | jitter_features = 400
18 | jitter_layers = 2
19 | value_features = 400
20 | value_layers = 2
21 | phi = 0.05
22 | lam = 0.75
23 |
24 | batch_size = 100
25 | steps_per_epoch = 5000
26 | max_epoch = 200
27 |
28 | vae_lr = 1e-3
29 | jitter_lr = 3e-4
30 | critic_lr = 3e-4
31 | gamma = 0.99
32 | soft_target_tau = 5e-3
33 |
34 | #tune
35 | params_tune = {
36 | "phi" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]},
37 | "lam" : {"type" : "continuous", "value": [0, 1]},
38 | }
39 |
40 | #tune
41 | grid_tune = {
42 | "phi" : [0.05, 0.1, 0.2, 0.5],
43 | }
44 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/bcqd_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 |
16 | max_timesteps = 1e6
17 | eval_freq = 1e3
18 |
19 | optimizer_parameters = {
20 | "lr": 3e-4,
21 | }
22 |
23 | BCQ_threshold = 0.3
24 |
25 | discount = 0.99
26 | tau = 0.005
27 | polyak_target_update = True
28 | target_update_frequency=1
29 | start_timesteps = 1e3
30 | initial_eps = 0.1
31 | end_eps = 0.1
32 | eps_decay_period = 1
33 | eval_eps = 0.001
34 | buffer_size = 1e6
35 | batch_size = 256
36 | train_freq = 1
--------------------------------------------------------------------------------
/offlinerl/config/algo/bremen_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
10 | obs_shape = None
11 | act_shape = None
12 |
13 | dynamics_path = None
14 | behavior_path = None
15 |
16 | transition_hidden_size = 256
17 | transition_hidden_layers = 4
18 | transition_init_num = 7
19 | transition_select_num = 5
20 |
21 | actor_hidden_size = 256
22 | actor_hidden_layers = 2
23 | value_hidden_size = 256
24 | value_hidden_layers = 2
25 |
26 | transition_batch_size = 256
27 | data_collection_per_epoch = 50000
28 | max_epoch = 250
29 | trpo_steps_per_epoch = 25
30 |
31 | bc_batch_size = 256
32 | bc_init = True
33 |
34 | transition_lr = 1e-3
35 | bc_lr = 1e-3
36 | value_lr = 3e-4
37 |
38 | cg_iters = 10
39 | damping_coeff = 0.1
40 | backtrack_iters = 10
41 | backtrack_coeff = 0.8
42 | train_v_iters = 50
43 | trpo_step_size = 0.01
44 | explore_mode = 'sample'
45 | static_noise = 0.1
46 |
47 | horizon = 250
48 | gamma = 0.99
49 | lam = 0.95
50 |
51 | #tune
52 | params_tune = {
53 | "horizon" : {"type" : "discrete", "value": [250, 500, 1000]}
54 | }
55 |
56 | #tune
57 | grid_tune = {
58 | 'horizon' : [250, 1000],
59 | # 'trpo_step_size' : [0.01, 0.05],
60 | 'explore_mode' : ['sample', 'static'],
61 | }
62 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/combo_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | # model save path
16 | dynamics_path = None
17 | dynamics_save_path = None
18 |
19 | # transition model train
20 | transition_init_num = 7
21 | transition_select_num = 5
22 | val_ratio = 0.2
23 | max_epochs_since_update = 5
24 | transition_max_epochs = None
25 |
26 | # trick config
27 | trainsition_clip = True
28 | normalize_obs = False
29 | transition_scaler = True
30 | policy_scaler = False
31 |
32 | # transition config
33 | transition_batch_size = 256
34 | transition_lr = 1e-3
35 | logvar_loss_coef = 0.01
36 | dynamics_hidden_dims = [200, 200, 200, 200]
37 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4]
38 |
39 | # alpha config
40 | learnable_alpha = True
41 | alpha_lr = 1e-4
42 | alpha = 0.2
43 |
44 | # train config
45 | horizon = 1
46 | real_data_ratio = 0.5
47 | max_epoch = 1000
48 | steps_per_epoch = 1000
49 | rollout_freq = 1000
50 | rollout_batch_size = 5e+4
51 |
52 | # policy config
53 | hidden_dims = [256, 256, 256]
54 | policy_batch_size = 256
55 | actor_lr = 1e-4
56 |
57 | # critic config
58 | critic_lr = 3e-4
59 | discount = 0.99
60 | soft_target_tau = 5e-3
61 | target_entropy = None
62 |
63 | # others
64 | val_frequency = 10
65 | eval_episodes = 10
66 | model_retain_epochs = 5
67 |
68 | # combo config
69 | cql_weight = 2.5
70 | temperatue = 1.0
71 | max_q_backup = False
72 | deterministic_backup = True
73 | with_lagrange = False
74 | lagrange_threshold = 10.0
75 | cql_alpha_lr = 3e-4
76 | num_repeat_actions = 10
77 | uniform_rollout = False
78 | rho_s = "mix" # choose from ["model", "mix"]
79 |
80 | #tune
81 | params_tune = {
82 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]},
83 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]},
84 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]},
85 | "lam" : {"type" : "continuous", "value": [0.1, 10]},
86 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]},
87 | }
88 |
89 | #tune
90 | grid_tune = {
91 | "horizon" : [1, 5],
92 | "cql_weight" : [2.5, 3.5, 5],
93 | "rho_s": ["model", "mix"],
94 | }
95 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/cql_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | max_epoch = 1000
16 | steps_per_epoch = 1000
17 | policy_bc_steps = 40000
18 |
19 | batch_size = 256
20 | hidden_layer_size = 256
21 | layer_num = 2
22 | actor_lr=1E-4
23 | critic_lr=3E-4
24 | reward_scale=1
25 | use_automatic_entropy_tuning=True
26 | target_entropy = None
27 | discount = 0.99
28 | soft_target_tau=5e-3
29 |
30 | # min Q
31 | explore=1.0
32 | temp=1.0
33 | min_q_version=3
34 | min_q_weight=5.0
35 | # lagrange
36 | with_lagrange=False
37 | lagrange_thresh=2.0
38 |
39 | # extra params
40 | num_random=10
41 | type_q_backup= "min"
42 | q_backup_lmbda = 0.75
43 | deterministic_backup=False
44 |
45 | discrete = False
46 |
47 | #tune
48 | params_tune = {
49 | "actor_lr" : {"type" : "discrete", "value":[1e-4, 3e-4]},
50 | "min_q_version" : {"type" : "discrete", "value":[2, 3]},
51 | "min_q_weight" : {"type": "discrete", "value":[5, 10]},
52 | "lagrange_thresh" : {"type": "discrete", "value":[-1, 2, 5, 10]},
53 | "type_q_backup" : {"type": "discrete", "value":["max", "none"]},
54 | }
55 |
56 | #tune
57 | grid_tune = {
58 | #"actor_lr" : [1e-4, 3e-4],
59 | "min_q_version" : [2, 3],
60 | "min_q_weight" : [5, 10],
61 | "lagrange_thresh" : [-1, 2, 5, 10],
62 | # "type_q_backup" : ["min", "none"],
63 | }
64 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/crr_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | hidden_features = 256
16 | hidden_layers = 2
17 | atoms = 21
18 |
19 | advantage_mode = 'mean'
20 | weight_mode = 'exp'
21 | advantage_samples = 4
22 | beta = 1.0
23 | gamma = 0.99
24 |
25 | batch_size = 1024
26 | steps_per_epoch = 1000
27 | max_epoch = 200
28 |
29 | lr = 1e-4
30 | update_frequency = 100
31 |
32 | #tune
33 | params_tune = {
34 | "beta" : {"type" : "continuous", "value": [0.0, 10.0]},
35 | }
36 |
37 | #tune
38 | grid_tune = {
39 | "advantage_mode" : ['mean', 'max'],
40 | "weight_mode" : ['exp', 'binary'],
41 | }
42 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/edac_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 | # from datetime import datetime
4 |
5 | task = "Hopper-v3"
6 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
7 |
8 |
9 | # parser.add_argument("--algo-name", type=str, default="edac")
10 | # parser.add_argument("--task", type=str, default="SafetyHalfCheetah")
11 | obs_shape = None
12 | act_shape = None
13 |
14 | seed = 42
15 | actor_lr=1e-4
16 | critic_lr=3e-4
17 | task_train_num = 99
18 | task_data_type = 'high'
19 | # hidden_dims=[256, 256, 256]
20 | hidden_layer_size = 256
21 | layer_num = 2
22 | gamma=0.99
23 | tau=0.005
24 | alpha=0.2
25 | auto_alpha=True
26 |
27 | target_entropy = None
28 | alpha_lr =1e-4
29 | num_critics = 50
30 |
31 | max_q_backup = False
32 | deterministic_backup=False
33 |
34 | eta=1.0
35 | normalize_reward=False
36 |
37 | epoch=3000
38 | step_per_epoch=1000
39 |
40 | eval_episodes=100
41 | batch_size=256
42 |
43 | #tune
44 | params_tune = {
45 | "num_critics" : {"type" : "discrete", "value":[10,50]},
46 | "eta" : {"type" : "discrete", "value":[1, 5]},
47 | }
48 |
49 | grid_tune = {
50 | "num_critics" : [10, 50],
51 | "eta" : [1, 5],
52 | }
53 |
54 |
55 | # task_data_type = "low"
56 | # task_train_num = 99
57 |
58 | # seed = 42
59 |
60 | # device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
61 | # obs_shape = None
62 | # act_shape = None
63 | # max_action = None
64 |
65 | # max_epoch = 300
66 | # steps_per_epoch = 1000
67 | # policy_bc_steps = 40000
68 |
69 | # batch_size = 256
70 | # hidden_layer_size = 256
71 | # layer_num = 2
72 | # actor_lr=1E-4
73 | # critic_lr=3E-4
74 | # reward_scale=1
75 | # use_automatic_entropy_tuning=True
76 | # target_entropy = None
77 | # discount = 0.99
78 | # soft_target_tau=5e-3
79 |
80 | # # min Q
81 | # explore=1.0
82 | # temp=1.0
83 | # min_q_version=3
84 | # min_q_weight=5.0
85 |
86 | # # lagrange
87 | # with_lagrange=False
88 | # lagrange_thresh=2.0
89 |
90 | # # extra params
91 | # num_random=10
92 | # type_q_backup= "min"
93 | # q_backup_lmbda = 0.75
94 | # deterministic_backup=False
95 |
96 | # discrete = False
97 |
98 | #tune
99 | # params_tune = {
100 | # "actor_lr" : {"type" : "discrete", "value":[1e-4, 3e-4]},
101 | # "min_q_version" : {"type" : "discrete", "value":[2, 3]},
102 | # "min_q_weight" : {"type": "discrete", "value":[5, 10]},
103 | # "lagrange_thresh" : {"type": "discrete", "value":[-1, 2, 5, 10]},
104 | # "type_q_backup" : {"type": "discrete", "value":["max", "none"]},
105 | # }
106 |
107 | # #tune
108 | # grid_tune = {
109 | # #"actor_lr" : [1e-4, 3e-4],
110 | # "min_q_version" : [2, 3],
111 | # "min_q_weight" : [5, 10],
112 | # "lagrange_thresh" : [-1, 2, 5, 10],
113 | # # "type_q_backup" : ["min", "none"],
114 | # }
115 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/maple_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | # device = 'cuda:0'
12 | obs_shape = None
13 | act_shape = None
14 | max_action = None
15 | # new parameters based on mopo
16 | lstm_hidden_unit = 128
17 | Guassain_hidden_sizes = (256,256)
18 | value_hidden_sizes=(256,256)
19 | hidden_sizes=(16,)
20 | model_pool_size = 250000
21 | rollout_batch_size = 50000
22 | handle_per_round = 400
23 | out_train_epoch = 1000
24 | in_train_epoch = 1000
25 |
26 | train_batch_size = 256 # number of trajectories per policy-training batch
27 |
28 | number_runs_eval = 40 # number of evaluation runs in MuJoCo
29 |
30 | #-------------
31 | dynamics_path = None
32 | dynamics_save_path = None
33 | only_dynamics = False
34 |
35 | hidden_layer_size = 256
36 | hidden_layers = 2
37 | transition_layers = 4
38 |
39 | transition_init_num = 20
40 | transition_select_num = 14
41 | # by selecting a number smaller than rollout_batch_size, you can protect the model rollout from OOM error
42 | mini_forward_size = -1
43 |
44 | real_data_ratio = 0.05
45 |
46 | transition_batch_size = 256
47 | policy_batch_size = 256
48 | data_collection_per_epoch = 50e3
49 | steps_per_epoch = 1000
50 | max_epoch = 1000
51 |
52 |
53 | eval_episodes = 100
54 |
55 | learnable_alpha = True
56 | uncertainty_mode = 'aleatoric'
57 | transition_lr = 1e-3
58 | actor_lr = 3e-4
59 | critic_lr = 3e-4
60 | discount = 0.99
61 | soft_target_tau = 5e-3
62 |
63 | horizon = 10
64 | lam = 0.25
65 |
66 | penalty_clip = 20
67 | mode = 'normalize' # 'normalize', 'local', 'noRes'
68 |
69 | #tune
70 | params_tune = {
71 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]},
72 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]},
73 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]},
74 | "lam" : {"type" : "continuous", "value": [0.1, 10]},
75 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]},
76 | }
77 |
78 | #tune
79 | grid_tune = {
80 | "horizon" : [1, 5],
81 | "lam" : [0.5, 1, 2, 5],
82 | "uncertainty_mode" : ['aleatoric', 'disagreement'],
83 | }
84 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/maple_config_new.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | # transition model train
16 | transition_init_num = 20
17 | transition_select_num = 14
18 | val_ratio = 0.2
19 | max_epochs_since_update = 5
20 | transition_max_epochs = None
21 |
22 | # trick config
23 | trainsition_clip = False
24 | normalize_obs = False # should be set to False
25 | transition_scaler = True
26 |
27 | # transition config
28 | transition_batch_size = 256
29 | transition_lr = 1e-3
30 | logvar_loss_coef = 0.01
31 | dynamics_hidden_dims = [200, 200, 200, 200]
32 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4]
33 |
34 | # new parameters based on mopo
35 | lstm_hidden_unit = 128
36 | Guassain_hidden_sizes = (256,256)
37 | value_hidden_sizes=(256,256)
38 | hidden_sizes=(16,)
39 | model_pool_size = 250000
40 | rollout_batch_size = 50000
41 | handle_per_round = 400
42 | out_train_epoch = 1000
43 | in_train_epoch = 1000
44 |
45 | train_batch_size = 256 # number of trajectories per policy-training batch
46 |
47 | number_runs_eval = 40 # number of evaluation runs in MuJoCo
48 |
49 | #-------------
50 | dynamics_path = None
51 | dynamics_save_path = None
52 | only_dynamics = False
53 |
54 | hidden_layer_size = 256
55 | hidden_layers = 2
56 |
57 | real_data_ratio = 0.05
58 |
59 | policy_batch_size = 256
60 | data_collection_per_epoch = 50e3
61 | steps_per_epoch = 1000
62 | max_epoch = 1000
63 |
64 | eval_episodes = 100
65 |
66 | # alpha config
67 | learnable_alpha = True
68 | alpha_lr = 1e-4
69 | alpha = 0.2
70 | target_entropy = None
71 |
72 | uncertainty_mode = 'aleatoric'
73 | actor_lr = 3e-4
74 | critic_lr = 3e-4
75 | discount = 0.99
76 | soft_target_tau = 5e-3
77 |
78 | horizon = 10
79 | penalty_coef = 0.25
80 |
81 | penalty_clip = 20
82 |
83 | #tune
84 | params_tune = {
85 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]},
86 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]},
87 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]},
88 | "lam" : {"type" : "continuous", "value": [0.1, 10]},
89 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]},
90 | }
91 |
92 | #tune
93 | grid_tune = {
94 | "horizon" : [1, 5],
95 | "lam" : [0.5, 1, 2, 5],
96 | "uncertainty_mode" : ['aleatoric', 'disagreement'],
97 | }
98 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/mcq_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | vae_features = 750
16 | vae_layers = 2
17 | actor_features = 400
18 | actor_layers = 2
19 | value_features = 400
20 | value_layers = 2
21 | lam = 0.95
22 |
23 | alpha = 0.2
24 | auto_alpha = True
25 | target_entropy = None
26 |
27 | batch_size = 256
28 | steps_per_epoch = 1000
29 | max_epoch = 1000
30 |
31 | vae_lr = 1e-3
32 | actor_lr = 3e-4
33 | critic_lr = 3e-4
34 | alpha_lr = 3e-4
35 | gamma = 0.99
36 | soft_target_tau = 5e-3
37 |
38 | num_sampled_actions = 10
39 | eval_episodes = 100
40 |
41 | #tune
42 | params_tune = {
43 | "lam" : {"type" : "continuous", "value": [0.3, 0.95]},
44 | }
45 |
46 | #tune
47 | grid_tune = {
48 | "lam" : [0.3,0.4,0.5, 0.6, 0.7, 0.8, 0.9, 0.95],
49 | "auto_alpha" : [True, False],
50 | }
51 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/mobile_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | # model save path
16 | dynamics_path = None
17 | dynamics_save_path = None
18 |
19 | # transition model train
20 | transition_init_num = 7
21 | transition_select_num = 5
22 | val_ratio = 0.2
23 | max_epochs_since_update = 5
24 | transition_max_epochs = None
25 |
26 | # trick config
27 | trainsition_clip = True
28 | normalize_obs = False
29 | transition_scaler = True
30 | policy_scaler = False
31 |
32 | # transition config
33 | transition_batch_size = 256
34 | transition_lr = 1e-3
35 | logvar_loss_coef = 0.01
36 | dynamics_hidden_dims = [200, 200, 200, 200]
37 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4]
38 |
39 | # alpha config
40 | learnable_alpha = True
41 | alpha_lr = 1e-4
42 | alpha = 0.2
43 |
44 | # train config
45 | horizon = 5
46 | real_data_ratio = 0.05
47 | max_epoch = 3000
48 | steps_per_epoch = 1000
49 | rollout_freq = 1000
50 | rollout_batch_size = 5e+4
51 |
52 | # policy config
53 | hidden_dims = [256, 256]
54 | policy_batch_size = 256
55 | actor_lr = 1e-4
56 |
57 | # critic config
58 | critic_lr = 3e-4
59 | discount = 0.99
60 | soft_target_tau = 5e-3
61 | target_entropy = None
62 |
63 | # others
64 | val_frequency = 10
65 | eval_episodes = 10
66 | model_retain_epochs = 5
67 |
68 | # mobile config
69 | num_q_ensemble = 2
70 | penalty_coef = 3.5
71 | num_samples = 10
72 |
73 | #tune
74 | params_tune = {
75 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]},
76 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]},
77 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]},
78 | "lam" : {"type" : "continuous", "value": [0.1, 10]},
79 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]},
80 | }
81 |
82 | #tune
83 | grid_tune = {
84 | "horizon" : [1, 5],
85 | "penalty_coef" : [0.5, 1.5, 2.5, 3.5],
86 | "real_data_ratio" :[0.05],
87 | }
88 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/moose_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | vae_iterations = 500000
16 | vae_hidden_size = 750
17 | vae_batch_size = 100
18 | vae_kl_weight = 0.5
19 | #vae_pretrain_model = "/tmp/vae_499999.pkl"
20 |
21 |
22 | latent = False
23 | layer_num = 3
24 | actor_batch_size = 100
25 | hidden_layer_size = 256
26 | actor_iterations = 500000
27 | vae_lr = 1e-4
28 | actor_lr = 1e-4
29 | critic_lr = 1e-3
30 | soft_target_tau = 0.005
31 | lmbda = 0.75
32 | discount = 0.99
33 |
34 | max_latent_action = 2
35 | phi = 0.05
36 |
37 | #tune
38 | params_tune = {
39 | "vae_iterations" : {"type" : "continuous", "value":[50000, 100000, 500000,]},
40 | "actor_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]},
41 | "vae_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]},
42 | "lmbda" :{"type": "discrete", "value":[0.0, 0.25, 0.5, 0.75, 1.0]},
43 | }
44 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/mopo_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | # model save path
16 | dynamics_path = None
17 | dynamics_save_path = None
18 |
19 | # transition model train
20 | transition_init_num = 7
21 | transition_select_num = 5
22 | val_ratio = 0.2
23 | max_epochs_since_update = 5
24 | transition_max_epochs = None
25 |
26 | # trick config
27 | trainsition_clip = False
28 | normalize_obs = False
29 | transition_scaler = True
30 | policy_scaler = False
31 |
32 | # transition config
33 | transition_batch_size = 256
34 | transition_lr = 1e-3
35 | logvar_loss_coef = 0.01
36 | dynamics_hidden_dims = [200, 200, 200, 200]
37 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4]
38 |
39 | # alpha config
40 | learnable_alpha = True
41 | alpha_lr = 1e-4
42 | alpha = 0.2
43 | target_entropy = None
44 |
45 | # train config
46 | horizon = 1
47 | real_data_ratio = 0.05
48 | max_epoch = 3000
49 | steps_per_epoch = 1000
50 | rollout_freq = 1000
51 | rollout_batch_size = 5e+4
52 |
53 | # policy config
54 | hidden_dims = [256, 256]
55 | policy_batch_size = 256
56 | actor_lr = 1e-4
57 |
58 | # critic config
59 | critic_lr = 3e-4
60 | discount = 0.99
61 | soft_target_tau = 5e-3
62 |
63 | # others
64 | model_retain_epochs = 5
65 |
66 | # mopo config
67 | uncertainty_mode = 'aleatoric'
68 | penalty_coef = 1
69 |
70 | #tune
71 | params_tune = {
72 | "buffer_size" : {"type" : "discrete", "value": [1e6, 2e6]},
73 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]},
74 | "horzion" : {"type" : "discrete", "value": [1, 2, 5]},
75 | "lam" : {"type" : "continuous", "value": [0.1, 10]},
76 | "learnable_alpha" : {"type" : "discrete", "value": [True, False]},
77 | }
78 |
79 | #tune
80 | grid_tune = {
81 | "horizon" : [1, 5],
82 | "penalty_coef" : [0.5, 1, 2, 5],
83 | "uncertainty_mode" : ['aleatoric', 'disagreement'],
84 | }
85 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/plas_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | vae_iterations = 500000
16 | vae_hidden_size = 750
17 | vae_batch_size = 100
18 | vae_kl_weight = 0.5
19 |
20 | latent = True
21 | layer_num = 2
22 | actor_batch_size = 100
23 | hidden_layer_size = 256
24 | actor_iterations = 500000
25 | vae_lr = 1e-4
26 | actor_lr = 1e-4
27 | critic_lr = 1e-3
28 | soft_target_tau = 0.005
29 | lmbda = 0.75
30 | discount = 0.99
31 |
32 | max_latent_action = 2
33 | phi = 0.05
34 |
35 | #tune
36 | params_tune = {
37 | "vae_iterations" : {"type" : "discrete", "value":[50000, 100000, 500000,]},
38 | "actor_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]},
39 | "vae_lr" : {"type" : "continuous", "value":[1E-4, 1E-3]},
40 | "actor_batch_size" : {"type": "discrete", "value":[128, 256, 512]},
41 | "latent" : {"type": "discrete", "value":[True, False]},
42 | "lmbda" :{"type": "discrete", "value":[0.65, 0.75, 0.85]},
43 | }
44 |
45 | #tune
46 | grid_tune = {
47 | "phi" : [0, 0.05, 0.1, 0.2, 0.4],
48 | }
49 |
--------------------------------------------------------------------------------
/offlinerl/config/algo/prdc_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
10 |
11 |
12 | steps_per_epoch = 1000
13 | max_epoch = 1000
14 | batch_size = 256
15 | state_dim = None
16 | action_dim = None
17 | alpha = 2.5
18 | beta = 2.0
19 | k = 1
20 | policy_freq = 2
21 | noise_clip = 0.5
22 | policy_noise = 2
23 | discount = 0.99
24 | tau = 0.005
25 | expl_noise = 0.1
26 | critic_lr = 3e-4
27 | actor_lr = 3e-4
28 | max_action = 1.0
29 |
30 |
31 |
32 | #tune
33 | grid_tune = {
34 | "alpha" : [2.5, 7.5, 20.0, 40.0],
35 | "beta" : [2.0, 7.5, 15.0],
36 | }
--------------------------------------------------------------------------------
/offlinerl/config/algo/rambo_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Simglucose"
5 | task_data_type = "medium"
6 | task_train_num = 99
7 |
8 | seed = 42
9 |
10 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
11 | obs_shape = None
12 | act_shape = None
13 | max_action = None
14 |
15 | # model save path
16 | policy_bc_path = None
17 | policy_bc_save_path = None
18 | dynamics_path = None
19 | dynamics_save_path = None
20 |
21 | # transition model train
22 | transition_init_num = 7
23 | transition_select_num = 5
24 | val_ratio = 0.2
25 | max_epochs_since_update = 5
26 | transition_max_epochs = None
27 |
28 | # trick config
29 | trainsition_clip = True
30 | normalize_obs = False
31 | transition_scaler = True
32 | policy_scaler = True
33 |
34 | # transition config
35 | transition_batch_size = 256
36 | transition_lr = 1e-3 # 3e-4
37 | logvar_loss_coef = 0.01 # 1e-3
38 | dynamics_hidden_dims = [200, 200, 200, 200]
39 | dynamics_weight_decay = [2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4]
40 |
41 | # alpha config
42 | learnable_alpha = True
43 | alpha_lr = 1e-4
44 | alpha = 0.2
45 |
46 | # train config
47 | horizon = 5
48 | real_data_ratio = 0.5
49 | max_epoch = 2000
50 | steps_per_epoch = 1000
51 | rollout_freq = 250
52 | rollout_batch_size = 5e+4
53 |
54 | # policy config
55 | hidden_dims = [256, 256]
56 | policy_batch_size = 256
57 | actor_lr = 1e-4
58 |
59 | # critic config
60 | critic_lr = 3e-4
61 | discount = 0.99
62 | soft_target_tau = 5e-3
63 | target_entropy = None
64 |
65 | # others
66 | val_frequency = 10
67 | eval_episodes = 10
68 | model_retain_epochs = 5
69 |
70 | # rambo config
71 | policy_bc_epoch = 50
72 | policy_bc_batch_size = 256
73 | policy_bc_lr = 1e-4
74 |
75 | transition_adv_lr = 3e-4
76 | dynamics_update_freq = 1000
77 | adv_train_steps = 1000
78 | adv_rollout_batch_size = 256
79 | adv_rollout_length = 5
80 | include_ent_in_adv = False
81 | adv_weight = 3e-4
82 |
83 | #tune
84 | params_tune = {
85 | "real_data_ratio" : {"type" : "discrete", "value": [0.05, 0.1, 0.2]},
86 | "horizon" : {"type" : "discrete", "value": [1, 2, 5]},
87 | "adv_weight" : {"type" : "discrete", "value": [0, 3e-4]},
88 | }
89 |
90 | #tune
91 | grid_tune = {
92 | "horizon" : [1, 5],
93 | "transition_adv_lr" : [1e-3, 3e-4],
94 | "adv_weight" : [0, 1e-3, 3e-4],
95 | }
--------------------------------------------------------------------------------
/offlinerl/config/algo/td3bc_config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from offlinerl.utils.exp import select_free_cuda
3 |
4 | task = "Hopper-v3"
5 | task_data_type = "low"
6 | task_train_num = 99
7 |
8 | seed = 42
9 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
10 | obs_shape = None
11 | act_shape = None
12 | max_action = None
13 |
14 |
15 | actor_features = 256
16 | actor_layers = 2
17 | value_features = 256
18 | value_layers = 2
19 |
20 | alpha = 2.5
21 | policy_noise = 0.2
22 | noise_clip = 0.5
23 | policy_freq = 2
24 |
25 |
26 | batch_size = 256
27 | steps_per_epoch = 1000
28 | max_epoch = 1000
29 |
30 |
31 | actor_lr = 3e-4
32 | critic_lr = 3e-4
33 | alpha_lr = 3e-4
34 | discount = 0.99
35 | soft_target_tau = 5e-3
36 |
37 | num_sampled_actions = 10
38 | eval_episodes = 100
39 |
40 | #tune
41 | grid_tune = {
42 | "alpha" : [0.05, 0.1, 0.2],
43 | "policy_noise" : [0.5, 1.5, 2.5],
44 | }
45 |
--------------------------------------------------------------------------------
/offlinerl/data/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import random
4 | import numpy as np
5 | from loguru import logger
6 |
7 | from offlinerl.utils.logger import log_path
8 | from offlinerl.utils.io import create_dir, download_helper, read_json
9 |
10 | from offlinerl.data.neorl import load_neorl_buffer
11 |
12 | dataset_dir = os.path.join(log_path(),"./offlinerl_datasets")
13 | create_dir(dataset_dir)
14 |
15 | def load_data_from_neorl2_util(task):
16 |
17 | import neorl2
18 | import gymnasium as gym
19 |
20 | env = neorl2.make(task)
21 | if 'fusion' in task.lower():
22 | train_data, val_data = env.get_dataset(traj_num=20)
23 | else:
24 | train_data, val_data = env.get_dataset()
25 |
26 | return train_data, val_data
27 |
28 | def load_data_from_neorl2(task):
29 | train_data, val_data = load_data_from_neorl2_util(task)
30 | train_buffer = load_neorl_buffer({
31 | 'obs': train_data["obs"].astype(np.float32),
32 | 'action': train_data["action"].astype(np.float32),
33 | 'next_obs': train_data["next_obs"].astype(np.float32),
34 | 'reward': train_data["reward"].astype(np.float32).reshape(-1, 1),
35 | 'done': np.bool_(train_data["done"]).reshape(-1, 1),
36 | })
37 |
38 | val_buffer = load_neorl_buffer({
39 | 'obs': val_data["obs"].astype(np.float32),
40 | 'action': val_data["action"].astype(np.float32),
41 | 'next_obs': val_data["next_obs"].astype(np.float32),
42 | 'reward': val_data["reward"].astype(np.float32).reshape(-1, 1),
43 | 'done': np.bool_(val_data["done"]).reshape(-1, 1),
44 | })
45 |
46 | return train_buffer, val_buffer
47 |
48 | def load_data_from_neorl(task, task_data_type = "low", task_train_num = 99):
49 | try:
50 | import neorl
51 | env = neorl.make(task)
52 | train_data, val_data = env.get_dataset(data_type = task_data_type, train_num = task_train_num)
53 | train_buffer, val_buffer = load_neorl_buffer(train_data), load_neorl_buffer(val_data)
54 | logger.info(f"Load task data from neorl. -> {task}")
55 | except:
56 | train_buffer, val_buffer = load_data_from_neorl2(task)
57 | logger.info(f"Load task data from neorl2. -> {task}")
58 | return train_buffer, val_buffer
--------------------------------------------------------------------------------
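Note: a minimal usage sketch for the loader above; it assumes the neorl (or neorl2) package and the corresponding dataset are installed locally.

    # Usage sketch (assumes neorl / neorl2 and the Hopper-v3 dataset are available).
    from offlinerl.data import load_data_from_neorl

    train_buffer, val_buffer = load_data_from_neorl(
        "Hopper-v3", task_data_type="low", task_train_num=99
    )
    print(train_buffer.obs.shape, train_buffer.act.shape, train_buffer.rew.shape)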
/offlinerl/data/d4rl.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 |
4 | from d4rl import gym_mujoco
5 | import gym
6 | import d4rl
7 | import numpy as np
8 | from loguru import logger
9 |
10 | from offlinerl.utils.data import SampleBatch
11 |
12 | def load_d4rl_buffer(task):
13 | env = gym.make(task[5:])
14 | dataset = d4rl.qlearning_dataset(env)
15 |
16 | buffer = SampleBatch(
17 | obs=dataset['observations'],
18 | obs_next=dataset['next_observations'],
19 | act=dataset['actions'],
20 | rew=np.expand_dims(np.squeeze(dataset['rewards']), 1),
21 | done=np.expand_dims(np.squeeze(dataset['terminals']), 1),
22 | )
23 |
24 | logger.info('obs shape: {}', buffer.obs.shape)
25 | logger.info('obs_next shape: {}', buffer.obs_next.shape)
26 | logger.info('act shape: {}', buffer.act.shape)
27 | logger.info('rew shape: {}', buffer.rew.shape)
28 | logger.info('done shape: {}', buffer.done.shape)
29 | logger.info('Episode reward: {}', buffer.rew.sum() /np.sum(buffer.done) )
30 | logger.info('Number of terminals on: {}', np.sum(buffer.done))
31 | return buffer
32 |
--------------------------------------------------------------------------------
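Note: load_d4rl_buffer strips the first five characters of task before calling gym.make, which suggests task names are passed with a "d4rl-" prefix. A minimal usage sketch under that assumption (requires d4rl and its MuJoCo dependencies):

    # Usage sketch (assumption: tasks are named "d4rl-<d4rl-env-id>").
    from offlinerl.data.d4rl import load_d4rl_buffer

    buffer = load_d4rl_buffer("d4rl-hopper-medium-v0")
    print(buffer.obs.shape, buffer.act.shape, buffer.done.sum())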
/offlinerl/data/neorl.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from loguru import logger
3 |
4 | from offlinerl.utils.data import SampleBatch, get_scaler
5 | from offlinerl.utils.data import BufferDataset, BufferDataloader
6 |
7 | def load_neorl_buffer(data):
8 | buffer = SampleBatch(
9 | obs = data["obs"],
10 | obs_next = data["next_obs"],
11 | act = data["action"],
12 | rew = data["reward"],
13 | done = data["done"],
14 | )
15 |
16 | logger.info('obs shape: {}', buffer.obs.shape)
17 | logger.info('obs_next shape: {}', buffer.obs_next.shape)
18 | logger.info('act shape: {}', buffer.act.shape)
19 | logger.info('rew shape: {}', buffer.rew.shape)
20 | logger.info('done shape: {}', buffer.done.shape)
21 | logger.info('Episode reward: {}', buffer.rew.sum() /np.sum(buffer.done) )
22 | logger.info('Number of terminals on: {}', np.sum(buffer.done))
23 |
24 | """
25 | rew_scaler = get_scaler(buffer.rew)
26 | buffer.rew = rew_scaler.transform(buffer.rew)
27 | buffer.rew = buffer.rew * 0.01
28 | buffer.done[buffer.rew < np.sort(buffer.rew.reshape(-1))[int(len(buffer)*0.01)]] = 1
29 |
30 | buffer = BufferDataset(buffer)
31 | buffer = BufferDataloader(buffer, batch_size=1, collate_fn=lambda x: x[0], num_workers=8)
32 | """
33 |
34 | return buffer
35 |
--------------------------------------------------------------------------------
/offlinerl/evaluation/d4rl.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import d4rl
3 | import torch
4 | import numpy as np
5 | from tqdm import tqdm
6 | from collections import OrderedDict
7 | from d4rl.infos import REF_MIN_SCORE, REF_MAX_SCORE
8 |
9 | from offlinerl.utils.env import get_env
10 |
11 |
12 | def d4rl_score(task, rew_mean, len_mean):
13 | score = (rew_mean - REF_MIN_SCORE[task]) / (REF_MAX_SCORE[task] - REF_MIN_SCORE[task]) * 100
14 |
15 | return score
16 |
17 |
18 | def d4rl_eval_fn(task, eval_episodes=100):
19 | env = get_env(task)
20 |
21 | def d4rl_eval(policy):
22 | episode_rewards = []
23 | episode_lengths = []
24 | for _ in range(eval_episodes):
25 | state, done = env.reset(), False
26 | rewards = 0
27 | lengths = 0
28 | while not done:
29 | state = state[np.newaxis]
30 | action = policy.get_action(state)
31 | state, reward, done, _ = env.step(action)
32 | rewards += reward
33 | lengths += 1
34 |
35 | episode_rewards.append(rewards)
36 | episode_lengths.append(lengths)
37 |
38 |
39 | rew_mean = np.mean(episode_rewards)
40 | len_mean = np.mean(episode_lengths)
41 |
42 | score = d4rl_score(task, rew_mean, len_mean)
43 |
44 | res = OrderedDict()
45 | res["Reward_Mean"] = rew_mean
46 | res["Length_Mean"] = len_mean
47 | res["D4rl_Score"] = score
48 |
49 | return res
50 |
51 | return d4rl_eval
--------------------------------------------------------------------------------
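Note: a minimal sketch of plugging a trained policy into the evaluator above. Here policy stands for any object exposing get_action(state), as used inside the rollout loop, and the task string is assumed to be resolvable by offlinerl.utils.env.get_env and to appear in d4rl's REF_MIN_SCORE / REF_MAX_SCORE tables.

    # Usage sketch (assumes a policy object with a get_action(state) method;
    # task resolution depends on offlinerl.utils.env.get_env).
    from offlinerl.evaluation.d4rl import d4rl_eval_fn

    eval_fn = d4rl_eval_fn("hopper-medium-v0", eval_episodes=10)
    # res = eval_fn(policy)   # OrderedDict with Reward_Mean, Length_Mean, D4rl_Score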
/offlinerl/evaluation/fqe.py:
--------------------------------------------------------------------------------
1 | # https://arxiv.org/abs/2007.09055
2 | # Hyperparameter Selection for Offline Reinforcement Learning
3 | from copy import deepcopy
4 | import torch
5 | from tqdm import tqdm
6 |
7 | from offlinerl.utils.net.common import MLP
8 | from offlinerl.utils.net.continuous import DistributionalCritic
9 |
10 | class FQE:
11 | # https://arxiv.org/abs/2007.09055
12 | # Hyperparameter Selection for Offline Reinforcement Learning
13 | def __init__(self,
14 | policy,
15 | buffer,
16 | q_hidden_features=1024,
17 | q_hidden_layers=4,
18 | device="cuda" if torch.cuda.is_available() else "cpu"
19 | ):
20 | self.policy = policy
21 | self.buffer = buffer
22 | self.critic_hidden_features = q_hidden_features
23 | self.critic_hidden_layers = q_hidden_layers
24 | self._device = device
25 |
26 | def train_estimator(self,
27 | init_critic=None,
28 | discount=0.99,
29 | target_update_period=100,
30 | critic_lr=1e-4,
31 | num_steps=250000,
32 | polyak=0.0,
33 | batch_size=256,
34 | verbose=False):
35 |
36 | min_reward = self.buffer.rew.min()
37 | max_reward = self.buffer.rew.max()
38 |
39 | max_value = (1.2 * max_reward + 0.8 * min_reward) / (1 - discount)
40 | min_value = (1.2 * min_reward + 0.8 * max_reward) / (1 - discount)
41 |
42 | data = self.buffer.sample(batch_size)
43 | input_dim = data.obs.shape[-1] + data.act.shape[-1]
44 | critic = MLP(input_dim, 1, self.critic_hidden_features, self.critic_hidden_layers).to(self._device)
45 | if init_critic is not None: critic.load_state_dict(init_critic.state_dict())
46 | critic_optimizer = torch.optim.Adam(critic.parameters(), lr=critic_lr)
47 | target_critic = deepcopy(critic).to(self._device)
48 | target_critic.requires_grad_(False)
49 |
50 | if verbose:
51 | counter = tqdm(total=num_steps)
52 |
53 | print('Training FQE...')
54 | for t in range(num_steps):
55 | batch = self.buffer.sample(batch_size)
56 | data = batch.to_torch(dtype=torch.float32, device=self._device)
57 | r = data.rew
58 | terminals = data.done
59 | o1 = data.obs
60 | a1 = data.act
61 |
62 | o2 = data.obs_next
63 | a2 = self.policy.get_action(o2)
64 | q_target = target_critic(torch.cat((o2, a2), -1)).detach()
65 | current_discount = discount * (1 - terminals)
66 | backup = r + current_discount * q_target
67 | backup = torch.clamp(backup, min_value, max_value) # prevent explosion
68 |
69 | q = critic(torch.cat((o1, a1), -1))
70 | critic_loss = ((q - backup) ** 2).mean()
71 |
72 | critic_optimizer.zero_grad()
73 | critic_loss.backward()
74 | critic_optimizer.step()
75 |
76 | if t % target_update_period == 0:
77 | with torch.no_grad():
78 | for p, p_targ in zip(critic.parameters(), target_critic.parameters()):
79 | p_targ.data.mul_(polyak)
80 | p_targ.data.add_((1 - polyak) * p.data)
81 |
82 | if verbose:
83 | counter.update(1)
84 |
85 | return critic
--------------------------------------------------------------------------------
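Note: a minimal sketch of running FQE for offline policy evaluation. It assumes policy exposes get_action and buffer is a SampleBatch-style replay buffer providing sample() and to_torch(), as used inside train_estimator above.

    # Usage sketch (assumes `policy.get_action` and a buffer with .sample()/.to_torch()).
    from offlinerl.evaluation.fqe import FQE

    # fqe = FQE(policy, buffer, q_hidden_features=1024, q_hidden_layers=4)
    # critic = fqe.train_estimator(discount=0.99, num_steps=250000, verbose=True)
    # The returned critic maps concatenated (obs, action) tensors to estimated Q-values
    # of the evaluated policy.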
/offlinerl/evaluation/gym.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from collections import OrderedDict
4 |
5 | from offlinerl.utils.env import get_env
6 |
7 | def gym_policy_eval(task, eval_episodes=100):
8 | env = get_env(task)
9 |
10 | def policy_eval(policy):
11 | episode_rewards = []
12 | episode_lengths = []
13 | for _ in range(eval_episodes):
14 | state, done = env.reset(), False
15 | rewards = 0
16 | lengths = 0
17 | while not done:
18 | state = state[np.newaxis]
19 | action = policy.get_action(state).reshape(-1)
20 | state, reward, done, _ = env.step(action)
21 | rewards += reward
22 | lengths += 1
23 |
24 | episode_rewards.append(rewards)
25 | episode_lengths.append(lengths)
26 |
27 |
28 | rew_mean = np.mean(episode_rewards)
29 | len_mean = np.mean(episode_lengths)
30 |
31 |
32 | res = OrderedDict()
33 | res["Reward_Mean"] = rew_mean
34 | res["Length_Mean"] = len_mean
35 |
36 | return res
37 |
38 | return policy_eval
39 |
40 |
41 | def gym_env_eval(task, eval_episodes=100):
42 | env = get_env(task)
43 |
44 | def env_eval(policy, obs_scaler=None, act_scaler=None):
45 | env_mae = []
46 | for _ in range(eval_episodes):
47 | state, done = env.reset(), False
48 | rewards = 0
49 | lengths = 0
50 | while not done:
51 | state = state[np.newaxis]
52 | action = env.action_space.sample()
53 |
54 | obs = state.reshape(1,-1)
55 | act = action.reshape(1,-1)
56 | if obs_scaler is not None:
57 | obs = obs_scaler.transform(obs)
58 | if act_scaler is not None:
59 | act = act_scaler.transform(act)
60 |
61 | policy_state = policy.get_action(np.concatenate([obs,act], axis=1))
62 |
63 | if obs_scaler is not None:
64 | policy_state = obs_scaler.inverse_transform(policy_state)
65 |
66 | state, reward, done, _ = env.step(action)
67 |
68 | env_mae.append(np.mean(np.abs(policy_state - state)))
69 |
70 | env_mae = np.mean(env_mae)
71 |
72 |
73 | res = OrderedDict()
74 | res["Env_Mae"] = env_mae
75 |
76 | return res
77 |
78 | return env_eval
--------------------------------------------------------------------------------
/offlinerl/evaluation/neorl.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import ray
3 | from copy import deepcopy
4 | import numpy as np
5 | from collections import OrderedDict
6 |
7 | from offlinerl.utils.env import get_env
8 | from multiprocessing import Pool
9 |
10 |
11 | #@ray.remote(num_gpus=0.1)
12 | def test_one_trail(env, policy):
13 | # env = deepcopy(env)
14 | # policy = deepcopy(policy)
15 |
16 | state, done = env.reset(), False
17 | if isinstance(state, tuple):
18 | state = state[0]
19 | rewards = 0
20 | lengths = 0
21 | while not done:
22 | state = state[np.newaxis]
23 | action = policy.get_action(state).reshape(-1)
24 | result = env.step(action)
25 | if len(result) == 4:
26 | state, reward, done, _ = result
27 | else:
28 | state, reward, done, timeout, _ = result
29 | done = done or timeout
30 | rewards += reward
31 | lengths += 1
32 |
33 | return (rewards, lengths)
34 |
35 | def test_one_trail_sp_local(env, policy):
36 | # env = deepcopy(env)
37 | # policy = deepcopy(policy)
38 |
39 | state, done = env.reset(), False
40 | rewards = 0
41 | lengths = 0
42 | obs_dim = env.observation_space.shape[0]
43 | act_dim = env.action_space.shape[0]
44 |
45 | while not done:
46 | state = state.reshape(-1, obs_dim)
47 | action = policy.get_action(state).reshape(-1, act_dim)
48 | # print("actions: ", action[0:3,])
49 | state, reward, done, _ = env.step(action)
50 | rewards += reward
51 | lengths += 1
52 |
53 | return (rewards, lengths)
54 |
55 | def test_on_real_env(policy, env, number_of_runs=100):
56 | rewards = []
57 | episode_lengths = []
58 | policy = deepcopy(policy)
59 | policy.eval()
60 |
61 | if (not hasattr(env.spec, "id")) and ("sp" in env._name or "sales" in env._name):
62 | results = [test_one_trail_sp_local(env, policy) for _ in range(number_of_runs)]
63 | else:
64 | pool = Pool(processes=10)
65 | results = [pool.apply_async(test_one_trail, args=(env, policy)) for _ in range(number_of_runs)]
66 | results = [result.get() for result in results]
67 | pool.close()
68 | pool.join()
69 |
70 | policy.train()
71 |
72 | rewards = [result[0] for result in results]
73 | episode_lengths = [result[1] for result in results]
74 |
75 | rew_mean = np.mean(rewards)
76 | rew_std = np.std(rewards)
77 | len_mean = np.mean(episode_lengths)
78 |
79 |
80 | res = OrderedDict()
81 | res["Reward_Mean_Env"] = rew_mean
82 | res["Reward_Std_Env"] = rew_std
83 | res["Length_Mean_Env"] = len_mean
84 | res["Length_Std_Env"] = np.std(episode_lengths)
85 |
86 | return res
87 |
--------------------------------------------------------------------------------
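Note: a minimal usage sketch for test_on_real_env above. It assumes a gym-style env returned by get_env and a policy that implements get_action, eval, and train, since the function toggles eval/train mode around the rollouts.

    # Usage sketch (assumes get_env resolves the task and the policy implements
    # get_action / eval / train, as required by test_on_real_env).
    from offlinerl.utils.env import get_env
    from offlinerl.evaluation.neorl import test_on_real_env

    # env = get_env("Hopper-v3")
    # res = test_on_real_env(policy, env, number_of_runs=100)
    # print(res["Reward_Mean_Env"], res["Reward_Std_Env"], res["Length_Mean_Env"])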
/offlinerl/outside_utils/buffer/__init__.py:
--------------------------------------------------------------------------------
1 | from offlinerl.outside_utils.buffer.buffer import ReplayBuffer
2 |
3 |
4 | __all__ = [
5 | "ReplayBuffer"
6 | ]
--------------------------------------------------------------------------------
/offlinerl/outside_utils/buffer/buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from typing import Optional, Union, Tuple, Dict
5 |
6 |
7 | class ReplayBuffer:
8 | def __init__(
9 | self,
10 | buffer_size: int,
11 | obs_shape: Tuple,
12 | obs_dtype: np.dtype,
13 | action_dim: int,
14 | action_dtype: np.dtype,
15 | device: str = "cpu"
16 | ) -> None:
17 | self._max_size = buffer_size
18 | self.obs_shape = obs_shape
19 | self.obs_dtype = obs_dtype
20 | self.action_dim = action_dim
21 | self.action_dtype = action_dtype
22 |
23 | self._ptr = 0
24 | self._size = 0
25 | self.observations = np.zeros((self._max_size,) + self.obs_shape, dtype=obs_dtype)
26 | self.next_observations = np.zeros((self._max_size,) + self.obs_shape, dtype=obs_dtype)
27 | self.actions = np.zeros((self._max_size, self.action_dim), dtype=action_dtype)
28 | self.rewards = np.zeros((self._max_size, 1), dtype=np.float32)
29 | self.terminals = np.zeros((self._max_size, 1), dtype=np.float32)
30 |
31 | self.device = torch.device(device)
32 |
33 | def add(
34 | self,
35 | obs: np.ndarray,
36 | next_obs: np.ndarray,
37 | action: np.ndarray,
38 | reward: np.ndarray,
39 | terminal: np.ndarray
40 | ) -> None:
41 | # Copy to avoid modification by reference
42 | self.observations[self._ptr] = np.array(obs).copy()
43 | self.next_observations[self._ptr] = np.array(next_obs).copy()
44 | self.actions[self._ptr] = np.array(action).copy()
45 | self.rewards[self._ptr] = np.array(reward).copy()
46 | self.terminals[self._ptr] = np.array(terminal).copy()
47 |
48 | self._ptr = (self._ptr + 1) % self._max_size
49 | self._size = min(self._size + 1, self._max_size)
50 |
51 | def add_batch(
52 | self,
53 | obss: np.ndarray,
54 | next_obss: np.ndarray,
55 | actions: np.ndarray,
56 | rewards: np.ndarray,
57 | terminals: np.ndarray
58 | ) -> None:
59 | batch_size = len(obss)
60 | indexes = np.arange(self._ptr, self._ptr + batch_size) % self._max_size
61 |
62 | self.observations[indexes] = np.array(obss).copy()
63 | self.next_observations[indexes] = np.array(next_obss).copy()
64 | self.actions[indexes] = np.array(actions).copy()
65 | self.rewards[indexes] = np.array(rewards).copy()
66 | self.terminals[indexes] = np.array(terminals).copy()
67 |
68 | self._ptr = (self._ptr + batch_size) % self._max_size
69 | self._size = min(self._size + batch_size, self._max_size)
70 |
71 | def load_dataset(self, dataset: Dict[str, np.ndarray]) -> None:
72 | observations = np.array(dataset["obs"], dtype=self.obs_dtype)
73 | next_observations = np.array(dataset["obs_next"], dtype=self.obs_dtype)
74 | actions = np.array(dataset["act"], dtype=self.action_dtype)
75 | rewards = np.array(dataset["rew"], dtype=np.float32).reshape(-1, 1)
76 | terminals = np.array(dataset["done"], dtype=np.float32).reshape(-1, 1)
77 |
78 | self.observations = observations
79 | self.next_observations = next_observations
80 | self.actions = actions
81 | self.rewards = rewards
82 | self.terminals = terminals
83 |
84 | self._ptr = len(observations)
85 | self._size = len(observations)
86 |
87 | def normalize_obs(self, eps: float = 1e-3, inplace : bool = True) -> Tuple[np.ndarray, np.ndarray]:
88 | mean = self.observations.mean(0, keepdims=True)
89 | std = self.observations.std(0, keepdims=True) + eps
90 | if inplace:
91 | self.observations = (self.observations - mean) / std
92 | self.next_observations = (self.next_observations - mean) / std
93 | obs_mean, obs_std = mean, std
94 | return obs_mean, obs_std
95 |
96 | def sample(self, batch_size: int) -> Dict[str, torch.Tensor]:
97 |
98 | batch_indexes = np.random.randint(0, self._size, size=batch_size)
99 |
100 | return {
101 | "observations": torch.tensor(self.observations[batch_indexes]).to(self.device),
102 | "actions": torch.tensor(self.actions[batch_indexes]).to(self.device),
103 | "next_observations": torch.tensor(self.next_observations[batch_indexes]).to(self.device),
104 | "terminals": torch.tensor(self.terminals[batch_indexes]).to(self.device),
105 | "rewards": torch.tensor(self.rewards[batch_indexes]).to(self.device)
106 | }
107 |
108 | def sample_all(self) -> Dict[str, np.ndarray]:
109 | return {
110 | "observations": self.observations[:self._size].copy(),
111 | "actions": self.actions[:self._size].copy(),
112 | "next_observations": self.next_observations[:self._size].copy(),
113 | "terminals": self.terminals[:self._size].copy(),
114 | "rewards": self.rewards[:self._size].copy()
115 | }
--------------------------------------------------------------------------------
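Note: a minimal, self-contained usage sketch for the ReplayBuffer above; the observation/action dimensions are made up for illustration.

    # Usage sketch with arbitrary (hypothetical) dimensions.
    import numpy as np
    from offlinerl.outside_utils.buffer import ReplayBuffer

    buffer = ReplayBuffer(
        buffer_size=1000,
        obs_shape=(11,),
        obs_dtype=np.float32,
        action_dim=3,
        action_dtype=np.float32,
        device="cpu",
    )
    buffer.add(
        obs=np.zeros(11, dtype=np.float32),
        next_obs=np.zeros(11, dtype=np.float32),
        action=np.zeros(3, dtype=np.float32),
        reward=np.array([0.0], dtype=np.float32),
        terminal=np.array([0.0], dtype=np.float32),
    )
    batch = buffer.sample(batch_size=1)   # dict of torch tensors on the chosen device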
/offlinerl/outside_utils/dynamics/__init__.py:
--------------------------------------------------------------------------------
1 | from offlinerl.outside_utils.dynamics.base_dynamics import BaseDynamics
2 | from offlinerl.outside_utils.dynamics.ensemble_dynamics import EnsembleDynamics
3 | from offlinerl.outside_utils.dynamics.rnn_dynamics import RNNDynamics
4 | from offlinerl.outside_utils.dynamics.mujoco_oracle_dynamics import MujocoOracleDynamics
5 |
6 |
7 | __all__ = [
8 | "BaseDynamics",
9 | "EnsembleDynamics",
10 | "RNNDynamics",
11 | "MujocoOracleDynamics"
12 | ]
--------------------------------------------------------------------------------
/offlinerl/outside_utils/dynamics/base_dynamics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 |
5 | from typing import Callable, List, Tuple, Dict
6 |
7 |
8 | class BaseDynamics(object):
9 | def __init__(
10 | self,
11 | model: nn.Module,
12 | optim: torch.optim.Optimizer
13 | ) -> None:
14 | super().__init__()
15 | self.model = model
16 | self.optim = optim
17 |
18 | def step(
19 | self,
20 | obs: np.ndarray,
21 | action: np.ndarray
22 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]:
23 | raise NotImplementedError
24 |
--------------------------------------------------------------------------------
/offlinerl/outside_utils/dynamics/ensemble_dynamics.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 |
6 | from typing import Callable, List, Tuple, Dict, Optional
7 | from offlinerl.outside_utils.dynamics import BaseDynamics
8 | from offlinerl.outside_utils.utils.scaler import StandardScaler
9 | from offlinerl.outside_utils.utils.logger import Logger
10 | import warnings
11 |
12 |
13 | class EnsembleDynamics(BaseDynamics):
14 | def __init__(
15 | self,
16 | model: nn.Module,
17 | optim: torch.optim.Optimizer,
18 | scaler: StandardScaler,
19 | terminal_fn: Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray],
20 | penalty_coef: float = 0.0,
21 | uncertainty_mode: str = "aleatoric",
22 | data_range: tuple = None,
23 | ) -> None:
24 | super().__init__(model, optim)
25 | self.scaler = scaler
26 | self.terminal_fn = terminal_fn
27 | self._penalty_coef = penalty_coef
28 | self._uncertainty_mode = uncertainty_mode
29 | self.obs_min, self.obs_max, self.rew_min, self.rew_max = data_range
30 |
31 | @ torch.no_grad()
32 | def step(
33 | self,
34 | obs: np.ndarray,
35 | action: np.ndarray,
36 | transition_scaler: bool = True,
37 | transition_clip: bool = False,
38 | clip_penalty: bool = False,
39 | max_penalty: float = 0,
40 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]:
41 | "imagine single forward step"
42 | obs_act = np.concatenate([obs, action], axis=-1)
43 | if transition_scaler:
44 | obs_act = self.scaler.transform(obs_act)
45 | mean, logvar = self.model(obs_act)
46 | mean = mean.cpu().numpy()
47 | logvar = logvar.cpu().numpy()
48 | mean[..., :-1] += obs
49 | std = np.sqrt(np.exp(logvar))
50 |
51 | ensemble_samples = (mean + np.random.normal(size=mean.shape) * std).astype(np.float32)
52 |
53 | # choose one model from ensemble
54 | num_models, batch_size, _ = ensemble_samples.shape
55 | model_idxs = self.model.random_elite_idxs(batch_size)
56 | samples = ensemble_samples[model_idxs, np.arange(batch_size)]
57 |
58 | next_obs = samples[..., :-1]
59 | reward = samples[..., -1:]
60 | terminal = self.terminal_fn(obs, action, next_obs)
61 | if transition_clip:
62 | next_obs = np.clip(next_obs, self.obs_min, self.obs_max)
63 | reward = np.clip(reward, self.rew_min, self.rew_max)
64 |
65 | info = {}
66 | info["raw_reward"] = reward
67 |
68 | if self._penalty_coef > 0.0:
69 | norm_mean = mean
70 | norm_std = std
71 | if self._uncertainty_mode == "aleatoric":
72 | penalty = np.amax(np.linalg.norm(norm_std, axis=2), axis=0)
73 | elif self._uncertainty_mode == "pairwise-diff":
74 | next_obses_mean = norm_mean[..., :-1]
75 | next_obs_mean = np.mean(next_obses_mean, axis=0)
76 | diff = next_obses_mean - next_obs_mean
77 | penalty = np.amax(np.linalg.norm(diff, axis=2), axis=0)
78 | elif self._uncertainty_mode == "ensemble_std":
79 | next_obses_mean = norm_mean[..., :-1]
80 | penalty = np.sqrt(next_obses_mean.var(0).mean(1))
81 | else:
82 | warnings.warn("Invalid uncertainty mode. No penalty applied!!!")
83 | penalty = np.zeros_like(reward).mean(1)
84 |
85 | penalty = np.expand_dims(penalty, 1).astype(np.float32)
86 | if clip_penalty:
87 | penalty = np.clip(penalty, a_min=None, a_max=max_penalty)
88 | assert penalty.shape == reward.shape
89 | reward = reward - self._penalty_coef * penalty
90 | info["penalty"] = penalty
91 |
92 | return next_obs, reward, np.bool_(terminal), info
93 |
94 | @ torch.no_grad()
95 | def sample_next_obss(
96 | self,
97 | obs: torch.Tensor,
98 | action: torch.Tensor,
99 | num_samples: int,
100 | transition_scaler: bool = True,
101 | transition_clip: bool = False,
102 | ) -> torch.Tensor:
103 | obs_act = torch.cat([obs, action], dim=-1)
104 | if transition_scaler:
105 | obs_act = self.scaler.transform_tensor(obs_act)
106 | mean, logvar = self.model(obs_act)
107 | mean[..., :-1] += obs
108 | std = torch.sqrt(torch.exp(logvar))
109 |
110 | mean = mean[self.model.elites.data.cpu().numpy()]
111 | std = std[self.model.elites.data.cpu().numpy()]
112 |
113 | samples = torch.stack([mean + torch.randn_like(std) * std for i in range(num_samples)], 0)
114 | next_obss = samples[..., :-1]
115 | if transition_clip:
116 | obs_min = torch.as_tensor(self.obs_min).to(next_obss.device)
117 | obs_max = torch.as_tensor(self.obs_max).to(next_obss.device)
118 | next_obss = torch.clamp(next_obss, obs_min, obs_max)
119 | return next_obss
120 |
121 | def format_samples_for_training(self, data: Dict) -> Tuple[np.ndarray, np.ndarray]:
122 | obss = data["observations"]
123 | actions = data["actions"]
124 | next_obss = data["next_observations"]
125 | rewards = data["rewards"]
126 | delta_obss = next_obss - obss
127 | inputs = np.concatenate((obss, actions), axis=-1)
128 | targets = np.concatenate((delta_obss, rewards), axis=-1)
129 | return inputs, targets
130 |
131 | def select_elites(self, metrics: List) -> List[int]:
132 | pairs = [(metric, index) for metric, index in zip(metrics, range(len(metrics)))]
133 | pairs = sorted(pairs, key=lambda x: x[0])
134 | elites = [pairs[i][1] for i in range(self.model.num_elites)]
135 | return elites
136 |
137 | def save(self, save_path: str) -> None:
138 | torch.save(self.model.state_dict(), os.path.join(save_path, "dynamics.pth"))
139 | self.scaler.save_scaler(save_path)
140 |
141 | def load(self, load_path: str) -> None:
142 | self.model.load_state_dict(torch.load(os.path.join(load_path, "dynamics.pth"), map_location=self.model.device))
143 | self.scaler.load_scaler(load_path)
144 |
--------------------------------------------------------------------------------
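Note: a standalone numpy sketch of the "aleatoric" uncertainty penalty computed in EnsembleDynamics.step above: for each transition, the penalty is the largest L2 norm (over ensemble members) of the predicted standard deviation, and it is subtracted from the reward scaled by the penalty coefficient. Shapes below are illustrative, not taken from the project.

    # Standalone sketch of the "aleatoric" penalty (shapes are illustrative).
    import numpy as np

    num_models, batch_size, target_dim = 5, 4, 12   # target_dim = obs_dim + 1 (reward)
    std = np.abs(np.random.randn(num_models, batch_size, target_dim)).astype(np.float32)
    reward = np.random.randn(batch_size, 1).astype(np.float32)

    penalty = np.amax(np.linalg.norm(std, axis=2), axis=0)   # (batch_size,)
    penalty = np.expand_dims(penalty, 1)                     # (batch_size, 1)
    penalty_coef = 1.0
    penalized_reward = reward - penalty_coef * penalty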
/offlinerl/outside_utils/dynamics/mujoco_oracle_dynamics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from gym.envs.mujoco import mujoco_env
4 | from typing import Callable, List, Tuple, Dict
5 |
6 |
7 | class MujocoOracleDynamics(object):
8 | def __init__(self, env: mujoco_env.MujocoEnv) -> None:
9 | self.env = env
10 |
11 | def _set_state_from_obs(self, obs:np.ndarray) -> None:
12 | if len(obs) == (self.env.model.nq + self.env.model.nv - 1):
13 | xpos = np.zeros(1)
14 | obs = np.concatenate([xpos, obs])
15 | qpos = obs[:self.env.model.nq]
16 | qvel = obs[self.env.model.nq:]
17 | self.env._elapsed_steps = 0
18 | self.env.set_state(qpos, qvel)
19 |
20 | def step(
21 | self,
22 | obs: np.ndarray,
23 | action: np.ndarray
24 | ) -> Tuple[np.ndarray, float, bool, Dict]:
25 | if (len(obs.shape) > 1) or (len(action.shape) > 1):
26 | raise ValueError
27 | self._set_state_from_obs(obs)
28 | next_obs, reward, terminal, info = self.env.step(action)
29 | return next_obs, reward, terminal, info
--------------------------------------------------------------------------------
/offlinerl/outside_utils/dynamics/rnn_dynamics.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 |
6 | from typing import Callable, List, Tuple, Dict
7 | from torch.utils.data.dataloader import DataLoader
8 | from offlinerl.outside_utils.dynamics import BaseDynamics
9 | from offlinerl.outside_utils.utils.scaler import StandardScaler
10 | from offlinerl.outside_utils.utils.logger import Logger
11 |
12 |
13 | class RNNDynamics(BaseDynamics):
14 | def __init__(
15 | self,
16 | model: nn.Module,
17 | optim: torch.optim.Optimizer,
18 | scaler: StandardScaler,
19 | terminal_fn: Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray],
20 | ) -> None:
21 | super().__init__(model, optim)
22 | self.scaler = scaler
23 | self.terminal_fn = terminal_fn
24 |
25 | @ torch.no_grad()
26 | def step(
27 | self,
28 | obss: np.ndarray,
29 | actions: np.ndarray
30 | ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict]:
31 | "imagine single forward step"
32 | inputs = np.concatenate([obss, actions], axis=-1)
33 | inputs = self.scaler.transform(inputs)
34 | preds, _ = self.model(inputs)
35 | # get last timestep pred
36 | preds = preds[:, -1]
37 | next_obss = preds[..., :-1].cpu().numpy() + obss[:, -1]
38 | rewards = preds[..., -1:].cpu().numpy()
39 |
40 | terminals = self.terminal_fn(obss[:, -1], actions[:, -1], next_obss)
41 | info = {}
42 |
43 | return next_obss, rewards, terminals, info
44 |
45 | def train(self, data: Dict, batch_size: int, max_iters: int, logger: Logger) -> None:
46 | self.model.train()
47 | loader = DataLoader(data, shuffle=True, batch_size=batch_size)
48 |         for epoch in range(max_iters):
49 |             for batch in loader:
50 |                 train_loss = self.learn(batch)
51 |                 logger.logkv_mean("loss/model", train_loss)
52 | 
53 |             logger.set_timestep(epoch)
54 | logger.dumpkvs(exclude=["policy_training_progress"])
55 | self.save(logger.model_dir)
56 | self.model.eval()
57 |
58 | def learn(self, batch) -> float:
59 | inputs, targets, masks = batch
60 | preds, _ = self.model.forward(inputs)
61 |
62 | loss = (((preds - targets) ** 2).mean(-1) * masks).mean()
63 |
64 | self.optim.zero_grad()
65 | loss.backward()
66 | self.optim.step()
67 |
68 | return loss.item()
69 |
70 | def save(self, save_path: str) -> None:
71 | torch.save(self.model.state_dict(), os.path.join(save_path, "dynamics.pth"))
72 | self.scaler.save_scaler(save_path)
73 |
74 | def load(self, load_path: str) -> None:
75 | self.model.load_state_dict(torch.load(os.path.join(load_path, "dynamics.pth"), map_location=self.model.device))
76 | self.scaler.load_scaler(load_path)
--------------------------------------------------------------------------------
/offlinerl/outside_utils/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from offlinerl.outside_utils.modules.actor_module import Actor, ActorProb
2 | from offlinerl.outside_utils.modules.critic_module import Critic
3 | from offlinerl.outside_utils.modules.ensemble_critic_module import EnsembleCritic
4 | from offlinerl.outside_utils.modules.dist_module import DiagGaussian, TanhDiagGaussian
5 | from offlinerl.outside_utils.modules.dynamics_module import EnsembleDynamicsModel
6 |
7 |
8 | __all__ = [
9 | "Actor",
10 | "ActorProb",
11 | "Critic",
12 | "EnsembleCritic",
13 | "DiagGaussian",
14 | "TanhDiagGaussian",
15 | "EnsembleDynamicsModel"
16 | ]
--------------------------------------------------------------------------------
/offlinerl/outside_utils/modules/actor_module.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | from torch.nn import functional as F
5 | from typing import Union, Optional
6 |
7 |
8 | # for SAC
9 | class ActorProb(nn.Module):
10 | def __init__(
11 | self,
12 | backbone: nn.Module,
13 | dist_net: nn.Module,
14 | device: str = "cpu"
15 | ) -> None:
16 | super().__init__()
17 |
18 | self.device = torch.device(device)
19 | self.backbone = backbone.to(device)
20 | self.dist_net = dist_net.to(device)
21 | self.scaler = None
22 |
23 | def set_scaler(self, scaler):
24 | self.scaler = scaler
25 |
26 | def forward(self, obs: Union[np.ndarray, torch.Tensor]) -> torch.distributions.Normal:
27 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32)
28 | logits = self.backbone(obs)
29 | dist = self.dist_net(logits)
30 | return dist
31 |
32 | def get_action(self, obs: Union[np.ndarray, torch.Tensor]) -> torch.Tensor:
33 | if self.scaler is not None:
34 | obs = self.scaler.transform(obs)
35 | dist = self.forward(obs)
36 | action, _ = dist.mode()
37 | return action.detach().cpu().numpy()
38 |
39 |     def to(self, device: str) -> "ActorProb":
40 | self.device = torch.device(device)
41 | self.backbone.to(device)
42 | self.dist_net.to(device)
43 | return self
44 |
45 |
46 | # for TD3
47 | class Actor(nn.Module):
48 | def __init__(
49 | self,
50 | backbone: nn.Module,
51 | action_dim: int,
52 | max_action: float = 1.0,
53 | device: str = "cpu"
54 | ) -> None:
55 | super().__init__()
56 |
57 | self.device = torch.device(device)
58 | self.backbone = backbone.to(device)
59 | latent_dim = getattr(backbone, "output_dim")
60 | output_dim = action_dim
61 | self.last = nn.Linear(latent_dim, output_dim).to(device)
62 | self._max = max_action
63 |
64 | def forward(self, obs: Union[np.ndarray, torch.Tensor]) -> torch.Tensor:
65 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32)
66 | logits = self.backbone(obs)
67 | actions = self._max * torch.tanh(self.last(logits))
68 | return actions
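69 | 
70 | 
71 | if __name__ == "__main__":
72 |     # Illustrative sketch (not part of the original module; the dimensions are arbitrary
73 |     # and the MLP backbone is assumed importable from this package): a TD3-style actor
74 |     # is just an MLP backbone plus a tanh-bounded linear head.
75 |     from offlinerl.outside_utils.nets import MLP
76 |     backbone = MLP(input_dim=17, hidden_dims=[256, 256])
77 |     actor = Actor(backbone, action_dim=6, max_action=1.0)
78 |     print(actor(np.random.randn(4, 17)).shape)  # expected: torch.Size([4, 6])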
--------------------------------------------------------------------------------
/offlinerl/outside_utils/modules/critic_module.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | from torch.nn import functional as F
5 | from typing import Union, Optional
6 |
7 |
8 | class Critic(nn.Module):
9 | def __init__(self, backbone: nn.Module, device: str = "cpu") -> None:
10 | super().__init__()
11 |
12 | self.device = torch.device(device)
13 | self.backbone = backbone.to(device)
14 | latent_dim = getattr(backbone, "output_dim")
15 | self.last = nn.Linear(latent_dim, 1).to(device)
16 |
17 | def forward(
18 | self,
19 | obs: Union[np.ndarray, torch.Tensor],
20 | actions: Optional[Union[np.ndarray, torch.Tensor]] = None
21 | ) -> torch.Tensor:
22 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32)
23 | if actions is not None:
24 | actions = torch.as_tensor(actions, device=self.device, dtype=torch.float32).flatten(1)
25 | obs = torch.cat([obs, actions], dim=1)
26 | logits = self.backbone(obs)
27 | values = self.last(logits)
28 | return values
--------------------------------------------------------------------------------
/offlinerl/outside_utils/modules/dist_module.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class NormalWrapper(torch.distributions.Normal):
7 | def log_prob(self, actions):
8 | return super().log_prob(actions).sum(-1, keepdim=True)
9 |
10 | def entropy(self):
11 | return super().entropy().sum(-1)
12 |
13 | def mode(self):
14 | return self.mean
15 |
16 |
17 | class TanhNormalWrapper(torch.distributions.Normal):
18 | def log_prob(self, action, raw_action=None):
19 | if raw_action is None:
20 | raw_action = self.arctanh(action)
21 | log_prob = super().log_prob(raw_action).sum(-1, keepdim=True)
22 | eps = 1e-6
23 | log_prob = log_prob - torch.log((1 - action.pow(2)) + eps).sum(-1, keepdim=True)
24 | return log_prob
25 |
26 | def mode(self):
27 | raw_action = self.mean
28 | action = torch.tanh(self.mean)
29 | return action, raw_action
30 |
31 | def arctanh(self, x):
32 | one_plus_x = (1 + x).clamp(min=1e-6)
33 | one_minus_x = (1 - x).clamp(min=1e-6)
34 | return 0.5 * torch.log(one_plus_x / one_minus_x)
35 |
36 | def rsample(self):
37 | raw_action = super().rsample()
38 | action = torch.tanh(raw_action)
39 | return action, raw_action
40 |
41 |
42 | class DiagGaussian(nn.Module):
43 | def __init__(
44 | self,
45 | latent_dim,
46 | output_dim,
47 | unbounded=False,
48 | conditioned_sigma=False,
49 | max_mu=1.0,
50 | sigma_min=-5.0,
51 | sigma_max=2.0
52 | ):
53 | super().__init__()
54 | self.mu = nn.Linear(latent_dim, output_dim)
55 | self._c_sigma = conditioned_sigma
56 | if conditioned_sigma:
57 | self.sigma = nn.Linear(latent_dim, output_dim)
58 | else:
59 | self.sigma_param = nn.Parameter(torch.zeros(output_dim, 1))
60 | self._unbounded = unbounded
61 | self._max = max_mu
62 | self._sigma_min = sigma_min
63 | self._sigma_max = sigma_max
64 |
65 | def forward(self, logits):
66 | mu = self.mu(logits)
67 | if not self._unbounded:
68 | mu = self._max * torch.tanh(mu)
69 | if self._c_sigma:
70 | sigma = torch.clamp(self.sigma(logits), min=self._sigma_min, max=self._sigma_max).exp()
71 | else:
72 | shape = [1] * len(mu.shape)
73 | shape[1] = -1
74 | sigma = (self.sigma_param.view(shape) + torch.zeros_like(mu)).exp()
75 | return NormalWrapper(mu, sigma)
76 |
77 |
78 | class TanhDiagGaussian(DiagGaussian):
79 | def __init__(
80 | self,
81 | latent_dim,
82 | output_dim,
83 | unbounded=False,
84 | conditioned_sigma=False,
85 | max_mu=1.0,
86 | sigma_min=-5.0,
87 | sigma_max=2.0
88 | ):
89 | super().__init__(
90 | latent_dim=latent_dim,
91 | output_dim=output_dim,
92 | unbounded=unbounded,
93 | conditioned_sigma=conditioned_sigma,
94 | max_mu=max_mu,
95 | sigma_min=sigma_min,
96 | sigma_max=sigma_max
97 | )
98 |
99 | def forward(self, logits):
100 | mu = self.mu(logits)
101 | if not self._unbounded:
102 | mu = self._max * torch.tanh(mu)
103 | if self._c_sigma:
104 | sigma = torch.clamp(self.sigma(logits), min=self._sigma_min, max=self._sigma_max).exp()
105 | else:
106 | shape = [1] * len(mu.shape)
107 | shape[1] = -1
108 | sigma = (self.sigma_param.view(shape) + torch.zeros_like(mu)).exp()
109 | return TanhNormalWrapper(mu, sigma)
110 |
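111 | 
112 | if __name__ == "__main__":
113 |     # Illustrative sketch (not part of the original module; sizes are arbitrary):
114 |     # TanhDiagGaussian returns a TanhNormalWrapper whose rsample() yields both the
115 |     # squashed action in (-1, 1) and the raw pre-tanh sample, and whose log_prob()
116 |     # applies the tanh change-of-variables correction.
117 |     dist_net = TanhDiagGaussian(latent_dim=32, output_dim=6, conditioned_sigma=True)
118 |     dist = dist_net(torch.randn(4, 32))
119 |     action, raw_action = dist.rsample()
120 |     log_prob = dist.log_prob(action, raw_action)
121 |     print(action.shape, log_prob.shape)  # expected: torch.Size([4, 6]) torch.Size([4, 1])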
--------------------------------------------------------------------------------
/offlinerl/outside_utils/modules/dynamics_module.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | from torch.nn import functional as F
5 | from typing import Dict, List, Union, Tuple, Optional
6 | from offlinerl.outside_utils.nets import EnsembleLinear
7 |
8 |
9 | class Swish(nn.Module):
10 | def __init__(self) -> None:
11 | super(Swish, self).__init__()
12 |
13 | def forward(self, x: torch.Tensor) -> torch.Tensor:
14 | x = x * torch.sigmoid(x)
15 | return x
16 |
17 |
18 | def soft_clamp(
19 | x : torch.Tensor,
20 | _min: Optional[torch.Tensor] = None,
21 | _max: Optional[torch.Tensor] = None
22 | ) -> torch.Tensor:
23 |     # clamp tensor values while maintaining the gradient
24 | if _max is not None:
25 | x = _max - F.softplus(_max - x)
26 | if _min is not None:
27 | x = _min + F.softplus(x - _min)
28 | return x
29 |
30 |
31 | class EnsembleDynamicsModel(nn.Module):
32 | def __init__(
33 | self,
34 | obs_dim: int,
35 | action_dim: int,
36 | hidden_dims: Union[List[int], Tuple[int]],
37 | num_ensemble: int = 7,
38 | num_elites: int = 5,
39 | activation: nn.Module = Swish,
40 | weight_decays: Optional[Union[List[float], Tuple[float]]] = None,
41 | with_reward: bool = True,
42 | device: str = "cpu"
43 | ) -> None:
44 | super().__init__()
45 |
46 | self.num_ensemble = num_ensemble
47 | self.num_elites = num_elites
48 | self._with_reward = with_reward
49 | self.device = torch.device(device)
50 |
51 | self.activation = activation()
52 |
53 |         if weight_decays is None:
54 |             weight_decays = [0.0] * (len(hidden_dims) + 1)
55 |         assert len(weight_decays) == (len(hidden_dims) + 1)
56 | 
57 |         module_list = []
58 |         hidden_dims = [obs_dim+action_dim] + list(hidden_dims)
59 | for in_dim, out_dim, weight_decay in zip(hidden_dims[:-1], hidden_dims[1:], weight_decays[:-1]):
60 | module_list.append(EnsembleLinear(in_dim, out_dim, num_ensemble, weight_decay))
61 | self.backbones = nn.ModuleList(module_list)
62 |
63 | self.output_layer = EnsembleLinear(
64 | hidden_dims[-1],
65 | 2 * (obs_dim + self._with_reward),
66 | num_ensemble,
67 | weight_decays[-1]
68 | )
69 |
70 | self.register_parameter(
71 | "max_logvar",
72 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * 0.5, requires_grad=True)
73 | )
74 | self.register_parameter(
75 | "min_logvar",
76 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * -10, requires_grad=True)
77 | )
78 |
79 | self.register_parameter(
80 | "elites",
81 | nn.Parameter(torch.tensor(list(range(0, self.num_elites))), requires_grad=False)
82 | )
83 |
84 | self.to(self.device)
85 |
86 |     def forward(self, obs_action: Union[np.ndarray, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
87 | if isinstance(obs_action, np.ndarray):
88 | obs_action = torch.as_tensor(obs_action, dtype=torch.float32).to(self.device)
89 | output = obs_action
90 | for layer in self.backbones:
91 | output = self.activation(layer(output))
92 | mean, logvar = torch.chunk(self.output_layer(output), 2, dim=-1)
93 | logvar = soft_clamp(logvar, self.min_logvar, self.max_logvar)
94 | return mean, logvar
95 |
96 | def load_save(self) -> None:
97 | for layer in self.backbones:
98 | layer.load_save()
99 | self.output_layer.load_save()
100 |
101 | def update_save(self, indexes: List[int]) -> None:
102 | for layer in self.backbones:
103 | layer.update_save(indexes)
104 | self.output_layer.update_save(indexes)
105 |
106 | def get_decay_loss(self) -> torch.Tensor:
107 | decay_loss = 0
108 | for layer in self.backbones:
109 | decay_loss += layer.get_decay_loss()
110 | decay_loss += self.output_layer.get_decay_loss()
111 | return decay_loss
112 |
113 | def set_elites(self, indexes: List[int]) -> None:
114 | assert len(indexes) <= self.num_ensemble and max(indexes) < self.num_ensemble
115 | self.register_parameter('elites', nn.Parameter(torch.tensor(indexes), requires_grad=False))
116 |
117 | def random_elite_idxs(self, batch_size: int) -> np.ndarray:
118 | idxs = np.random.choice(self.elites.data.cpu().numpy(), size=batch_size)
119 | return idxs
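120 | 
121 | 
122 | if __name__ == "__main__":
123 |     # Illustrative shape check (not part of the original module; the dimensions and
124 |     # weight decays below are arbitrary): each of the 7 ensemble members predicts a
125 |     # mean and log-variance over the next-state delta plus the reward (obs_dim + 1).
126 |     model = EnsembleDynamicsModel(obs_dim=11, action_dim=3, hidden_dims=[200, 200], weight_decays=[2.5e-5, 5e-5, 7.5e-5])
127 |     mean, logvar = model(np.random.randn(32, 14).astype(np.float32))
128 |     print(mean.shape, logvar.shape)  # expected: torch.Size([7, 32, 12]) torch.Size([7, 32, 12])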
--------------------------------------------------------------------------------
/offlinerl/outside_utils/modules/ensemble_critic_module.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | from torch.nn import functional as F
5 | from typing import Union, Optional, List, Tuple
6 |
7 | from offlinerl.outside_utils.nets import EnsembleLinear
8 |
9 |
10 | class EnsembleCritic(nn.Module):
11 | def __init__(
12 | self,
13 | obs_dim: int,
14 | action_dim: int,
15 | hidden_dims: Union[List[int], Tuple[int]],
16 | activation: nn.Module = nn.ReLU,
17 | num_ensemble: int = 10,
18 | device: str = "cpu"
19 | ) -> None:
20 | super().__init__()
21 | input_dim = obs_dim + action_dim
22 | hidden_dims = [input_dim] + list(hidden_dims)
23 | model = []
24 | for in_dim, out_dim in zip(hidden_dims[:-1], hidden_dims[1:]):
25 | model += [EnsembleLinear(in_dim, out_dim, num_ensemble), activation()]
26 | model.append(EnsembleLinear(hidden_dims[-1], 1, num_ensemble))
27 | self.model = nn.Sequential(*model)
28 |
29 | self.device = torch.device(device)
30 | self.model = self.model.to(device)
31 | self._num_ensemble = num_ensemble
32 |
33 | def forward(
34 | self,
35 | obs: Union[np.ndarray, torch.Tensor],
36 | actions: Optional[Union[np.ndarray, torch.Tensor]] = None
37 | ) -> torch.Tensor:
38 | obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32)
39 | if actions is not None:
40 | actions = torch.as_tensor(actions, device=self.device, dtype=torch.float32)
41 | obs = torch.cat([obs, actions], dim=-1)
42 | values = self.model(obs)
43 | # values: [num_ensemble, batch_size, 1]
44 | return values
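45 | 
46 | 
47 | if __name__ == "__main__":
48 |     # Illustrative shape check (not part of the original module; sizes are arbitrary):
49 |     # the ensemble critic evaluates every Q head at once and returns one value per member.
50 |     critic = EnsembleCritic(obs_dim=17, action_dim=6, hidden_dims=[256, 256], num_ensemble=10)
51 |     values = critic(np.random.randn(32, 17), np.random.randn(32, 6))
52 |     print(values.shape)  # expected: torch.Size([10, 32, 1])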
--------------------------------------------------------------------------------
/offlinerl/outside_utils/nets/__init__.py:
--------------------------------------------------------------------------------
1 | from offlinerl.outside_utils.nets.mlp import MLP
2 | from offlinerl.outside_utils.nets.vae import VAE
3 | from offlinerl.outside_utils.nets.ensemble_linear import EnsembleLinear
4 | from offlinerl.outside_utils.nets.rnn import RNNModel
5 |
6 |
7 | __all__ = [
8 | "MLP",
9 | "VAE",
10 | "EnsembleLinear",
11 | "RNNModel"
12 | ]
--------------------------------------------------------------------------------
/offlinerl/outside_utils/nets/ensemble_linear.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | from torch.nn import functional as F
5 | from typing import Dict, List, Union, Tuple, Optional
6 |
7 |
8 | class EnsembleLinear(nn.Module):
9 | def __init__(
10 | self,
11 | input_dim: int,
12 | output_dim: int,
13 | num_ensemble: int,
14 | weight_decay: float = 0.0
15 | ) -> None:
16 | super().__init__()
17 |
18 | self.num_ensemble = num_ensemble
19 |
20 | self.register_parameter("weight", nn.Parameter(torch.zeros(num_ensemble, input_dim, output_dim)))
21 | self.register_parameter("bias", nn.Parameter(torch.zeros(num_ensemble, 1, output_dim)))
22 |
23 | nn.init.trunc_normal_(self.weight, std=1/(2*input_dim**0.5))
24 |
25 | self.register_parameter("saved_weight", nn.Parameter(self.weight.detach().clone()))
26 | self.register_parameter("saved_bias", nn.Parameter(self.bias.detach().clone()))
27 |
28 | self.weight_decay = weight_decay
29 |
30 | def forward(self, x: torch.Tensor) -> torch.Tensor:
31 | weight = self.weight
32 | bias = self.bias
33 |
34 | if len(x.shape) == 2:
35 | x = torch.einsum('ij,bjk->bik', x, weight)
36 | else:
37 | x = torch.einsum('bij,bjk->bik', x, weight)
38 |
39 | x = x + bias
40 |
41 | return x
42 |
43 | def load_save(self) -> None:
44 | self.weight.data.copy_(self.saved_weight.data)
45 | self.bias.data.copy_(self.saved_bias.data)
46 |
47 | def update_save(self, indexes: List[int]) -> None:
48 | self.saved_weight.data[indexes] = self.weight.data[indexes]
49 | self.saved_bias.data[indexes] = self.bias.data[indexes]
50 |
51 | def get_decay_loss(self) -> torch.Tensor:
52 | decay_loss = self.weight_decay * (0.5*((self.weight**2).sum()))
53 | return decay_loss
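54 | 
55 | 
56 | if __name__ == "__main__":
57 |     # Illustrative shape check (not part of the original module; sizes are arbitrary):
58 |     # a single 2-D input is broadcast to every ensemble member, so the output gains a
59 |     # leading ensemble dimension.
60 |     layer = EnsembleLinear(input_dim=4, output_dim=3, num_ensemble=7)
61 |     x = torch.randn(16, 4)
62 |     print(layer(x).shape)  # expected: torch.Size([7, 16, 3])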
--------------------------------------------------------------------------------
/offlinerl/outside_utils/nets/mlp.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 |
5 | from torch.nn import functional as F
6 | from typing import Dict, List, Union, Tuple, Optional
7 |
8 |
9 | class MLP(nn.Module):
10 | def __init__(
11 | self,
12 | input_dim: int,
13 | hidden_dims: Union[List[int], Tuple[int]],
14 | output_dim: Optional[int] = None,
15 | activation: nn.Module = nn.ReLU,
16 | dropout_rate: Optional[float] = None
17 | ) -> None:
18 | super().__init__()
19 | hidden_dims = [input_dim] + list(hidden_dims)
20 | model = []
21 | for in_dim, out_dim in zip(hidden_dims[:-1], hidden_dims[1:]):
22 | model += [nn.Linear(in_dim, out_dim), activation()]
23 | if dropout_rate is not None:
24 | model += [nn.Dropout(p=dropout_rate)]
25 |
26 | self.output_dim = hidden_dims[-1]
27 | if output_dim is not None:
28 | model += [nn.Linear(hidden_dims[-1], output_dim)]
29 | self.output_dim = output_dim
30 | self.model = nn.Sequential(*model)
31 |
32 | def forward(self, x: torch.Tensor) -> torch.Tensor:
33 | return self.model(x)
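34 | 
35 | 
36 | if __name__ == "__main__":
37 |     # Illustrative sketch (not part of the original module; sizes are arbitrary): with
38 |     # output_dim=None the MLP ends at the last hidden layer and exposes that width via
39 |     # .output_dim, which is what the actor/critic modules use to attach their own heads.
40 |     net = MLP(input_dim=8, hidden_dims=[64, 64])
41 |     print(net.output_dim)  # expected: 64
42 |     print(net(torch.randn(5, 8)).shape)  # expected: torch.Size([5, 64])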
--------------------------------------------------------------------------------
/offlinerl/outside_utils/nets/rnn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn import functional as F
4 |
5 |
6 | class Swish(nn.Module):
7 | def __init__(self):
8 | super(Swish, self).__init__()
9 |
10 | def forward(self, x):
11 | x = x * torch.sigmoid(x)
12 | return x
13 |
14 |
15 | def soft_clamp(x : torch.Tensor, _min=None, _max=None):
16 |     # clamp tensor values while maintaining the gradient
17 | if _max is not None:
18 | x = _max - F.softplus(_max - x)
19 | if _min is not None:
20 | x = _min + F.softplus(x - _min)
21 | return x
22 |
23 |
24 | class ResBlock(nn.Module):
25 | def __init__(
26 | self,
27 | input_dim,
28 | output_dim,
29 | activation=Swish(),
30 | layer_norm=True,
31 | with_residual=True,
32 | dropout=0.1
33 | ):
34 | super().__init__()
35 |
36 | self.linear = nn.Linear(input_dim, output_dim)
37 | self.activation = activation
38 | self.layer_norm = nn.LayerNorm(output_dim) if layer_norm else None
39 | self.dropout = nn.Dropout(dropout) if dropout else None
40 | self.with_residual = with_residual
41 |
42 | def forward(self, x):
43 | y = self.activation(self.linear(x))
44 | if self.dropout is not None:
45 | y = self.dropout(y)
46 | if self.with_residual:
47 | y = x + y
48 | if self.layer_norm is not None:
49 | y = self.layer_norm(y)
50 | return y
51 |
52 |
53 | class RNNModel(nn.Module):
54 | def __init__(
55 | self,
56 | input_dim,
57 | output_dim,
58 | hidden_dims=[200, 200, 200, 200],
59 | rnn_num_layers=3,
60 | dropout_rate=0.1,
61 | device="cpu"
62 | ):
63 | super().__init__()
64 | self.input_dim = input_dim
65 | self.hidden_dims = hidden_dims
66 | self.output_dim = output_dim
67 | self.device = torch.device(device)
68 |
69 | self.activation = Swish()
70 | self.rnn_layer = nn.GRU(
71 | input_size=input_dim,
72 | hidden_size=hidden_dims[0],
73 | num_layers=rnn_num_layers,
74 | batch_first=True
75 | )
76 | module_list = []
77 | self.input_layer = ResBlock(input_dim, hidden_dims[0], dropout=dropout_rate, with_residual=False)
78 | dims = list(hidden_dims)
79 | for in_dim, out_dim in zip(dims[:-1], dims[1:]):
80 | module_list.append(ResBlock(in_dim, out_dim, dropout=dropout_rate))
81 | self.backbones = nn.ModuleList(module_list)
82 | self.merge_layer = nn.Linear(dims[0] + dims[-1], hidden_dims[0])
83 | self.output_layer = nn.Linear(hidden_dims[-1], output_dim)
84 |
85 | self.to(self.device)
86 |
87 | def forward(self, input, h_state=None):
88 | batch_size, num_timesteps, _ = input.shape
89 | input = torch.as_tensor(input, dtype=torch.float32).to(self.device)
90 | rnn_output, h_state = self.rnn_layer(input, h_state)
91 | rnn_output = rnn_output.reshape(-1, self.hidden_dims[0])
92 | input = input.view(-1, self.input_dim)
93 | output = self.input_layer(input)
94 | output = torch.cat([output, rnn_output], dim=-1)
95 | output = self.activation(self.merge_layer(output))
96 | for layer in self.backbones:
97 | output = layer(output)
98 | output = self.output_layer(output)
99 | output = output.view(batch_size, num_timesteps, -1)
100 | return output, h_state
101 |
102 |
103 | if __name__ == "__main__":
104 | model = RNNModel(14, 12)
105 | x = torch.randn(64, 20, 14)
106 | y, _ = model(x)
107 | print(y.shape)
--------------------------------------------------------------------------------
/offlinerl/outside_utils/nets/vae.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn import functional as F
4 | from typing import Dict, List, Union, Tuple, Optional
5 |
6 |
7 | # Vanilla Variational Auto-Encoder
8 | class VAE(nn.Module):
9 | def __init__(
10 | self,
11 | input_dim: int,
12 | output_dim: int,
13 | hidden_dim: int,
14 | latent_dim: int,
15 | max_action: Union[int, float],
16 | device: str = "cpu"
17 | ) -> None:
18 | super(VAE, self).__init__()
19 | self.e1 = nn.Linear(input_dim + output_dim, hidden_dim)
20 | self.e2 = nn.Linear(hidden_dim, hidden_dim)
21 |
22 | self.mean = nn.Linear(hidden_dim, latent_dim)
23 | self.log_std = nn.Linear(hidden_dim, latent_dim)
24 |
25 | self.d1 = nn.Linear(input_dim + latent_dim, hidden_dim)
26 | self.d2 = nn.Linear(hidden_dim, hidden_dim)
27 | self.d3 = nn.Linear(hidden_dim, output_dim)
28 |
29 | self.max_action = max_action
30 | self.latent_dim = latent_dim
31 | self.device = torch.device(device)
32 |
33 | self.to(device=self.device)
34 |
35 |
36 | def forward(
37 | self,
38 | obs: torch.Tensor,
39 | action: torch.Tensor
40 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
41 | z = F.relu(self.e1(torch.cat([obs, action], 1)))
42 | z = F.relu(self.e2(z))
43 |
44 | mean = self.mean(z)
45 | # Clamped for numerical stability
46 | log_std = self.log_std(z).clamp(-4, 15)
47 | std = torch.exp(log_std)
48 | z = mean + std * torch.randn_like(std)
49 |
50 | u = self.decode(obs, z)
51 |
52 | return u, mean, std
53 |
54 | def decode(self, obs: torch.Tensor, z: Optional[torch.Tensor] = None) -> torch.Tensor:
55 | # When sampling from the VAE, the latent vector is clipped to [-0.5, 0.5]
56 | if z is None:
57 | z = torch.randn((obs.shape[0], self.latent_dim)).to(self.device).clamp(-0.5,0.5)
58 |
59 | a = F.relu(self.d1(torch.cat([obs, z], 1)))
60 | a = F.relu(self.d2(a))
61 | return self.max_action * torch.tanh(self.d3(a))
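62 | 
63 | 
64 | if __name__ == "__main__":
65 |     # Illustrative shape check (not part of the original module; sizes are arbitrary):
66 |     # the VAE encodes (obs, action) pairs into a latent Gaussian and decodes actions
67 |     # bounded by max_action through the final tanh.
68 |     vae = VAE(input_dim=17, output_dim=6, hidden_dim=750, latent_dim=12, max_action=1.0)
69 |     obs, action = torch.randn(8, 17), torch.rand(8, 6) * 2 - 1
70 |     recon, mean, std = vae(obs, action)
71 |     print(recon.shape, mean.shape, std.shape)  # torch.Size([8, 6]) torch.Size([8, 12]) torch.Size([8, 12])
72 |     print(vae.decode(obs).shape)  # torch.Size([8, 6]); the sampled latent is clipped to [-0.5, 0.5]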
--------------------------------------------------------------------------------
/offlinerl/outside_utils/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/outside_utils/utils/__init__.py
--------------------------------------------------------------------------------
/offlinerl/outside_utils/utils/scaler.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os.path as path
3 | import torch
4 |
5 |
6 | class StandardScaler(object):
7 | def __init__(self, mu=None, std=None):
8 | self.mu = mu
9 | self.std = std
10 |
11 | def fit(self, data):
12 |         """Computes the mean and standard deviation of the data and stores them as the
13 |         scaler's internal statistics. Standard deviations below 1e-12 are replaced
14 |         with 1.0 so that transform() never divides by zero.
15 |
16 | Arguments:
17 | data (np.ndarray): A numpy array containing the input
18 |
19 | Returns: None.
20 | """
21 | self.mu = np.mean(data, axis=0, keepdims=True)
22 | self.std = np.std(data, axis=0, keepdims=True)
23 | self.std[self.std < 1e-12] = 1.0
24 |
25 | def transform(self, data):
26 | """Transforms the input matrix data using the parameters of this scaler.
27 |
28 | Arguments:
29 | data (np.array): A numpy array containing the points to be transformed.
30 |
31 | Returns: (np.array) The transformed dataset.
32 | """
33 | if isinstance(data, torch.Tensor):
34 | data = data.cpu().numpy()
35 | return (data - self.mu) / self.std
36 |
37 | def inverse_transform(self, data):
38 | """Undoes the transformation performed by this scaler.
39 |
40 | Arguments:
41 | data (np.array): A numpy array containing the points to be transformed.
42 |
43 |         Returns: (np.array) The data mapped back to the original scale.
44 | """
45 | if isinstance(data, torch.Tensor):
46 | data = data.cpu().numpy()
47 | return self.std * data + self.mu
48 |
49 | def save_scaler(self, save_path, surfix=""):
50 | mu_path = path.join(save_path, surfix+"mu.npy")
51 | std_path = path.join(save_path, surfix+"std.npy")
52 | np.save(mu_path, self.mu)
53 | np.save(std_path, self.std)
54 |
55 | def load_scaler(self, load_path, surfix=""):
56 | mu_path = path.join(load_path, surfix+"mu.npy")
57 | std_path = path.join(load_path, surfix+"std.npy")
58 | self.mu = np.load(mu_path)
59 | self.std = np.load(std_path)
60 |
61 | def transform_tensor(self, data: torch.Tensor):
62 | device = data.device
63 | data = self.transform(data.cpu().numpy())
64 | data = torch.tensor(data, device=device)
65 | return data
66 |
67 | def inverse_transform_to_array(self, data: torch.Tensor):
68 | device = data.device
69 | data = self.inverse_transform(data.cpu().numpy())
70 | # data = torch.tensor(data, device=device)
71 | return data
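72 | 
73 | 
74 | if __name__ == "__main__":
75 |     # Illustrative round-trip sketch (not part of the original module): fit on random
76 |     # data, then check that inverse_transform undoes transform up to floating-point error.
77 |     rng = np.random.default_rng(0)
78 |     data = rng.normal(size=(128, 5))
79 |     scaler = StandardScaler()
80 |     scaler.fit(data)
81 |     recovered = scaler.inverse_transform(scaler.transform(data))
82 |     print(np.allclose(recovered, data))  # expected: True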
--------------------------------------------------------------------------------
/offlinerl/outside_utils/utils/termination_fns.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def obs_unnormalization(termination_fn, obs_mean, obs_std):
4 | def thunk(obs, act, next_obs):
5 | obs = obs*obs_std + obs_mean
6 | next_obs = next_obs*obs_std + obs_mean
7 | return termination_fn(obs, act, next_obs)
8 | return thunk
9 |
10 | def termination_fn_halfcheetah(obs, act, next_obs):
11 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
12 |
13 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1))
14 | done = ~not_done
15 | done = done[:, None]
16 | return done
17 |
18 | def termination_fn_hopper(obs, act, next_obs):
19 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
20 |
21 | height = next_obs[:, 0]
22 | angle = next_obs[:, 1]
23 | not_done = np.isfinite(next_obs).all(axis=-1) \
24 |                 * (np.abs(next_obs[:, 1:]) < 100).all(axis=-1) \
25 | * (height > .7) \
26 | * (np.abs(angle) < .2)
27 |
28 | done = ~not_done
29 | done = done[:,None]
30 | return done
31 |
32 | def termination_fn_halfcheetahveljump(obs, act, next_obs):
33 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
34 |
35 | done = np.array([False]).repeat(len(obs))
36 | done = done[:,None]
37 | return done
38 |
39 | def termination_fn_antangle(obs, act, next_obs):
40 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
41 |
42 | x = next_obs[:, 0]
43 | not_done = np.isfinite(next_obs).all(axis=-1) \
44 | * (x >= 0.2) \
45 | * (x <= 1.0)
46 |
47 | done = ~not_done
48 | done = done[:,None]
49 | return done
50 |
51 | def termination_fn_ant(obs, act, next_obs):
52 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
53 |
54 | x = next_obs[:, 0]
55 | not_done = np.isfinite(next_obs).all(axis=-1) \
56 | * (x >= 0.2) \
57 | * (x <= 1.0)
58 |
59 | done = ~not_done
60 | done = done[:,None]
61 | return done
62 |
63 | def termination_fn_walker2d(obs, act, next_obs):
64 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
65 |
66 | height = next_obs[:, 0]
67 | angle = next_obs[:, 1]
68 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) \
69 | * (height > 0.8) \
70 | * (height < 2.0) \
71 | * (angle > -1.0) \
72 | * (angle < 1.0)
73 | done = ~not_done
74 | done = done[:,None]
75 | return done
76 |
77 | def termination_fn_point2denv(obs, act, next_obs):
78 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
79 |
80 | done = np.array([False]).repeat(len(obs))
81 | done = done[:,None]
82 | return done
83 |
84 | def termination_fn_point2dwallenv(obs, act, next_obs):
85 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
86 |
87 | done = np.array([False]).repeat(len(obs))
88 | done = done[:,None]
89 | return done
90 |
91 | def termination_fn_pendulum(obs, act, next_obs):
92 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
93 |
94 | done = np.zeros((len(obs), 1))
95 | return done
96 |
97 | def termination_fn_humanoid(obs, act, next_obs):
98 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
99 |
100 | z = next_obs[:,0]
101 | done = (z < 1.0) + (z > 2.0)
102 |
103 | done = done[:,None]
104 | return done
105 |
106 | def termination_fn_pen(obs, act, next_obs):
107 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
108 |
109 | obj_pos = next_obs[:, 24:27]
110 | done = obj_pos[:, 2] < 0.075
111 |
112 | done = done[:,None]
113 | return done
114 |
115 | def termination_fn_door(obs, act, next_obs):
116 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
117 |
118 | done = np.array([False] * obs.shape[0])
119 |
120 | done = done[:, None]
121 | return done
122 |
123 | def get_termination_fn(task):
124 | if 'halfcheetahvel' in task:
125 | return termination_fn_halfcheetahveljump
126 | elif 'halfcheetah' in task:
127 | return termination_fn_halfcheetah
128 | elif 'hopper' in task:
129 | return termination_fn_hopper
130 | elif 'antangle' in task:
131 | return termination_fn_antangle
132 | elif 'ant' in task:
133 | return termination_fn_ant
134 | elif 'walker2d' in task:
135 | return termination_fn_walker2d
136 | elif 'point2denv' in task:
137 | return termination_fn_point2denv
138 | elif 'point2dwallenv' in task:
139 | return termination_fn_point2dwallenv
140 | elif 'pendulum' in task:
141 | return termination_fn_pendulum
142 | elif 'humanoid' in task:
143 | return termination_fn_humanoid
144 | elif 'pen' in task:
145 | return termination_fn_pen
146 | elif 'door' in task:
147 |         return termination_fn_door
148 | else:
149 |         raise NotImplementedError(f"no termination function defined for task: {task}")
150 |
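151 | 
152 | if __name__ == "__main__":
153 |     # Illustrative check (not part of the original module; the batch size and
154 |     # observation width below are arbitrary): every termination function expects
155 |     # batched 2-D arrays and returns a (batch, 1) boolean-like array.
156 |     obs, act, next_obs = np.zeros((4, 11)), np.zeros((4, 3)), np.zeros((4, 11))
157 |     done = get_termination_fn("d4rl-hopper-medium-v2")(obs, act, next_obs)
158 |     print(done.shape)  # expected: (4, 1)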
--------------------------------------------------------------------------------
/offlinerl/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/utils/__init__.py
--------------------------------------------------------------------------------
/offlinerl/utils/config.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 |
4 | del_attr = ["function", "module"]
5 |
6 | def parse_config(cfg_module):
7 | args = [ i for i in dir(cfg_module) if not i.startswith("__")]
8 |
9 | config = OrderedDict()
10 | for arg in args:
11 | k = arg
12 | v = getattr(cfg_module, arg)
13 | if type(v).__name__ in del_attr and k != "device":
14 | continue
15 | else:
16 | config[k] = v
17 |
18 |
19 | return config
20 |
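21 | 
22 | if __name__ == "__main__":
23 |     # Illustrative sketch with a throwaway module object (not part of the original
24 |     # module): parse_config collects the public attributes of a config module into an
25 |     # OrderedDict, skipping functions and modules (except an attribute named "device").
26 |     import types
27 |     cfg = types.ModuleType("dummy_cfg")
28 |     cfg.lr = 3e-4
29 |     cfg.batch_size = 256
30 |     cfg.helper = lambda x: x  # dropped, because its type name is "function"
31 |     print(parse_config(cfg))  # OrderedDict([('batch_size', 256), ('lr', 0.0003)])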
--------------------------------------------------------------------------------
/offlinerl/utils/env.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | from typing import Tuple
4 |
5 | def create_terminal_function(terminal_function):
6 |     return lambda: terminal_function
7 |
8 | def get_env(task : str) -> gym.Env:
9 | try:
10 | if task in ['Pipeline', 'Simglucose', 'RocketRecovery', 'RandomFrictionHopper', 'DMSD', 'Fusion', 'Salespromotion', 'SafetyHalfCheetah']:
11 | import neorl2
12 | import gymnasium as gym
13 | env = gym.make(task)
14 | elif task.startswith("HalfCheetah-v3"):
15 | import neorl
16 | env = neorl.make("HalfCheetah-v3")
17 | elif task.startswith("Hopper-v3"):
18 | import neorl
19 | env = neorl.make("Hopper-v3")
20 | elif task.startswith("Walker2d-v3"):
21 | import neorl
22 | env = neorl.make("Walker2d-v3")
23 | elif task.startswith('d4rl'):
24 | import gym
25 | import d4rl
26 | from d4rl import gym_mujoco
27 | env = gym.make(task[5:])
28 | # hack to add terminal function
29 | if 'hopper' in task:
30 | def terminal_function(data : dict):
31 | obs = data["obs"]
32 | action = data["action"]
33 | obs_next = data["next_obs"]
34 |
35 |                     single_done = False
36 |                     if len(obs.shape) == 1:
37 |                         single_done = True
38 | obs = obs.reshape(1, -1)
39 | if len(action.shape) == 1:
40 | action = action.reshape(1, -1)
41 | if len(obs_next.shape) == 1:
42 | obs_next = obs_next.reshape(1, -1)
43 |
44 | if isinstance(obs, np.ndarray):
45 | array_type = np
46 | else:
47 | import torch
48 | array_type = torch
49 |
50 | z = obs_next[:, 0:1]
51 | angle = obs_next[:, 1:2]
52 | states = obs_next[:, 1:]
53 |
54 | min_state, max_state = (-100.0, 100.0)
55 | min_z, max_z = (0.7, float('inf'))
56 | min_angle, max_angle = (-0.2, 0.2)
57 |
58 |                     healthy_state = array_type.all(array_type.logical_and(min_state < states, states < max_state), axis=-1).reshape(-1, 1)
59 | healthy_z = array_type.logical_and(min_z < z, z < max_z)
60 | healthy_angle = array_type.logical_and(min_angle < angle, angle < max_angle)
61 |
62 | is_healthy = array_type.logical_and(array_type.logical_and(healthy_state, healthy_z), healthy_angle)
63 |
64 | done = array_type.logical_not(is_healthy)
65 |
66 |                     if single_done:
67 | done = done
68 | else:
69 | done = done.reshape(-1, 1)
70 | return done
71 |
72 | # env.get_done_func = lambda: terminal_function
73 |                 env.get_done_func = create_terminal_function(terminal_function)
74 | elif 'walker' in task:
75 | def terminal_function(data : dict):
76 |
77 | obs = data["obs"]
78 | action = data["action"]
79 | obs_next = data["next_obs"]
80 |
81 |                     single_done = False
82 |                     if len(obs.shape) == 1:
83 |                         single_done = True
84 | obs = obs.reshape(1, -1)
85 | if len(action.shape) == 1:
86 | action = action.reshape(1, -1)
87 | if len(obs_next.shape) == 1:
88 | obs_next = obs_next.reshape(1, -1)
89 |
90 | if isinstance(obs, np.ndarray):
91 | array_type = np
92 | else:
93 | import torch
94 | array_type = torch
95 |
96 | min_z, max_z = (0.8, 2.0)
97 | min_angle, max_angle = (-1.0, 1.0)
98 | min_state, max_state = (-100.0, 100.0)
99 |
100 | z = obs_next[:, 0:1]
101 | angle = obs_next[:, 1:2]
102 | state = obs_next[:, 2:]
103 |
104 |                     healthy_state = array_type.all(array_type.logical_and(min_state < state, state < max_state), axis=-1).reshape(-1, 1)
105 | healthy_z = array_type.logical_and(min_z < z, z < max_z)
106 | healthy_angle = array_type.logical_and(min_angle < angle, angle < max_angle)
107 | is_healthy = array_type.logical_and(array_type.logical_and(healthy_state, healthy_z), healthy_angle)
108 | done = array_type.logical_not(is_healthy)
109 |
110 |                     if single_done:
111 | done = done
112 | else:
113 | done = done.reshape(-1, 1)
114 |
115 | return done
116 |
117 | # env.get_done_func = lambda: terminal_function
118 |                 env.get_done_func = create_terminal_function(terminal_function)
119 | else:
120 |             import neorl
121 |             env = neorl.make(task.strip().split("-")[0])
122 |     except Exception as e:
123 |         raise NotImplementedError(f"failed to create environment for task: {task}") from e
124 |
125 | return env
126 |
127 | def get_env_shape(task : str) -> Tuple[int, int]:
128 | env = get_env(task)
129 | obs_dim = env.observation_space.shape
130 | action_space = env.action_space
131 |
132 | if len(obs_dim) == 1:
133 | obs_dim = obs_dim[0]
134 |
135 | if hasattr(env.action_space, 'n'):
136 | act_dim = env.action_space.n
137 | else:
138 | act_dim = action_space.shape[0]
139 |
140 | return obs_dim, act_dim
141 |
142 | def get_env_obs_act_spaces(task : str):
143 | env = get_env(task)
144 | obs_space = env.observation_space
145 | act_space = env.action_space
146 | return obs_space, act_space
147 |
148 | def get_env_action_range(task : str) -> Tuple[float, float]:
149 | env = get_env(task)
150 | act_max = float(env.action_space.high[0])
151 | act_min = float(env.action_space.low[0])
152 |
153 | return act_max, act_min
154 |
155 | def get_env_state_range(task : str) -> Tuple[float, float]:
156 | env = get_env(task)
157 | obs_max = float(env.observation_space.high[0])
158 | obs_min = float(env.observation_space.low[0])
159 |
160 | return obs_max, obs_min
--------------------------------------------------------------------------------
/offlinerl/utils/exp.py:
--------------------------------------------------------------------------------
1 | import os
2 | import uuid
3 | import random
4 |
5 |
6 | import torch
7 | import numpy as np
8 | from aim import Run
9 | from loguru import logger
10 |
11 | from offlinerl.utils.logger import log_path
12 |
13 |
14 | def setup_seed(seed=1024):
15 | torch.manual_seed(seed)
16 | torch.cuda.manual_seed_all(seed)
17 | np.random.seed(seed)
18 | random.seed(seed)
19 | torch.backends.cudnn.deterministic = True
20 |
21 | def select_free_cuda():
22 |     # Get the number of available GPUs
23 | num_gpus = torch.cuda.device_count()
24 |
25 | if num_gpus == 0:
26 | print("No GPU available.")
27 | return None
28 |
29 |     # Iterate over all GPUs and pick the one with the least memory allocated (as seen by this process)
30 | min_memory_usage = float('inf')
31 | selected_gpu_id = None
32 |
33 | for gpu_id in range(num_gpus):
34 | torch.cuda.set_device(gpu_id)
35 | gpu_memory_usage = torch.cuda.max_memory_allocated() / 1024**3 # in GB
36 |         # keep the GPU with the lowest recorded memory usage
37 | if gpu_memory_usage < min_memory_usage:
38 | min_memory_usage = gpu_memory_usage
39 | selected_gpu_id = gpu_id
40 |
41 | return selected_gpu_id
42 |
43 | def set_free_device_fn():
44 | device = 'cuda'+":"+str(select_free_cuda()) if torch.cuda.is_available() else 'cpu'
45 |
46 | return device
47 |
48 |
49 | def init_exp_run(repo=None, experiment_name=None, flush_frequency=1):
50 | if repo is None:
51 | repo = os.path.join(log_path(),"./.aim")
52 | if not os.path.exists(repo):
53 | print(f'=====repo:{repo}')
54 |             logger.info('{} does not exist, creating {}', repo, repo)
55 | os.system(str("cd " + os.path.join(repo,"../") + "&& aim init"))
56 | else:
57 | repo = os.path.join(repo,"./.aim")
58 | if not os.path.exists(repo):
59 | print(f'=====repo:{repo}')
60 |             logger.info('{} does not exist, creating {}', repo, repo)
61 | os.system(str("cd " + os.path.join(repo,"../") + "&& aim init"))
62 | run = Run(
63 | repo=repo,
64 | experiment=experiment_name
65 | )
66 |
67 | return run
--------------------------------------------------------------------------------
/offlinerl/utils/flexible_replay_pool.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import pickle
3 |
4 | import numpy as np
5 |
6 | from .replay_pool import ReplayPool
7 |
8 |
9 | class FlexibleReplayPool(ReplayPool):
10 | def __init__(self, max_size, fields_attrs, obs_filter=False, modify_rew=False):
11 | super(FlexibleReplayPool, self).__init__()
12 |
13 | max_size = int(max_size)
14 | self._max_size = max_size
15 |
16 | self.fields = {}
17 | self.fields_attrs = {}
18 |
19 | self.add_fields(fields_attrs)
20 |
21 | self.obs_filter = obs_filter
22 | self.modify_rew = modify_rew
23 |
24 | self._pointer = 0
25 | self._size = 0
26 | self._samples_since_save = 0
27 |
28 | @property
29 | def size(self):
30 | return self._size
31 |
32 | @property
33 | def field_names(self):
34 | return list(self.fields.keys())
35 |
36 | def add_fields(self, fields_attrs):
37 | self.fields_attrs.update(fields_attrs)
38 |
39 | for field_name, field_attrs in fields_attrs.items():
40 | field_shape = (self._max_size, *field_attrs['shape'])
41 | initializer = field_attrs.get('initializer', np.zeros)
42 | self.fields[field_name] = initializer(
43 | field_shape, dtype=field_attrs['dtype'])
44 |
45 | def _advance(self, count=1):
46 | self._pointer = (self._pointer + count) % self._max_size
47 | self._size = min(self._size + count, self._max_size)
48 | self._samples_since_save += count
49 |
50 | def add_sample(self, sample):
51 | samples = {
52 | key: value[None, ...]
53 | for key, value in sample.items()
54 | }
55 | self.add_samples(samples)
56 |
57 | def add_samples(self, samples):
58 | # if 'infos' not in samples:
59 | # samples['infos'] = {}
60 | field_names = list(samples.keys())
61 | num_samples = samples[field_names[0]].shape[0]
62 | index = np.arange(
63 | self._pointer, self._pointer + num_samples) % self._max_size
64 | for field_name in self.field_names:
65 | # print(field_name)
66 | default_value = (
67 | self.fields_attrs[field_name].get('default_value', 0.0))
68 | values = samples.get(field_name, default_value)
69 | if field_name not in samples.keys() and 'infos' in samples and field_name in samples['infos'][0].keys():
70 | values = np.expand_dims(np.array([samples['infos'][i].get(field_name, default_value) for i in range(num_samples)]), axis=1)
71 | try:
72 | assert values.shape[0] == num_samples, f'value shape: {values.shape[0]}, expected: {num_samples}'
73 | if isinstance(values[0], dict):
74 | values = np.stack([np.concatenate([
75 | value[key]
76 | for key in value.keys()
77 | ], axis=-1) for value in values])
78 | self.fields[field_name][index] = values
79 | except Exception as e:
80 | import traceback
81 | traceback.print_exc(limit=10)
82 | print('[ DEBUG ] errors occurs: {}'.format(e))
83 |
84 | import pdb; pdb.set_trace()
85 | self._advance(num_samples)
86 |
87 | def restore_samples(self, samples):
88 | num_samples = samples[list(samples.keys())[0]].shape[0]
89 | index = np.arange(
90 | 0, num_samples) % self._max_size
91 | for key, values in samples.items():
92 | assert key in self.field_names
93 | self.fields[key][index] = values
94 |
95 | def random_indices(self, batch_size):
96 | if self._size == 0: return np.arange(0, 0)
97 | return np.random.randint(0, self._size, batch_size)
98 |
99 | def random_batch(self, batch_size, field_name_filter=None, **kwargs):
100 | random_indices = self.random_indices(batch_size)
101 | return self.batch_by_indices(
102 | random_indices, field_name_filter=field_name_filter, **kwargs)
103 |
104 | def last_n_batch(self, last_n, field_name_filter=None, **kwargs):
105 | last_n_indices = np.arange(
106 | self._pointer - min(self.size, last_n), self._pointer
107 | ) % self._max_size
108 | return self.batch_by_indices(
109 | last_n_indices, field_name_filter=field_name_filter, **kwargs)
110 |
111 | def filter_fields(self, field_names, field_name_filter):
112 | if isinstance(field_name_filter, str):
113 | field_name_filter = [field_name_filter]
114 |
115 | if isinstance(field_name_filter, (list, tuple)):
116 | field_name_list = field_name_filter
117 |
118 | def filter_fn(field_name):
119 | return field_name in field_name_list
120 |
121 | else:
122 | filter_fn = field_name_filter
123 |
124 | filtered_field_names = [
125 | field_name for field_name in field_names
126 | if filter_fn(field_name)
127 | ]
128 |
129 | return filtered_field_names
130 |
131 | def batch_by_indices(self, indices, field_name_filter=None):
132 | if np.any(indices % self._max_size > self.size):
133 | raise ValueError(
134 | "Tried to retrieve batch with indices greater than current"
135 | " size")
136 |
137 | field_names = self.field_names
138 | if field_name_filter is not None:
139 | field_names = self.filter_fields(
140 | field_names, field_name_filter)
141 |
142 | return {
143 | field_name: self.fields[field_name][indices]
144 | for field_name in field_names
145 | }
146 |
147 | def save_latest_experience(self, pickle_path):
148 | latest_samples = self.last_n_batch(self._samples_since_save)
149 |
150 | with gzip.open(pickle_path, 'wb') as f:
151 | pickle.dump(latest_samples, f)
152 |
153 | self._samples_since_save = 0
154 |
155 | def load_experience(self, experience_path):
156 | with gzip.open(experience_path, 'rb') as f:
157 | latest_samples = pickle.load(f)
158 |
159 | key = list(latest_samples.keys())[0]
160 | num_samples = latest_samples[key].shape[0]
161 | for field_name, data in latest_samples.items():
162 | assert data.shape[0] == num_samples, data.shape
163 |
164 | self.add_samples(latest_samples)
165 | self._samples_since_save = 0
166 |
167 | def return_all_samples(self):
168 | return {
169 | field_name: self.fields[field_name][:self.size]
170 | for field_name in self.field_names
171 | }
172 |
173 | def __getstate__(self):
174 | state = self.__dict__.copy()
175 | state['fields'] = {
176 | field_name: self.fields[field_name][:self.size]
177 | for field_name in self.field_names
178 | }
179 |
180 | return state
181 |
182 | def __setstate__(self, state):
183 | if state['_size'] < state['_max_size']:
184 | pad_size = state['_max_size'] - state['_size']
185 | for field_name in state['fields'].keys():
186 | field_shape = state['fields_attrs'][field_name]['shape']
187 | state['fields'][field_name] = np.concatenate((
188 | state['fields'][field_name],
189 | np.zeros((pad_size, *field_shape))
190 | ), axis=0)
191 |
192 | self.__dict__ = state
193 |
--------------------------------------------------------------------------------
/offlinerl/utils/function.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.functional import F
3 |
4 | def soft_clamp(x : torch.Tensor, _min=None, _max=None):
5 |     # clamp tensor values while maintaining the gradient
6 | if _max is not None:
7 | x = _max - F.softplus(_max - x)
8 | if _min is not None:
9 | x = _min + F.softplus(x - _min)
10 | return x
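11 | 
12 | 
13 | if __name__ == "__main__":
14 |     # Illustrative sketch (not part of the original module): unlike torch.clamp,
15 |     # soft_clamp keeps a nonzero gradient outside the bounds, which matters when the
16 |     # bounds themselves are learnable (e.g. the max_logvar / min_logvar parameters).
17 |     x = torch.tensor(5.0, requires_grad=True)
18 |     y = soft_clamp(x, _min=torch.tensor(-10.0), _max=torch.tensor(1.0))
19 |     y.backward()
20 |     print(y.item(), x.grad.item())  # y lands just below 1.0, and the gradient is small but nonzero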
--------------------------------------------------------------------------------
/offlinerl/utils/io.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import pickle
4 | import urllib
5 | import urllib.request
6 | from tqdm import tqdm
7 |
8 | def read_json(file_path):
9 | with open(file_path, 'r') as f:
10 | data = json.load(f)
11 |
12 | return data
13 |
14 |
15 | def load_pkl(file_path):
16 | assert os.path.exists(file_path)
17 | with open(file_path, 'rb') as handle:
18 | data = pickle.load(handle)
19 |
20 | return data
21 |
22 | def save_pkl(data, file_path):
23 | with open(file_path, 'wb') as handle:
24 | pickle.dump(data, handle)
25 |
26 |
27 | def del_dir(dir_path):
28 | os.removedirs(dir_path)
29 |
30 | def create_dir(dir_path, cover=False):
31 | if cover or not os.path.exists(dir_path):
32 | if cover and os.path.exists(dir_path):
33 | os.removedirs(dir_path)
34 | os.makedirs(dir_path)
35 |
36 |
37 | def save_video(video_array, video_save_path):
38 | import cv2
39 | fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
40 | output_movie = cv2.VideoWriter(video_save_path, fourcc, 10, (640, 360))
41 |
42 | for frame in video_array:
43 | output_movie.write(frame)
44 |
45 |     output_movie.release()
46 | cv2.destroyAllWindows()
47 |
48 | def download_helper(url, filename):
49 |     'Download file from given url. Modified from `torchvision.datasets.utils`'
50 | def gen_bar_updater():
51 | pbar = tqdm(total=None)
52 |
53 | def bar_update(count, block_size, total_size):
54 | if pbar.total is None and total_size:
55 | pbar.total = total_size
56 | progress_bytes = count * block_size
57 | pbar.update(progress_bytes - pbar.n)
58 |
59 | return bar_update
60 |
61 | try:
62 | print('Downloading ' + url + ' to ' + filename)
63 | urllib.request.urlretrieve(
64 | url, filename,
65 | reporthook=gen_bar_updater()
66 | )
67 |
68 | return True
69 | except (urllib.error.URLError, IOError) as e:
70 | if url[:5] == 'https':
71 | url = url.replace('https:', 'http:')
72 | print('Failed download. Trying https -> http instead.'
73 | ' Downloading ' + url + ' to ' + filename)
74 | urllib.request.urlretrieve(
75 | url, filename,
76 | reporthook=gen_bar_updater()
77 | )
78 |
79 | return True
80 | else:
81 | raise e
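82 | 
83 | 
84 | if __name__ == "__main__":
85 |     # Illustrative round trip (not part of the original module; writes only to a
86 |     # temporary directory):
87 |     import tempfile
88 |     with tempfile.TemporaryDirectory() as tmp:
89 |         pkl_path = os.path.join(tmp, "demo.pkl")
90 |         save_pkl({"a": 1}, pkl_path)
91 |         print(load_pkl(pkl_path))  # expected: {'a': 1}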
--------------------------------------------------------------------------------
/offlinerl/utils/logger.py:
--------------------------------------------------------------------------------
1 | import os
2 | import uuid
3 | import aim
4 |
5 | from offlinerl.utils.io import create_dir
6 |
7 | def log_path():
8 | import offlinerl
9 | log_path = os.path.abspath(os.path.join(offlinerl.__file__,"../../","offlinerl_tmp"))
10 |
11 | create_dir(log_path)
12 |
13 | return log_path
14 |
15 | """
16 | class exp_logger():
17 | def __init__(self, experiment_name=None,flush_frequency=1):
18 | print("experiment_name:",experiment_name)
19 | self.aim_logger = aim.Session(experiment=experiment_name, flush_frequency=flush_frequency)
20 |
21 | def log_hparams(self, hparams_dict):
22 | self.aim_logger.set_params(hparams_dict, name='hparams')
23 | """
--------------------------------------------------------------------------------
/offlinerl/utils/net/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/utils/net/__init__.py
--------------------------------------------------------------------------------
/offlinerl/utils/net/bcq_net.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | from offlinerl.utils.net.common import BasePolicy
8 |
9 |
10 | # Used for Atari
11 | class Conv_Q(nn.Module):
12 | def __init__(self, frames, num_actions):
13 | super(Conv_Q, self).__init__()
14 | self.c1 = nn.Conv2d(frames, 32, kernel_size=8, stride=4)
15 | self.c2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
16 | self.c3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
17 |
18 | self.q1 = nn.Linear(3136, 512)
19 | self.q2 = nn.Linear(512, 16)
20 | self.q3 = nn.Linear(16, num_actions)
21 |
22 | self.i1 = nn.Linear(3136, 512)
23 | self.i2 = nn.Linear(512, 16)
24 | self.i3 = nn.Linear(16, num_actions)
25 |
26 |
27 | def forward(self, state):
28 | c = F.relu(self.c1(state))
29 | c = F.relu(self.c2(c))
30 | c = F.relu(self.c3(c))
31 |
32 | q = F.relu(self.q1(c.reshape(-1, 3136)))
33 | q = F.relu(self.q2(q))
34 | q = self.q3(q)
35 |
36 | i = F.relu(self.i1(c.reshape(-1, 3136)))
37 | i = F.relu(self.i2(i))
38 | i = self.i3(i)
39 | return q, F.log_softmax(i, dim=1), i
40 |
41 | def encode(self, state):
42 | with torch.no_grad():
43 | c = F.relu(self.c1(state))
44 | c = F.relu(self.c2(c))
45 | c = F.relu(self.c3(c))
46 |
47 | q = F.relu(self.q1(c.reshape(-1, 3136)))
48 | q = F.relu(self.q2(q))
49 |
50 | i = F.relu(self.i1(c.reshape(-1, 3136)))
51 | i = F.relu(self.i2(i))
52 | return i
53 |
54 |
55 |
56 | # Used for Box2D / Toy problems
57 | class FC_Q(nn.Module, BasePolicy):
58 | def __init__(self, state_dim, num_actions):
59 | super(FC_Q, self).__init__()
60 | self.q1 = nn.Linear(state_dim, 256)
61 | self.q2 = nn.Linear(256, 256)
62 | self.q3 = nn.Linear(256, num_actions)
63 |
64 | self.i1 = nn.Linear(state_dim, 256)
65 | self.i2 = nn.Linear(256, 256)
66 | self.i3 = nn.Linear(256, num_actions)
67 |
68 |
69 | def forward(self, state):
70 | q = F.relu(self.q1(state))
71 | q = F.relu(self.q2(q))
72 |
73 | i = F.relu(self.i1(state))
74 | i = F.relu(self.i2(i))
75 | i = F.relu(self.i3(i))
76 | return self.q3(q), F.log_softmax(i, dim=1), i
77 |
78 | def policy_infer(self, obs):
79 |
80 | q, imt, i = self(obs)
81 | imt = imt.exp()
82 | imt = (imt/imt.max(1, keepdim=True)[0] > 0.3).float()
83 | # Use large negative number to mask actions from argmax
84 |
85 | return (imt * q + (1. - imt) * -1e8).argmax(1)
86 |
87 |
--------------------------------------------------------------------------------
/offlinerl/utils/net/maple_actor.py:
--------------------------------------------------------------------------------
1 | import torch.nn
2 | import torch.nn as nn
3 | import numpy as np
4 | import torch.nn.functional as F
5 | from offlinerl.utils.net.common import miniblock
6 |
7 |
8 | class Maple_actor(nn.Module):
9 | def __init__(self, obs_dim, action_dim, deterministic=False, hidden_sizes=(16,), Guassain_hidden_sizes=(256,256), max_traj_len=5, LOG_MAX_STD=2, LOG_MIN_STD=-20, EPS=1e-8, lstm_hidden_unit=128):
10 | super(Maple_actor,self).__init__()
11 | self.obs_dim = obs_dim
12 | self.deterministic = deterministic
13 | self.act_dim = action_dim
14 | self.hidden_sizes = list(hidden_sizes).copy()
15 | self.Guassain_hidden_sizes = list(Guassain_hidden_sizes).copy()
16 | self.max_traj_len = max_traj_len
17 | self.LOG_MAX_STD = LOG_MAX_STD
18 | self.LOG_MIN_STD = LOG_MIN_STD
19 | self.EPS = EPS
20 | self.lstm_hidden_unit = lstm_hidden_unit
21 | self.mlp = miniblock(lstm_hidden_unit, hidden_sizes[0], None, relu=False)
22 | if len(hidden_sizes) >= 2:
23 | for i in range(1,len(hidden_sizes)):
24 | self.mlp += miniblock(hidden_sizes[i-1], hidden_sizes[i], None)
25 | self.mlp = nn.Sequential(*self.mlp)
26 | self.Guassain_input_dim = self.hidden_sizes[-1] + self.obs_dim
27 | self.Guassain_mlp = miniblock(self.Guassain_input_dim, self.Guassain_hidden_sizes[0], None)
28 | if len(Guassain_hidden_sizes)>=2:
29 | for i in range(1,len(Guassain_hidden_sizes)):
30 | self.Guassain_mlp += miniblock(Guassain_hidden_sizes[i-1], Guassain_hidden_sizes[i], None)
31 | self.Guassain_mlp = nn.Sequential(*self.Guassain_mlp)
32 | self.Guassain_mu_mlp = [nn.Linear(self.Guassain_hidden_sizes[-1], action_dim)]
33 | self.Guassain_logstd_mlp = [nn.Linear(self.Guassain_hidden_sizes[-1], action_dim)]
34 | self.Guassain_mu_mlp = nn.Sequential(*self.Guassain_mu_mlp)
35 | self.Guassain_logstd_mlp = nn.Sequential(*self.Guassain_logstd_mlp)
36 | def gaussian_likelihood(self,x, mu, log_std):
37 | pre_sum = -0.5 * (((x - mu) / (torch.exp(log_std) + self.EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi))
38 | return torch.sum(pre_sum, dim=-1)
39 |
40 | def forward(self, hidden_policy, obs):
41 | policy_out = self.mlp(hidden_policy)
42 | policy_z = torch.cat([policy_out, obs], dim=-1)
43 | out = self.Guassain_mlp(policy_z)
44 | mu = self.Guassain_mu_mlp(out)
45 | log_std = self.Guassain_logstd_mlp(out)
46 | log_std = torch.clip(log_std, self.LOG_MIN_STD, self.LOG_MAX_STD)
47 | std = torch.exp(log_std)
48 | acts = torch.distributions.Normal(torch.zeros_like(mu),torch.ones_like(std)).sample()*std + mu
49 | log_p_acts = self.gaussian_likelihood(acts, mu, log_std)
50 | mu, acts, log_p_acts = self.apply_squashing_func(mu, acts, log_p_acts)
51 | return mu, acts, log_p_acts, std
52 |
53 | def apply_squashing_func(self, mu, pi, logp_pi):
54 | logp_pi -= torch.sum(2 * (np.log(2) - pi - F.softplus(-2 * pi)), dim=-1)
55 | # Squash those unbounded actions!
56 | mu = torch.tanh(mu)
57 | pi = torch.tanh(pi)
58 | return mu, pi, logp_pi
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/offlinerl/utils/net/mlas.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from offlinerl.utils.net.common import BasePolicy
6 |
7 | class VAE(nn.Module, BasePolicy):
8 | def __init__(self,
9 | state_dim,
10 | action_dim,
11 | latent_dim,
12 | max_action,
13 | hidden_size=750):
14 | super(VAE, self).__init__()
15 |
16 | self.e1 = nn.Linear(state_dim + action_dim, hidden_size)
17 | self.e2 = nn.Linear(hidden_size, hidden_size)
18 |
19 | self.mean = nn.Linear(hidden_size, latent_dim)
20 | self.log_std = nn.Linear(hidden_size, latent_dim)
21 |
22 | self.d1 = nn.Linear(state_dim + latent_dim, hidden_size)
23 | self.d2 = nn.Linear(hidden_size, hidden_size)
24 | self.d3 = nn.Linear(hidden_size, action_dim)
25 |
26 | self.max_action = max_action
27 | self.latent_dim = latent_dim
28 |
29 | self._actor = None
30 |
31 | def forward(self, state, action):
32 | z = F.relu(self.e1(torch.cat([state, action], 1)))
33 | z = F.relu(self.e2(z))
34 |
35 | mean = self.mean(z)
36 | # Clamped for numerical stability
37 | log_std = self.log_std(z).clamp(-4, 15)
38 | std = torch.exp(log_std)
39 | z = mean + std * torch.randn_like(std)
40 |
41 | u = self.decode(state, z)
42 |
43 | return u, mean, std
44 |
45 | def decode(self, state, z=None, clip=None, raw=False):
46 |         # If z is not given, sample it from N(0, 1); the optional clip argument bounds it (e.g. to [-0.5, 0.5] as in BCQ)
47 | if z is None:
48 | z = torch.randn((state.shape[0], self.latent_dim)).to(state.device)
49 | if clip is not None:
50 | z = z.clamp(-clip, clip)
51 |
52 | a = F.relu(self.d1(torch.cat([state, z], 1)))
53 | a = F.relu(self.d2(a))
54 | a = self.d3(a)
55 | if raw:
56 | return a
57 | return self.max_action * torch.tanh(a)
58 |
59 | def policy_infer(self, obs):
60 | return self.decode(obs, z=self._actor(obs)[0])
61 |
62 | class ActorPerturbation(nn.Module, BasePolicy):
63 | def __init__(self, state_dim, action_dim, latent_action_dim, max_action, max_latent_action=2, phi=0.05):
64 | super(ActorPerturbation, self).__init__()
65 |
66 | self.hidden_size = (400, 300, 400, 300)
67 |
68 | self.l1 = nn.Linear(state_dim, self.hidden_size[0])
69 | self.l2 = nn.Linear(self.hidden_size[0], self.hidden_size[1])
70 | self.l3 = nn.Linear(self.hidden_size[1], latent_action_dim)
71 |
72 | self.l4 = nn.Linear(state_dim + action_dim, self.hidden_size[2])
73 | self.l5 = nn.Linear(self.hidden_size[2], self.hidden_size[3])
74 | self.l6 = nn.Linear(self.hidden_size[3], action_dim)
75 |
76 | self.max_latent_action = max_latent_action
77 | self.max_action = max_action
78 | self.phi = phi
79 |
80 | self.vae = None
81 |
82 | def forward(self, state, decoder):
83 | a = F.relu(self.l1(state))
84 | a = F.relu(self.l2(a))
85 | latent_action = self.max_latent_action * torch.tanh(self.l3(a))
86 |
87 | mid_action = decoder(state, z=latent_action)
88 |
89 | a = F.relu(self.l4(torch.cat([state, mid_action], 1)))
90 | a = F.relu(self.l5(a))
91 | a = self.phi * torch.tanh(self.l6(a))
92 | final_action = (a + mid_action).clamp(-self.max_action, self.max_action)
93 | return latent_action, mid_action, final_action
94 |
95 | def policy_infer(self, obs):
96 |
97 | return self(obs, self.vae.decode)[-1]
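
A minimal forward-pass sketch for the PLAS-style modules above (not part of the repository; it assumes the offlinerl package is importable and uses made-up dimensions):

import torch
from offlinerl.utils.net.mlas import VAE, ActorPerturbation

state_dim, action_dim, latent_dim = 17, 6, 12            # made-up dimensions
vae = VAE(state_dim, action_dim, latent_dim, max_action=1.0)
actor = ActorPerturbation(state_dim, action_dim, latent_action_dim=latent_dim,
                          max_action=1.0, max_latent_action=2, phi=0.05)
actor.vae = vae                                          # decoder used by policy_infer

state = torch.randn(32, state_dim)
action = torch.randn(32, action_dim).clamp(-1, 1)

recon, mean, std = vae(state, action)                    # encode/decode pass, recon: [32, 6]
latent_a, mid_a, final_a = actor(state, vae.decode)      # perturbed action in [-1, 1], final_a: [32, 6]
greedy_a = actor.policy_infer(state)                     # same path, via the wired VAE decoder
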
--------------------------------------------------------------------------------
/offlinerl/utils/net/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/polixir/OfflineRL/ea1a446b210d3782e61e559b68306b15b349e9ef/offlinerl/utils/net/model/__init__.py
--------------------------------------------------------------------------------
/offlinerl/utils/net/model/ensemble.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from offlinerl.utils.function import soft_clamp
5 | from offlinerl.utils.net.common import Swish
6 |
7 | class EnsembleLinear(torch.nn.Module):
8 | def __init__(self, in_features, out_features, ensemble_size=7):
9 | super().__init__()
10 |
11 | self.ensemble_size = ensemble_size
12 |
13 | self.register_parameter('weight', torch.nn.Parameter(torch.zeros(ensemble_size, in_features, out_features)))
14 | self.register_parameter('bias', torch.nn.Parameter(torch.zeros(ensemble_size, 1, out_features)))
15 |
16 | torch.nn.init.trunc_normal_(self.weight, std=1/(2*in_features**0.5))
17 |
18 | self.register_parameter('saved_weight', torch.nn.Parameter(self.weight.detach().clone()))
19 | self.register_parameter('saved_bias', torch.nn.Parameter(self.bias.detach().clone()))
20 |
21 | self.select = list(range(0, self.ensemble_size))
22 |
23 | def forward(self, x):
24 | weight = self.weight[self.select]
25 | bias = self.bias[self.select]
26 |
27 | if len(x.shape) == 2:
28 | x = torch.einsum('ij,bjk->bik', x, weight)
29 | else:
30 | x = torch.einsum('bij,bjk->bik', x, weight)
31 |
32 | x = x + bias
33 |
34 | return x
35 |
36 | def set_select(self, indexes):
37 | assert len(indexes) <= self.ensemble_size and max(indexes) < self.ensemble_size
38 | self.select = indexes
39 | self.weight.data[indexes] = self.saved_weight.data[indexes]
40 | self.bias.data[indexes] = self.saved_bias.data[indexes]
41 |
42 | def update_save(self, indexes):
43 | self.saved_weight.data[indexes] = self.weight.data[indexes]
44 | self.saved_bias.data[indexes] = self.bias.data[indexes]
45 |
46 | class EnsembleTransition(torch.nn.Module):
47 | def __init__(self, obs_dim, action_dim, hidden_features, hidden_layers, ensemble_size=7, mode='local', with_reward=True):
48 | super().__init__()
49 | self.obs_dim = obs_dim
50 | self.mode = mode
51 | self.with_reward = with_reward
52 | self.ensemble_size = ensemble_size
53 |
54 | self.activation = Swish()
55 |
56 | module_list = []
57 | for i in range(hidden_layers):
58 | if i == 0:
59 | module_list.append(EnsembleLinear(obs_dim + action_dim, hidden_features, ensemble_size))
60 | else:
61 | module_list.append(EnsembleLinear(hidden_features, hidden_features, ensemble_size))
62 | self.backbones = torch.nn.ModuleList(module_list)
63 |
64 | self.output_layer = EnsembleLinear(hidden_features, 2 * (obs_dim + self.with_reward), ensemble_size)
65 | self.obs_mean = None
66 | self.obs_std = None
67 | self.register_parameter('max_logstd', torch.nn.Parameter(torch.ones(obs_dim + self.with_reward) * 1, requires_grad=True))
68 | self.register_parameter('min_logstd', torch.nn.Parameter(torch.ones(obs_dim + self.with_reward) * -5, requires_grad=True))
69 |
70 | def update_self(self, obs):
71 | self.obs_mean = obs.mean(dim=0)
72 | self.obs_std = obs.std(dim=0)
73 |
74 | def forward(self, obs_action):
75 |         # In 'normalize' mode, standardize the obs part of the input; the residual added below still uses the raw obs.
76 | # use 'dims' to make forward work both when training and evaluating
77 | dims = len(obs_action.shape) - 2 # dim == 0: eval, dim == 1: train
78 | if self.obs_mean is not None:
79 | if dims == 1:
80 | obs_mean = self.obs_mean.unsqueeze(0).expand(obs_action.shape[0], -1).to(obs_action.device)
81 | obs_std = self.obs_std.unsqueeze(0).expand(obs_action.shape[0], -1).to(obs_action.device)
82 | else:
83 | obs_mean = self.obs_mean.to(obs_action.device)
84 | obs_std = self.obs_std.to(obs_action.device)
85 |
86 | if self.mode == 'normalize':
87 | batch_size = obs_action.shape[dims]
88 | obs, action = torch.split(obs_action, [self.obs_dim, obs_action.shape[-1] - self.obs_dim], dim=-1)
89 | if dims == 1:
90 | obs = obs - obs_mean.unsqueeze(dims).expand(-1, batch_size, -1)
91 | obs = obs / (obs_std.unsqueeze(dims).expand(-1, batch_size, -1) + 1e-8)
92 | else:
93 | obs = obs - obs_mean.unsqueeze(dims).expand(batch_size, -1)
94 | obs = obs / (obs_std.unsqueeze(dims).expand(batch_size, -1) + 1e-8)
95 | output = torch.cat([obs, action], dim=-1)
96 | else:
97 | output = obs_action
98 | else:
99 | output = obs_action
100 |
101 | for layer in self.backbones:
102 | output = self.activation(layer(output))
103 | mu, logstd = torch.chunk(self.output_layer(output), 2, dim=-1)
104 | logstd = soft_clamp(logstd, self.min_logstd, self.max_logstd)
105 |         # 'local' and 'normalize': predict a delta and add the raw obs as a residual
106 | if self.mode == 'local' or self.mode == 'normalize':
107 | if self.with_reward:
108 | obs, reward = torch.split(mu, [self.obs_dim, 1], dim=-1)
109 | obs = obs + obs_action[..., :self.obs_dim]
110 | mu = torch.cat([obs, reward], dim=-1)
111 | else:
112 | mu = mu + obs_action[..., :self.obs_dim]
113 | return torch.distributions.Normal(mu, torch.exp(logstd))
114 |
115 | def set_select(self, indexes):
116 | self.elites = indexes
117 | for layer in self.backbones:
118 | layer.set_select(indexes)
119 | self.output_layer.set_select(indexes)
120 |
121 | def update_save(self, indexes):
122 | for layer in self.backbones:
123 | layer.update_save(indexes)
124 | self.output_layer.update_save(indexes)
125 |
126 | def random_elite_idxs(self, batch_size: int) -> np.ndarray:
127 | idxs = np.random.choice(len(self.elites), size=batch_size)
128 | return idxs
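
A shape-checking sketch for EnsembleTransition (illustrative only, not from the repository; dimensions are made up and the offlinerl package is assumed importable):

import torch
from offlinerl.utils.net.model.ensemble import EnsembleTransition

obs_dim, action_dim, batch = 11, 3, 256                  # made-up dimensions
transition = EnsembleTransition(obs_dim, action_dim, hidden_features=200,
                                hidden_layers=4, ensemble_size=7, mode='local')

obs = torch.randn(batch, obs_dim)
act = torch.randn(batch, action_dim)
transition.update_self(obs)                              # store obs mean/std (only used in 'normalize' mode)

dist = transition(torch.cat([obs, act], dim=-1))         # Normal over [next_obs, reward] per ensemble member
sample = dist.sample()                                   # [7, 256, obs_dim + 1]
next_obs, reward = sample[..., :obs_dim], sample[..., -1:]

transition.set_select([0, 2, 4])                         # keep only elite members in later forward passes
elite_sample = transition(torch.cat([obs, act], dim=-1)).sample()   # [3, 256, obs_dim + 1]
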
--------------------------------------------------------------------------------
/offlinerl/utils/net/model/maple_critic.py:
--------------------------------------------------------------------------------
1 | import torch.nn
2 | import torch.nn as nn
3 | import numpy as np
4 | import torch.nn.functional as F
5 | from offlinerl.utils.net.common import miniblock
6 |
7 |
8 | class Maple_critic(nn.Module):
9 | def __init__(self, obs_dim, action_dim,deterministic=False,hidden_sizes=(16,),value_hidden_sizes=(256,256),lstm_hidden_unit=128):
10 | super(Maple_critic,self).__init__()
11 | self.obs_dim = obs_dim
12 | self.action_dim = action_dim
13 | self.deterministic = deterministic
14 | self.hidden_sizes = list(hidden_sizes).copy()
15 | self.value_hidden_sizes = list(value_hidden_sizes).copy()
16 | self.lstm_hidden_unit = lstm_hidden_unit
17 | self.mlp = miniblock(self.lstm_hidden_unit, self.hidden_sizes[0], None, relu=False)
18 | if len(self.hidden_sizes) >= 2:
19 | for i in range(1,len(self.hidden_sizes)):
20 | self.mlp += miniblock(self.hidden_sizes[i-1], self.hidden_sizes[i], None)
21 | self.mlp = nn.Sequential(*self.mlp)
22 | self.vfs = miniblock(self.hidden_sizes[-1]+self.obs_dim+self.action_dim, self.value_hidden_sizes[0],None)
23 | if len(self.value_hidden_sizes)>=2:
24 | for i in range(1, len(self.value_hidden_sizes)):
25 | self.vfs += miniblock(self.value_hidden_sizes[i-1], self.value_hidden_sizes[i], None)
26 | self.vfs += [nn.Linear(self.value_hidden_sizes[-1], 1)]
27 | self.vfs = nn.Sequential(*self.vfs)
28 |
29 | def forward(self, value_hidden, actions, obs):
30 | out = self.mlp(value_hidden)
31 | out = torch.cat([out, obs, actions], dim=-1)
32 | out = self.vfs(out)
33 | return out
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/offlinerl/utils/net/model/new_ensemble.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 | import os.path as path
5 | from torch.nn import functional as F
6 | from typing import Dict, List, Union, Tuple, Optional
7 |
8 |
9 | class EnsembleLinear(nn.Module):
10 | def __init__(
11 | self,
12 | input_dim: int,
13 | output_dim: int,
14 | num_ensemble: int,
15 | weight_decay: float = 0.0
16 | ) -> None:
17 | super().__init__()
18 |
19 | self.num_ensemble = num_ensemble
20 |
21 | self.register_parameter("weight", nn.Parameter(torch.zeros(num_ensemble, input_dim, output_dim)))
22 | self.register_parameter("bias", nn.Parameter(torch.zeros(num_ensemble, 1, output_dim)))
23 |
24 | nn.init.trunc_normal_(self.weight, std=1/(2*input_dim**0.5))
25 |
26 | self.register_parameter("saved_weight", nn.Parameter(self.weight.detach().clone()))
27 | self.register_parameter("saved_bias", nn.Parameter(self.bias.detach().clone()))
28 |
29 | self.weight_decay = weight_decay
30 |
31 | def forward(self, x: torch.Tensor) -> torch.Tensor:
32 | weight = self.weight
33 | bias = self.bias
34 |
35 | if len(x.shape) == 2:
36 | x = torch.einsum('ij,bjk->bik', x, weight)
37 | else:
38 | x = torch.einsum('bij,bjk->bik', x, weight)
39 |
40 | x = x + bias
41 |
42 | return x
43 |
44 | def load_save(self) -> None:
45 | self.weight.data.copy_(self.saved_weight.data)
46 | self.bias.data.copy_(self.saved_bias.data)
47 |
48 | def update_save(self, indexes: List[int]) -> None:
49 | self.saved_weight.data[indexes] = self.weight.data[indexes]
50 | self.saved_bias.data[indexes] = self.bias.data[indexes]
51 |
52 | def get_decay_loss(self) -> torch.Tensor:
53 | decay_loss = self.weight_decay * (0.5*((self.weight**2).sum()))
54 | return decay_loss
55 |
56 |
57 | class Swish(nn.Module):
58 | def __init__(self) -> None:
59 | super(Swish, self).__init__()
60 |
61 | def forward(self, x: torch.Tensor) -> torch.Tensor:
62 | x = x * torch.sigmoid(x)
63 | return x
64 |
65 |
66 | def soft_clamp(
67 | x : torch.Tensor,
68 | _min: Optional[torch.Tensor] = None,
69 | _max: Optional[torch.Tensor] = None
70 | ) -> torch.Tensor:
71 |     # clamp tensor values while maintaining the gradient
72 | if _max is not None:
73 | x = _max - F.softplus(_max - x)
74 | if _min is not None:
75 | x = _min + F.softplus(x - _min)
76 | return x
77 |
78 |
79 | class EnsembleTransition(nn.Module):
80 | def __init__(
81 | self,
82 | obs_dim: int,
83 | action_dim: int,
84 | hidden_dims: Union[List[int], Tuple[int]],
85 | num_ensemble: int = 7,
86 | num_elites: int = 5,
87 | activation: nn.Module = Swish,
88 | weight_decays: Optional[Union[List[float], Tuple[float]]] = None,
89 | with_reward: bool = True,
90 | device: str = "cpu"
91 | ) -> None:
92 | super().__init__()
93 |
94 | self.num_ensemble = num_ensemble
95 | self.num_elites = num_elites
96 | self._with_reward = with_reward
97 | self.device = torch.device(device)
98 |
99 | self.activation = activation()
100 |
101 |         if weight_decays is None:  # set the default before the length check below
102 |             weight_decays = [0.0] * (len(hidden_dims) + 1)
103 |         assert len(weight_decays) == (len(hidden_dims) + 1)
104 |
105 |         module_list = []
106 |         hidden_dims = [obs_dim+action_dim] + list(hidden_dims)
107 |         for in_dim, out_dim, weight_decay in zip(hidden_dims[:-1], hidden_dims[1:], weight_decays[:-1]):
108 | module_list.append(EnsembleLinear(in_dim, out_dim, num_ensemble, weight_decay))
109 | self.backbones = nn.ModuleList(module_list)
110 |
111 | self.output_layer = EnsembleLinear(
112 | hidden_dims[-1],
113 | 2 * (obs_dim + self._with_reward),
114 | num_ensemble,
115 | weight_decays[-1]
116 | )
117 |
118 | self.register_parameter(
119 | "max_logvar",
120 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * 0.5, requires_grad=True)
121 | )
122 | self.register_parameter(
123 | "min_logvar",
124 | nn.Parameter(torch.ones(obs_dim + self._with_reward) * -10, requires_grad=True)
125 | )
126 |
127 | self.register_parameter(
128 | "elites",
129 | nn.Parameter(torch.tensor(list(range(0, self.num_elites))), requires_grad=False)
130 | )
131 |
132 | self.to(self.device)
133 |
134 | def forward(self, obs_action: np.ndarray) -> Tuple[torch.Tensor, torch.Tensor]:
135 | obs_action = torch.as_tensor(obs_action, dtype=torch.float32).to(self.device)
136 | output = obs_action
137 | for layer in self.backbones:
138 | output = self.activation(layer(output))
139 | mean, logvar = torch.chunk(self.output_layer(output), 2, dim=-1)
140 | logvar = soft_clamp(logvar, self.min_logvar, self.max_logvar)
141 | return mean, logvar
142 |
143 | def load_save(self) -> None:
144 | for layer in self.backbones:
145 | layer.load_save()
146 | self.output_layer.load_save()
147 |
148 | def update_save(self, indexes: List[int]) -> None:
149 | for layer in self.backbones:
150 | layer.update_save(indexes)
151 | self.output_layer.update_save(indexes)
152 |
153 | def get_decay_loss(self) -> torch.Tensor:
154 | decay_loss = 0
155 | for layer in self.backbones:
156 | decay_loss += layer.get_decay_loss()
157 | decay_loss += self.output_layer.get_decay_loss()
158 | return decay_loss
159 |
160 | def set_elites(self, indexes: List[int]) -> None:
161 | assert len(indexes) <= self.num_ensemble and max(indexes) < self.num_ensemble
162 | self.register_parameter('elites', nn.Parameter(torch.tensor(indexes), requires_grad=False))
163 |
164 | def random_elite_idxs(self, batch_size: int) -> np.ndarray:
165 | idxs = np.random.choice(self.elites.data.cpu().numpy(), size=batch_size)
166 | return idxs
167 |
168 |
169 | class StandardScaler(object):
170 | def __init__(self, mu=None, std=None):
171 | self.mu = mu
172 | self.std = std
173 |
174 | def fit(self, data):
175 |         """Computes the mean and standard deviation of the data and stores them internally,
176 |         so that `transform` and `inverse_transform` can reuse them later.
177 |         Standard deviations smaller than 1e-12 are replaced by 1.0 to avoid division by zero.
178 |
179 | Arguments:
180 | data (np.ndarray): A numpy array containing the input
181 |
182 | Returns: None.
183 | """
184 | self.mu = np.mean(data, axis=0, keepdims=True)
185 | self.std = np.std(data, axis=0, keepdims=True)
186 | self.std[self.std < 1e-12] = 1.0
187 |
188 | def transform(self, data):
189 | """Transforms the input matrix data using the parameters of this scaler.
190 |
191 | Arguments:
192 | data (np.array): A numpy array containing the points to be transformed.
193 |
194 | Returns: (np.array) The transformed dataset.
195 | """
196 | return (data - self.mu) / self.std
197 |
198 | def inverse_transform(self, data):
199 | """Undoes the transformation performed by this scaler.
200 |
201 | Arguments:
202 | data (np.array): A numpy array containing the points to be transformed.
203 |
204 | Returns: (np.array) The transformed dataset.
205 | """
206 | return self.std * data + self.mu
207 |
208 | def save_scaler(self, save_path):
209 | mu_path = path.join(save_path, "mu.npy")
210 | std_path = path.join(save_path, "std.npy")
211 | np.save(mu_path, self.mu)
212 | np.save(std_path, self.std)
213 |
214 | def load_scaler(self, load_path):
215 | mu_path = path.join(load_path, "mu.npy")
216 | std_path = path.join(load_path, "std.npy")
217 | self.mu = np.load(mu_path)
218 | self.std = np.load(std_path)
219 |
220 | def transform_tensor(self, data: torch.Tensor):
221 | device = data.device
222 | data = self.transform(data.cpu().numpy())
223 | data = torch.tensor(data, device=device)
224 | return data
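
A usage sketch for this newer EnsembleTransition together with StandardScaler (illustrative; dimensions and weight decays are made up, and the offlinerl package is assumed importable):

import numpy as np
from offlinerl.utils.net.model.new_ensemble import EnsembleTransition, StandardScaler

obs_dim, action_dim, batch = 11, 3, 256                   # made-up dimensions
model = EnsembleTransition(obs_dim, action_dim, hidden_dims=[200, 200, 200, 200],
                           num_ensemble=7, num_elites=5,
                           weight_decays=[2.5e-5, 5e-5, 7.5e-5, 7.5e-5, 1e-4],
                           device="cpu")

# Fit the scaler on raw (obs, action) inputs and feed normalized inputs to the model
inputs = np.random.randn(batch, obs_dim + action_dim).astype(np.float32)
scaler = StandardScaler()
scaler.fit(inputs)

mean, logvar = model(scaler.transform(inputs))            # each: [num_ensemble, batch, obs_dim + 1]
elite_rows = model.random_elite_idxs(batch)               # one elite model index per sample
decay = model.get_decay_loss()                            # per-layer weighted L2 penalty
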
--------------------------------------------------------------------------------
/offlinerl/utils/net/model_GRU.py:
--------------------------------------------------------------------------------
1 | import torch.nn
2 | import torch.nn as nn
3 | import numpy as np
4 | import torch.nn.functional as F
5 | from offlinerl.utils.net.common import miniblock
6 |
7 | class GRU_Model(nn.Module):
8 | def __init__(self, obs_dim, action_dim,device=None, lstm_hidden_units=128):
9 | super(GRU_Model, self).__init__()
10 | self.obs_dim = obs_dim
11 | self.action_dim = action_dim
12 | self.device = device
13 | self.lstm_hidden_units = lstm_hidden_units
14 | self.GRU = nn.GRU(self.obs_dim + self.action_dim, lstm_hidden_units, batch_first=True)
15 | def forward(self, obs, last_acts, pre_hidden, lens):
16 | sta_acs = torch.cat([obs, last_acts], dim=-1)
17 | packed = torch.nn.utils.rnn.pack_padded_sequence(sta_acs,lens,batch_first=True, enforce_sorted=False)
18 | if len(pre_hidden.shape) == 2:
19 | pre_hidden = torch.unsqueeze(pre_hidden, dim=0)
20 | output,_ = self.GRU(packed, pre_hidden)
21 | output,_ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
22 | return output
23 | def get_hidden(self, obs, last_actions, lens):
24 | pre_hidden = torch.zeros((1,len(lens),self.lstm_hidden_units)).to(self.device)
25 | return self(obs, last_actions, pre_hidden,lens)
26 |
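
A shape sketch that wires GRU_Model into the Maple_critic defined earlier (illustrative only; sequence lengths and dimensions are made up, and the offlinerl package is assumed importable):

import torch
from offlinerl.utils.net.model_GRU import GRU_Model
from offlinerl.utils.net.model.maple_critic import Maple_critic

obs_dim, act_dim, hidden_units = 11, 3, 128               # made-up dimensions
batch, max_len = 8, 20

encoder = GRU_Model(obs_dim, act_dim, device="cpu", lstm_hidden_units=hidden_units)
critic = Maple_critic(obs_dim, act_dim, lstm_hidden_unit=hidden_units)

obs = torch.randn(batch, max_len, obs_dim)
last_acts = torch.randn(batch, max_len, act_dim)
acts = torch.randn(batch, max_len, act_dim)
lens = [max_len] * batch                                  # per-sequence lengths for packing

hidden = encoder.get_hidden(obs, last_acts, lens)         # [batch, max_len, hidden_units]
q_value = critic(hidden, acts, obs)                       # [batch, max_len, 1]
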
--------------------------------------------------------------------------------
/offlinerl/utils/net/moose.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from offlinerl.utils.net.common import BasePolicy
6 |
7 | class VAE(nn.Module, BasePolicy):
8 | def __init__(self,
9 | state_dim,
10 | action_dim,
11 | latent_dim,
12 | max_action,
13 | hidden_size=750):
14 | super(VAE, self).__init__()
15 |
16 | self.e1 = nn.Linear(state_dim + action_dim, hidden_size)
17 | self.e2 = nn.Linear(hidden_size, hidden_size)
18 |
19 | self.mean = nn.Linear(hidden_size, latent_dim)
20 | self.log_std = nn.Linear(hidden_size, latent_dim)
21 |
22 | self.d1 = nn.Linear(latent_dim, hidden_size)
23 | self.d2 = nn.Linear(hidden_size, hidden_size)
24 | self.d3 = nn.Linear(hidden_size, state_dim + action_dim)
25 |
26 | self.max_action = max_action
27 | self.latent_dim = latent_dim
28 |
29 | self._actor = None
30 |
31 | def forward(self, state, action):
32 | z = F.relu(self.e1(torch.cat([state, action], 1)))
33 | z = F.relu(self.e2(z))
34 |
35 | mean = self.mean(z)
36 | # Clamped for numerical stability
37 | log_std = self.log_std(z).clamp(-4, 15)
38 | std = torch.exp(log_std)
39 | z = mean + std * torch.randn_like(std)
40 |
41 | u = self.decode(z)
42 |
43 | return u, mean, std
44 |
45 | def decode(self, z):
46 | a = F.relu(self.d1(z))
47 | a = F.relu(self.d2(a))
48 | a = self.d3(a)
49 | return a
50 |
51 |
52 | def policy_infer(self, obs):
53 | return self.decode(obs)
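
This VAE reconstructs the whole (state, action) pair from the latent alone. A sketch of a standard training objective for it (illustrative; the KL weight of 0.5 and the dimensions are made up):

import torch
import torch.nn.functional as F
from offlinerl.utils.net.moose import VAE

state_dim, action_dim, latent_dim = 11, 3, 6              # made-up dimensions
vae = VAE(state_dim, action_dim, latent_dim, max_action=1.0)

state = torch.randn(64, state_dim)
action = torch.randn(64, action_dim).clamp(-1, 1)

recon, mean, std = vae(state, action)                     # recon: [64, state_dim + action_dim]
target = torch.cat([state, action], dim=1)

recon_loss = F.mse_loss(recon, target)
kl_loss = -0.5 * (1 + torch.log(std.pow(2)) - mean.pow(2) - std.pow(2)).mean()
loss = recon_loss + 0.5 * kl_loss                         # illustrative KL weight
loss.backward()
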
--------------------------------------------------------------------------------
/offlinerl/utils/net/tanhpolicy.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from torch import nn as nn
4 | from torch.nn import functional as F
5 | from torch.distributions import Distribution, Normal
6 |
7 | from offlinerl.utils.net.common import BasePolicy
8 | from offlinerl.utils.net.continuous import ActorProb
9 |
10 |
11 | class TanhNormal(Distribution):
12 | """
13 | Represent distribution of X where
14 | X = tanh(Z)
15 | Z ~ N(mean, std)
16 |
17 | Note: this is not very numerically stable.
18 | """
19 | def __init__(self, normal_mean, normal_std, max_action=1, min_action=-1, epsilon=1e-6):
20 | """
21 | :param normal_mean: Mean of the normal distribution
22 | :param normal_std: Std of the normal distribution
23 | :param epsilon: Numerical stability epsilon when computing log-prob.
24 | """
25 | self.normal_mean = normal_mean
26 | self.normal_std = normal_std
27 | self.normal = Normal(normal_mean, normal_std)
28 | self.epsilon = epsilon
29 | self.max_action = max_action
30 | self.min_action = min_action
31 |
32 | def sample_n(self, n, return_pre_tanh_value=False):
33 | z = self.normal.sample_n(n)
34 | if return_pre_tanh_value:
35 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2, z
36 | else:
37 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2
38 |
39 | def atanh(self,x):
40 | one_plus_x = (1 + x).clamp(min=1e-6)
41 | one_minus_x = (1 - x).clamp(min=1e-6)
42 | return 0.5 * torch.log(one_plus_x / one_minus_x)
43 |
44 | @property
45 | def mode(self):
46 | return ((self.max_action-self.min_action)/2) * torch.tanh(self.normal_mean) + (self.max_action+self.min_action)/2
47 |
48 | def log_prob(self, value, pre_tanh_value=None):
49 | """
50 |
51 | :param value: some value, x
52 | :param pre_tanh_value: arctanh(x)
53 | :return:
54 | """
55 | unscaled_value = (2*value - (self.max_action+self.min_action))/(self.max_action - self.min_action) # assume the actual actions have been transformed
56 | if pre_tanh_value is None:
57 | pre_tanh_value = self.atanh(unscaled_value) # get the raw Gaussian distribution output
58 |
59 | # ==== previous calculation of tanh log_prob =====
60 | # self.normal.log_prob(pre_tanh_value) - torch.log(
61 | # 1 - value * value + self.epsilon
62 | # )
63 | # previous calculation of tanhGaussian log_prob is OK when the action is in (-1,1). To be more general, we need the following revision
64 |
65 | action_scale = (self.max_action-self.min_action)/2.0
66 | squashed_action = unscaled_value
67 | log_prob = self.normal.log_prob(pre_tanh_value) - torch.log(action_scale * (1 - squashed_action.pow(2)) + self.epsilon)
68 | return log_prob
69 |
70 | def sample(self, return_pretanh_value=False):
71 | """
72 | Gradients will and should *not* pass through this operation.
73 |
74 | See https://github.com/pytorch/pytorch/issues/4620 for discussion.
75 | """
76 | z = self.normal.sample().detach()
77 |
78 | if return_pretanh_value:
79 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2, z
80 | else:
81 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2
82 |
83 | def rsample(self, return_pretanh_value=False):
84 | """
85 | Sampling in the reparameterization case.
86 | """
87 | z = (
88 | self.normal_mean +
89 | self.normal_std *
90 | Normal(
91 | torch.zeros(self.normal_mean.size(), device=self.normal_mean.device),
92 | torch.ones(self.normal_std.size(), device=self.normal_mean.device)
93 | ).sample()
94 | )
95 | z.requires_grad_()
96 |
97 | if return_pretanh_value:
98 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2, z
99 | else:
100 | return (self.max_action-self.min_action)/2*torch.tanh(z)+(self.max_action+self.min_action)/2
101 |
102 |
103 | class TanhGaussianPolicy(ActorProb, BasePolicy):
104 | LOG_SIG_MAX = 2
105 | LOG_SIG_MIN = -5
106 | MEAN_MIN = -9.0
107 | MEAN_MAX = 9.0
108 |
109 | def atanh(self,x):
110 | one_plus_x = (1 + x).clamp(min=1e-6)
111 | one_minus_x = (1 - x).clamp(min=1e-6)
112 | return 0.5*torch.log(one_plus_x/ one_minus_x)
113 |
114 | def log_prob(self, obs, actions):
115 | raw_actions = self.atanh(actions)
116 | logits, h = self.preprocess(obs)
117 |
118 | mean = self.mu(logits)
119 | mean = torch.clamp(mean, self.MEAN_MIN, self.MEAN_MAX)
120 | if self._c_sigma:
121 | log_std = torch.clamp(
122 | self.sigma(logits), min=self.LOG_SIG_MIN, max=self.LOG_SIG_MAX
123 | )
124 | std = log_std.exp()
125 | else:
126 | shape = [1] * len(mean.shape)
127 | shape[1] = -1
128 | log_std = (self.sigma.view(shape) + torch.zeros_like(mean))
129 | std = log_std.exp()
130 |
131 | tanh_normal = TanhNormal(mean, std)
132 | log_prob = tanh_normal.log_prob(value=actions, pre_tanh_value=raw_actions)
133 | return log_prob.sum(-1)
134 |
135 | def forward(
136 | self,
137 | obs,
138 | state=None,
139 | infor={},
140 | reparameterize=True,
141 | ):
142 |         """
143 |         :param obs: Observation
144 |         :param state: Optional state passed through to `preprocess`
145 |         :return: A TanhNormal action distribution bounded by the policy's max action
146 |         """
147 | logits, h = self.preprocess(obs, state)
148 | mean = self.mu(logits)
149 |
150 | if self._c_sigma:
151 | log_std = torch.clamp(
152 | self.sigma(logits), min=self.LOG_SIG_MIN, max=self.LOG_SIG_MAX
153 | )
154 | std = log_std.exp()
155 | else:
156 | shape = [1] * len(mean.shape)
157 | shape[1] = -1
158 | log_std = (self.sigma.view(shape) + torch.zeros_like(mean))
159 | std = log_std.exp()
160 |
161 | return TanhNormal(mean, std, max_action=self._max, min_action=-self._max)
162 |
163 | def policy_infer(self, obs):
164 | return self(obs).mode
165 |
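
A minimal sketch of the TanhNormal distribution above (illustrative; shapes and std are made up):

import torch
from offlinerl.utils.net.tanhpolicy import TanhNormal

mean = torch.zeros(5, 3)
std = 0.5 * torch.ones(5, 3)
dist = TanhNormal(mean, std, max_action=1.0, min_action=-1.0)

action, pre_tanh = dist.rsample(return_pretanh_value=True)   # reparameterized sample, stays inside (-1, 1)
log_prob = dist.log_prob(action, pre_tanh_value=pre_tanh)    # per-dimension log-density, shape [5, 3]
greedy = dist.mode                                           # tanh of the underlying Gaussian mean
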
--------------------------------------------------------------------------------
/offlinerl/utils/net/terminal_check.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def termination_fn_halfcheetah(obs, act, next_obs):
5 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
6 |
7 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1))
8 | done = ~not_done
9 | done = done[:, None]
10 | return done
11 |
12 | def termination_fn_hopper(obs, act, next_obs):
13 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
14 |
15 | height = next_obs[:, 0]
16 | angle = next_obs[:, 1]
17 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) * \
18 | np.isfinite(next_obs).all(axis=-1) \
19 | * np.abs(next_obs[:,1:] < 100).all(axis=-1) \
20 | * (height > .7) \
21 | * (np.abs(angle) < .2)
22 |
23 | done = ~not_done
24 | done = done[:,None]
25 | return done
26 |
27 | def termination_fn_halfcheetahveljump(obs, act, next_obs):
28 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
29 |
30 | done = np.array([False]).repeat(len(obs))
31 | done = done[:,None]
32 | return done
33 |
34 | def termination_fn_antangle(obs, act, next_obs):
35 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
36 |
37 | x = next_obs[:, 0]
38 | not_done = np.isfinite(next_obs).all(axis=-1) \
39 | * (x >= 0.2) \
40 | * (x <= 1.0)
41 |
42 | done = ~not_done
43 | done = done[:,None]
44 | return done
45 |
46 | def termination_fn_ant(obs, act, next_obs):
47 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
48 |
49 | x = next_obs[:, 0]
50 | not_done = np.isfinite(next_obs).all(axis=-1) \
51 | * (x >= 0.2) \
52 | * (x <= 1.0)
53 |
54 | done = ~not_done
55 | done = done[:,None]
56 | return done
57 |
58 | def termination_fn_walker2d(obs, act, next_obs):
59 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
60 |
61 | height = next_obs[:, 0]
62 | angle = next_obs[:, 1]
63 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) \
64 | * (height > 0.8) \
65 | * (height < 2.0) \
66 | * (angle > -1.0) \
67 | * (angle < 1.0)
68 | done = ~not_done
69 | done = done[:,None]
70 | return done
71 |
72 | def termination_fn_point2denv(obs, act, next_obs):
73 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
74 |
75 | done = np.array([False]).repeat(len(obs))
76 | done = done[:,None]
77 | return done
78 |
79 | def termination_fn_point2dwallenv(obs, act, next_obs):
80 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
81 |
82 | done = np.array([False]).repeat(len(obs))
83 | done = done[:,None]
84 | return done
85 |
86 | def termination_fn_pendulum(obs, act, next_obs):
87 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
88 |
89 | done = np.zeros((len(obs), 1))
90 | return done
91 |
92 | def termination_fn_humanoid(obs, act, next_obs):
93 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
94 |
95 | z = next_obs[:,0]
96 | done = (z < 1.0) + (z > 2.0)
97 |
98 | done = done[:,None]
99 | return done
100 |
101 | def termination_fn_pen(obs, act, next_obs):
102 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
103 |
104 | obj_pos = next_obs[:, 24:27]
105 | done = obj_pos[:, 2] < 0.075
106 |
107 | done = done[:,None]
108 | return done
109 |
110 | def termination_fn_door(obs, act, next_obs):
111 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
112 |
113 | done = np.array([False] * obs.shape[0])
114 |
115 | done = done[:, None]
116 | return done
117 |
118 | def is_terminal(obs,act, next_obs,task):
119 | if 'halfcheetahvel' in task:
120 | return termination_fn_halfcheetahveljump(obs, act, next_obs)
121 | elif 'halfcheetah' in task:
122 | return termination_fn_halfcheetah(obs, act, next_obs)
123 | elif 'hopper' in task:
124 | return termination_fn_hopper(obs,act,next_obs)
125 | elif 'antangle' in task:
126 | return termination_fn_antangle(obs,act,next_obs)
127 | elif 'ant' in task:
128 | return termination_fn_ant(obs, act, next_obs)
129 | elif 'walker2d' in task:
130 | return termination_fn_walker2d(obs, act, next_obs)
131 | elif 'point2denv' in task:
132 | return termination_fn_point2denv(obs, act, next_obs)
133 | elif 'point2dwallenv' in task:
134 | return termination_fn_point2dwallenv(obs,act, next_obs)
135 | elif 'pendulum' in task:
136 | return termination_fn_pendulum(obs,act,next_obs)
137 | elif 'humanoid' in task:
138 | return termination_fn_humanoid(obs, act, next_obs)
139 |
140 | def get_termination_fn(task):
141 | if 'halfcheetahvel' in task:
142 | return termination_fn_halfcheetahveljump
143 | elif 'halfcheetah' in task:
144 | return termination_fn_halfcheetah
145 | elif 'hopper' in task:
146 | return termination_fn_hopper
147 | elif 'antangle' in task:
148 | return termination_fn_antangle
149 | elif 'ant' in task:
150 | return termination_fn_ant
151 | elif 'walker2d' in task:
152 | return termination_fn_walker2d
153 | elif 'point2denv' in task:
154 | return termination_fn_point2denv
155 | elif 'point2dwallenv' in task:
156 | return termination_fn_point2dwallenv
157 | elif 'pendulum' in task:
158 | return termination_fn_pendulum
159 | elif 'humanoid' in task:
160 | return termination_fn_humanoid
161 | elif 'pen' in task:
162 | return termination_fn_pen
163 | elif 'door' in task:
164 |         return termination_fn_door
165 | elif task in ['Pipeline', 'DMSD', 'Fusion', 'Salespromotion', 'SafetyHalfCheetah']:
166 |         def termination_fn(obs, act, next_obs):
167 | data = {
168 | "obs" : obs,
169 | "action" : act,
170 | "next_obs" : next_obs,
171 | }
172 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2
173 |
174 | done = np.zeros((len(obs), 1))
175 | return done
176 |         return termination_fn
177 | elif 'RandomFrictionHopper' in task:
178 | from neorl2.envs.terminated.randomfrictionhopper_terminated import get_terminated
179 |
180 |         def termination_fn(obs, act, next_obs):
181 | data = {
182 | "obs" : obs,
183 | "action" : act,
184 | "next_obs" : next_obs,
185 | }
186 | return np.bool_(get_terminated(data))
187 |
188 |         return termination_fn
189 | elif 'Simglucose' in task:
190 | from neorl2.envs.terminated.simglucose_terminated import get_terminated
191 |
192 |         def termination_fn(obs, act, next_obs):
193 | data = {
194 | "obs" : obs,
195 | "action" : act,
196 | "next_obs" : next_obs,
197 | }
198 | return np.bool_(get_terminated(data))
199 |
200 |         return termination_fn
201 | elif 'RocketRecovery' in task:
202 | from neorl2.envs.terminated.rocketrecovery_terminated import get_terminated
203 |
204 |         def termination_fn(obs, act, next_obs):
205 | data = {
206 | "obs" : obs,
207 | "action" : act,
208 | "next_obs" : next_obs,
209 | }
210 | return np.bool_(get_terminated(data))
211 |
212 |         return termination_fn
213 |
214 | else:
215 | raise NotImplementedError(f"Task {task} not implemented")
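
A usage sketch for get_termination_fn (illustrative; the task name and array shapes are made up, and the asserts above require 2-D [batch, dim] arrays):

import numpy as np
from offlinerl.utils.net.terminal_check import get_termination_fn

obs = np.random.randn(4, 11)
act = np.random.randn(4, 3)
next_obs = np.random.randn(4, 11)

term_fn = get_termination_fn("hopper-medium-v2")          # matched by the 'hopper' substring
done = term_fn(obs, act, next_obs)                        # boolean array of shape [4, 1]
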
--------------------------------------------------------------------------------
/offlinerl/utils/net/vae.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from offlinerl.utils.net.common import BasePolicy
6 |
7 | class VAE(nn.Module, BasePolicy):
8 | def __init__(self,
9 | state_dim,
10 | action_dim,
11 | latent_dim,
12 | max_action,
13 | hidden_size=750):
14 | super(VAE, self).__init__()
15 |
16 | self.e1 = nn.Linear(state_dim + action_dim, hidden_size)
17 | self.e2 = nn.Linear(hidden_size, hidden_size)
18 |
19 | self.mean = nn.Linear(hidden_size, latent_dim)
20 | self.log_std = nn.Linear(hidden_size, latent_dim)
21 |
22 | self.d1 = nn.Linear(state_dim + latent_dim, hidden_size)
23 | self.d2 = nn.Linear(hidden_size, hidden_size)
24 | self.d3 = nn.Linear(hidden_size, action_dim)
25 |
26 | self.max_action = max_action
27 | self.latent_dim = latent_dim
28 |
29 | self._actor = None
30 |
31 | def forward(self, state, action):
32 | z = F.relu(self.e1(torch.cat([state, action], 1)))
33 | z = F.relu(self.e2(z))
34 |
35 | mean = self.mean(z)
36 | # Clamped for numerical stability
37 | log_std = self.log_std(z).clamp(-4, 15)
38 | std = torch.exp(log_std)
39 | z = mean + std * torch.randn_like(std)
40 |
41 | u = self.decode(state, z)
42 |
43 | return u, mean, std
44 |
45 | def decode(self, state, z=None, clip=None, raw=False):
46 |         # If z is not given, sample it from N(0, 1); the optional clip argument bounds it (e.g. to [-0.5, 0.5] as in BCQ)
47 | if z is None:
48 | z = torch.randn((state.shape[0], self.latent_dim)).to(state.device)
49 | if clip is not None:
50 | z = z.clamp(-clip, clip)
51 |
52 | a = F.relu(self.d1(torch.cat([state, z], 1)))
53 | a = F.relu(self.d2(a))
54 | a = self.d3(a)
55 | if raw:
56 | return a
57 | return self.max_action * torch.tanh(a)
58 |
59 | def policy_infer(self, obs):
60 | return self.decode(obs, z=self._actor(obs)[0])
61 |
62 | class ActorPerturbation(nn.Module, BasePolicy):
63 | def __init__(self, state_dim, action_dim, latent_action_dim, max_action, max_latent_action=2, phi=0.05):
64 | super(ActorPerturbation, self).__init__()
65 |
66 | self.hidden_size = (400, 300, 400, 300)
67 |
68 | self.l1 = nn.Linear(state_dim, self.hidden_size[0])
69 | self.l2 = nn.Linear(self.hidden_size[0], self.hidden_size[1])
70 | self.l3 = nn.Linear(self.hidden_size[1], latent_action_dim)
71 |
72 | self.l4 = nn.Linear(state_dim + action_dim, self.hidden_size[2])
73 | self.l5 = nn.Linear(self.hidden_size[2], self.hidden_size[3])
74 | self.l6 = nn.Linear(self.hidden_size[3], action_dim)
75 |
76 | self.max_latent_action = max_latent_action
77 | self.max_action = max_action
78 | self.phi = phi
79 |
80 | self.vae = None
81 |
82 | def forward(self, state, decoder):
83 | a = F.relu(self.l1(state))
84 | a = F.relu(self.l2(a))
85 | latent_action = self.max_latent_action * torch.tanh(self.l3(a))
86 |
87 | mid_action = decoder(state, z=latent_action)
88 |
89 | a = F.relu(self.l4(torch.cat([state, mid_action], 1)))
90 | a = F.relu(self.l5(a))
91 | a = self.phi * torch.tanh(self.l6(a))
92 | final_action = (a + mid_action).clamp(-self.max_action, self.max_action)
93 | return latent_action, mid_action, final_action
94 |
95 | def policy_infer(self, obs):
96 |
97 | return self(obs, self.vae.decode)[-1]
--------------------------------------------------------------------------------
/offlinerl/utils/replay_pool.py:
--------------------------------------------------------------------------------
1 | import abc
2 |
3 |
4 | class ReplayPool(object):
5 | """A class used to save and replay data."""
6 |
7 | @abc.abstractmethod
8 | def add_sample(self, sample):
9 | """Add a transition tuple."""
10 | pass
11 |
12 | @abc.abstractmethod
13 | def terminate_episode(self):
14 | """Clean up pool after episode termination."""
15 | pass
16 |
17 | @property
18 | @abc.abstractmethod
19 | def size(self, **kwargs):
20 | pass
21 |
22 | def add_path(self, path):
23 | """Add a rollout to the replay pool.
24 |
25 | This default implementation naively goes through every step, but you
26 | may want to optimize this.
27 |
28 | NOTE: You should NOT call "terminate_episode" after calling add_path.
29 | It's assumed that this function handles the episode termination.
30 |
31 | :param path: Dict like one outputted by railrl.samplers.util.rollout
32 | """
33 | self.add_samples(path)
34 | self.terminate_episode()
35 |
36 | @abc.abstractmethod
37 | def random_batch(self, batch_size):
38 | """Return a random batch of size `batch_size`."""
39 | pass
40 |
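
ReplayPool only declares the interface; a minimal list-backed subclass (purely illustrative, not part of the repository) could look like:

import random
from offlinerl.utils.replay_pool import ReplayPool

class ListReplayPool(ReplayPool):
    """A tiny in-memory pool, just to illustrate the abstract interface."""

    def __init__(self):
        self._samples = []

    def add_sample(self, sample):
        self._samples.append(sample)

    def add_samples(self, samples):
        self._samples.extend(samples)

    def terminate_episode(self):
        pass  # nothing to clean up for a flat list

    @property
    def size(self):
        return len(self._samples)

    def random_batch(self, batch_size):
        return random.sample(self._samples, min(batch_size, len(self._samples)))

pool = ListReplayPool()
pool.add_path([{"obs": i, "rew": 0.0} for i in range(10)])   # add_path() calls add_samples + terminate_episode
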
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | import os
4 | from setuptools import setup
5 | from setuptools import find_packages
6 |
7 | def get_version() -> str:
8 | # https://packaging.python.org/guides/single-sourcing-package-version/
9 | init = open(os.path.join("offlinerl", "__init__.py"), "r").read().split()
10 | return init[init.index("__version__") + 2][1:-1]
11 |
12 | setup(
13 | name='offlinerl',
14 |     description="A Library for Offline RL (Batch RL)",
15 | url="https://agit.ai/Polixir/OfflineRL",
16 | version=get_version(),
17 | packages=find_packages(),
18 | author="SongyiGao",
19 | author_email="songyigao@gmail.com",
20 | python_requires=">=3.7",
21 | install_requires=[
22 | "aim",
23 | "fire",
24 | "loguru",
25 | "gym",
26 | "scikit-learn",
27 | "gtimer",
28 | "numpy",
29 | "ray==2.9",
30 | "aioredis==1.3.1",
31 | "aiohttp==3.7.4",
32 | ],
33 |
34 | )
35 |
--------------------------------------------------------------------------------