├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── algorithms │ ├── a3c.rst │ ├── ac.rst │ ├── ddpg.rst │ ├── dppo.rst │ ├── dqn.rst │ ├── pg.rst │ ├── ppo.rst │ ├── sac.rst │ ├── td3.rst │ └── trpo.rst ├── common │ ├── basicnets.rst │ ├── buffer.rst │ ├── distributions.rst │ ├── envlist.rst │ ├── envwrappers.rst │ ├── mathutils.rst │ ├── policynets.rst │ ├── utils.rst │ └── valuenets.rst ├── conf.py ├── guide │ ├── api.rst │ ├── configuration.rst │ ├── installation.rst │ └── quickstart.rst ├── img │ ├── logo.png │ └── rlzoo-logo.png ├── index.rst ├── make.bat ├── mkdocs.yml └── other │ ├── drl_book.rst │ └── drl_tutorial.rst ├── examples.md ├── gif ├── ACM_MM2021_Presentation_Slide.pdf ├── atari.gif ├── box2d.gif ├── classic.gif ├── dmcontrol.gif ├── interactive.gif ├── mujoco.gif ├── rlbench.gif └── robotics.gif ├── requirements.txt ├── rlzoo ├── .gitignore ├── __init__.py ├── algorithms │ ├── __init__.py │ ├── a3c │ │ ├── __init__.py │ │ ├── a3c.py │ │ ├── default.py │ │ └── run_a3c.py │ ├── ac │ │ ├── __init__.py │ │ ├── ac.py │ │ ├── default.py │ │ └── run_ac.py │ ├── ddpg │ │ ├── __init__.py │ │ ├── ddpg.py │ │ ├── default.py │ │ └── run_ddpg.py │ ├── dppo │ │ ├── __init__.py │ │ ├── default.py │ │ └── dppo.py │ ├── dppo_clip │ │ ├── __init__.py │ │ ├── dppo_clip.py │ │ └── run_dppo_clip.py │ ├── dppo_clip_distributed │ │ ├── __init__.py │ │ └── dppo_clip.py │ ├── dppo_penalty │ │ ├── __init__.py │ │ ├── dppo_penalty.py │ │ └── run_dppo_penalty.py │ ├── dqn │ │ ├── __init__.py │ │ ├── default.py │ │ ├── dqn.py │ │ └── run_dqn.py │ ├── pg │ │ ├── __init__.py │ │ ├── default.py │ │ ├── pg.py │ │ └── run_pg.py │ ├── ppo │ │ ├── __init__.py │ │ ├── default.py │ │ └── ppo.py │ ├── ppo_clip │ │ ├── __init__.py │ │ ├── ppo_clip.py │ │ └── run_ppo_clip.py │ ├── ppo_penalty │ │ ├── __init__.py │ │ ├── ppo_penalty.py │ │ └── run_ppo_penalty.py │ ├── sac │ │ ├── __init__.py │ │ ├── default.py │ │ ├── run_sac.py │ │ └── sac.py │ ├── td3 │ │ ├── __init__.py │ │ ├── default.py │ │ ├── run_td3.py │ │ └── td3.py │ └── trpo │ │ ├── __init__.py │ │ ├── default.py │ │ ├── run_trpo.py │ │ └── trpo.py ├── common │ ├── __init__.py │ ├── basic_nets.py │ ├── buffer.py │ ├── build_rlbench_env.py │ ├── distributions.py │ ├── env_list.py │ ├── env_wrappers.py │ ├── math_utils.py │ ├── policy_networks.py │ ├── utils.py │ └── value_networks.py ├── distributed │ ├── __init__.py │ ├── dis_components.py │ ├── run_dis_train.sh │ ├── start_dis_role.py │ └── training_components.py ├── interactive │ ├── .gitignore │ ├── common.py │ ├── components.py │ └── main.ipynb └── run_rlzoo.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | *~ 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | docs/test_build/ 70 | docs/build_test/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | venv_/ 96 | venv2/ 97 | venv3/ 98 | venv_doc/ 99 | venv_py2/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | 114 | 115 | # IDE Specific directories 116 | .DS_Store 117 | .idea 118 | .vscode/ 119 | 120 | # TensorLayer Directories 121 | checkpoints 122 | data/ 123 | lib_win/ 124 | 125 | # Custom Scripts 126 | update_tl.bat 127 | update_tl.py 128 | 129 | # Data Files and ByteCode files 130 | *.gz 131 | *.npz 132 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include examples.md LICENSE.txt requirements.txt README.md 2 | recursive-include rlzoo *.py 3 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/algorithms/a3c.rst: -------------------------------------------------------------------------------- 1 | A3C 2 | ================================= 3 | 4 | Example 5 | ----------- 6 | 7 | .. 
code-block:: python 8 | :linenos: 9 | 10 | from rlzoo.common.env_wrappers import build_env 11 | from rlzoo.common.utils import call_default_params 12 | from rlzoo.algorithms import A3C 13 | 14 | AlgName = 'A3C' 15 | EnvName = 'PongNoFrameskip-v4' 16 | EnvType = 'atari' 17 | 18 | # EnvName = 'Pendulum-v0' # only continuous action 19 | # EnvType = 'classic_control' 20 | 21 | # EnvName = 'BipedalWalker-v2' 22 | # EnvType = 'box2d' 23 | 24 | # EnvName = 'Ant-v2' 25 | # EnvType = 'mujoco' 26 | 27 | # EnvName = 'FetchPush-v1' 28 | # EnvType = 'robotics' 29 | 30 | # EnvName = 'FishSwim-v0' 31 | # EnvType = 'dm_control' 32 | 33 | number_workers = 2 # need to specify number of parallel workers 34 | env = build_env(EnvName, EnvType, nenv=number_workers) 35 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 36 | alg = eval(AlgName+'(**alg_params)') 37 | alg.learn(env=env, mode='train', render=False, **learn_params) 38 | alg.learn(env=env, mode='test', render=True, **learn_params) 39 | 40 | Asynchronous Advantage Actor-Critic 41 | ---------------------------------------- 42 | 43 | .. autoclass:: rlzoo.algorithms.a3c.a3c.A3C 44 | :members: 45 | :undoc-members: 46 | 47 | Default Hyper-parameters 48 | ---------------------------------- 49 | 50 | .. automodule:: rlzoo.algorithms.a3c.default 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /docs/algorithms/ac.rst: -------------------------------------------------------------------------------- 1 | AC 2 | =========================== 3 | 4 | Example 5 | ----------- 6 | 7 | .. code-block:: python 8 | :linenos: 9 | 10 | from rlzoo.common.env_wrappers import build_env 11 | from rlzoo.common.utils import call_default_params 12 | from rlzoo.algorithms import AC 13 | 14 | AlgName = 'AC' 15 | EnvName = 'PongNoFrameskip-v4' 16 | EnvType = 'atari' 17 | 18 | # EnvName = 'Pendulum-v0' 19 | # EnvType = 'classic_control' 20 | 21 | # EnvName = 'BipedalWalker-v2' 22 | # EnvType = 'box2d' 23 | 24 | # EnvName = 'Ant-v2' 25 | # EnvType = 'mujoco' 26 | 27 | # EnvName = 'FetchPush-v1' 28 | # EnvType = 'robotics' 29 | 30 | # EnvName = 'FishSwim-v0' 31 | # EnvType = 'dm_control' 32 | 33 | # EnvName = 'ReachTarget' 34 | # EnvType = 'rlbench' 35 | 36 | env = build_env(EnvName, EnvType) 37 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 38 | alg = eval(AlgName+'(**alg_params)') 39 | alg.learn(env=env, mode='train', render=False, **learn_params) 40 | alg.learn(env=env, mode='test', render=True, **learn_params) 41 | 42 | Actor-Critic 43 | --------------------------------- 44 | 45 | .. autoclass:: rlzoo.algorithms.ac.ac.AC 46 | :members: 47 | :undoc-members: 48 | 49 | Default Hyper-parameters 50 | ---------------------------------- 51 | 52 | .. automodule:: rlzoo.algorithms.ac.default 53 | :members: 54 | :undoc-members: 55 | :show-inheritance: 56 | 57 | -------------------------------------------------------------------------------- /docs/algorithms/ddpg.rst: -------------------------------------------------------------------------------- 1 | DDPG 2 | =========================== 3 | 4 | Example 5 | ----------- 6 | 7 | .. 
code-block:: python 8 | :linenos: 9 | 10 | from rlzoo.common.env_wrappers import build_env 11 | from rlzoo.common.utils import call_default_params 12 | from rlzoo.algorithms import DDPG 13 | 14 | AlgName = 'DDPG' 15 | EnvName = 'Pendulum-v0' # only continuous action 16 | EnvType = 'classic_control' 17 | 18 | # EnvName = 'BipedalWalker-v2' 19 | # EnvType = 'box2d' 20 | 21 | # EnvName = 'Ant-v2' 22 | # EnvType = 'mujoco' 23 | 24 | # EnvName = 'FetchPush-v1' 25 | # EnvType = 'robotics' 26 | 27 | # EnvName = 'FishSwim-v0' 28 | # EnvType = 'dm_control' 29 | 30 | # EnvName = 'ReachTarget' 31 | # EnvType = 'rlbench' 32 | 33 | env = build_env(EnvName, EnvType) 34 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 35 | alg = eval(AlgName+'(**alg_params)') 36 | alg.learn(env=env, mode='train', render=False, **learn_params) 37 | alg.learn(env=env, mode='test', render=True, **learn_params) 38 | 39 | Deep Deterministic Policy Gradient 40 | ----------------------------------- 41 | 42 | .. autoclass:: rlzoo.algorithms.ddpg.ddpg.DDPG 43 | :members: 44 | :undoc-members: 45 | 46 | Default Hyper-parameters 47 | ---------------------------------- 48 | 49 | .. automodule:: rlzoo.algorithms.ddpg.default 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | -------------------------------------------------------------------------------- /docs/algorithms/dppo.rst: -------------------------------------------------------------------------------- 1 | DPPO 2 | =========================== 3 | 4 | Example 5 | ----------- 6 | 7 | .. code-block:: python 8 | :linenos: 9 | 10 | from rlzoo.common.env_wrappers import build_env 11 | from rlzoo.common.utils import call_default_params 12 | from rlzoo.algorithms import DPPO 13 | 14 | EnvName = 'PongNoFrameskip-v4' 15 | EnvType = 'atari' 16 | 17 | # EnvName = 'Pendulum-v0' 18 | # EnvType = 'classic_control' 19 | 20 | # EnvName = 'BipedalWalker-v2' 21 | # EnvType = 'box2d' 22 | 23 | # EnvName = 'Ant-v2' 24 | # EnvType = 'mujoco' 25 | 26 | # EnvName = 'FetchPush-v1' 27 | # EnvType = 'robotics' 28 | 29 | # EnvName = 'FishSwim-v0' 30 | # EnvType = 'dm_control' 31 | 32 | # EnvName = 'ReachTarget' 33 | # EnvType = 'rlbench' 34 | 35 | number_workers = 2 # need to specify number of parallel workers 36 | env = build_env(EnvName, EnvType, nenv=number_workers) 37 | alg_params, learn_params = call_default_params(env, EnvType, 'DPPO') 38 | alg = DPPO(method='penalty', **alg_params) # specify 'clip' or 'penalty' method for PPO 39 | alg.learn(env=env, mode='train', render=False, **learn_params) 40 | alg.learn(env=env, mode='test', render=True, **learn_params) 41 | 42 | Distributed Proximal Policy Optimization (Penalty) 43 | ---------------------------------------------------- 44 | 45 | .. autoclass:: rlzoo.algorithms.dppo_penalty.dppo_penalty.DPPO_PENALTY 46 | :members: 47 | :undoc-members: 48 | 49 | 50 | Distributed Proximal Policy Optimization (Clip) 51 | ------------------------------------------------ 52 | 53 | .. autoclass:: rlzoo.algorithms.dppo_clip.dppo_clip.DPPO_CLIP 54 | :members: 55 | :undoc-members: 56 | 57 | Default Hyper-parameters 58 | ---------------------------------- 59 | 60 | .. 
automodule:: rlzoo.algorithms.dppo.default 61 | :members: 62 | :undoc-members: 63 | :show-inheritance: 64 | 65 | -------------------------------------------------------------------------------- /docs/algorithms/dqn.rst: -------------------------------------------------------------------------------- 1 | DQN and Variants 2 | ================================= 3 | 4 | Example 5 | ------------ 6 | 7 | .. code-block:: python 8 | :linenos: 9 | 10 | from rlzoo.common.env_wrappers import build_env 11 | from rlzoo.common.utils import call_default_params 12 | from rlzoo.algorithms import DQN 13 | 14 | AlgName = 'DQN' 15 | EnvName = 'PongNoFrameskip-v4' 16 | EnvType = 'atari' 17 | 18 | # EnvName = 'CartPole-v1' 19 | # EnvType = 'classic_control' # the name of env needs to match the type of env 20 | 21 | env = build_env(EnvName, EnvType) 22 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 23 | alg = eval(AlgName+'(**alg_params)') 24 | alg.learn(env=env, mode='train', **learn_params) 25 | alg.learn(env=env, mode='test', render=True, **learn_params) 26 | 27 | Deep Q-Networks 28 | --------------------------------- 29 | 30 | .. autoclass:: rlzoo.algorithms.dqn.dqn.DQN 31 | :members: 32 | :undoc-members: 33 | 34 | Default Hyper-parameters 35 | ---------------------------------- 36 | 37 | .. automodule:: rlzoo.algorithms.dqn.default 38 | :members: 39 | :undoc-members: 40 | :show-inheritance: 41 | 42 | -------------------------------------------------------------------------------- /docs/algorithms/pg.rst: -------------------------------------------------------------------------------- 1 | VPG 2 | ================================= 3 | 4 | Example 5 | ----------- 6 | 7 | .. code-block:: python 8 | :linenos: 9 | 10 | from rlzoo.common.env_wrappers import build_env 11 | from rlzoo.common.utils import call_default_params 12 | from rlzoo.algorithms import PG 13 | 14 | AlgName = 'PG' 15 | EnvName = 'PongNoFrameskip-v4' 16 | EnvType = 'atari' 17 | 18 | # EnvName = 'CartPole-v0' 19 | # EnvType = 'classic_control' 20 | 21 | # EnvName = 'BipedalWalker-v2' 22 | # EnvType = 'box2d' 23 | 24 | # EnvName = 'Ant-v2' 25 | # EnvType = 'mujoco' 26 | 27 | # EnvName = 'FetchPush-v1' 28 | # EnvType = 'robotics' 29 | 30 | # EnvName = 'FishSwim-v0' 31 | # EnvType = 'dm_control' 32 | 33 | # EnvName = 'ReachTarget' 34 | # EnvType = 'rlbench' 35 | 36 | env = build_env(EnvName, EnvType) 37 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 38 | alg = eval(AlgName+'(**alg_params)') 39 | alg.learn(env=env, mode='train', render=False, **learn_params) 40 | alg.learn(env=env, mode='test', render=True, **learn_params) 41 | 42 | Vanilla Policy Gradient 43 | --------------------------------- 44 | 45 | .. autoclass:: rlzoo.algorithms.pg.pg.PG 46 | :members: 47 | :undoc-members: 48 | 49 | Default Hyper-parameters 50 | ---------------------------------- 51 | 52 | .. automodule:: rlzoo.algorithms.pg.default 53 | :members: 54 | :undoc-members: 55 | :show-inheritance: 56 | 57 | -------------------------------------------------------------------------------- /docs/algorithms/ppo.rst: -------------------------------------------------------------------------------- 1 | PPO 2 | =========================== 3 | 4 | Example 5 | ----------- 6 | 7 | .. 
code-block:: python 8 | :linenos: 9 | 10 | from rlzoo.common.env_wrappers import build_env 11 | from rlzoo.common.utils import call_default_params 12 | from rlzoo.algorithms import PPO 13 | 14 | EnvName = 'PongNoFrameskip-v4' 15 | EnvType = 'atari' 16 | 17 | # EnvName = 'Pendulum-v0' 18 | # EnvType = 'classic_control' 19 | 20 | # EnvName = 'BipedalWalker-v2' 21 | # EnvType = 'box2d' 22 | 23 | # EnvName = 'Ant-v2' 24 | # EnvType = 'mujoco' 25 | 26 | # EnvName = 'FetchPush-v1' 27 | # EnvType = 'robotics' 28 | 29 | # EnvName = 'FishSwim-v0' 30 | # EnvType = 'dm_control' 31 | 32 | # EnvName = 'ReachTarget' 33 | # EnvType = 'rlbench' 34 | 35 | env = build_env(EnvName, EnvType) 36 | alg_params, learn_params = call_default_params(env, EnvType, 'PPO') 37 | alg = PPO(method='clip', **alg_params) # specify 'clip' or 'penalty' method for PPO 38 | alg.learn(env=env, mode='train', render=False, **learn_params) 39 | alg.learn(env=env, mode='test', render=False, **learn_params) 40 | 41 | Proximal Policy Optimization (Penalty) 42 | ---------------------------------------------------- 43 | 44 | .. autoclass:: rlzoo.algorithms.ppo_penalty.ppo_penalty.PPO_PENALTY 45 | :members: 46 | :undoc-members: 47 | 48 | 49 | Proximal Policy Optimization (Clip) 50 | ------------------------------------------------ 51 | 52 | .. autoclass:: rlzoo.algorithms.ppo_clip.ppo_clip.PPO_CLIP 53 | :members: 54 | :undoc-members: 55 | 56 | Default Hyper-parameters 57 | ---------------------------------- 58 | 59 | .. automodule:: rlzoo.algorithms.ppo.default 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | 64 | -------------------------------------------------------------------------------- /docs/algorithms/sac.rst: -------------------------------------------------------------------------------- 1 | SAC 2 | =========================== 3 | 4 | Example 5 | ----------- 6 | 7 | .. code-block:: python 8 | :linenos: 9 | 10 | from rlzoo.common.env_wrappers import build_env 11 | from rlzoo.common.utils import call_default_params 12 | from rlzoo.algorithms import SAC 13 | 14 | AlgName = 'SAC' 15 | EnvName = 'Pendulum-v0' # only continuous action 16 | EnvType = 'classic_control' 17 | 18 | # EnvName = 'BipedalWalker-v2' 19 | # EnvType = 'box2d' 20 | 21 | # EnvName = 'Ant-v2' 22 | # EnvType = 'mujoco' 23 | 24 | # EnvName = 'FetchPush-v1' 25 | # EnvType = 'robotics' 26 | 27 | # EnvName = 'FishSwim-v0' 28 | # EnvType = 'dm_control' 29 | 30 | # EnvName = 'ReachTarget' 31 | # EnvType = 'rlbench' 32 | 33 | env = build_env(EnvName, EnvType) 34 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 35 | alg = eval(AlgName+'(**alg_params)') 36 | alg.learn(env=env, mode='train', render=False, **learn_params) 37 | alg.learn(env=env, mode='test', render=True, **learn_params) 38 | 39 | Soft Actor-Critic 40 | --------------------------------- 41 | 42 | .. autoclass:: rlzoo.algorithms.sac.sac.SAC 43 | :members: 44 | :undoc-members: 45 | 46 | Default Hyper-parameters 47 | ---------------------------------- 48 | 49 | .. automodule:: rlzoo.algorithms.sac.default 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | -------------------------------------------------------------------------------- /docs/algorithms/td3.rst: -------------------------------------------------------------------------------- 1 | TD3 2 | =========================== 3 | 4 | Example 5 | ----------- 6 | 7 | .. 
code-block:: python 8 | :linenos: 9 | 10 | from rlzoo.common.env_wrappers import build_env 11 | from rlzoo.common.utils import call_default_params 12 | from rlzoo.algorithms import TD3 13 | 14 | AlgName = 'TD3' 15 | EnvName = 'Pendulum-v0' # only continuous action 16 | EnvType = 'classic_control' 17 | 18 | # EnvName = 'BipedalWalker-v2' 19 | # EnvType = 'box2d' 20 | 21 | # EnvName = 'Ant-v2' 22 | # EnvType = 'mujoco' 23 | 24 | # EnvName = 'FetchPush-v1' 25 | # EnvType = 'robotics' 26 | 27 | # EnvName = 'FishSwim-v0' 28 | # EnvType = 'dm_control' 29 | 30 | # EnvName = 'ReachTarget' 31 | # EnvType = 'rlbench' 32 | 33 | env = build_env(EnvName, EnvType) 34 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 35 | alg = eval(AlgName+'(**alg_params)') 36 | alg.learn(env=env, mode='train', render=False, **learn_params) 37 | alg.learn(env=env, mode='test', render=True, **learn_params) 38 | 39 | Twin Delayed DDPG 40 | --------------------------------- 41 | 42 | .. autoclass:: rlzoo.algorithms.td3.td3.TD3 43 | :members: 44 | :undoc-members: 45 | 46 | Default Hyper-parameters 47 | ---------------------------------- 48 | 49 | .. automodule:: rlzoo.algorithms.td3.default 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | -------------------------------------------------------------------------------- /docs/algorithms/trpo.rst: -------------------------------------------------------------------------------- 1 | TRPO 2 | =========================== 3 | 4 | Example 5 | ----------- 6 | 7 | .. code-block:: python 8 | :linenos: 9 | 10 | from rlzoo.common.env_wrappers import build_env 11 | from rlzoo.common.utils import call_default_params 12 | from rlzoo.algorithms import TRPO 13 | 14 | AlgName = 'TRPO' 15 | EnvName = 'PongNoFrameskip-v4' 16 | EnvType = 'atari' 17 | 18 | # EnvName = 'CartPole-v0' 19 | # EnvType = 'classic_control' 20 | 21 | # EnvName = 'BipedalWalker-v2' 22 | # EnvType = 'box2d' 23 | 24 | # EnvName = 'Ant-v2' 25 | # EnvType = 'mujoco' 26 | 27 | # EnvName = 'FetchPush-v1' 28 | # EnvType = 'robotics' 29 | 30 | # EnvName = 'FishSwim-v0' 31 | # EnvType = 'dm_control' 32 | 33 | # EnvName = 'ReachTarget' 34 | # EnvType = 'rlbench' 35 | 36 | env = build_env(EnvName, EnvType) 37 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 38 | alg = eval(AlgName+'(**alg_params)') 39 | alg.learn(env=env, mode='train', render=False, **learn_params) 40 | alg.learn(env=env, mode='test', render=True, **learn_params) 41 | 42 | Trust Region Policy Optimization 43 | --------------------------------- 44 | 45 | .. autoclass:: rlzoo.algorithms.trpo.trpo.TRPO 46 | :members: 47 | :undoc-members: 48 | 49 | Default Hyper-parameters 50 | ---------------------------------- 51 | 52 | .. automodule:: rlzoo.algorithms.trpo.default 53 | :members: 54 | :undoc-members: 55 | :show-inheritance: 56 | 57 | -------------------------------------------------------------------------------- /docs/common/basicnets.rst: -------------------------------------------------------------------------------- 1 | Basic Networks 2 | =========================== 3 | 4 | 5 | Basic Networks in RLzoo 6 | --------------------------------- 7 | 8 | .. 
automodule:: rlzoo.common.basic_nets 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | -------------------------------------------------------------------------------- /docs/common/buffer.rst: -------------------------------------------------------------------------------- 1 | Replay Buffer 2 | =========================== 3 | 4 | 5 | Replay Buffer in RLzoo 6 | --------------------------------- 7 | 8 | .. automodule:: rlzoo.common.buffer 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | :special-members: 13 | 14 | -------------------------------------------------------------------------------- /docs/common/distributions.rst: -------------------------------------------------------------------------------- 1 | Distributions 2 | =========================== 3 | 4 | 5 | Distributions for Stochastic Policy in RLzoo 6 | ---------------------------------------------- 7 | 8 | .. automodule:: rlzoo.common.distributions 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | -------------------------------------------------------------------------------- /docs/common/envlist.rst: -------------------------------------------------------------------------------- 1 | Environment List 2 | =========================== 3 | 4 | .. _env_list: 5 | 6 | List of Supported Environments in RLzoo 7 | ---------------------------------------- 8 | 9 | .. automodule:: rlzoo.common.env_list 10 | :members: 11 | :undoc-members: 12 | :show-inheritance: 13 | 14 | 15 | 16 | .. literalinclude:: ../../rlzoo/common/env_list.py 17 | :language: python 18 | :lines: 10- 19 | :linenos: -------------------------------------------------------------------------------- /docs/common/envwrappers.rst: -------------------------------------------------------------------------------- 1 | Environment Wrappers 2 | =========================== 3 | 4 | 5 | Environment Wrappers in RLzoo 6 | --------------------------------- 7 | 8 | .. automodule:: rlzoo.common.env_wrappers 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | -------------------------------------------------------------------------------- /docs/common/mathutils.rst: -------------------------------------------------------------------------------- 1 | Math Utilities 2 | =========================== 3 | 4 | 5 | Math Utilities in RLzoo 6 | --------------------------------- 7 | 8 | .. automodule:: rlzoo.common.math_utils 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | -------------------------------------------------------------------------------- /docs/common/policynets.rst: -------------------------------------------------------------------------------- 1 | Policy Networks 2 | =========================== 3 | 4 | 5 | Policy Networks in RLzoo 6 | --------------------------------- 7 | 8 | .. autoclass:: rlzoo.common.policy_networks.StochasticContinuousPolicyNetwork 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | :special-members: 13 | 14 | 15 | .. autoclass:: rlzoo.common.policy_networks.DeterministicContinuousPolicyNetwork 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | :special-members: 20 | 21 | .. autoclass:: rlzoo.common.policy_networks.DeterministicPolicyNetwork 22 | :members: 23 | :undoc-members: 24 | :show-inheritance: 25 | :special-members: 26 | 27 | .. 
autoclass:: rlzoo.common.policy_networks.StochasticPolicyNetwork 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | :special-members: 32 | -------------------------------------------------------------------------------- /docs/common/utils.rst: -------------------------------------------------------------------------------- 1 | Common Utilities 2 | =========================== 3 | 4 | 5 | Common Utilities in RLzoo 6 | --------------------------------- 7 | 8 | .. automodule:: rlzoo.common.utils 9 | :members: 10 | :undoc-members: 11 | :show-inheritance: 12 | 13 | -------------------------------------------------------------------------------- /docs/common/valuenets.rst: -------------------------------------------------------------------------------- 1 | Value Networks 2 | =========================== 3 | 4 | 5 | Value Networks in RLzoo 6 | --------------------------------- 7 | 8 | .. autoclass:: rlzoo.common.value_networks.ValueNetwork 9 | :members: 10 | :undoc-members: 11 | :special-members: 12 | 13 | .. autoclass:: rlzoo.common.value_networks.MlpQNetwork 14 | :members: 15 | :undoc-members: 16 | :special-members: 17 | 18 | .. autoclass:: rlzoo.common.value_networks.QNetwork 19 | :members: 20 | :undoc-members: 21 | :special-members: 22 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("../")) # Important 17 | sys.path.insert(0, os.path.abspath(os.path.join("..", "rlzoo"))) # Important 18 | 19 | # from rlzoo.algorithms import * 20 | import sphinx_rtd_theme 21 | 22 | # -- Project information ----------------------------------------------------- 23 | 24 | project = 'RLzoo' 25 | copyright = '2020, Zihan Ding, Tianyang Yu, Yanhua Huang, Hongming Zhang, Hao Dong' 26 | author = 'Zihan Ding, Tianyang Yu, Yanhua Huang, Hongming Zhang, Hao Dong' 27 | 28 | # The full version, including alpha/beta/rc tags 29 | release = '1.0.3' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 
37 | 38 | extensions = [ 39 | 'sphinx.ext.autodoc', 40 | 'sphinx.ext.doctest', 41 | 'sphinx.ext.intersphinx', 42 | 'sphinx.ext.coverage', 43 | 'sphinx.ext.imgmath', 44 | 'sphinx.ext.mathjax', 45 | 'sphinx.ext.ifconfig', 46 | 'sphinx.ext.viewcode', 47 | 'sphinx.ext.githubpages', 48 | # 'sphinxcontrib.bibtex', 49 | 'recommonmark' 50 | ] 51 | 52 | autodoc_mock_imports = [ 53 | 'cv2', 54 | 'hyperdash', 55 | 'gridfs', 56 | 'horovod', 57 | 'hyperdash', 58 | 'imageio', 59 | 'lxml', 60 | 'matplotlib', 61 | 'nltk', 62 | # 'numpy', 63 | 'PIL', 64 | 'progressbar', 65 | 'pymongo', 66 | 'scipy', 67 | 'skimage', 68 | 'sklearn', 69 | # 'tensorflow', 70 | 'tqdm', 71 | 'h5py', 72 | # 'tensorlayer.third_party.roi_pooling.roi_pooling.roi_pooling_ops', # TL C++ Packages 73 | ] 74 | 75 | 76 | # Add any paths that contain templates here, relative to this directory. 77 | templates_path = ['_templates'] 78 | source_suffix = ['.rst', '.md'] 79 | master_doc = 'index' 80 | 81 | # List of patterns, relative to source directory, that match files and 82 | # directories to ignore when looking for source files. 83 | # This pattern also affects html_static_path and html_extra_path. 84 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 85 | 86 | 87 | # -- Options for HTML output ------------------------------------------------- 88 | 89 | # The theme to use for HTML and HTML Help pages. See the documentation for 90 | # a list of builtin themes. 91 | # 92 | html_theme = 'sphinx_rtd_theme' 93 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 94 | html_logo = './img/rlzoo-logo.png' 95 | 96 | 97 | # Add any paths that contain custom static files (such as style sheets) here, 98 | # relative to this directory. They are copied after the builtin static files, 99 | # so a file named "default.css" will overwrite the builtin "default.css". 100 | html_static_path = ['_static'] 101 | -------------------------------------------------------------------------------- /docs/guide/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | ================================= 3 | 4 | make_env() 5 | ---------------------- 6 | 7 | It can be used as: 8 | 9 | .. code-block:: python 10 | :linenos: 11 | 12 | env = build_env(EnvName, EnvType) 13 | 14 | call_default_params() 15 | ---------------------- 16 | 17 | It can be used as: 18 | 19 | .. code-block:: python 20 | :linenos: 21 | 22 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 23 | 24 | The ``call_default_params`` returns the hyper-parameters stored in two dictionaries ``alg_params`` and ``learn_params``, which can be printed to see what are contained inside. Hyper-parameters in these two dictionaries can also be changed by users before instantiating the agent and starting the learning process. 25 | 26 | If you want to know exactly where the default hyper-parameters come from, they are stored in an individual Python script as ``default.py`` in each algorithm file in ``./rlzoo/algorithms/``. 27 | 28 | alg.learn() 29 | ------------ 30 | 31 | It can be used as: 32 | 33 | .. code-block:: python 34 | :linenos: 35 | 36 | # start the training 37 | alg.learn(env=env, mode='train', render=False, **learn_params) 38 | # test after training 39 | alg.learn(env=env, mode='test', render=True, **learn_params) 40 | 41 | where the ``alg`` is an instantiation of DRL algorithm in RLzoo. 
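For example, a default hyper-parameter can be inspected and overridden before the agent is created. The following is a minimal sketch, assuming ``DQN`` on ``CartPole-v1`` (``classic_control``) as in the DQN example; the ``'train_episodes'`` key used below is only an assumption for illustration — print the two dictionaries first to see the exact keys available for your algorithm and environment:

.. code-block:: python
    :linenos:

    from rlzoo.common.env_wrappers import build_env
    from rlzoo.common.utils import call_default_params
    from rlzoo.algorithms import DQN

    AlgName, EnvName, EnvType = 'DQN', 'CartPole-v1', 'classic_control'
    env = build_env(EnvName, EnvType)
    # two dictionaries of default hyper-parameters
    alg_params, learn_params = call_default_params(env, EnvType, AlgName)
    print(alg_params)    # parameters used to construct the algorithm
    print(learn_params)  # parameters passed to alg.learn()
    # override a value before instantiating the agent ('train_episodes' is an assumed key)
    learn_params['train_episodes'] = 300
    alg = eval(AlgName + '(**alg_params)')
    alg.learn(env=env, mode='train', **learn_params)

Any key changed in this way overrides the corresponding default (taken from the ``default.py`` of the chosen algorithm) before it is passed to the agent.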
-------------------------------------------------------------------------------- /docs/guide/configuration.rst: -------------------------------------------------------------------------------- 1 | Configurations Overview 2 | ================================= 3 | 4 | Supported DRL Algorithms 5 | -------------------------- 6 | RLzoo generally supports the following DRL algorithms: 7 | 8 | **Value-based methods** 9 | 10 | * `Deep Q-Networks (DQN) `_ 11 | * `Double DQN `_ 12 | * `Dueling DQN `_ 13 | * `Prioritized Experience Replay (PER) `_ 14 | * `Retrace `_ 15 | * `Noisy DQN `_ 16 | * `Distributed DQN `_ 17 | 18 | **Policy-based methods** 19 | 20 | * `Vanilla Policy Gradient (VPG) `_ 21 | * `Trust Region Policy Optimization (TRPO) `_ 22 | * `Proximal Policy Optimization (PPO) `_ 23 | * `Distributed PPO (DPPO) `_ 24 | 25 | **Actor-critic methods** 26 | 27 | * `Actor-Critic (AC) `_ 28 | * `Asynchronous Advantage Actor-Critic (A3C) `_ 29 | * `Deep Deterministic Policy Gradient (DDPG) `_ 30 | * `Twin Delayed DDPG (TD3) `_ 31 | * `Soft Actor-Critic (SAC) `_ 32 | 33 | 34 | Supported Environments 35 | -------------------------- 36 | RLzoo generally supports the following environments for DRL: 37 | 38 | * `OpenAI Gym `_ 39 | * Atari 40 | * Box2D 41 | * Classic Control 42 | * MuJoCo 43 | * Robotics 44 | * `DeepMind Control Suite `_ 45 | 46 | * `RLBench `_ 47 | 48 | 49 | The full list of specific environment names supported in RLzoo can be found in :ref:`env_list`. 50 | 51 | Supported Configurations 52 | ----------------------------- 53 | As in other libraries, not every configuration (a specific RL algorithm on a specific environment) is supported in RLzoo. The supported configurations for RL algorithms with corresponding environments in RLzoo are listed in the following table. 
54 | 55 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ 56 | | Algorithms | Action Space | Policy | Update | Envs | 57 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ 58 | | DQN (double, dueling, PER) | Discrete Only | NA | Off-policy | Atari, Classic Control | 59 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ 60 | | AC | Discrete/Continuous | Stochastic | On-policy | All | 61 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ 62 | | PG | Discrete/Continuous | Stochastic | On-policy | All | 63 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ 64 | | DDPG | Continuous | Deterministic | Off-policy | Classic Control, Box2D, MuJoCo, Robotics, DeepMind Control, RLBench | 65 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ 66 | | TD3 | Continuous | Deterministic | Off-policy | Classic Control, Box2D, MuJoCo, Robotics, DeepMind Control, RLBench | 67 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ 68 | | SAC | Continuous | Stochastic | Off-policy | Classic Control, Box2D, MuJoCo, Robotics, DeepMind Control, RLBench | 69 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ 70 | | A3C | Discrete/Continuous | Stochastic | On-policy | Atari, Classic Control, Box2D, MuJoCo, Robotics, DeepMind Control | 71 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ 72 | | PPO | Discrete/Continuous | Stochastic | On-policy | All | 73 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ 74 | | DPPO | Discrete/Continuous | Stochastic | On-policy | Atari, Classic Control, Box2D, MuJoCo, Robotics, DeepMind Control | 75 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ 76 | | TRPO | Discrete/Continuous | Stochastic | On-policy | All | 77 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+ -------------------------------------------------------------------------------- /docs/guide/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ================================= 3 | 4 | RLzoo generally requires Python>=3.5. Also if you want to use DeepMind Control Suite environment, Python 3.6 will be required. 5 | 6 | Direct installation: 7 | 8 | .. code-block:: bash 9 | :linenos: 10 | 11 | pip3 install rlzoo --upgrade 12 | 13 | Install from the source code on github: 14 | 15 | .. 
code-block:: bash 16 | :linenos: 17 | 18 | git clone https://github.com/tensorlayer/RLzoo.git 19 | cd RLzoo 20 | pip3 install . -------------------------------------------------------------------------------- /docs/guide/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quick Start 2 | ================================= 3 | 4 | Simple Usage 5 | --------------- 6 | 7 | Open ``./run_rlzoo.py``: 8 | 9 | .. code-block:: python 10 | :linenos: 11 | 12 | from rlzoo.common.env_wrappers import build_env 13 | from rlzoo.common.utils import call_default_params 14 | from rlzoo.algorithms import TD3 15 | # choose an algorithm 16 | AlgName = 'TD3' 17 | # select a corresponding environment type 18 | EnvType = 'classic_control' 19 | # choose an environment 20 | EnvName = 'Pendulum-v0' 21 | # build an environment with wrappers 22 | env = build_env(EnvName, EnvType) 23 | # call default parameters for the algorithm and learning process 24 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 25 | # instantiate the algorithm 26 | alg = eval(AlgName+'(**alg_params)') 27 | # start the training 28 | alg.learn(env=env, mode='train', render=False, **learn_params) 29 | # test after training 30 | alg.learn(env=env, mode='test', render=True, **learn_params) 31 | 32 | 33 | Run the example: 34 | 35 | .. code-block:: bash 36 | 37 | python run_rlzoo.py 38 | 39 | 40 | Choices for ``AlgName``: 'DQN', 'AC', 'A3C', 'DDPG', 'TD3', 'SAC', 'PG', 'TRPO', 'PPO', 'DPPO' 41 | 42 | Choices for ``EnvType``: 'atari', 'box2d', 'classic_control', 'mujoco', 'robotics', 'dm_control', 'rlbench' 43 | 44 | Choices for ``EnvName`` refer to :ref:`env_list` 45 | 46 | 47 | Another Usage 48 | --------------- 49 | 50 | To provide more flexibility, we give another usage example of RLzoo with more explicit configurations as follows, where users can pass in customized networks, optimizers, etc. 51 | 52 | .. 
code-block:: python 53 | :linenos: 54 | 55 | import gym 56 | from rlzoo.common.utils import make_env, set_seed 57 | from rlzoo.algorithms import AC 58 | from rlzoo.common.value_networks import ValueNetwork 59 | from rlzoo.common.policy_networks import StochasticPolicyNetwork 60 | import tensorflow as tf  # used for name scopes, optimizers and activations below 61 | ''' load environment ''' 62 | env = gym.make('CartPole-v0').unwrapped 63 | obs_space = env.observation_space 64 | act_space = env.action_space 65 | # reproducible 66 | seed = 2 67 | set_seed(seed, env) 68 | 69 | ''' build networks for the algorithm ''' 70 | num_hidden_layer = 4 # number of hidden layers for the networks 71 | hidden_dim = 64 # dimension of hidden layers for the networks 72 | with tf.name_scope('AC'): 73 | with tf.name_scope('Critic'): 74 | # choose the critic network, can be replaced with customized network 75 | critic = ValueNetwork(obs_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) 76 | with tf.name_scope('Actor'): 77 | # choose the actor network, can be replaced with customized network 78 | actor = StochasticPolicyNetwork(obs_space, act_space, hidden_dim_list=num_hidden_layer * [hidden_dim], output_activation=tf.nn.tanh) 79 | net_list = [actor, critic] # list of the networks 80 | 81 | ''' choose optimizers ''' 82 | a_lr, c_lr = 1e-4, 1e-2 # a_lr: learning rate of the actor; c_lr: learning rate of the critic 83 | a_optimizer = tf.optimizers.Adam(a_lr) 84 | c_optimizer = tf.optimizers.Adam(c_lr) 85 | optimizers_list = [a_optimizer, c_optimizer] # list of optimizers 86 | 87 | # initialize the algorithm model, with algorithm parameters passed in 88 | model = AC(net_list, optimizers_list) 89 | ''' 90 | full list of arguments for the algorithm 91 | ---------------------------------------- 92 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 93 | optimizers_list: a list of optimizers for all networks and differentiable variables 94 | gamma: discount factor of reward 95 | action_range: scale of action values 96 | ''' 97 | 98 | # start the training process, with learning parameters passed in 99 | model.learn(env, train_episodes=500, max_steps=200, 100 | save_interval=50, mode='train', render=False) 101 | ''' 102 | full list of parameters for training 103 | ------------------------------------- 104 | env: learning environment 105 | train_episodes: total number of episodes for training 106 | test_episodes: total number of episodes for testing 107 | max_steps: maximum number of steps for one episode 108 | save_interval: time steps for saving the weights and plotting the results 109 | mode: 'train' or 'test' 110 | render: if True, visualize the environment 111 | ''' 112 | 113 | # test after training 114 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) 115 | 116 | 117 | 118 | Interactive Configurations 119 | -------------------------- 120 | 121 | We also provide an interactive learning configuration with Jupyter Notebook and *ipywidgets*, where you can select the algorithm, environment, and general learning settings by simply clicking on dropdown lists and sliders! 122 | A video demonstrating the usage is shown below. 123 | The interactive mode can be used with `rlzoo/interactive/main.ipynb `_; run ``$ jupyter notebook`` to open it. 124 | 125 | .. 
image:: ../../gif/interactive.gif -------------------------------------------------------------------------------- /docs/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/docs/img/logo.png -------------------------------------------------------------------------------- /docs/img/rlzoo-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/docs/img/rlzoo-logo.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. RLzoo documentation master file, created by 2 | sphinx-quickstart on Wed Apr 29 23:00:36 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Reinforcement Learning Zoo! 7 | ============================================ 8 | 9 | .. image:: img/rlzoo-logo.png 10 | :width: 40 % 11 | :align: center 12 | :target: https://github.com/tensorlayer/rlzoo 13 | 14 | RLzoo is a collection of the most practical reinforcement learning algorithms, frameworks and applications, released on `Github `_ in November 2019. It is implemented with Tensorflow 2.0 and API of neural network layers in TensorLayer 2, to provide a hands-on fast-developing approach for reinforcement learning practices and benchmarks. It supports basic toy-test environments like `OpenAI Gym `_ and `DeepMind Control Suite `_ with very simple configurations. Moreover, RLzoo supports robot learning benchmark environment `RLBench `_ based on Vrep/Pyrep simulator. Other large-scale distributed training framework for more realistic scenarios with Unity 3D, Mujoco, Bullet Physics, etc, will be supported in the future. 15 | 16 | We also provide novices friendly `DRL Tutorials `_ for algorithms implementation, where each algorithm is implemented in an individual script. The tutorials serve as code examples for our Springer textbook `Deep Reinforcement Learning: Fundamentals, Research and Applications `_ , you can get the free PDF if your institute has Springer license. 17 | 18 | .. toctree:: 19 | :maxdepth: 1 20 | :caption: User Guide 21 | 22 | guide/installation 23 | guide/quickstart 24 | guide/configuration 25 | guide/api 26 | 27 | .. toctree:: 28 | :maxdepth: 1 29 | :caption: RL Algorithms 30 | 31 | algorithms/dqn 32 | algorithms/pg 33 | algorithms/ac 34 | algorithms/a3c 35 | algorithms/ddpg 36 | algorithms/td3 37 | algorithms/sac 38 | algorithms/trpo 39 | algorithms/ppo 40 | algorithms/dppo 41 | 42 | .. toctree:: 43 | :maxdepth: 1 44 | :caption: Common 45 | 46 | common/basicnets 47 | common/policynets 48 | common/valuenets 49 | common/buffer 50 | common/distributions 51 | common/envwrappers 52 | common/envlist 53 | common/mathutils 54 | common/utils 55 | 56 | .. toctree:: 57 | :maxdepth: 1 58 | :caption: Other Resources 59 | 60 | other/drl_book 61 | other/drl_tutorial 62 | 63 | Contributing 64 | ================== 65 | 66 | This project is under active development, if you want to join the core team, feel free to contact Zihan Ding at zhding[at]mail.ustc.edu.cn 67 | 68 | Citation 69 | ================== 70 | 71 | * :ref:`genindex` 72 | * :ref:`modindex` 73 | * :ref:`search` 74 | 75 | 76 | .. 
image:: img/logo.png 77 | :width: 70 % 78 | :align: center 79 | :target: https://github.com/tensorlayer/rlzoo 80 | 81 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: My Docs 2 | -------------------------------------------------------------------------------- /docs/other/drl_book.rst: -------------------------------------------------------------------------------- 1 | DRL Book 2 | ========== 3 | 4 | .. image:: http://deep-reinforcement-learning-book.github.io/assets/images/cover_v1.png 5 | :width: 30 % 6 | :align: center 7 | :target: https://deepreinforcementlearningbook.org 8 | 9 | - You can get the `free PDF `__ if your institute has Springer license. 10 | 11 | Deep reinforcement learning (DRL) relies on the intersection of reinforcement learning (RL) and deep learning (DL). It has been able to solve a wide range of complex decision-making tasks that were previously out of reach for a machine and famously contributed to the success of AlphaGo. Furthermore, it opens up numerous new applications in domains such as healthcare, robotics, smart grids, and finance. 12 | 13 | Divided into three main parts, this book provides a comprehensive and self-contained introduction to DRL. The first part introduces the foundations of DL, RL and widely used DRL methods and discusses their implementation. The second part covers selected DRL research topics, which are useful for those wanting to specialize in DRL research. To help readers gain a deep understanding of DRL and quickly apply the techniques in practice, the third part presents mass applications, such as the intelligent transportation system and learning to run, with detailed explanations. 14 | 15 | The book is intended for computer science students, both undergraduate and postgraduate, who would like to learn DRL from scratch, practice its implementation, and explore the research topics. This book also appeals to engineers and practitioners who do not have strong machine learning background, but want to quickly understand how DRL works and use the techniques in their applications. 
16 | 17 | Editors 18 | -------- 19 | - Hao Dong - Peking University 20 | - Zihan Ding - Princeton University 21 | - Shanghang Zhang - University of California, Berkeley 22 | 23 | Authors 24 | -------- 25 | - Hao Dong - Peking University 26 | - Zihan Ding - Princeton University 27 | - Shanghang Zhang - University of California, Berkeley 28 | - Hang Yuan - Oxford University 29 | - Hongming Zhang - Peking University 30 | - Jingqing Zhang - Imperial College London 31 | - Yanhua Huang - Xiaohongshu Technology Co. 32 | - Tianyang Yu - Nanchang University 33 | - Huaqing Zhang - Google 34 | - Ruitong Huang - Borealis AI 35 | 36 | 37 | .. image:: https://deep-generative-models.github.io/files/web/water-bottom-min.png 38 | :width: 100 % 39 | :align: center 40 | :target: https://github.com/tensorlayer/tensorlayer/edit/master/examples/reinforcement_learning 41 | 42 | 43 | -------------------------------------------------------------------------------- /docs/other/drl_tutorial.rst: -------------------------------------------------------------------------------- 1 | DRL Tutorial 2 | ================================= 3 | 4 | 5 | .. image:: https://tensorlayer.readthedocs.io/en/latest/_images/tl_transparent_logo.png 6 | :width: 30 % 7 | :align: center 8 | :target: https://github.com/tensorlayer/tensorlayer/edit/master/examples/reinforcement_learning 9 | 10 | 11 | Different from RLzoo for simple usage with **high-level APIs**, the `RL tutorial `__ aims to make the reinforcement learning tutorial simple, transparent and straight-forward with **low-level APIs**, as this would not only benefits new learners of reinforcement learning, but also provide convenience for senior researchers to testify their new ideas quickly. 12 | 13 | .. image:: https://deep-generative-models.github.io/files/web/water-bottom-min.png 14 | :width: 100 % 15 | :align: center 16 | :target: https://github.com/tensorlayer/tensorlayer/edit/master/examples/reinforcement_learning 17 | 18 | 19 | -------------------------------------------------------------------------------- /examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | ## Descriptions of Algorithms and Environments in RLZoo 4 | 5 | | Algorithms | Action Space | Policy | Update | Envs | 6 | | -------------------------- | ------------------- | ------------- | ---------- | ------------------------------------------------------------ | 7 | | DQN (double, dueling, PER) | Discrete Only | -- | Off-policy | Atari, Classic Control | 8 | | AC | Discrete/Continuous | Stochastic | On-policy | All | 9 | | PG | Discrete/Continuous | Stochastic | On-policy | All | 10 | | DDPG | Continuous | Deterministic | Off-policy | Classic Control, Box2D, Mujoco, Robotics, DeepMind Control, RLBench | 11 | | TD3 | Continuous | Deterministic | Off-policy | Classic Control, Box2D, Mujoco, Robotics, DeepMind Control, RLBench | 12 | | SAC | Continuous | Stochastic | Off-policy | Classic Control, Box2D, Mujoco, Robotics, DeepMind Control, RLBench | 13 | | A3C | Discrete/Continuous | Stochastic | On-policy | Atari, Classic Control, Box2D, Mujoco, Robotics, DeepMind Control | 14 | | PPO | Discrete/Continuous | Stochastic | On-policy | All | 15 | | DPPO | Discrete/Continuous | Stochastic | On-policy | Atari, Classic Control, Box2D, Mujoco, Robotics, DeepMind Control | 16 | | TRPO | Discrete/Continuous | Stochastic | On-policy | All | 17 | 18 | 19 | 20 | ## 1. 
Deep Q-Network (DQN) 21 | 22 | ```python 23 | AlgName = 'DQN' 24 | EnvName = 'PongNoFrameskip-v4' 25 | EnvType = 'atari' 26 | # EnvName = 'CartPole-v1' 27 | # EnvType = 'classic_control' # the name of env needs to match the type of env 28 | 29 | env = build_env(EnvName, EnvType) 30 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 31 | alg = eval(AlgName+'(**alg_params)') 32 | alg.learn(env=env, mode='train', **learn_params) 33 | alg.learn(env=env, mode='test', render=True, **learn_params) 34 | 35 | ``` 36 | 37 | ## 2. Actor-Critic (AC) 38 | 39 | ```python 40 | AlgName = 'AC' 41 | EnvName = 'PongNoFrameskip-v4' 42 | EnvType = 'atari' 43 | 44 | # EnvName = 'Pendulum-v0' 45 | # EnvType = 'classic_control' 46 | 47 | # EnvName = 'BipedalWalker-v2' 48 | # EnvType = 'box2d' 49 | 50 | # EnvName = 'Ant-v2' 51 | # EnvType = 'mujoco' 52 | 53 | # EnvName = 'FetchPush-v1' 54 | # EnvType = 'robotics' 55 | 56 | # EnvName = 'FishSwim-v0' 57 | # EnvType = 'dm_control' 58 | 59 | # EnvName = 'ReachTarget' 60 | # EnvType = 'rlbench' 61 | 62 | env = build_env(EnvName, EnvType) 63 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 64 | alg = eval(AlgName+'(**alg_params)') 65 | alg.learn(env=env, mode='train', render=False, **learn_params) 66 | alg.learn(env=env, mode='test', render=True, **learn_params) 67 | 68 | ``` 69 | 70 | ## 3. Policy Gradient (PG) 71 | 72 | ```python 73 | AlgName = 'PG' 74 | EnvName = 'PongNoFrameskip-v4' 75 | EnvType = 'atari' 76 | 77 | # EnvName = 'CartPole-v0' 78 | # EnvType = 'classic_control' 79 | 80 | # EnvName = 'BipedalWalker-v2' 81 | # EnvType = 'box2d' 82 | 83 | # EnvName = 'Ant-v2' 84 | # EnvType = 'mujoco' 85 | 86 | # EnvName = 'FetchPush-v1' 87 | # EnvType = 'robotics' 88 | 89 | # EnvName = 'FishSwim-v0' 90 | # EnvType = 'dm_control' 91 | 92 | # EnvName = 'ReachTarget' 93 | # EnvType = 'rlbench' 94 | 95 | env = build_env(EnvName, EnvType) 96 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 97 | alg = eval(AlgName+'(**alg_params)') 98 | alg.learn(env=env, mode='train', render=False, **learn_params) 99 | alg.learn(env=env, mode='test', render=True, **learn_params) 100 | ``` 101 | 102 | ## 4. Deep Deterministic Policy Gradient (DDPG) 103 | 104 | ```python 105 | AlgName = 'DDPG' 106 | EnvName = 'Pendulum-v0' # only continuous action 107 | EnvType = 'classic_control' 108 | 109 | # EnvName = 'BipedalWalker-v2' 110 | # EnvType = 'box2d' 111 | 112 | # EnvName = 'Ant-v2' 113 | # EnvType = 'mujoco' 114 | 115 | # EnvName = 'FetchPush-v1' 116 | # EnvType = 'robotics' 117 | 118 | # EnvName = 'FishSwim-v0' 119 | # EnvType = 'dm_control' 120 | 121 | # EnvName = 'ReachTarget' 122 | # EnvType = 'rlbench' 123 | 124 | env = build_env(EnvName, EnvType) 125 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 126 | alg = eval(AlgName+'(**alg_params)') 127 | alg.learn(env=env, mode='train', render=False, **learn_params) 128 | alg.learn(env=env, mode='test', render=True, **learn_params) 129 | 130 | ``` 131 | 132 | 133 | 134 | ## 5. 
Twin Delayed DDPG (TD3) 135 | 136 | ```python 137 | AlgName = 'TD3' 138 | EnvName = 'Pendulum-v0' # only continuous action 139 | EnvType = 'classic_control' 140 | 141 | # EnvName = 'BipedalWalker-v2' 142 | # EnvType = 'box2d' 143 | 144 | # EnvName = 'Ant-v2' 145 | # EnvType = 'mujoco' 146 | 147 | # EnvName = 'FetchPush-v1' 148 | # EnvType = 'robotics' 149 | 150 | # EnvName = 'FishSwim-v0' 151 | # EnvType = 'dm_control' 152 | 153 | # EnvName = 'ReachTarget' 154 | # EnvType = 'rlbench' 155 | 156 | env = build_env(EnvName, EnvType) 157 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 158 | alg = eval(AlgName+'(**alg_params)') 159 | alg.learn(env=env, mode='train', render=False, **learn_params) 160 | alg.learn(env=env, mode='test', render=True, **learn_params) 161 | ``` 162 | 163 | ## 6. Soft Actor-Critic (SAC) 164 | 165 | ```python 166 | AlgName = 'SAC' 167 | EnvName = 'Pendulum-v0' # only continuous action 168 | EnvType = 'classic_control' 169 | 170 | # EnvName = 'BipedalWalker-v2' 171 | # EnvType = 'box2d' 172 | 173 | # EnvName = 'Ant-v2' 174 | # EnvType = 'mujoco' 175 | 176 | # EnvName = 'FetchPush-v1' 177 | # EnvType = 'robotics' 178 | 179 | # EnvName = 'FishSwim-v0' 180 | # EnvType = 'dm_control' 181 | 182 | # EnvName = 'ReachTarget' 183 | # EnvType = 'rlbench' 184 | 185 | env = build_env(EnvName, EnvType) 186 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 187 | alg = eval(AlgName+'(**alg_params)') 188 | alg.learn(env=env, mode='train', render=False, **learn_params) 189 | alg.learn(env=env, mode='test', render=True, **learn_params) 190 | ``` 191 | 192 | ## 7. Asynchronous Advantage Actor-Critic (A3C) 193 | 194 | ```python 195 | AlgName = 'A3C' 196 | EnvName = 'PongNoFrameskip-v4' 197 | EnvType = 'atari' 198 | 199 | # EnvName = 'Pendulum-v0' # only continuous action 200 | # EnvType = 'classic_control' 201 | 202 | # EnvName = 'BipedalWalker-v2' 203 | # EnvType = 'box2d' 204 | 205 | # EnvName = 'Ant-v2' 206 | # EnvType = 'mujoco' 207 | 208 | # EnvName = 'FetchPush-v1' 209 | # EnvType = 'robotics' 210 | 211 | # EnvName = 'FishSwim-v0' 212 | # EnvType = 'dm_control' 213 | 214 | number_workers = 2 # need to specify number of parallel workers 215 | env = build_env(EnvName, EnvType, nenv=number_workers) 216 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 217 | alg = eval(AlgName+'(**alg_params)') 218 | alg.learn(env=env, mode='train', render=False, **learn_params) 219 | alg.learn(env=env, mode='test', render=True, **learn_params) 220 | ``` 221 | 222 | ## 8. Proximal Policy Optimization (PPO) 223 | 224 | ```python 225 | EnvName = 'PongNoFrameskip-v4' 226 | EnvType = 'atari' 227 | 228 | # EnvName = 'Pendulum-v0' 229 | # EnvType = 'classic_control' 230 | 231 | # EnvName = 'BipedalWalker-v2' 232 | # EnvType = 'box2d' 233 | 234 | # EnvName = 'Ant-v2' 235 | # EnvType = 'mujoco' 236 | 237 | # EnvName = 'FetchPush-v1' 238 | # EnvType = 'robotics' 239 | 240 | # EnvName = 'FishSwim-v0' 241 | # EnvType = 'dm_control' 242 | 243 | # EnvName = 'ReachTarget' 244 | # EnvType = 'rlbench' 245 | 246 | env = build_env(EnvName, EnvType) 247 | alg_params, learn_params = call_default_params(env, EnvType, 'PPO') 248 | alg = PPO(method='clip', **alg_params) # specify 'clip' or 'penalty' method for PPO 249 | alg.learn(env=env, mode='train', render=False, **learn_params) 250 | alg.learn(env=env, mode='test', render=False, **learn_params) 251 | ``` 252 | 253 | ## 9. 
Distributed Proximal Policy Optimization (DPPO) 254 | 255 | ```python 256 | EnvName = 'PongNoFrameskip-v4' 257 | EnvType = 'atari' 258 | 259 | # EnvName = 'Pendulum-v0' 260 | # EnvType = 'classic_control' 261 | 262 | # EnvName = 'BipedalWalker-v2' 263 | # EnvType = 'box2d' 264 | 265 | # EnvName = 'Ant-v2' 266 | # EnvType = 'mujoco' 267 | 268 | # EnvName = 'FetchPush-v1' 269 | # EnvType = 'robotics' 270 | 271 | # EnvName = 'FishSwim-v0' 272 | # EnvType = 'dm_control' 273 | 274 | # EnvName = 'ReachTarget' 275 | # EnvType = 'rlbench' 276 | 277 | number_workers = 2 # need to specify number of parallel workers 278 | env = build_env(EnvName, EnvType, nenv=number_workers) 279 | alg_params, learn_params = call_default_params(env, EnvType, 'DPPO') 280 | alg = DPPO(method='penalty', **alg_params) # specify 'clip' or 'penalty' method for PPO 281 | alg.learn(env=env, mode='train', render=False, **learn_params) 282 | alg.learn(env=env, mode='test', render=True, **learn_params) 283 | ``` 284 | 285 | ## 10. Trust Region Policy Optimization (TRPO) 286 | 287 | ```python 288 | AlgName = 'TRPO' 289 | EnvName = 'PongNoFrameskip-v4' 290 | EnvType = 'atari' 291 | 292 | # EnvName = 'CartPole-v0' 293 | # EnvType = 'classic_control' 294 | 295 | # EnvName = 'BipedalWalker-v2' 296 | # EnvType = 'box2d' 297 | 298 | # EnvName = 'Ant-v2' 299 | # EnvType = 'mujoco' 300 | 301 | # EnvName = 'FetchPush-v1' 302 | # EnvType = 'robotics' 303 | 304 | # EnvName = 'FishSwim-v0' 305 | # EnvType = 'dm_control' 306 | 307 | # EnvName = 'ReachTarget' 308 | # EnvType = 'rlbench' 309 | 310 | env = build_env(EnvName, EnvType) 311 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 312 | alg = eval(AlgName+'(**alg_params)') 313 | alg.learn(env=env, mode='train', render=False, **learn_params) 314 | alg.learn(env=env, mode='test', render=True, **learn_params) 315 | ``` 316 | 317 | -------------------------------------------------------------------------------- /gif/ACM_MM2021_Presentation_Slide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/ACM_MM2021_Presentation_Slide.pdf -------------------------------------------------------------------------------- /gif/atari.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/atari.gif -------------------------------------------------------------------------------- /gif/box2d.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/box2d.gif -------------------------------------------------------------------------------- /gif/classic.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/classic.gif -------------------------------------------------------------------------------- /gif/dmcontrol.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/dmcontrol.gif -------------------------------------------------------------------------------- /gif/interactive.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/interactive.gif -------------------------------------------------------------------------------- /gif/mujoco.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/mujoco.gif -------------------------------------------------------------------------------- /gif/rlbench.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/rlbench.gif -------------------------------------------------------------------------------- /gif/robotics.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/robotics.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.0.3 2 | numpy==1.16.3 3 | opencv-python==4.1.0.25 4 | pygame==1.9.6 5 | tensorflow-gpu==2.1.0 6 | tensorflow-probability==0.8.0 7 | tensorlayer>=2.1.0 8 | gym==0.12.5 9 | ipywidgets==7.5.1 10 | 11 | -------------------------------------------------------------------------------- /rlzoo/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | /img 3 | /log 4 | /model 5 | -------------------------------------------------------------------------------- /rlzoo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | from .ac.ac import AC 2 | from .pg.pg import PG 3 | from .dqn.dqn import DQN 4 | from .a3c.a3c import A3C 5 | from .ddpg.ddpg import DDPG 6 | from .td3.td3 import TD3 7 | from .sac.sac import SAC 8 | from .ppo.ppo import PPO 9 | from .ppo_penalty.ppo_penalty import PPO_PENALTY 10 | from .ppo_clip.ppo_clip import PPO_CLIP 11 | from .dppo.dppo import DPPO 12 | from .dppo_penalty.dppo_penalty import DPPO_PENALTY 13 | from .dppo_clip.dppo_clip import DPPO_CLIP 14 | from .trpo.trpo import TRPO 15 | -------------------------------------------------------------------------------- /rlzoo/algorithms/a3c/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/a3c/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/a3c/run_a3c.py: -------------------------------------------------------------------------------- 1 | from rlzoo.algorithms.a3c.a3c import A3C 2 | from rlzoo.common.policy_networks import * 3 | from rlzoo.common.value_networks import * 4 | import gym 5 | 6 | """ load environment """ 7 | env_id = 'BipedalWalker-v2' 8 | env = gym.make(env_id).unwrapped 9 | # env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run 10 | action_shape = env.action_space.shape 11 | state_shape = 
env.observation_space.shape 12 | # reproducible 13 | seed = 2 14 | np.random.seed(seed) 15 | tf.random.set_seed(seed) 16 | env.seed(seed) 17 | 18 | """ build networks for the algorithm """ 19 | num_hidden_layer = 4 # number of hidden layers for the networks 20 | hidden_dim = 64 # dimension of hidden layers for the networks 21 | num_workers = 2 22 | net_list2 = [] 23 | for i in range(num_workers + 1): 24 | with tf.name_scope('A3C'): 25 | with tf.name_scope('Actor'): 26 | actor = StochasticPolicyNetwork(env.observation_space, env.action_space, 27 | hidden_dim_list=num_hidden_layer * [hidden_dim]) 28 | with tf.name_scope('Critic'): 29 | critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) 30 | net_list = [actor, critic] 31 | net_list2.append(net_list) 32 | 33 | """ choose optimizers """ 34 | actor_lr, critic_lr = 5e-5, 1e-4 # learning rate 35 | a_optimizer = tf.optimizers.RMSprop(actor_lr) 36 | c_optimizer = tf.optimizers.RMSprop(critic_lr) 37 | optimizers_list = [a_optimizer, c_optimizer] 38 | 39 | model = A3C(net_list2, optimizers_list, entropy_beta=0.005) 40 | """ 41 | full list of arguments for the algorithm 42 | ---------------------------------------- 43 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 44 | optimizers_list: a list of optimizers for all networks and differentiable variables 45 | entropy_beta: factor for entropy boosted exploration 46 | """ 47 | 48 | env_list = [] 49 | for i in range(num_workers): 50 | env_list.append(gym.make(env_id).unwrapped) 51 | model.learn(env_list, train_episodes=20000, test_episodes=100, max_steps=20000, n_workers=num_workers, update_itr=10, 52 | gamma=0.99, save_interval=500, mode='train') 53 | """ 54 | full list of parameters for training 55 | --------------------------------------- 56 | env_list: a list of same learning environments 57 | train_episodes: total number of episodes for training 58 | test_episodes: total number of episodes for testing 59 | max_steps: maximum number of steps for one episode 60 | n_workers: manually set number of workers 61 | update_itr: update global policy after several episodes 62 | gamma: reward discount factor 63 | save_interval: timesteps for saving the weights and plotting the results 64 | mode: train or test 65 | """ 66 | # test 67 | model.learn(env_list, test_episodes=100, max_steps=20000, mode='test', render=True) 68 | -------------------------------------------------------------------------------- /rlzoo/algorithms/ac/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/ac/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/ac/ac.py: -------------------------------------------------------------------------------- 1 | """ 2 | Actor-Critic 3 | ------------- 4 | It uses TD-error as the Advantage. 5 | 6 | Actor Critic History 7 | ---------------------- 8 | A3C > DDPG > AC 9 | 10 | Advantage 11 | ---------- 12 | AC converge faster than Policy Gradient. 13 | 14 | Disadvantage (IMPORTANT) 15 | ------------------------ 16 | The Policy is oscillated (difficult to converge), DDPG can solve 17 | this problem using advantage of DQN. 
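Update sketch (editor's illustration, not part of the original docstring)
-------------------------------------------------------------------------
A minimal pseudocode sketch of the one-step update implemented below, assuming a
discount factor gamma; the names here are illustrative only:

    td_error    = r + gamma * V(s_next) - V(s)   # critic's TD error, used as the advantage
    critic_loss = td_error ** 2                  # regress V(s) toward r + gamma * V(s_next)
    actor_loss  = -log_pi(a | s) * td_error      # policy gradient weighted by the advantage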
18 | 19 | Reference 20 | ---------- 21 | paper: https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf 22 | View more on MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ 23 | MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/ 24 | 25 | Environment 26 | ------------ 27 | CartPole-v0: https://gym.openai.com/envs/CartPole-v0 28 | 29 | A pole is attached by an un-actuated joint to a cart, which moves along a 30 | frictionless track. The system is controlled by applying a force of +1 or -1 31 | to the cart. The pendulum starts upright, and the goal is to prevent it from 32 | falling over. 33 | 34 | A reward of +1 is provided for every timestep that the pole remains upright. 35 | The episode ends when the pole is more than 15 degrees from vertical, or the 36 | cart moves more than 2.4 units from the center. 37 | 38 | 39 | Prerequisites 40 | -------------- 41 | tensorflow >=2.0.0a0 42 | tensorlayer >=2.0.0 43 | 44 | """ 45 | import time 46 | 47 | import tensorlayer as tl 48 | 49 | from rlzoo.common.utils import * 50 | from rlzoo.common.value_networks import * 51 | from rlzoo.common.policy_networks import * 52 | 53 | tl.logging.set_verbosity(tl.logging.DEBUG) 54 | 55 | 56 | ############################### Actor-Critic #################################### 57 | class AC: 58 | def __init__(self, net_list, optimizers_list, gamma=0.9): 59 | assert len(net_list) == 2 60 | assert len(optimizers_list) == 2 61 | self.name = 'AC' 62 | self.actor, self.critic = net_list 63 | assert isinstance(self.critic, ValueNetwork) 64 | assert isinstance(self.actor, StochasticPolicyNetwork) 65 | self.a_optimizer, self.c_optimizer = optimizers_list 66 | self.GAMMA = gamma 67 | 68 | def update(self, s, a, r, s_): 69 | # critic update 70 | v_ = self.critic(np.array([s_])) 71 | with tf.GradientTape() as tape: 72 | v = self.critic(np.array([s])) 73 | td_error = r + self.GAMMA * v_ - v # TD_error = r + gamma * V(newS) - V(S) 74 | loss = tf.square(td_error) 75 | grad = tape.gradient(loss, self.critic.trainable_weights) 76 | self.c_optimizer.apply_gradients(zip(grad, self.critic.trainable_weights)) 77 | 78 | # actor update 79 | with tf.GradientTape() as tape: 80 | # _logits = self.actor(np.array([s])) 81 | ## cross-entropy loss weighted by td-error (advantage): 82 | # the cross-entropy measures the difference between two probability distributions, here the predicted action distribution and the sampled action; 83 | # weighting it by the td-error (advantage) makes actions with a larger advantage contribute more strongly to the update, and vice versa.
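# Editor's note (not in the original source): concretely, `policy_dist.neglogp([a])` below returns
# -log pi(a|s) for the sampled action, so minimising `neg_log_prob * td_error` is a REINFORCE-style
# policy-gradient step that raises the probability of actions with a positive TD error and lowers it
# for actions with a negative one.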
84 | 85 | _ = self.actor(np.array([s])) 86 | neg_log_prob = self.actor.policy_dist.neglogp([a]) 87 | _exp_v = tf.reduce_mean(neg_log_prob * td_error) 88 | grad = tape.gradient(_exp_v, self.actor.trainable_weights) 89 | self.a_optimizer.apply_gradients(zip(grad, self.actor.trainable_weights)) 90 | return _exp_v 91 | 92 | def get_action(self, s): 93 | return self.actor(np.array([s]))[0].numpy() 94 | 95 | def get_action_greedy(self, s): 96 | return self.actor(np.array([s]), greedy=True)[0].numpy() 97 | 98 | def save_ckpt(self, env_name): # save trained weights 99 | save_model(self.actor, 'model_actor', self.name, env_name) 100 | save_model(self.critic, 'model_critic', self.name, env_name) 101 | 102 | def load_ckpt(self, env_name): # load trained weights 103 | load_model(self.actor, 'model_actor', self.name, env_name) 104 | load_model(self.critic, 'model_critic', self.name, env_name) 105 | 106 | def learn(self, env, train_episodes=1000, test_episodes=500, max_steps=200, 107 | save_interval=100, mode='train', render=False, plot_func=None): 108 | """ 109 | :param env: learning environment 110 | :param train_episodes: total number of episodes for training 111 | :param test_episodes: total number of episodes for testing 112 | :param max_steps: maximum number of steps for one episode 113 | :param save_interval: time steps for saving the weights and plotting the results 114 | :param mode: 'train' or 'test' 115 | :param render: if true, visualize the environment 116 | :param plot_func: additional function for interactive module 117 | """ 118 | 119 | t0 = time.time() 120 | if mode == 'train': 121 | print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) 122 | reward_buffer = [] 123 | for i_episode in range(train_episodes): 124 | s = env.reset() 125 | ep_rs_sum = 0 # rewards of all steps 126 | 127 | for step in range(max_steps): 128 | 129 | if render: 130 | env.render() 131 | 132 | a = self.get_action(s) 133 | s_new, r, done, info = env.step(a) 134 | ep_rs_sum += r 135 | 136 | try: 137 | self.update(s, a, r, s_new) # learn Policy : true_gradient = grad[logPi(s, a) * td_error] 138 | except KeyboardInterrupt: # if Ctrl+C at running actor.learn(), then save model, or exit if not at actor.learn() 139 | self.save_ckpt(env_name=env.spec.id) 140 | plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) 141 | 142 | s = s_new 143 | 144 | if done: 145 | break 146 | 147 | reward_buffer.append(ep_rs_sum) 148 | if plot_func is not None: 149 | plot_func(reward_buffer) 150 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \ 151 | .format(i_episode, train_episodes, ep_rs_sum, time.time() - t0)) 152 | 153 | if i_episode % save_interval == 0: 154 | self.save_ckpt(env_name=env.spec.id) 155 | plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) 156 | 157 | self.save_ckpt(env_name=env.spec.id) 158 | plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) 159 | 160 | elif mode == 'test': 161 | self.load_ckpt(env_name=env.spec.id) 162 | print('Testing... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) 163 | 164 | reward_buffer = [] 165 | for i_episode in range(test_episodes): 166 | s = env.reset() 167 | ep_rs_sum = 0 # rewards of all steps 168 | for step in range(max_steps): 169 | if render: env.render() 170 | a = self.get_action_greedy(s) 171 | s_new, r, done, info = env.step(a) 172 | s_new = s_new 173 | 174 | ep_rs_sum += r 175 | s = s_new 176 | 177 | if done: 178 | break 179 | 180 | reward_buffer.append(ep_rs_sum) 181 | if plot_func: 182 | plot_func(reward_buffer) 183 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( 184 | i_episode, test_episodes, ep_rs_sum, time.time() - t0)) 185 | 186 | else: 187 | print('unknown mode type') 188 | -------------------------------------------------------------------------------- /rlzoo/algorithms/ac/run_ac.py: -------------------------------------------------------------------------------- 1 | from rlzoo.common.utils import set_seed 2 | from rlzoo.algorithms.ac.ac import AC 3 | from rlzoo.common.value_networks import * 4 | from rlzoo.common.policy_networks import * 5 | import gym 6 | 7 | """ load environment """ 8 | # env = gym.make('CartPole-v0').unwrapped 9 | env = gym.make('Pendulum-v0').unwrapped 10 | obs_space = env.observation_space 11 | act_space = env.action_space 12 | # reproducible 13 | seed = 1 14 | set_seed(seed, env) 15 | 16 | # env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run 17 | 18 | 19 | """ build networks for the algorithm """ 20 | num_hidden_layer = 2 # number of hidden layers for the networks 21 | hidden_dim = 64 # dimension of hidden layers for the networks 22 | with tf.name_scope('AC'): 23 | with tf.name_scope('Critic'): 24 | critic = ValueNetwork(obs_space, hidden_dim_list=num_hidden_layer * [hidden_dim]) 25 | with tf.name_scope('Actor'): 26 | actor = StochasticPolicyNetwork(obs_space, act_space, hidden_dim_list=num_hidden_layer * [hidden_dim], 27 | output_activation=tf.nn.tanh) 28 | net_list = [actor, critic] 29 | 30 | """ choose optimizers """ 31 | a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic 32 | a_optimizer = tf.optimizers.Adam(a_lr) 33 | c_optimizer = tf.optimizers.Adam(c_lr) 34 | optimizers_list = [a_optimizer, c_optimizer] 35 | 36 | model = AC(net_list, optimizers_list) 37 | """ 38 | full list of arguments for the algorithm 39 | ---------------------------------------- 40 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 41 | optimizers_list: a list of optimizers for all networks and differentiable variables 42 | gamma: discounted factor of reward 43 | action_range: scale of action values 44 | """ 45 | 46 | model.learn(env, train_episodes=500, max_steps=200, 47 | save_interval=50, mode='train', render=False) 48 | """ 49 | full list of parameters for training 50 | --------------------------------------- 51 | env: learning environment 52 | train_episodes: total number of episodes for training 53 | test_episodes: total number of episodes for testing 54 | max_steps: maximum number of steps for one episode 55 | save_interval: time steps for saving the weights and plotting the results 56 | mode: 'train' or 'test' 57 | render: if true, visualize the environment 58 | """ 59 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) 60 | --------------------------------------------------------------------------------
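Editor's note: a minimal evaluation sketch to complement `run_ac.py` above. It reuses only methods already defined in `rlzoo/algorithms/ac/ac.py` (`load_ckpt`, `get_action_greedy`) and assumes the training call above has already saved a checkpoint; it is an illustration, not part of the repository.

```python
# Roll out the trained AC policy greedily for one episode (illustrative only).
model.load_ckpt(env_name=env.spec.id)   # load the weights saved by model.learn(...)
s = env.reset()
episode_reward = 0.0
for _ in range(200):                    # bound the rollout; the unwrapped env has no time limit
    a = model.get_action_greedy(s)      # deterministic action from the policy network
    s, r, done, _ = env.step(a)
    episode_reward += r
    if done:
        break
print('Greedy episode reward:', episode_reward)
```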
/rlzoo/algorithms/ddpg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/ddpg/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/ddpg/ddpg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Deterministic Policy Gradient (DDPG) 3 | ----------------------------------------- 4 | An algorithm concurrently learns a Q-function and a policy. 5 | It uses off-policy data and the Bellman equation to learn the Q-function, 6 | and uses the Q-function to learn the policy. 7 | Reference 8 | --------- 9 | Deterministic Policy Gradient Algorithms, Silver et al. 2014 10 | Continuous Control With Deep Reinforcement Learning, Lillicrap et al. 2016 11 | MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ 12 | MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/ 13 | 14 | Prerequisites 15 | ------------- 16 | tensorflow >=2.0.0a0 17 | tensorflow-probability 0.6.0 18 | tensorlayer >=2.0.0 19 | """ 20 | 21 | import time 22 | 23 | from rlzoo.common.utils import * 24 | from rlzoo.common.buffer import * 25 | from rlzoo.common.policy_networks import * 26 | from rlzoo.common.value_networks import * 27 | 28 | 29 | ############################### DDPG #################################### 30 | 31 | 32 | class DDPG(object): 33 | """ 34 | DDPG class 35 | """ 36 | 37 | def __init__(self, net_list, optimizers_list, replay_buffer_size, action_range=1., tau=0.01): 38 | """ 39 | :param net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 40 | :param optimizers_list: a list of optimizers for all networks and differentiable variables 41 | :param replay_buffer_size: the size of buffer for storing explored samples 42 | :param tau: soft update factor 43 | """ 44 | assert len(net_list) == 4 45 | assert len(optimizers_list) == 2 46 | self.name = 'DDPG' 47 | 48 | self.critic, self.critic_target, self.actor, self.actor_target = net_list 49 | 50 | assert isinstance(self.critic, QNetwork) 51 | assert isinstance(self.critic_target, QNetwork) 52 | assert isinstance(self.actor, DeterministicPolicyNetwork) 53 | assert isinstance(self.actor_target, DeterministicPolicyNetwork) 54 | assert isinstance(self.actor.action_space, gym.spaces.Box) 55 | 56 | def copy_para(from_model, to_model): 57 | for i, j in zip(from_model.trainable_weights, to_model.trainable_weights): 58 | j.assign(i) 59 | 60 | copy_para(self.actor, self.actor_target) 61 | copy_para(self.critic, self.critic_target) 62 | 63 | self.replay_buffer_size = replay_buffer_size 64 | self.buffer = ReplayBuffer(replay_buffer_size) 65 | 66 | self.ema = tf.train.ExponentialMovingAverage(decay=1 - tau) # soft replacement 67 | self.action_range = action_range 68 | 69 | self.critic_opt, self.actor_opt = optimizers_list 70 | 71 | def ema_update(self): 72 | """ 73 | Soft updating by exponential smoothing 74 | 75 | :return: None 76 | """ 77 | paras = self.actor.trainable_weights + self.critic.trainable_weights 78 | self.ema.apply(paras) 79 | for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras): 80 | i.assign(self.ema.average(j)) 81 | 82 | def sample_action(self): 83 | """ generate random actions for exploration """ 84 | a = tf.random.uniform(self.actor.action_space.shape, 
self.actor.action_space.low, self.actor.action_space.high) 85 | return a 86 | 87 | def get_action(self, s, noise_scale): 88 | """ 89 | Choose action with exploration 90 | 91 | :param s: state 92 | 93 | :return: action 94 | """ 95 | a = self.actor([s])[0].numpy()*self.action_range 96 | 97 | # add randomness to action selection for exploration 98 | noise = np.random.normal(0, 1, a.shape) * noise_scale 99 | a += noise 100 | a = np.clip(a, self.actor.action_space.low, self.actor.action_space.high) 101 | 102 | return a 103 | 104 | def get_action_greedy(self, s): 105 | """ 106 | Choose action 107 | 108 | :param s: state 109 | 110 | :return: action 111 | """ 112 | return self.actor([s])[0].numpy()*self.action_range 113 | 114 | def update(self, batch_size, gamma): 115 | """ 116 | Update parameters 117 | 118 | :param batch_size: update batch size 119 | :param gamma: reward decay factor 120 | 121 | :return: 122 | """ 123 | bs, ba, br, bs_, bd = self.buffer.sample(batch_size) 124 | 125 | ba_ = self.actor_target(bs_)*self.action_range 126 | 127 | q_ = self.critic_target([bs_, ba_]) 128 | y = br + (1 - bd) * gamma * q_ 129 | with tf.GradientTape() as tape: 130 | q = self.critic([bs, ba]) 131 | td_error = tf.losses.mean_squared_error(y, q) 132 | c_grads = tape.gradient(td_error, self.critic.trainable_weights) 133 | self.critic_opt.apply_gradients(zip(c_grads, self.critic.trainable_weights)) 134 | 135 | with tf.GradientTape() as tape: 136 | a = self.actor(bs)*self.action_range 137 | q = self.critic([bs, a]) 138 | a_loss = - tf.reduce_mean(q) # maximize the q 139 | a_grads = tape.gradient(a_loss, self.actor.trainable_weights) 140 | self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights)) 141 | self.ema_update() 142 | 143 | def store_transition(self, s, a, r, s_, d): 144 | """ 145 | Store data in data buffer 146 | 147 | :param s: state 148 | :param a: act 149 | :param r: reward 150 | :param s_: next state 151 | 152 | :return: None 153 | """ 154 | d = 1 if d else 0 155 | 156 | self.buffer.push(s, a, [r], s_, d) 157 | 158 | def save_ckpt(self, env_name): 159 | """ 160 | save trained weights 161 | 162 | :return: None 163 | """ 164 | save_model(self.actor, 'model_policy_net', self.name, env_name) 165 | save_model(self.actor_target, 'model_target_policy_net', self.name, env_name) 166 | save_model(self.critic, 'model_q_net', self.name, env_name) 167 | save_model(self.critic_target, 'model_target_q_net', self.name, env_name) 168 | 169 | def load_ckpt(self, env_name): 170 | """ 171 | load trained weights 172 | 173 | :return: None 174 | """ 175 | load_model(self.actor, 'model_policy_net', self.name, env_name) 176 | load_model(self.actor_target, 'model_target_policy_net', self.name, env_name) 177 | load_model(self.critic, 'model_q_net', self.name, env_name) 178 | load_model(self.critic_target, 'model_target_q_net', self.name, env_name) 179 | 180 | def learn(self, env, train_episodes=200, test_episodes=100, max_steps=200, save_interval=10, explore_steps=500, 181 | mode='train', render=False, batch_size=32, gamma=0.9, noise_scale=1., noise_scale_decay=0.995, 182 | plot_func=None): 183 | """ 184 | learn function 185 | 186 | :param env: learning environment 187 | :param train_episodes: total number of episodes for training 188 | :param test_episodes: total number of episodes for testing 189 | :param max_steps: maximum number of steps for one episode 190 | :param save_interval: time steps for saving 191 | :param explore_steps: for random action sampling in the beginning of training 192 | :param mode: 
train or test mode 193 | :param render: render each step 194 | :param batch_size: update batch size 195 | :param gamma: reward decay factor 196 | :param noise_scale: range of action noise for exploration 197 | :param noise_scale_decay: noise scale decay factor 198 | :param plot_func: additional function for interactive module 199 | :return: None 200 | """ 201 | 202 | t0 = time.time() 203 | 204 | if mode == 'train': # train 205 | print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) 206 | reward_buffer = [] 207 | frame_idx = 0 208 | for i in range(1, train_episodes + 1): 209 | s = env.reset() 210 | ep_reward = 0 211 | 212 | for j in range(max_steps): 213 | if render: 214 | env.render() 215 | # Add exploration noise 216 | if frame_idx > explore_steps: 217 | a = self.get_action(s, noise_scale) 218 | else: 219 | a = self.sample_action() 220 | frame_idx += 1 221 | 222 | s_, r, done, info = env.step(a) 223 | 224 | self.store_transition(s, a, r, s_, done) 225 | if len(self.buffer) >= self.replay_buffer_size: 226 | self.update(batch_size, gamma) 227 | noise_scale *= noise_scale_decay 228 | s = s_ 229 | ep_reward += r 230 | 231 | if done: 232 | break 233 | 234 | print( 235 | 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( 236 | i, train_episodes, ep_reward, 237 | time.time() - t0 238 | ) 239 | ) 240 | 241 | reward_buffer.append(ep_reward) 242 | if plot_func is not None: 243 | plot_func(reward_buffer) 244 | if i and not i % save_interval: 245 | self.save_ckpt(env_name=env.spec.id) 246 | plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) 247 | 248 | self.save_ckpt(env_name=env.spec.id) 249 | plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id) 250 | 251 | # test 252 | elif mode == 'test': 253 | self.load_ckpt(env_name=env.spec.id) 254 | print('Testing... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) 255 | reward_buffer = [] 256 | for eps in range(1, test_episodes+1): 257 | ep_rs_sum = 0 258 | s = env.reset() 259 | for step in range(max_steps): 260 | if render: 261 | env.render() 262 | action = self.get_action_greedy(s) 263 | s, reward, done, info = env.step(action) 264 | ep_rs_sum += reward 265 | if done: 266 | break 267 | 268 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( 269 | eps, test_episodes, ep_rs_sum, time.time() - t0) 270 | ) 271 | reward_buffer.append(ep_rs_sum) 272 | if plot_func: 273 | plot_func(reward_buffer) 274 | else: 275 | print('unknown mode type') -------------------------------------------------------------------------------- /rlzoo/algorithms/ddpg/run_ddpg.py: -------------------------------------------------------------------------------- 1 | from rlzoo.common.utils import make_env, set_seed 2 | from rlzoo.algorithms.ddpg.ddpg import DDPG 3 | from rlzoo.common.policy_networks import * 4 | from rlzoo.common.value_networks import * 5 | import gym 6 | 7 | """ load environment """ 8 | env = gym.make('Pendulum-v0').unwrapped 9 | 10 | obs_space = env.observation_space 11 | act_space = env.action_space 12 | 13 | # reproducible 14 | seed = 2 15 | set_seed(seed, env) 16 | 17 | """ build networks for the algorithm """ 18 | name = 'DDPG' 19 | num_hidden_layer = 2 # number of hidden layers for the networks 20 | hidden_dim = 64 # dimension of hidden layers for the networks 21 | 22 | actor = DeterministicPolicyNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer) 23 | critic = QNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer) 24 | 25 | actor_target = DeterministicPolicyNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer, trainable=False) 26 | 27 | critic_target = QNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer, trainable=False) 28 | 29 | net_list = [critic, critic_target, actor, actor_target] 30 | 31 | """ create model """ 32 | actor_lr = 1e-3 33 | critic_lr = 2e-3 34 | optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] 35 | replay_buffer_size = 10000 36 | model = DDPG(net_list, optimizers_list, replay_buffer_size) 37 | """ 38 | full list of arguments for the algorithm 39 | ---------------------------------------- 40 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 41 | optimizers_list: a list of optimizers for all networks and differentiable variables 42 | replay_buffer_size: the size of buffer for storing explored samples 43 | tau: soft update factor 44 | """ 45 | 46 | model.learn(env, train_episodes=100, max_steps=200, save_interval=10, 47 | mode='train', render=False, batch_size=32, gamma=0.9, noise_scale=1., noise_scale_decay=0.995) 48 | """ 49 | full list of parameters for training 50 | --------------------------------------- 51 | env: learning environment 52 | train_episodes: total number of episodes for training 53 | test_episodes: total number of episodes for testing 54 | max_steps: maximum number of steps for one episode 55 | save_interval: time steps for saving 56 | explore_steps: for random action sampling in the beginning of training 57 | mode: train or test mode 58 | render: render each step 59 | batch_size: update batch size 60 | gamma: reward decay factor 61 | noise_scale: range of action noise for exploration 62 | noise_scale_decay: noise scale decay factor 63 | """ 64 | 65 | model.learn(env, test_episodes=10, 
max_steps=200, mode='test', render=True) 66 | 67 | -------------------------------------------------------------------------------- /rlzoo/algorithms/dppo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/dppo/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/dppo/dppo.py: -------------------------------------------------------------------------------- 1 | from rlzoo.algorithms.dppo_penalty.dppo_penalty import DPPO_PENALTY 2 | from rlzoo.algorithms.dppo_clip.dppo_clip import DPPO_CLIP 3 | 4 | 5 | def DPPO(**alg_params): 6 | method = alg_params['method'] 7 | if method == 'penalty': 8 | del alg_params['epsilon'] 9 | algo = DPPO_PENALTY 10 | elif method == 'clip': 11 | del alg_params['kl_target'] 12 | del alg_params['lam'] 13 | algo = DPPO_CLIP 14 | else: 15 | raise ValueError('Method input error. Method can only be penalty or clip') 16 | del alg_params['method'] 17 | return algo(**alg_params) 18 | -------------------------------------------------------------------------------- /rlzoo/algorithms/dppo_clip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/dppo_clip/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/dppo_clip/run_dppo_clip.py: -------------------------------------------------------------------------------- 1 | from rlzoo.common.utils import set_seed 2 | from rlzoo.algorithms.dppo_clip.dppo_clip import DPPO_CLIP 3 | from rlzoo.common.policy_networks import * 4 | from rlzoo.common.value_networks import * 5 | import gym 6 | 7 | n_workers = 4 8 | """ load environment """ 9 | env = [gym.make('Pendulum-v0').unwrapped for i in range(n_workers)] 10 | 11 | # reproducible 12 | seed = 2 13 | set_seed(seed) 14 | 15 | """ build networks for the algorithm """ 16 | name = 'DPPO_CLIP' 17 | hidden_dim = 64 18 | num_hidden_layer = 2 19 | critic = ValueNetwork(env[0].observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') 20 | 21 | actor = StochasticPolicyNetwork(env[0].observation_space, env[0].action_space, 22 | [hidden_dim] * num_hidden_layer, 23 | trainable=True, 24 | name=name + '_policy') 25 | net_list = critic, actor 26 | 27 | """ create model """ 28 | actor_lr = 1e-4 29 | critic_lr = 2e-4 30 | optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] 31 | model = DPPO_CLIP(net_list, optimizers_list) 32 | """ 33 | full list of arguments for the algorithm 34 | ---------------------------------------- 35 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 36 | optimizers_list: a list of optimizers for all networks and differentiable variables 37 | epsilon: clip parameter 38 | """ 39 | 40 | model.learn(env, train_episodes=1000, max_steps=200, save_interval=50, gamma=0.9, 41 | mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10, n_workers=n_workers) 42 | 43 | """ 44 | full list of parameters for training 45 | --------------------------------------- 46 | env: learning environment 47 | train_episodes: total number of episodes for training 48 | test_episodes: total number of episodes for testing 49 | max_steps: maximum number of steps for one episode 
50 | save_interval: time steps for saving 51 | gamma: reward discount factor 52 | mode: train or test 53 | batch_size: update batch size 54 | a_update_steps: actor update iteration steps 55 | c_update_steps: critic update iteration steps 56 | n_workers: number of workers 57 | :return: None 58 | """ 59 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) 60 | -------------------------------------------------------------------------------- /rlzoo/algorithms/dppo_clip_distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/dppo_clip_distributed/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/dppo_clip_distributed/dppo_clip.py: -------------------------------------------------------------------------------- 1 | from rlzoo.common.policy_networks import StochasticPolicyNetwork 2 | from rlzoo.common.value_networks import ValueNetwork 3 | from rlzoo.common.utils import * 4 | import tensorflow as tf 5 | import numpy as np 6 | import copy 7 | import pickle 8 | 9 | 10 | def write_log(text: str): 11 | pass 12 | # print('infer server: '+text) 13 | # with open('infer_server_log.txt', 'a') as f: 14 | # f.write(str(text) + '\n') 15 | 16 | 17 | EPS = 1e-8 18 | 19 | 20 | class RLAlgorithm: 21 | def __init__(self): 22 | self.state_buffer = [] # shape: (None, [n_env], [state_shape]) 23 | self.action_buffer = [] 24 | self.reward_buffer = [] 25 | self.done_buffer = [] 26 | self.next_state_buffer = [] 27 | self.logp_buffer = [] 28 | self.all_buffer = self.state_buffer, self.action_buffer, self.reward_buffer, self.done_buffer, \ 29 | self.next_state_buffer, self.logp_buffer 30 | self.traj_list = [] 31 | self.gamma = 0.9 32 | self.name = 'NotNamed' 33 | 34 | @property 35 | def all_weights(self): 36 | raise NotImplementedError 37 | 38 | def update_model(self, params): 39 | raise NotImplementedError 40 | 41 | def _get_value(self, batch_state): 42 | """ 43 | return: value: tf.Tensor 44 | """ 45 | raise NotImplementedError 46 | 47 | def _get_action(self, batch_state): 48 | """ 49 | return: action: tf.Tensor, log_p: tf.Tensor 50 | """ 51 | raise NotImplementedError 52 | 53 | @property 54 | def logp_shape(self): 55 | raise NotImplementedError 56 | 57 | def save_ckpt(self, env_name): 58 | """ 59 | save trained weights 60 | 61 | :return: None 62 | """ 63 | raise NotImplementedError 64 | 65 | def plot_save_log(self, running_reward, env_name): 66 | plot_save_log(running_reward, algorithm_name=self.name, env_name=env_name) 67 | 68 | def collect_data(self, s, a, r, d, s_, log_p, batch_data=False): 69 | if not batch_data: 70 | s, a, r, d, s_, log_p = [s], [a], [r], [d], [s_], [log_p] 71 | for i, data in enumerate([s, a, r, d, s_, log_p]): 72 | self.all_buffer[i].append(data) 73 | 74 | def get_value(self, state, batch_data=False): 75 | if not batch_data: 76 | state = [state] 77 | value = self._get_value(np.array(state)) 78 | value_shape = np.shape(value) 79 | value = tf.reshape(value, value_shape[:-1]) 80 | return value 81 | 82 | def get_action(self, state, batch_data=False): 83 | if not batch_data: 84 | state = [state] 85 | 86 | state = np.array(state) 87 | action, log_p = self._get_action(state) 88 | action, log_p = action.numpy(), log_p.numpy() 89 | action_shape = np.shape(action) 90 | # If the last dimension is 1: for batch data, squeeze it away unless the shape has only one axis; 91 | # for non-batch data, squeeze it only when the shape has a single axis. 92 | if action_shape[-1] == 1 
and batch_data ^ (len(action_shape) == 1): 93 | # ((batch_data and not len(action_shape) == 1) or (not batch_data and len(action_shape) == 1)): 94 | action = np.reshape(action, action_shape[:-1]) # squeeze the trailing axis 95 | log_p = np.reshape(log_p, log_p.shape[:-1]) 96 | return action, log_p 97 | 98 | # def _cal_discounted_r(self, state_list, reward_list, done_list, batch_data=False): 99 | # discounted_r = [] 100 | # for r in reward_list[::-1]: 101 | # v_s_ = r + 0.9 * v_s_ 102 | # discounted_r.append(v_s_) 103 | 104 | def _cal_discounted_r(self, next_state_list, reward_list, done_list, batch_data=False): 105 | discounted_r = np.zeros_like(reward_list) # reward_buffer shape: [-1, n_env] 106 | # done_list = np.array(done_list, dtype=np.int) 107 | done_list = np.array(done_list) 108 | v_s_ = self.get_value(next_state_list[-1], batch_data) * (1 - done_list[-1]) 109 | for i in range(len(reward_list) - 1, -1, -1): 110 | # discounted_r[i] = v_s_ = reward_list[i] + self.gamma * v_s_ 111 | discounted_r[i] = v_s_ = reward_list[i] + (1 - done_list[i]) * self.gamma * v_s_ 112 | return discounted_r 113 | 114 | def _cal_adv(self, state_list, reward_list, done_list, next_state_list, batch_data=False): 115 | dc_r = self._cal_discounted_r(next_state_list, reward_list, done_list, batch_data) 116 | # dc_r = np.array( 117 | # [[6.5132155], [6.125795], [5.6953278], [5.217031], [4.68559], [4.0951], [3.439], [2.71], [1.9], [1.]]) 118 | if batch_data: 119 | s_shape = np.shape(self.state_buffer) # state_buffer shape: [-1, n_env, *obs_shape] 120 | state_list = np.reshape(self.state_buffer, [-1, *s_shape[2:]]) 121 | v = self.get_value(state_list, batch_data).numpy() 122 | v = v.reshape(*s_shape[:2]) 123 | else: 124 | v = self.get_value(state_list, batch_data).numpy() 125 | 126 | dc_r = np.array(dc_r, dtype=np.float32) 127 | advs = dc_r - v 128 | # advs = (advs - np.mean(advs)) / (np.std(advs) + 1e-8) # norm all env data adv at the same time 129 | return advs 130 | 131 | def _get_traj(self): 132 | traj_list = [] 133 | for element in [ 134 | self.state_buffer, self.action_buffer, self.reward_buffer, self.done_buffer, self.next_state_buffer, 135 | self._cal_adv(self.state_buffer, self.reward_buffer, self.done_buffer, self.next_state_buffer, True), 136 | self.logp_buffer]: 137 | axes = list(range(len(np.shape(element)))) 138 | axes[0], axes[1] = 1, 0 139 | result = np.transpose(element, axes) 140 | # print(result) 141 | traj_list.append(result) 142 | traj_list = list(zip(*traj_list)) # 143 | return traj_list 144 | 145 | def update_traj_list(self): 146 | self.traj_list.extend(self._get_traj()) 147 | for buffer in self.all_buffer: 148 | buffer.clear() 149 | 150 | 151 | class DPPO_CLIP(RLAlgorithm): 152 | def __init__(self, net_builder, opt_builder, n_step=100, gamma=0.9, epsilon=0.2): 153 | super().__init__() 154 | self.critic, self.actor = None, None 155 | self.net_builder = net_builder 156 | self.gamma = gamma 157 | self.n_step = n_step 158 | self._logp_shape = None 159 | self.epsilon = epsilon 160 | self.name = 'DPPO_CLIP' 161 | self.acter_optimizer, self.critic_optimizer = opt_builder() 162 | 163 | def init_components(self): # todo init process should be placed 164 | networks = self.net_builder() 165 | assert len(networks) == 2 166 | self.critic, self.actor = networks 167 | assert isinstance(self.critic, ValueNetwork) 168 | assert isinstance(self.actor, StochasticPolicyNetwork) 169 | 170 | @property 171 | def all_weights(self): 172 | return self.critic.trainable_weights + self.actor.trainable_weights 173 | 174 | # api 175 | def 
_get_action(self, state): 176 | action = self.actor(state) 177 | log_p = self.actor.policy_dist.logp(action) 178 | return action, log_p 179 | 180 | def _get_value(self, state): 181 | return self.critic(state) 182 | 183 | def save_ckpt(self, env_name): 184 | """ 185 | save trained weights 186 | 187 | :return: None 188 | """ 189 | save_model(self.actor, 'actor', self.name, env_name) 190 | save_model(self.critic, 'critic', self.name, env_name) 191 | 192 | def load_ckpt(self, env_name): 193 | """ 194 | load trained weights 195 | 196 | :return: None 197 | """ 198 | load_model(self.actor, 'actor', self.name, env_name) 199 | load_model(self.critic, 'critic', self.name, env_name) 200 | 201 | # api 202 | def update_model(self, params): 203 | for i, j in zip(self.all_weights, params): 204 | i.assign(j) 205 | for buffer in self.all_buffer: 206 | buffer.clear() 207 | 208 | def a_train(self, s, a, adv, oldpi_logp): 209 | oldpi_prob = tf.exp(oldpi_logp) 210 | with tf.GradientTape() as tape: 211 | _ = self.actor(s) 212 | pi_prob = tf.exp(self.actor.policy_dist.logp(a)) 213 | ratio = pi_prob / (oldpi_prob + EPS) 214 | 215 | surr = ratio * adv 216 | aloss = -tf.reduce_mean( 217 | tf.minimum(surr, tf.clip_by_value(ratio, 1. - self.epsilon, 1. + self.epsilon) * adv)) 218 | a_gard = tape.gradient(aloss, self.actor.trainable_weights) 219 | return a_gard 220 | 221 | def c_train(self, dc_r, s): 222 | dc_r = np.array(dc_r, dtype=np.float32) 223 | with tf.GradientTape() as tape: 224 | v = self.critic(s) 225 | advantage = dc_r - v 226 | closs = tf.reduce_mean(tf.square(advantage)) 227 | c_grad = tape.gradient(closs, self.critic.trainable_weights) 228 | return c_grad 229 | 230 | def train(self, traj_list, dis_agent=None): 231 | for traj in traj_list: 232 | state_list, action_list, reward_list, done_list, next_state_list, adv_list, logp_list = traj 233 | for _ in range(10): 234 | a_grad = self.a_train(state_list, action_list, adv_list, logp_list) 235 | if dis_agent: 236 | a_grad = [dis_agent.role_all_reduce(grad) for grad in a_grad] 237 | self.acter_optimizer.apply_gradients(zip(a_grad, self.actor.trainable_weights)) 238 | 239 | dc_r = self._cal_discounted_r(next_state_list, reward_list, done_list) 240 | for _ in range(10): 241 | c_grad = self.c_train(dc_r, state_list) 242 | if dis_agent: 243 | c_grad = [dis_agent.role_all_reduce(grad) for grad in c_grad] 244 | self.critic_optimizer.apply_gradients(zip(c_grad, self.critic.trainable_weights)) 245 | 246 | 247 | if __name__ == '__main__': 248 | from rlzoo.distributed.training_components import net_builder, env_maker, opt_builder 249 | from rlzoo.common.utils import set_seed 250 | 251 | env = env_maker() 252 | # set_seed(1, env) 253 | 254 | agent = DPPO_CLIP(net_builder, opt_builder) 255 | agent.init_components() 256 | 257 | running_reward = [] 258 | curr_step, max_step, traj_len = 0, 500 * 200, 200 259 | s = env.reset() 260 | d = False 261 | cnt = 0 262 | while curr_step < max_step: 263 | for _ in range(traj_len): 264 | curr_step += 1 265 | a, logp = agent.get_action(s) 266 | s_, r, d, _ = env.step(a) 267 | agent.collect_data(s, a, r, d, s_, logp) 268 | if d: 269 | s = env.reset() 270 | else: 271 | s = s_ 272 | agent.update_traj_list() 273 | agent.train(agent.traj_list) 274 | avg_eps_reward = min(sum(agent.traj_list[0][2]) / (sum(agent.traj_list[0][3] + 1e-10)), traj_len) 275 | agent.traj_list.clear() 276 | running_reward.append(avg_eps_reward) 277 | cnt += 1 278 | print(cnt, curr_step, avg_eps_reward) 279 | agent.plot_save_log(running_reward, env.spec.id) 280 | 
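# Editor's note (not part of the original file): when `train()` above is given a `dis_agent`, each
# worker first computes gradients on its own trajectories, then `dis_agent.role_all_reduce(grad)`
# combines (e.g. averages) every gradient tensor across workers before `apply_gradients`, so all
# replicas apply the same synchronous update -- data-parallel PPO. Run without a `dis_agent`, as in
# the `__main__` block above, the same code reduces to single-process training.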
-------------------------------------------------------------------------------- /rlzoo/algorithms/dppo_penalty/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/dppo_penalty/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/dppo_penalty/run_dppo_penalty.py: -------------------------------------------------------------------------------- 1 | from rlzoo.common.utils import set_seed 2 | from rlzoo.algorithms.dppo_penalty.dppo_penalty import DPPO_PENALTY 3 | from rlzoo.common.policy_networks import * 4 | from rlzoo.common.value_networks import * 5 | import gym 6 | 7 | 8 | n_workers = 4 9 | """ load environment """ 10 | env = [gym.make('Pendulum-v0').unwrapped for i in range(n_workers)] 11 | 12 | # reproducible 13 | seed = 2 14 | set_seed(seed) 15 | 16 | 17 | """ build networks for the algorithm """ 18 | name = 'DPPO_PENALTY' 19 | hidden_dim = 64 20 | num_hidden_layer = 2 21 | critic = ValueNetwork(env[0].observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') 22 | 23 | actor = StochasticPolicyNetwork(env[0].observation_space, env[0].action_space, 24 | [hidden_dim] * num_hidden_layer, trainable=True, 25 | name=name + '_policy') 26 | net_list = critic, actor 27 | 28 | """ create model """ 29 | actor_lr = 1e-4 30 | critic_lr = 2e-4 31 | optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] 32 | model = DPPO_PENALTY(net_list, optimizers_list) 33 | """ 34 | full list of arguments for the algorithm 35 | ---------------------------------------- 36 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 37 | optimizers_list: a list of optimizers for all networks and differentiable variables 38 | kl_target: controls bounds of policy update and adaptive lambda 39 | lam: KL-regularization coefficient 40 | """ 41 | 42 | model.learn(env, train_episodes=1000, max_steps=200, save_interval=50, gamma=0.9, 43 | mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10, n_workers=4) 44 | 45 | """ 46 | full list of parameters for training 47 | --------------------------------------- 48 | env: learning environment 49 | train_episodes: total number of episodes for training 50 | test_episodes: total number of episodes for testing 51 | max_steps: maximum number of steps for one episode 52 | save_interval: time steps for saving 53 | gamma: reward discount factor 54 | mode: train or test 55 | batch_size: update batch size 56 | a_update_steps: actor update iteration steps 57 | c_update_steps: critic update iteration steps 58 | n_workers: number of workers 59 | :return: None 60 | """ 61 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) 62 | -------------------------------------------------------------------------------- /rlzoo/algorithms/dqn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/dqn/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/dqn/default.py: -------------------------------------------------------------------------------- 1 | from gym.spaces import Discrete 2 | 3 | from rlzoo.common.utils import set_seed 4 | from 
rlzoo.common.value_networks import * 5 | 6 | """ 7 | full list of algorithm parameters (alg_params) 8 | ----------------------------------------------- 9 | ----------------------------------------------- 10 | 11 | full list of learning parameters (learn_params) 12 | ----------------------------------------------- 13 | double_q (bool): if True double DQN will be used 14 | dueling (bool): if True dueling value estimation will be used 15 | exploration_rate (float): fraction of entire training period over 16 | which the exploration rate is annealed 17 | exploration_final_eps (float): final value of random action probability 18 | batch_size (int): size of a batched sampled from replay buffer for training 19 | train_freq (int): update the model every `train_freq` steps 20 | learning_starts (int): how many steps of the model to collect transitions 21 | for before learning starts 22 | target_network_update_freq (int): update the target network every 23 | `target_network_update_freq` steps 24 | buffer_size (int): size of the replay buffer 25 | prioritized_replay (bool): if True prioritized replay buffer will be used. 26 | prioritized_alpha (float): alpha parameter for prioritized replay 27 | prioritized_beta0 (float): beta parameter for prioritized replay 28 | mode (str): train or test 29 | ----------------------------------------------- 30 | """ 31 | 32 | 33 | def atari(env, default_seed=False, **kwargs): 34 | if default_seed: 35 | seed = 2 36 | set_seed(seed, env) # reproducible 37 | 38 | assert isinstance(env.action_space, Discrete) 39 | 40 | alg_params = dict( 41 | dueling=True, 42 | double_q=True, 43 | buffer_size=1000, 44 | prioritized_replay=True, 45 | prioritized_alpha=0.6, 46 | prioritized_beta0=0.4, 47 | ) 48 | alg_params.update(kwargs) 49 | if alg_params.get('net_list') is None: 50 | alg_params['net_list'] = [QNetwork(env.observation_space, env.action_space, [64], 51 | state_only=True, dueling=alg_params['dueling'])] 52 | 53 | if alg_params.get('optimizers_list') is None: 54 | alg_params['optimizers_list'] = tf.optimizers.Adam(1e-4, epsilon=1e-5, clipnorm=10), 55 | 56 | learn_params = dict( 57 | train_episodes=int(1e5), 58 | test_episodes=10, 59 | max_steps=200, 60 | save_interval=1e4, 61 | batch_size=32, 62 | exploration_rate=0.1, 63 | exploration_final_eps=0.01, 64 | train_freq=4, 65 | learning_starts=10000, 66 | target_network_update_freq=1000, 67 | gamma=0.99, 68 | ) 69 | 70 | return alg_params, learn_params 71 | 72 | 73 | def classic_control(env, default_seed=False, **kwargs): 74 | if default_seed: 75 | seed = 2 76 | set_seed(seed, env) # reproducible 77 | 78 | assert isinstance(env.action_space, Discrete) 79 | 80 | alg_params = dict( 81 | dueling=True, 82 | double_q=True, 83 | buffer_size=1000, 84 | prioritized_replay=False, 85 | prioritized_alpha=0.6, 86 | prioritized_beta0=0.4, 87 | ) 88 | alg_params.update(kwargs) 89 | if alg_params.get('net_list') is None: 90 | alg_params['net_list'] = [QNetwork(env.observation_space, env.action_space, [64], activation=tf.nn.tanh, 91 | state_only=True, dueling=alg_params['dueling'])] 92 | 93 | if alg_params.get('optimizers_list') is None: 94 | alg_params['optimizers_list'] = tf.optimizers.Adam(5e-3, epsilon=1e-5), 95 | 96 | learn_params = dict( 97 | train_episodes=int(1e3), 98 | test_episodes=10, 99 | max_steps=200, 100 | save_interval=1e3, 101 | batch_size=32, 102 | exploration_rate=0.2, 103 | exploration_final_eps=0.01, 104 | train_freq=4, 105 | learning_starts=200, 106 | target_network_update_freq=50, 107 | gamma=0.99, 108 | ) 109 | 110 | 
return alg_params, learn_params 111 | 112 | 113 | # class CNNQNet(tl.models.Model): 114 | # def __init__(self, in_dim, act_dim, dueling): 115 | # super().__init__() 116 | # self._state_shape = in_dim 117 | # self._action_shape = act_dim, 118 | # self.dueling = dueling 119 | # with tf.name_scope('DQN'): 120 | # with tf.name_scope('CNN'): 121 | # self.cnn = basic_nets.CNNModel(in_dim) 122 | # mlp_in_shape = self.cnn.outputs[0].shape[0] 123 | # with tf.name_scope('QValue'): 124 | # hidden_dim = 256 125 | # self.preq = tl.layers.Dense( 126 | # hidden_dim, tf.nn.relu, 127 | # tf.initializers.Orthogonal(1.0), 128 | # in_channels=mlp_in_shape 129 | # ) 130 | # self.qout = tl.layers.Dense( 131 | # act_dim, None, 132 | # tf.initializers.Orthogonal(1.0), 133 | # in_channels=hidden_dim 134 | # ) 135 | # if dueling: 136 | # with tf.name_scope('Value'): 137 | # hidden_dim = 256 138 | # self.prev = tl.layers.Dense( 139 | # hidden_dim, tf.nn.relu, 140 | # tf.initializers.Orthogonal(1.0), 141 | # in_channels=mlp_in_shape 142 | # ) 143 | # self.vout = tl.layers.Dense( 144 | # 1, None, 145 | # tf.initializers.Orthogonal(1.0), 146 | # in_channels=hidden_dim 147 | # ) 148 | # 149 | # def forward(self, obv): 150 | # obv = tf.cast(obv, tf.float32) / 255.0 151 | # mlp_in = tl.layers.flatten_reshape(self.cnn(obv)) 152 | # q_out = self.qout(self.preq(mlp_in)) 153 | # if self.dueling: 154 | # v_out = self.vout(self.prev(mlp_in)) 155 | # q_out = v_out + q_out - tf.reduce_mean(q_out, 1, True) 156 | # return q_out 157 | # 158 | # @property 159 | # def state_shape(self): 160 | # return copy.deepcopy(self._state_shape) 161 | # 162 | # @property 163 | # def action_shape(self): 164 | # return copy.deepcopy(self._action_shape) 165 | # 166 | # 167 | # class MLPQNet(tl.models.Model): 168 | # def __init__(self, in_dim, act_dim, dueling): 169 | # super().__init__() 170 | # self._state_shape = in_dim, 171 | # self._action_shape = act_dim, 172 | # self.dueling = dueling 173 | # hidden_dim = 64 174 | # with tf.name_scope('DQN'): 175 | # with tf.name_scope('MLP'): 176 | # self.mlp = tl.layers.Dense( 177 | # hidden_dim, tf.nn.tanh, 178 | # tf.initializers.Orthogonal(1.0), 179 | # in_channels=in_dim 180 | # ) 181 | # with tf.name_scope('QValue'): 182 | # self.qmlp = tl.layers.Dense( 183 | # act_dim, None, 184 | # tf.initializers.Orthogonal(1.0), 185 | # in_channels=hidden_dim 186 | # ) 187 | # if dueling: 188 | # with tf.name_scope('Value'): 189 | # self.vmlp = tl.layers.Dense( 190 | # 1, None, 191 | # tf.initializers.Orthogonal(1.0), 192 | # in_channels=hidden_dim 193 | # ) 194 | # 195 | # def forward(self, obv): 196 | # obv = tf.cast(obv, tf.float32) 197 | # latent = self.mlp(obv) 198 | # q_out = self.qmlp(latent) 199 | # if self.dueling: 200 | # v_out = self.vmlp(latent) 201 | # q_out = v_out + q_out - tf.reduce_mean(q_out, 1, True) 202 | # return q_out 203 | # 204 | # @property 205 | # def state_shape(self): 206 | # return copy.deepcopy(self._state_shape) 207 | # 208 | # @property 209 | # def action_shape(self): 210 | # return copy.deepcopy(self._action_shape) 211 | -------------------------------------------------------------------------------- /rlzoo/algorithms/dqn/dqn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Deep Q Network 3 | """ 4 | import random 5 | from copy import deepcopy 6 | 7 | from rlzoo.common.utils import * 8 | from rlzoo.common.buffer import ReplayBuffer, PrioritizedReplayBuffer 9 | from rlzoo.common.value_networks import * 10 | 11 | 12 | class DQN(object): 
13 | """ 14 | Papers: 15 | 16 | Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep 17 | reinforcement learning[J]. Nature, 2015, 518(7540): 529. 18 | 19 | Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining Improvements 20 | in Deep Reinforcement Learning[J]. 2017. 21 | """ 22 | 23 | def __init__(self, net_list, optimizers_list, double_q, dueling, buffer_size, 24 | prioritized_replay, prioritized_alpha, prioritized_beta0, ): 25 | """ 26 | Parameters: 27 | ---------- 28 | :param net_list (list): a list of networks (value and policy) used in the algorithm, from common functions or customization 29 | :param optimizers_list (list): a list of optimizers for all networks and differentiable variables 30 | :param double_q (bool): if True double DQN will be used 31 | :param dueling (bool): if True dueling value estimation will be used 32 | :param buffer_size (int): size of the replay buffer 33 | :param prioritized_replay (bool): if True prioritized replay buffer will be used. 34 | :param prioritized_alpha (float): alpha parameter for prioritized replay 35 | :param prioritized_beta0 (float): beta parameter for prioritized replay 36 | """ 37 | assert isinstance(net_list[0], QNetwork) 38 | self.name = 'DQN' 39 | if prioritized_replay: 40 | self.buffer = PrioritizedReplayBuffer( 41 | buffer_size, prioritized_alpha, prioritized_beta0) 42 | else: 43 | self.buffer = ReplayBuffer(buffer_size) 44 | 45 | self.network = net_list[0] 46 | self.target_network = deepcopy(net_list[0]) 47 | self.network.train() 48 | self.target_network.infer() 49 | self.optimizer = optimizers_list[0] 50 | self.double_q = double_q 51 | self.prioritized_replay = prioritized_replay 52 | self.dueling = dueling 53 | 54 | def get_action(self, obv, eps=0.2): 55 | out_dim = self.network.action_shape[0] 56 | if random.random() < eps: 57 | return int(random.random() * out_dim) 58 | else: 59 | obv = np.expand_dims(obv, 0).astype('float32') 60 | return self.network(obv).numpy().argmax(1)[0] 61 | 62 | def get_action_greedy(self, obv): 63 | obv = np.expand_dims(obv, 0).astype('float32') 64 | return self.network(obv).numpy().argmax(1)[0] 65 | 66 | def sync(self): 67 | """Copy q network to target q network""" 68 | 69 | for var, var_tar in zip(self.network.trainable_weights, 70 | self.target_network.trainable_weights): 71 | var_tar.assign(var) 72 | 73 | def save_ckpt(self, env_name): 74 | """ 75 | save trained weights 76 | :return: None 77 | """ 78 | save_model(self.network, 'qnet', 'DQN', env_name) 79 | 80 | def load_ckpt(self, env_name): 81 | """ 82 | load trained weights 83 | :return: None 84 | """ 85 | load_model(self.network, 'qnet', 'DQN', env_name) 86 | 87 | # @tf.function 88 | def _td_error(self, transitions, reward_gamma): 89 | b_o, b_a, b_r, b_o_, b_d = transitions 90 | b_d = tf.cast(b_d, tf.float32) 91 | b_a = tf.cast(b_a, tf.int64) 92 | b_r = tf.cast(b_r, tf.float32) 93 | if self.double_q: 94 | b_a_ = tf.one_hot(tf.argmax(self.network(b_o_), 1), self.network.action_shape[0]) 95 | b_q_ = (1 - b_d) * tf.reduce_sum(self.target_network(b_o_) * b_a_, 1) 96 | else: 97 | b_q_ = (1 - b_d) * tf.reduce_max(self.target_network(b_o_), 1) 98 | 99 | b_q = tf.reduce_sum(self.network(b_o) * tf.one_hot(b_a, self.network.action_shape[0]), 1) 100 | return b_q - (b_r + reward_gamma * b_q_) 101 | 102 | def store_transition(self, s, a, r, s_, d): 103 | self.buffer.push(s, a, r, s_, d) 104 | 105 | def update(self, batch_size, gamma): 106 | if self.prioritized_replay: 107 | # sample from prioritized replay buffer 108 | 
*transitions, b_w, idxs = self.buffer.sample(batch_size) 109 | # calculate weighted huber loss 110 | with tf.GradientTape() as tape: 111 | priorities = self._td_error(transitions, gamma) 112 | huber_loss = tf.where(tf.abs(priorities) < 1, 113 | tf.square(priorities) * 0.5, 114 | tf.abs(priorities) - 0.5) 115 | loss = tf.reduce_mean(huber_loss * b_w) 116 | # backpropagate 117 | grad = tape.gradient(loss, self.network.trainable_weights) 118 | self.optimizer.apply_gradients(zip(grad, self.network.trainable_weights)) 119 | # update priorities 120 | priorities = np.clip(np.abs(priorities), 1e-6, None) 121 | self.buffer.update_priorities(idxs, priorities) 122 | else: 123 | # sample from prioritized replay buffer 124 | transitions = self.buffer.sample(batch_size) 125 | # calculate huber loss 126 | with tf.GradientTape() as tape: 127 | td_errors = self._td_error(transitions, gamma) 128 | huber_loss = tf.where(tf.abs(td_errors) < 1, 129 | tf.square(td_errors) * 0.5, 130 | tf.abs(td_errors) - 0.5) 131 | loss = tf.reduce_mean(huber_loss) 132 | # backpropagate 133 | grad = tape.gradient(loss, self.network.trainable_weights) 134 | self.optimizer.apply_gradients(zip(grad, self.network.trainable_weights)) 135 | 136 | def learn( 137 | self, env, mode='train', render=False, 138 | train_episodes=1000, test_episodes=10, max_steps=200, 139 | save_interval=1000, gamma=0.99, 140 | exploration_rate=0.2, exploration_final_eps=0.01, 141 | target_network_update_freq=50, 142 | batch_size=32, train_freq=4, learning_starts=200, 143 | plot_func=None 144 | ): 145 | 146 | """ 147 | :param env: learning environment 148 | :param mode: train or test 149 | :param render: render each step 150 | :param train_episodes: total number of episodes for training 151 | :param test_episodes: total number of episodes for testing 152 | :param max_steps: maximum number of steps for one episode 153 | :param save_interval: time steps for saving 154 | :param gamma: reward decay factor 155 | :param exploration_rate (float): fraction of entire training period over 156 | which the exploration rate is annealed 157 | :param exploration_final_eps (float): final value of random action probability 158 | :param target_network_update_freq (int): update the target network every 159 | `target_network_update_freq` steps 160 | :param batch_size (int): size of a batched sampled from replay buffer for training 161 | :param train_freq (int): update the model every `train_freq` steps 162 | :param learning_starts (int): how many steps of the model to collect transitions 163 | for before learning starts 164 | :param plot_func: additional function for interactive module 165 | 166 | """ 167 | if mode == 'train': 168 | print('Training... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) 169 | reward_buffer = [] 170 | i = 0 171 | for episode in range(1, train_episodes + 1): 172 | o = env.reset() 173 | ep_reward = 0 174 | for step in range(1, max_steps + 1): 175 | i += 1 176 | if render: 177 | env.render() 178 | eps = 1 - (1 - exploration_final_eps) * \ 179 | min(1, i / (exploration_rate * train_episodes * max_steps)) 180 | a = self.get_action(o, eps) 181 | 182 | # execute action and feed to replay buffer 183 | # note that `_` tail in var name means next 184 | o_, r, done, info = env.step(a) 185 | self.store_transition(o, a, r, o_, done) 186 | ep_reward += r 187 | 188 | # update networks 189 | if i >= learning_starts and i % train_freq == 0: 190 | self.update(batch_size, gamma) 191 | 192 | if i % target_network_update_freq == 0: 193 | self.sync() 194 | 195 | # reset current observation 196 | if done: 197 | break 198 | else: 199 | o = o_ 200 | 201 | # saving model 202 | if i % save_interval == 0: 203 | self.save_ckpt(env.spec.id) 204 | print( 205 | 'Time steps so far: {}, episode so far: {}, ' 206 | 'episode reward: {:.4f}, episode length: {}' 207 | .format(i, episode, ep_reward, step) 208 | ) 209 | reward_buffer.append(ep_reward) 210 | if plot_func is not None: 211 | plot_func(reward_buffer) 212 | 213 | elif mode == 'test': 214 | print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) 215 | 216 | self.load_ckpt(env.spec.id) 217 | self.network.infer() 218 | 219 | reward_buffer = [] 220 | for episode in range(1, test_episodes + 1): 221 | o = env.reset() 222 | ep_reward = 0 223 | for step in range(1, max_steps + 1): 224 | if render: 225 | env.render() 226 | a = self.get_action_greedy(o) 227 | 228 | # execute action 229 | # note that `_` tail in var name means next 230 | o_, r, done, info = env.step(a) 231 | ep_reward += r 232 | 233 | if done: 234 | break 235 | else: 236 | o = o_ 237 | 238 | print( 239 | 'episode so far: {}, ' 240 | 'episode reward: {:.4f}, episode length: {}' 241 | .format(episode, ep_reward, step) 242 | ) 243 | reward_buffer.append(ep_reward) 244 | if plot_func is not None: 245 | plot_func(reward_buffer) 246 | 247 | else: 248 | print('unknown mode type') 249 | -------------------------------------------------------------------------------- /rlzoo/algorithms/dqn/run_dqn.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from rlzoo.algorithms.dqn.dqn import DQN 4 | from rlzoo.algorithms.dqn.default import * 5 | from rlzoo.common.value_networks import * 6 | import gym 7 | 8 | """ load environment """ 9 | env = gym.make('CartPole-v0').unwrapped 10 | 11 | obs_space = env.observation_space 12 | act_space = env.action_space 13 | 14 | # reproducible 15 | seed = 2 16 | set_seed(seed, env) 17 | 18 | in_dim = env.observation_space.shape[0] 19 | act_dim = env.action_space.n 20 | """ build networks for the algorithm """ 21 | name = 'DQN' 22 | Q_net = QNetwork(env.observation_space, env.action_space, [64], activation=tf.nn.tanh, 23 | state_only=True, dueling=True) 24 | net_list = [Q_net] 25 | 26 | """ create model """ 27 | optimizer = tf.optimizers.Adam(5e-3, epsilon=1e-5) 28 | optimizers_list = [optimizer] 29 | model = DQN(net_list, optimizers_list, 30 | double_q=True, 31 | dueling=True, 32 | buffer_size=10000, 33 | prioritized_replay=False, 34 | prioritized_alpha=0.6, 35 | prioritized_beta0=0.4) 36 | """ 37 | full list of arguments for the algorithm 38 | ---------------------------------------- 39 | net_list: a list of 
networks (value and policy) used in the algorithm, from common functions or customization 40 | optimizers_list: a list of optimizers for all networks and differentiable variables 41 | replay_buffer_size: the size of buffer for storing explored samples 42 | tau: soft update factor 43 | """ 44 | 45 | model.learn(env, mode='train', render=False, 46 | train_episodes=1000, 47 | test_episodes=10, 48 | max_steps=200, 49 | save_interval=1e3, 50 | batch_size=32, 51 | exploration_rate=0.2, 52 | exploration_final_eps=0.01, 53 | train_freq=4, 54 | learning_starts=200, 55 | target_network_update_freq=50, 56 | gamma=0.99, ) 57 | """ 58 | full list of parameters for training 59 | --------------------------------------- 60 | env: learning environment 61 | train_episodes: total number of episodes for training 62 | test_episodes: total number of episodes for testing 63 | max_steps: maximum number of steps for one episode 64 | save_interval: time steps for saving 65 | explore_steps: for random action sampling in the beginning of training 66 | mode: train or test mode 67 | render: render each step 68 | batch_size: update batch size 69 | gamma: reward decay factor 70 | noise_scale: range of action noise for exploration 71 | noise_scale_decay: noise scale decay factor 72 | """ 73 | 74 | model.learn(env, mode='test', render=True, 75 | test_episodes=10, 76 | batch_size=32, 77 | exploration_rate=0.2, 78 | exploration_final_eps=0.01, 79 | train_freq=4, 80 | learning_starts=200, 81 | target_network_update_freq=50, 82 | gamma=0.99, ) 83 | -------------------------------------------------------------------------------- /rlzoo/algorithms/pg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/pg/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/pg/default.py: -------------------------------------------------------------------------------- 1 | from rlzoo.common.policy_networks import * 2 | from rlzoo.common.utils import set_seed 3 | 4 | """ 5 | full list of algorithm parameters (alg_params) 6 | ----------------------------------------------- 7 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 8 | optimizers_list: a list of optimizers for all networks and differentiable variables 9 | ----------------------------------------------- 10 | 11 | full list of learning parameters (learn_params) 12 | ----------------------------------------------- 13 | train_episodes: total number of episodes for training 14 | test_episodes: total number of episodes for testing 15 | max_steps: maximum number of steps for one episode 16 | save_interval: time steps for saving 17 | mode: train or test 18 | render: render each step 19 | gamma: reward decay 20 | ----------------------------------------------- 21 | """ 22 | 23 | 24 | def atari(env, default_seed=True): 25 | if default_seed: 26 | seed = 2 27 | set_seed(seed, env) # reproducible 28 | 29 | alg_params = dict() 30 | 31 | if alg_params.get('net_list') is None: 32 | num_hidden_layer = 1 # number of hidden layers for the networks 33 | hidden_dim = 32 # dimension of hidden layers for the networks 34 | with tf.name_scope('PG'): 35 | with tf.name_scope('Policy'): 36 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, 37 | num_hidden_layer * [hidden_dim]) 38 | net_list = [policy_net] 39 | 
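        # Note (an assumption based on rlzoo/common/basic_nets.py later in this listing):
        # for pixel observations such as Atari frames the shared input-layer helper
        # builds a CNN encoder whenever the observation shape has more than one
        # dimension, so the same StochasticPolicyNetwork call serves both vector and
        # image inputs.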
alg_params['net_list'] = net_list 40 | 41 | if alg_params.get('optimizers_list') is None: 42 | learning_rate = 0.02 43 | policy_optimizer = tf.optimizers.Adam(learning_rate) 44 | optimizers_list = [policy_optimizer] 45 | alg_params['optimizers_list'] = optimizers_list 46 | 47 | learn_params = dict( 48 | train_episodes=200, 49 | test_episodes=100, 50 | max_steps=200, 51 | save_interval=20, 52 | gamma=0.95 53 | ) 54 | 55 | return alg_params, learn_params 56 | 57 | 58 | def classic_control(env, default_seed=True): 59 | if default_seed: 60 | seed = 2 61 | set_seed(seed, env) # reproducible 62 | 63 | alg_params = dict() 64 | 65 | if alg_params.get('net_list') is None: 66 | num_hidden_layer = 1 # number of hidden layers for the networks 67 | hidden_dim = 32 # dimension of hidden layers for the networks 68 | with tf.name_scope('PG'): 69 | with tf.name_scope('Policy'): 70 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, 71 | num_hidden_layer * [hidden_dim]) 72 | net_list = [policy_net] 73 | alg_params['net_list'] = net_list 74 | 75 | if alg_params.get('optimizers_list') is None: 76 | learning_rate = 0.02 77 | policy_optimizer = tf.optimizers.Adam(learning_rate) 78 | optimizers_list = [policy_optimizer] 79 | alg_params['optimizers_list'] = optimizers_list 80 | 81 | learn_params = dict( 82 | train_episodes=200, 83 | test_episodes=100, 84 | max_steps=200, 85 | save_interval=20, 86 | gamma=0.95 87 | ) 88 | 89 | return alg_params, learn_params 90 | 91 | 92 | def box2d(env, default_seed=True): 93 | if default_seed: 94 | seed = 2 95 | set_seed(seed, env) # reproducible 96 | 97 | alg_params = dict() 98 | 99 | if alg_params.get('net_list') is None: 100 | num_hidden_layer = 1 # number of hidden layers for the networks 101 | hidden_dim = 32 # dimension of hidden layers for the networks 102 | with tf.name_scope('PG'): 103 | with tf.name_scope('Policy'): 104 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, 105 | num_hidden_layer * [hidden_dim]) 106 | net_list = [policy_net] 107 | alg_params['net_list'] = net_list 108 | 109 | if alg_params.get('optimizers_list') is None: 110 | learning_rate = 0.02 111 | policy_optimizer = tf.optimizers.Adam(learning_rate) 112 | optimizers_list = [policy_optimizer] 113 | alg_params['optimizers_list'] = optimizers_list 114 | 115 | learn_params = dict( 116 | train_episodes=200, 117 | test_episodes=100, 118 | max_steps=200, 119 | save_interval=20, 120 | gamma=0.95 121 | ) 122 | 123 | return alg_params, learn_params 124 | 125 | 126 | def mujoco(env, default_seed=True): 127 | if default_seed: 128 | seed = 2 129 | set_seed(seed, env) # reproducible 130 | 131 | alg_params = dict() 132 | 133 | if alg_params.get('net_list') is None: 134 | num_hidden_layer = 1 # number of hidden layers for the networks 135 | hidden_dim = 32 # dimension of hidden layers for the networks 136 | with tf.name_scope('PG'): 137 | with tf.name_scope('Policy'): 138 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, 139 | num_hidden_layer * [hidden_dim]) 140 | net_list = [policy_net] 141 | alg_params['net_list'] = net_list 142 | 143 | if alg_params.get('optimizers_list') is None: 144 | learning_rate = 0.02 145 | policy_optimizer = tf.optimizers.Adam(learning_rate) 146 | optimizers_list = [policy_optimizer] 147 | alg_params['optimizers_list'] = optimizers_list 148 | 149 | learn_params = dict( 150 | train_episodes=200, 151 | test_episodes=100, 152 | max_steps=200, 153 | save_interval=20, 154 | gamma=0.95 155 | ) 156 | 157 | 
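    # The environment families in this file all return identical PG defaults; callers
    # wanting different settings can simply edit the returned dicts, e.g.
    # (illustrative values, not defaults taken from this repo):
    #     alg_params, learn_params = mujoco(env)
    #     learn_params['train_episodes'] = 1000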
return alg_params, learn_params 158 | 159 | 160 | def robotics(env, default_seed=True): 161 | if default_seed: 162 | seed = 2 163 | set_seed(seed, env) # reproducible 164 | 165 | alg_params = dict() 166 | 167 | if alg_params.get('net_list') is None: 168 | num_hidden_layer = 1 # number of hidden layers for the networks 169 | hidden_dim = 32 # dimension of hidden layers for the networks 170 | with tf.name_scope('PG'): 171 | with tf.name_scope('Policy'): 172 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, 173 | num_hidden_layer * [hidden_dim]) 174 | net_list = [policy_net] 175 | alg_params['net_list'] = net_list 176 | 177 | if alg_params.get('optimizers_list') is None: 178 | learning_rate = 0.02 179 | policy_optimizer = tf.optimizers.Adam(learning_rate) 180 | optimizers_list = [policy_optimizer] 181 | alg_params['optimizers_list'] = optimizers_list 182 | 183 | learn_params = dict( 184 | train_episodes=200, 185 | test_episodes=100, 186 | max_steps=200, 187 | save_interval=20, 188 | gamma=0.95 189 | ) 190 | 191 | return alg_params, learn_params 192 | 193 | 194 | def dm_control(env, default_seed=True): 195 | if default_seed: 196 | seed = 2 197 | set_seed(seed, env) # reproducible 198 | 199 | alg_params = dict() 200 | 201 | if alg_params.get('net_list') is None: 202 | num_hidden_layer = 1 # number of hidden layers for the networks 203 | hidden_dim = 32 # dimension of hidden layers for the networks 204 | with tf.name_scope('PG'): 205 | with tf.name_scope('Policy'): 206 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, 207 | num_hidden_layer * [hidden_dim]) 208 | net_list = [policy_net] 209 | alg_params['net_list'] = net_list 210 | 211 | if alg_params.get('optimizers_list') is None: 212 | learning_rate = 0.02 213 | policy_optimizer = tf.optimizers.Adam(learning_rate) 214 | optimizers_list = [policy_optimizer] 215 | alg_params['optimizers_list'] = optimizers_list 216 | 217 | learn_params = dict( 218 | train_episodes=200, 219 | test_episodes=100, 220 | max_steps=200, 221 | save_interval=20, 222 | gamma=0.95 223 | ) 224 | 225 | return alg_params, learn_params 226 | 227 | 228 | def rlbench(env, default_seed=True): 229 | if default_seed: 230 | seed = 2 231 | set_seed(seed, env) # reproducible 232 | 233 | alg_params = dict() 234 | 235 | if alg_params.get('net_list') is None: 236 | num_hidden_layer = 1 # number of hidden layers for the networks 237 | hidden_dim = 32 # dimension of hidden layers for the networks 238 | with tf.name_scope('PG'): 239 | with tf.name_scope('Policy'): 240 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, 241 | num_hidden_layer * [hidden_dim]) 242 | net_list = [policy_net] 243 | alg_params['net_list'] = net_list 244 | 245 | if alg_params.get('optimizers_list') is None: 246 | learning_rate = 0.02 247 | policy_optimizer = tf.optimizers.Adam(learning_rate) 248 | optimizers_list = [policy_optimizer] 249 | alg_params['optimizers_list'] = optimizers_list 250 | 251 | learn_params = dict( 252 | train_episodes=200, 253 | test_episodes=100, 254 | max_steps=200, 255 | save_interval=20, 256 | gamma=0.95 257 | ) 258 | 259 | return alg_params, learn_params 260 | -------------------------------------------------------------------------------- /rlzoo/algorithms/pg/pg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Vanilla Policy Gradient(VPG or REINFORCE) 3 | ----------------------------------------- 4 | The policy gradient algorithm works by updating 
policy parameters via stochastic gradient ascent on policy performance. 5 | It's an on-policy algorithm can be used for environments with either discrete or continuous action spaces. 6 | Here is an example on discrete action space game CartPole-v0. 7 | To apply it on continuous action space, you need to change the last softmax layer and the get_action function. 8 | 9 | Reference 10 | --------- 11 | Cookbook: Barto A G, Sutton R S. Reinforcement Learning: An Introduction[J]. 1998. 12 | MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/ 13 | MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/ 14 | 15 | Prerequisites 16 | -------------- 17 | tensorflow >=2.0.0a0 18 | tensorflow-probability 0.6.0 19 | tensorlayer >=2.0.0 20 | 21 | """ 22 | import time 23 | 24 | from rlzoo.common.utils import * 25 | from rlzoo.common.policy_networks import * 26 | 27 | 28 | ############################### PG #################################### 29 | 30 | 31 | class PG: 32 | """ 33 | PG class 34 | """ 35 | 36 | def __init__(self, net_list, optimizers_list): 37 | """ 38 | :param net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 39 | :param optimizers_list: a list of optimizers for all networks and differentiable variables 40 | 41 | """ 42 | assert len(net_list) == 1 43 | assert len(optimizers_list) == 1 44 | self.name = 'PG' 45 | self.model = net_list[0] 46 | assert isinstance(self.model, StochasticPolicyNetwork) 47 | self.buffer = [] 48 | print('Policy Network', self.model) 49 | self.optimizer = optimizers_list[0] 50 | 51 | def get_action(self, s): 52 | """ 53 | choose action with probabilities. 54 | 55 | :param s: state 56 | 57 | :return: act 58 | """ 59 | return self.model([s])[0].numpy() 60 | 61 | def get_action_greedy(self, s): 62 | """ 63 | choose action with greedy policy 64 | 65 | :param s: state 66 | 67 | :return: act 68 | """ 69 | return self.model([s], greedy=True).numpy()[0] 70 | 71 | def store_transition(self, s, a, r): 72 | """ 73 | store data in memory buffer 74 | 75 | :param s: state 76 | :param a: act 77 | :param r: reward 78 | 79 | :return: 80 | """ 81 | self.buffer.append([s, np.array(a, np.float32), np.array(r, np.float32)]) 82 | 83 | def update(self, gamma): 84 | """ 85 | update policy parameters via stochastic gradient ascent 86 | 87 | :return: None 88 | """ 89 | # discount and normalize episode reward 90 | s, a, r = zip(*self.buffer) 91 | s, a, r = np.array(s), np.array(a), np.array(r).flatten() 92 | discounted_ep_rs_norm = self._discount_and_norm_rewards(r, gamma) 93 | 94 | with tf.GradientTape() as tape: 95 | self.model(s) 96 | neg_log_prob = self.model.policy_dist.neglogp(a) 97 | loss = tf.reduce_mean(neg_log_prob * discounted_ep_rs_norm) # reward guided loss 98 | 99 | grad = tape.gradient(loss, self.model.trainable_weights) 100 | self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights)) 101 | 102 | self.buffer = [] 103 | return discounted_ep_rs_norm 104 | 105 | def _discount_and_norm_rewards(self, reward_list, gamma): 106 | """ 107 | compute discount_and_norm_rewards 108 | 109 | :return: discount_and_norm_rewards 110 | """ 111 | # discount episode rewards 112 | discounted_ep_rs = np.zeros_like(reward_list) 113 | running_add = 0 114 | for t in reversed(range(0, len(reward_list))): 115 | running_add = running_add * gamma + reward_list[t] 116 | discounted_ep_rs[t] = running_add 117 | 118 | # normalize episode rewards 119 | discounted_ep_rs -= 
np.mean(discounted_ep_rs) 120 | std = np.std(discounted_ep_rs) 121 | if std != 0: 122 | discounted_ep_rs /= np.std(discounted_ep_rs) 123 | discounted_ep_rs = discounted_ep_rs[:, np.newaxis] 124 | return discounted_ep_rs 125 | 126 | def save_ckpt(self, env_name): 127 | """ 128 | save trained weights 129 | 130 | :return: None 131 | """ 132 | save_model(self.model, 'model_policy', self.name, env_name) 133 | 134 | def load_ckpt(self, env_name): 135 | """ 136 | load trained weights 137 | 138 | :return: None 139 | """ 140 | load_model(self.model, 'model_policy', self.name, env_name) 141 | 142 | def learn(self, env, train_episodes=200, test_episodes=100, max_steps=200, save_interval=100, 143 | mode='train', render=False, gamma=0.95, plot_func=None): 144 | """ 145 | :param env: learning environment 146 | :param train_episodes: total number of episodes for training 147 | :param test_episodes: total number of episodes for testing 148 | :param max_steps: maximum number of steps for one episode 149 | :param save_interval: time steps for saving 150 | :param mode: train or test 151 | :param render: render each step 152 | :param gamma: reward decay 153 | :param plot_func: additional function for interactive module 154 | :return: None 155 | """ 156 | 157 | if mode == 'train': 158 | print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) 159 | reward_buffer = [] 160 | t0 = time.time() 161 | 162 | for i_episode in range(1, train_episodes + 1): 163 | 164 | observation = env.reset() 165 | 166 | ep_rs_sum = 0 167 | for step in range(max_steps): 168 | if render: 169 | env.render() 170 | action = self.get_action(observation) 171 | observation_, reward, done, info = env.step(action) 172 | self.store_transition(observation, action, reward) 173 | 174 | ep_rs_sum += reward 175 | observation = observation_ 176 | 177 | if done: 178 | break 179 | 180 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( 181 | i_episode, train_episodes, ep_rs_sum, time.time() - t0) 182 | ) 183 | reward_buffer.append(ep_rs_sum) 184 | if plot_func is not None: 185 | plot_func(reward_buffer) 186 | 187 | self.update(gamma) 188 | 189 | if i_episode and i_episode % save_interval == 0: 190 | self.save_ckpt(env_name=env.spec.id) 191 | plot_save_log(reward_buffer, algorithm_name='PG', env_name=env.spec.id) 192 | 193 | self.save_ckpt(env_name=env.spec.id) 194 | plot_save_log(reward_buffer, algorithm_name='PG', env_name=env.spec.id) 195 | 196 | elif mode == 'test': 197 | # test 198 | self.load_ckpt(env_name=env.spec.id) 199 | print('Testing... 
| Algorithm: {} | Environment: {}'.format(self.name, env.spec.id)) 200 | t0 = time.time() 201 | for eps in range(test_episodes): 202 | observation = env.reset() 203 | ep_rs_sum = 0 204 | for step in range(max_steps): 205 | if render: 206 | env.render() 207 | action = self.get_action_greedy(observation) 208 | observation, reward, done, info = env.step(action) 209 | ep_rs_sum += reward 210 | if done: 211 | break 212 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format( 213 | eps, test_episodes, ep_rs_sum, time.time() - t0) 214 | ) 215 | 216 | else: 217 | print('unknown mode type') 218 | -------------------------------------------------------------------------------- /rlzoo/algorithms/pg/run_pg.py: -------------------------------------------------------------------------------- 1 | from rlzoo.algorithms.pg.pg import PG 2 | from rlzoo.common.policy_networks import * 3 | import gym 4 | 5 | """ load environment """ 6 | env = gym.make('CartPole-v0').unwrapped 7 | # env = gym.make('Pendulum-v0').unwrapped 8 | # env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run 9 | obs_space = env.observation_space 10 | act_space = env.action_space 11 | 12 | # reproducible 13 | seed = 2 14 | np.random.seed(seed) 15 | tf.random.set_seed(seed) 16 | env.seed(seed) 17 | 18 | """ build networks for the algorithm """ 19 | name = 'pg' 20 | num_hidden_layer = 1 # number of hidden layers for the networks 21 | hidden_dim = 32 # dimension of hidden layers for the networks 22 | 23 | policy_net = StochasticPolicyNetwork(obs_space, act_space, num_hidden_layer * [hidden_dim]) 24 | net_list = [policy_net] 25 | 26 | """ choose optimizers """ 27 | learning_rate = 0.02 28 | policy_optimizer = tf.optimizers.Adam(learning_rate) 29 | optimizers_list = [policy_optimizer] 30 | 31 | model = PG(net_list, optimizers_list) 32 | """ 33 | full list of arguments for the algorithm 34 | ---------------------------------------- 35 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 36 | optimizers_list: a list of optimizers for all networks and differentiable variables 37 | """ 38 | 39 | model.learn(env, train_episodes=200, max_steps=200, save_interval=20, mode='train', render=False, gamma=0.95) 40 | """ 41 | full list of parameters for training 42 | --------------------------------------- 43 | env: learning environment 44 | train_episodes: total number of episodes for training 45 | test_episodes: total number of episodes for testing 46 | max_steps: maximum number of steps for one episode 47 | save_interval: time steps for saving 48 | mode: train or test 49 | render: render each step 50 | gamma: reward decay 51 | """ 52 | 53 | # test 54 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) 55 | -------------------------------------------------------------------------------- /rlzoo/algorithms/ppo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/ppo/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/ppo/ppo.py: -------------------------------------------------------------------------------- 1 | from rlzoo.algorithms.ppo_penalty.ppo_penalty import PPO_PENALTY 2 | from rlzoo.algorithms.ppo_clip.ppo_clip import PPO_CLIP 3 | 4 | 5 | def PPO(**alg_params): 6 | method = 
alg_params['method'] 7 | if method == 'penalty': 8 | del alg_params['epsilon'] 9 | algo = PPO_PENALTY 10 | elif method == 'clip': 11 | del alg_params['kl_target'] 12 | del alg_params['lam'] 13 | algo = PPO_CLIP 14 | else: 15 | raise ValueError('Method input error. Method can only be penalty or clip') 16 | del alg_params['method'] 17 | return algo(**alg_params) 18 | -------------------------------------------------------------------------------- /rlzoo/algorithms/ppo_clip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/ppo_clip/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/ppo_clip/run_ppo_clip.py: -------------------------------------------------------------------------------- 1 | from rlzoo.common.utils import make_env, set_seed 2 | from rlzoo.algorithms.ppo_clip.ppo_clip import PPO_CLIP 3 | from rlzoo.common.policy_networks import * 4 | from rlzoo.common.value_networks import * 5 | import gym 6 | 7 | 8 | """ load environment """ 9 | env = gym.make('Pendulum-v0').unwrapped 10 | 11 | # reproducible 12 | seed = 1 13 | set_seed(seed, env) 14 | 15 | """ build networks for the algorithm """ 16 | name = 'PPO_CLIP' 17 | hidden_dim = 64 18 | num_hidden_layer = 2 19 | critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') 20 | 21 | actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer, 22 | output_activation=tf.nn.tanh, name=name + '_policy') 23 | net_list = critic, actor 24 | 25 | """ create model """ 26 | actor_lr = 1e-4 27 | critic_lr = 2e-4 28 | optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] 29 | 30 | model = PPO_CLIP(net_list, optimizers_list,) 31 | """ 32 | full list of arguments for the algorithm 33 | ---------------------------------------- 34 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 35 | optimizers_list: a list of optimizers for all networks and differentiable variables 36 | epsilon: clip parameter 37 | """ 38 | 39 | model.learn(env, train_episodes=500, max_steps=200, save_interval=50, gamma=0.9, 40 | mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10) 41 | 42 | """ 43 | full list of parameters for training 44 | --------------------------------------- 45 | env: learning environment 46 | train_episodes: total number of episodes for training 47 | test_episodes: total number of episodes for testing 48 | max_steps: maximum number of steps for one episode 49 | save_interval: time steps for saving 50 | gamma: reward discount factor 51 | mode: train or test 52 | render: render each step 53 | batch_size: UPDATE batch size 54 | a_update_steps: actor update iteration steps 55 | c_update_steps: critic update iteration steps 56 | :return: None 57 | """ 58 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) 59 | 60 | -------------------------------------------------------------------------------- /rlzoo/algorithms/ppo_penalty/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/ppo_penalty/__init__.py -------------------------------------------------------------------------------- 
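# Usage sketch for the PPO dispatcher defined in rlzoo/algorithms/ppo/ppo.py above
# (the concrete values here are illustrative assumptions, not repo defaults): the
# caller passes the union of both variants' arguments and `method` picks the class,
# with the unused arguments deleted before construction.
#     model = PPO(method='clip', net_list=net_list, optimizers_list=optimizers_list,
#                 epsilon=0.2, kl_target=0.01, lam=0.5)
#     # method='clip' keeps `epsilon` and drops `kl_target`/`lam`;
#     # method='penalty' keeps `kl_target`/`lam` and drops `epsilon`.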
/rlzoo/algorithms/ppo_penalty/run_ppo_penalty.py: -------------------------------------------------------------------------------- 1 | from rlzoo.common.utils import make_env, set_seed 2 | from rlzoo.algorithms.ppo_penalty.ppo_penalty import PPO_PENALTY 3 | from rlzoo.common.policy_networks import * 4 | from rlzoo.common.value_networks import * 5 | import gym 6 | 7 | 8 | """ load environment """ 9 | env = gym.make('Pendulum-v0').unwrapped 10 | 11 | # reproducible 12 | seed = 1 13 | set_seed(seed, env) 14 | 15 | """ build networks for the algorithm """ 16 | name = 'PPO_PENALTY' 17 | hidden_dim = 64 18 | num_hidden_layer = 2 19 | critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') 20 | 21 | actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer, 22 | output_activation=tf.nn.tanh, name=name + '_policy') 23 | net_list = critic, actor 24 | 25 | """ create model """ 26 | actor_lr = 1e-4 27 | critic_lr = 2e-4 28 | optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] 29 | 30 | model = PPO_PENALTY(net_list, optimizers_list,) 31 | """ 32 | full list of arguments for the algorithm 33 | ---------------------------------------- 34 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 35 | optimizers_list: a list of optimizers for all networks and differentiable variables 36 | kl_target: controls bounds of policy update and adaptive lambda 37 | lam: KL-regularization coefficient 38 | """ 39 | 40 | model.learn(env, train_episodes=500, max_steps=200, save_interval=50, gamma=0.9, 41 | mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10) 42 | 43 | """ 44 | full list of parameters for training 45 | --------------------------------------- 46 | env: learning environment 47 | train_episodes: total number of episodes for training 48 | test_episodes: total number of episodes for testing 49 | max_steps: maximum number of steps for one episode 50 | save_interval: times teps for saving 51 | gamma: reward discount factor 52 | mode: train or test 53 | render: render each step 54 | batch_size: update batch size 55 | a_update_steps: actor update iteration steps 56 | c_update_steps: critic update iteration steps 57 | :return: None 58 | """ 59 | 60 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) 61 | -------------------------------------------------------------------------------- /rlzoo/algorithms/sac/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/sac/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/sac/run_sac.py: -------------------------------------------------------------------------------- 1 | from rlzoo.algorithms.sac.sac import SAC 2 | from rlzoo.common.policy_networks import * 3 | from rlzoo.common.value_networks import * 4 | import gym 5 | 6 | """ load environment """ 7 | env = gym.make('Pendulum-v0').unwrapped 8 | # env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run 9 | action_shape = env.action_space.shape 10 | state_shape = env.observation_space.shape 11 | # reproducible 12 | seed = 2 13 | np.random.seed(seed) 14 | tf.random.set_seed(seed) 15 | env.seed(seed) 16 | 17 | """ build networks for the algorithm """ 18 
| num_hidden_layer = 2 # number of hidden layers for the networks 19 | hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here 20 | with tf.name_scope('SAC'): 21 | with tf.name_scope('Q_Net1'): 22 | soft_q_net1 = QNetwork(env.observation_space, env.action_space, 23 | hidden_dim_list=num_hidden_layer * [hidden_dim]) 24 | with tf.name_scope('Q_Net2'): 25 | soft_q_net2 = QNetwork(env.observation_space, env.action_space, 26 | hidden_dim_list=num_hidden_layer * [hidden_dim]) 27 | with tf.name_scope('Target_Q_Net1'): 28 | target_soft_q_net1 = QNetwork(env.observation_space, env.action_space, 29 | hidden_dim_list=num_hidden_layer * [hidden_dim]) 30 | with tf.name_scope('Target_Q_Net2'): 31 | target_soft_q_net2 = QNetwork(env.observation_space, env.action_space, 32 | hidden_dim_list=num_hidden_layer * [hidden_dim]) 33 | with tf.name_scope('Policy'): 34 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space, 35 | hidden_dim_list=num_hidden_layer * [hidden_dim], 36 | output_activation=None, 37 | state_conditioned=True) 38 | net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net] 39 | 40 | """ choose optimizers """ 41 | soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha 42 | soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr) 43 | soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr) 44 | policy_optimizer = tf.optimizers.Adam(policy_lr) 45 | alpha_optimizer = tf.optimizers.Adam(alpha_lr) 46 | optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer] 47 | 48 | model = SAC(net_list, optimizers_list) 49 | """ 50 | full list of arguments for the algorithm 51 | ---------------------------------------- 52 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 53 | optimizers_list: a list of optimizers for all networks and differentiable variables 54 | state_dim: dimension of state for the environment 55 | action_dim: dimension of action for the environment 56 | replay_buffer_capacity: the size of buffer for storing explored samples 57 | action_range: value of each action in [-action_range, action_range] 58 | """ 59 | 60 | model.learn(env, train_episodes=100, max_steps=150, batch_size=64, explore_steps=500, \ 61 | update_itr=3, policy_target_update_interval=3, reward_scale=1., save_interval=10, \ 62 | mode='train', AUTO_ENTROPY=True, render=False) 63 | """ 64 | full list of parameters for training 65 | --------------------------------------- 66 | env: learning environment 67 | train_episodes: total number of episodes for training 68 | test_episodes: total number of episodes for testing 69 | max_steps: maximum number of steps for one episode 70 | batch_size: udpate batchsize 71 | explore_steps: for random action sampling in the beginning of training 72 | update_itr: repeated updates for single step 73 | policy_target_update_interval: delayed update for the policy network and target networks 74 | reward_scale: value range of reward 75 | save_interval: timesteps for saving the weights and plotting the results 76 | mode: 'train' or 'test' 77 | AUTO_ENTROPY: automatically udpating variable alpha for entropy 78 | DETERMINISTIC: stochastic action policy if False, otherwise deterministic 79 | render: if true, visualize the environment 80 | """ 81 | # test 82 | model.learn(env, 
test_episodes=10, max_steps=150, mode='test', render=True) 83 | -------------------------------------------------------------------------------- /rlzoo/algorithms/td3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/td3/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/td3/run_td3.py: -------------------------------------------------------------------------------- 1 | from rlzoo.algorithms.td3.td3 import TD3 2 | from rlzoo.common.policy_networks import * 3 | from rlzoo.common.value_networks import * 4 | import gym 5 | 6 | """ load environment """ 7 | env = gym.make('Pendulum-v0').unwrapped 8 | # env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run 9 | action_shape = env.action_space.shape 10 | state_shape = env.observation_space.shape 11 | # reproducible 12 | seed = 2 13 | np.random.seed(seed) 14 | tf.random.set_seed(seed) 15 | env.seed(seed) 16 | 17 | """ build networks for the algorithm """ 18 | num_hidden_layer = 2 # number of hidden layers for the networks 19 | hidden_dim = 64 # dimension of hidden layers for the networks 20 | with tf.name_scope('TD3'): 21 | with tf.name_scope('Q_Net1'): 22 | q_net1 = QNetwork(env.observation_space, env.action_space, 23 | hidden_dim_list=num_hidden_layer * [hidden_dim]) 24 | with tf.name_scope('Q_Net2'): 25 | q_net2 = QNetwork(env.observation_space, env.action_space, 26 | hidden_dim_list=num_hidden_layer * [hidden_dim]) 27 | with tf.name_scope('Target_Q_Net1'): 28 | target_q_net1 = QNetwork(env.observation_space, env.action_space, 29 | hidden_dim_list=num_hidden_layer * [hidden_dim]) 30 | with tf.name_scope('Target_Q_Net2'): 31 | target_q_net2 = QNetwork(env.observation_space, env.action_space, 32 | hidden_dim_list=num_hidden_layer * [hidden_dim]) 33 | with tf.name_scope('Policy'): 34 | policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, 35 | hidden_dim_list=num_hidden_layer * [hidden_dim]) 36 | with tf.name_scope('Target_Policy'): 37 | target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space, 38 | hidden_dim_list=num_hidden_layer * [hidden_dim]) 39 | net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net] 40 | 41 | """ choose optimizers """ 42 | q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network 43 | q_optimizer1 = tf.optimizers.Adam(q_lr) 44 | q_optimizer2 = tf.optimizers.Adam(q_lr) 45 | policy_optimizer = tf.optimizers.Adam(policy_lr) 46 | optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer] 47 | 48 | model = TD3(net_list, optimizers_list) 49 | """ 50 | full list of arguments for the algorithm 51 | ---------------------------------------- 52 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 53 | optimizers_list: a list of optimizers for all networks and differentiable variables 54 | state_dim: dimension of state for the environment 55 | action_dim: dimension of action for the environment 56 | replay_buffer_capacity: the size of buffer for storing explored samples 57 | policy_target_update_interval: delayed interval for updating the target policy 58 | action_range: value of each action in [-action_range, action_range] 59 | """ 60 | 61 | model.learn(env, 
train_episodes=100, max_steps=150, batch_size=64, explore_steps=500, update_itr=3, 62 | reward_scale=1., save_interval=10, explore_noise_scale=1.0, eval_noise_scale=0.5, mode='train', 63 | render=False) 64 | """ 65 | full list of parameters for training 66 | --------------------------------------- 67 | env: learning environment 68 | train_episodes: total number of episodes for training 69 | test_episodes: total number of episodes for testing 70 | max_steps: maximum number of steps for one episode 71 | batch_size: udpate batchsize 72 | explore_steps: for random action sampling in the beginning of training 73 | update_itr: repeated updates for single step 74 | reward_scale: value range of reward 75 | save_interval: timesteps for saving the weights and plotting the results 76 | explore_noise_scale: range of action noise for exploration 77 | eval_noise_scale: range of action noise for evaluation of action value 78 | mode: 'train' or 'test' 79 | render: if true, visualize the environment 80 | 81 | """ 82 | # test 83 | model.learn(env, test_episodes=10, max_steps=150, mode='test', render=True) 84 | -------------------------------------------------------------------------------- /rlzoo/algorithms/trpo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/trpo/__init__.py -------------------------------------------------------------------------------- /rlzoo/algorithms/trpo/run_trpo.py: -------------------------------------------------------------------------------- 1 | from rlzoo.common.utils import set_seed 2 | from rlzoo.algorithms.trpo.trpo import TRPO 3 | from rlzoo.common.policy_networks import * 4 | from rlzoo.common.value_networks import * 5 | import gym 6 | 7 | """ load environment """ 8 | env = gym.make('Pendulum-v0').unwrapped 9 | 10 | # reproducible 11 | seed = 2 12 | set_seed(seed, env) 13 | 14 | """ build networks for the algorithm """ 15 | name = 'TRPO' 16 | hidden_dim = 64 17 | num_hidden_layer = 2 18 | critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') 19 | 20 | actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer, 21 | output_activation=tf.nn.tanh, name=name + '_policy') 22 | net_list = critic, actor 23 | 24 | critic_lr = 1e-3 25 | optimizers_list = [tf.optimizers.Adam(critic_lr)] 26 | 27 | """ create model """ 28 | model = TRPO(net_list, optimizers_list, damping_coeff=0.1, cg_iters=10, delta=0.01) 29 | """ 30 | full list of arguments for the algorithm 31 | ---------------------------------------- 32 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization 33 | optimizers_list: a list of optimizers for all networks and differentiable variables 34 | damping_coeff: Artifact for numerical stability 35 | cg_iters: Number of iterations of conjugate gradient to perform 36 | delta: KL-divergence limit for TRPO update. 
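(how these interact, for reference: each policy update approximately solves H x = g with
`cg_iters` conjugate-gradient iterations, where H is the KL Hessian regularized by
`damping_coeff`, and the resulting step is then scaled and line-searched so that the new
policy's KL divergence from the old one stays within `delta`)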
37 | """ 38 | 39 | model.learn(env, mode='train', render=False, train_episodes=2000, max_steps=200, save_interval=100, 40 | gamma=0.9, batch_size=256, backtrack_iters=10, backtrack_coeff=0.8, train_critic_iters=80) 41 | """ 42 | full list of parameters for training 43 | --------------------------------------- 44 | env: learning environment 45 | train_episodes: total number of episodes for training 46 | test_episodes: total number of episodes for testing 47 | max_steps: maximum number of steps for one episode 48 | save_interval: time steps for saving 49 | gamma: reward discount factor 50 | mode: train or test 51 | render: render each step 52 | batch_size: update batch size 53 | backtrack_iters: Maximum number of steps allowed in the backtracking line search 54 | backtrack_coeff: How far back to step during backtracking line search 55 | train_critic_iters: critic update iteration steps 56 | """ 57 | 58 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True) 59 | -------------------------------------------------------------------------------- /rlzoo/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/common/__init__.py -------------------------------------------------------------------------------- /rlzoo/common/basic_nets.py: -------------------------------------------------------------------------------- 1 | """Basic neural networks""" 2 | import tensorflow as tf 3 | import tensorlayer as tl 4 | from tensorlayer.layers import Dense, Input 5 | from gym import spaces 6 | from collections import OrderedDict 7 | 8 | 9 | def MLP(input_dim, hidden_dim_list, w_init=tf.initializers.Orthogonal(0.2), 10 | activation=tf.nn.relu, *args, **kwargs): 11 | """Multiple fully-connected layers for approximation 12 | 13 | :param input_dim: (int) size of input tensor 14 | :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers 15 | :param w_init: (callable) initialization method for weights 16 | :param activation: (callable) activation function of hidden layers 17 | 18 | Return: 19 | input tensor, output tensor 20 | """ 21 | 22 | l = inputs = Input([None, input_dim]) 23 | for i in range(len(hidden_dim_list)): 24 | l = Dense(n_units=hidden_dim_list[i], act=activation, W_init=w_init)(l) 25 | outputs = l 26 | 27 | return inputs, outputs 28 | 29 | 30 | def MLPModel(input_dim, hidden_dim_list, w_init=tf.initializers.Orthogonal(0.2), 31 | activation=tf.nn.relu, *args, **kwargs): 32 | """Multiple fully-connected layers for approximation 33 | 34 | :param input_dim: (int) size of input tensor 35 | :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers 36 | :param w_init: (callable) initialization method for weights 37 | :param activation: (callable) activation function of hidden layers 38 | 39 | Return: 40 | input tensor, output tensor 41 | """ 42 | l = inputs = Input([None, input_dim], name='Input_Layer') 43 | for i in range(len(hidden_dim_list)): 44 | l = Dense(n_units=hidden_dim_list[i], act=activation, W_init=w_init, name='Hidden_Layer%d' % (i + 1))(l) 45 | outputs = l 46 | 47 | return tl.models.Model(inputs=inputs, outputs=outputs) 48 | 49 | 50 | def CNN(input_shape, conv_kwargs=None): 51 | """Multiple convolutional layers for approximation 52 | Default setting is equal to architecture used in DQN 53 | 54 | :param input_shape: (tuple[int]) (H, W, C) 55 | :param conv_kwargs: (list[param]) list of conv parameters 
for tl.layers.Conv2d 56 | 57 | Return: 58 | input tensor, output tensor 59 | """ 60 | if not conv_kwargs: 61 | in_channels = input_shape[-1] 62 | conv_kwargs = [ 63 | { 64 | 'in_channels': in_channels, 'n_filter': 32, 'act': tf.nn.relu, 65 | 'filter_size': (8, 8), 'strides': (4, 4), 'padding': 'VALID', 66 | 'W_init': tf.initializers.GlorotUniform() 67 | }, 68 | { 69 | 'in_channels': 32, 'n_filter': 64, 'act': tf.nn.relu, 70 | 'filter_size': (4, 4), 'strides': (2, 2), 'padding': 'VALID', 71 | 'W_init': tf.initializers.GlorotUniform() 72 | }, 73 | { 74 | 'in_channels': 64, 'n_filter': 64, 'act': tf.nn.relu, 75 | 'filter_size': (3, 3), 'strides': (1, 1), 'padding': 'VALID', 76 | 'W_init': tf.initializers.GlorotUniform() 77 | } 78 | ] 79 | l = inputs = tl.layers.Input((1,) + input_shape) 80 | 81 | for i, kwargs in enumerate(conv_kwargs): 82 | # kwargs['name'] = kwargs.get('name', 'cnn_layer{}'.format(i + 1)) 83 | l = tl.layers.Conv2d(**kwargs)(l) 84 | outputs = tl.layers.Flatten()(l) 85 | 86 | return inputs, outputs 87 | 88 | 89 | def CNNModel(input_shape, conv_kwargs=None): 90 | """Multiple convolutional layers for approximation 91 | Default setting is equal to architecture used in DQN 92 | 93 | :param input_shape: (tuple[int]) (H, W, C) 94 | :param conv_kwargs: (list[param]) list of conv parameters for tl.layers.Conv2d 95 | 96 | Return: 97 | tl.model.Model 98 | """ 99 | if not conv_kwargs: 100 | in_channels = input_shape[-1] 101 | conv_kwargs = [ 102 | { 103 | 'in_channels': in_channels, 'n_filter': 32, 'act': tf.nn.relu, 104 | 'filter_size': (8, 8), 'strides': (4, 4), 'padding': 'VALID', 105 | 'W_init': tf.initializers.GlorotUniform() 106 | }, 107 | { 108 | 'in_channels': 32, 'n_filter': 64, 'act': tf.nn.relu, 109 | 'filter_size': (4, 4), 'strides': (2, 2), 'padding': 'VALID', 110 | 'W_init': tf.initializers.GlorotUniform() 111 | }, 112 | { 113 | 'in_channels': 64, 'n_filter': 64, 'act': tf.nn.relu, 114 | 'filter_size': (3, 3), 'strides': (1, 1), 'padding': 'VALID', 115 | 'W_init': tf.initializers.GlorotUniform() 116 | } 117 | ] 118 | 119 | ni = tl.layers.Input((1,) + input_shape, name='CNN_Input') 120 | hi = ni 121 | 122 | for i, kwargs in enumerate(conv_kwargs): 123 | kwargs['name'] = kwargs.get('name', 'CNN_Layer{}'.format(i + 1)) 124 | hi = tl.layers.Conv2d(**kwargs)(hi) 125 | no = tl.layers.Flatten(name='Flatten_Layer')(hi) 126 | 127 | return tl.models.Model(inputs=ni, outputs=no) 128 | 129 | 130 | def CreateInputLayer(state_space, conv_kwargs=None): 131 | def CreateSingleInput(single_state_space): 132 | single_state_shape = single_state_space.shape 133 | # build structure 134 | if len(single_state_shape) == 1: 135 | l = inputs = Input((None,) + single_state_shape, name='input_layer') 136 | else: 137 | with tf.name_scope('CNN'): 138 | inputs, l = CNN(single_state_shape, conv_kwargs=conv_kwargs) 139 | return inputs, l, single_state_shape 140 | 141 | if isinstance(state_space, spaces.Dict): 142 | input_dict, layer_dict, shape_dict = OrderedDict(), OrderedDict(), OrderedDict() 143 | for k, v in state_space.spaces.items(): 144 | input_dict[k], layer_dict[k], shape_dict[k] = CreateSingleInput(v) 145 | return input_dict, layer_dict, shape_dict 146 | if isinstance(state_space, spaces.Space): 147 | return CreateSingleInput(state_space) 148 | else: 149 | raise ValueError('state space error') 150 | -------------------------------------------------------------------------------- /rlzoo/common/build_rlbench_env.py: -------------------------------------------------------------------------------- 1 
| import sys 2 | from collections import OrderedDict 3 | 4 | import numpy as np 5 | from gym import spaces 6 | 7 | from pyrep.const import RenderMode 8 | from pyrep.objects.dummy import Dummy 9 | from pyrep.objects.vision_sensor import VisionSensor 10 | from rlbench.environment import Environment 11 | from rlbench.action_modes import ArmActionMode, ActionMode 12 | from rlbench.observation_config import ObservationConfig 13 | from rlbench.tasks import * 14 | 15 | 16 | # Don't forget to add: export PYTHONPATH=PATH_TO_YOUR_LOCAL_RLBENCH_REPO 17 | 18 | # list of state types 19 | state_types = ['left_shoulder_rgb', 20 | 'left_shoulder_depth', 21 | 'left_shoulder_mask', 22 | 'right_shoulder_rgb', 23 | 'right_shoulder_depth', 24 | 'right_shoulder_mask', 25 | 'wrist_rgb', 26 | 'wrist_depth', 27 | 'wrist_mask', 28 | 'joint_velocities', 29 | 'joint_velocities_noise', 30 | 'joint_positions', 31 | 'joint_positions_noise', 32 | 'joint_forces', 33 | 'joint_forces_noise', 34 | 'gripper_pose', 35 | 'gripper_touch_forces', 36 | 'task_low_dim_state'] 37 | 38 | 39 | class RLBenchEnv(): 40 | """ make RLBench env to have same interfaces as openai.gym """ 41 | 42 | def __init__(self, task_name: str, state_type: list = 'state', ): 43 | # render_mode=None): 44 | """ 45 | create RL Bench environment 46 | :param task_name: task names can be found in rlbench.tasks 47 | :param state_type: state or vision or a sub list of state_types list like ['left_shoulder_rgb'] 48 | """ 49 | if state_type == 'state' or state_type == 'vision' or isinstance(state_type, list): 50 | self._state_type = state_type 51 | else: 52 | raise ValueError('State type value error, your value is {}'.format(state_type)) 53 | # self._render_mode = render_mode 54 | self._render_mode = None 55 | obs_config = ObservationConfig() 56 | obs_config.set_all(True) 57 | action_mode = ActionMode(ArmActionMode.ABS_JOINT_VELOCITY) 58 | self.env = Environment( 59 | action_mode, obs_config=obs_config, headless=True) 60 | self.env.launch() 61 | try: 62 | self.task = self.env.get_task(getattr(sys.modules[__name__], task_name)) 63 | except: 64 | raise NotImplementedError 65 | 66 | _, obs = self.task.reset() 67 | self.spec = Spec(task_name) 68 | 69 | if self._state_type == 'state': 70 | self.observation_space = spaces.Box( 71 | low=-np.inf, high=np.inf, shape=obs.get_low_dim_data().shape) 72 | elif self._state_type == 'vision': 73 | space_dict = OrderedDict() 74 | space_dict["state"] = spaces.Box( 75 | low=-np.inf, high=np.inf, shape=obs.get_low_dim_data().shape) 76 | for i in ["left_shoulder_rgb", "right_shoulder_rgb", "wrist_rgb", "front_rgb"]: 77 | space_dict[i] = spaces.Box( 78 | low=0, high=1, shape=getattr(obs, i).shape) 79 | self.observation_space = spaces.Dict(space_dict) 80 | else: 81 | space_dict = OrderedDict() 82 | for name in self._state_type: 83 | if name.split('_')[-1] in ('rgb', 'depth', 'mask'): 84 | space_dict[name] = spaces.Box( 85 | low=0, high=1, shape=getattr(obs, name).shape) 86 | else: 87 | space_dict[name] = spaces.Box( 88 | low=-np.inf, high=np.inf, 89 | shape=getattr(obs, name).shape) 90 | self.observation_space = spaces.Dict(space_dict) 91 | self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(self.env.action_size,), dtype=np.float32) 92 | 93 | # if render_mode is not None: 94 | # # Add the camera to the scene 95 | # cam_placeholder = Dummy('cam_cinematic_placeholder') 96 | # self._gym_cam = VisionSensor.create([640, 360]) 97 | # self._gym_cam.set_pose(cam_placeholder.get_pose()) 98 | # if render_mode == 'human': 99 | # 
self._gym_cam.set_render_mode(RenderMode.OPENGL3_WINDOWED) 100 | # else: 101 | # self._gym_cam.set_render_mode(RenderMode.OPENGL3) 102 | 103 | def _extract_obs(self, obs): 104 | if self._state_type == 'state': 105 | return np.array(obs.get_low_dim_data(), np.float32) 106 | elif self._state_type == 'vision': 107 | return np.array([np.array(obs.get_low_dim_data(), np.float32), 108 | np.array(obs.left_shoulder_rgb, np.float32), 109 | np.array(obs.right_shoulder_rgb, np.float32), 110 | np.array(obs.wrist_rgb, np.float32), 111 | np.array(obs.front_rgb, np.float32), ]) 112 | else: 113 | result = ['tag'] 114 | for name in self._state_type: 115 | result.append(np.array(getattr(obs, name), np.float32)) 116 | return np.delete(np.array(result,), 0, 0) 117 | 118 | def seed(self, seed_value): 119 | # set seed as in openai.gym env 120 | pass 121 | 122 | def render(self, mode='human'): 123 | # todo render available at any time 124 | if self._render_mode is None: 125 | self._render_mode = mode 126 | # Add the camera to the scene 127 | cam_placeholder = Dummy('cam_cinematic_placeholder') 128 | self._gym_cam = VisionSensor.create([640, 360]) 129 | self._gym_cam.set_pose(cam_placeholder.get_pose()) 130 | if mode == 'human': 131 | self._gym_cam.set_render_mode(RenderMode.OPENGL3_WINDOWED) 132 | else: 133 | self._gym_cam.set_render_mode(RenderMode.OPENGL3) 134 | 135 | if mode != self._render_mode: 136 | raise ValueError( 137 | 'The render mode must match the render mode selected in the ' 138 | 'constructor. \nI.e. if you want "human" render mode, then ' 139 | 'create the env by calling: ' 140 | 'gym.make("reach_target-state-v0", render_mode="human").\n' 141 | 'You passed in mode %s, but expected %s.' % ( 142 | mode, self._render_mode)) 143 | if mode == 'rgb_array': 144 | return self._gym_cam.capture_rgb() 145 | 146 | def reset(self): 147 | descriptions, obs = self.task.reset() 148 | return self._extract_obs(obs) 149 | 150 | def step(self, action): 151 | obs, reward, terminate = self.task.step(action) 152 | return self._extract_obs(obs), reward, terminate, None 153 | 154 | def close(self): 155 | self.env.shutdown() 156 | 157 | 158 | class Spec(): 159 | """ a fake spec """ 160 | 161 | def __init__(self, id_name): 162 | self.id = id_name 163 | -------------------------------------------------------------------------------- /rlzoo/common/distributions.py: -------------------------------------------------------------------------------- 1 | """Definition of parametrized distributions. Adapted from openai/baselines""" 2 | import copy 3 | from functools import wraps 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from gym import spaces 8 | 9 | 10 | def expand_dims(func): 11 | @wraps(func) 12 | def wrapper(*args, **kwargs): 13 | result = func(*args, **kwargs) 14 | result = tf.expand_dims(result, axis=-1) 15 | return result 16 | 17 | return wrapper 18 | 19 | 20 | class Distribution(object): 21 | """A particular probability distribution""" 22 | 23 | def set_param(self, *args, **kwargs): 24 | raise NotImplementedError 25 | 26 | def sample(self, *args, **kwargs): 27 | """Sampling from distribution. 
Allow explore parameters.""" 28 | raise NotImplementedError 29 | 30 | def logp(self, x): 31 | """Calculate log probability of a sample.""" 32 | return -self.neglogp(x) 33 | 34 | def neglogp(self, x): 35 | """Calculate negative log probability of a sample.""" 36 | raise NotImplementedError 37 | 38 | def kl(self, *parameters): 39 | """Calculate Kullback–Leibler divergence""" 40 | raise NotImplementedError 41 | 42 | def entropy(self): 43 | """Calculate the entropy of distribution.""" 44 | raise NotImplementedError 45 | 46 | 47 | class Categorical(Distribution): 48 | """Creates a categorical distribution""" 49 | 50 | def __init__(self, ndim, logits=None): 51 | """ 52 | Args: 53 | ndim (int): total number of actions 54 | logits (tensor): logits variables 55 | """ 56 | self._ndim = ndim 57 | self._logits = logits 58 | self.param = self._logits 59 | 60 | @property 61 | def ndim(self): 62 | return copy.copy(self._ndim) 63 | 64 | def set_param(self, logits): 65 | """ 66 | Args: 67 | logits (tensor): logits variables to set 68 | """ 69 | self._logits = logits 70 | self.param = self._logits 71 | 72 | def get_param(self): 73 | return copy.deepcopy(self._logits) 74 | 75 | def sample(self): 76 | """ Sample actions from distribution, using the Gumbel-Softmax trick """ 77 | u = np.array(np.random.uniform(0, 1, size=np.shape(self._logits)), dtype=np.float32) 78 | res = tf.argmax(self._logits - tf.math.log(-tf.math.log(u)), axis=-1) 79 | return res 80 | 81 | def greedy_sample(self): 82 | """ Get actions greedily """ 83 | _probs = tf.nn.softmax(self._logits) 84 | return tf.argmax(_probs, axis=-1) 85 | 86 | def logp(self, x): 87 | return -self.neglogp(x) 88 | 89 | @expand_dims 90 | def neglogp(self, x): 91 | x = np.array(x) 92 | if np.any(x % 1): 93 | raise ValueError('Input float actions in discrete action space') 94 | x = tf.convert_to_tensor(x, tf.int32) 95 | x = tf.one_hot(x, self._ndim, axis=-1) 96 | return tf.nn.softmax_cross_entropy_with_logits(x, self._logits) 97 | 98 | @expand_dims 99 | def kl(self, logits): 100 | """ 101 | Args: 102 | logits (tensor): logits variables of another distribution 103 | """ 104 | a0 = self._logits - tf.reduce_max(self._logits, axis=-1, keepdims=True) 105 | a1 = logits - tf.reduce_max(logits, axis=-1, keepdims=True) 106 | ea0 = tf.exp(a0) 107 | ea1 = tf.exp(a1) 108 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 109 | z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True) 110 | p0 = ea0 / z0 111 | return tf.reduce_sum( 112 | p0 * (a0 - tf.math.log(z0) - a1 + tf.math.log(z1)), axis=-1) 113 | 114 | @expand_dims 115 | def entropy(self): 116 | a0 = self._logits - tf.reduce_max(self._logits, axis=-1, keepdims=True) 117 | ea0 = tf.exp(a0) 118 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True) 119 | p0 = ea0 / z0 120 | return tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis=-1) 121 | 122 | 123 | class DiagGaussian(Distribution): 124 | """Creates a diagonal Gaussian distribution """ 125 | 126 | def __init__(self, ndim, mean_logstd=None): 127 | """ 128 | Args: 129 | ndim (int): the dimenstion of actions 130 | mean_logstd (tensor): mean and logstd stacked on the last axis 131 | """ 132 | self._ndim = ndim 133 | self.mean = None 134 | self.logstd = None 135 | self.std = None 136 | self.action_mean = None 137 | self.action_scale = None 138 | self.param = self.mean, self.logstd 139 | if mean_logstd is not None: 140 | self.set_param(mean_logstd) 141 | 142 | @property 143 | def ndim(self): 144 | return copy.copy(self._ndim) 145 | 146 | def set_param(self, mean_logstd): 147 | """ 148 | Args: 149 
| mean_logstd (tensor): mean and log std 150 | """ 151 | self.mean, self.logstd = mean_logstd 152 | self.std = tf.math.exp(self.logstd) 153 | self.param = self.mean, self.logstd 154 | 155 | def get_param(self): 156 | """ Get parameters """ 157 | return copy.deepcopy(self.mean), copy.deepcopy(self.logstd) 158 | 159 | def sample(self): 160 | """ Get actions in deterministic or stochastic manner """ 161 | return self.mean, self.std * np.random.normal(0, 1, np.shape(self.mean)) 162 | 163 | def greedy_sample(self): 164 | """ Get actions greedily/deterministically """ 165 | return self.mean 166 | 167 | def logp(self, x): 168 | return -self.neglogp(x) 169 | 170 | @expand_dims 171 | def neglogp(self, x): 172 | # here we reverse the action normalization to make the computation of negative log probability correct 173 | x = (x - self.action_mean)/self.action_scale 174 | 175 | return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ 176 | + 0.5 * np.log(2.0 * np.pi) * float(self._ndim) + tf.reduce_sum(self.logstd, axis=-1) 177 | 178 | @expand_dims 179 | def kl(self, mean_logstd): 180 | """ 181 | Args: 182 | mean_logstd (tensor): mean and logstd of another distribution 183 | """ 184 | mean, logstd = mean_logstd 185 | return tf.reduce_sum( 186 | logstd - self.logstd + 187 | (tf.square(self.std) + tf.square(self.mean - mean)) 188 | / (2.0 * tf.square(tf.math.exp(logstd))) - 0.5, axis=-1) 189 | 190 | @expand_dims 191 | def entropy(self): 192 | return tf.reduce_sum( 193 | self.logstd + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1) 194 | 195 | 196 | def make_dist(ac_space): 197 | """Get distribution based on action space 198 | 199 | :param ac_space: gym.spaces.Space 200 | """ 201 | if isinstance(ac_space, spaces.Discrete): 202 | return Categorical(ac_space.n) 203 | elif isinstance(ac_space, spaces.Box): 204 | assert len(ac_space.shape) == 1 205 | return DiagGaussian(ac_space.shape[0]) 206 | else: 207 | raise NotImplementedError 208 | -------------------------------------------------------------------------------- /rlzoo/common/math_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for mathematics utilization. 3 | 4 | # Requirements 5 | tensorflow==2.0.0a0 6 | tensorlayer==2.0.1 7 | 8 | """ 9 | 10 | 11 | def flatten_dims(shapes): # will be moved to common 12 | dim = 1 13 | for s in shapes: 14 | dim *= s 15 | return dim 16 | -------------------------------------------------------------------------------- /rlzoo/common/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Functions for utilization. 
3 | 4 | # Requirements 5 | tensorflow==2.0.0a0 6 | tensorlayer==2.0.1 7 | 8 | """ 9 | import os 10 | import re 11 | 12 | import gym 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | import tensorlayer as tl 16 | import tensorflow as tf 17 | from importlib import import_module 18 | 19 | 20 | def plot(episode_rewards, algorithm_name, env_name): 21 | """ 22 | plot the learning curve, saved as ./img/algorithm_name-env_name.png 23 | 24 | :param episode_rewards: array of floats 25 | :param algorithm_name: string 26 | :param env_name: string 27 | """ 28 | path = os.path.join('.', 'img') 29 | name = algorithm_name + '-' + env_name 30 | plt.figure(figsize=(10, 5)) 31 | plt.title(name) 32 | plt.plot(np.arange(len(episode_rewards)), episode_rewards) 33 | plt.xlabel('Episode') 34 | plt.ylabel('Episode Reward') 35 | if not os.path.exists(path): 36 | os.makedirs(path) 37 | plt.savefig(os.path.join(path, name + '.png')) 38 | plt.close() 39 | 40 | 41 | def plot_save_log(episode_rewards, algorithm_name, env_name): 42 | """ 43 | plot the learning curve, saved as ./img/algorithm_name-env_name.png, 44 | and save the rewards log as ./log/algorithm_name-env_name.npy 45 | 46 | :param episode_rewards: array of floats 47 | :param algorithm_name: string 48 | :param env_name: string 49 | """ 50 | path = os.path.join('.', 'log') 51 | name = algorithm_name + '-' + env_name 52 | plot(episode_rewards, algorithm_name, env_name) 53 | if not os.path.exists(path): 54 | os.makedirs(path) 55 | np.save(os.path.join(path, name), episode_rewards) 56 | 57 | 58 | def save_model(model, model_name, algorithm_name, env_name): 59 | """ 60 | save trained neural network model 61 | 62 | :param model: tensorlayer.models.Model 63 | :param model_name: string, e.g. 'model_sac_q1' 64 | :param algorithm_name: string, e.g. 'SAC' 65 | """ 66 | name = algorithm_name + '-' + env_name 67 | path = os.path.join('.', 'model', name) 68 | if not os.path.exists(path): 69 | os.makedirs(path) 70 | tl.files.save_npz(model.trainable_weights, os.path.join(path, model_name)) 71 | 72 | 73 | def load_model(model, model_name, algorithm_name, env_name): 74 | """ 75 | load saved neural network model 76 | 77 | :param model: tensorlayer.models.Model 78 | :param model_name: string, e.g. 'model_sac_q1' 79 | :param algorithm_name: string, e.g. 
'SAC' 80 | """ 81 | name = algorithm_name + '-' + env_name 82 | path = os.path.join('.', 'model', name) 83 | try: 84 | param = tl.files.load_npz(path, model_name + '.npz') 85 | for p0, p1 in zip(model.trainable_weights, param): 86 | p0.assign(p1) 87 | except Exception as e: 88 | print('Load Model Fails!') 89 | raise e 90 | 91 | 92 | def parse_all_args(parser): 93 | """ Parse known and unknown args """ 94 | common_options, other_args = parser.parse_known_args() 95 | other_options = dict() 96 | index = 0 97 | n = len(other_args) 98 | float_pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$') 99 | while index < n: # only str, int and float type will be parsed 100 | if other_args[index].startswith('--'): 101 | if other_args[index].__contains__('='): 102 | key, value = other_args[index].split('=') 103 | index += 1 104 | else: 105 | key, value = other_args[index:index + 2] 106 | index += 2 107 | if re.match(float_pattern, value): 108 | value = float(value) 109 | if value.is_integer(): 110 | value = int(value) 111 | other_options[key[2:]] = value 112 | return common_options, other_options 113 | 114 | 115 | def make_env(env_id): 116 | env = gym.make(env_id).unwrapped 117 | """ add env wrappers here """ 118 | return env 119 | 120 | 121 | def get_algorithm_module(algorithm, submodule): 122 | """ Get algorithm module in the corresponding folder """ 123 | return import_module('.'.join(['rlzoo', 'algorithms', algorithm, submodule])) 124 | 125 | 126 | def call_default_params(env, envtype, alg, default_seed=True): 127 | """ Get the default parameters for training from the default script """ 128 | alg = alg.lower() 129 | default = import_module('.'.join(['rlzoo', 'algorithms', alg, 'default'])) 130 | params = getattr(default, envtype)(env, 131 | default_seed) # need manually set seed in the main script if default_seed = False 132 | return params 133 | 134 | 135 | def set_seed(seed, env=None): 136 | """ set random seed for reproduciblity """ 137 | if isinstance(env, list): 138 | assert isinstance(seed, list) 139 | for i in range(len(env)): 140 | env[i].seed(seed[i]) 141 | seed = seed[0] # pick one seed for np and tf 142 | elif env is not None: 143 | env.seed(seed) 144 | np.random.seed(seed) 145 | tf.random.set_seed(seed) 146 | -------------------------------------------------------------------------------- /rlzoo/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/distributed/__init__.py -------------------------------------------------------------------------------- /rlzoo/distributed/dis_components.py: -------------------------------------------------------------------------------- 1 | import enum 2 | 3 | import tensorflow as tf 4 | from kungfu.python import current_cluster_size, current_rank 5 | from kungfu.tensorflow.ops import (barrier, request_variable, 6 | request_variable_with_template, 7 | save_variable, subset_all_reduce) 8 | from kungfu.tensorflow.ops.queue import new_queue 9 | 10 | 11 | class Role(enum.Enum): 12 | Learner = 1 13 | Actor = 2 14 | Server = 3 15 | 16 | 17 | def show_role_name(role): 18 | return { 19 | Role.Learner: 'learner', 20 | Role.Actor: 'actor', 21 | Role.Server: 'server', 22 | }[role] 23 | 24 | 25 | def _interval(n, offset=0): 26 | return list(range(offset, offset + n)) 27 | 28 | 29 | class Agent: 30 | def __init__(self, n_learners=1, n_actors=1, n_servers=1): 31 | rank = current_rank() 32 | size = 
current_cluster_size() 33 | if n_learners + n_actors + n_servers != size: 34 | raise RuntimeError('invalid cluster size') 35 | self._n_learners = n_learners 36 | self._n_actors = n_actors 37 | self._n_servers = n_servers 38 | self._global_rank = rank 39 | self._global_size = size 40 | roles = [Role.Learner] * n_learners + [Role.Actor] * n_actors + [Role.Server] * n_servers 41 | rank2role = dict(enumerate(roles)) 42 | self._role = rank2role[rank] 43 | self._roles = { 44 | Role.Learner: _interval(n_learners), 45 | Role.Actor: _interval(n_actors, n_learners), 46 | Role.Server: _interval(n_servers, n_learners + n_actors), 47 | } 48 | self._role_sizes = { 49 | Role.Learner: n_learners, 50 | Role.Actor: n_actors, 51 | Role.Server: n_servers, 52 | } 53 | self._role_offsets = { 54 | Role.Learner: 0, 55 | Role.Actor: n_learners, 56 | Role.Server: n_learners + n_actors, 57 | } 58 | self._role_rank = self._global_rank - self._role_offsets[self._role] 59 | self._role_size = self._role_sizes[self._role] 60 | 61 | def _to_global_rank(self, role, role_rank): 62 | return int(self._role_offsets[role] + int(role_rank)) 63 | 64 | # metadata APIs 65 | def role(self): 66 | return self._role 67 | 68 | def role_rank(self): 69 | return self._role_rank 70 | 71 | def role_size(self, role=None): 72 | if role is None: 73 | return self._role_size 74 | else: 75 | return self._role_sizes[role] 76 | 77 | # collective APIs 78 | def barrier(self): 79 | return barrier() 80 | 81 | def role_all_reduce(self, x): 82 | role_ranks = self._roles[self._role] 83 | topology = [i for i in range(self._global_size)] 84 | for i in role_ranks: 85 | topology[i] = role_ranks[0] 86 | # TODO: generate subset topology 87 | return subset_all_reduce(x, topology) 88 | 89 | # p2p APIs 90 | def save(self, x, name=None): 91 | return save_variable(x, name=name) 92 | 93 | def request(self, role: Role, role_rank, name, shape, dtype): 94 | role_size = self._role_sizes[role] 95 | assert (0 <= role_rank and role_rank < role_size) 96 | target = self._to_global_rank(role, role_rank) 97 | return request_variable( 98 | target, 99 | name=name, 100 | shape=shape, 101 | dtype=dtype, 102 | ) 103 | 104 | def new_queue(self, src, dst): 105 | """create a uni-direction queue.""" 106 | role1, rank1 = src 107 | role2, rank2 = dst 108 | srcRank = self._to_global_rank(role1, rank1) 109 | dstRank = self._to_global_rank(role2, rank2) 110 | return new_queue(srcRank, dstRank) 111 | 112 | def new_queue_pair(self, a, b): 113 | """create a pair of queues.""" 114 | q1 = self.new_queue(a, b) 115 | q2 = self.new_queue(b, a) 116 | return q1, q2 117 | 118 | 119 | class LearnerExample: 120 | pass 121 | 122 | 123 | class ActorExample: 124 | pass 125 | 126 | 127 | class ServerExample: 128 | pass 129 | -------------------------------------------------------------------------------- /rlzoo/distributed/run_dis_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | cd $(dirname $0) 5 | 6 | kungfu_flags() { 7 | echo -q 8 | echo -logdir logs 9 | 10 | local ip1=127.0.0.1 11 | local np1=$np 12 | 13 | local ip2=127.0.0.10 14 | local np2=$np 15 | local H=$ip1:$np1,$ip2:$np2 16 | local m=cpu,gpu 17 | 18 | echo -H $ip1:$np1 19 | } 20 | 21 | prun() { 22 | local np=$1 23 | shift 24 | kungfu-run $(kungfu_flags) -np $np $@ 25 | } 26 | 27 | n_learner=2 28 | n_actor=2 29 | n_server=1 30 | 31 | flags() { 32 | echo -l $n_learner 33 | echo -a $n_actor 34 | echo -s $n_server 35 | } 36 | 37 | rl_run() { 38 | local n=$((n_learner + n_actor + 
n_server)) 39 | prun $n python3 training_components.py $(flags) 40 | } 41 | 42 | main() { 43 | rl_run 44 | } 45 | 46 | main 47 | -------------------------------------------------------------------------------- /rlzoo/distributed/start_dis_role.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from rlzoo.distributed.dis_components import * 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | 8 | def parse_args(): 9 | p = argparse.ArgumentParser() 10 | p.add_argument('-l', type=int, default=1) 11 | p.add_argument('-a', type=int, default=1) 12 | p.add_argument('-s', type=int, default=1) 13 | p.add_argument('-f', type=str, default='') # config.json 14 | 15 | args = p.parse_args() 16 | return args 17 | 18 | 19 | def run_learner(agent, args, training_conf, env_conf, agent_conf): 20 | agent_generator = agent_conf['agent_generator'] 21 | total_step, traj_len, train_n_traj = training_conf['total_step'], training_conf['traj_len'], training_conf['train_n_traj'], 22 | obs_shape, act_shape = env_conf['obs_shape'], env_conf['act_shape'] 23 | 24 | if agent.role_rank() == 0: 25 | param_q = agent.new_queue((Role.Learner, 0), (Role.Server, 0)) 26 | 27 | traj_q = agent.new_queue((Role.Server, 0), (Role.Learner, agent.role_rank())) 28 | 29 | rl_agent = agent_generator() 30 | rl_agent.init_components() 31 | 32 | # init model 33 | rl_agent.update_model([agent.role_all_reduce(weights) for weights in rl_agent.all_weights]) 34 | 35 | if agent.role_rank() == 0: 36 | for weight in rl_agent.all_weights: 37 | param_q.put(tf.Variable(weight, dtype=tf.float32)) 38 | 39 | n_update = total_step // (traj_len * agent.role_size(Role.Learner) * train_n_traj) 40 | for i in range(n_update): 41 | traj_list = [[traj_q.get(dtype=tf.float32, shape=(traj_len, *shape)) for shape in [ 42 | obs_shape, act_shape, (), (), obs_shape, (), (1,)]] for _ in range(train_n_traj)] 43 | 44 | rl_agent.train(traj_list, dis_agent=agent) 45 | 46 | # send weights to server 47 | if agent.role_rank() == 0: 48 | for weight in rl_agent.all_weights: 49 | param_q.put(tf.Variable(weight, dtype=tf.float32)) 50 | print('learner finished') 51 | 52 | 53 | def run_actor(agent, args, training_conf, env_conf): # sampler 54 | env_maker, total_step = env_conf['env_maker'], training_conf['total_step'] 55 | 56 | from gym import spaces 57 | 58 | env = env_maker() 59 | action_q, step_data_q = agent.new_queue_pair((Role.Server, 0), (Role.Actor, agent.role_rank())) 60 | 61 | state, reward, done = env.reset(), 0, False 62 | each_total_step = int(total_step/agent.role_size(Role.Actor)) 63 | action_dtype = tf.int32 if isinstance(env.action_space, spaces.Discrete) else tf.float32 64 | for i in range(each_total_step): 65 | step_data_q.put(tf.Variable(state, dtype=tf.float32)) 66 | a = action_q.get(dtype=action_dtype, shape=env.action_space.shape).numpy() 67 | next_state, reward, done, _ = env.step(a) 68 | for data in (reward, done, next_state): 69 | step_data_q.put(tf.Variable(data, dtype=tf.float32)) 70 | if done: 71 | state = env.reset() 72 | else: 73 | state = next_state 74 | print('actor finished') 75 | 76 | 77 | def run_server(agent, args, training_conf, env_conf, agent_conf): 78 | total_step, traj_len, train_n_traj, save_interval = training_conf['total_step'], training_conf['traj_len'], \ 79 | training_conf['train_n_traj'], training_conf['save_interval'], 80 | obs_shape, env_name = env_conf['obs_shape'], env_conf['env_name'] 81 | agent_generator = agent_conf['agent_generator'] 82 | 83 | from 
rlzoo.algorithms.dppo_clip_distributed.dppo_clip import DPPO_CLIP 84 | from rlzoo.distributed.dis_components import Role 85 | from gym import spaces 86 | 87 | learner_size = agent.role_size(Role.Learner) 88 | rl_agent: DPPO_CLIP = agent_generator() 89 | rl_agent.init_components() 90 | 91 | # queue to actor 92 | q_list = [agent.new_queue_pair((Role.Server, 0), (Role.Actor, i)) for i in 93 | range(agent.role_size(Role.Actor))] 94 | action_q_list, step_data_q_list = zip(*q_list) 95 | 96 | # queue to learner 97 | param_q = agent.new_queue((Role.Learner, 0), (Role.Server, 0)) 98 | traj_q_list = [agent.new_queue((Role.Server, 0), (Role.Learner, i)) for i in 99 | range(agent.role_size(Role.Learner))] 100 | 101 | # syn net weights from learner 102 | all_weights = [param_q.get(dtype=weight.dtype, shape=weight.shape) for weight in rl_agent.all_weights] 103 | rl_agent.update_model(all_weights) 104 | 105 | train_cnt = 0 106 | action_dtype = tf.int32 if isinstance(rl_agent.actor.action_space, spaces.Discrete) else tf.float32 107 | 108 | curr_step = 0 109 | 110 | total_reward_list = [] 111 | curr_reward_list = [] 112 | tmp_eps_reward = 0 113 | while curr_step < total_step: 114 | # tmp_eps_reward = 0 # todo env with no end 115 | for _ in range(traj_len): 116 | curr_step += agent.role_size(Role.Actor) 117 | 118 | state_list = [] 119 | for step_data_q in step_data_q_list: 120 | state_list.append(step_data_q.get(dtype=tf.float32, shape=obs_shape)) 121 | 122 | action_list, log_p_list = rl_agent.get_action(state_list, batch_data=True) 123 | 124 | for action_q, action in zip(action_q_list, action_list): 125 | action_q.put(tf.Variable(action, dtype=action_dtype)) 126 | reward_list, done_list, next_state_list = [], [], [], 127 | for i, step_data_q in enumerate(step_data_q_list): 128 | reward = step_data_q.get(dtype=tf.float32, shape=()) 129 | if i == 0: 130 | tmp_eps_reward += reward 131 | reward_list.append(reward) 132 | done = step_data_q.get(dtype=tf.float32, shape=()) 133 | if i == 0 and done: 134 | curr_reward_list.append(tmp_eps_reward) 135 | tmp_eps_reward = 0 136 | done_list.append(done) 137 | next_state_list.append(step_data_q.get(dtype=tf.float32, shape=obs_shape)) 138 | rl_agent.collect_data(state_list, action_list, reward_list, done_list, next_state_list, log_p_list, True) 139 | 140 | rl_agent.update_traj_list() 141 | 142 | # send traj to each learner and update weight 143 | learn_traj_len = learner_size * train_n_traj 144 | if len(rl_agent.traj_list) >= learn_traj_len: 145 | train_cnt += 1 146 | 147 | # todo env with end 148 | avg_eps_reward = None 149 | if curr_reward_list: 150 | avg_eps_reward = np.mean(curr_reward_list) 151 | curr_reward_list.clear() 152 | total_reward_list.append(avg_eps_reward) 153 | 154 | # todo env with no end 155 | # avg_eps_reward = tmp_eps_reward 156 | # total_reward_list.append(np.array(avg_eps_reward)) 157 | 158 | print('Training iters: {}, steps so far: {}, average eps reward: {}'.format( 159 | train_cnt, curr_step, np.array(avg_eps_reward))) 160 | 161 | rl_agent.plot_save_log(total_reward_list, env_name) 162 | 163 | traj_iter = iter(rl_agent.traj_list[:learn_traj_len]) 164 | rl_agent.traj_list = rl_agent.traj_list[learn_traj_len:] 165 | 166 | # send traj data to each learner 167 | for i, traj_q in enumerate(traj_q_list): 168 | for _ in range(train_n_traj): 169 | try: 170 | traj_data = next(traj_iter) 171 | except StopIteration: 172 | break 173 | for data in traj_data: 174 | traj_q.put(tf.Variable(data, dtype=tf.float32)) 175 | 176 | # syn net weights from learner 177 | 
all_weights = [param_q.get(dtype=weight.dtype, shape=weight.shape) for weight in rl_agent.all_weights] 178 | rl_agent.update_model(all_weights) 179 | 180 | # save model 181 | if not train_cnt % save_interval: 182 | rl_agent.save_ckpt(env_name) 183 | 184 | # save the final model 185 | rl_agent.save_ckpt(env_name) 186 | print('Server Finished.') 187 | 188 | 189 | def main(training_conf, env_conf, agent_conf): 190 | args = parse_args() 191 | agent = Agent(n_learners=args.l, n_actors=args.a, n_servers=args.s) 192 | 193 | print('%s : %d/%d' % (agent.role(), agent.role_rank(), agent.role_size())) 194 | 195 | agent.barrier() 196 | 197 | if agent.role() == Role.Learner: 198 | run_learner(agent, args, training_conf, env_conf, agent_conf) 199 | elif agent.role() == Role.Actor: 200 | run_actor(agent, args, training_conf, env_conf) 201 | elif agent.role() == Role.Server: 202 | run_server(agent, args, training_conf, env_conf, agent_conf) 203 | else: 204 | raise RuntimeError('Invalid Role.') 205 | 206 | agent.barrier() 207 | -------------------------------------------------------------------------------- /rlzoo/distributed/training_components.py: -------------------------------------------------------------------------------- 1 | from rlzoo.common.env_wrappers import build_env 2 | from rlzoo.common.policy_networks import * 3 | from rlzoo.common.value_networks import * 4 | from rlzoo.algorithms.dppo_clip_distributed.dppo_clip import DPPO_CLIP 5 | from functools import partial 6 | 7 | # Specify the training configurations 8 | training_conf = { 9 | 'total_step': int(1e7), # overall training timesteps 10 | 'traj_len': 200, # length of the rollout trajectory 11 | 'train_n_traj': 2, # update the models after every certain number of trajectories for each learner 12 | 'save_interval': 10, # saving the models after every certain number of updates 13 | } 14 | 15 | # Specify the environment and launch it 16 | env_name, env_type = 'CartPole-v0', 'classic_control' 17 | env_maker = partial(build_env, env_name, env_type) 18 | temp_env = env_maker() 19 | obs_shape, act_shape = temp_env.observation_space.shape, temp_env.action_space.shape 20 | 21 | env_conf = { 22 | 'env_name': env_name, 23 | 'env_type': env_type, 24 | 'env_maker': env_maker, 25 | 'obs_shape': obs_shape, 26 | 'act_shape': act_shape, 27 | } 28 | 29 | 30 | def build_network(observation_space, action_space, name='DPPO_CLIP'): 31 | """ build networks for the algorithm """ 32 | hidden_dim = 256 33 | num_hidden_layer = 2 34 | critic = ValueNetwork(observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value') 35 | 36 | actor = StochasticPolicyNetwork(observation_space, action_space, 37 | [hidden_dim] * num_hidden_layer, 38 | trainable=True, 39 | name=name + '_policy') 40 | return critic, actor 41 | 42 | 43 | def build_opt(actor_lr=1e-4, critic_lr=2e-4): 44 | """ choose the optimizer for learning """ 45 | import tensorflow as tf 46 | return [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)] 47 | 48 | 49 | net_builder = partial(build_network, temp_env.observation_space, temp_env.action_space) 50 | opt_builder = partial(build_opt, ) 51 | 52 | agent_conf = { 53 | 'net_builder': net_builder, 54 | 'opt_builder': opt_builder, 55 | 'agent_generator': partial(DPPO_CLIP, net_builder, opt_builder), 56 | } 57 | del temp_env 58 | 59 | from rlzoo.distributed.start_dis_role import main 60 | 61 | print('Start Training.') 62 | main(training_conf, env_conf, agent_conf) 63 | print('Training Finished.') 64 | 
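training_components.py above is not started as a single plain python3 process: every worker infers its role from its KungFu rank, so the script must be launched once per process through kungfu-run, which is what run_dis_train.sh does. With that script's defaults (2 learners, 2 actors, 1 server) the expanded command is roughly: kungfu-run -q -logdir logs -H 127.0.0.1:5 -np 5 python3 training_components.py -l 2 -a 2 -s 1. The rank-to-role split performed by Agent in dis_components.py then amounts to the sketch below (illustration only; the real class reads its rank and cluster size from the KungFu runtime):

# How the 5 global ranks are partitioned for -l 2 -a 2 -s 1, mirroring Agent.__init__ in dis_components.py.
n_learners, n_actors, n_servers = 2, 2, 1
roles = ['learner'] * n_learners + ['actor'] * n_actors + ['server'] * n_servers
offsets = {'learner': 0, 'actor': n_learners, 'server': n_learners + n_actors}
for global_rank, role in enumerate(roles):
    print(global_rank, role, 'role_rank =', global_rank - offsets[role])
# Ranks 0-1 become learners, ranks 2-3 actors and rank 4 the single server,
# matching Role.Learner, Role.Actor and Role.Server.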
-------------------------------------------------------------------------------- /rlzoo/interactive/.gitignore: -------------------------------------------------------------------------------- 1 | img/ 2 | log/ 3 | model/ 4 | -------------------------------------------------------------------------------- /rlzoo/interactive/common.py: -------------------------------------------------------------------------------- 1 | import decimal 2 | 3 | import ipywidgets as widgets 4 | import numpy as np 5 | 6 | border_list = [None, 'hidden', 'dotted', 'dashed', 'solid', 'double', 7 | 'groove', 'ridge', 'inset', 'outset', 'inherit'] 8 | 9 | 10 | class NumInput(widgets.HBox): 11 | 12 | def __init__(self, init_value, step=None, range_min=None, range_max=None): 13 | self.range = [range_min, range_max] 14 | range_min = 0 if range_min is None else range_min 15 | range_max = init_value * 2 if range_max is None else range_max 16 | self.range_size = max([range_max - init_value, init_value - range_min]) 17 | if step is None: 18 | fs = decimal.Decimal(str(init_value)).as_tuple().exponent 19 | self.decimals = -fs 20 | step = np.round(np.power(0.1, self.decimals), self.decimals) 21 | else: 22 | fs = decimal.Decimal(str(step)).as_tuple().exponent 23 | fv = decimal.Decimal(str(init_value)).as_tuple().exponent 24 | self.decimals = -min(fs, fv) 25 | 26 | self.step = step 27 | 28 | self.slider = widgets.FloatSlider( 29 | value=init_value, 30 | min=range_min, 31 | max=range_max, 32 | step=step, 33 | description='Slider input:', 34 | disabled=False, 35 | continuous_update=False, 36 | orientation='horizontal', 37 | readout=True, 38 | readout_format='.' + str(self.decimals) + 'f' 39 | ) 40 | 41 | self.text = widgets.FloatText( 42 | value=self.slider.value, 43 | description='Manual input:', 44 | disabled=False 45 | ) 46 | 47 | def __extend_max(change): 48 | num_new = np.around(change['new'], decimals=self.decimals) 49 | num_old = change['old'] 50 | if num_new > num_old: 51 | if num_new - num_old > (self.slider.max - num_old) / 2: 52 | self.range_size *= 2 53 | else: 54 | self.range_size *= 0.5 55 | else: 56 | if num_old - num_new > (num_old - self.slider.min) / 2: 57 | self.range_size *= 2 58 | else: 59 | self.range_size *= 0.5 60 | 61 | if self.range_size < self.step * 10: 62 | self.range_size = self.step * 10 63 | 64 | self.slider.min = num_new - self.range_size if self.range[0] is None else self.range[0] 65 | self.slider.max = num_new + self.range_size if self.range[1] is None else self.range[1] 66 | self.slider.value = num_new 67 | self.text.value = num_new 68 | 69 | self.slider.observe(__extend_max, names='value') 70 | self.text.observe(__extend_max, names='value') 71 | box_layout = widgets.Layout(display='flex', 72 | align_items='stretch', 73 | justify_content='center', ) 74 | # self.frame = widgets.HBox([self.slider, self.text], layout=box_layout) 75 | super().__init__([self.slider, self.text], layout=box_layout) 76 | self._int_type = False 77 | if (isinstance(init_value, int) or isinstance(init_value, np.int16) \ 78 | or isinstance(init_value, np.int32) or isinstance(init_value, np.int64)) \ 79 | and step % 1 == 0: 80 | self._int_type = True 81 | 82 | @property 83 | def value(self): 84 | result = self.slider.value 85 | if self._int_type: 86 | result = int(result) 87 | return result 88 | 89 | 90 | class Border: 91 | def __init__(self, element_list, description=None, size=5, style=0): 92 | if not isinstance(element_list, list): 93 | element_list = [element_list] 94 | 95 | box_layout = widgets.Layout(display='flex', 96 
| flex_flow='column', 97 | align_items='flex-start', 98 | align_content='flex-start', 99 | # justify_content='center', 100 | justify_content='space-around', 101 | border=border_list[2] 102 | ) 103 | frame = widgets.Box(children=element_list, layout=box_layout) 104 | 105 | if description is not None: 106 | caption = widgets.HTML(value=""+description+"") 107 | children = [caption, frame] 108 | else: 109 | children = [frame] 110 | 111 | box_layout = widgets.Layout(display='flex', 112 | flex_flow='column', 113 | align_items='center', 114 | justify_content='center', 115 | border=border_list[style], ) 116 | self.frame = widgets.Box(children=children, layout=box_layout) 117 | 118 | 119 | class InfoDisplay: 120 | def __init__(self, description, detail): 121 | label = widgets.Label(description) 122 | self.data = widgets.Label(detail) 123 | self.frame = widgets.HBox([label, self.data], layout=widgets.Layout(justify_content='flex-start', )) 124 | # border=border_list[2])) 125 | -------------------------------------------------------------------------------- /rlzoo/run_rlzoo.py: -------------------------------------------------------------------------------- 1 | from rlzoo.common.env_wrappers import * 2 | from rlzoo.common.utils import * 3 | from rlzoo.algorithms import * 4 | 5 | # EnvName = 'PongNoFrameskip-v4' 6 | # EnvType = 'atari' 7 | 8 | # EnvName = 'CartPole-v0' 9 | EnvName = 'Pendulum-v0' 10 | EnvType = 'classic_control' 11 | 12 | # EnvName = 'BipedalWalker-v2' 13 | # EnvType = 'box2d' 14 | 15 | # EnvName = 'Ant-v2' 16 | # EnvType = 'mujoco' 17 | 18 | # EnvName = 'FetchPush-v1' 19 | # EnvType = 'robotics' 20 | 21 | # EnvName = 'FishSwim-v0' 22 | # EnvType = 'dm_control' 23 | 24 | # EnvName = 'ReachTarget' 25 | # EnvType = 'rlbench' 26 | # env = build_env(EnvName, EnvType, state_type='vision') 27 | 28 | AlgName = 'SAC' 29 | env = build_env(EnvName, EnvType) 30 | alg_params, learn_params = call_default_params(env, EnvType, AlgName) 31 | alg = eval(AlgName+'(**alg_params)') 32 | alg.learn(env=env, mode='train', render=False, **learn_params) 33 | alg.learn(env=env, mode='test', render=True, **learn_params) 34 | 35 | # AlgName = 'DPPO' 36 | # number_workers = 2 # need to specify number of parallel workers in parallel algorithms like A3C and DPPO 37 | # env = build_env(EnvName, EnvType, nenv=number_workers) 38 | # alg_params, learn_params = call_default_params(env, EnvType, AlgName) 39 | # alg_params['method'] = 'clip' # specify 'clip' or 'penalty' method for different version of PPO and DPPO 40 | # alg = eval(AlgName+'(**alg_params)') 41 | # alg.learn(env=env, mode='train', render=False, **learn_params) 42 | # alg.learn(env=env, mode='test', render=True, **learn_params) 43 | 44 | # AlgName = 'PPO' 45 | # env = build_env(EnvName, EnvType) 46 | # alg_params, learn_params = call_default_params(env, EnvType, AlgName) 47 | # alg_params['method'] = 'clip' # specify 'clip' or 'penalty' method for different version of PPO and DPPO 48 | # alg = eval(AlgName+'(**alg_params)') 49 | # alg.learn(env=env, mode='train', render=False, **learn_params) 50 | # alg.learn(env=env, mode='test', render=True, **learn_params) 51 | 52 | # AlgName = 'A3C' 53 | # number_workers = 2 # need to specify number of parallel workers 54 | # env = build_env(EnvName, EnvType, nenv=number_workers) 55 | # alg_params, learn_params = call_default_params(env, EnvType, 'A3C') 56 | # alg = eval(AlgName+'(**alg_params)') 57 | # alg.learn(env=env, mode='train', render=False, **learn_params) 58 | # alg.learn(env=env, mode='test', render=True, 
**learn_params) 59 | 60 | env.close() 61 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | # Read requirements.txt, ignore comments 4 | try: 5 | REQUIRES = list() 6 | f = open("requirements.txt", "rb") 7 | for line in f.read().decode("utf-8").split("\n"): 8 | line = line.strip() 9 | if "#" in line: 10 | line = line[:line.find("#")].strip() 11 | if line: 12 | REQUIRES.append(line) 13 | except OSError: 14 | print("'requirements.txt' not found!") 15 | REQUIRES = list() 16 | 17 | setup( 18 | name = "rlzoo", 19 | version = "1.0.4", 20 | include_package_data=True, 21 | author='Zihan Ding, Tianyang Yu, Yanhua Huang, Hongming Zhang, Hao Dong', 22 | author_email='zhding@mail.ustc.edu.cn', 23 | url = "https://github.com/tensorlayer/RLzoo" , 24 | license = "apache" , 25 | packages = find_packages(), 26 | install_requires=REQUIRES, 27 | description = "A collection of reinforcement learning algorithms with hierarchical code structure and convenient APIs.", 28 | keywords = "Reinforcement Learning", 29 | platforms=['any'], 30 | python_requires='>=3.5', 31 | ) 32 | --------------------------------------------------------------------------------
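setup.py reads the pinned dependencies from requirements.txt and packages every module under rlzoo/, so for development the library is typically installed in editable mode from the repository root with pip install -e . (Python >= 3.5, per python_requires). After installation, the pattern shown in run_rlzoo.py above reduces to a few lines; the sketch below runs SAC on Pendulum-v0 and assumes, as the wildcard import in run_rlzoo.py suggests, that the algorithm classes are re-exported from rlzoo.algorithms:

from rlzoo.common.env_wrappers import build_env
from rlzoo.common.utils import call_default_params
from rlzoo.algorithms import SAC  # assumed re-export, matching run_rlzoo.py's 'from rlzoo.algorithms import *'

env = build_env('Pendulum-v0', 'classic_control')
alg_params, learn_params = call_default_params(env, 'classic_control', 'SAC')
agent = SAC(**alg_params)  # the same constructor call run_rlzoo.py builds via eval(AlgName + '(**alg_params)')
agent.learn(env=env, mode='train', render=False, **learn_params)  # train, then evaluate with rendering
agent.learn(env=env, mode='test', render=True, **learn_params)
env.close()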