├── .gitignore
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── docs
│   ├── Makefile
│   ├── algorithms
│   │   ├── a3c.rst
│   │   ├── ac.rst
│   │   ├── ddpg.rst
│   │   ├── dppo.rst
│   │   ├── dqn.rst
│   │   ├── pg.rst
│   │   ├── ppo.rst
│   │   ├── sac.rst
│   │   ├── td3.rst
│   │   └── trpo.rst
│   ├── common
│   │   ├── basicnets.rst
│   │   ├── buffer.rst
│   │   ├── distributions.rst
│   │   ├── envlist.rst
│   │   ├── envwrappers.rst
│   │   ├── mathutils.rst
│   │   ├── policynets.rst
│   │   ├── utils.rst
│   │   └── valuenets.rst
│   ├── conf.py
│   ├── guide
│   │   ├── api.rst
│   │   ├── configuration.rst
│   │   ├── installation.rst
│   │   └── quickstart.rst
│   ├── img
│   │   ├── logo.png
│   │   └── rlzoo-logo.png
│   ├── index.rst
│   ├── make.bat
│   ├── mkdocs.yml
│   └── other
│       ├── drl_book.rst
│       └── drl_tutorial.rst
├── examples.md
├── gif
│   ├── ACM_MM2021_Presentation_Slide.pdf
│   ├── atari.gif
│   ├── box2d.gif
│   ├── classic.gif
│   ├── dmcontrol.gif
│   ├── interactive.gif
│   ├── mujoco.gif
│   ├── rlbench.gif
│   └── robotics.gif
├── requirements.txt
├── rlzoo
│   ├── .gitignore
│   ├── __init__.py
│   ├── algorithms
│   │   ├── __init__.py
│   │   ├── a3c
│   │   │   ├── __init__.py
│   │   │   ├── a3c.py
│   │   │   ├── default.py
│   │   │   └── run_a3c.py
│   │   ├── ac
│   │   │   ├── __init__.py
│   │   │   ├── ac.py
│   │   │   ├── default.py
│   │   │   └── run_ac.py
│   │   ├── ddpg
│   │   │   ├── __init__.py
│   │   │   ├── ddpg.py
│   │   │   ├── default.py
│   │   │   └── run_ddpg.py
│   │   ├── dppo
│   │   │   ├── __init__.py
│   │   │   ├── default.py
│   │   │   └── dppo.py
│   │   ├── dppo_clip
│   │   │   ├── __init__.py
│   │   │   ├── dppo_clip.py
│   │   │   └── run_dppo_clip.py
│   │   ├── dppo_clip_distributed
│   │   │   ├── __init__.py
│   │   │   └── dppo_clip.py
│   │   ├── dppo_penalty
│   │   │   ├── __init__.py
│   │   │   ├── dppo_penalty.py
│   │   │   └── run_dppo_penalty.py
│   │   ├── dqn
│   │   │   ├── __init__.py
│   │   │   ├── default.py
│   │   │   ├── dqn.py
│   │   │   └── run_dqn.py
│   │   ├── pg
│   │   │   ├── __init__.py
│   │   │   ├── default.py
│   │   │   ├── pg.py
│   │   │   └── run_pg.py
│   │   ├── ppo
│   │   │   ├── __init__.py
│   │   │   ├── default.py
│   │   │   └── ppo.py
│   │   ├── ppo_clip
│   │   │   ├── __init__.py
│   │   │   ├── ppo_clip.py
│   │   │   └── run_ppo_clip.py
│   │   ├── ppo_penalty
│   │   │   ├── __init__.py
│   │   │   ├── ppo_penalty.py
│   │   │   └── run_ppo_penalty.py
│   │   ├── sac
│   │   │   ├── __init__.py
│   │   │   ├── default.py
│   │   │   ├── run_sac.py
│   │   │   └── sac.py
│   │   ├── td3
│   │   │   ├── __init__.py
│   │   │   ├── default.py
│   │   │   ├── run_td3.py
│   │   │   └── td3.py
│   │   └── trpo
│   │       ├── __init__.py
│   │       ├── default.py
│   │       ├── run_trpo.py
│   │       └── trpo.py
│   ├── common
│   │   ├── __init__.py
│   │   ├── basic_nets.py
│   │   ├── buffer.py
│   │   ├── build_rlbench_env.py
│   │   ├── distributions.py
│   │   ├── env_list.py
│   │   ├── env_wrappers.py
│   │   ├── math_utils.py
│   │   ├── policy_networks.py
│   │   ├── utils.py
│   │   └── value_networks.py
│   ├── distributed
│   │   ├── __init__.py
│   │   ├── dis_components.py
│   │   ├── run_dis_train.sh
│   │   ├── start_dis_role.py
│   │   └── training_components.py
│   ├── interactive
│   │   ├── .gitignore
│   │   ├── common.py
│   │   ├── components.py
│   │   └── main.ipynb
│   └── run_rlzoo.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | *~
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *.cover
48 | .hypothesis/
49 | .pytest_cache/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | *.log
57 | local_settings.py
58 | db.sqlite3
59 |
60 | # Flask stuff:
61 | instance/
62 | .webassets-cache
63 |
64 | # Scrapy stuff:
65 | .scrapy
66 |
67 | # Sphinx documentation
68 | docs/_build/
69 | docs/test_build/
70 | docs/build_test/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # pyenv
79 | .python-version
80 |
81 | # celery beat schedule file
82 | celerybeat-schedule
83 |
84 | # SageMath parsed files
85 | *.sage.py
86 |
87 | # Environments
88 | .env
89 | .venv
90 | env/
91 | venv/
92 | ENV/
93 | env.bak/
94 | venv.bak/
95 | venv_/
96 | venv2/
97 | venv3/
98 | venv_doc/
99 | venv_py2/
100 |
101 | # Spyder project settings
102 | .spyderproject
103 | .spyproject
104 |
105 | # Rope project settings
106 | .ropeproject
107 |
108 | # mkdocs documentation
109 | /site
110 |
111 | # mypy
112 | .mypy_cache/
113 |
114 |
115 | # IDE Specific directories
116 | .DS_Store
117 | .idea
118 | .vscode/
119 |
120 | # TensorLayer Directories
121 | checkpoints
122 | data/
123 | lib_win/
124 |
125 | # Custom Scripts
126 | update_tl.bat
127 | update_tl.py
128 |
129 | # Data Files and ByteCode files
130 | *.gz
131 | *.npz
132 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include examples.md LICENSE.txt requirements.txt README.md
2 | recursive-include rlzoo *.py
3 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/algorithms/a3c.rst:
--------------------------------------------------------------------------------
1 | A3C
2 | =================================
3 |
4 | Example
5 | -----------
6 |
7 | .. code-block:: python
8 | :linenos:
9 |
10 | from rlzoo.common.env_wrappers import build_env
11 | from rlzoo.common.utils import call_default_params
12 | from rlzoo.algorithms import A3C
13 |
14 | AlgName = 'A3C'
15 | EnvName = 'PongNoFrameskip-v4'
16 | EnvType = 'atari'
17 |
18 | # EnvName = 'Pendulum-v0' # only continuous action
19 | # EnvType = 'classic_control'
20 |
21 | # EnvName = 'BipedalWalker-v2'
22 | # EnvType = 'box2d'
23 |
24 | # EnvName = 'Ant-v2'
25 | # EnvType = 'mujoco'
26 |
27 | # EnvName = 'FetchPush-v1'
28 | # EnvType = 'robotics'
29 |
30 | # EnvName = 'FishSwim-v0'
31 | # EnvType = 'dm_control'
32 |
33 | number_workers = 2 # need to specify number of parallel workers
34 | env = build_env(EnvName, EnvType, nenv=number_workers)
35 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
36 | alg = eval(AlgName+'(**alg_params)')
37 | alg.learn(env=env, mode='train', render=False, **learn_params)
38 | alg.learn(env=env, mode='test', render=True, **learn_params)
39 |
40 | Asynchronous Advantage Actor-Critic
41 | ----------------------------------------
42 |
43 | .. autoclass:: rlzoo.algorithms.a3c.a3c.A3C
44 | :members:
45 | :undoc-members:
46 |
47 | Default Hyper-parameters
48 | ----------------------------------
49 |
50 | .. automodule:: rlzoo.algorithms.a3c.default
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
--------------------------------------------------------------------------------
/docs/algorithms/ac.rst:
--------------------------------------------------------------------------------
1 | AC
2 | ===========================
3 |
4 | Example
5 | -----------
6 |
7 | .. code-block:: python
8 | :linenos:
9 |
10 | from rlzoo.common.env_wrappers import build_env
11 | from rlzoo.common.utils import call_default_params
12 | from rlzoo.algorithms import AC
13 |
14 | AlgName = 'AC'
15 | EnvName = 'PongNoFrameskip-v4'
16 | EnvType = 'atari'
17 |
18 | # EnvName = 'Pendulum-v0'
19 | # EnvType = 'classic_control'
20 |
21 | # EnvName = 'BipedalWalker-v2'
22 | # EnvType = 'box2d'
23 |
24 | # EnvName = 'Ant-v2'
25 | # EnvType = 'mujoco'
26 |
27 | # EnvName = 'FetchPush-v1'
28 | # EnvType = 'robotics'
29 |
30 | # EnvName = 'FishSwim-v0'
31 | # EnvType = 'dm_control'
32 |
33 | # EnvName = 'ReachTarget'
34 | # EnvType = 'rlbench'
35 |
36 | env = build_env(EnvName, EnvType)
37 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
38 | alg = eval(AlgName+'(**alg_params)')
39 | alg.learn(env=env, mode='train', render=False, **learn_params)
40 | alg.learn(env=env, mode='test', render=True, **learn_params)
41 |
42 | Actor-Critic
43 | ---------------------------------
44 |
45 | .. autoclass:: rlzoo.algorithms.ac.ac.AC
46 | :members:
47 | :undoc-members:
48 |
49 | Default Hyper-parameters
50 | ----------------------------------
51 |
52 | .. automodule:: rlzoo.algorithms.ac.default
53 | :members:
54 | :undoc-members:
55 | :show-inheritance:
56 |
57 |
--------------------------------------------------------------------------------
/docs/algorithms/ddpg.rst:
--------------------------------------------------------------------------------
1 | DDPG
2 | ===========================
3 |
4 | Example
5 | -----------
6 |
7 | .. code-block:: python
8 | :linenos:
9 |
10 | from rlzoo.common.env_wrappers import build_env
11 | from rlzoo.common.utils import call_default_params
12 | from rlzoo.algorithms import DDPG
13 |
14 | AlgName = 'DDPG'
15 | EnvName = 'Pendulum-v0' # only continuous action
16 | EnvType = 'classic_control'
17 |
18 | # EnvName = 'BipedalWalker-v2'
19 | # EnvType = 'box2d'
20 |
21 | # EnvName = 'Ant-v2'
22 | # EnvType = 'mujoco'
23 |
24 | # EnvName = 'FetchPush-v1'
25 | # EnvType = 'robotics'
26 |
27 | # EnvName = 'FishSwim-v0'
28 | # EnvType = 'dm_control'
29 |
30 | # EnvName = 'ReachTarget'
31 | # EnvType = 'rlbench'
32 |
33 | env = build_env(EnvName, EnvType)
34 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
35 | alg = eval(AlgName+'(**alg_params)')
36 | alg.learn(env=env, mode='train', render=False, **learn_params)
37 | alg.learn(env=env, mode='test', render=True, **learn_params)
38 |
39 | Deep Deterministic Policy Gradient
40 | -----------------------------------
41 |
42 | .. autoclass:: rlzoo.algorithms.ddpg.ddpg.DDPG
43 | :members:
44 | :undoc-members:
45 |
46 | Default Hyper-parameters
47 | ----------------------------------
48 |
49 | .. automodule:: rlzoo.algorithms.ddpg.default
50 | :members:
51 | :undoc-members:
52 | :show-inheritance:
53 |
54 |
--------------------------------------------------------------------------------
/docs/algorithms/dppo.rst:
--------------------------------------------------------------------------------
1 | DPPO
2 | ===========================
3 |
4 | Example
5 | -----------
6 |
7 | .. code-block:: python
8 | :linenos:
9 |
10 | from rlzoo.common.env_wrappers import build_env
11 | from rlzoo.common.utils import call_default_params
12 | from rlzoo.algorithms import DPPO
13 |
14 | EnvName = 'PongNoFrameskip-v4'
15 | EnvType = 'atari'
16 |
17 | # EnvName = 'Pendulum-v0'
18 | # EnvType = 'classic_control'
19 |
20 | # EnvName = 'BipedalWalker-v2'
21 | # EnvType = 'box2d'
22 |
23 | # EnvName = 'Ant-v2'
24 | # EnvType = 'mujoco'
25 |
26 | # EnvName = 'FetchPush-v1'
27 | # EnvType = 'robotics'
28 |
29 | # EnvName = 'FishSwim-v0'
30 | # EnvType = 'dm_control'
31 |
32 | # EnvName = 'ReachTarget'
33 | # EnvType = 'rlbench'
34 |
35 | number_workers = 2 # need to specify number of parallel workers
36 | env = build_env(EnvName, EnvType, nenv=number_workers)
37 | alg_params, learn_params = call_default_params(env, EnvType, 'DPPO')
38 | alg = DPPO(method='penalty', **alg_params) # specify 'clip' or 'penalty' method for PPO
39 | alg.learn(env=env, mode='train', render=False, **learn_params)
40 | alg.learn(env=env, mode='test', render=True, **learn_params)
41 |
42 | Distributed Proximal Policy Optimization (Penalty)
43 | ----------------------------------------------------
44 |
45 | .. autoclass:: rlzoo.algorithms.dppo_penalty.dppo_penalty.DPPO_PENALTY
46 | :members:
47 | :undoc-members:
48 |
49 |
50 | Distributed Proximal Policy Optimization (Clip)
51 | ------------------------------------------------
52 |
53 | .. autoclass:: rlzoo.algorithms.dppo_clip.dppo_clip.DPPO_CLIP
54 | :members:
55 | :undoc-members:
56 |
57 | Default Hyper-parameters
58 | ----------------------------------
59 |
60 | .. automodule:: rlzoo.algorithms.dppo.default
61 | :members:
62 | :undoc-members:
63 | :show-inheritance:
64 |
65 |
--------------------------------------------------------------------------------
/docs/algorithms/dqn.rst:
--------------------------------------------------------------------------------
1 | DQN and Variants
2 | =================================
3 |
4 | Example
5 | ------------
6 |
7 | .. code-block:: python
8 | :linenos:
9 |
10 | from rlzoo.common.env_wrappers import build_env
11 | from rlzoo.common.utils import call_default_params
12 | from rlzoo.algorithms import DQN
13 |
14 | AlgName = 'DQN'
15 | EnvName = 'PongNoFrameskip-v4'
16 | EnvType = 'atari'
17 |
18 | # EnvName = 'CartPole-v1'
19 | # EnvType = 'classic_control' # the name of env needs to match the type of env
20 |
21 | env = build_env(EnvName, EnvType)
22 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
23 | alg = eval(AlgName+'(**alg_params)')
24 | alg.learn(env=env, mode='train', **learn_params)
25 | alg.learn(env=env, mode='test', render=True, **learn_params)
26 |
27 | Deep Q-Networks
28 | ---------------------------------
29 |
30 | .. autoclass:: rlzoo.algorithms.dqn.dqn.DQN
31 | :members:
32 | :undoc-members:
33 |
34 | Default Hyper-parameters
35 | ----------------------------------
36 |
37 | .. automodule:: rlzoo.algorithms.dqn.default
38 | :members:
39 | :undoc-members:
40 | :show-inheritance:
41 |
42 |
--------------------------------------------------------------------------------
/docs/algorithms/pg.rst:
--------------------------------------------------------------------------------
1 | VPG
2 | =================================
3 |
4 | Example
5 | -----------
6 |
7 | .. code-block:: python
8 | :linenos:
9 |
10 | from rlzoo.common.env_wrappers import build_env
11 | from rlzoo.common.utils import call_default_params
12 | from rlzoo.algorithms import PG
13 |
14 | AlgName = 'PG'
15 | EnvName = 'PongNoFrameskip-v4'
16 | EnvType = 'atari'
17 |
18 | # EnvName = 'CartPole-v0'
19 | # EnvType = 'classic_control'
20 |
21 | # EnvName = 'BipedalWalker-v2'
22 | # EnvType = 'box2d'
23 |
24 | # EnvName = 'Ant-v2'
25 | # EnvType = 'mujoco'
26 |
27 | # EnvName = 'FetchPush-v1'
28 | # EnvType = 'robotics'
29 |
30 | # EnvName = 'FishSwim-v0'
31 | # EnvType = 'dm_control'
32 |
33 | # EnvName = 'ReachTarget'
34 | # EnvType = 'rlbench'
35 |
36 | env = build_env(EnvName, EnvType)
37 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
38 | alg = eval(AlgName+'(**alg_params)')
39 | alg.learn(env=env, mode='train', render=False, **learn_params)
40 | alg.learn(env=env, mode='test', render=True, **learn_params)
41 |
42 | Vanilla Policy Gradient
43 | ---------------------------------
44 |
45 | .. autoclass:: rlzoo.algorithms.pg.pg.PG
46 | :members:
47 | :undoc-members:
48 |
49 | Default Hyper-parameters
50 | ----------------------------------
51 |
52 | .. automodule:: rlzoo.algorithms.pg.default
53 | :members:
54 | :undoc-members:
55 | :show-inheritance:
56 |
57 |
--------------------------------------------------------------------------------
/docs/algorithms/ppo.rst:
--------------------------------------------------------------------------------
1 | PPO
2 | ===========================
3 |
4 | Example
5 | -----------
6 |
7 | .. code-block:: python
8 | :linenos:
9 |
10 | from rlzoo.common.env_wrappers import build_env
11 | from rlzoo.common.utils import call_default_params
12 | from rlzoo.algorithms import PPO
13 |
14 | EnvName = 'PongNoFrameskip-v4'
15 | EnvType = 'atari'
16 |
17 | # EnvName = 'Pendulum-v0'
18 | # EnvType = 'classic_control'
19 |
20 | # EnvName = 'BipedalWalker-v2'
21 | # EnvType = 'box2d'
22 |
23 | # EnvName = 'Ant-v2'
24 | # EnvType = 'mujoco'
25 |
26 | # EnvName = 'FetchPush-v1'
27 | # EnvType = 'robotics'
28 |
29 | # EnvName = 'FishSwim-v0'
30 | # EnvType = 'dm_control'
31 |
32 | # EnvName = 'ReachTarget'
33 | # EnvType = 'rlbench'
34 |
35 | env = build_env(EnvName, EnvType)
36 | alg_params, learn_params = call_default_params(env, EnvType, 'PPO')
37 | alg = PPO(method='clip', **alg_params) # specify 'clip' or 'penalty' method for PPO
38 | alg.learn(env=env, mode='train', render=False, **learn_params)
39 | alg.learn(env=env, mode='test', render=False, **learn_params)
40 |
41 | Proximal Policy Optimization (Penalty)
42 | ----------------------------------------------------
43 |
44 | .. autoclass:: rlzoo.algorithms.ppo_penalty.ppo_penalty.PPO_PENALTY
45 | :members:
46 | :undoc-members:
47 |
48 |
49 | Proximal Policy Optimization (Clip)
50 | ------------------------------------------------
51 |
52 | .. autoclass:: rlzoo.algorithms.ppo_clip.ppo_clip.PPO_CLIP
53 | :members:
54 | :undoc-members:
55 |
56 | Default Hyper-parameters
57 | ----------------------------------
58 |
59 | .. automodule:: rlzoo.algorithms.ppo.default
60 | :members:
61 | :undoc-members:
62 | :show-inheritance:
63 |
64 |
--------------------------------------------------------------------------------
/docs/algorithms/sac.rst:
--------------------------------------------------------------------------------
1 | SAC
2 | ===========================
3 |
4 | Example
5 | -----------
6 |
7 | .. code-block:: python
8 | :linenos:
9 |
10 | from rlzoo.common.env_wrappers import build_env
11 | from rlzoo.common.utils import call_default_params
12 | from rlzoo.algorithms import SAC
13 |
14 | AlgName = 'SAC'
15 | EnvName = 'Pendulum-v0' # only continuous action
16 | EnvType = 'classic_control'
17 |
18 | # EnvName = 'BipedalWalker-v2'
19 | # EnvType = 'box2d'
20 |
21 | # EnvName = 'Ant-v2'
22 | # EnvType = 'mujoco'
23 |
24 | # EnvName = 'FetchPush-v1'
25 | # EnvType = 'robotics'
26 |
27 | # EnvName = 'FishSwim-v0'
28 | # EnvType = 'dm_control'
29 |
30 | # EnvName = 'ReachTarget'
31 | # EnvType = 'rlbench'
32 |
33 | env = build_env(EnvName, EnvType)
34 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
35 | alg = eval(AlgName+'(**alg_params)')
36 | alg.learn(env=env, mode='train', render=False, **learn_params)
37 | alg.learn(env=env, mode='test', render=True, **learn_params)
38 |
39 | Soft Actor-Critic
40 | ---------------------------------
41 |
42 | .. autoclass:: rlzoo.algorithms.sac.sac.SAC
43 | :members:
44 | :undoc-members:
45 |
46 | Default Hyper-parameters
47 | ----------------------------------
48 |
49 | .. automodule:: rlzoo.algorithms.sac.default
50 | :members:
51 | :undoc-members:
52 | :show-inheritance:
53 |
54 |
--------------------------------------------------------------------------------
/docs/algorithms/td3.rst:
--------------------------------------------------------------------------------
1 | TD3
2 | ===========================
3 |
4 | Example
5 | -----------
6 |
7 | .. code-block:: python
8 | :linenos:
9 |
10 | from rlzoo.common.env_wrappers import build_env
11 | from rlzoo.common.utils import call_default_params
12 | from rlzoo.algorithms import TD3
13 |
14 | AlgName = 'TD3'
15 | EnvName = 'Pendulum-v0' # only continuous action
16 | EnvType = 'classic_control'
17 |
18 | # EnvName = 'BipedalWalker-v2'
19 | # EnvType = 'box2d'
20 |
21 | # EnvName = 'Ant-v2'
22 | # EnvType = 'mujoco'
23 |
24 | # EnvName = 'FetchPush-v1'
25 | # EnvType = 'robotics'
26 |
27 | # EnvName = 'FishSwim-v0'
28 | # EnvType = 'dm_control'
29 |
30 | # EnvName = 'ReachTarget'
31 | # EnvType = 'rlbench'
32 |
33 | env = build_env(EnvName, EnvType)
34 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
35 | alg = eval(AlgName+'(**alg_params)')
36 | alg.learn(env=env, mode='train', render=False, **learn_params)
37 | alg.learn(env=env, mode='test', render=True, **learn_params)
38 |
39 | Twin Delayed DDPG
40 | ---------------------------------
41 |
42 | .. autoclass:: rlzoo.algorithms.td3.td3.TD3
43 | :members:
44 | :undoc-members:
45 |
46 | Default Hyper-parameters
47 | ----------------------------------
48 |
49 | .. automodule:: rlzoo.algorithms.td3.default
50 | :members:
51 | :undoc-members:
52 | :show-inheritance:
53 |
54 |
--------------------------------------------------------------------------------
/docs/algorithms/trpo.rst:
--------------------------------------------------------------------------------
1 | TRPO
2 | ===========================
3 |
4 | Example
5 | -----------
6 |
7 | .. code-block:: python
8 | :linenos:
9 |
10 | from rlzoo.common.env_wrappers import build_env
11 | from rlzoo.common.utils import call_default_params
12 | from rlzoo.algorithms import TRPO
13 |
14 | AlgName = 'TRPO'
15 | EnvName = 'PongNoFrameskip-v4'
16 | EnvType = 'atari'
17 |
18 | # EnvName = 'CartPole-v0'
19 | # EnvType = 'classic_control'
20 |
21 | # EnvName = 'BipedalWalker-v2'
22 | # EnvType = 'box2d'
23 |
24 | # EnvName = 'Ant-v2'
25 | # EnvType = 'mujoco'
26 |
27 | # EnvName = 'FetchPush-v1'
28 | # EnvType = 'robotics'
29 |
30 | # EnvName = 'FishSwim-v0'
31 | # EnvType = 'dm_control'
32 |
33 | # EnvName = 'ReachTarget'
34 | # EnvType = 'rlbench'
35 |
36 | env = build_env(EnvName, EnvType)
37 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
38 | alg = eval(AlgName+'(**alg_params)')
39 | alg.learn(env=env, mode='train', render=False, **learn_params)
40 | alg.learn(env=env, mode='test', render=True, **learn_params)
41 |
42 | Trust Region Policy Optimization
43 | ---------------------------------
44 |
45 | .. autoclass:: rlzoo.algorithms.trpo.trpo.TRPO
46 | :members:
47 | :undoc-members:
48 |
49 | Default Hyper-parameters
50 | ----------------------------------
51 |
52 | .. automodule:: rlzoo.algorithms.trpo.default
53 | :members:
54 | :undoc-members:
55 | :show-inheritance:
56 |
57 |
--------------------------------------------------------------------------------
/docs/common/basicnets.rst:
--------------------------------------------------------------------------------
1 | Basic Networks
2 | ===========================
3 |
4 |
5 | Basic Networks in RLzoo
6 | ---------------------------------
7 |
8 | .. automodule:: rlzoo.common.basic_nets
9 | :members:
10 | :undoc-members:
11 | :show-inheritance:
12 |
--------------------------------------------------------------------------------
/docs/common/buffer.rst:
--------------------------------------------------------------------------------
1 | Replay Buffer
2 | ===========================
3 |
4 |
5 | Replay Buffer in RLzoo
6 | ---------------------------------
7 |
8 | .. automodule:: rlzoo.common.buffer
9 | :members:
10 | :undoc-members:
11 | :show-inheritance:
12 | :special-members:
13 |
14 |
--------------------------------------------------------------------------------
/docs/common/distributions.rst:
--------------------------------------------------------------------------------
1 | Distributions
2 | ===========================
3 |
4 |
5 | Distributions for Stochastic Policy in RLzoo
6 | ----------------------------------------------
7 |
8 | .. automodule:: rlzoo.common.distributions
9 | :members:
10 | :undoc-members:
11 | :show-inheritance:
12 |
13 |
--------------------------------------------------------------------------------
/docs/common/envlist.rst:
--------------------------------------------------------------------------------
1 | Environment List
2 | ===========================
3 |
4 | .. _env_list:
5 |
6 | List of Supported Environments in RLzoo
7 | ----------------------------------------
8 |
9 | .. automodule:: rlzoo.common.env_list
10 | :members:
11 | :undoc-members:
12 | :show-inheritance:
13 |
14 |
15 |
16 | .. literalinclude:: ../../rlzoo/common/env_list.py
17 | :language: python
18 | :lines: 10-
19 | :linenos:
--------------------------------------------------------------------------------
/docs/common/envwrappers.rst:
--------------------------------------------------------------------------------
1 | Environment Wrappers
2 | ===========================
3 |
4 |
5 | Environment Wrappers in RLzoo
6 | ---------------------------------
7 |
8 | .. automodule:: rlzoo.common.env_wrappers
9 | :members:
10 | :undoc-members:
11 | :show-inheritance:
12 |
13 |
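14 | Example
15 | ---------------------------------
16 |
17 | A minimal usage sketch of ``build_env`` from the module documented above, mirroring the Quick Start example (the environment name and type here are just illustrative choices):
18 |
19 | .. code-block:: python
20 | :linenos:
21 |
22 | from rlzoo.common.env_wrappers import build_env
23 |
24 | # build a single wrapped environment by its name and type
25 | env = build_env('Pendulum-v0', 'classic_control')
26 |
27 | # parallel algorithms such as A3C and DPPO take the number of vectorized environments via nenv
28 | env = build_env('Pendulum-v0', 'classic_control', nenv=2)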
--------------------------------------------------------------------------------
/docs/common/mathutils.rst:
--------------------------------------------------------------------------------
1 | Math Utilities
2 | ===========================
3 |
4 |
5 | Math Utilities in RLzoo
6 | ---------------------------------
7 |
8 | .. automodule:: rlzoo.common.math_utils
9 | :members:
10 | :undoc-members:
11 | :show-inheritance:
12 |
13 |
--------------------------------------------------------------------------------
/docs/common/policynets.rst:
--------------------------------------------------------------------------------
1 | Policy Networks
2 | ===========================
3 |
4 |
5 | Policy Networks in RLzoo
6 | ---------------------------------
7 |
8 | .. autoclass:: rlzoo.common.policy_networks.StochasticContinuousPolicyNetwork
9 | :members:
10 | :undoc-members:
11 | :show-inheritance:
12 | :special-members:
13 |
14 |
15 | .. autoclass:: rlzoo.common.policy_networks.DeterministicContinuousPolicyNetwork
16 | :members:
17 | :undoc-members:
18 | :show-inheritance:
19 | :special-members:
20 |
21 | .. autoclass:: rlzoo.common.policy_networks.DeterministicPolicyNetwork
22 | :members:
23 | :undoc-members:
24 | :show-inheritance:
25 | :special-members:
26 |
27 | .. autoclass:: rlzoo.common.policy_networks.StochasticPolicyNetwork
28 | :members:
29 | :undoc-members:
30 | :show-inheritance:
31 | :special-members:
32 |
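33 | Example
34 | ---------------------------------
35 |
36 | A minimal construction sketch following the Quick Start guide (the hidden layer sizes and output activation are illustrative, and the exact constructor signature may differ across versions):
37 |
38 | .. code-block:: python
39 | :linenos:
40 |
41 | import gym
42 | import tensorflow as tf
43 | from rlzoo.common.policy_networks import StochasticPolicyNetwork
44 |
45 | env = gym.make('CartPole-v0').unwrapped
46 | # a stochastic policy network built from the environment's observation and action spaces
47 | actor = StochasticPolicyNetwork(env.observation_space, env.action_space, hidden_dim_list=2 * [64], output_activation=tf.nn.tanh)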
--------------------------------------------------------------------------------
/docs/common/utils.rst:
--------------------------------------------------------------------------------
1 | Common Utilities
2 | ===========================
3 |
4 |
5 | Common Utilities in RLzoo
6 | ---------------------------------
7 |
8 | .. automodule:: rlzoo.common.utils
9 | :members:
10 | :undoc-members:
11 | :show-inheritance:
12 |
13 |
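14 | Example
15 | ---------------------------------
16 |
17 | A minimal sketch of the two utilities used throughout this documentation, ``set_seed`` and ``call_default_params`` (the environment and algorithm names are illustrative):
18 |
19 | .. code-block:: python
20 | :linenos:
21 |
22 | from rlzoo.common.env_wrappers import build_env
23 | from rlzoo.common.utils import call_default_params, set_seed
24 |
25 | env = build_env('Pendulum-v0', 'classic_control')
26 | set_seed(2, env)  # make the run reproducible
27 | # fetch the default algorithm and learning hyper-parameters for SAC on this environment
28 | alg_params, learn_params = call_default_params(env, 'classic_control', 'SAC')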
--------------------------------------------------------------------------------
/docs/common/valuenets.rst:
--------------------------------------------------------------------------------
1 | Value Networks
2 | ===========================
3 |
4 |
5 | Value Networks in RLzoo
6 | ---------------------------------
7 |
8 | .. autoclass:: rlzoo.common.value_networks.ValueNetwork
9 | :members:
10 | :undoc-members:
11 | :special-members:
12 |
13 | .. autoclass:: rlzoo.common.value_networks.MlpQNetwork
14 | :members:
15 | :undoc-members:
16 | :special-members:
17 |
18 | .. autoclass:: rlzoo.common.value_networks.QNetwork
19 | :members:
20 | :undoc-members:
21 | :special-members:
22 |
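23 | Example
24 | ---------------------------------
25 |
26 | A minimal construction sketch for ``ValueNetwork`` following the Quick Start guide (the hidden layer sizes are illustrative):
27 |
28 | .. code-block:: python
29 | :linenos:
30 |
31 | import gym
32 | from rlzoo.common.value_networks import ValueNetwork
33 |
34 | env = gym.make('CartPole-v0').unwrapped
35 | # a state-value network built from the observation space, with two hidden layers of 64 units
36 | critic = ValueNetwork(env.observation_space, hidden_dim_list=2 * [64])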
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 |
16 | sys.path.insert(0, os.path.abspath("../")) # Important
17 | sys.path.insert(0, os.path.abspath(os.path.join("..", "rlzoo"))) # Important
18 |
19 | # from rlzoo.algorithms import *
20 | import sphinx_rtd_theme
21 |
22 | # -- Project information -----------------------------------------------------
23 |
24 | project = 'RLzoo'
25 | copyright = '2020, Zihan Ding, Tianyang Yu, Yanhua Huang, Hongming Zhang, Hao Dong'
26 | author = 'Zihan Ding, Tianyang Yu, Yanhua Huang, Hongming Zhang, Hao Dong'
27 |
28 | # The full version, including alpha/beta/rc tags
29 | release = '1.0.3'
30 |
31 |
32 | # -- General configuration ---------------------------------------------------
33 |
34 | # Add any Sphinx extension module names here, as strings. They can be
35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
36 | # ones.
37 |
38 | extensions = [
39 | 'sphinx.ext.autodoc',
40 | 'sphinx.ext.doctest',
41 | 'sphinx.ext.intersphinx',
42 | 'sphinx.ext.coverage',
43 | 'sphinx.ext.imgmath',
44 | 'sphinx.ext.mathjax',
45 | 'sphinx.ext.ifconfig',
46 | 'sphinx.ext.viewcode',
47 | 'sphinx.ext.githubpages',
48 | # 'sphinxcontrib.bibtex',
49 | 'recommonmark'
50 | ]
51 |
52 | autodoc_mock_imports = [
53 | 'cv2',
54 | 'hyperdash',
55 | 'gridfs',
56 | 'horovod',
57 | 'hyperdash',
58 | 'imageio',
59 | 'lxml',
60 | 'matplotlib',
61 | 'nltk',
62 | # 'numpy',
63 | 'PIL',
64 | 'progressbar',
65 | 'pymongo',
66 | 'scipy',
67 | 'skimage',
68 | 'sklearn',
69 | # 'tensorflow',
70 | 'tqdm',
71 | 'h5py',
72 | # 'tensorlayer.third_party.roi_pooling.roi_pooling.roi_pooling_ops', # TL C++ Packages
73 | ]
74 |
75 |
76 | # Add any paths that contain templates here, relative to this directory.
77 | templates_path = ['_templates']
78 | source_suffix = ['.rst', '.md']
79 | master_doc = 'index'
80 |
81 | # List of patterns, relative to source directory, that match files and
82 | # directories to ignore when looking for source files.
83 | # This pattern also affects html_static_path and html_extra_path.
84 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
85 |
86 |
87 | # -- Options for HTML output -------------------------------------------------
88 |
89 | # The theme to use for HTML and HTML Help pages. See the documentation for
90 | # a list of builtin themes.
91 | #
92 | html_theme = 'sphinx_rtd_theme'
93 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
94 | html_logo = './img/rlzoo-logo.png'
95 |
96 |
97 | # Add any paths that contain custom static files (such as style sheets) here,
98 | # relative to this directory. They are copied after the builtin static files,
99 | # so a file named "default.css" will overwrite the builtin "default.css".
100 | html_static_path = ['_static']
101 |
--------------------------------------------------------------------------------
/docs/guide/api.rst:
--------------------------------------------------------------------------------
1 | API
2 | =================================
3 |
4 | build_env()
5 | ----------------------
6 |
7 | It can be used as:
8 |
9 | .. code-block:: python
10 | :linenos:
11 |
12 | env = build_env(EnvName, EnvType)
13 |
14 | call_default_params()
15 | ----------------------
16 |
17 | It can be used as:
18 |
19 | .. code-block:: python
20 | :linenos:
21 |
22 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
23 |
24 | ``call_default_params`` returns the default hyper-parameters in two dictionaries, ``alg_params`` and ``learn_params``, which can be printed to inspect their contents. Users can also modify entries in these dictionaries before instantiating the agent and starting the learning process, as shown in the combined example at the end of this page.
25 |
26 | If you want to know exactly where the default hyper-parameters come from, they are stored in a ``default.py`` script inside each algorithm's folder under ``./rlzoo/algorithms/``.
27 |
28 | alg.learn()
29 | ------------
30 |
31 | It can be used as:
32 |
33 | .. code-block:: python
34 | :linenos:
35 |
36 | # start the training
37 | alg.learn(env=env, mode='train', render=False, **learn_params)
38 | # test after training
39 | alg.learn(env=env, mode='test', render=True, **learn_params)
40 |
41 | where ``alg`` is an instantiation of a DRL algorithm in RLzoo.
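42 |
43 | Putting it together
44 | ----------------------
45 |
46 | The following is a minimal sketch combining the three calls above, including printing and overriding a default hyper-parameter before training. The available keys differ per algorithm, so print the dictionaries first; the ``'gamma'`` key below is only an illustration.
47 |
48 | .. code-block:: python
49 | :linenos:
50 |
51 | from rlzoo.common.env_wrappers import build_env
52 | from rlzoo.common.utils import call_default_params
53 | from rlzoo.algorithms import SAC
54 |
55 | env = build_env('Pendulum-v0', 'classic_control')
56 | alg_params, learn_params = call_default_params(env, 'classic_control', 'SAC')
57 | print(alg_params)    # inspect the default algorithm hyper-parameters
58 | print(learn_params)  # inspect the default learning hyper-parameters
59 | alg_params['gamma'] = 0.95  # illustrative override; only change keys present in the printed dictionary
60 | alg = SAC(**alg_params)
61 | alg.learn(env=env, mode='train', render=False, **learn_params)
62 | alg.learn(env=env, mode='test', render=True, **learn_params)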
--------------------------------------------------------------------------------
/docs/guide/configuration.rst:
--------------------------------------------------------------------------------
1 | Configurations Overview
2 | =================================
3 |
4 | Supported DRL Algorithms
5 | --------------------------
6 | Generally, RLzoo supports the following DRL algorithms:
7 |
8 | **Value-based methods**
9 |
10 | * `Deep Q-Networks (DQN) `_
11 | * `Double DQN `_
12 | * `Dueling DQN `_
13 | * `Prioritized Experience Replay (PER) `_
14 | * `Retrace `_
15 | * `Noisy DQN `_
16 | * `Distributed DQN `_
17 |
18 | **Policy-based methods**
19 |
20 | * `Vanilla Policy Gradient (VPG) `_
21 | * `Trust Region Policy Optimization (TRPO) `_
22 | * `Proximal Policy Optimization (PPO) `_
23 | * `Distributed PPO (DPPO) `_
24 |
25 | **Actor-critic methods**
26 |
27 | * `Actor-Critic (AC) `_
28 | * `Asynchronous Advantage Actor-Critic (A3C) `_
29 | * `Deep Deterministic Policy Gradient (DDPG) `_
30 | * `Twin Delayed DDPG (TD3) `_
31 | * `Soft Actor-Critic (SAC) `_
32 |
33 |
34 | Supported Environments
35 | --------------------------
36 | Generally, RLzoo supports the following environments for DRL:
37 |
38 | * `OpenAI Gym `_
39 | * Atari
40 | * Box2D
41 | * Classic Control
42 | * MuJoCo
43 | * Robotics
44 | * `DeepMind Control Suite `_
45 |
46 | * `RLBench `_
47 |
48 |
49 | Full list of specific names of environments supported in RLzoo can be checked in :ref:`env_list`.
50 |
51 | Supported Configurations
52 | -----------------------------
53 | As in other libraries, not every configuration (a specific RL algorithm on a specific environment) is supported in RLzoo. The supported configurations of RL algorithms with their corresponding environments are listed in the following table.
54 |
55 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
56 | | Algorithms | Action Space | Policy | Update | Envs |
57 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
58 | | DQN (double, dueling, PER) | Discrete Only | NA | Off-policy | Atari, Classic Control |
59 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
60 | | AC | Discrete/Continuous | Stochastic | On-policy | All |
61 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
62 | | PG | Discrete/Continuous | Stochastic | On-policy | All |
63 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
64 | | DDPG | Continuous | Deterministic | Off-policy | Classic Control, Box2D, MuJoCo, Robotics, DeepMind Control, RLBench |
65 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
66 | | TD3 | Continuous | Deterministic | Off-policy | Classic Control, Box2D, MuJoCo, Robotics, DeepMind Control, RLBench |
67 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
68 | | SAC | Continuous | Stochastic | Off-policy | Classic Control, Box2D, MuJoCo, Robotics, DeepMind Control, RLBench |
69 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
70 | | A3C | Discrete/Continuous | Stochastic | On-policy | Atari, Classic Control, Box2D, MuJoCo, Robotics, DeepMind Control |
71 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
72 | | PPO | Discrete/Continuous | Stochastic | On-policy | All |
73 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
74 | | DPPO | Discrete/Continuous | Stochastic | On-policy | Atari, Classic Control, Box2D, MuJoCo, Robotics, DeepMind Control |
75 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
76 | | TRPO | Discrete/Continuous | Stochastic | On-policy | All |
77 | +----------------------------+---------------------+---------------+------------+---------------------------------------------------------------------+
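78 |
79 | For example, the table above lists DQN as supported on Atari and Classic Control with discrete actions only. A minimal sketch of such a supported configuration, mirroring the algorithm example pages, is:
80 |
81 | .. code-block:: python
82 | :linenos:
83 |
84 | from rlzoo.common.env_wrappers import build_env
85 | from rlzoo.common.utils import call_default_params
86 | from rlzoo.algorithms import DQN
87 |
88 | # a discrete-action environment, as required by DQN
89 | env = build_env('CartPole-v1', 'classic_control')
90 | alg_params, learn_params = call_default_params(env, 'classic_control', 'DQN')
91 | alg = DQN(**alg_params)
92 | alg.learn(env=env, mode='train', **learn_params)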
--------------------------------------------------------------------------------
/docs/guide/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | =================================
3 |
4 | RLzoo generally requires Python>=3.5. If you want to use the DeepMind Control Suite environments, Python 3.6 is required.
5 |
6 | Direct installation:
7 |
8 | .. code-block:: bash
9 | :linenos:
10 |
11 | pip3 install rlzoo --upgrade
12 |
13 | Install from the source code on GitHub:
14 |
15 | .. code-block:: bash
16 | :linenos:
17 |
18 | git clone https://github.com/tensorlayer/RLzoo.git
19 | cd RLzoo
20 | pip3 install .
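21 |
22 | To verify the installation, a quick sanity check is to import the main entry points in Python (assuming no optional simulators such as RLBench are needed):
23 |
24 | .. code-block:: python
25 | :linenos:
26 |
27 | # these imports should succeed after a successful installation
28 | from rlzoo.common.env_wrappers import build_env
29 | from rlzoo.common.utils import call_default_params
30 | from rlzoo.algorithms import DQN, SAC, PPO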
--------------------------------------------------------------------------------
/docs/guide/quickstart.rst:
--------------------------------------------------------------------------------
1 | Quick Start
2 | =================================
3 |
4 | Simple Usage
5 | ---------------
6 |
7 | Open ``./run_rlzoo.py``:
8 |
9 | .. code-block:: python
10 | :linenos:
11 |
12 | from rlzoo.common.env_wrappers import build_env
13 | from rlzoo.common.utils import call_default_params
14 | from rlzoo.algorithms import TD3
15 | # choose an algorithm
16 | AlgName = 'TD3'
17 | # select a corresponding environment type
18 | EnvType = 'classic_control'
19 | # choose an environment
20 | EnvName = 'Pendulum-v0'
21 | # build an environment with wrappers
22 | env = build_env(EnvName, EnvType)
23 | # call default parameters for the algorithm and learning process
24 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
25 | # instantiate the algorithm
26 | alg = eval(AlgName+'(**alg_params)')
27 | # start the training
28 | alg.learn(env=env, mode='train', render=False, **learn_params)
29 | # test after training
30 | alg.learn(env=env, mode='test', render=True, **learn_params)
31 |
32 |
33 | Run the example:
34 |
35 | .. code-block:: bash
36 |
37 | python run_rlzoo.py
38 |
39 |
40 | Choices for ``AlgName``: 'DQN', 'AC', 'A3C', 'DDPG', 'TD3', 'SAC', 'PG', 'TRPO', 'PPO', 'DPPO'
41 |
42 | Choices for ``EnvType``: 'atari', 'box2d', 'classic_control', 'mujoco', 'robotics', 'dm_control', 'rlbench'
43 |
44 | Choices for ``EnvName`` are listed in :ref:`env_list`
45 |
46 |
47 | Another Usage
48 | ---------------
49 |
50 | To provide more flexibility, here is another usage example of RLzoo with more explicit configurations, where users can pass in customized networks, optimizers, etc.
51 |
52 | .. code-block:: python
53 | :linenos:
54 |
55 | import gym
56 | import tensorflow as tf
57 | from rlzoo.common.utils import make_env, set_seed
58 | from rlzoo.algorithms import AC
59 | from rlzoo.common.value_networks import ValueNetwork
60 | from rlzoo.common.policy_networks import StochasticPolicyNetwork
61 |
62 | ''' load environment '''
63 | env = gym.make('CartPole-v0').unwrapped
64 | obs_space = env.observation_space
65 | act_space = env.action_space
66 | # reproducible
67 | seed = 2
68 | set_seed(seed, env)
69 |
70 | ''' build networks for the algorithm '''
71 | num_hidden_layer = 4  # number of hidden layers for the networks
72 | hidden_dim = 64  # dimension of hidden layers for the networks
73 | with tf.name_scope('AC'):
74 |     with tf.name_scope('Critic'):
75 |         # choose the critic network, can be replaced with customized network
76 |         critic = ValueNetwork(obs_space, hidden_dim_list=num_hidden_layer * [hidden_dim])
77 |     with tf.name_scope('Actor'):
78 |         # choose the actor network, can be replaced with customized network
79 |         actor = StochasticPolicyNetwork(obs_space, act_space, hidden_dim_list=num_hidden_layer * [hidden_dim], output_activation=tf.nn.tanh)
80 | net_list = [actor, critic]  # list of the networks
81 |
82 | ''' choose optimizers '''
83 | a_lr, c_lr = 1e-4, 1e-2  # a_lr: learning rate of the actor; c_lr: learning rate of the critic
84 | a_optimizer = tf.optimizers.Adam(a_lr)
85 | c_optimizer = tf.optimizers.Adam(c_lr)
86 | optimizers_list = [a_optimizer, c_optimizer]  # list of optimizers
87 |
88 | # initialize the algorithm model, with algorithm parameters passed in
89 | model = AC(net_list, optimizers_list)
90 | '''
91 | full list of arguments for the algorithm
92 | ----------------------------------------
93 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
94 | optimizers_list: a list of optimizers for all networks and differentiable variables
95 | gamma: discount factor of reward
96 | action_range: scale of action values
97 | '''
98 |
99 | # start the training process, with learning parameters passed in
100 | model.learn(env, train_episodes=500, max_steps=200,
101 |             save_interval=50, mode='train', render=False)
102 | '''
103 | full list of parameters for training
104 | -------------------------------------
105 | env: learning environment
106 | train_episodes: total number of episodes for training
107 | test_episodes: total number of episodes for testing
108 | max_steps: maximum number of steps for one episode
109 | save_interval: time steps for saving the weights and plotting the results
110 | mode: 'train' or 'test'
111 | render: if true, visualize the environment
112 | '''
113 |
114 | # test after training
115 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True)
116 |
117 |
118 |
119 | Interactive Configurations
120 | --------------------------
121 |
122 | We also provide an interactive learning configuration based on Jupyter Notebook and *ipywidgets*, where you can select the algorithm, environment, and general learning settings by simply clicking on dropdown lists and sliders.
123 | A video demonstrating the usage is shown below.
124 | The interactive mode can be used by running ``$ jupyter notebook`` and opening `rlzoo/interactive/main.ipynb `_.
125 |
126 | .. image:: ../../gif/interactive.gif
--------------------------------------------------------------------------------
/docs/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/docs/img/logo.png
--------------------------------------------------------------------------------
/docs/img/rlzoo-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/docs/img/rlzoo-logo.png
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. RLzoo documentation master file, created by
2 | sphinx-quickstart on Wed Apr 29 23:00:36 2020.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to Reinforcement Learning Zoo!
7 | ============================================
8 |
9 | .. image:: img/rlzoo-logo.png
10 | :width: 40 %
11 | :align: center
12 | :target: https://github.com/tensorlayer/rlzoo
13 |
14 | RLzoo is a collection of the most practical reinforcement learning algorithms, frameworks and applications, released on `Github `_ in November 2019. It is implemented with TensorFlow 2.0 and the neural network layer APIs of TensorLayer 2, to provide a hands-on, fast-developing approach for reinforcement learning practice and benchmarks. It supports basic toy-test environments like `OpenAI Gym `_ and `DeepMind Control Suite `_ with very simple configurations. Moreover, RLzoo supports the robot learning benchmark environment `RLBench `_ based on the V-REP/PyRep simulator. Other large-scale distributed training frameworks for more realistic scenarios with Unity 3D, MuJoCo, Bullet Physics, etc., will be supported in the future.
15 |
16 | We also provide novice-friendly `DRL Tutorials `_ for algorithm implementation, where each algorithm is implemented in an individual script. The tutorials serve as code examples for our Springer textbook `Deep Reinforcement Learning: Fundamentals, Research and Applications `_; you can get the free PDF if your institute has a Springer license.
17 |
18 | .. toctree::
19 | :maxdepth: 1
20 | :caption: User Guide
21 |
22 | guide/installation
23 | guide/quickstart
24 | guide/configuration
25 | guide/api
26 |
27 | .. toctree::
28 | :maxdepth: 1
29 | :caption: RL Algorithms
30 |
31 | algorithms/dqn
32 | algorithms/pg
33 | algorithms/ac
34 | algorithms/a3c
35 | algorithms/ddpg
36 | algorithms/td3
37 | algorithms/sac
38 | algorithms/trpo
39 | algorithms/ppo
40 | algorithms/dppo
41 |
42 | .. toctree::
43 | :maxdepth: 1
44 | :caption: Common
45 |
46 | common/basicnets
47 | common/policynets
48 | common/valuenets
49 | common/buffer
50 | common/distributions
51 | common/envwrappers
52 | common/envlist
53 | common/mathutils
54 | common/utils
55 |
56 | .. toctree::
57 | :maxdepth: 1
58 | :caption: Other Resources
59 |
60 | other/drl_book
61 | other/drl_tutorial
62 |
63 | Contributing
64 | ==================
65 |
66 | This project is under active development. If you want to join the core team, feel free to contact Zihan Ding at zhding[at]mail.ustc.edu.cn
67 |
68 | Citation
69 | ==================
70 |
71 | * :ref:`genindex`
72 | * :ref:`modindex`
73 | * :ref:`search`
74 |
75 |
76 | .. image:: img/logo.png
77 | :width: 70 %
78 | :align: center
79 | :target: https://github.com/tensorlayer/rlzoo
80 |
81 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: My Docs
2 |
--------------------------------------------------------------------------------
/docs/other/drl_book.rst:
--------------------------------------------------------------------------------
1 | DRL Book
2 | ==========
3 |
4 | .. image:: http://deep-reinforcement-learning-book.github.io/assets/images/cover_v1.png
5 | :width: 30 %
6 | :align: center
7 | :target: https://deepreinforcementlearningbook.org
8 |
9 | - You can get the `free PDF `__ if your institute has a Springer license.
10 |
11 | Deep reinforcement learning (DRL) relies on the intersection of reinforcement learning (RL) and deep learning (DL). It has been able to solve a wide range of complex decision-making tasks that were previously out of reach for a machine and famously contributed to the success of AlphaGo. Furthermore, it opens up numerous new applications in domains such as healthcare, robotics, smart grids, and finance.
12 |
13 | Divided into three main parts, this book provides a comprehensive and self-contained introduction to DRL. The first part introduces the foundations of DL, RL and widely used DRL methods and discusses their implementation. The second part covers selected DRL research topics, which are useful for those wanting to specialize in DRL research. To help readers gain a deep understanding of DRL and quickly apply the techniques in practice, the third part presents a range of applications, such as intelligent transportation systems and learning to run, with detailed explanations.
14 |
15 | The book is intended for computer science students, both undergraduate and postgraduate, who would like to learn DRL from scratch, practice its implementation, and explore the research topics. It also appeals to engineers and practitioners who do not have a strong machine learning background but want to quickly understand how DRL works and use the techniques in their applications.
16 |
17 | Editors
18 | --------
19 | - Hao Dong - Peking University
20 | - Zihan Ding - Princeton University
21 | - Shanghang Zhang - University of California, Berkeley
22 |
23 | Authors
24 | --------
25 | - Hao Dong - Peking University
26 | - Zihan Ding - Princeton University
27 | - Shanghang Zhang - University of California, Berkeley
28 | - Hang Yuan - Oxford University
29 | - Hongming Zhang - Peking University
30 | - Jingqing Zhang - Imperial College London
31 | - Yanhua Huang - Xiaohongshu Technology Co.
32 | - Tianyang Yu - Nanchang University
33 | - Huaqing Zhang - Google
34 | - Ruitong Huang - Borealis AI
35 |
36 |
37 | .. image:: https://deep-generative-models.github.io/files/web/water-bottom-min.png
38 | :width: 100 %
39 | :align: center
40 | :target: https://github.com/tensorlayer/tensorlayer/edit/master/examples/reinforcement_learning
41 |
42 |
43 |
--------------------------------------------------------------------------------
/docs/other/drl_tutorial.rst:
--------------------------------------------------------------------------------
1 | DRL Tutorial
2 | =================================
3 |
4 |
5 | .. image:: https://tensorlayer.readthedocs.io/en/latest/_images/tl_transparent_logo.png
6 | :width: 30 %
7 | :align: center
8 | :target: https://github.com/tensorlayer/tensorlayer/edit/master/examples/reinforcement_learning
9 |
10 |
11 | Different from RLzoo, which targets simple usage with **high-level APIs**, the `RL tutorial `__ aims to make each reinforcement learning algorithm simple, transparent and straightforward with **low-level APIs**, as this not only benefits new learners of reinforcement learning, but also provides convenience for senior researchers to test their new ideas quickly.
12 |
13 | .. image:: https://deep-generative-models.github.io/files/web/water-bottom-min.png
14 | :width: 100 %
15 | :align: center
16 | :target: https://github.com/tensorlayer/tensorlayer/edit/master/examples/reinforcement_learning
17 |
18 |
19 |
--------------------------------------------------------------------------------
/examples.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | ## Descriptions of Algorithms and Environments in RLzoo
4 |
5 | | Algorithms | Action Space | Policy | Update | Envs |
6 | | -------------------------- | ------------------- | ------------- | ---------- | ------------------------------------------------------------ |
7 | | DQN (double, dueling, PER) | Discrete Only | -- | Off-policy | Atari, Classic Control |
8 | | AC | Discrete/Continuous | Stochastic | On-policy | All |
9 | | PG | Discrete/Continuous | Stochastic | On-policy | All |
10 | | DDPG | Continuous | Deterministic | Off-policy | Classic Control, Box2D, Mujoco, Robotics, DeepMind Control, RLBench |
11 | | TD3 | Continuous | Deterministic | Off-policy | Classic Control, Box2D, Mujoco, Robotics, DeepMind Control, RLBench |
12 | | SAC | Continuous | Stochastic | Off-policy | Classic Control, Box2D, Mujoco, Robotics, DeepMind Control, RLBench |
13 | | A3C | Discrete/Continuous | Stochastic | On-policy | Atari, Classic Control, Box2D, Mujoco, Robotics, DeepMind Control |
14 | | PPO | Discrete/Continuous | Stochastic | On-policy | All |
15 | | DPPO | Discrete/Continuous | Stochastic | On-policy | Atari, Classic Control, Box2D, Mujoco, Robotics, DeepMind Control |
16 | | TRPO | Discrete/Continuous | Stochastic | On-policy | All |
17 |
18 |
19 |
20 | ## 1. Deep Q-Network (DQN)
21 |
22 | ```python
23 | AlgName = 'DQN'
24 | EnvName = 'PongNoFrameskip-v4'
25 | EnvType = 'atari'
26 | # EnvName = 'CartPole-v1'
27 | # EnvType = 'classic_control' # the name of env needs to match the type of env
28 |
29 | env = build_env(EnvName, EnvType)
30 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
31 | alg = eval(AlgName+'(**alg_params)')
32 | alg.learn(env=env, mode='train', **learn_params)
33 | alg.learn(env=env, mode='test', render=True, **learn_params)
34 |
35 | ```
36 |
37 | ## 2. Actor-Critic (AC)
38 |
39 | ```python
40 | AlgName = 'AC'
41 | EnvName = 'PongNoFrameskip-v4'
42 | EnvType = 'atari'
43 |
44 | # EnvName = 'Pendulum-v0'
45 | # EnvType = 'classic_control'
46 |
47 | # EnvName = 'BipedalWalker-v2'
48 | # EnvType = 'box2d'
49 |
50 | # EnvName = 'Ant-v2'
51 | # EnvType = 'mujoco'
52 |
53 | # EnvName = 'FetchPush-v1'
54 | # EnvType = 'robotics'
55 |
56 | # EnvName = 'FishSwim-v0'
57 | # EnvType = 'dm_control'
58 |
59 | # EnvName = 'ReachTarget'
60 | # EnvType = 'rlbench'
61 |
62 | env = build_env(EnvName, EnvType)
63 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
64 | alg = eval(AlgName+'(**alg_params)')
65 | alg.learn(env=env, mode='train', render=False, **learn_params)
66 | alg.learn(env=env, mode='test', render=True, **learn_params)
67 |
68 | ```
69 |
70 | ## 3. Policy Gradient (PG)
71 |
72 | ```python
73 | AlgName = 'PG'
74 | EnvName = 'PongNoFrameskip-v4'
75 | EnvType = 'atari'
76 |
77 | # EnvName = 'CartPole-v0'
78 | # EnvType = 'classic_control'
79 |
80 | # EnvName = 'BipedalWalker-v2'
81 | # EnvType = 'box2d'
82 |
83 | # EnvName = 'Ant-v2'
84 | # EnvType = 'mujoco'
85 |
86 | # EnvName = 'FetchPush-v1'
87 | # EnvType = 'robotics'
88 |
89 | # EnvName = 'FishSwim-v0'
90 | # EnvType = 'dm_control'
91 |
92 | # EnvName = 'ReachTarget'
93 | # EnvType = 'rlbench'
94 |
95 | env = build_env(EnvName, EnvType)
96 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
97 | alg = eval(AlgName+'(**alg_params)')
98 | alg.learn(env=env, mode='train', render=False, **learn_params)
99 | alg.learn(env=env, mode='test', render=True, **learn_params)
100 | ```
101 |
102 | ## 4. Deep Deterministic Policy Gradient (DDPG)
103 |
104 | ```python
105 | AlgName = 'DDPG'
106 | EnvName = 'Pendulum-v0' # only continuous action
107 | EnvType = 'classic_control'
108 |
109 | # EnvName = 'BipedalWalker-v2'
110 | # EnvType = 'box2d'
111 |
112 | # EnvName = 'Ant-v2'
113 | # EnvType = 'mujoco'
114 |
115 | # EnvName = 'FetchPush-v1'
116 | # EnvType = 'robotics'
117 |
118 | # EnvName = 'FishSwim-v0'
119 | # EnvType = 'dm_control'
120 |
121 | # EnvName = 'ReachTarget'
122 | # EnvType = 'rlbench'
123 |
124 | env = build_env(EnvName, EnvType)
125 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
126 | alg = eval(AlgName+'(**alg_params)')
127 | alg.learn(env=env, mode='train', render=False, **learn_params)
128 | alg.learn(env=env, mode='test', render=True, **learn_params)
129 |
130 | ```
131 |
132 |
133 |
134 | ## 5. Twin Delayed DDPG (TD3)
135 |
136 | ```python
137 | AlgName = 'TD3'
138 | EnvName = 'Pendulum-v0' # only continuous action
139 | EnvType = 'classic_control'
140 |
141 | # EnvName = 'BipedalWalker-v2'
142 | # EnvType = 'box2d'
143 |
144 | # EnvName = 'Ant-v2'
145 | # EnvType = 'mujoco'
146 |
147 | # EnvName = 'FetchPush-v1'
148 | # EnvType = 'robotics'
149 |
150 | # EnvName = 'FishSwim-v0'
151 | # EnvType = 'dm_control'
152 |
153 | # EnvName = 'ReachTarget'
154 | # EnvType = 'rlbench'
155 |
156 | env = build_env(EnvName, EnvType)
157 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
158 | alg = eval(AlgName+'(**alg_params)')
159 | alg.learn(env=env, mode='train', render=False, **learn_params)
160 | alg.learn(env=env, mode='test', render=True, **learn_params)
161 | ```
162 |
163 | ## 6. Soft Actor-Critic (SAC)
164 |
165 | ```python
166 | AlgName = 'SAC'
167 | EnvName = 'Pendulum-v0' # only continuous action
168 | EnvType = 'classic_control'
169 |
170 | # EnvName = 'BipedalWalker-v2'
171 | # EnvType = 'box2d'
172 |
173 | # EnvName = 'Ant-v2'
174 | # EnvType = 'mujoco'
175 |
176 | # EnvName = 'FetchPush-v1'
177 | # EnvType = 'robotics'
178 |
179 | # EnvName = 'FishSwim-v0'
180 | # EnvType = 'dm_control'
181 |
182 | # EnvName = 'ReachTarget'
183 | # EnvType = 'rlbench'
184 |
185 | env = build_env(EnvName, EnvType)
186 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
187 | alg = eval(AlgName+'(**alg_params)')
188 | alg.learn(env=env, mode='train', render=False, **learn_params)
189 | alg.learn(env=env, mode='test', render=True, **learn_params)
190 | ```
191 |
192 | ## 7. Asynchronous Advantage Actor-Critic (A3C)
193 |
194 | ```python
195 | AlgName = 'A3C'
196 | EnvName = 'PongNoFrameskip-v4'
197 | EnvType = 'atari'
198 |
199 | # EnvName = 'Pendulum-v0' # only continuous action
200 | # EnvType = 'classic_control'
201 |
202 | # EnvName = 'BipedalWalker-v2'
203 | # EnvType = 'box2d'
204 |
205 | # EnvName = 'Ant-v2'
206 | # EnvType = 'mujoco'
207 |
208 | # EnvName = 'FetchPush-v1'
209 | # EnvType = 'robotics'
210 |
211 | # EnvName = 'FishSwim-v0'
212 | # EnvType = 'dm_control'
213 |
214 | number_workers = 2 # need to specify number of parallel workers
215 | env = build_env(EnvName, EnvType, nenv=number_workers)
216 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
217 | alg = eval(AlgName+'(**alg_params)')
218 | alg.learn(env=env, mode='train', render=False, **learn_params)
219 | alg.learn(env=env, mode='test', render=True, **learn_params)
220 | ```
221 |
222 | ## 8. Proximal Policy Optimization (PPO)
223 |
224 | ```python
225 | EnvName = 'PongNoFrameskip-v4'
226 | EnvType = 'atari'
227 |
228 | # EnvName = 'Pendulum-v0'
229 | # EnvType = 'classic_control'
230 |
231 | # EnvName = 'BipedalWalker-v2'
232 | # EnvType = 'box2d'
233 |
234 | # EnvName = 'Ant-v2'
235 | # EnvType = 'mujoco'
236 |
237 | # EnvName = 'FetchPush-v1'
238 | # EnvType = 'robotics'
239 |
240 | # EnvName = 'FishSwim-v0'
241 | # EnvType = 'dm_control'
242 |
243 | # EnvName = 'ReachTarget'
244 | # EnvType = 'rlbench'
245 |
246 | env = build_env(EnvName, EnvType)
247 | alg_params, learn_params = call_default_params(env, EnvType, 'PPO')
248 | alg = PPO(method='clip', **alg_params) # specify 'clip' or 'penalty' method for PPO
249 | alg.learn(env=env, mode='train', render=False, **learn_params)
250 | alg.learn(env=env, mode='test', render=False, **learn_params)
251 | ```
252 |
253 | ## 9. Distributed Proximal Policy Optimization (DPPO)
254 |
255 | ```python
256 | EnvName = 'PongNoFrameskip-v4'
257 | EnvType = 'atari'
258 |
259 | # EnvName = 'Pendulum-v0'
260 | # EnvType = 'classic_control'
261 |
262 | # EnvName = 'BipedalWalker-v2'
263 | # EnvType = 'box2d'
264 |
265 | # EnvName = 'Ant-v2'
266 | # EnvType = 'mujoco'
267 |
268 | # EnvName = 'FetchPush-v1'
269 | # EnvType = 'robotics'
270 |
271 | # EnvName = 'FishSwim-v0'
272 | # EnvType = 'dm_control'
273 |
274 | # EnvName = 'ReachTarget'
275 | # EnvType = 'rlbench'
276 |
277 | number_workers = 2 # need to specify number of parallel workers
278 | env = build_env(EnvName, EnvType, nenv=number_workers)
279 | alg_params, learn_params = call_default_params(env, EnvType, 'DPPO')
280 | alg = DPPO(method='penalty', **alg_params) # specify 'clip' or 'penalty' method for DPPO
281 | alg.learn(env=env, mode='train', render=False, **learn_params)
282 | alg.learn(env=env, mode='test', render=True, **learn_params)
283 | ```
284 |
285 | ## 10. Trust Region Policy Optimization (TRPO)
286 |
287 | ```python
288 | AlgName = 'TRPO'
289 | EnvName = 'PongNoFrameskip-v4'
290 | EnvType = 'atari'
291 |
292 | # EnvName = 'CartPole-v0'
293 | # EnvType = 'classic_control'
294 |
295 | # EnvName = 'BipedalWalker-v2'
296 | # EnvType = 'box2d'
297 |
298 | # EnvName = 'Ant-v2'
299 | # EnvType = 'mujoco'
300 |
301 | # EnvName = 'FetchPush-v1'
302 | # EnvType = 'robotics'
303 |
304 | # EnvName = 'FishSwim-v0'
305 | # EnvType = 'dm_control'
306 |
307 | # EnvName = 'ReachTarget'
308 | # EnvType = 'rlbench'
309 |
310 | env = build_env(EnvName, EnvType)
311 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
312 | alg = eval(AlgName+'(**alg_params)')
313 | alg.learn(env=env, mode='train', render=False, **learn_params)
314 | alg.learn(env=env, mode='test', render=True, **learn_params)
315 | ```
316 |
317 |
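All of the examples above follow the same four-step pattern: build the environment, fetch the default hyper-parameters, instantiate the algorithm, then call `learn` once in train mode and once in test mode. As a minimal sketch, the string-based `eval(AlgName+'(**alg_params)')` step can also be written as an explicit attribute lookup on `rlzoo.algorithms`, which avoids `eval`; the import paths below are assumed from the repository layout (`rlzoo/common/env_wrappers.py`, `rlzoo/common/utils.py`, `rlzoo/algorithms/__init__.py`) and should be adjusted if your entry point differs.

```python
# Sketch: same four-step pattern as above, with getattr() instead of eval().
import rlzoo.algorithms as algorithms
from rlzoo.common.env_wrappers import build_env
from rlzoo.common.utils import call_default_params

AlgName = 'SAC'
EnvName = 'Pendulum-v0'
EnvType = 'classic_control'

env = build_env(EnvName, EnvType)
alg_params, learn_params = call_default_params(env, EnvType, AlgName)
alg = getattr(algorithms, AlgName)(**alg_params)  # same effect as eval(AlgName + '(**alg_params)')
alg.learn(env=env, mode='train', render=False, **learn_params)
alg.learn(env=env, mode='test', render=True, **learn_params)
```

Both forms work because every algorithm class is re-exported from `rlzoo/algorithms/__init__.py`.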
--------------------------------------------------------------------------------
/gif/ACM_MM2021_Presentation_Slide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/ACM_MM2021_Presentation_Slide.pdf
--------------------------------------------------------------------------------
/gif/atari.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/atari.gif
--------------------------------------------------------------------------------
/gif/box2d.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/box2d.gif
--------------------------------------------------------------------------------
/gif/classic.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/classic.gif
--------------------------------------------------------------------------------
/gif/dmcontrol.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/dmcontrol.gif
--------------------------------------------------------------------------------
/gif/interactive.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/interactive.gif
--------------------------------------------------------------------------------
/gif/mujoco.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/mujoco.gif
--------------------------------------------------------------------------------
/gif/rlbench.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/rlbench.gif
--------------------------------------------------------------------------------
/gif/robotics.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/gif/robotics.gif
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib==3.0.3
2 | numpy==1.16.3
3 | opencv-python==4.1.0.25
4 | pygame==1.9.6
5 | tensorflow-gpu==2.1.0
6 | tensorflow-probability==0.8.0
7 | tensorlayer>=2.1.0
8 | gym==0.12.5
9 | ipywidgets==7.5.1
10 |
11 |
--------------------------------------------------------------------------------
/rlzoo/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | /img
3 | /log
4 | /model
5 |
--------------------------------------------------------------------------------
/rlzoo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/__init__.py:
--------------------------------------------------------------------------------
1 | from .ac.ac import AC
2 | from .pg.pg import PG
3 | from .dqn.dqn import DQN
4 | from .a3c.a3c import A3C
5 | from .ddpg.ddpg import DDPG
6 | from .td3.td3 import TD3
7 | from .sac.sac import SAC
8 | from .ppo.ppo import PPO
9 | from .ppo_penalty.ppo_penalty import PPO_PENALTY
10 | from .ppo_clip.ppo_clip import PPO_CLIP
11 | from .dppo.dppo import DPPO
12 | from .dppo_penalty.dppo_penalty import DPPO_PENALTY
13 | from .dppo_clip.dppo_clip import DPPO_CLIP
14 | from .trpo.trpo import TRPO
15 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/a3c/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/a3c/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/a3c/run_a3c.py:
--------------------------------------------------------------------------------
1 | from rlzoo.algorithms.a3c.a3c import A3C
2 | from rlzoo.common.policy_networks import *
3 | from rlzoo.common.value_networks import *
4 | import gym
5 |
6 | """ load environment """
7 | env_id = 'BipedalWalker-v2'
8 | env = gym.make(env_id).unwrapped
9 | # env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run
10 | action_shape = env.action_space.shape
11 | state_shape = env.observation_space.shape
12 | # reproducible
13 | seed = 2
14 | np.random.seed(seed)
15 | tf.random.set_seed(seed)
16 | env.seed(seed)
17 |
18 | """ build networks for the algorithm """
19 | num_hidden_layer = 4 # number of hidden layers for the networks
20 | hidden_dim = 64 # dimension of hidden layers for the networks
21 | num_workers = 2
22 | net_list2 = []
23 | for i in range(num_workers + 1):
24 | with tf.name_scope('A3C'):
25 | with tf.name_scope('Actor'):
26 | actor = StochasticPolicyNetwork(env.observation_space, env.action_space,
27 | hidden_dim_list=num_hidden_layer * [hidden_dim])
28 | with tf.name_scope('Critic'):
29 | critic = ValueNetwork(env.observation_space, hidden_dim_list=num_hidden_layer * [hidden_dim])
30 | net_list = [actor, critic]
31 | net_list2.append(net_list)
32 |
33 | """ choose optimizers """
34 | actor_lr, critic_lr = 5e-5, 1e-4 # learning rate
35 | a_optimizer = tf.optimizers.RMSprop(actor_lr)
36 | c_optimizer = tf.optimizers.RMSprop(critic_lr)
37 | optimizers_list = [a_optimizer, c_optimizer]
38 |
39 | model = A3C(net_list2, optimizers_list, entropy_beta=0.005)
40 | """
41 | full list of arguments for the algorithm
42 | ----------------------------------------
43 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
44 | optimizers_list: a list of optimizers for all networks and differentiable variables
45 | entropy_beta: factor for entropy boosted exploration
46 | """
47 |
48 | env_list = []
49 | for i in range(num_workers):
50 | env_list.append(gym.make(env_id).unwrapped)
51 | model.learn(env_list, train_episodes=20000, test_episodes=100, max_steps=20000, n_workers=num_workers, update_itr=10,
52 | gamma=0.99, save_interval=500, mode='train')
53 | """
54 | full list of parameters for training
55 | ---------------------------------------
56 | env_list: a list of same learning environments
57 | train_episodes: total number of episodes for training
58 | test_episodes: total number of episodes for testing
59 | max_steps: maximum number of steps for one episode
60 | n_workers: manually set number of workers
61 | update_itr: update global policy after several episodes
62 | gamma: reward discount factor
63 | save_interval: timesteps for saving the weights and plotting the results
64 | mode: train or test
65 | """
66 | # test
67 | model.learn(env_list, test_episodes=100, max_steps=20000, mode='test', render=True)
68 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/ac/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/ac/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/ac/ac.py:
--------------------------------------------------------------------------------
1 | """
2 | Actor-Critic
3 | -------------
4 | It uses TD-error as the Advantage.
5 |
6 | Actor Critic History
7 | ----------------------
8 | A3C > DDPG > AC
9 |
10 | Advantage
11 | ----------
12 | AC converges faster than Policy Gradient.
13 |
14 | Disadvantage (IMPORTANT)
15 | ------------------------
16 | The policy oscillates and is difficult to converge; DDPG can solve
17 | this problem by using the advantages of DQN.
18 |
19 | Reference
20 | ----------
21 | paper: https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf
22 | View more on MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/
23 | MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/
24 |
25 | Environment
26 | ------------
27 | CartPole-v0: https://gym.openai.com/envs/CartPole-v0
28 |
29 | A pole is attached by an un-actuated joint to a cart, which moves along a
30 | frictionless track. The system is controlled by applying a force of +1 or -1
31 | to the cart. The pendulum starts upright, and the goal is to prevent it from
32 | falling over.
33 |
34 | A reward of +1 is provided for every timestep that the pole remains upright.
35 | The episode ends when the pole is more than 15 degrees from vertical, or the
36 | cart moves more than 2.4 units from the center.
37 |
38 |
39 | Prerequisites
40 | --------------
41 | tensorflow >=2.0.0a0
42 | tensorlayer >=2.0.0
43 |
44 | """
45 | import time
46 |
47 | import tensorlayer as tl
48 |
49 | from rlzoo.common.utils import *
50 | from rlzoo.common.value_networks import *
51 | from rlzoo.common.policy_networks import *
52 |
53 | tl.logging.set_verbosity(tl.logging.DEBUG)
54 |
55 |
56 | ############################### Actor-Critic ####################################
57 | class AC:
58 | def __init__(self, net_list, optimizers_list, gamma=0.9):
59 | assert len(net_list) == 2
60 | assert len(optimizers_list) == 2
61 | self.name = 'AC'
62 | self.actor, self.critic = net_list
63 | assert isinstance(self.critic, ValueNetwork)
64 | assert isinstance(self.actor, StochasticPolicyNetwork)
65 | self.a_optimizer, self.c_optimizer = optimizers_list
66 | self.GAMMA = gamma
67 |
68 | def update(self, s, a, r, s_):
69 | # critic update
70 | v_ = self.critic(np.array([s_]))
71 | with tf.GradientTape() as tape:
72 | v = self.critic(np.array([s]))
73 | td_error = r + self.GAMMA * v_ - v # TD_error = r + gamma * V(s') - V(s)
74 | loss = tf.square(td_error)
75 | grad = tape.gradient(loss, self.critic.trainable_weights)
76 | self.c_optimizer.apply_gradients(zip(grad, self.critic.trainable_weights))
77 |
78 | # actor update
79 | with tf.GradientTape() as tape:
80 | # _logits = self.actor(np.array([s]))
81 | ## cross-entropy loss weighted by the td-error (advantage):
82 | # the cross-entropy measures the difference between two probability distributions (the predicted action distribution and the sampled action),
83 | # and it is weighted by the td-error, so actions with a large advantage are reinforced more strongly, and vice versa.
84 |
85 | _ = self.actor(np.array([s]))
86 | neg_log_prob = self.actor.policy_dist.neglogp([a])
87 | _exp_v = tf.reduce_mean(neg_log_prob * td_error)
88 | grad = tape.gradient(_exp_v, self.actor.trainable_weights)
89 | self.a_optimizer.apply_gradients(zip(grad, self.actor.trainable_weights))
90 | return _exp_v
91 |
92 | def get_action(self, s):
93 | return self.actor(np.array([s]))[0].numpy()
94 |
95 | def get_action_greedy(self, s):
96 | return self.actor(np.array([s]), greedy=True)[0].numpy()
97 |
98 | def save_ckpt(self, env_name): # save trained weights
99 | save_model(self.actor, 'model_actor', self.name, env_name)
100 | save_model(self.critic, 'model_critic', self.name, env_name)
101 |
102 | def load_ckpt(self, env_name): # load trained weights
103 | load_model(self.actor, 'model_actor', self.name, env_name)
104 | load_model(self.critic, 'model_critic', self.name, env_name)
105 |
106 | def learn(self, env, train_episodes=1000, test_episodes=500, max_steps=200,
107 | save_interval=100, mode='train', render=False, plot_func=None):
108 | """
109 | :param env: learning environment
110 | :param train_episodes: total number of episodes for training
111 | :param test_episodes: total number of episodes for testing
112 | :param max_steps: maximum number of steps for one episode
113 | :param save_interval: time steps for saving the weights and plotting the results
114 | :param mode: 'train' or 'test'
115 | :param render: if true, visualize the environment
116 | :param plot_func: additional function for interactive module
117 | """
118 |
119 | t0 = time.time()
120 | if mode == 'train':
121 | print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id))
122 | reward_buffer = []
123 | for i_episode in range(train_episodes):
124 | s = env.reset()
125 | ep_rs_sum = 0 # rewards of all steps
126 |
127 | for step in range(max_steps):
128 |
129 | if render:
130 | env.render()
131 |
132 | a = self.get_action(s)
133 | s_new, r, done, info = env.step(a)
134 | ep_rs_sum += r
135 |
136 | try:
137 | self.update(s, a, r, s_new) # learn Policy : true_gradient = grad[logPi(s, a) * td_error]
138 | except KeyboardInterrupt: # if Ctrl+C is pressed during training, save the model and the reward log before exiting
139 | self.save_ckpt(env_name=env.spec.id)
140 | plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id)
141 |
142 | s = s_new
143 |
144 | if done:
145 | break
146 |
147 | reward_buffer.append(ep_rs_sum)
148 | if plot_func is not None:
149 | plot_func(reward_buffer)
150 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}' \
151 | .format(i_episode, train_episodes, ep_rs_sum, time.time() - t0))
152 |
153 | if i_episode % save_interval == 0:
154 | self.save_ckpt(env_name=env.spec.id)
155 | plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id)
156 |
157 | self.save_ckpt(env_name=env.spec.id)
158 | plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id)
159 |
160 | elif mode == 'test':
161 | self.load_ckpt(env_name=env.spec.id)
162 | print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id))
163 |
164 | reward_buffer = []
165 | for i_episode in range(test_episodes):
166 | s = env.reset()
167 | ep_rs_sum = 0 # rewards of all steps
168 | for step in range(max_steps):
169 | if render: env.render()
170 | a = self.get_action_greedy(s)
171 | s_new, r, done, info = env.step(a)
172 |
173 |
174 | ep_rs_sum += r
175 | s = s_new
176 |
177 | if done:
178 | break
179 |
180 | reward_buffer.append(ep_rs_sum)
181 | if plot_func:
182 | plot_func(reward_buffer)
183 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
184 | i_episode, test_episodes, ep_rs_sum, time.time() - t0))
185 |
186 | else:
187 | print('unknown mode type')
188 |
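The docstring and `AC.update` above describe a TD-error-weighted policy-gradient step. The following is a purely illustrative NumPy sketch of the same rule with a tabular actor and critic; none of these names are part of rlzoo, it only mirrors the `r + gamma * V(s') - V(s)` critic target and the `neglogp * td_error` actor loss used in `AC.update`.

```python
# Toy sketch of the actor-critic update above (tabular, illustrative only).
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

gamma, lr_v, lr_pi = 0.9, 0.1, 0.05
n_states, n_actions = 4, 2
theta_v = np.zeros(n_states)                 # tabular critic: V(s) = theta_v[s]
theta_pi = np.zeros((n_states, n_actions))   # tabular actor logits

def ac_update(s, a, r, s_next):
    # critic: TD error delta = r + gamma * V(s') - V(s); minimise delta**2
    delta = r + gamma * theta_v[s_next] - theta_v[s]
    theta_v[s] += lr_v * delta
    # actor: minimise -log pi(a|s) * delta, i.e. the neglogp * td_error term in AC.update
    pi = softmax(theta_pi[s])
    grad_logp = -pi
    grad_logp[a] += 1.0                       # d/dlogits of log softmax(logits)[a] = onehot(a) - pi
    theta_pi[s] += lr_pi * delta * grad_logp  # gradient ascent on log pi(a|s) * delta
    return delta

print(ac_update(s=0, a=1, r=1.0, s_next=2))
```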
--------------------------------------------------------------------------------
/rlzoo/algorithms/ac/run_ac.py:
--------------------------------------------------------------------------------
1 | from rlzoo.common.utils import set_seed
2 | from rlzoo.algorithms.ac.ac import AC
3 | from rlzoo.common.value_networks import *
4 | from rlzoo.common.policy_networks import *
5 | import gym
6 |
7 | """ load environment """
8 | # env = gym.make('CartPole-v0').unwrapped
9 | env = gym.make('Pendulum-v0').unwrapped
10 | obs_space = env.observation_space
11 | act_space = env.action_space
12 | # reproducible
13 | seed = 1
14 | set_seed(seed, env)
15 |
16 | # env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run
17 |
18 |
19 | """ build networks for the algorithm """
20 | num_hidden_layer = 2 # number of hidden layers for the networks
21 | hidden_dim = 64 # dimension of hidden layers for the networks
22 | with tf.name_scope('AC'):
23 | with tf.name_scope('Critic'):
24 | critic = ValueNetwork(obs_space, hidden_dim_list=num_hidden_layer * [hidden_dim])
25 | with tf.name_scope('Actor'):
26 | actor = StochasticPolicyNetwork(obs_space, act_space, hidden_dim_list=num_hidden_layer * [hidden_dim],
27 | output_activation=tf.nn.tanh)
28 | net_list = [actor, critic]
29 |
30 | """ choose optimizers """
31 | a_lr, c_lr = 1e-4, 2e-4 # a_lr: learning rate of the actor; c_lr: learning rate of the critic
32 | a_optimizer = tf.optimizers.Adam(a_lr)
33 | c_optimizer = tf.optimizers.Adam(c_lr)
34 | optimizers_list = [a_optimizer, c_optimizer]
35 |
36 | model = AC(net_list, optimizers_list)
37 | """
38 | full list of arguments for the algorithm
39 | ----------------------------------------
40 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
41 | optimizers_list: a list of optimizers for all networks and differentiable variables
42 | gamma: discounted factor of reward
43 |
44 | """
45 |
46 | model.learn(env, train_episodes=500, max_steps=200,
47 | save_interval=50, mode='train', render=False)
48 | """
49 | full list of parameters for training
50 | ---------------------------------------
51 | env: learning environment
52 | train_episodes: total number of episodes for training
53 | test_episodes: total number of episodes for testing
54 | max_steps: maximum number of steps for one episode
55 | save_interval: time steps for saving the weights and plotting the results
56 | mode: 'train' or 'test'
57 | render: if true, visualize the environment
58 | """
59 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True)
60 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/ddpg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/ddpg/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/ddpg/ddpg.py:
--------------------------------------------------------------------------------
1 | """
2 | Deep Deterministic Policy Gradient (DDPG)
3 | -----------------------------------------
4 | An algorithm concurrently learns a Q-function and a policy.
5 | It uses off-policy data and the Bellman equation to learn the Q-function,
6 | and uses the Q-function to learn the policy.
7 | Reference
8 | ---------
9 | Deterministic Policy Gradient Algorithms, Silver et al. 2014
10 | Continuous Control With Deep Reinforcement Learning, Lillicrap et al. 2016
11 | MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/
12 | MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/
13 |
14 | Prerequisites
15 | -------------
16 | tensorflow >=2.0.0a0
17 | tensorflow-probability 0.6.0
18 | tensorlayer >=2.0.0
19 | """
20 |
21 | import time
22 |
23 | from rlzoo.common.utils import *
24 | from rlzoo.common.buffer import *
25 | from rlzoo.common.policy_networks import *
26 | from rlzoo.common.value_networks import *
27 |
28 |
29 | ############################### DDPG ####################################
30 |
31 |
32 | class DDPG(object):
33 | """
34 | DDPG class
35 | """
36 |
37 | def __init__(self, net_list, optimizers_list, replay_buffer_size, action_range=1., tau=0.01):
38 | """
39 | :param net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
40 | :param optimizers_list: a list of optimizers for all networks and differentiable variables
41 | :param replay_buffer_size: the size of buffer for storing explored samples
42 | :param tau: soft update factor
43 | """
44 | assert len(net_list) == 4
45 | assert len(optimizers_list) == 2
46 | self.name = 'DDPG'
47 |
48 | self.critic, self.critic_target, self.actor, self.actor_target = net_list
49 |
50 | assert isinstance(self.critic, QNetwork)
51 | assert isinstance(self.critic_target, QNetwork)
52 | assert isinstance(self.actor, DeterministicPolicyNetwork)
53 | assert isinstance(self.actor_target, DeterministicPolicyNetwork)
54 | assert isinstance(self.actor.action_space, gym.spaces.Box)
55 |
56 | def copy_para(from_model, to_model):
57 | for i, j in zip(from_model.trainable_weights, to_model.trainable_weights):
58 | j.assign(i)
59 |
60 | copy_para(self.actor, self.actor_target)
61 | copy_para(self.critic, self.critic_target)
62 |
63 | self.replay_buffer_size = replay_buffer_size
64 | self.buffer = ReplayBuffer(replay_buffer_size)
65 |
66 | self.ema = tf.train.ExponentialMovingAverage(decay=1 - tau) # soft replacement
67 | self.action_range = action_range
68 |
69 | self.critic_opt, self.actor_opt = optimizers_list
70 |
71 | def ema_update(self):
72 | """
73 | Soft updating by exponential smoothing
74 |
75 | :return: None
76 | """
77 | paras = self.actor.trainable_weights + self.critic.trainable_weights
78 | self.ema.apply(paras)
79 | for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras):
80 | i.assign(self.ema.average(j))
81 |
82 | def sample_action(self):
83 | """ generate random actions for exploration """
84 | a = tf.random.uniform(self.actor.action_space.shape, self.actor.action_space.low, self.actor.action_space.high)
85 | return a
86 |
87 | def get_action(self, s, noise_scale):
88 | """
89 | Choose action with exploration
90 |
91 | :param s: state
92 | :param noise_scale: scale of the exploration noise
93 | :return: action
94 | """
95 | a = self.actor([s])[0].numpy()*self.action_range
96 |
97 | # add randomness to action selection for exploration
98 | noise = np.random.normal(0, 1, a.shape) * noise_scale
99 | a += noise
100 | a = np.clip(a, self.actor.action_space.low, self.actor.action_space.high)
101 |
102 | return a
103 |
104 | def get_action_greedy(self, s):
105 | """
106 | Choose action
107 |
108 | :param s: state
109 |
110 | :return: action
111 | """
112 | return self.actor([s])[0].numpy()*self.action_range
113 |
114 | def update(self, batch_size, gamma):
115 | """
116 | Update parameters
117 |
118 | :param batch_size: update batch size
119 | :param gamma: reward decay factor
120 |
121 | :return:
122 | """
123 | bs, ba, br, bs_, bd = self.buffer.sample(batch_size)
124 |
125 | ba_ = self.actor_target(bs_)*self.action_range
126 |
127 | q_ = self.critic_target([bs_, ba_])
128 | y = br + (1 - bd) * gamma * q_
129 | with tf.GradientTape() as tape:
130 | q = self.critic([bs, ba])
131 | td_error = tf.losses.mean_squared_error(y, q)
132 | c_grads = tape.gradient(td_error, self.critic.trainable_weights)
133 | self.critic_opt.apply_gradients(zip(c_grads, self.critic.trainable_weights))
134 |
135 | with tf.GradientTape() as tape:
136 | a = self.actor(bs)*self.action_range
137 | q = self.critic([bs, a])
138 | a_loss = - tf.reduce_mean(q) # maximize the q
139 | a_grads = tape.gradient(a_loss, self.actor.trainable_weights)
140 | self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights))
141 | self.ema_update()
142 |
143 | def store_transition(self, s, a, r, s_, d):
144 | """
145 | Store data in data buffer
146 |
147 | :param s: state
148 | :param a: action
149 | :param r: reward
150 | :param s_: next state
151 | :param d: done flag
152 | :return: None
153 | """
154 | d = 1 if d else 0
155 |
156 | self.buffer.push(s, a, [r], s_, d)
157 |
158 | def save_ckpt(self, env_name):
159 | """
160 | save trained weights
161 |
162 | :return: None
163 | """
164 | save_model(self.actor, 'model_policy_net', self.name, env_name)
165 | save_model(self.actor_target, 'model_target_policy_net', self.name, env_name)
166 | save_model(self.critic, 'model_q_net', self.name, env_name)
167 | save_model(self.critic_target, 'model_target_q_net', self.name, env_name)
168 |
169 | def load_ckpt(self, env_name):
170 | """
171 | load trained weights
172 |
173 | :return: None
174 | """
175 | load_model(self.actor, 'model_policy_net', self.name, env_name)
176 | load_model(self.actor_target, 'model_target_policy_net', self.name, env_name)
177 | load_model(self.critic, 'model_q_net', self.name, env_name)
178 | load_model(self.critic_target, 'model_target_q_net', self.name, env_name)
179 |
180 | def learn(self, env, train_episodes=200, test_episodes=100, max_steps=200, save_interval=10, explore_steps=500,
181 | mode='train', render=False, batch_size=32, gamma=0.9, noise_scale=1., noise_scale_decay=0.995,
182 | plot_func=None):
183 | """
184 | learn function
185 |
186 | :param env: learning environment
187 | :param train_episodes: total number of episodes for training
188 | :param test_episodes: total number of episodes for testing
189 | :param max_steps: maximum number of steps for one episode
190 | :param save_interval: time steps for saving
191 | :param explore_steps: for random action sampling in the beginning of training
192 | :param mode: train or test mode
193 | :param render: render each step
194 | :param batch_size: update batch size
195 | :param gamma: reward decay factor
196 | :param noise_scale: range of action noise for exploration
197 | :param noise_scale_decay: noise scale decay factor
198 | :param plot_func: additional function for interactive module
199 | :return: None
200 | """
201 |
202 | t0 = time.time()
203 |
204 | if mode == 'train': # train
205 | print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id))
206 | reward_buffer = []
207 | frame_idx = 0
208 | for i in range(1, train_episodes + 1):
209 | s = env.reset()
210 | ep_reward = 0
211 |
212 | for j in range(max_steps):
213 | if render:
214 | env.render()
215 | # Add exploration noise
216 | if frame_idx > explore_steps:
217 | a = self.get_action(s, noise_scale)
218 | else:
219 | a = self.sample_action()
220 | frame_idx += 1
221 |
222 | s_, r, done, info = env.step(a)
223 |
224 | self.store_transition(s, a, r, s_, done)
225 | if len(self.buffer) >= self.replay_buffer_size:
226 | self.update(batch_size, gamma)
227 | noise_scale *= noise_scale_decay
228 | s = s_
229 | ep_reward += r
230 |
231 | if done:
232 | break
233 |
234 | print(
235 | 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
236 | i, train_episodes, ep_reward,
237 | time.time() - t0
238 | )
239 | )
240 |
241 | reward_buffer.append(ep_reward)
242 | if plot_func is not None:
243 | plot_func(reward_buffer)
244 | if i and not i % save_interval:
245 | self.save_ckpt(env_name=env.spec.id)
246 | plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id)
247 |
248 | self.save_ckpt(env_name=env.spec.id)
249 | plot_save_log(reward_buffer, algorithm_name=self.name, env_name=env.spec.id)
250 |
251 | # test
252 | elif mode == 'test':
253 | self.load_ckpt(env_name=env.spec.id)
254 | print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id))
255 | reward_buffer = []
256 | for eps in range(1, test_episodes+1):
257 | ep_rs_sum = 0
258 | s = env.reset()
259 | for step in range(max_steps):
260 | if render:
261 | env.render()
262 | action = self.get_action_greedy(s)
263 | s, reward, done, info = env.step(action)
264 | ep_rs_sum += reward
265 | if done:
266 | break
267 |
268 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
269 | eps, test_episodes, ep_rs_sum, time.time() - t0)
270 | )
271 | reward_buffer.append(ep_rs_sum)
272 | if plot_func:
273 | plot_func(reward_buffer)
274 | else:
275 | print('unknown mode type')
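Two numerical ingredients of `DDPG.update` and `ema_update` above are the bootstrapped critic target `y = r + (1 - done) * gamma * Q'(s', mu'(s'))` and the soft (Polyak-style) target update that `tf.train.ExponentialMovingAverage` performs. A minimal NumPy sketch of both, with made-up numbers, purely for illustration:

```python
# Sketch of the DDPG critic target and soft target-network update (illustrative only).
import numpy as np

gamma, tau = 0.9, 0.01

# bootstrapped target: y = r + (1 - done) * gamma * Q_target(s', mu_target(s'))
r = np.array([1.0, 0.5])
done = np.array([0.0, 1.0])
q_target_next = np.array([10.0, 7.0])        # stand-in for critic_target([s', actor_target(s')])
y = r + (1.0 - done) * gamma * q_target_next
print(y)                                     # -> [10.   0.5]

# soft update: theta_target <- (1 - tau) * theta_target + tau * theta
theta = np.array([1.0, 2.0, 3.0])            # online weights
theta_target = np.zeros_like(theta)          # target weights
theta_target = (1.0 - tau) * theta_target + tau * theta
print(theta_target)
```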
--------------------------------------------------------------------------------
/rlzoo/algorithms/ddpg/run_ddpg.py:
--------------------------------------------------------------------------------
1 | from rlzoo.common.utils import make_env, set_seed
2 | from rlzoo.algorithms.ddpg.ddpg import DDPG
3 | from rlzoo.common.policy_networks import *
4 | from rlzoo.common.value_networks import *
5 | import gym
6 |
7 | """ load environment """
8 | env = gym.make('Pendulum-v0').unwrapped
9 |
10 | obs_space = env.observation_space
11 | act_space = env.action_space
12 |
13 | # reproducible
14 | seed = 2
15 | set_seed(seed, env)
16 |
17 | """ build networks for the algorithm """
18 | name = 'DDPG'
19 | num_hidden_layer = 2 # number of hidden layers for the networks
20 | hidden_dim = 64 # dimension of hidden layers for the networks
21 |
22 | actor = DeterministicPolicyNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer)
23 | critic = QNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer)
24 |
25 | actor_target = DeterministicPolicyNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer, trainable=False)
26 |
27 | critic_target = QNetwork(obs_space, act_space, [hidden_dim] * num_hidden_layer, trainable=False)
28 |
29 | net_list = [critic, critic_target, actor, actor_target]
30 |
31 | """ create model """
32 | actor_lr = 1e-3
33 | critic_lr = 2e-3
34 | optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)]
35 | replay_buffer_size = 10000
36 | model = DDPG(net_list, optimizers_list, replay_buffer_size)
37 | """
38 | full list of arguments for the algorithm
39 | ----------------------------------------
40 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
41 | optimizers_list: a list of optimizers for all networks and differentiable variables
42 | replay_buffer_size: the size of buffer for storing explored samples
43 | tau: soft update factor
44 | """
45 |
46 | model.learn(env, train_episodes=100, max_steps=200, save_interval=10,
47 | mode='train', render=False, batch_size=32, gamma=0.9, noise_scale=1., noise_scale_decay=0.995)
48 | """
49 | full list of parameters for training
50 | ---------------------------------------
51 | env: learning environment
52 | train_episodes: total number of episodes for training
53 | test_episodes: total number of episodes for testing
54 | max_steps: maximum number of steps for one episode
55 | save_interval: time steps for saving
56 | explore_steps: for random action sampling in the beginning of training
57 | mode: train or test mode
58 | render: render each step
59 | batch_size: update batch size
60 | gamma: reward decay factor
61 | noise_scale: range of action noise for exploration
62 | noise_scale_decay: noise scale decay factor
63 | """
64 |
65 | model.learn(env, test_episodes=10, max_steps=200, mode='test', render=True)
66 |
67 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/dppo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/dppo/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/dppo/dppo.py:
--------------------------------------------------------------------------------
1 | from rlzoo.algorithms.dppo_penalty.dppo_penalty import DPPO_PENALTY
2 | from rlzoo.algorithms.dppo_clip.dppo_clip import DPPO_CLIP
3 |
4 |
5 | def DPPO(**alg_params):
6 | method = alg_params['method']
7 | if method == 'penalty':
8 | del alg_params['epsilon']
9 | algo = DPPO_PENALTY
10 | elif method == 'clip':
11 | del alg_params['kl_target']
12 | del alg_params['lam']
13 | algo = DPPO_CLIP
14 | else:
15 | raise ValueError('Method input error. Method can only be penalty or clip')
16 | del alg_params['method']
17 | return algo(**alg_params)
18 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/dppo_clip/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/dppo_clip/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/dppo_clip/run_dppo_clip.py:
--------------------------------------------------------------------------------
1 | from rlzoo.common.utils import set_seed
2 | from rlzoo.algorithms.dppo_clip.dppo_clip import DPPO_CLIP
3 | from rlzoo.common.policy_networks import *
4 | from rlzoo.common.value_networks import *
5 | import gym
6 |
7 | n_workers = 4
8 | """ load environment """
9 | env = [gym.make('Pendulum-v0').unwrapped for i in range(n_workers)]
10 |
11 | # reproducible
12 | seed = 2
13 | set_seed(seed)
14 |
15 | """ build networks for the algorithm """
16 | name = 'DPPO_CLIP'
17 | hidden_dim = 64
18 | num_hidden_layer = 2
19 | critic = ValueNetwork(env[0].observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value')
20 |
21 | actor = StochasticPolicyNetwork(env[0].observation_space, env[0].action_space,
22 | [hidden_dim] * num_hidden_layer,
23 | trainable=True,
24 | name=name + '_policy')
25 | net_list = critic, actor
26 |
27 | """ create model """
28 | actor_lr = 1e-4
29 | critic_lr = 2e-4
30 | optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)]
31 | model = DPPO_CLIP(net_list, optimizers_list)
32 | """
33 | full list of arguments for the algorithm
34 | ----------------------------------------
35 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
36 | optimizers_list: a list of optimizers for all networks and differentiable variables
37 | epsilon: clip parameter
38 | """
39 |
40 | model.learn(env, train_episodes=1000, max_steps=200, save_interval=50, gamma=0.9,
41 | mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10, n_workers=n_workers)
42 |
43 | """
44 | full list of parameters for training
45 | ---------------------------------------
46 | env: learning environment
47 | train_episodes: total number of episodes for training
48 | test_episodes: total number of episodes for testing
49 | max_steps: maximum number of steps for one episode
50 | save_interval: time steps for saving
51 | gamma: reward discount factor
52 | mode: train or test
53 | batch_size: update batch size
54 | a_update_steps: actor update iteration steps
55 | c_update_steps: critic update iteration steps
56 | n_workers: number of workers
57 | :return: None
58 | """
59 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True)
60 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/dppo_clip_distributed/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/dppo_clip_distributed/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/dppo_clip_distributed/dppo_clip.py:
--------------------------------------------------------------------------------
1 | from rlzoo.common.policy_networks import StochasticPolicyNetwork
2 | from rlzoo.common.value_networks import ValueNetwork
3 | from rlzoo.common.utils import *
4 | import tensorflow as tf
5 | import numpy as np
6 | import copy
7 | import pickle
8 |
9 |
10 | def write_log(text: str):
11 | pass
12 | # print('infer server: '+text)
13 | # with open('infer_server_log.txt', 'a') as f:
14 | # f.write(str(text) + '\n')
15 |
16 |
17 | EPS = 1e-8
18 |
19 |
20 | class RLAlgorithm:
21 | def __init__(self):
22 | self.state_buffer = [] # shape: (None, [n_env], [state_shape])
23 | self.action_buffer = []
24 | self.reward_buffer = []
25 | self.done_buffer = []
26 | self.next_state_buffer = []
27 | self.logp_buffer = []
28 | self.all_buffer = self.state_buffer, self.action_buffer, self.reward_buffer, self.done_buffer, \
29 | self.next_state_buffer, self.logp_buffer
30 | self.traj_list = []
31 | self.gamma = 0.9
32 | self.name = 'NotNamed'
33 |
34 | @property
35 | def all_weights(self):
36 | raise NotImplementedError
37 |
38 | def update_model(self, params):
39 | raise NotImplementedError
40 |
41 | def _get_value(self, batch_state):
42 | """
43 | return: value: tf.Tensor
44 | """
45 | raise NotImplementedError
46 |
47 | def _get_action(self, batch_state):
48 | """
49 | return: action: tf.Tensor, log_p: tf.Tensor
50 | """
51 | raise NotImplementedError
52 |
53 | @property
54 | def logp_shape(self):
55 | raise NotImplementedError
56 |
57 | def save_ckpt(self, env_name):
58 | """
59 | save trained weights
60 |
61 | :return: None
62 | """
63 | raise NotImplementedError
64 |
65 | def plot_save_log(self, running_reward, env_name):
66 | plot_save_log(running_reward, algorithm_name=self.name, env_name=env_name)
67 |
68 | def collect_data(self, s, a, r, d, s_, log_p, batch_data=False):
69 | if not batch_data:
70 | s, a, r, d, s_, log_p = [s], [a], [r], [d], [s_], [log_p]
71 | for i, data in enumerate([s, a, r, d, s_, log_p]):
72 | self.all_buffer[i].append(data)
73 |
74 | def get_value(self, state, batch_data=False):
75 | if not batch_data:
76 | state = [state]
77 | value = self._get_value(np.array(state))
78 | value_shape = np.shape(value)
79 | value = tf.reshape(value, value_shape[:-1])
80 | return value
81 |
82 | def get_action(self, state, batch_data=False):
83 | if not batch_data:
84 | state = [state]
85 |
86 | state = np.array(state)
87 | action, log_p = self._get_action(state)
88 | action, log_p = action.numpy(), log_p.numpy()
89 | action_shape = np.shape(action)
90 | # when the last dimension is 1: batch data is squeezed unless len(action_shape) == 1,
91 | # while non-batch data is squeezed only when len(action_shape) == 1
92 | if action_shape[-1] == 1 and batch_data ^ (len(action_shape) == 1):
93 | # ((batch_data and not len(action_shape) == 1) or (not batch_data and len(action_shape) == 1)):
94 | action = np.reshape(action, action_shape[:-1]) # squeeze the last dimension
95 | log_p = np.reshape(log_p, log_p.shape[:-1])
96 | return action, log_p
97 |
98 | # def _cal_discounted_r(self, state_list, reward_list, done_list, batch_data=False):
99 | # discounted_r = []
100 | # for r in reward_list[::-1]:
101 | # v_s_ = r + 0.9 * v_s_
102 | # discounted_r.append(v_s_)
103 |
104 | def _cal_discounted_r(self, next_state_list, reward_list, done_list, batch_data=False):
105 | discounted_r = np.zeros_like(reward_list) # reward_buffer shape: [-1, n_env]
106 | # done_list = np.array(done_list, dtype=np.int)
107 | done_list = np.array(done_list)
108 | v_s_ = self.get_value(next_state_list[-1], batch_data) * (1 - done_list[-1])
109 | for i in range(len(reward_list) - 1, -1, -1):
110 | # discounted_r[i] = v_s_ = reward_list[i] + self.gamma * v_s_
111 | discounted_r[i] = v_s_ = reward_list[i] + (1 - done_list[i]) * self.gamma * v_s_
112 | return discounted_r
113 |
114 | def _cal_adv(self, state_list, reward_list, done_list, next_state_list, batch_data=False):
115 | dc_r = self._cal_discounted_r(next_state_list, reward_list, done_list, batch_data)
116 | # dc_r = np.array(
117 | # [[6.5132155], [6.125795], [5.6953278], [5.217031], [4.68559], [4.0951], [3.439], [2.71], [1.9], [1.]])
118 | if batch_data:
119 | s_shape = np.shape(self.state_buffer) # state_buffer shape: [-1, n_env, *obs_shape]
120 | state_list = np.reshape(self.state_buffer, [-1, *s_shape[2:]])
121 | v = self.get_value(state_list, batch_data).numpy()
122 | v = v.reshape(*s_shape[:2])
123 | else:
124 | v = self.get_value(state_list, batch_data).numpy()
125 |
126 | dc_r = np.array(dc_r, dtype=np.float32)
127 | advs = dc_r - v
128 | # advs = (advs - np.mean(advs)) / (np.std(advs) + 1e-8) # norm all env data adv at the same time
129 | return advs
130 |
131 | def _get_traj(self):
132 | traj_list = []
133 | for element in [
134 | self.state_buffer, self.action_buffer, self.reward_buffer, self.done_buffer, self.next_state_buffer,
135 | self._cal_adv(self.state_buffer, self.reward_buffer, self.done_buffer, self.next_state_buffer, True),
136 | self.logp_buffer]:
137 | axes = list(range(len(np.shape(element))))
138 | axes[0], axes[1] = 1, 0
139 | result = np.transpose(element, axes)
140 | # print(result)
141 | traj_list.append(result)
142 | traj_list = list(zip(*traj_list))
143 | return traj_list
144 |
145 | def update_traj_list(self):
146 | self.traj_list.extend(self._get_traj())
147 | for buffer in self.all_buffer:
148 | buffer.clear()
149 |
150 |
151 | class DPPO_CLIP(RLAlgorithm):
152 | def __init__(self, net_builder, opt_builder, n_step=100, gamma=0.9, epsilon=0.2):
153 | super().__init__()
154 | self.critic, self.actor = None, None
155 | self.net_builder = net_builder
156 | self.gamma = gamma
157 | self.n_step = n_step
158 | self._logp_shape = None
159 | self.epsilon = epsilon
160 | self.name = 'DPPO_CLIP'
161 | self.acter_optimizer, self.critic_optimizer = opt_builder()
162 |
163 | def init_components(self): # TODO: decide where the init process should be placed
164 | networks = self.net_builder()
165 | assert len(networks) == 2
166 | self.critic, self.actor = networks
167 | assert isinstance(self.critic, ValueNetwork)
168 | assert isinstance(self.actor, StochasticPolicyNetwork)
169 |
170 | @property
171 | def all_weights(self):
172 | return self.critic.trainable_weights + self.actor.trainable_weights
173 |
174 | # api
175 | def _get_action(self, state):
176 | action = self.actor(state)
177 | log_p = self.actor.policy_dist.logp(action)
178 | return action, log_p
179 |
180 | def _get_value(self, state):
181 | return self.critic(state)
182 |
183 | def save_ckpt(self, env_name):
184 | """
185 | save trained weights
186 |
187 | :return: None
188 | """
189 | save_model(self.actor, 'actor', self.name, env_name)
190 | save_model(self.critic, 'critic', self.name, env_name)
191 |
192 | def load_ckpt(self, env_name):
193 | """
194 | load trained weights
195 |
196 | :return: None
197 | """
198 | load_model(self.actor, 'actor', self.name, env_name)
199 | load_model(self.critic, 'critic', self.name, env_name)
200 |
201 | # api
202 | def update_model(self, params):
203 | for i, j in zip(self.all_weights, params):
204 | i.assign(j)
205 | for buffer in self.all_buffer:
206 | buffer.clear()
207 |
208 | def a_train(self, s, a, adv, oldpi_logp):
209 | oldpi_prob = tf.exp(oldpi_logp)
210 | with tf.GradientTape() as tape:
211 | _ = self.actor(s)
212 | pi_prob = tf.exp(self.actor.policy_dist.logp(a))
213 | ratio = pi_prob / (oldpi_prob + EPS)
214 |
215 | surr = ratio * adv
216 | aloss = -tf.reduce_mean(
217 | tf.minimum(surr, tf.clip_by_value(ratio, 1. - self.epsilon, 1. + self.epsilon) * adv))
218 | a_grad = tape.gradient(aloss, self.actor.trainable_weights)
219 | return a_grad
220 |
221 | def c_train(self, dc_r, s):
222 | dc_r = np.array(dc_r, dtype=np.float32)
223 | with tf.GradientTape() as tape:
224 | v = self.critic(s)
225 | advantage = dc_r - v
226 | closs = tf.reduce_mean(tf.square(advantage))
227 | c_grad = tape.gradient(closs, self.critic.trainable_weights)
228 | return c_grad
229 |
230 | def train(self, traj_list, dis_agent=None):
231 | for traj in traj_list:
232 | state_list, action_list, reward_list, done_list, next_state_list, adv_list, logp_list = traj
233 | for _ in range(10):
234 | a_grad = self.a_train(state_list, action_list, adv_list, logp_list)
235 | if dis_agent:
236 | a_grad = [dis_agent.role_all_reduce(grad) for grad in a_grad]
237 | self.acter_optimizer.apply_gradients(zip(a_grad, self.actor.trainable_weights))
238 |
239 | dc_r = self._cal_discounted_r(next_state_list, reward_list, done_list)
240 | for _ in range(10):
241 | c_grad = self.c_train(dc_r, state_list)
242 | if dis_agent:
243 | c_grad = [dis_agent.role_all_reduce(grad) for grad in c_grad]
244 | self.critic_optimizer.apply_gradients(zip(c_grad, self.critic.trainable_weights))
245 |
246 |
247 | if __name__ == '__main__':
248 | from rlzoo.distributed.training_components import net_builder, env_maker, opt_builder
249 | from rlzoo.common.utils import set_seed
250 |
251 | env = env_maker()
252 | # set_seed(1, env)
253 |
254 | agent = DPPO_CLIP(net_builder, opt_builder)
255 | agent.init_components()
256 |
257 | running_reward = []
258 | curr_step, max_step, traj_len = 0, 500 * 200, 200
259 | s = env.reset()
260 | d = False
261 | cnt = 0
262 | while curr_step < max_step:
263 | for _ in range(traj_len):
264 | curr_step += 1
265 | a, logp = agent.get_action(s)
266 | s_, r, d, _ = env.step(a)
267 | agent.collect_data(s, a, r, d, s_, logp)
268 | if d:
269 | s = env.reset()
270 | else:
271 | s = s_
272 | agent.update_traj_list()
273 | agent.train(agent.traj_list)
274 | avg_eps_reward = min(sum(agent.traj_list[0][2]) / (sum(agent.traj_list[0][3]) + 1e-10), traj_len)
275 | agent.traj_list.clear()
276 | running_reward.append(avg_eps_reward)
277 | cnt += 1
278 | print(cnt, curr_step, avg_eps_reward)
279 | agent.plot_save_log(running_reward, env.spec.id)
280 |
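`DPPO_CLIP.a_train` above computes the PPO clipped surrogate loss. A small NumPy sketch of just that loss, with made-up log-probabilities and advantages (illustrative only, not part of rlzoo):

```python
# Sketch of the clipped surrogate objective used in DPPO_CLIP.a_train():
#   L = -mean( min(ratio * adv, clip(ratio, 1 - eps, 1 + eps) * adv) )
import numpy as np

eps = 0.2
adv = np.array([1.0, -0.5, 2.0])             # advantage estimates
logp_old = np.array([-1.0, -0.7, -2.0])      # log pi_old(a|s), collected by the behaviour policy
logp_new = np.array([-0.8, -0.9, -1.5])      # log pi(a|s) under the current policy

ratio = np.exp(logp_new - logp_old)          # pi / pi_old
surr = ratio * adv
clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps) * adv
loss = -np.mean(np.minimum(surr, clipped))   # a_train() returns the gradient of this quantity
print(loss)
```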
--------------------------------------------------------------------------------
/rlzoo/algorithms/dppo_penalty/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/dppo_penalty/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/dppo_penalty/run_dppo_penalty.py:
--------------------------------------------------------------------------------
1 | from rlzoo.common.utils import set_seed
2 | from rlzoo.algorithms.dppo_penalty.dppo_penalty import DPPO_PENALTY
3 | from rlzoo.common.policy_networks import *
4 | from rlzoo.common.value_networks import *
5 | import gym
6 |
7 |
8 | n_workers = 4
9 | """ load environment """
10 | env = [gym.make('Pendulum-v0').unwrapped for i in range(n_workers)]
11 |
12 | # reproducible
13 | seed = 2
14 | set_seed(seed)
15 |
16 |
17 | """ build networks for the algorithm """
18 | name = 'DPPO_PENALTY'
19 | hidden_dim = 64
20 | num_hidden_layer = 2
21 | critic = ValueNetwork(env[0].observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value')
22 |
23 | actor = StochasticPolicyNetwork(env[0].observation_space, env[0].action_space,
24 | [hidden_dim] * num_hidden_layer, trainable=True,
25 | name=name + '_policy')
26 | net_list = critic, actor
27 |
28 | """ create model """
29 | actor_lr = 1e-4
30 | critic_lr = 2e-4
31 | optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)]
32 | model = DPPO_PENALTY(net_list, optimizers_list)
33 | """
34 | full list of arguments for the algorithm
35 | ----------------------------------------
36 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
37 | optimizers_list: a list of optimizers for all networks and differentiable variables
38 | kl_target: controls bounds of policy update and adaptive lambda
39 | lam: KL-regularization coefficient
40 | """
41 |
42 | model.learn(env, train_episodes=1000, max_steps=200, save_interval=50, gamma=0.9,
43 | mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10, n_workers=4)
44 |
45 | """
46 | full list of parameters for training
47 | ---------------------------------------
48 | env: learning environment
49 | train_episodes: total number of episodes for training
50 | test_episodes: total number of episodes for testing
51 | max_steps: maximum number of steps for one episode
52 | save_interval: time steps for saving
53 | gamma: reward discount factor
54 | mode: train or test
55 | batch_size: update batch size
56 | a_update_steps: actor update iteration steps
57 | c_update_steps: critic update iteration steps
58 | n_workers: number of workers
59 | :return: None
60 | """
61 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True)
62 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/dqn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/dqn/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/dqn/default.py:
--------------------------------------------------------------------------------
1 | from gym.spaces import Discrete
2 |
3 | from rlzoo.common.utils import set_seed
4 | from rlzoo.common.value_networks import *
5 |
6 | """
7 | full list of algorithm parameters (alg_params)
8 | -----------------------------------------------
9 | -----------------------------------------------
10 |
11 | full list of learning parameters (learn_params)
12 | -----------------------------------------------
13 | double_q (bool): if True double DQN will be used
14 | dueling (bool): if True dueling value estimation will be used
15 | exploration_rate (float): fraction of entire training period over
16 | which the exploration rate is annealed
17 | exploration_final_eps (float): final value of random action probability
18 | batch_size (int): size of a batch sampled from the replay buffer for training
19 | train_freq (int): update the model every `train_freq` steps
20 | learning_starts (int): how many steps of the model to collect transitions
21 | for before learning starts
22 | target_network_update_freq (int): update the target network every
23 | `target_network_update_freq` steps
24 | buffer_size (int): size of the replay buffer
25 | prioritized_replay (bool): if True prioritized replay buffer will be used.
26 | prioritized_alpha (float): alpha parameter for prioritized replay
27 | prioritized_beta0 (float): beta parameter for prioritized replay
28 | mode (str): train or test
29 | -----------------------------------------------
30 | """
31 |
32 |
33 | def atari(env, default_seed=False, **kwargs):
34 | if default_seed:
35 | seed = 2
36 | set_seed(seed, env) # reproducible
37 |
38 | assert isinstance(env.action_space, Discrete)
39 |
40 | alg_params = dict(
41 | dueling=True,
42 | double_q=True,
43 | buffer_size=1000,
44 | prioritized_replay=True,
45 | prioritized_alpha=0.6,
46 | prioritized_beta0=0.4,
47 | )
48 | alg_params.update(kwargs)
49 | if alg_params.get('net_list') is None:
50 | alg_params['net_list'] = [QNetwork(env.observation_space, env.action_space, [64],
51 | state_only=True, dueling=alg_params['dueling'])]
52 |
53 | if alg_params.get('optimizers_list') is None:
54 | alg_params['optimizers_list'] = tf.optimizers.Adam(1e-4, epsilon=1e-5, clipnorm=10),
55 |
56 | learn_params = dict(
57 | train_episodes=int(1e5),
58 | test_episodes=10,
59 | max_steps=200,
60 | save_interval=1e4,
61 | batch_size=32,
62 | exploration_rate=0.1,
63 | exploration_final_eps=0.01,
64 | train_freq=4,
65 | learning_starts=10000,
66 | target_network_update_freq=1000,
67 | gamma=0.99,
68 | )
69 |
70 | return alg_params, learn_params
71 |
72 |
73 | def classic_control(env, default_seed=False, **kwargs):
74 | if default_seed:
75 | seed = 2
76 | set_seed(seed, env) # reproducible
77 |
78 | assert isinstance(env.action_space, Discrete)
79 |
80 | alg_params = dict(
81 | dueling=True,
82 | double_q=True,
83 | buffer_size=1000,
84 | prioritized_replay=False,
85 | prioritized_alpha=0.6,
86 | prioritized_beta0=0.4,
87 | )
88 | alg_params.update(kwargs)
89 | if alg_params.get('net_list') is None:
90 | alg_params['net_list'] = [QNetwork(env.observation_space, env.action_space, [64], activation=tf.nn.tanh,
91 | state_only=True, dueling=alg_params['dueling'])]
92 |
93 | if alg_params.get('optimizers_list') is None:
94 | alg_params['optimizers_list'] = tf.optimizers.Adam(5e-3, epsilon=1e-5),
95 |
96 | learn_params = dict(
97 | train_episodes=int(1e3),
98 | test_episodes=10,
99 | max_steps=200,
100 | save_interval=1e3,
101 | batch_size=32,
102 | exploration_rate=0.2,
103 | exploration_final_eps=0.01,
104 | train_freq=4,
105 | learning_starts=200,
106 | target_network_update_freq=50,
107 | gamma=0.99,
108 | )
109 |
110 | return alg_params, learn_params
111 |
112 |
113 | # class CNNQNet(tl.models.Model):
114 | # def __init__(self, in_dim, act_dim, dueling):
115 | # super().__init__()
116 | # self._state_shape = in_dim
117 | # self._action_shape = act_dim,
118 | # self.dueling = dueling
119 | # with tf.name_scope('DQN'):
120 | # with tf.name_scope('CNN'):
121 | # self.cnn = basic_nets.CNNModel(in_dim)
122 | # mlp_in_shape = self.cnn.outputs[0].shape[0]
123 | # with tf.name_scope('QValue'):
124 | # hidden_dim = 256
125 | # self.preq = tl.layers.Dense(
126 | # hidden_dim, tf.nn.relu,
127 | # tf.initializers.Orthogonal(1.0),
128 | # in_channels=mlp_in_shape
129 | # )
130 | # self.qout = tl.layers.Dense(
131 | # act_dim, None,
132 | # tf.initializers.Orthogonal(1.0),
133 | # in_channels=hidden_dim
134 | # )
135 | # if dueling:
136 | # with tf.name_scope('Value'):
137 | # hidden_dim = 256
138 | # self.prev = tl.layers.Dense(
139 | # hidden_dim, tf.nn.relu,
140 | # tf.initializers.Orthogonal(1.0),
141 | # in_channels=mlp_in_shape
142 | # )
143 | # self.vout = tl.layers.Dense(
144 | # 1, None,
145 | # tf.initializers.Orthogonal(1.0),
146 | # in_channels=hidden_dim
147 | # )
148 | #
149 | # def forward(self, obv):
150 | # obv = tf.cast(obv, tf.float32) / 255.0
151 | # mlp_in = tl.layers.flatten_reshape(self.cnn(obv))
152 | # q_out = self.qout(self.preq(mlp_in))
153 | # if self.dueling:
154 | # v_out = self.vout(self.prev(mlp_in))
155 | # q_out = v_out + q_out - tf.reduce_mean(q_out, 1, True)
156 | # return q_out
157 | #
158 | # @property
159 | # def state_shape(self):
160 | # return copy.deepcopy(self._state_shape)
161 | #
162 | # @property
163 | # def action_shape(self):
164 | # return copy.deepcopy(self._action_shape)
165 | #
166 | #
167 | # class MLPQNet(tl.models.Model):
168 | # def __init__(self, in_dim, act_dim, dueling):
169 | # super().__init__()
170 | # self._state_shape = in_dim,
171 | # self._action_shape = act_dim,
172 | # self.dueling = dueling
173 | # hidden_dim = 64
174 | # with tf.name_scope('DQN'):
175 | # with tf.name_scope('MLP'):
176 | # self.mlp = tl.layers.Dense(
177 | # hidden_dim, tf.nn.tanh,
178 | # tf.initializers.Orthogonal(1.0),
179 | # in_channels=in_dim
180 | # )
181 | # with tf.name_scope('QValue'):
182 | # self.qmlp = tl.layers.Dense(
183 | # act_dim, None,
184 | # tf.initializers.Orthogonal(1.0),
185 | # in_channels=hidden_dim
186 | # )
187 | # if dueling:
188 | # with tf.name_scope('Value'):
189 | # self.vmlp = tl.layers.Dense(
190 | # 1, None,
191 | # tf.initializers.Orthogonal(1.0),
192 | # in_channels=hidden_dim
193 | # )
194 | #
195 | # def forward(self, obv):
196 | # obv = tf.cast(obv, tf.float32)
197 | # latent = self.mlp(obv)
198 | # q_out = self.qmlp(latent)
199 | # if self.dueling:
200 | # v_out = self.vmlp(latent)
201 | # q_out = v_out + q_out - tf.reduce_mean(q_out, 1, True)
202 | # return q_out
203 | #
204 | # @property
205 | # def state_shape(self):
206 | # return copy.deepcopy(self._state_shape)
207 | #
208 | # @property
209 | # def action_shape(self):
210 | # return copy.deepcopy(self._action_shape)
211 |
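212 | # Usage sketch for the defaults above (assumes a Discrete-action gym environment
213 | # and the DQN class from rlzoo.algorithms.dqn.dqn, as in run_dqn.py):
214 | #
215 | #   env = gym.make('CartPole-v0').unwrapped
216 | #   alg_params, learn_params = classic_control(env, default_seed=True)
217 | #   model = DQN(**alg_params)
218 | #   model.learn(env, mode='train', render=False, **learn_params)
219 | #   model.learn(env, mode='test', render=True, **learn_params)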
--------------------------------------------------------------------------------
/rlzoo/algorithms/dqn/dqn.py:
--------------------------------------------------------------------------------
1 | """
2 | Deep Q Network
3 | """
4 | import random
5 | from copy import deepcopy
6 |
7 | from rlzoo.common.utils import *
8 | from rlzoo.common.buffer import ReplayBuffer, PrioritizedReplayBuffer
9 | from rlzoo.common.value_networks import *
10 |
11 |
12 | class DQN(object):
13 | """
14 | Papers:
15 |
16 | Mnih V, Kavukcuoglu K, Silver D, et al. Human-level control through deep
17 | reinforcement learning[J]. Nature, 2015, 518(7540): 529.
18 |
19 | Hessel M, Modayil J, Van Hasselt H, et al. Rainbow: Combining Improvements
20 | in Deep Reinforcement Learning[J]. 2017.
21 | """
22 |
23 | def __init__(self, net_list, optimizers_list, double_q, dueling, buffer_size,
24 | prioritized_replay, prioritized_alpha, prioritized_beta0, ):
25 | """
26 | Parameters:
27 | ----------
28 | :param net_list (list): a list of networks (value and policy) used in the algorithm, from common functions or customization
29 | :param optimizers_list (list): a list of optimizers for all networks and differentiable variables
30 |         :param double_q (bool): if True, double DQN will be used
31 |         :param dueling (bool): if True, dueling value estimation will be used
32 |         :param buffer_size (int): size of the replay buffer
33 |         :param prioritized_replay (bool): if True, a prioritized replay buffer will be used
34 | :param prioritized_alpha (float): alpha parameter for prioritized replay
35 | :param prioritized_beta0 (float): beta parameter for prioritized replay
36 | """
37 | assert isinstance(net_list[0], QNetwork)
38 | self.name = 'DQN'
39 | if prioritized_replay:
40 | self.buffer = PrioritizedReplayBuffer(
41 | buffer_size, prioritized_alpha, prioritized_beta0)
42 | else:
43 | self.buffer = ReplayBuffer(buffer_size)
44 |
45 | self.network = net_list[0]
46 | self.target_network = deepcopy(net_list[0])
47 | self.network.train()
48 | self.target_network.infer()
49 | self.optimizer = optimizers_list[0]
50 | self.double_q = double_q
51 | self.prioritized_replay = prioritized_replay
52 | self.dueling = dueling
53 |
54 | def get_action(self, obv, eps=0.2):
55 | out_dim = self.network.action_shape[0]
56 | if random.random() < eps:
57 | return int(random.random() * out_dim)
58 | else:
59 | obv = np.expand_dims(obv, 0).astype('float32')
60 | return self.network(obv).numpy().argmax(1)[0]
61 |
62 | def get_action_greedy(self, obv):
63 | obv = np.expand_dims(obv, 0).astype('float32')
64 | return self.network(obv).numpy().argmax(1)[0]
65 |
66 | def sync(self):
67 | """Copy q network to target q network"""
68 |
69 | for var, var_tar in zip(self.network.trainable_weights,
70 | self.target_network.trainable_weights):
71 | var_tar.assign(var)
72 |
73 | def save_ckpt(self, env_name):
74 | """
75 | save trained weights
76 | :return: None
77 | """
78 | save_model(self.network, 'qnet', 'DQN', env_name)
79 |
80 | def load_ckpt(self, env_name):
81 | """
82 | load trained weights
83 | :return: None
84 | """
85 | load_model(self.network, 'qnet', 'DQN', env_name)
86 |
87 | # @tf.function
88 | def _td_error(self, transitions, reward_gamma):
89 | b_o, b_a, b_r, b_o_, b_d = transitions
90 | b_d = tf.cast(b_d, tf.float32)
91 | b_a = tf.cast(b_a, tf.int64)
92 | b_r = tf.cast(b_r, tf.float32)
93 | if self.double_q:
94 | b_a_ = tf.one_hot(tf.argmax(self.network(b_o_), 1), self.network.action_shape[0])
95 | b_q_ = (1 - b_d) * tf.reduce_sum(self.target_network(b_o_) * b_a_, 1)
96 | else:
97 | b_q_ = (1 - b_d) * tf.reduce_max(self.target_network(b_o_), 1)
98 |
99 | b_q = tf.reduce_sum(self.network(b_o) * tf.one_hot(b_a, self.network.action_shape[0]), 1)
100 | return b_q - (b_r + reward_gamma * b_q_)
101 |
102 | def store_transition(self, s, a, r, s_, d):
103 | self.buffer.push(s, a, r, s_, d)
104 |
105 | def update(self, batch_size, gamma):
106 | if self.prioritized_replay:
107 | # sample from prioritized replay buffer
108 | *transitions, b_w, idxs = self.buffer.sample(batch_size)
109 | # calculate weighted huber loss
110 | with tf.GradientTape() as tape:
111 | priorities = self._td_error(transitions, gamma)
112 | huber_loss = tf.where(tf.abs(priorities) < 1,
113 | tf.square(priorities) * 0.5,
114 | tf.abs(priorities) - 0.5)
115 | loss = tf.reduce_mean(huber_loss * b_w)
116 | # backpropagate
117 | grad = tape.gradient(loss, self.network.trainable_weights)
118 | self.optimizer.apply_gradients(zip(grad, self.network.trainable_weights))
119 | # update priorities
120 | priorities = np.clip(np.abs(priorities), 1e-6, None)
121 | self.buffer.update_priorities(idxs, priorities)
122 | else:
123 |             # sample from the uniform replay buffer
124 | transitions = self.buffer.sample(batch_size)
125 | # calculate huber loss
126 | with tf.GradientTape() as tape:
127 | td_errors = self._td_error(transitions, gamma)
128 | huber_loss = tf.where(tf.abs(td_errors) < 1,
129 | tf.square(td_errors) * 0.5,
130 | tf.abs(td_errors) - 0.5)
131 | loss = tf.reduce_mean(huber_loss)
132 | # backpropagate
133 | grad = tape.gradient(loss, self.network.trainable_weights)
134 | self.optimizer.apply_gradients(zip(grad, self.network.trainable_weights))
135 |
136 | def learn(
137 | self, env, mode='train', render=False,
138 | train_episodes=1000, test_episodes=10, max_steps=200,
139 | save_interval=1000, gamma=0.99,
140 | exploration_rate=0.2, exploration_final_eps=0.01,
141 | target_network_update_freq=50,
142 | batch_size=32, train_freq=4, learning_starts=200,
143 | plot_func=None
144 | ):
145 |
146 | """
147 | :param env: learning environment
148 | :param mode: train or test
149 | :param render: render each step
150 | :param train_episodes: total number of episodes for training
151 | :param test_episodes: total number of episodes for testing
152 | :param max_steps: maximum number of steps for one episode
153 | :param save_interval: time steps for saving
154 | :param gamma: reward decay factor
155 | :param exploration_rate (float): fraction of entire training period over
156 | which the exploration rate is annealed
157 | :param exploration_final_eps (float): final value of random action probability
158 | :param target_network_update_freq (int): update the target network every
159 | `target_network_update_freq` steps
160 |         :param batch_size (int): size of a batch sampled from the replay buffer for training
161 |         :param train_freq (int): update the model every `train_freq` steps
162 |         :param learning_starts (int): number of environment steps to collect
163 |             before learning starts
164 | :param plot_func: additional function for interactive module
165 |
166 | """
167 | if mode == 'train':
168 | print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id))
169 | reward_buffer = []
170 | i = 0
171 | for episode in range(1, train_episodes + 1):
172 | o = env.reset()
173 | ep_reward = 0
174 | for step in range(1, max_steps + 1):
175 | i += 1
176 | if render:
177 | env.render()
178 | eps = 1 - (1 - exploration_final_eps) * \
179 |                       min(1, i / (exploration_rate * train_episodes * max_steps))  # anneal eps from 1 to exploration_final_eps over the first exploration_rate fraction of training
180 | a = self.get_action(o, eps)
181 |
182 | # execute action and feed to replay buffer
183 |                     # note that a trailing `_` in a variable name denotes the next step
184 | o_, r, done, info = env.step(a)
185 | self.store_transition(o, a, r, o_, done)
186 | ep_reward += r
187 |
188 | # update networks
189 | if i >= learning_starts and i % train_freq == 0:
190 | self.update(batch_size, gamma)
191 |
192 | if i % target_network_update_freq == 0:
193 | self.sync()
194 |
195 | # reset current observation
196 | if done:
197 | break
198 | else:
199 | o = o_
200 |
201 | # saving model
202 | if i % save_interval == 0:
203 | self.save_ckpt(env.spec.id)
204 | print(
205 | 'Time steps so far: {}, episode so far: {}, '
206 | 'episode reward: {:.4f}, episode length: {}'
207 | .format(i, episode, ep_reward, step)
208 | )
209 | reward_buffer.append(ep_reward)
210 | if plot_func is not None:
211 | plot_func(reward_buffer)
212 |
213 | elif mode == 'test':
214 | print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id))
215 |
216 | self.load_ckpt(env.spec.id)
217 | self.network.infer()
218 |
219 | reward_buffer = []
220 | for episode in range(1, test_episodes + 1):
221 | o = env.reset()
222 | ep_reward = 0
223 | for step in range(1, max_steps + 1):
224 | if render:
225 | env.render()
226 | a = self.get_action_greedy(o)
227 |
228 | # execute action
229 |                     # note that a trailing `_` in a variable name denotes the next step
230 | o_, r, done, info = env.step(a)
231 | ep_reward += r
232 |
233 | if done:
234 | break
235 | else:
236 | o = o_
237 |
238 | print(
239 | 'episode so far: {}, '
240 | 'episode reward: {:.4f}, episode length: {}'
241 | .format(episode, ep_reward, step)
242 | )
243 | reward_buffer.append(ep_reward)
244 | if plot_func is not None:
245 | plot_func(reward_buffer)
246 |
247 | else:
248 | print('unknown mode type')
249 |
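250 | # Target computation used in _td_error above:
251 | #   vanilla DQN: y = r + gamma * (1 - done) * max_a' Q_target(s', a')
252 | #   double DQN : y = r + gamma * (1 - done) * Q_target(s', argmax_a' Q_online(s', a'))
253 | # The returned TD error is Q_online(s, a) - y; update() wraps it in a Huber loss and,
254 | # with prioritized replay, weights it by the importance-sampling weights b_w.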
--------------------------------------------------------------------------------
/rlzoo/algorithms/dqn/run_dqn.py:
--------------------------------------------------------------------------------
1 | import gym
2 | 
3 | from rlzoo.algorithms.dqn.dqn import DQN
4 | from rlzoo.algorithms.dqn.default import *
5 | from rlzoo.common.value_networks import *
6 | 
7 |
8 | """ load environment """
9 | env = gym.make('CartPole-v0').unwrapped
10 |
11 | obs_space = env.observation_space
12 | act_space = env.action_space
13 |
14 | # reproducible
15 | seed = 2
16 | set_seed(seed, env)
17 |
18 | in_dim = env.observation_space.shape[0]
19 | act_dim = env.action_space.n
20 | """ build networks for the algorithm """
21 | name = 'DQN'
22 | Q_net = QNetwork(env.observation_space, env.action_space, [64], activation=tf.nn.tanh,
23 | state_only=True, dueling=True)
24 | net_list = [Q_net]
25 |
26 | """ create model """
27 | optimizer = tf.optimizers.Adam(5e-3, epsilon=1e-5)
28 | optimizers_list = [optimizer]
29 | model = DQN(net_list, optimizers_list,
30 | double_q=True,
31 | dueling=True,
32 | buffer_size=10000,
33 | prioritized_replay=False,
34 | prioritized_alpha=0.6,
35 | prioritized_beta0=0.4)
36 | """
37 | full list of arguments for the algorithm
38 | ----------------------------------------
39 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
40 | optimizers_list: a list of optimizers for all networks and differentiable variables
41 | double_q / dueling: whether to use double DQN and dueling value estimation
42 | buffer_size / prioritized_replay / prioritized_alpha / prioritized_beta0: replay buffer size and prioritized replay settings
43 | """
44 |
45 | model.learn(env, mode='train', render=False,
46 | train_episodes=1000,
47 | test_episodes=10,
48 | max_steps=200,
49 | save_interval=1e3,
50 | batch_size=32,
51 | exploration_rate=0.2,
52 | exploration_final_eps=0.01,
53 | train_freq=4,
54 | learning_starts=200,
55 | target_network_update_freq=50,
56 | gamma=0.99, )
57 | """
58 | full list of parameters for training
59 | ---------------------------------------
60 | env: learning environment
61 | mode: train or test mode
62 | render: render each step
63 | train_episodes: total number of episodes for training
64 | test_episodes: total number of episodes for testing
65 | max_steps: maximum number of steps for one episode
66 | save_interval: time steps for saving
67 | batch_size: update batch size
68 | gamma: reward decay factor
69 | exploration_rate: fraction of the training period over which exploration is annealed
70 | exploration_final_eps: final value of the random action probability
71 | train_freq / learning_starts / target_network_update_freq: update frequency, warm-up steps and target sync interval
72 | """
73 |
74 | model.learn(env, mode='test', render=True,
75 | test_episodes=10,
76 | batch_size=32,
77 | exploration_rate=0.2,
78 | exploration_final_eps=0.01,
79 | train_freq=4,
80 | learning_starts=200,
81 | target_network_update_freq=50,
82 | gamma=0.99, )
83 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/pg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/pg/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/pg/default.py:
--------------------------------------------------------------------------------
1 | from rlzoo.common.policy_networks import *
2 | from rlzoo.common.utils import set_seed
3 |
4 | """
5 | full list of algorithm parameters (alg_params)
6 | -----------------------------------------------
7 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
8 | optimizers_list: a list of optimizers for all networks and differentiable variables
9 | -----------------------------------------------
10 |
11 | full list of learning parameters (learn_params)
12 | -----------------------------------------------
13 | train_episodes: total number of episodes for training
14 | test_episodes: total number of episodes for testing
15 | max_steps: maximum number of steps for one episode
16 | save_interval: time steps for saving
17 | mode: train or test
18 | render: render each step
19 | gamma: reward decay
20 | -----------------------------------------------
21 | """
22 |
23 |
24 | def atari(env, default_seed=True):
25 | if default_seed:
26 | seed = 2
27 | set_seed(seed, env) # reproducible
28 |
29 | alg_params = dict()
30 |
31 | if alg_params.get('net_list') is None:
32 | num_hidden_layer = 1 # number of hidden layers for the networks
33 | hidden_dim = 32 # dimension of hidden layers for the networks
34 | with tf.name_scope('PG'):
35 | with tf.name_scope('Policy'):
36 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
37 | num_hidden_layer * [hidden_dim])
38 | net_list = [policy_net]
39 | alg_params['net_list'] = net_list
40 |
41 | if alg_params.get('optimizers_list') is None:
42 | learning_rate = 0.02
43 | policy_optimizer = tf.optimizers.Adam(learning_rate)
44 | optimizers_list = [policy_optimizer]
45 | alg_params['optimizers_list'] = optimizers_list
46 |
47 | learn_params = dict(
48 | train_episodes=200,
49 | test_episodes=100,
50 | max_steps=200,
51 | save_interval=20,
52 | gamma=0.95
53 | )
54 |
55 | return alg_params, learn_params
56 |
57 |
58 | def classic_control(env, default_seed=True):
59 | if default_seed:
60 | seed = 2
61 | set_seed(seed, env) # reproducible
62 |
63 | alg_params = dict()
64 |
65 | if alg_params.get('net_list') is None:
66 | num_hidden_layer = 1 # number of hidden layers for the networks
67 | hidden_dim = 32 # dimension of hidden layers for the networks
68 | with tf.name_scope('PG'):
69 | with tf.name_scope('Policy'):
70 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
71 | num_hidden_layer * [hidden_dim])
72 | net_list = [policy_net]
73 | alg_params['net_list'] = net_list
74 |
75 | if alg_params.get('optimizers_list') is None:
76 | learning_rate = 0.02
77 | policy_optimizer = tf.optimizers.Adam(learning_rate)
78 | optimizers_list = [policy_optimizer]
79 | alg_params['optimizers_list'] = optimizers_list
80 |
81 | learn_params = dict(
82 | train_episodes=200,
83 | test_episodes=100,
84 | max_steps=200,
85 | save_interval=20,
86 | gamma=0.95
87 | )
88 |
89 | return alg_params, learn_params
90 |
91 |
92 | def box2d(env, default_seed=True):
93 | if default_seed:
94 | seed = 2
95 | set_seed(seed, env) # reproducible
96 |
97 | alg_params = dict()
98 |
99 | if alg_params.get('net_list') is None:
100 | num_hidden_layer = 1 # number of hidden layers for the networks
101 | hidden_dim = 32 # dimension of hidden layers for the networks
102 | with tf.name_scope('PG'):
103 | with tf.name_scope('Policy'):
104 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
105 | num_hidden_layer * [hidden_dim])
106 | net_list = [policy_net]
107 | alg_params['net_list'] = net_list
108 |
109 | if alg_params.get('optimizers_list') is None:
110 | learning_rate = 0.02
111 | policy_optimizer = tf.optimizers.Adam(learning_rate)
112 | optimizers_list = [policy_optimizer]
113 | alg_params['optimizers_list'] = optimizers_list
114 |
115 | learn_params = dict(
116 | train_episodes=200,
117 | test_episodes=100,
118 | max_steps=200,
119 | save_interval=20,
120 | gamma=0.95
121 | )
122 |
123 | return alg_params, learn_params
124 |
125 |
126 | def mujoco(env, default_seed=True):
127 | if default_seed:
128 | seed = 2
129 | set_seed(seed, env) # reproducible
130 |
131 | alg_params = dict()
132 |
133 | if alg_params.get('net_list') is None:
134 | num_hidden_layer = 1 # number of hidden layers for the networks
135 | hidden_dim = 32 # dimension of hidden layers for the networks
136 | with tf.name_scope('PG'):
137 | with tf.name_scope('Policy'):
138 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
139 | num_hidden_layer * [hidden_dim])
140 | net_list = [policy_net]
141 | alg_params['net_list'] = net_list
142 |
143 | if alg_params.get('optimizers_list') is None:
144 | learning_rate = 0.02
145 | policy_optimizer = tf.optimizers.Adam(learning_rate)
146 | optimizers_list = [policy_optimizer]
147 | alg_params['optimizers_list'] = optimizers_list
148 |
149 | learn_params = dict(
150 | train_episodes=200,
151 | test_episodes=100,
152 | max_steps=200,
153 | save_interval=20,
154 | gamma=0.95
155 | )
156 |
157 | return alg_params, learn_params
158 |
159 |
160 | def robotics(env, default_seed=True):
161 | if default_seed:
162 | seed = 2
163 | set_seed(seed, env) # reproducible
164 |
165 | alg_params = dict()
166 |
167 | if alg_params.get('net_list') is None:
168 | num_hidden_layer = 1 # number of hidden layers for the networks
169 | hidden_dim = 32 # dimension of hidden layers for the networks
170 | with tf.name_scope('PG'):
171 | with tf.name_scope('Policy'):
172 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
173 | num_hidden_layer * [hidden_dim])
174 | net_list = [policy_net]
175 | alg_params['net_list'] = net_list
176 |
177 | if alg_params.get('optimizers_list') is None:
178 | learning_rate = 0.02
179 | policy_optimizer = tf.optimizers.Adam(learning_rate)
180 | optimizers_list = [policy_optimizer]
181 | alg_params['optimizers_list'] = optimizers_list
182 |
183 | learn_params = dict(
184 | train_episodes=200,
185 | test_episodes=100,
186 | max_steps=200,
187 | save_interval=20,
188 | gamma=0.95
189 | )
190 |
191 | return alg_params, learn_params
192 |
193 |
194 | def dm_control(env, default_seed=True):
195 | if default_seed:
196 | seed = 2
197 | set_seed(seed, env) # reproducible
198 |
199 | alg_params = dict()
200 |
201 | if alg_params.get('net_list') is None:
202 | num_hidden_layer = 1 # number of hidden layers for the networks
203 | hidden_dim = 32 # dimension of hidden layers for the networks
204 | with tf.name_scope('PG'):
205 | with tf.name_scope('Policy'):
206 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
207 | num_hidden_layer * [hidden_dim])
208 | net_list = [policy_net]
209 | alg_params['net_list'] = net_list
210 |
211 | if alg_params.get('optimizers_list') is None:
212 | learning_rate = 0.02
213 | policy_optimizer = tf.optimizers.Adam(learning_rate)
214 | optimizers_list = [policy_optimizer]
215 | alg_params['optimizers_list'] = optimizers_list
216 |
217 | learn_params = dict(
218 | train_episodes=200,
219 | test_episodes=100,
220 | max_steps=200,
221 | save_interval=20,
222 | gamma=0.95
223 | )
224 |
225 | return alg_params, learn_params
226 |
227 |
228 | def rlbench(env, default_seed=True):
229 | if default_seed:
230 | seed = 2
231 | set_seed(seed, env) # reproducible
232 |
233 | alg_params = dict()
234 |
235 | if alg_params.get('net_list') is None:
236 | num_hidden_layer = 1 # number of hidden layers for the networks
237 | hidden_dim = 32 # dimension of hidden layers for the networks
238 | with tf.name_scope('PG'):
239 | with tf.name_scope('Policy'):
240 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
241 | num_hidden_layer * [hidden_dim])
242 | net_list = [policy_net]
243 | alg_params['net_list'] = net_list
244 |
245 | if alg_params.get('optimizers_list') is None:
246 | learning_rate = 0.02
247 | policy_optimizer = tf.optimizers.Adam(learning_rate)
248 | optimizers_list = [policy_optimizer]
249 | alg_params['optimizers_list'] = optimizers_list
250 |
251 | learn_params = dict(
252 | train_episodes=200,
253 | test_episodes=100,
254 | max_steps=200,
255 | save_interval=20,
256 | gamma=0.95
257 | )
258 |
259 | return alg_params, learn_params
260 |
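261 | # Usage sketch (assumes a gym environment matching one of the branches above and
262 | # the PG class from rlzoo.algorithms.pg.pg):
263 | #   alg_params, learn_params = classic_control(env)
264 | #   model = PG(**alg_params)
265 | #   model.learn(env, mode='train', **learn_params)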
--------------------------------------------------------------------------------
/rlzoo/algorithms/pg/pg.py:
--------------------------------------------------------------------------------
1 | """
2 | Vanilla Policy Gradient(VPG or REINFORCE)
3 | -----------------------------------------
4 | The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance.
5 | It's an on-policy algorithm that can be used in environments with either discrete or continuous action spaces.
6 | Here is an example on the discrete action space game CartPole-v0.
7 | To apply it to a continuous action space, you need to change the last softmax layer and the get_action function.
8 |
9 | Reference
10 | ---------
11 | Textbook: Sutton R S, Barto A G. Reinforcement Learning: An Introduction. 1998.
12 | MorvanZhou's tutorial page: https://morvanzhou.github.io/tutorials/
13 | MorvanZhou's code: https://github.com/MorvanZhou/Reinforcement-learning-with-tensorflow/
14 |
15 | Prerequisites
16 | --------------
17 | tensorflow >=2.0.0a0
18 | tensorflow-probability 0.6.0
19 | tensorlayer >=2.0.0
20 |
21 | """
22 | import time
23 |
24 | from rlzoo.common.utils import *
25 | from rlzoo.common.policy_networks import *
26 |
27 |
28 | ############################### PG ####################################
29 |
30 |
31 | class PG:
32 | """
33 | PG class
34 | """
35 |
36 | def __init__(self, net_list, optimizers_list):
37 | """
38 | :param net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
39 | :param optimizers_list: a list of optimizers for all networks and differentiable variables
40 |
41 | """
42 | assert len(net_list) == 1
43 | assert len(optimizers_list) == 1
44 | self.name = 'PG'
45 | self.model = net_list[0]
46 | assert isinstance(self.model, StochasticPolicyNetwork)
47 | self.buffer = []
48 | print('Policy Network', self.model)
49 | self.optimizer = optimizers_list[0]
50 |
51 | def get_action(self, s):
52 | """
53 | choose action with probabilities.
54 |
55 | :param s: state
56 |
57 | :return: act
58 | """
59 | return self.model([s])[0].numpy()
60 |
61 | def get_action_greedy(self, s):
62 | """
63 | choose action with greedy policy
64 |
65 | :param s: state
66 |
67 | :return: act
68 | """
69 | return self.model([s], greedy=True).numpy()[0]
70 |
71 | def store_transition(self, s, a, r):
72 | """
73 | store data in memory buffer
74 |
75 | :param s: state
76 | :param a: act
77 | :param r: reward
78 |
79 | :return:
80 | """
81 | self.buffer.append([s, np.array(a, np.float32), np.array(r, np.float32)])
82 |
83 | def update(self, gamma):
84 | """
85 | update policy parameters via stochastic gradient ascent
86 |
87 | :return: None
88 | """
89 | # discount and normalize episode reward
90 | s, a, r = zip(*self.buffer)
91 | s, a, r = np.array(s), np.array(a), np.array(r).flatten()
92 | discounted_ep_rs_norm = self._discount_and_norm_rewards(r, gamma)
93 |
94 | with tf.GradientTape() as tape:
95 | self.model(s)
96 | neg_log_prob = self.model.policy_dist.neglogp(a)
97 | loss = tf.reduce_mean(neg_log_prob * discounted_ep_rs_norm) # reward guided loss
98 |
99 | grad = tape.gradient(loss, self.model.trainable_weights)
100 | self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights))
101 |
102 | self.buffer = []
103 | return discounted_ep_rs_norm
104 |
105 | def _discount_and_norm_rewards(self, reward_list, gamma):
106 | """
107 | compute discount_and_norm_rewards
108 |
109 | :return: discount_and_norm_rewards
110 | """
111 | # discount episode rewards
112 | discounted_ep_rs = np.zeros_like(reward_list)
113 | running_add = 0
114 | for t in reversed(range(0, len(reward_list))):
115 | running_add = running_add * gamma + reward_list[t]
116 | discounted_ep_rs[t] = running_add
117 |
118 | # normalize episode rewards
119 | discounted_ep_rs -= np.mean(discounted_ep_rs)
120 | std = np.std(discounted_ep_rs)
121 |         if std != 0:
122 |             discounted_ep_rs /= std
123 | discounted_ep_rs = discounted_ep_rs[:, np.newaxis]
124 | return discounted_ep_rs
125 |
126 | def save_ckpt(self, env_name):
127 | """
128 | save trained weights
129 |
130 | :return: None
131 | """
132 | save_model(self.model, 'model_policy', self.name, env_name)
133 |
134 | def load_ckpt(self, env_name):
135 | """
136 | load trained weights
137 |
138 | :return: None
139 | """
140 | load_model(self.model, 'model_policy', self.name, env_name)
141 |
142 | def learn(self, env, train_episodes=200, test_episodes=100, max_steps=200, save_interval=100,
143 | mode='train', render=False, gamma=0.95, plot_func=None):
144 | """
145 | :param env: learning environment
146 | :param train_episodes: total number of episodes for training
147 | :param test_episodes: total number of episodes for testing
148 | :param max_steps: maximum number of steps for one episode
149 | :param save_interval: time steps for saving
150 | :param mode: train or test
151 | :param render: render each step
152 | :param gamma: reward decay
153 | :param plot_func: additional function for interactive module
154 | :return: None
155 | """
156 |
157 | if mode == 'train':
158 | print('Training... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id))
159 | reward_buffer = []
160 | t0 = time.time()
161 |
162 | for i_episode in range(1, train_episodes + 1):
163 |
164 | observation = env.reset()
165 |
166 | ep_rs_sum = 0
167 | for step in range(max_steps):
168 | if render:
169 | env.render()
170 | action = self.get_action(observation)
171 | observation_, reward, done, info = env.step(action)
172 | self.store_transition(observation, action, reward)
173 |
174 | ep_rs_sum += reward
175 | observation = observation_
176 |
177 | if done:
178 | break
179 |
180 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
181 | i_episode, train_episodes, ep_rs_sum, time.time() - t0)
182 | )
183 | reward_buffer.append(ep_rs_sum)
184 | if plot_func is not None:
185 | plot_func(reward_buffer)
186 |
187 | self.update(gamma)
188 |
189 | if i_episode and i_episode % save_interval == 0:
190 | self.save_ckpt(env_name=env.spec.id)
191 | plot_save_log(reward_buffer, algorithm_name='PG', env_name=env.spec.id)
192 |
193 | self.save_ckpt(env_name=env.spec.id)
194 | plot_save_log(reward_buffer, algorithm_name='PG', env_name=env.spec.id)
195 |
196 | elif mode == 'test':
197 | # test
198 | self.load_ckpt(env_name=env.spec.id)
199 | print('Testing... | Algorithm: {} | Environment: {}'.format(self.name, env.spec.id))
200 | t0 = time.time()
201 | for eps in range(test_episodes):
202 | observation = env.reset()
203 | ep_rs_sum = 0
204 | for step in range(max_steps):
205 | if render:
206 | env.render()
207 | action = self.get_action_greedy(observation)
208 | observation, reward, done, info = env.step(action)
209 | ep_rs_sum += reward
210 | if done:
211 | break
212 | print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
213 | eps, test_episodes, ep_rs_sum, time.time() - t0)
214 | )
215 |
216 | else:
217 | print('unknown mode type')
218 |
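219 | # Worked example for _discount_and_norm_rewards (values approximate):
220 | #   rewards = [1., 1., 1.], gamma = 0.95
221 | #   discounted returns, computed backwards: [1 + 0.95 * 1.95, 1 + 0.95 * 1, 1] = [2.8525, 1.95, 1.0]
222 | #   after subtracting the mean (~1.93) and dividing by the std (~0.76): ~[1.21, 0.02, -1.24]
223 | #   the result is reshaped to a column vector and used in update() as the per-step
224 | #   weight on the negative log-probability of the taken action.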
--------------------------------------------------------------------------------
/rlzoo/algorithms/pg/run_pg.py:
--------------------------------------------------------------------------------
1 | from rlzoo.algorithms.pg.pg import PG
2 | from rlzoo.common.policy_networks import *
3 | import gym
4 |
5 | """ load environment """
6 | env = gym.make('CartPole-v0').unwrapped
7 | # env = gym.make('Pendulum-v0').unwrapped
8 | # env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run
9 | obs_space = env.observation_space
10 | act_space = env.action_space
11 |
12 | # reproducible
13 | seed = 2
14 | np.random.seed(seed)
15 | tf.random.set_seed(seed)
16 | env.seed(seed)
17 |
18 | """ build networks for the algorithm """
19 | name = 'pg'
20 | num_hidden_layer = 1 # number of hidden layers for the networks
21 | hidden_dim = 32 # dimension of hidden layers for the networks
22 |
23 | policy_net = StochasticPolicyNetwork(obs_space, act_space, num_hidden_layer * [hidden_dim])
24 | net_list = [policy_net]
25 |
26 | """ choose optimizers """
27 | learning_rate = 0.02
28 | policy_optimizer = tf.optimizers.Adam(learning_rate)
29 | optimizers_list = [policy_optimizer]
30 |
31 | model = PG(net_list, optimizers_list)
32 | """
33 | full list of arguments for the algorithm
34 | ----------------------------------------
35 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
36 | optimizers_list: a list of optimizers for all networks and differentiable variables
37 | """
38 |
39 | model.learn(env, train_episodes=200, max_steps=200, save_interval=20, mode='train', render=False, gamma=0.95)
40 | """
41 | full list of parameters for training
42 | ---------------------------------------
43 | env: learning environment
44 | train_episodes: total number of episodes for training
45 | test_episodes: total number of episodes for testing
46 | max_steps: maximum number of steps for one episode
47 | save_interval: time steps for saving
48 | mode: train or test
49 | render: render each step
50 | gamma: reward decay
51 | """
52 |
53 | # test
54 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True)
55 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/ppo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/ppo/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/ppo/ppo.py:
--------------------------------------------------------------------------------
1 | from rlzoo.algorithms.ppo_penalty.ppo_penalty import PPO_PENALTY
2 | from rlzoo.algorithms.ppo_clip.ppo_clip import PPO_CLIP
3 |
4 |
5 | def PPO(**alg_params):
6 | method = alg_params['method']
7 | if method == 'penalty':
8 | del alg_params['epsilon']
9 | algo = PPO_PENALTY
10 | elif method == 'clip':
11 | del alg_params['kl_target']
12 | del alg_params['lam']
13 | algo = PPO_CLIP
14 | else:
15 | raise ValueError('Method input error. Method can only be penalty or clip')
16 | del alg_params['method']
17 | return algo(**alg_params)
18 |
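19 | # Usage sketch for the dispatcher above (argument values are illustrative; see
20 | # rlzoo/algorithms/ppo/default.py for the actual defaults):
21 | #   PPO(net_list=net_list, optimizers_list=optimizers_list, method='clip',
22 | #       epsilon=0.2, kl_target=0.01, lam=0.5)
23 | # With method='clip', kl_target/lam are dropped and PPO_CLIP is constructed;
24 | # with method='penalty', epsilon is dropped and PPO_PENALTY is constructed.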
--------------------------------------------------------------------------------
/rlzoo/algorithms/ppo_clip/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/ppo_clip/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/ppo_clip/run_ppo_clip.py:
--------------------------------------------------------------------------------
1 | from rlzoo.common.utils import make_env, set_seed
2 | from rlzoo.algorithms.ppo_clip.ppo_clip import PPO_CLIP
3 | from rlzoo.common.policy_networks import *
4 | from rlzoo.common.value_networks import *
5 | import gym
6 |
7 |
8 | """ load environment """
9 | env = gym.make('Pendulum-v0').unwrapped
10 |
11 | # reproducible
12 | seed = 1
13 | set_seed(seed, env)
14 |
15 | """ build networks for the algorithm """
16 | name = 'PPO_CLIP'
17 | hidden_dim = 64
18 | num_hidden_layer = 2
19 | critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value')
20 |
21 | actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer,
22 | output_activation=tf.nn.tanh, name=name + '_policy')
23 | net_list = critic, actor
24 |
25 | """ create model """
26 | actor_lr = 1e-4
27 | critic_lr = 2e-4
28 | optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)]
29 |
30 | model = PPO_CLIP(net_list, optimizers_list,)
31 | """
32 | full list of arguments for the algorithm
33 | ----------------------------------------
34 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
35 | optimizers_list: a list of optimizers for all networks and differentiable variables
36 | epsilon: clip parameter
37 | """
38 |
39 | model.learn(env, train_episodes=500, max_steps=200, save_interval=50, gamma=0.9,
40 | mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10)
41 |
42 | """
43 | full list of parameters for training
44 | ---------------------------------------
45 | env: learning environment
46 | train_episodes: total number of episodes for training
47 | test_episodes: total number of episodes for testing
48 | max_steps: maximum number of steps for one episode
49 | save_interval: time steps for saving
50 | gamma: reward discount factor
51 | mode: train or test
52 | render: render each step
53 | batch_size: update batch size
54 | a_update_steps: actor update iteration steps
55 | c_update_steps: critic update iteration steps
56 | :return: None
57 | """
58 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True)
59 |
60 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/ppo_penalty/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/ppo_penalty/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/ppo_penalty/run_ppo_penalty.py:
--------------------------------------------------------------------------------
1 | from rlzoo.common.utils import make_env, set_seed
2 | from rlzoo.algorithms.ppo_penalty.ppo_penalty import PPO_PENALTY
3 | from rlzoo.common.policy_networks import *
4 | from rlzoo.common.value_networks import *
5 | import gym
6 |
7 |
8 | """ load environment """
9 | env = gym.make('Pendulum-v0').unwrapped
10 |
11 | # reproducible
12 | seed = 1
13 | set_seed(seed, env)
14 |
15 | """ build networks for the algorithm """
16 | name = 'PPO_PENALTY'
17 | hidden_dim = 64
18 | num_hidden_layer = 2
19 | critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value')
20 |
21 | actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer,
22 | output_activation=tf.nn.tanh, name=name + '_policy')
23 | net_list = critic, actor
24 |
25 | """ create model """
26 | actor_lr = 1e-4
27 | critic_lr = 2e-4
28 | optimizers_list = [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)]
29 |
30 | model = PPO_PENALTY(net_list, optimizers_list,)
31 | """
32 | full list of arguments for the algorithm
33 | ----------------------------------------
34 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
35 | optimizers_list: a list of optimizers for all networks and differentiable variables
36 | kl_target: controls bounds of policy update and adaptive lambda
37 | lam: KL-regularization coefficient
38 | """
39 |
40 | model.learn(env, train_episodes=500, max_steps=200, save_interval=50, gamma=0.9,
41 | mode='train', render=False, batch_size=32, a_update_steps=10, c_update_steps=10)
42 |
43 | """
44 | full list of parameters for training
45 | ---------------------------------------
46 | env: learning environment
47 | train_episodes: total number of episodes for training
48 | test_episodes: total number of episodes for testing
49 | max_steps: maximum number of steps for one episode
50 | save_interval: time steps for saving
51 | gamma: reward discount factor
52 | mode: train or test
53 | render: render each step
54 | batch_size: update batch size
55 | a_update_steps: actor update iteration steps
56 | c_update_steps: critic update iteration steps
57 | :return: None
58 | """
59 |
60 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True)
61 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/sac/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/sac/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/sac/run_sac.py:
--------------------------------------------------------------------------------
1 | from rlzoo.algorithms.sac.sac import SAC
2 | from rlzoo.common.policy_networks import *
3 | from rlzoo.common.value_networks import *
4 | import gym
5 |
6 | """ load environment """
7 | env = gym.make('Pendulum-v0').unwrapped
8 | # env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run
9 | action_shape = env.action_space.shape
10 | state_shape = env.observation_space.shape
11 | # reproducible
12 | seed = 2
13 | np.random.seed(seed)
14 | tf.random.set_seed(seed)
15 | env.seed(seed)
16 |
17 | """ build networks for the algorithm """
18 | num_hidden_layer = 2 # number of hidden layers for the networks
19 | hidden_dim = 64 # dimension of hidden layers for the networks, default as the same for each layer here
20 | with tf.name_scope('SAC'):
21 | with tf.name_scope('Q_Net1'):
22 | soft_q_net1 = QNetwork(env.observation_space, env.action_space,
23 | hidden_dim_list=num_hidden_layer * [hidden_dim])
24 | with tf.name_scope('Q_Net2'):
25 | soft_q_net2 = QNetwork(env.observation_space, env.action_space,
26 | hidden_dim_list=num_hidden_layer * [hidden_dim])
27 | with tf.name_scope('Target_Q_Net1'):
28 | target_soft_q_net1 = QNetwork(env.observation_space, env.action_space,
29 | hidden_dim_list=num_hidden_layer * [hidden_dim])
30 | with tf.name_scope('Target_Q_Net2'):
31 | target_soft_q_net2 = QNetwork(env.observation_space, env.action_space,
32 | hidden_dim_list=num_hidden_layer * [hidden_dim])
33 | with tf.name_scope('Policy'):
34 | policy_net = StochasticPolicyNetwork(env.observation_space, env.action_space,
35 | hidden_dim_list=num_hidden_layer * [hidden_dim],
36 | output_activation=None,
37 | state_conditioned=True)
38 | net_list = [soft_q_net1, soft_q_net2, target_soft_q_net1, target_soft_q_net2, policy_net]
39 |
40 | """ choose optimizers """
41 | soft_q_lr, policy_lr, alpha_lr = 3e-4, 3e-4, 3e-4 # soft_q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network; alpha_lr: learning rate of the variable alpha
42 | soft_q_optimizer1 = tf.optimizers.Adam(soft_q_lr)
43 | soft_q_optimizer2 = tf.optimizers.Adam(soft_q_lr)
44 | policy_optimizer = tf.optimizers.Adam(policy_lr)
45 | alpha_optimizer = tf.optimizers.Adam(alpha_lr)
46 | optimizers_list = [soft_q_optimizer1, soft_q_optimizer2, policy_optimizer, alpha_optimizer]
47 |
48 | model = SAC(net_list, optimizers_list)
49 | """
50 | full list of arguments for the algorithm
51 | ----------------------------------------
52 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
53 | optimizers_list: a list of optimizers for all networks and differentiable variables
54 | state_dim: dimension of state for the environment
55 | action_dim: dimension of action for the environment
56 | replay_buffer_capacity: the size of buffer for storing explored samples
57 | action_range: value of each action in [-action_range, action_range]
58 | """
59 |
60 | model.learn(env, train_episodes=100, max_steps=150, batch_size=64, explore_steps=500, \
61 | update_itr=3, policy_target_update_interval=3, reward_scale=1., save_interval=10, \
62 | mode='train', AUTO_ENTROPY=True, render=False)
63 | """
64 | full list of parameters for training
65 | ---------------------------------------
66 | env: learning environment
67 | train_episodes: total number of episodes for training
68 | test_episodes: total number of episodes for testing
69 | max_steps: maximum number of steps for one episode
70 | batch_size: update batch size
71 | explore_steps: for random action sampling in the beginning of training
72 | update_itr: repeated updates for single step
73 | policy_target_update_interval: delayed update for the policy network and target networks
74 | reward_scale: value range of reward
75 | save_interval: timesteps for saving the weights and plotting the results
76 | mode: 'train' or 'test'
77 | AUTO_ENTROPY: automatically updating variable alpha for entropy
78 | DETERMINISTIC: stochastic action policy if False, otherwise deterministic
79 | render: if true, visualize the environment
80 | """
81 | # test
82 | model.learn(env, test_episodes=10, max_steps=150, mode='test', render=True)
83 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/td3/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/td3/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/td3/run_td3.py:
--------------------------------------------------------------------------------
1 | from rlzoo.algorithms.td3.td3 import TD3
2 | from rlzoo.common.policy_networks import *
3 | from rlzoo.common.value_networks import *
4 | import gym
5 |
6 | """ load environment """
7 | env = gym.make('Pendulum-v0').unwrapped
8 | # env = DummyVecEnv([lambda: env]) # The algorithms require a vectorized/wrapped environment to run
9 | action_shape = env.action_space.shape
10 | state_shape = env.observation_space.shape
11 | # reproducible
12 | seed = 2
13 | np.random.seed(seed)
14 | tf.random.set_seed(seed)
15 | env.seed(seed)
16 |
17 | """ build networks for the algorithm """
18 | num_hidden_layer = 2 # number of hidden layers for the networks
19 | hidden_dim = 64 # dimension of hidden layers for the networks
20 | with tf.name_scope('TD3'):
21 | with tf.name_scope('Q_Net1'):
22 | q_net1 = QNetwork(env.observation_space, env.action_space,
23 | hidden_dim_list=num_hidden_layer * [hidden_dim])
24 | with tf.name_scope('Q_Net2'):
25 | q_net2 = QNetwork(env.observation_space, env.action_space,
26 | hidden_dim_list=num_hidden_layer * [hidden_dim])
27 | with tf.name_scope('Target_Q_Net1'):
28 | target_q_net1 = QNetwork(env.observation_space, env.action_space,
29 | hidden_dim_list=num_hidden_layer * [hidden_dim])
30 | with tf.name_scope('Target_Q_Net2'):
31 | target_q_net2 = QNetwork(env.observation_space, env.action_space,
32 | hidden_dim_list=num_hidden_layer * [hidden_dim])
33 | with tf.name_scope('Policy'):
34 | policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space,
35 | hidden_dim_list=num_hidden_layer * [hidden_dim])
36 | with tf.name_scope('Target_Policy'):
37 | target_policy_net = DeterministicPolicyNetwork(env.observation_space, env.action_space,
38 | hidden_dim_list=num_hidden_layer * [hidden_dim])
39 | net_list = [q_net1, q_net2, target_q_net1, target_q_net2, policy_net, target_policy_net]
40 |
41 | """ choose optimizers """
42 | q_lr, policy_lr = 3e-4, 3e-4 # q_lr: learning rate of the Q network; policy_lr: learning rate of the policy network
43 | q_optimizer1 = tf.optimizers.Adam(q_lr)
44 | q_optimizer2 = tf.optimizers.Adam(q_lr)
45 | policy_optimizer = tf.optimizers.Adam(policy_lr)
46 | optimizers_list = [q_optimizer1, q_optimizer2, policy_optimizer]
47 |
48 | model = TD3(net_list, optimizers_list)
49 | """
50 | full list of arguments for the algorithm
51 | ----------------------------------------
52 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
53 | optimizers_list: a list of optimizers for all networks and differentiable variables
54 | state_dim: dimension of state for the environment
55 | action_dim: dimension of action for the environment
56 | replay_buffer_capacity: the size of buffer for storing explored samples
57 | policy_target_update_interval: delayed interval for updating the target policy
58 | action_range: value of each action in [-action_range, action_range]
59 | """
60 |
61 | model.learn(env, train_episodes=100, max_steps=150, batch_size=64, explore_steps=500, update_itr=3,
62 | reward_scale=1., save_interval=10, explore_noise_scale=1.0, eval_noise_scale=0.5, mode='train',
63 | render=False)
64 | """
65 | full list of parameters for training
66 | ---------------------------------------
67 | env: learning environment
68 | train_episodes: total number of episodes for training
69 | test_episodes: total number of episodes for testing
70 | max_steps: maximum number of steps for one episode
71 | batch_size: update batch size
72 | explore_steps: for random action sampling in the beginning of training
73 | update_itr: repeated updates for single step
74 | reward_scale: value range of reward
75 | save_interval: timesteps for saving the weights and plotting the results
76 | explore_noise_scale: range of action noise for exploration
77 | eval_noise_scale: range of action noise for evaluation of action value
78 | mode: 'train' or 'test'
79 | render: if true, visualize the environment
80 |
81 | """
82 | # test
83 | model.learn(env, test_episodes=10, max_steps=150, mode='test', render=True)
84 |
--------------------------------------------------------------------------------
/rlzoo/algorithms/trpo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/algorithms/trpo/__init__.py
--------------------------------------------------------------------------------
/rlzoo/algorithms/trpo/run_trpo.py:
--------------------------------------------------------------------------------
1 | from rlzoo.common.utils import set_seed
2 | from rlzoo.algorithms.trpo.trpo import TRPO
3 | from rlzoo.common.policy_networks import *
4 | from rlzoo.common.value_networks import *
5 | import gym
6 |
7 | """ load environment """
8 | env = gym.make('Pendulum-v0').unwrapped
9 |
10 | # reproducible
11 | seed = 2
12 | set_seed(seed, env)
13 |
14 | """ build networks for the algorithm """
15 | name = 'TRPO'
16 | hidden_dim = 64
17 | num_hidden_layer = 2
18 | critic = ValueNetwork(env.observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value')
19 |
20 | actor = StochasticPolicyNetwork(env.observation_space, env.action_space, [hidden_dim] * num_hidden_layer,
21 | output_activation=tf.nn.tanh, name=name + '_policy')
22 | net_list = critic, actor
23 |
24 | critic_lr = 1e-3
25 | optimizers_list = [tf.optimizers.Adam(critic_lr)]
26 |
27 | """ create model """
28 | model = TRPO(net_list, optimizers_list, damping_coeff=0.1, cg_iters=10, delta=0.01)
29 | """
30 | full list of arguments for the algorithm
31 | ----------------------------------------
32 | net_list: a list of networks (value and policy) used in the algorithm, from common functions or customization
33 | optimizers_list: a list of optimizers for all networks and differentiable variables
34 | damping_coeff: Artifact for numerical stability
35 | cg_iters: Number of iterations of conjugate gradient to perform
36 | delta: KL-divergence limit for TRPO update.
37 | """
38 |
39 | model.learn(env, mode='train', render=False, train_episodes=2000, max_steps=200, save_interval=100,
40 | gamma=0.9, batch_size=256, backtrack_iters=10, backtrack_coeff=0.8, train_critic_iters=80)
41 | """
42 | full list of parameters for training
43 | ---------------------------------------
44 | env: learning environment
45 | train_episodes: total number of episodes for training
46 | test_episodes: total number of episodes for testing
47 | max_steps: maximum number of steps for one episode
48 | save_interval: time steps for saving
49 | gamma: reward discount factor
50 | mode: train or test
51 | render: render each step
52 | batch_size: update batch size
53 | backtrack_iters: Maximum number of steps allowed in the backtracking line search
54 | backtrack_coeff: How far back to step during backtracking line search
55 | train_critic_iters: critic update iteration steps
56 | """
57 |
58 | model.learn(env, test_episodes=100, max_steps=200, mode='test', render=True)
59 |
--------------------------------------------------------------------------------
/rlzoo/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/common/__init__.py
--------------------------------------------------------------------------------
/rlzoo/common/basic_nets.py:
--------------------------------------------------------------------------------
1 | """Basic neural networks"""
2 | import tensorflow as tf
3 | import tensorlayer as tl
4 | from tensorlayer.layers import Dense, Input
5 | from gym import spaces
6 | from collections import OrderedDict
7 |
8 |
9 | def MLP(input_dim, hidden_dim_list, w_init=tf.initializers.Orthogonal(0.2),
10 | activation=tf.nn.relu, *args, **kwargs):
11 | """Multiple fully-connected layers for approximation
12 |
13 | :param input_dim: (int) size of input tensor
14 | :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers
15 | :param w_init: (callable) initialization method for weights
16 | :param activation: (callable) activation function of hidden layers
17 |
18 | Return:
19 | input tensor, output tensor
20 | """
21 |
22 | l = inputs = Input([None, input_dim])
23 | for i in range(len(hidden_dim_list)):
24 | l = Dense(n_units=hidden_dim_list[i], act=activation, W_init=w_init)(l)
25 | outputs = l
26 |
27 | return inputs, outputs
28 |
29 |
30 | def MLPModel(input_dim, hidden_dim_list, w_init=tf.initializers.Orthogonal(0.2),
31 | activation=tf.nn.relu, *args, **kwargs):
32 | """Multiple fully-connected layers for approximation
33 |
34 | :param input_dim: (int) size of input tensor
35 | :param hidden_dim_list: (list[int]) a list of dimensions of hidden layers
36 | :param w_init: (callable) initialization method for weights
37 | :param activation: (callable) activation function of hidden layers
38 |
39 | Return:
40 | input tensor, output tensor
41 | """
42 | l = inputs = Input([None, input_dim], name='Input_Layer')
43 | for i in range(len(hidden_dim_list)):
44 | l = Dense(n_units=hidden_dim_list[i], act=activation, W_init=w_init, name='Hidden_Layer%d' % (i + 1))(l)
45 | outputs = l
46 |
47 | return tl.models.Model(inputs=inputs, outputs=outputs)
48 |
49 |
50 | def CNN(input_shape, conv_kwargs=None):
51 | """Multiple convolutional layers for approximation
52 | Default setting is equal to architecture used in DQN
53 |
54 | :param input_shape: (tuple[int]) (H, W, C)
55 | :param conv_kwargs: (list[param]) list of conv parameters for tl.layers.Conv2d
56 |
57 | Return:
58 | input tensor, output tensor
59 | """
60 | if not conv_kwargs:
61 | in_channels = input_shape[-1]
62 | conv_kwargs = [
63 | {
64 | 'in_channels': in_channels, 'n_filter': 32, 'act': tf.nn.relu,
65 | 'filter_size': (8, 8), 'strides': (4, 4), 'padding': 'VALID',
66 | 'W_init': tf.initializers.GlorotUniform()
67 | },
68 | {
69 | 'in_channels': 32, 'n_filter': 64, 'act': tf.nn.relu,
70 | 'filter_size': (4, 4), 'strides': (2, 2), 'padding': 'VALID',
71 | 'W_init': tf.initializers.GlorotUniform()
72 | },
73 | {
74 | 'in_channels': 64, 'n_filter': 64, 'act': tf.nn.relu,
75 | 'filter_size': (3, 3), 'strides': (1, 1), 'padding': 'VALID',
76 | 'W_init': tf.initializers.GlorotUniform()
77 | }
78 | ]
79 | l = inputs = tl.layers.Input((1,) + input_shape)
80 |
81 | for i, kwargs in enumerate(conv_kwargs):
82 | # kwargs['name'] = kwargs.get('name', 'cnn_layer{}'.format(i + 1))
83 | l = tl.layers.Conv2d(**kwargs)(l)
84 | outputs = tl.layers.Flatten()(l)
85 |
86 | return inputs, outputs
87 |
88 |
89 | def CNNModel(input_shape, conv_kwargs=None):
90 | """Multiple convolutional layers for approximation
91 | Default setting is equal to architecture used in DQN
92 |
93 | :param input_shape: (tuple[int]) (H, W, C)
94 | :param conv_kwargs: (list[param]) list of conv parameters for tl.layers.Conv2d
95 |
96 | Return:
97 | tl.model.Model
98 | """
99 | if not conv_kwargs:
100 | in_channels = input_shape[-1]
101 | conv_kwargs = [
102 | {
103 | 'in_channels': in_channels, 'n_filter': 32, 'act': tf.nn.relu,
104 | 'filter_size': (8, 8), 'strides': (4, 4), 'padding': 'VALID',
105 | 'W_init': tf.initializers.GlorotUniform()
106 | },
107 | {
108 | 'in_channels': 32, 'n_filter': 64, 'act': tf.nn.relu,
109 | 'filter_size': (4, 4), 'strides': (2, 2), 'padding': 'VALID',
110 | 'W_init': tf.initializers.GlorotUniform()
111 | },
112 | {
113 | 'in_channels': 64, 'n_filter': 64, 'act': tf.nn.relu,
114 | 'filter_size': (3, 3), 'strides': (1, 1), 'padding': 'VALID',
115 | 'W_init': tf.initializers.GlorotUniform()
116 | }
117 | ]
118 |
119 | ni = tl.layers.Input((1,) + input_shape, name='CNN_Input')
120 | hi = ni
121 |
122 | for i, kwargs in enumerate(conv_kwargs):
123 | kwargs['name'] = kwargs.get('name', 'CNN_Layer{}'.format(i + 1))
124 | hi = tl.layers.Conv2d(**kwargs)(hi)
125 | no = tl.layers.Flatten(name='Flatten_Layer')(hi)
126 |
127 | return tl.models.Model(inputs=ni, outputs=no)
128 |
129 |
130 | def CreateInputLayer(state_space, conv_kwargs=None):
131 | def CreateSingleInput(single_state_space):
132 | single_state_shape = single_state_space.shape
133 | # build structure
134 | if len(single_state_shape) == 1:
135 | l = inputs = Input((None,) + single_state_shape, name='input_layer')
136 | else:
137 | with tf.name_scope('CNN'):
138 | inputs, l = CNN(single_state_shape, conv_kwargs=conv_kwargs)
139 | return inputs, l, single_state_shape
140 |
141 | if isinstance(state_space, spaces.Dict):
142 | input_dict, layer_dict, shape_dict = OrderedDict(), OrderedDict(), OrderedDict()
143 | for k, v in state_space.spaces.items():
144 | input_dict[k], layer_dict[k], shape_dict[k] = CreateSingleInput(v)
145 | return input_dict, layer_dict, shape_dict
146 |     elif isinstance(state_space, spaces.Space):
147 |         return CreateSingleInput(state_space)
148 |     else:
149 |         raise ValueError('Unsupported state space: {}'.format(state_space))
150 |
--------------------------------------------------------------------------------
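
A minimal usage sketch for the builders in basic_nets.py above, assuming TensorFlow 2.x and TensorLayer 2.x are installed; the Atari-style 84x84x4 input shape is purely illustrative.

import numpy as np
import tensorlayer as tl
from rlzoo.common.basic_nets import CNN, CNNModel

# build a DQN-style feature extractor from the raw input/output tensors
inputs, outputs = CNN((84, 84, 4))
feature_net = tl.models.Model(inputs=inputs, outputs=outputs)
feature_net.eval()

frame = np.zeros((1, 84, 84, 4), dtype=np.float32)  # one stacked frame
features = feature_net(frame)                        # flattened conv features
print(features.shape)                                # (1, 3136)

# CNNModel wraps the same conv stack directly into a tl.models.Model
model = CNNModel((84, 84, 4))
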
/rlzoo/common/build_rlbench_env.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from collections import OrderedDict
3 |
4 | import numpy as np
5 | from gym import spaces
6 |
7 | from pyrep.const import RenderMode
8 | from pyrep.objects.dummy import Dummy
9 | from pyrep.objects.vision_sensor import VisionSensor
10 | from rlbench.environment import Environment
11 | from rlbench.action_modes import ArmActionMode, ActionMode
12 | from rlbench.observation_config import ObservationConfig
13 | from rlbench.tasks import *
14 |
15 |
16 | # Don't forget to add: export PYTHONPATH=PATH_TO_YOUR_LOCAL_RLBENCH_REPO
17 |
18 | # list of state types
19 | state_types = ['left_shoulder_rgb',
20 | 'left_shoulder_depth',
21 | 'left_shoulder_mask',
22 | 'right_shoulder_rgb',
23 | 'right_shoulder_depth',
24 | 'right_shoulder_mask',
25 | 'wrist_rgb',
26 | 'wrist_depth',
27 | 'wrist_mask',
28 | 'joint_velocities',
29 | 'joint_velocities_noise',
30 | 'joint_positions',
31 | 'joint_positions_noise',
32 | 'joint_forces',
33 | 'joint_forces_noise',
34 | 'gripper_pose',
35 | 'gripper_touch_forces',
36 | 'task_low_dim_state']
37 |
38 |
39 | class RLBenchEnv():
40 |     """ Wrap an RLBench task with the same interface as an OpenAI Gym env """
41 |
42 |     def __init__(self, task_name: str, state_type='state'):
43 | # render_mode=None):
44 |         """
45 |         Create an RLBench environment.
46 |         :param task_name: a task name defined in rlbench.tasks, e.g. 'ReachTarget'
47 |         :param state_type: 'state', 'vision', or a sub-list of the state_types list above, e.g. ['left_shoulder_rgb']
48 |         """
49 | if state_type == 'state' or state_type == 'vision' or isinstance(state_type, list):
50 | self._state_type = state_type
51 | else:
52 | raise ValueError('State type value error, your value is {}'.format(state_type))
53 | # self._render_mode = render_mode
54 | self._render_mode = None
55 | obs_config = ObservationConfig()
56 | obs_config.set_all(True)
57 | action_mode = ActionMode(ArmActionMode.ABS_JOINT_VELOCITY)
58 | self.env = Environment(
59 | action_mode, obs_config=obs_config, headless=True)
60 | self.env.launch()
61 | try:
62 | self.task = self.env.get_task(getattr(sys.modules[__name__], task_name))
63 |         except AttributeError:
64 |             raise NotImplementedError('Task {} is not defined in rlbench.tasks'.format(task_name))
65 |
66 | _, obs = self.task.reset()
67 | self.spec = Spec(task_name)
68 |
69 | if self._state_type == 'state':
70 | self.observation_space = spaces.Box(
71 | low=-np.inf, high=np.inf, shape=obs.get_low_dim_data().shape)
72 | elif self._state_type == 'vision':
73 | space_dict = OrderedDict()
74 | space_dict["state"] = spaces.Box(
75 | low=-np.inf, high=np.inf, shape=obs.get_low_dim_data().shape)
76 | for i in ["left_shoulder_rgb", "right_shoulder_rgb", "wrist_rgb", "front_rgb"]:
77 | space_dict[i] = spaces.Box(
78 | low=0, high=1, shape=getattr(obs, i).shape)
79 | self.observation_space = spaces.Dict(space_dict)
80 | else:
81 | space_dict = OrderedDict()
82 | for name in self._state_type:
83 | if name.split('_')[-1] in ('rgb', 'depth', 'mask'):
84 | space_dict[name] = spaces.Box(
85 | low=0, high=1, shape=getattr(obs, name).shape)
86 | else:
87 | space_dict[name] = spaces.Box(
88 | low=-np.inf, high=np.inf,
89 | shape=getattr(obs, name).shape)
90 | self.observation_space = spaces.Dict(space_dict)
91 | self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(self.env.action_size,), dtype=np.float32)
92 |
93 | # if render_mode is not None:
94 | # # Add the camera to the scene
95 | # cam_placeholder = Dummy('cam_cinematic_placeholder')
96 | # self._gym_cam = VisionSensor.create([640, 360])
97 | # self._gym_cam.set_pose(cam_placeholder.get_pose())
98 | # if render_mode == 'human':
99 | # self._gym_cam.set_render_mode(RenderMode.OPENGL3_WINDOWED)
100 | # else:
101 | # self._gym_cam.set_render_mode(RenderMode.OPENGL3)
102 |
103 | def _extract_obs(self, obs):
104 | if self._state_type == 'state':
105 | return np.array(obs.get_low_dim_data(), np.float32)
106 | elif self._state_type == 'vision':
107 | return np.array([np.array(obs.get_low_dim_data(), np.float32),
108 | np.array(obs.left_shoulder_rgb, np.float32),
109 | np.array(obs.right_shoulder_rgb, np.float32),
110 | np.array(obs.wrist_rgb, np.float32),
111 | np.array(obs.front_rgb, np.float32), ])
112 | else:
113 |             result = ['tag']  # dummy entry forces an object array; removed before returning
114 | for name in self._state_type:
115 | result.append(np.array(getattr(obs, name), np.float32))
116 | return np.delete(np.array(result,), 0, 0)
117 |
118 | def seed(self, seed_value):
119 | # set seed as in openai.gym env
120 | pass
121 |
122 | def render(self, mode='human'):
123 | # todo render available at any time
124 | if self._render_mode is None:
125 | self._render_mode = mode
126 | # Add the camera to the scene
127 | cam_placeholder = Dummy('cam_cinematic_placeholder')
128 | self._gym_cam = VisionSensor.create([640, 360])
129 | self._gym_cam.set_pose(cam_placeholder.get_pose())
130 | if mode == 'human':
131 | self._gym_cam.set_render_mode(RenderMode.OPENGL3_WINDOWED)
132 | else:
133 | self._gym_cam.set_render_mode(RenderMode.OPENGL3)
134 |
135 | if mode != self._render_mode:
136 | raise ValueError(
137 | 'The render mode must match the render mode selected in the '
138 | 'constructor. \nI.e. if you want "human" render mode, then '
139 | 'create the env by calling: '
140 | 'gym.make("reach_target-state-v0", render_mode="human").\n'
141 | 'You passed in mode %s, but expected %s.' % (
142 | mode, self._render_mode))
143 | if mode == 'rgb_array':
144 | return self._gym_cam.capture_rgb()
145 |
146 | def reset(self):
147 | descriptions, obs = self.task.reset()
148 | return self._extract_obs(obs)
149 |
150 | def step(self, action):
151 | obs, reward, terminate = self.task.step(action)
152 |         return self._extract_obs(obs), reward, terminate, {}  # gym expects an info dict
153 |
154 | def close(self):
155 | self.env.shutdown()
156 |
157 |
158 | class Spec():
159 | """ a fake spec """
160 |
161 | def __init__(self, id_name):
162 | self.id = id_name
163 |
--------------------------------------------------------------------------------
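
A minimal rollout sketch for the wrapper above; it assumes RLBench, PyRep and CoppeliaSim are installed and PYTHONPATH is set as noted at the top of the file, and uses the 'ReachTarget' task as an example.

from rlzoo.common.build_rlbench_env import RLBenchEnv

env = RLBenchEnv('ReachTarget', state_type='state')
obs = env.reset()
for _ in range(10):
    action = env.action_space.sample()        # a random action from the Box space
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()
env.close()
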
/rlzoo/common/distributions.py:
--------------------------------------------------------------------------------
1 | """Definition of parametrized distributions. Adapted from openai/baselines"""
2 | import copy
3 | from functools import wraps
4 |
5 | import numpy as np
6 | import tensorflow as tf
7 | from gym import spaces
8 |
9 |
10 | def expand_dims(func):
11 | @wraps(func)
12 | def wrapper(*args, **kwargs):
13 | result = func(*args, **kwargs)
14 | result = tf.expand_dims(result, axis=-1)
15 | return result
16 |
17 | return wrapper
18 |
19 |
20 | class Distribution(object):
21 | """A particular probability distribution"""
22 |
23 | def set_param(self, *args, **kwargs):
24 | raise NotImplementedError
25 |
26 | def sample(self, *args, **kwargs):
27 |         """Sample from the distribution. Exploration parameters are allowed."""
28 | raise NotImplementedError
29 |
30 | def logp(self, x):
31 | """Calculate log probability of a sample."""
32 | return -self.neglogp(x)
33 |
34 | def neglogp(self, x):
35 | """Calculate negative log probability of a sample."""
36 | raise NotImplementedError
37 |
38 | def kl(self, *parameters):
39 | """Calculate Kullback–Leibler divergence"""
40 | raise NotImplementedError
41 |
42 | def entropy(self):
43 | """Calculate the entropy of distribution."""
44 | raise NotImplementedError
45 |
46 |
47 | class Categorical(Distribution):
48 | """Creates a categorical distribution"""
49 |
50 | def __init__(self, ndim, logits=None):
51 | """
52 | Args:
53 | ndim (int): total number of actions
54 | logits (tensor): logits variables
55 | """
56 | self._ndim = ndim
57 | self._logits = logits
58 | self.param = self._logits
59 |
60 | @property
61 | def ndim(self):
62 | return copy.copy(self._ndim)
63 |
64 | def set_param(self, logits):
65 | """
66 | Args:
67 | logits (tensor): logits variables to set
68 | """
69 | self._logits = logits
70 | self.param = self._logits
71 |
72 | def get_param(self):
73 | return copy.deepcopy(self._logits)
74 |
75 | def sample(self):
76 |         """ Sample actions from the distribution, using the Gumbel-max trick """
77 | u = np.array(np.random.uniform(0, 1, size=np.shape(self._logits)), dtype=np.float32)
78 | res = tf.argmax(self._logits - tf.math.log(-tf.math.log(u)), axis=-1)
79 | return res
80 |
81 | def greedy_sample(self):
82 | """ Get actions greedily """
83 | _probs = tf.nn.softmax(self._logits)
84 | return tf.argmax(_probs, axis=-1)
85 |
86 | def logp(self, x):
87 | return -self.neglogp(x)
88 |
89 | @expand_dims
90 | def neglogp(self, x):
91 | x = np.array(x)
92 | if np.any(x % 1):
93 | raise ValueError('Input float actions in discrete action space')
94 | x = tf.convert_to_tensor(x, tf.int32)
95 | x = tf.one_hot(x, self._ndim, axis=-1)
96 | return tf.nn.softmax_cross_entropy_with_logits(x, self._logits)
97 |
98 | @expand_dims
99 | def kl(self, logits):
100 | """
101 | Args:
102 | logits (tensor): logits variables of another distribution
103 | """
104 | a0 = self._logits - tf.reduce_max(self._logits, axis=-1, keepdims=True)
105 | a1 = logits - tf.reduce_max(logits, axis=-1, keepdims=True)
106 | ea0 = tf.exp(a0)
107 | ea1 = tf.exp(a1)
108 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
109 | z1 = tf.reduce_sum(ea1, axis=-1, keepdims=True)
110 | p0 = ea0 / z0
111 | return tf.reduce_sum(
112 | p0 * (a0 - tf.math.log(z0) - a1 + tf.math.log(z1)), axis=-1)
113 |
114 | @expand_dims
115 | def entropy(self):
116 | a0 = self._logits - tf.reduce_max(self._logits, axis=-1, keepdims=True)
117 | ea0 = tf.exp(a0)
118 | z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
119 | p0 = ea0 / z0
120 | return tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis=-1)
121 |
122 |
123 | class DiagGaussian(Distribution):
124 | """Creates a diagonal Gaussian distribution """
125 |
126 | def __init__(self, ndim, mean_logstd=None):
127 | """
128 | Args:
129 |             ndim (int): the dimension of actions
130 | mean_logstd (tensor): mean and logstd stacked on the last axis
131 | """
132 | self._ndim = ndim
133 | self.mean = None
134 | self.logstd = None
135 | self.std = None
136 | self.action_mean = None
137 | self.action_scale = None
138 | self.param = self.mean, self.logstd
139 | if mean_logstd is not None:
140 | self.set_param(mean_logstd)
141 |
142 | @property
143 | def ndim(self):
144 | return copy.copy(self._ndim)
145 |
146 | def set_param(self, mean_logstd):
147 | """
148 | Args:
149 | mean_logstd (tensor): mean and log std
150 | """
151 | self.mean, self.logstd = mean_logstd
152 | self.std = tf.math.exp(self.logstd)
153 | self.param = self.mean, self.logstd
154 |
155 | def get_param(self):
156 | """ Get parameters """
157 | return copy.deepcopy(self.mean), copy.deepcopy(self.logstd)
158 |
159 | def sample(self):
160 | """ Get actions in deterministic or stochastic manner """
161 | return self.mean, self.std * np.random.normal(0, 1, np.shape(self.mean))
162 |
163 | def greedy_sample(self):
164 | """ Get actions greedily/deterministically """
165 | return self.mean
166 |
167 | def logp(self, x):
168 | return -self.neglogp(x)
169 |
170 | @expand_dims
171 | def neglogp(self, x):
172 | # here we reverse the action normalization to make the computation of negative log probability correct
173 | x = (x - self.action_mean)/self.action_scale
174 |
175 | return 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \
176 | + 0.5 * np.log(2.0 * np.pi) * float(self._ndim) + tf.reduce_sum(self.logstd, axis=-1)
177 |
178 | @expand_dims
179 | def kl(self, mean_logstd):
180 | """
181 | Args:
182 | mean_logstd (tensor): mean and logstd of another distribution
183 | """
184 | mean, logstd = mean_logstd
185 | return tf.reduce_sum(
186 | logstd - self.logstd +
187 | (tf.square(self.std) + tf.square(self.mean - mean))
188 | / (2.0 * tf.square(tf.math.exp(logstd))) - 0.5, axis=-1)
189 |
190 | @expand_dims
191 | def entropy(self):
192 | return tf.reduce_sum(
193 | self.logstd + 0.5 * np.log(2.0 * np.pi * np.e), axis=-1)
194 |
195 |
196 | def make_dist(ac_space):
197 | """Get distribution based on action space
198 |
199 | :param ac_space: gym.spaces.Space
200 | """
201 | if isinstance(ac_space, spaces.Discrete):
202 | return Categorical(ac_space.n)
203 | elif isinstance(ac_space, spaces.Box):
204 | assert len(ac_space.shape) == 1
205 | return DiagGaussian(ac_space.shape[0])
206 | else:
207 | raise NotImplementedError
208 |
--------------------------------------------------------------------------------
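
A short sketch of how these distributions are driven by the policy networks; the logits below are hand-written constants rather than the output of a real network.

import tensorflow as tf
from gym import spaces
from rlzoo.common.distributions import make_dist

dist = make_dist(spaces.Discrete(3))      # -> Categorical(ndim=3)
logits = tf.constant([[1.0, 0.5, -0.5]])  # one batch entry of logits
dist.set_param(logits)

action = dist.sample()                    # stochastic, via the Gumbel-max trick
greedy = dist.greedy_sample()             # deterministic argmax
log_prob = dist.logp(action)              # log-probability of the sampled action
entropy = dist.entropy()
print(action.numpy(), greedy.numpy(), log_prob.numpy(), entropy.numpy())
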
/rlzoo/common/math_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Mathematical utility functions.
3 |
4 | # Requirements
5 | tensorflow==2.0.0a0
6 | tensorlayer==2.0.1
7 |
8 | """
9 |
10 |
11 | def flatten_dims(shapes):  # total number of elements for a given shape
12 | dim = 1
13 | for s in shapes:
14 | dim *= s
15 | return dim
16 |
--------------------------------------------------------------------------------
/rlzoo/common/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Utility functions.
3 |
4 | # Requirements
5 | tensorflow==2.0.0a0
6 | tensorlayer==2.0.1
7 |
8 | """
9 | import os
10 | import re
11 |
12 | import gym
13 | import matplotlib.pyplot as plt
14 | import numpy as np
15 | import tensorlayer as tl
16 | import tensorflow as tf
17 | from importlib import import_module
18 |
19 |
20 | def plot(episode_rewards, algorithm_name, env_name):
21 | """
22 | plot the learning curve, saved as ./img/algorithm_name-env_name.png
23 |
24 | :param episode_rewards: array of floats
25 | :param algorithm_name: string
26 | :param env_name: string
27 | """
28 | path = os.path.join('.', 'img')
29 | name = algorithm_name + '-' + env_name
30 | plt.figure(figsize=(10, 5))
31 | plt.title(name)
32 | plt.plot(np.arange(len(episode_rewards)), episode_rewards)
33 | plt.xlabel('Episode')
34 | plt.ylabel('Episode Reward')
35 | if not os.path.exists(path):
36 | os.makedirs(path)
37 | plt.savefig(os.path.join(path, name + '.png'))
38 | plt.close()
39 |
40 |
41 | def plot_save_log(episode_rewards, algorithm_name, env_name):
42 | """
43 | plot the learning curve, saved as ./img/algorithm_name-env_name.png,
44 | and save the rewards log as ./log/algorithm_name-env_name.npy
45 |
46 | :param episode_rewards: array of floats
47 | :param algorithm_name: string
48 | :param env_name: string
49 | """
50 | path = os.path.join('.', 'log')
51 | name = algorithm_name + '-' + env_name
52 | plot(episode_rewards, algorithm_name, env_name)
53 | if not os.path.exists(path):
54 | os.makedirs(path)
55 | np.save(os.path.join(path, name), episode_rewards)
56 |
57 |
58 | def save_model(model, model_name, algorithm_name, env_name):
59 | """
60 | save trained neural network model
61 |
62 | :param model: tensorlayer.models.Model
63 | :param model_name: string, e.g. 'model_sac_q1'
64 | :param algorithm_name: string, e.g. 'SAC'
65 | """
66 | name = algorithm_name + '-' + env_name
67 | path = os.path.join('.', 'model', name)
68 | if not os.path.exists(path):
69 | os.makedirs(path)
70 | tl.files.save_npz(model.trainable_weights, os.path.join(path, model_name))
71 |
72 |
73 | def load_model(model, model_name, algorithm_name, env_name):
74 | """
75 | load saved neural network model
76 |
77 | :param model: tensorlayer.models.Model
78 | :param model_name: string, e.g. 'model_sac_q1'
79 | :param algorithm_name: string, e.g. 'SAC'
80 | """
81 | name = algorithm_name + '-' + env_name
82 | path = os.path.join('.', 'model', name)
83 | try:
84 | param = tl.files.load_npz(path, model_name + '.npz')
85 | for p0, p1 in zip(model.trainable_weights, param):
86 | p0.assign(p1)
87 | except Exception as e:
88 |         print('Failed to load model!')
89 | raise e
90 |
91 |
92 | def parse_all_args(parser):
93 | """ Parse known and unknown args """
94 | common_options, other_args = parser.parse_known_args()
95 | other_options = dict()
96 | index = 0
97 | n = len(other_args)
98 | float_pattern = re.compile(r'^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$')
99 | while index < n: # only str, int and float type will be parsed
100 | if other_args[index].startswith('--'):
101 | if other_args[index].__contains__('='):
102 | key, value = other_args[index].split('=')
103 | index += 1
104 | else:
105 | key, value = other_args[index:index + 2]
106 | index += 2
107 | if re.match(float_pattern, value):
108 | value = float(value)
109 | if value.is_integer():
110 | value = int(value)
111 | other_options[key[2:]] = value
112 | return common_options, other_options
113 |
114 |
115 | def make_env(env_id):
116 | env = gym.make(env_id).unwrapped
117 |     # add env wrappers here
118 | return env
119 |
120 |
121 | def get_algorithm_module(algorithm, submodule):
122 | """ Get algorithm module in the corresponding folder """
123 | return import_module('.'.join(['rlzoo', 'algorithms', algorithm, submodule]))
124 |
125 |
126 | def call_default_params(env, envtype, alg, default_seed=True):
127 | """ Get the default parameters for training from the default script """
128 | alg = alg.lower()
129 | default = import_module('.'.join(['rlzoo', 'algorithms', alg, 'default']))
130 | params = getattr(default, envtype)(env,
131 |                                        default_seed)  # seed must be set manually in the main script if default_seed is False
132 | return params
133 |
134 |
135 | def set_seed(seed, env=None):
136 |     """ set random seed for reproducibility """
137 | if isinstance(env, list):
138 | assert isinstance(seed, list)
139 | for i in range(len(env)):
140 | env[i].seed(seed[i])
141 | seed = seed[0] # pick one seed for np and tf
142 | elif env is not None:
143 | env.seed(seed)
144 | np.random.seed(seed)
145 | tf.random.set_seed(seed)
146 |
--------------------------------------------------------------------------------
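
A small sketch of how parse_all_args in utils.py splits known and unknown command-line flags; the flag names below are made up for illustration, and the script has to be run from a command line so that sys.argv actually contains them.

import argparse
from rlzoo.common.utils import parse_all_args

parser = argparse.ArgumentParser()
parser.add_argument('--env', type=str, default='CartPole-v0')

# e.g. invoked as:  python my_script.py --env CartPole-v0 --lr=0.001 --batch_size 32
common_options, other_options = parse_all_args(parser)
print(common_options.env)   # parsed by argparse itself
print(other_options)        # unknown flags, e.g. {'lr': 0.001, 'batch_size': 32}
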
/rlzoo/distributed/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tensorlayer/RLzoo/e3ed8a57bd8130bd7b663f213a388ce972925f30/rlzoo/distributed/__init__.py
--------------------------------------------------------------------------------
/rlzoo/distributed/dis_components.py:
--------------------------------------------------------------------------------
1 | import enum
2 |
3 | import tensorflow as tf
4 | from kungfu.python import current_cluster_size, current_rank
5 | from kungfu.tensorflow.ops import (barrier, request_variable,
6 | request_variable_with_template,
7 | save_variable, subset_all_reduce)
8 | from kungfu.tensorflow.ops.queue import new_queue
9 |
10 |
11 | class Role(enum.Enum):
12 | Learner = 1
13 | Actor = 2
14 | Server = 3
15 |
16 |
17 | def show_role_name(role):
18 | return {
19 | Role.Learner: 'learner',
20 | Role.Actor: 'actor',
21 | Role.Server: 'server',
22 | }[role]
23 |
24 |
25 | def _interval(n, offset=0):
26 | return list(range(offset, offset + n))
27 |
28 |
29 | class Agent:
30 | def __init__(self, n_learners=1, n_actors=1, n_servers=1):
31 | rank = current_rank()
32 | size = current_cluster_size()
33 | if n_learners + n_actors + n_servers != size:
34 | raise RuntimeError('invalid cluster size')
35 | self._n_learners = n_learners
36 | self._n_actors = n_actors
37 | self._n_servers = n_servers
38 | self._global_rank = rank
39 | self._global_size = size
40 | roles = [Role.Learner] * n_learners + [Role.Actor] * n_actors + [Role.Server] * n_servers
41 | rank2role = dict(enumerate(roles))
42 | self._role = rank2role[rank]
43 | self._roles = {
44 | Role.Learner: _interval(n_learners),
45 | Role.Actor: _interval(n_actors, n_learners),
46 | Role.Server: _interval(n_servers, n_learners + n_actors),
47 | }
48 | self._role_sizes = {
49 | Role.Learner: n_learners,
50 | Role.Actor: n_actors,
51 | Role.Server: n_servers,
52 | }
53 | self._role_offsets = {
54 | Role.Learner: 0,
55 | Role.Actor: n_learners,
56 | Role.Server: n_learners + n_actors,
57 | }
58 | self._role_rank = self._global_rank - self._role_offsets[self._role]
59 | self._role_size = self._role_sizes[self._role]
60 |
61 | def _to_global_rank(self, role, role_rank):
62 | return int(self._role_offsets[role] + int(role_rank))
63 |
64 | # metadata APIs
65 | def role(self):
66 | return self._role
67 |
68 | def role_rank(self):
69 | return self._role_rank
70 |
71 | def role_size(self, role=None):
72 | if role is None:
73 | return self._role_size
74 | else:
75 | return self._role_sizes[role]
76 |
77 | # collective APIs
78 | def barrier(self):
79 | return barrier()
80 |
81 | def role_all_reduce(self, x):
82 | role_ranks = self._roles[self._role]
83 |         topology = list(range(self._global_size))
84 | for i in role_ranks:
85 | topology[i] = role_ranks[0]
86 | # TODO: generate subset topology
87 | return subset_all_reduce(x, topology)
88 |
89 | # p2p APIs
90 | def save(self, x, name=None):
91 | return save_variable(x, name=name)
92 |
93 | def request(self, role: Role, role_rank, name, shape, dtype):
94 | role_size = self._role_sizes[role]
95 |         assert 0 <= role_rank < role_size
96 | target = self._to_global_rank(role, role_rank)
97 | return request_variable(
98 | target,
99 | name=name,
100 | shape=shape,
101 | dtype=dtype,
102 | )
103 |
104 | def new_queue(self, src, dst):
105 |         """Create a uni-directional queue."""
106 | role1, rank1 = src
107 | role2, rank2 = dst
108 | srcRank = self._to_global_rank(role1, rank1)
109 | dstRank = self._to_global_rank(role2, rank2)
110 | return new_queue(srcRank, dstRank)
111 |
112 | def new_queue_pair(self, a, b):
113 | """create a pair of queues."""
114 | q1 = self.new_queue(a, b)
115 | q2 = self.new_queue(b, a)
116 | return q1, q2
117 |
118 |
119 | class LearnerExample:
120 | pass
121 |
122 |
123 | class ActorExample:
124 | pass
125 |
126 |
127 | class ServerExample:
128 | pass
129 |
--------------------------------------------------------------------------------
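
The Agent class above derives each process's role purely from its global rank. Constructing an Agent requires a running KungFu cluster, but the rank-to-role mapping it applies can be sketched stand-alone with plain strings (the counts mirror run_dis_train.sh below):

n_learners, n_actors, n_servers = 2, 2, 1
roles = ['learner'] * n_learners + ['actor'] * n_actors + ['server'] * n_servers
role_offsets = {'learner': 0, 'actor': n_learners, 'server': n_learners + n_actors}
for global_rank, role in enumerate(roles):
    role_rank = global_rank - role_offsets[role]
    print('global rank {} -> {} #{}'.format(global_rank, role, role_rank))
# global ranks 0-1 become learners, 2-3 actors, and 4 the parameter server
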
/rlzoo/distributed/run_dis_train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | cd $(dirname $0)
5 |
6 | kungfu_flags() {
7 | echo -q
8 | echo -logdir logs
9 |
10 | local ip1=127.0.0.1
11 | local np1=$np
12 |
13 | local ip2=127.0.0.10
14 | local np2=$np
15 | local H=$ip1:$np1,$ip2:$np2
16 | local m=cpu,gpu
17 |
18 | echo -H $ip1:$np1
19 | }
20 |
21 | prun() {
22 | local np=$1
23 | shift
24 | kungfu-run $(kungfu_flags) -np $np $@
25 | }
26 |
27 | n_learner=2
28 | n_actor=2
29 | n_server=1
30 |
31 | flags() {
32 | echo -l $n_learner
33 | echo -a $n_actor
34 | echo -s $n_server
35 | }
36 |
37 | rl_run() {
38 | local n=$((n_learner + n_actor + n_server))
39 | prun $n python3 training_components.py $(flags)
40 | }
41 |
42 | main() {
43 | rl_run
44 | }
45 |
46 | main
47 |
--------------------------------------------------------------------------------
/rlzoo/distributed/start_dis_role.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from rlzoo.distributed.dis_components import *
4 | import tensorflow as tf
5 | import numpy as np
6 |
7 |
8 | def parse_args():
9 | p = argparse.ArgumentParser()
10 | p.add_argument('-l', type=int, default=1)
11 | p.add_argument('-a', type=int, default=1)
12 | p.add_argument('-s', type=int, default=1)
13 | p.add_argument('-f', type=str, default='') # config.json
14 |
15 | args = p.parse_args()
16 | return args
17 |
18 |
19 | def run_learner(agent, args, training_conf, env_conf, agent_conf):
20 | agent_generator = agent_conf['agent_generator']
21 | total_step, traj_len, train_n_traj = training_conf['total_step'], training_conf['traj_len'], training_conf['train_n_traj'],
22 | obs_shape, act_shape = env_conf['obs_shape'], env_conf['act_shape']
23 |
24 | if agent.role_rank() == 0:
25 | param_q = agent.new_queue((Role.Learner, 0), (Role.Server, 0))
26 |
27 | traj_q = agent.new_queue((Role.Server, 0), (Role.Learner, agent.role_rank()))
28 |
29 | rl_agent = agent_generator()
30 | rl_agent.init_components()
31 |
32 | # init model
33 | rl_agent.update_model([agent.role_all_reduce(weights) for weights in rl_agent.all_weights])
34 |
35 | if agent.role_rank() == 0:
36 | for weight in rl_agent.all_weights:
37 | param_q.put(tf.Variable(weight, dtype=tf.float32))
38 |
39 | n_update = total_step // (traj_len * agent.role_size(Role.Learner) * train_n_traj)
40 | for i in range(n_update):
41 | traj_list = [[traj_q.get(dtype=tf.float32, shape=(traj_len, *shape)) for shape in [
42 | obs_shape, act_shape, (), (), obs_shape, (), (1,)]] for _ in range(train_n_traj)]
43 |
44 | rl_agent.train(traj_list, dis_agent=agent)
45 |
46 | # send weights to server
47 | if agent.role_rank() == 0:
48 | for weight in rl_agent.all_weights:
49 | param_q.put(tf.Variable(weight, dtype=tf.float32))
50 | print('learner finished')
51 |
52 |
53 | def run_actor(agent, args, training_conf, env_conf): # sampler
54 | env_maker, total_step = env_conf['env_maker'], training_conf['total_step']
55 |
56 | from gym import spaces
57 |
58 | env = env_maker()
59 | action_q, step_data_q = agent.new_queue_pair((Role.Server, 0), (Role.Actor, agent.role_rank()))
60 |
61 | state, reward, done = env.reset(), 0, False
62 | each_total_step = int(total_step/agent.role_size(Role.Actor))
63 | action_dtype = tf.int32 if isinstance(env.action_space, spaces.Discrete) else tf.float32
64 | for i in range(each_total_step):
65 | step_data_q.put(tf.Variable(state, dtype=tf.float32))
66 | a = action_q.get(dtype=action_dtype, shape=env.action_space.shape).numpy()
67 | next_state, reward, done, _ = env.step(a)
68 | for data in (reward, done, next_state):
69 | step_data_q.put(tf.Variable(data, dtype=tf.float32))
70 | if done:
71 | state = env.reset()
72 | else:
73 | state = next_state
74 | print('actor finished')
75 |
76 |
77 | def run_server(agent, args, training_conf, env_conf, agent_conf):
78 | total_step, traj_len, train_n_traj, save_interval = training_conf['total_step'], training_conf['traj_len'], \
79 | training_conf['train_n_traj'], training_conf['save_interval'],
80 | obs_shape, env_name = env_conf['obs_shape'], env_conf['env_name']
81 | agent_generator = agent_conf['agent_generator']
82 |
83 | from rlzoo.algorithms.dppo_clip_distributed.dppo_clip import DPPO_CLIP
84 | from rlzoo.distributed.dis_components import Role
85 | from gym import spaces
86 |
87 | learner_size = agent.role_size(Role.Learner)
88 | rl_agent: DPPO_CLIP = agent_generator()
89 | rl_agent.init_components()
90 |
91 | # queue to actor
92 | q_list = [agent.new_queue_pair((Role.Server, 0), (Role.Actor, i)) for i in
93 | range(agent.role_size(Role.Actor))]
94 | action_q_list, step_data_q_list = zip(*q_list)
95 |
96 | # queue to learner
97 | param_q = agent.new_queue((Role.Learner, 0), (Role.Server, 0))
98 | traj_q_list = [agent.new_queue((Role.Server, 0), (Role.Learner, i)) for i in
99 | range(agent.role_size(Role.Learner))]
100 |
101 |     # sync net weights from learner
102 | all_weights = [param_q.get(dtype=weight.dtype, shape=weight.shape) for weight in rl_agent.all_weights]
103 | rl_agent.update_model(all_weights)
104 |
105 | train_cnt = 0
106 | action_dtype = tf.int32 if isinstance(rl_agent.actor.action_space, spaces.Discrete) else tf.float32
107 |
108 | curr_step = 0
109 |
110 | total_reward_list = []
111 | curr_reward_list = []
112 | tmp_eps_reward = 0
113 | while curr_step < total_step:
114 | # tmp_eps_reward = 0 # todo env with no end
115 | for _ in range(traj_len):
116 | curr_step += agent.role_size(Role.Actor)
117 |
118 | state_list = []
119 | for step_data_q in step_data_q_list:
120 | state_list.append(step_data_q.get(dtype=tf.float32, shape=obs_shape))
121 |
122 | action_list, log_p_list = rl_agent.get_action(state_list, batch_data=True)
123 |
124 | for action_q, action in zip(action_q_list, action_list):
125 | action_q.put(tf.Variable(action, dtype=action_dtype))
126 | reward_list, done_list, next_state_list = [], [], [],
127 | for i, step_data_q in enumerate(step_data_q_list):
128 | reward = step_data_q.get(dtype=tf.float32, shape=())
129 | if i == 0:
130 | tmp_eps_reward += reward
131 | reward_list.append(reward)
132 | done = step_data_q.get(dtype=tf.float32, shape=())
133 | if i == 0 and done:
134 | curr_reward_list.append(tmp_eps_reward)
135 | tmp_eps_reward = 0
136 | done_list.append(done)
137 | next_state_list.append(step_data_q.get(dtype=tf.float32, shape=obs_shape))
138 | rl_agent.collect_data(state_list, action_list, reward_list, done_list, next_state_list, log_p_list, True)
139 |
140 | rl_agent.update_traj_list()
141 |
142 | # send traj to each learner and update weight
143 | learn_traj_len = learner_size * train_n_traj
144 | if len(rl_agent.traj_list) >= learn_traj_len:
145 | train_cnt += 1
146 |
147 | # todo env with end
148 | avg_eps_reward = None
149 | if curr_reward_list:
150 | avg_eps_reward = np.mean(curr_reward_list)
151 | curr_reward_list.clear()
152 | total_reward_list.append(avg_eps_reward)
153 |
154 | # todo env with no end
155 | # avg_eps_reward = tmp_eps_reward
156 | # total_reward_list.append(np.array(avg_eps_reward))
157 |
158 | print('Training iters: {}, steps so far: {}, average eps reward: {}'.format(
159 | train_cnt, curr_step, np.array(avg_eps_reward)))
160 |
161 | rl_agent.plot_save_log(total_reward_list, env_name)
162 |
163 | traj_iter = iter(rl_agent.traj_list[:learn_traj_len])
164 | rl_agent.traj_list = rl_agent.traj_list[learn_traj_len:]
165 |
166 | # send traj data to each learner
167 | for i, traj_q in enumerate(traj_q_list):
168 | for _ in range(train_n_traj):
169 | try:
170 | traj_data = next(traj_iter)
171 | except StopIteration:
172 | break
173 | for data in traj_data:
174 | traj_q.put(tf.Variable(data, dtype=tf.float32))
175 |
176 |         # sync net weights from learner
177 | all_weights = [param_q.get(dtype=weight.dtype, shape=weight.shape) for weight in rl_agent.all_weights]
178 | rl_agent.update_model(all_weights)
179 |
180 | # save model
181 | if not train_cnt % save_interval:
182 | rl_agent.save_ckpt(env_name)
183 |
184 | # save the final model
185 | rl_agent.save_ckpt(env_name)
186 | print('Server Finished.')
187 |
188 |
189 | def main(training_conf, env_conf, agent_conf):
190 | args = parse_args()
191 | agent = Agent(n_learners=args.l, n_actors=args.a, n_servers=args.s)
192 |
193 | print('%s : %d/%d' % (agent.role(), agent.role_rank(), agent.role_size()))
194 |
195 | agent.barrier()
196 |
197 | if agent.role() == Role.Learner:
198 | run_learner(agent, args, training_conf, env_conf, agent_conf)
199 | elif agent.role() == Role.Actor:
200 | run_actor(agent, args, training_conf, env_conf)
201 | elif agent.role() == Role.Server:
202 | run_server(agent, args, training_conf, env_conf, agent_conf)
203 | else:
204 | raise RuntimeError('Invalid Role.')
205 |
206 | agent.barrier()
207 |
--------------------------------------------------------------------------------
/rlzoo/distributed/training_components.py:
--------------------------------------------------------------------------------
1 | from rlzoo.common.env_wrappers import build_env
2 | from rlzoo.common.policy_networks import *
3 | from rlzoo.common.value_networks import *
4 | from rlzoo.algorithms.dppo_clip_distributed.dppo_clip import DPPO_CLIP
5 | from functools import partial
6 |
7 | # Specify the training configurations
8 | training_conf = {
9 | 'total_step': int(1e7), # overall training timesteps
10 | 'traj_len': 200, # length of the rollout trajectory
11 |     'train_n_traj': 2,  # number of trajectories each learner consumes per model update
12 |     'save_interval': 10,  # save the models every this many updates
13 | }
14 |
15 | # Specify the environment and launch it
16 | env_name, env_type = 'CartPole-v0', 'classic_control'
17 | env_maker = partial(build_env, env_name, env_type)
18 | temp_env = env_maker()
19 | obs_shape, act_shape = temp_env.observation_space.shape, temp_env.action_space.shape
20 |
21 | env_conf = {
22 | 'env_name': env_name,
23 | 'env_type': env_type,
24 | 'env_maker': env_maker,
25 | 'obs_shape': obs_shape,
26 | 'act_shape': act_shape,
27 | }
28 |
29 |
30 | def build_network(observation_space, action_space, name='DPPO_CLIP'):
31 | """ build networks for the algorithm """
32 | hidden_dim = 256
33 | num_hidden_layer = 2
34 | critic = ValueNetwork(observation_space, [hidden_dim] * num_hidden_layer, name=name + '_value')
35 |
36 | actor = StochasticPolicyNetwork(observation_space, action_space,
37 | [hidden_dim] * num_hidden_layer,
38 | trainable=True,
39 | name=name + '_policy')
40 | return critic, actor
41 |
42 |
43 | def build_opt(actor_lr=1e-4, critic_lr=2e-4):
44 | """ choose the optimizer for learning """
45 | import tensorflow as tf
46 | return [tf.optimizers.Adam(critic_lr), tf.optimizers.Adam(actor_lr)]
47 |
48 |
49 | net_builder = partial(build_network, temp_env.observation_space, temp_env.action_space)
50 | opt_builder = partial(build_opt, )
51 |
52 | agent_conf = {
53 | 'net_builder': net_builder,
54 | 'opt_builder': opt_builder,
55 | 'agent_generator': partial(DPPO_CLIP, net_builder, opt_builder),
56 | }
57 | del temp_env
58 |
59 | from rlzoo.distributed.start_dis_role import main
60 |
61 | print('Start Training.')
62 | main(training_conf, env_conf, agent_conf)
63 | print('Training Finished.')
64 |
--------------------------------------------------------------------------------
/rlzoo/interactive/.gitignore:
--------------------------------------------------------------------------------
1 | img/
2 | log/
3 | model/
4 |
--------------------------------------------------------------------------------
/rlzoo/interactive/common.py:
--------------------------------------------------------------------------------
1 | import decimal
2 |
3 | import ipywidgets as widgets
4 | import numpy as np
5 |
6 | border_list = [None, 'hidden', 'dotted', 'dashed', 'solid', 'double',
7 | 'groove', 'ridge', 'inset', 'outset', 'inherit']
8 |
9 |
10 | class NumInput(widgets.HBox):
11 |
12 | def __init__(self, init_value, step=None, range_min=None, range_max=None):
13 | self.range = [range_min, range_max]
14 | range_min = 0 if range_min is None else range_min
15 | range_max = init_value * 2 if range_max is None else range_max
16 | self.range_size = max([range_max - init_value, init_value - range_min])
17 | if step is None:
18 | fs = decimal.Decimal(str(init_value)).as_tuple().exponent
19 | self.decimals = -fs
20 | step = np.round(np.power(0.1, self.decimals), self.decimals)
21 | else:
22 | fs = decimal.Decimal(str(step)).as_tuple().exponent
23 | fv = decimal.Decimal(str(init_value)).as_tuple().exponent
24 | self.decimals = -min(fs, fv)
25 |
26 | self.step = step
27 |
28 | self.slider = widgets.FloatSlider(
29 | value=init_value,
30 | min=range_min,
31 | max=range_max,
32 | step=step,
33 | description='Slider input:',
34 | disabled=False,
35 | continuous_update=False,
36 | orientation='horizontal',
37 | readout=True,
38 | readout_format='.' + str(self.decimals) + 'f'
39 | )
40 |
41 | self.text = widgets.FloatText(
42 | value=self.slider.value,
43 | description='Manual input:',
44 | disabled=False
45 | )
46 |
47 | def __extend_max(change):
48 | num_new = np.around(change['new'], decimals=self.decimals)
49 | num_old = change['old']
50 | if num_new > num_old:
51 | if num_new - num_old > (self.slider.max - num_old) / 2:
52 | self.range_size *= 2
53 | else:
54 | self.range_size *= 0.5
55 | else:
56 | if num_old - num_new > (num_old - self.slider.min) / 2:
57 | self.range_size *= 2
58 | else:
59 | self.range_size *= 0.5
60 |
61 | if self.range_size < self.step * 10:
62 | self.range_size = self.step * 10
63 |
64 | self.slider.min = num_new - self.range_size if self.range[0] is None else self.range[0]
65 | self.slider.max = num_new + self.range_size if self.range[1] is None else self.range[1]
66 | self.slider.value = num_new
67 | self.text.value = num_new
68 |
69 | self.slider.observe(__extend_max, names='value')
70 | self.text.observe(__extend_max, names='value')
71 | box_layout = widgets.Layout(display='flex',
72 | align_items='stretch',
73 | justify_content='center', )
74 | # self.frame = widgets.HBox([self.slider, self.text], layout=box_layout)
75 | super().__init__([self.slider, self.text], layout=box_layout)
76 | self._int_type = False
77 |         if isinstance(init_value, (int, np.int16, np.int32, np.int64)) \
78 |                 and step % 1 == 0:
79 |             # treat the widget as integer-valued only when the step is integral too
80 | self._int_type = True
81 |
82 | @property
83 | def value(self):
84 | result = self.slider.value
85 | if self._int_type:
86 | result = int(result)
87 | return result
88 |
89 |
90 | class Border:
91 | def __init__(self, element_list, description=None, size=5, style=0):
92 | if not isinstance(element_list, list):
93 | element_list = [element_list]
94 |
95 | box_layout = widgets.Layout(display='flex',
96 | flex_flow='column',
97 | align_items='flex-start',
98 | align_content='flex-start',
99 | # justify_content='center',
100 | justify_content='space-around',
101 | border=border_list[2]
102 | )
103 | frame = widgets.Box(children=element_list, layout=box_layout)
104 |
105 | if description is not None:
106 |             caption = widgets.HTML(value=description)
107 | children = [caption, frame]
108 | else:
109 | children = [frame]
110 |
111 | box_layout = widgets.Layout(display='flex',
112 | flex_flow='column',
113 | align_items='center',
114 | justify_content='center',
115 | border=border_list[style], )
116 | self.frame = widgets.Box(children=children, layout=box_layout)
117 |
118 |
119 | class InfoDisplay:
120 | def __init__(self, description, detail):
121 | label = widgets.Label(description)
122 | self.data = widgets.Label(detail)
123 | self.frame = widgets.HBox([label, self.data], layout=widgets.Layout(justify_content='flex-start', ))
124 | # border=border_list[2]))
125 |
--------------------------------------------------------------------------------
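
A minimal notebook sketch for the widgets above, assuming ipywidgets is installed and the code runs inside a Jupyter notebook such as rlzoo/interactive/main.ipynb; the learning-rate widget is only an example.

from IPython.display import display
from rlzoo.interactive.common import NumInput, Border

lr_input = NumInput(init_value=0.001, step=0.0001, range_min=0.0)
display(Border([lr_input], description='learning rate').frame)

# later, read back the value chosen by the user
print(lr_input.value)
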
/rlzoo/run_rlzoo.py:
--------------------------------------------------------------------------------
1 | from rlzoo.common.env_wrappers import *
2 | from rlzoo.common.utils import *
3 | from rlzoo.algorithms import *
4 |
5 | # EnvName = 'PongNoFrameskip-v4'
6 | # EnvType = 'atari'
7 |
8 | # EnvName = 'CartPole-v0'
9 | EnvName = 'Pendulum-v0'
10 | EnvType = 'classic_control'
11 |
12 | # EnvName = 'BipedalWalker-v2'
13 | # EnvType = 'box2d'
14 |
15 | # EnvName = 'Ant-v2'
16 | # EnvType = 'mujoco'
17 |
18 | # EnvName = 'FetchPush-v1'
19 | # EnvType = 'robotics'
20 |
21 | # EnvName = 'FishSwim-v0'
22 | # EnvType = 'dm_control'
23 |
24 | # EnvName = 'ReachTarget'
25 | # EnvType = 'rlbench'
26 | # env = build_env(EnvName, EnvType, state_type='vision')
27 |
28 | AlgName = 'SAC'
29 | env = build_env(EnvName, EnvType)
30 | alg_params, learn_params = call_default_params(env, EnvType, AlgName)
31 | alg = eval(AlgName+'(**alg_params)')
32 | alg.learn(env=env, mode='train', render=False, **learn_params)
33 | alg.learn(env=env, mode='test', render=True, **learn_params)
34 |
35 | # AlgName = 'DPPO'
36 | # number_workers = 2 # need to specify number of parallel workers in parallel algorithms like A3C and DPPO
37 | # env = build_env(EnvName, EnvType, nenv=number_workers)
38 | # alg_params, learn_params = call_default_params(env, EnvType, AlgName)
39 | # alg_params['method'] = 'clip' # specify 'clip' or 'penalty' method for different version of PPO and DPPO
40 | # alg = eval(AlgName+'(**alg_params)')
41 | # alg.learn(env=env, mode='train', render=False, **learn_params)
42 | # alg.learn(env=env, mode='test', render=True, **learn_params)
43 |
44 | # AlgName = 'PPO'
45 | # env = build_env(EnvName, EnvType)
46 | # alg_params, learn_params = call_default_params(env, EnvType, AlgName)
47 | # alg_params['method'] = 'clip' # specify 'clip' or 'penalty' method for different version of PPO and DPPO
48 | # alg = eval(AlgName+'(**alg_params)')
49 | # alg.learn(env=env, mode='train', render=False, **learn_params)
50 | # alg.learn(env=env, mode='test', render=True, **learn_params)
51 |
52 | # AlgName = 'A3C'
53 | # number_workers = 2 # need to specify number of parallel workers
54 | # env = build_env(EnvName, EnvType, nenv=number_workers)
55 | # alg_params, learn_params = call_default_params(env, EnvType, 'A3C')
56 | # alg = eval(AlgName+'(**alg_params)')
57 | # alg.learn(env=env, mode='train', render=False, **learn_params)
58 | # alg.learn(env=env, mode='test', render=True, **learn_params)
59 |
60 | env.close()
61 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | # Read requirements.txt, ignore comments
4 | try:
5 | REQUIRES = list()
6 | f = open("requirements.txt", "rb")
7 | for line in f.read().decode("utf-8").split("\n"):
8 | line = line.strip()
9 | if "#" in line:
10 | line = line[:line.find("#")].strip()
11 | if line:
12 | REQUIRES.append(line)
13 | except FileNotFoundError:
14 | print("'requirements.txt' not found!")
15 | REQUIRES = list()
16 |
17 | setup(
18 | name = "rlzoo",
19 | version = "1.0.4",
20 | include_package_data=True,
21 | author='Zihan Ding, Tianyang Yu, Yanhua Huang, Hongming Zhang, Hao Dong',
22 | author_email='zhding@mail.ustc.edu.cn',
23 | url = "https://github.com/tensorlayer/RLzoo" ,
24 | license = "apache" ,
25 | packages = find_packages(),
26 | install_requires=REQUIRES,
27 | description = "A collection of reinforcement learning algorithms with hierarchical code structure and convenient APIs.",
28 |     keywords = "Reinforcement Learning",
29 |     platforms=['any'],
30 | python_requires='>=3.5',
31 | )
32 |
--------------------------------------------------------------------------------