├── .gitignore ├── .travis.yml ├── LICENSE ├── docs ├── Makefile ├── _static │ ├── css │ │ └── modify.css │ ├── openai-favicon2_32x32.ico │ ├── openai-favicon2_32x32.png │ └── openai_icon.ico ├── algorithms │ ├── ddpg.rst │ ├── ppo.rst │ ├── sac.rst │ ├── td3.rst │ ├── trpo.rst │ └── vpg.rst ├── conf.py ├── docs_requirements.txt ├── etc │ ├── acknowledgements.rst │ └── author.rst ├── images │ ├── alphago.jpg │ ├── bench │ │ ├── bench_ant.svg │ │ ├── bench_halfcheetah.svg │ │ ├── bench_hopper.svg │ │ ├── bench_swim.svg │ │ └── bench_walker.svg │ ├── ex2-1_trpo_hopper.png │ ├── ex2-2_ddpg_bug.svg │ ├── ex2-2_ddpg_bug_pytorch.png │ ├── knocked-over-stand-up.mp4 │ ├── knocked_down_standup.png │ ├── logo.png │ ├── ms_pacman.png │ ├── openai-favicon.png │ ├── openai-favicon2.png │ ├── openai-favicon2_32x32.png │ ├── plots │ │ ├── ddpg │ │ │ ├── ddpg_ant_performance.svg │ │ │ ├── ddpg_halfcheetah_performance.svg │ │ │ ├── ddpg_hopper_performance.svg │ │ │ ├── ddpg_swimmer_performance.svg │ │ │ └── ddpg_walker2d_performance.svg │ │ ├── ppo │ │ │ ├── ppo_ant_performance.svg │ │ │ ├── ppo_halfcheetah_performance.svg │ │ │ ├── ppo_hopper_performance.svg │ │ │ ├── ppo_swimmer_performance.svg │ │ │ └── ppo_walker2d_performance.svg │ │ ├── pyt │ │ │ ├── pytorch_ant_performance.svg │ │ │ ├── pytorch_halfcheetah_performance.svg │ │ │ ├── pytorch_hopper_performance.svg │ │ │ ├── pytorch_swimmer_performance.svg │ │ │ └── pytorch_walker2d_performance.svg │ │ ├── sac │ │ │ ├── sac_ant_performance.svg │ │ │ ├── sac_halfcheetah_performance.svg │ │ │ ├── sac_hopper_performance.svg │ │ │ ├── sac_swimmer_performance.svg │ │ │ └── sac_walker2d_performance.svg │ │ ├── td3 │ │ │ ├── td3_ant_performance.svg │ │ │ ├── td3_halfcheetah_performance.svg │ │ │ ├── td3_hopper_performance.svg │ │ │ ├── td3_swimmer_performance.svg │ │ │ └── td3_walker2d_performance.svg │ │ ├── tf1 │ │ │ ├── tensorflow_ant_performance.svg │ │ │ ├── tensorflow_halfcheetah_performance.svg │ │ │ ├── tensorflow_hopper_performance.svg │ │ │ ├── tensorflow_swimmer_performance.svg │ │ │ └── tensorflow_walker2d_performance.svg │ │ └── vpg │ │ │ ├── vpg_ant_performance.svg │ │ │ ├── vpg_halfcheetah_performance.svg │ │ │ ├── vpg_hopper_performance.svg │ │ │ ├── vpg_swimmer_performance.svg │ │ │ └── vpg_walker2d_performance.svg │ ├── recolored_logo.png │ ├── rl_algorithms.png │ ├── rl_algorithms.svg │ ├── rl_algorithms.xml │ ├── rl_algorithms_9_12.png │ ├── rl_algorithms_9_15.svg │ ├── rl_algorithms_9_15.xml │ ├── rl_diagram_transparent_bg.png │ ├── spinning-up-in-rl.png │ ├── spinning-up-logo.png │ ├── spinning-up-logo.svg │ └── spinning-up-logo2.png ├── index.rst ├── make.bat ├── spinningup │ ├── bench.rst │ ├── bench_ddpg.rst │ ├── bench_ppo.rst │ ├── bench_sac.rst │ ├── bench_td3.rst │ ├── bench_vpg.rst │ ├── exercise2_1_soln.rst │ ├── exercise2_2_soln.rst │ ├── exercises.rst │ ├── extra_pg_proof1.rst │ ├── extra_pg_proof2.rst │ ├── extra_tf_pg_implementation.rst │ ├── keypapers.rst │ ├── rl_intro.rst │ ├── rl_intro2.rst │ ├── rl_intro3.rst │ ├── rl_intro4.rst │ └── spinningup.rst ├── user │ ├── algorithms.rst │ ├── installation.rst │ ├── introduction.rst │ ├── plotting.rst │ ├── running.rst │ └── saving_and_loading.rst └── utils │ ├── logger.rst │ ├── mpi.rst │ ├── plotter.rst │ └── run_utils.rst ├── readme.md ├── readthedocs.yml ├── setup.py ├── spinup ├── __init__.py ├── algos │ ├── __init__.py │ ├── pytorch │ │ ├── ddpg │ │ │ ├── core.py │ │ │ └── ddpg.py │ │ ├── ppo │ │ │ ├── core.py │ │ │ └── ppo.py │ │ ├── sac │ │ │ ├── core.py │ │ │ └── sac.py 
│ │ ├── td3 │ │ │ ├── core.py │ │ │ └── td3.py │ │ ├── trpo │ │ │ └── trpo.py │ │ └── vpg │ │ │ ├── core.py │ │ │ └── vpg.py │ └── tf1 │ │ ├── ddpg │ │ ├── __init__.py │ │ ├── core.py │ │ └── ddpg.py │ │ ├── ppo │ │ ├── __init__.py │ │ ├── core.py │ │ └── ppo.py │ │ ├── sac │ │ ├── __init__.py │ │ ├── core.py │ │ └── sac.py │ │ ├── td3 │ │ ├── __init__.py │ │ ├── core.py │ │ └── td3.py │ │ ├── trpo │ │ ├── __init__.py │ │ ├── core.py │ │ └── trpo.py │ │ └── vpg │ │ ├── __init__.py │ │ ├── core.py │ │ └── vpg.py ├── examples │ ├── pytorch │ │ ├── bench_ppo_cartpole.py │ │ └── pg_math │ │ │ ├── 1_simple_pg.py │ │ │ └── 2_rtg_pg.py │ └── tf1 │ │ ├── bench_ppo_cartpole.py │ │ ├── pg_math │ │ ├── 1_simple_pg.py │ │ └── 2_rtg_pg.py │ │ └── train_mnist.py ├── exercises │ ├── common.py │ ├── pytorch │ │ ├── problem_set_1 │ │ │ ├── exercise1_1.py │ │ │ ├── exercise1_2.py │ │ │ ├── exercise1_2_auxiliary.py │ │ │ └── exercise1_3.py │ │ ├── problem_set_1_solutions │ │ │ ├── exercise1_1_soln.py │ │ │ └── exercise1_2_soln.py │ │ └── problem_set_2 │ │ │ └── exercise2_2.py │ └── tf1 │ │ ├── problem_set_1 │ │ ├── exercise1_1.py │ │ ├── exercise1_2.py │ │ └── exercise1_3.py │ │ ├── problem_set_1_solutions │ │ ├── exercise1_1_soln.py │ │ └── exercise1_2_soln.py │ │ └── problem_set_2 │ │ └── exercise2_2.py ├── run.py ├── user_config.py ├── utils │ ├── __init__.py │ ├── logx.py │ ├── mpi_pytorch.py │ ├── mpi_tf.py │ ├── mpi_tools.py │ ├── plot.py │ ├── run_entrypoint.py │ ├── run_utils.py │ ├── serialization_utils.py │ └── test_policy.py └── version.py ├── test └── test_ppo.py └── travis_setup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.*~ 2 | __pycache__/ 3 | *.pkl 4 | data/ 5 | **/*.egg-info 6 | .python-version 7 | .idea/ 8 | .vscode/ 9 | .DS_Store 10 | _build/ 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | env: 2 | global: 3 | - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/travis/.mujoco/mujoco200/bin 4 | 5 | matrix: 6 | include: 7 | - os: linux 8 | language: python 9 | python: "3.6" 10 | 11 | before_install: 12 | - ./travis_setup.sh 13 | 14 | script: 15 | - pip3 install --upgrade -e .[mujoco] 16 | - python3 -c "import mujoco_py" 17 | - python3 -c "import spinup" 18 | - python3 -m pytest 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2018 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = SpinningUp 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/css/modify.css: -------------------------------------------------------------------------------- 1 | :root { 2 | /* Colors */ 3 | --color--white: #fff; 4 | --color--lightwash: #f7fbfb; 5 | --color--mediumwash: #eff7f8; 6 | --color--darkwash: #e6f3f3; 7 | --color--warmgraylight: #eeedee; 8 | --color--warmgraydark: #a3acb0; 9 | --color--coolgray1: #c5c5d2; 10 | --color--coolgray2: #8e8ea0; 11 | --color--coolgray3: #6e6e80; 12 | --color--coolgray4: #404452; 13 | --color--black: #050505; 14 | --color--pink: #e6a2e4; 15 | --color--magenta: #dd5ce5; 16 | --color--red: #bd1c5f; 17 | --color--brightred: #ef4146; 18 | --color--orange: #e86c09; 19 | --color--golden: #f4ac36; 20 | --color--yellow: #ebe93d; 21 | --color--lightgreen: #68de7a; 22 | --color--darkgreen: #10a37f; 23 | --color--teal: #2ff3ce; 24 | --color--lightblue: #27b5ea; 25 | --color--mediumblue: #2e95d3; 26 | --color--darkblue: #5436da; 27 | --color--navyblue: #1d0d4c; 28 | --color--lightpurple: #6b40d8; 29 | --color--darkpurple: #412991; 30 | --color--lightgrayishpurple: #cdc3cf; 31 | --color--mediumgrayishpurple: #9c88a3; 32 | --color--darkgrayishpurple: #562f5f; 33 | } 34 | 35 | body { 36 | color: var(--color--darkgray) !important; 37 | fill: var(--color--darkgray) !important; 38 | } 39 | 40 | h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend { 41 | /* font-weight: 500; 42 | font-family: Colfax, sans-serif !important; */ 43 | font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif !important; 44 | } 45 | 46 | .wy-nav-top { 47 | background-color: var(--color--coolgray4) !important; 48 | } 49 | 50 | .rst-content .toc-backref { 51 | color: #404040 !important; 52 | } 53 | 54 | .footnote { 55 | padding-left: 0.75rem; 56 | background-color: var(--color--warmgraylight) !important; 57 | } 58 | 59 | .wy-nav-top a, .wy-nav-top a:visited { 60 | color: var(--color--white) !important; 61 | } 62 | 63 | .wy-menu-vertical header, .wy-menu-vertical p.caption { 64 | font-weight: 500 !important; 65 | letter-spacing: 1px; 66 | margin-top: 1.25rem; 67 | } 68 | 69 | .wy-side-nav-search { 70 | background-color: var(--color--warmgraylight) !important; 71 | } 72 | 73 | .wy-body-for-nav { 74 | background-color: var(--color--coolgray1) !important; 75 | } 76 | 77 | .wy-menu-vertical li span.toctree-expand { 78 | 
color: var(--color--coolgray2) !important; 79 | } 80 | 81 | .wy-nav-side { 82 | color: var(--color--coolgray1) !important; 83 | background-color: var(--color--coolgray4) !important; 84 | } 85 | 86 | .wy-side-nav-search input[type=text] { 87 | border-color: var(--color--warmgraydark) !important; 88 | } 89 | 90 | a { 91 | color: var(--color--mediumblue) !important; 92 | } 93 | 94 | a:visited { 95 | color: #9B59B6 !important; 96 | } 97 | 98 | .wy-menu-vertical a { 99 | color: var(--color--coolgray2) !important; 100 | } 101 | 102 | .wy-menu-vertical li.current a { 103 | border-right: none !important; 104 | color: var(--color--coolgray4) !important; 105 | } 106 | 107 | .wy-menu-vertical li.current { 108 | background-color: var(--color--warmgraylight) !important; 109 | } 110 | 111 | .wy-menu-vertical li.toctree-l2.current>a { 112 | background-color: var(--color--coolgray1) !important; 113 | } 114 | 115 | .wy-menu-vertical a:hover, .wy-menu-vertical li.current a:hover, .wy-menu-vertical li.toctree-l2.current>a:hover { 116 | color: var(--color--warmgraylight) !important; 117 | background-color: var(--color--coolgray3) !important; 118 | } 119 | 120 | .wy-alert-title, .rst-content .admonition-title { 121 | background-color: var(--color--mediumblue) !important; 122 | } 123 | 124 | .wy-alert, .rst-content .note, .rst-content .attention, .rst-content .caution, .rst-content .danger, .rst-content .error, .rst-content .hint, .rst-content .important, .rst-content .tip, .rst-content .warning, .rst-content .seealso, .rst-content .admonition-todo, .rst-content .admonition { 125 | background-color: var(--color--warmgraylight) !important; 126 | } 127 | 128 | .rst-content dl:not(.docutils) dt { 129 | border-color: var(--color--mediumblue) !important; 130 | background-color: var(--color--warmgraylight) !important; 131 | } 132 | 133 | /* .rst-content pre.literal-block, .rst-content div[class^='highlight'] { 134 | background-color: var(--color--warmgraylight) !important; 135 | } */ 136 | 137 | .wy-table-odd td, .wy-table-striped tr:nth-child(2n-1) td, .rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td { 138 | background-color: var(--color--warmgraylight) !important; 139 | } 140 | 141 | @media screen and (min-width: 1100px) { 142 | .wy-nav-content-wrap { 143 | background-color: var(--color--warmgraylight) !important; 144 | } 145 | } 146 | 147 | .wy-side-nav-search img { 148 | height: auto !important; 149 | width: 100% !important; 150 | padding: 0 !important; 151 | background-color: inherit !important; 152 | border-radius: 0 !important; 153 | margin: 0 !important 154 | } 155 | 156 | .wy-side-nav-search>a, .wy-side-nav-search .wy-dropdown>a { 157 | margin-bottom: 0 !important; 158 | } 159 | 160 | .wy-menu-vertical li.toctree-l1.current>a { 161 | border: none !important; 162 | } 163 | 164 | .wy-side-nav-search>div.version { 165 | color: var(--color--coolgray2) !important; 166 | } -------------------------------------------------------------------------------- /docs/_static/openai-favicon2_32x32.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/_static/openai-favicon2_32x32.ico -------------------------------------------------------------------------------- /docs/_static/openai-favicon2_32x32.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/_static/openai-favicon2_32x32.png -------------------------------------------------------------------------------- /docs/_static/openai_icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/_static/openai_icon.ico -------------------------------------------------------------------------------- /docs/algorithms/vpg.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Vanilla Policy Gradient 3 | ======================= 4 | 5 | .. contents:: Table of Contents 6 | 7 | 8 | Background 9 | ========== 10 | 11 | (Previously: `Introduction to RL, Part 3`_) 12 | 13 | .. _`Introduction to RL, Part 3`: ../spinningup/rl_intro3.html 14 | 15 | The key idea underlying policy gradients is to push up the probabilities of actions that lead to higher return, and push down the probabilities of actions that lead to lower return, until you arrive at the optimal policy. 16 | 17 | Quick Facts 18 | ----------- 19 | 20 | * VPG is an on-policy algorithm. 21 | * VPG can be used for environments with either discrete or continuous action spaces. 22 | * The Spinning Up implementation of VPG supports parallelization with MPI. 23 | 24 | Key Equations 25 | ------------- 26 | 27 | Let :math:`\pi_{\theta}` denote a policy with parameters :math:`\theta`, and :math:`J(\pi_{\theta})` denote the expected finite-horizon undiscounted return of the policy. The gradient of :math:`J(\pi_{\theta})` is 28 | 29 | .. math:: 30 | 31 | \nabla_{\theta} J(\pi_{\theta}) = \underE{\tau \sim \pi_{\theta}}{ 32 | \sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t|s_t) A^{\pi_{\theta}}(s_t,a_t) 33 | }, 34 | 35 | where :math:`\tau` is a trajectory and :math:`A^{\pi_{\theta}}` is the advantage function for the current policy. 36 | 37 | The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance: 38 | 39 | .. math:: 40 | 41 | \theta_{k+1} = \theta_k + \alpha \nabla_{\theta} J(\pi_{\theta_k}) 42 | 43 | Policy gradient implementations typically compute advantage function estimates based on the infinite-horizon discounted return, despite otherwise using the finite-horizon undiscounted policy gradient formula. 44 | 45 | Exploration vs. Exploitation 46 | ---------------------------- 47 | 48 | VPG trains a stochastic policy in an on-policy way. This means that it explores by sampling actions according to the latest version of its stochastic policy. The amount of randomness in action selection depends on both initial conditions and the training procedure. Over the course of training, the policy typically becomes progressively less random, as the update rule encourages it to exploit rewards that it has already found. This may cause the policy to get trapped in local optima. 49 | 50 | 51 | Pseudocode 52 | ---------- 53 | 54 | .. math:: 55 | :nowrap: 56 | 57 | \begin{algorithm}[H] 58 | \caption{Vanilla Policy Gradient Algorithm} 59 | \label{alg1} 60 | \begin{algorithmic}[1] 61 | \STATE Input: initial policy parameters $\theta_0$, initial value function parameters $\phi_0$ 62 | \FOR{$k = 0,1,2,...$} 63 | \STATE Collect set of trajectories ${\mathcal D}_k = \{\tau_i\}$ by running policy $\pi_k = \pi(\theta_k)$ in the environment. 64 | \STATE Compute rewards-to-go $\hat{R}_t$. 
65 | \STATE Compute advantage estimates, $\hat{A}_t$ (using any method of advantage estimation) based on the current value function $V_{\phi_k}$. 66 | \STATE Estimate policy gradient as 67 | \begin{equation*} 68 | \hat{g}_k = \frac{1}{|{\mathcal D}_k|} \sum_{\tau \in {\mathcal D}_k} \sum_{t=0}^T \left. \nabla_{\theta} \log\pi_{\theta}(a_t|s_t)\right|_{\theta_k} \hat{A}_t. 69 | \end{equation*} 70 | \STATE Compute policy update, either using standard gradient ascent, 71 | \begin{equation*} 72 | \theta_{k+1} = \theta_k + \alpha_k \hat{g}_k, 73 | \end{equation*} 74 | or via another gradient ascent algorithm like Adam. 75 | \STATE Fit value function by regression on mean-squared error: 76 | \begin{equation*} 77 | \phi_{k+1} = \arg \min_{\phi} \frac{1}{|{\mathcal D}_k| T} \sum_{\tau \in {\mathcal D}_k} \sum_{t=0}^T\left( V_{\phi} (s_t) - \hat{R}_t \right)^2, 78 | \end{equation*} 79 | typically via some gradient descent algorithm. 80 | \ENDFOR 81 | \end{algorithmic} 82 | \end{algorithm} 83 | 84 | 85 | Documentation 86 | ============= 87 | 88 | .. admonition:: You Should Know 89 | 90 | In what follows, we give documentation for the PyTorch and Tensorflow implementations of VPG in Spinning Up. They have nearly identical function calls and docstrings, except for details relating to model construction. However, we include both full docstrings for completeness. 91 | 92 | 93 | Documentation: PyTorch Version 94 | ------------------------------ 95 | 96 | .. autofunction:: spinup.vpg_pytorch 97 | 98 | Saved Model Contents: PyTorch Version 99 | ------------------------------------- 100 | 101 | The PyTorch saved model can be loaded with ``ac = torch.load('path/to/model.pt')``, yielding an actor-critic object (``ac``) that has the properties described in the docstring for ``vpg_pytorch``. 102 | 103 | You can get actions from this model with 104 | 105 | .. code-block:: python 106 | 107 | actions = ac.act(torch.as_tensor(obs, dtype=torch.float32)) 108 | 109 | 110 | Documentation: Tensorflow Version 111 | --------------------------------- 112 | 113 | .. autofunction:: spinup.vpg_tf1 114 | 115 | Saved Model Contents: Tensorflow Version 116 | ---------------------------------------- 117 | 118 | The computation graph saved by the logger includes: 119 | 120 | ======== ==================================================================== 121 | Key Value 122 | ======== ==================================================================== 123 | ``x`` Tensorflow placeholder for state input. 124 | ``pi`` Samples an action from the agent, conditioned on states in ``x``. 125 | ``v`` Gives value estimate for states in ``x``. 126 | ======== ==================================================================== 127 | 128 | This saved model can be accessed either by 129 | 130 | * running the trained policy with the `test_policy.py`_ tool, 131 | * or loading the whole saved graph into a program with `restore_tf_graph`_. 132 | 133 | .. _`test_policy.py`: ../user/saving_and_loading.html#loading-and-running-trained-policies 134 | .. _`restore_tf_graph`: ../utils/logger.html#spinup.utils.logx.restore_tf_graph 135 | 136 | References 137 | ========== 138 | 139 | Relevant Papers 140 | --------------- 141 | 142 | - `Policy Gradient Methods for Reinforcement Learning with Function Approximation`_, Sutton et al. 2000 143 | - `Optimizing Expectations: From Deep Reinforcement Learning to Stochastic Computation Graphs`_, Schulman 2016(a) 144 | - `Benchmarking Deep Reinforcement Learning for Continuous Control`_, Duan et al. 
2016 145 | - `High Dimensional Continuous Control Using Generalized Advantage Estimation`_, Schulman et al. 2016(b) 146 | 147 | .. _`Policy Gradient Methods for Reinforcement Learning with Function Approximation`: https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf 148 | .. _`Optimizing Expectations: From Deep Reinforcement Learning to Stochastic Computation Graphs`: http://joschu.net/docs/thesis.pdf 149 | .. _`Benchmarking Deep Reinforcement Learning for Continuous Control`: https://arxiv.org/abs/1604.06778 150 | .. _`High Dimensional Continuous Control Using Generalized Advantage Estimation`: https://arxiv.org/abs/1506.02438 151 | 152 | Why These Papers? 153 | ----------------- 154 | 155 | Sutton 2000 is included because it is a timeless classic of reinforcement learning theory, and contains references to the earlier work which led to modern policy gradients. Schulman 2016(a) is included because Chapter 2 contains a lucid introduction to the theory of policy gradient algorithms, including pseudocode. Duan 2016 is a clear, recent benchmark paper that shows how vanilla policy gradient in the deep RL setting (eg with neural network policies and Adam as the optimizer) compares with other deep RL algorithms. Schulman 2016(b) is included because our implementation of VPG makes use of Generalized Advantage Estimation for computing the policy gradient. 156 | 157 | 158 | Other Public Implementations 159 | ---------------------------- 160 | 161 | - rllab_ 162 | - `rllib (Ray)`_ 163 | 164 | .. _rllab: https://github.com/rll/rllab/blob/master/rllab/algos/vpg.py 165 | .. _`rllib (Ray)`: https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/pg 166 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Spinning Up documentation build configuration file, created by 5 | # sphinx-quickstart on Wed Aug 15 04:21:07 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
19 | # 20 | import os 21 | import sys 22 | 23 | # Make sure spinup is accessible without going through setup.py 24 | dirname = os.path.dirname 25 | sys.path.insert(0, dirname(dirname(__file__))) 26 | 27 | # Mock mpi4py to get around having to install it on RTD server (which fails) 28 | # Also to mock PyTorch, because it is too large for the RTD server to download 29 | from unittest.mock import MagicMock 30 | 31 | class Mock(MagicMock): 32 | @classmethod 33 | def __getattr__(cls, name): 34 | return MagicMock() 35 | 36 | MOCK_MODULES = ['mpi4py', 37 | 'torch', 38 | 'torch.optim', 39 | 'torch.nn', 40 | 'torch.distributions', 41 | 'torch.distributions.normal', 42 | 'torch.distributions.categorical', 43 | 'torch.nn.functional', 44 | ] 45 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 46 | 47 | # Finish imports 48 | import spinup 49 | from recommonmark.parser import CommonMarkParser 50 | 51 | 52 | source_parsers = { 53 | '.md': CommonMarkParser, 54 | } 55 | 56 | 57 | # -- General configuration ------------------------------------------------ 58 | 59 | # If your documentation needs a minimal Sphinx version, state it here. 60 | # 61 | # needs_sphinx = '1.0' 62 | 63 | # Add any Sphinx extension module names here, as strings. They can be 64 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 65 | # ones. 66 | extensions = ['sphinx.ext.imgmath', 67 | 'sphinx.ext.viewcode', 68 | 'sphinx.ext.autodoc', 69 | 'sphinx.ext.napoleon'] 70 | 71 | #'sphinx.ext.mathjax', ?? 72 | 73 | # imgmath settings 74 | imgmath_image_format = 'svg' 75 | imgmath_font_size = 14 76 | 77 | # Add any paths that contain templates here, relative to this directory. 78 | templates_path = ['_templates'] 79 | 80 | # The suffix(es) of source filenames. 81 | # You can specify multiple suffix as a list of string: 82 | # 83 | source_suffix = ['.rst', '.md'] 84 | # source_suffix = '.rst' 85 | 86 | # The master toctree document. 87 | master_doc = 'index' 88 | 89 | # General information about the project. 90 | project = 'Spinning Up' 91 | copyright = '2018, OpenAI' 92 | author = 'Joshua Achiam' 93 | 94 | # The version info for the project you're documenting, acts as replacement for 95 | # |version| and |release|, also used in various other places throughout the 96 | # built documents. 97 | # 98 | # The short X.Y version. 99 | version = '' 100 | # The full version, including alpha/beta/rc tags. 101 | release = '' 102 | 103 | # The language for content autogenerated by Sphinx. Refer to documentation 104 | # for a list of supported languages. 105 | # 106 | # This is also used if you do content translation via gettext catalogs. 107 | # Usually you set "language" from the command line for these cases. 108 | language = None 109 | 110 | # List of patterns, relative to source directory, that match files and 111 | # directories to ignore when looking for source files. 112 | # This patterns also effect to html_static_path and html_extra_path 113 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 114 | 115 | # The name of the Pygments (syntax highlighting) style to use. 116 | pygments_style = 'default' #'sphinx' 117 | 118 | # If true, `todo` and `todoList` produce output, else they produce nothing. 119 | todo_include_todos = False 120 | 121 | 122 | # -- Options for HTML output ---------------------------------------------- 123 | 124 | # The theme to use for HTML and HTML Help pages. See the documentation for 125 | # a list of builtin themes. 
126 | # 127 | # html_theme = 'alabaster' 128 | html_theme = "sphinx_rtd_theme" 129 | 130 | # Theme options are theme-specific and customize the look and feel of a theme 131 | # further. For a list of options available for each theme, see the 132 | # documentation. 133 | # 134 | # html_theme_options = {} 135 | 136 | # Add any paths that contain custom static files (such as style sheets) here, 137 | # relative to this directory. They are copied after the builtin static files, 138 | # so a file named "default.css" will overwrite the builtin "default.css". 139 | html_static_path = ['_static'] 140 | 141 | html_logo = 'images/spinning-up-logo2.png' 142 | html_theme_options = { 143 | 'logo_only': True 144 | } 145 | #html_favicon = 'openai-favicon2_32x32.ico' 146 | html_favicon = 'openai_icon.ico' 147 | 148 | # -- Options for HTMLHelp output ------------------------------------------ 149 | 150 | # Output file base name for HTML help builder. 151 | htmlhelp_basename = 'SpinningUpdoc' 152 | 153 | # -- Options for LaTeX output --------------------------------------------- 154 | 155 | 156 | imgmath_latex_preamble = r''' 157 | \usepackage{algorithm} 158 | \usepackage{algorithmic} 159 | \usepackage{amsmath} 160 | \usepackage{cancel} 161 | 162 | \usepackage[verbose=true,letterpaper]{geometry} 163 | \geometry{ 164 | textheight=12in, 165 | textwidth=6.5in, 166 | top=1in, 167 | headheight=12pt, 168 | headsep=25pt, 169 | footskip=30pt 170 | } 171 | 172 | \newcommand{\E}{{\mathrm E}} 173 | 174 | \newcommand{\underE}[2]{\underset{\begin{subarray}{c}#1 \end{subarray}}{\E}\left[ #2 \right]} 175 | 176 | \newcommand{\Epi}[1]{\underset{\begin{subarray}{c}\tau \sim \pi \end{subarray}}{\E}\left[ #1 \right]} 177 | ''' 178 | 179 | latex_elements = { 180 | # The paper size ('letterpaper' or 'a4paper'). 181 | # 182 | # 'papersize': 'letterpaper', 183 | 184 | # The font size ('10pt', '11pt' or '12pt'). 185 | # 186 | # 'pointsize': '10pt', 187 | 188 | # Additional stuff for the LaTeX preamble. 189 | # 190 | 'preamble': r''' 191 | \usepackage{algorithm} 192 | \usepackage{algorithmic} 193 | \usepackage{amsmath} 194 | \usepackage{cancel} 195 | 196 | 197 | \newcommand{\E}{{\mathrm E}} 198 | 199 | \newcommand{\underE}[2]{\underset{\begin{subarray}{c}#1 \end{subarray}}{\E}\left[ #2 \right]} 200 | 201 | \newcommand{\Epi}[1]{\underset{\begin{subarray}{c}\tau \sim \pi \end{subarray}}{\E}\left[ #1 \right]} 202 | ''', 203 | 204 | # Latex figure (float) alignment 205 | # 206 | # 'figure_align': 'htbp', 207 | } 208 | 209 | # Grouping the document tree into LaTeX files. List of tuples 210 | # (source start file, target name, title, 211 | # author, documentclass [howto, manual, or own class]). 212 | latex_documents = [ 213 | (master_doc, 'SpinningUp.tex', 'Spinning Up Documentation', 214 | 'Joshua Achiam', 'manual'), 215 | ] 216 | 217 | 218 | # -- Options for manual page output --------------------------------------- 219 | 220 | # One entry per manual page. List of tuples 221 | # (source start file, name, description, authors, manual section). 222 | man_pages = [ 223 | (master_doc, 'spinningup', 'Spinning Up Documentation', 224 | [author], 1) 225 | ] 226 | 227 | 228 | # -- Options for Texinfo output ------------------------------------------- 229 | 230 | # Grouping the document tree into Texinfo files. 
List of tuples 231 | # (source start file, target name, title, author, 232 | # dir menu entry, description, category) 233 | texinfo_documents = [ 234 | (master_doc, 'SpinningUp', 'Spinning Up Documentation', 235 | author, 'SpinningUp', 'One line description of project.', 236 | 'Miscellaneous'), 237 | ] 238 | 239 | 240 | def setup(app): 241 | app.add_stylesheet('css/modify.css') -------------------------------------------------------------------------------- /docs/docs_requirements.txt: -------------------------------------------------------------------------------- 1 | cloudpickle~=1.2.1 2 | gym~=0.15.3 3 | ipython 4 | joblib 5 | matplotlib 6 | numpy 7 | pandas 8 | pytest 9 | psutil 10 | scipy 11 | seaborn==0.8.1 12 | sphinx==1.5.6 13 | sphinx-autobuild==0.7.1 14 | sphinx-rtd-theme==0.4.1 15 | tensorflow>=1.8.0,<2.0 16 | tqdm -------------------------------------------------------------------------------- /docs/etc/acknowledgements.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Acknowledgements 3 | ================ 4 | 5 | We gratefully acknowledge the contributions of the many people who helped get this project off of the ground, including people who beta tested the software, gave feedback on the material, improved dependencies of Spinning Up code in service of this release, or otherwise supported the project. Given the number of people who were involved at various points, this list of names may not be exhaustive. (If you think you should have been listed here, please do not hesitate to reach out.) 6 | 7 | In no particular order, thank you Alex Ray, Amanda Askell, Ben Garfinkel, Christy Dennison, Coline Devin, Daniel Zeigler, Dylan Hadfield-Menell, Ge Yang, Greg Khan, Jack Clark, Jonas Rothfuss, Larissa Schiavo, Leandro Castelao, Lilian Weng, Maddie Hall, Matthias Plappert, Miles Brundage, Peter Zokhov, and Pieter Abbeel. 8 | 9 | We are also grateful to Pieter Abbeel's group at Berkeley, and the Center for Human-Compatible AI, for giving feedback on presentations about Spinning Up. -------------------------------------------------------------------------------- /docs/etc/author.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | About the Author 3 | ================ 4 | 5 | Spinning Up in Deep RL was primarily developed by Josh Achiam, a research scientist on the OpenAI Safety Team and PhD student at UC Berkeley advised by Pieter Abbeel. Josh studies topics related to safety in deep reinforcement learning, and has previously published work on `safe exploration`_. 6 | 7 | .. 
_`safe exploration`: https://arxiv.org/abs/1705.10528 -------------------------------------------------------------------------------- /docs/images/alphago.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/alphago.jpg -------------------------------------------------------------------------------- /docs/images/ex2-1_trpo_hopper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/ex2-1_trpo_hopper.png -------------------------------------------------------------------------------- /docs/images/ex2-2_ddpg_bug_pytorch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/ex2-2_ddpg_bug_pytorch.png -------------------------------------------------------------------------------- /docs/images/knocked-over-stand-up.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/knocked-over-stand-up.mp4 -------------------------------------------------------------------------------- /docs/images/knocked_down_standup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/knocked_down_standup.png -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/logo.png -------------------------------------------------------------------------------- /docs/images/ms_pacman.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/ms_pacman.png -------------------------------------------------------------------------------- /docs/images/openai-favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/openai-favicon.png -------------------------------------------------------------------------------- /docs/images/openai-favicon2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/openai-favicon2.png -------------------------------------------------------------------------------- /docs/images/openai-favicon2_32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/openai-favicon2_32x32.png -------------------------------------------------------------------------------- /docs/images/recolored_logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/recolored_logo.png -------------------------------------------------------------------------------- /docs/images/rl_algorithms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/rl_algorithms.png -------------------------------------------------------------------------------- /docs/images/rl_algorithms.xml: -------------------------------------------------------------------------------- 1 | 7Z3bdps4FIafxpfpQuLoS+fYWaudpklXZzp3CpZtphh5sNIkffoRRtiAwCFGknGr9KJBCCH4vy1tbUlkZF8sn29StFp8JFMcj6A1fR7ZlyMIge0B9l+W8pKn+K6TJ8zTaMoz7RLuo5+YJ1o89TGa4nUlIyUkptGqmhiSJMEhraShNCVP1WwzElfvukJzLCTchygWU/+KpnRR1A66uxPvcTRf8Ft7xYkHFH6fp+Qx4fcbQXu2+clPL1FRFn/Q9QJNyVMpyb4a2RcpITT/bfl8gePs3RavLb/uuuXstt4pTminC3yMPM/HwA0hq2p4Bse8YvSleBl4yt4NPyQpXZA5SVB8tUs93zwwzooE7GhBlzH/NUYPOD7fvpMLEpOUnUpIkl22piilk0yuWtp1FGclWMUxB8RlxziZFleEMVqvo/DLIkryE/wykB+VLvoXU/rCj9EjJSxp9yAfCFnxq9Y0Jd9xUUumnbX52Z4pWMjyzkhCr9EyijPEv+J0ihLEk/mdAn7YVB5+jujf2SO+c/nRN16uqCAXdU0e05BLAvOkTJlSHi7xDSZLTNMXliHFMaLRjyrViBvHfJtve+ktidhdocUN2QXeOw42N+TAqpaRV4pfVubs1ZIArBXFhJ5jKhTFfik90i5pA3JHqG3LQK0D6oRJVKI6O/zGH7kV61z2PMk+FtYCjF259myhKF8j18BwfQpcC8L5Jwe6PQZHBJ2/rx8ofsSFY+XFlGtWsQDvv0dSnDhbb8SdsAzAWz3vTrLf5tn/G8/17DrFmGW5+1AUyuqXl5vnEoysakJPi4ji+xXadM5PzCGumtXWxbM62FgN6ak/fmhDmtlPKedshr0w7IQ6gHtYR3E0TzJ7ZTzjdB/gP3BK8fNefAt2xEbS4ew8lbxrl6ctyo611c58BbA9NMH+8LhN8GS8WJN4TtKILpZrA48SeByn3vDoZMdWxE7e8JyjNQPBtDzK4HGd8TFbHse4Z8MfSx/LOXOsoMYmdN0DnTOr3sFuG0n5zplroD6FMUegh+IO6HWl2IF2vaixMoo9Q/EpUKypLZZIMfCFZl0dxaCfh2FVMd6qBSpavXsLW51UBZri12Bc18LuGP9gxodeStlWWYb1m+7kWTVx8zIPlrpfv1uXutVnUgpB2e/yG7iwlHDRX0wHSBazX/dzEmKOh6qlbMP0f30tARiomK5kw/R1G6Yliin6foLfd7Cn57jZvy0JNT0PQcMJepLQNZji67az05fG1yVNYKR5ozSeLmnyIqQHsG9JHIVZjT+taLSMfrJak8REsZVEsYEYGPF0zp/Zx7duqQ5OINojHIaD44D6ZJfsYWTPOYmTEBOoiX32F1P2MLJnLP4kxITDiAk0iCnbMjUNPZQLpia411+wvWNF8fqg3ukW1yuI4/r9Vl1LiSEM2IFuoEzbsLNY8Wyk6SyNrmFn0G/56+8oja5hJ7+39GHn57MPGKVJlMzNWFPTWk2tY80i0CwdnC93t58Gh4wXBvhh1gWZKcLBbKjIOHXPeNxADGwgpj7HflB0ol9Y2Kzk0LLITtPOnsD2661X3Wnvuo7Dt6G+NXWwXwDdQKxrg5qaMGEDfPZ+9LpSHAgLoOEYKKPY7B0+DYrVhNTUUexZdRdDIcX2AJYASI3RFXJXEHCVIPDWIJ0H68LKnu+wB7BqQL2azkDVlD3h4cDfQE1nGHORDWrKtk3+8NIH+5sYUfZWF9m+Xv55moGN/X+NcJFbBIKOESxydIfmlbcGTc7aMCazxZG19J5aVejvhr0U0xroaA18T4iZ6GwPCjdIOkF/wMngiPk1YseeN642KjpDx8UgSDov919vDC96eGnazq+MF1WzmpN4tUD/4JQYapRQEwTWEalRtLbHL7uptbigLwQG5fqpDUsL5PupnYcBqhzHj+cfr41FamnHHZ0W6TR91aemZybGquWptt+dRQ9FdqvlnbS+AMcVxlNdXwGo7yc/aKGFqr7swgXGZrrYzJbC142mWICqc1mFqtjc57uzy89/GkTUIAIsnYyo6ngNIPIB0QIEVATE+6s7A4SiFkNnr+I0Reg0e17CcsXujpeMVzBWZCK3Zn1rRxNxWuAaRqcKVDleE8jubWX1gNYkq4eBRQUsOpvT4jta6nb336RoGuENHoYWmbRo6W2b5jc097asEi3zxnq6W8dTZCL3E9OEthpFA0oVO3GH0986qr6Q8uXSNoCoAUTrkEVVoPTy8tbMEssGRAcRvrr5pq9XhogDiXjjfJPOdSZ+k5te07O0N4UrUdavdWd4216Q3VRul13j4uaQvZbVtH6rSOu5ZURcdCx8daPzh5iB8AEPt1aUxA94dJhSbPxUwBtVHgkLUA9T2X99WKJU5Ve/rTIEldnh7i9x5tl3f+7Uvvof -------------------------------------------------------------------------------- /docs/images/rl_algorithms_9_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/rl_algorithms_9_12.png -------------------------------------------------------------------------------- /docs/images/rl_algorithms_9_15.xml: 
-------------------------------------------------------------------------------- 1 | 7Z1bd6MqFMc/TR47S/Cax/Q6Z62ZM5121lzOG1WSeMZIjrHTdj79wQiJikltBWI6tA+NiAT9/zZsNmBH9tni8SpDy/lHEuFkBK3ocWSfjyAEtgfonyLlqUzxXadMmGVxxDJtE27j35glWiz1Po7wqpYxJyTJ42U9MSRpisO8loayjDzUs01JUv/WJZphIeE2RImY+i2O8jmvHXS3J97jeDZnX+3xE3co/DnLyH3Kvm8E7en6pzy9QLwsdqOrOYrIQyXJvhjZZxkheflp8XiGk+LZ8sdWXne54+ym3hlO804X+Bh5no+BG0Ja1fAEjlnF8if+MHBEnw07JFk+JzOSouRim3q6vmFcFAno0TxfJOxjgu5wcrp5JmckIRk9lZK0uGyVoyyfFHI10i7jpCjB4scMEJce4zTiV4QJWq3i8Ms8TssT7DJQHlUu+hfn+RM7Rvc5oUnbG/lAyJJdtcoz8hPzWlLtrPXP5gxnocg7JWl+iRZxUiD+FWcRShFLZt8UsMO28vBjnH8vbvGdy45+sHJFBZmoK3KfhUwSWCYVylTyMImvMFngPHuiGTKcoDz+VacaMeOYbfJtLr0mMf1WaDFDdoH3joHNDDmw6mWUlWKXVTl7tiQAG0VRoWc4F4qiHyq3tE1ag9wRatsyUOuAOqUSVaguDn+wW96JdSl7mWQfCmsBxq5ce7ZQlK+Ra2C4PgauBeH8owPdHoMDgs6e1y+U3GPuWHlJzjSrWYD33z3hJ05Wa3EnNAPwlo/bk/TTrPi79lxPLjOMaZabD7xQWr+y3DKXYGR1E3qYxzm+XaJ15/xAHeK6WW1cPKuDjTWQjvzx3S6kqf1Uck6n2AvDTqgDuId1lMSztLBXyjPO9gH+C2c5ftyLL2dHbCQdxs5Dxbt2Wdq86lhbu5mvAbaHJtgfHrcNnoIXa5LMSBbn88XKwKMEHsdpNjw62bEVsVM2PKdoRUEwLY8yeFxnfMiWxzHu2fDH0odyzhwraLAJXfeVzpnV7GA3jaR858w1UB/DmCPQQ3EH9LpS7EC7WdRYGcWeofgYKNbUFkukGPhCs66OYtDPw7DqGG/UAjWt3r2ErU6qAk3xazBuamF3jH9Q40NPlWzLIsPqRd/kWQ1xyzJfLXW/frcp9U6fSSkEVb/Lb+HCUsJFfzEdIFnMft3PUYg5HqqWsg3Tf/taAjBQMV3JhunrNkxLFFP0/QS/79WenuMWvz1IcKSPK7oGU3zddnZ00kh3ljtLExhp9kvjHUyasgjpAexrksRhUeNPyzxexL9prUlqothKothADIx4OufP7MNbt1QHJxANFA7DwXFAc7JL9jCy55zEUYgJ1MQ++4spexjZMxZ/FGLCYcQEWsSUbZmahh7KBVMT3Osv2N6xonh90Ox0+fUK4rh+v1XXUmIIw3GgW6A63LCTr3g20uyS5mDDzqDf8tc/QJqDDTtZZaQPOz+ffMAoS+N0ZsaamtZqah1r8kCzdHC+3Fx/GhwyXhjgu2kXZCKEg+lQkXGanvG4hRjYQkxzjv1V0Yl+YWGzkkPLIjtNO3sC22+2Xk2nves6Dt+G+tbUwX4BdAOxrg1qasKELfDZ+9HrSnEgLICGY6CMYrN3+DgoVhNSU0exZzVdDIUU2wNYAiA1RsflriHgKkHgpUE6DzaFlT3fYQ9g1YB6NZ2Bqil7wsOBf4CazjDmIlvUlG2b7OalD/bXMaLiqc6Lfb3s9TQDG/u/jXCRywNBhwgWObpD88pbgzZnbRiT2eLIWnpPrSr0d0UfimkNdLQGvifETHS2B9wNkk7QX3AyOGLeRuzY88b1RkVn6JgPgqTz8o1kScTbmuG9GOKNotO2s18ZOqomOCfJco7+wRkx1CihJgisA1KjaJmPX/VYGyFCX4gRynVZW5YdyHdZO48IVPmQH08/XhqL1NKOOzot0ml7wU9Dz0KMZfcb3byVFt3xEqy9D8BxhaFV10cAmlvLX7XmQlVfduYCYzNdbGZD4fNGw9ei6lxhoSpM9/nm5Pzz3wYRNYgASycjqjpeA4h8QLQAARUB8f7ixgChqMXQ2as4bcE6zZ6XsHKxu+Ml4xGMFZnItVnq2tFEnB0kDaNTBaocrwmk320V9YDWpKiHgUUFLDqbU/5KLXUb/a8yFMV4jYehRSYtWnrbtqkOzb0trcSOKWQ93S3fDSXdRG4npgntZhTuDpaG0d86ql6W8uXcNoCoAUTrkEVVoPT8/PrKACIZEB1E+Ormm75eGCL0zDfpXHLit7npDT0r21SYElX9dm4S37UtZO9U7qu2hVQeVNtSLp7Wc/eIuP5YeAFH53cyA+FdHm6jKInv8ugwpdj61oAXqjwS1qJ2Upk3WYNR+dnXrAxBZXq4/aecZfbtfz61L/4H -------------------------------------------------------------------------------- /docs/images/rl_diagram_transparent_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/rl_diagram_transparent_bg.png -------------------------------------------------------------------------------- /docs/images/spinning-up-in-rl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/spinning-up-in-rl.png -------------------------------------------------------------------------------- /docs/images/spinning-up-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/spinning-up-logo.png -------------------------------------------------------------------------------- 
/docs/images/spinning-up-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | 13 | adsfdsfgArtboard 2 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/images/spinning-up-logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/spinning-up-logo2.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Spinning Up documentation master file, created by 2 | sphinx-quickstart on Wed Aug 15 04:21:07 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Spinning Up in Deep RL! 7 | ================================== 8 | 9 | .. image:: images/spinning-up-in-rl.png 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | :caption: User Documentation 14 | 15 | user/introduction 16 | user/installation 17 | user/algorithms 18 | user/running 19 | user/saving_and_loading 20 | user/plotting 21 | 22 | .. toctree:: 23 | :maxdepth: 2 24 | :caption: Introduction to RL 25 | 26 | spinningup/rl_intro 27 | spinningup/rl_intro2 28 | spinningup/rl_intro3 29 | 30 | .. toctree:: 31 | :maxdepth: 2 32 | :caption: Resources 33 | 34 | spinningup/spinningup 35 | spinningup/keypapers 36 | spinningup/exercises 37 | spinningup/bench 38 | 39 | .. toctree:: 40 | :maxdepth: 2 41 | :caption: Algorithms Docs 42 | 43 | algorithms/vpg 44 | algorithms/trpo 45 | algorithms/ppo 46 | algorithms/ddpg 47 | algorithms/td3 48 | algorithms/sac 49 | 50 | .. toctree:: 51 | :maxdepth: 2 52 | :caption: Utilities Docs 53 | 54 | utils/logger 55 | utils/plotter 56 | utils/mpi 57 | utils/run_utils 58 | 59 | .. toctree:: 60 | :maxdepth: 2 61 | :caption: Etc. 62 | 63 | etc/acknowledgements 64 | etc/author 65 | 66 | Indices and tables 67 | ================== 68 | 69 | * :ref:`genindex` 70 | * :ref:`modindex` 71 | * :ref:`search` 72 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=SpinningUp 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/spinningup/bench.rst: -------------------------------------------------------------------------------- 1 | ========================================== 2 | Benchmarks for Spinning Up Implementations 3 | ========================================== 4 | 5 | .. contents:: Table of Contents 6 | 7 | We benchmarked the Spinning Up algorithm implementations in five environments from the MuJoCo_ Gym task suite: HalfCheetah, Hopper, Walker2d, Swimmer, and Ant. 8 | 9 | .. _MuJoCo: https://gym.openai.com/envs/#mujoco 10 | 11 | Performance in Each Environment 12 | =============================== 13 | 14 | HalfCheetah: PyTorch Versions 15 | ----------------------------- 16 | 17 | .. figure:: ../images/plots/pyt/pytorch_halfcheetah_performance.svg 18 | :align: center 19 | 20 | 3M timestep benchmark for HalfCheetah-v3 using **PyTorch** implementations. 21 | 22 | 23 | HalfCheetah: Tensorflow Versions 24 | -------------------------------- 25 | 26 | .. figure:: ../images/plots/tf1/tensorflow_halfcheetah_performance.svg 27 | :align: center 28 | 29 | 3M timestep benchmark for HalfCheetah-v3 using **Tensorflow** implementations. 30 | 31 | 32 | 33 | Hopper: PyTorch Versions 34 | ------------------------ 35 | 36 | .. figure:: ../images/plots/pyt/pytorch_hopper_performance.svg 37 | :align: center 38 | 39 | 3M timestep benchmark for Hopper-v3 using **PyTorch** implementations. 40 | 41 | 42 | Hopper: Tensorflow Versions 43 | --------------------------- 44 | 45 | .. figure:: ../images/plots/tf1/tensorflow_hopper_performance.svg 46 | :align: center 47 | 48 | 3M timestep benchmark for Hopper-v3 using **Tensorflow** implementations. 49 | 50 | 51 | 52 | 53 | Walker2d: PyTorch Versions 54 | -------------------------- 55 | 56 | .. figure:: ../images/plots/pyt/pytorch_walker2d_performance.svg 57 | :align: center 58 | 59 | 3M timestep benchmark for Walker2d-v3 using **PyTorch** implementations. 60 | 61 | 62 | Walker2d: Tensorflow Versions 63 | ----------------------------- 64 | 65 | .. figure:: ../images/plots/tf1/tensorflow_walker2d_performance.svg 66 | :align: center 67 | 68 | 3M timestep benchmark for Walker2d-v3 using **Tensorflow** implementations. 69 | 70 | 71 | 72 | Swimmer: PyTorch Versions 73 | ------------------------- 74 | 75 | .. figure:: ../images/plots/pyt/pytorch_swimmer_performance.svg 76 | :align: center 77 | 78 | 3M timestep benchmark for Swimmer-v3 using **PyTorch** implementations. 79 | 80 | 81 | Swimmer: Tensorflow Versions 82 | ---------------------------- 83 | 84 | .. figure:: ../images/plots/tf1/tensorflow_swimmer_performance.svg 85 | :align: center 86 | 87 | 3M timestep benchmark for Swimmer-v3 using **Tensorflow** implementations. 88 | 89 | 90 | 91 | Ant: PyTorch Versions 92 | ------------------------ 93 | 94 | .. figure:: ../images/plots/pyt/pytorch_ant_performance.svg 95 | :align: center 96 | 97 | 3M timestep benchmark for Ant-v3 using **PyTorch** implementations. 98 | 99 | 100 | Ant: Tensorflow Versions 101 | --------------------------- 102 | 103 | .. figure:: ../images/plots/tf1/tensorflow_ant_performance.svg 104 | :align: center 105 | 106 | 3M timestep benchmark for Ant-v3 using **Tensorflow** implementations. 
107 | 108 | 109 | Experiment Details 110 | ================== 111 | 112 | **Random seeds.** All experiments were run for 10 random seeds each. Graphs show the average (solid line) and std dev (shaded) of performance over random seed over the course of training. 113 | 114 | **Performance metric.** Performance for the on-policy algorithms is measured as the average trajectory return across the batch collected at each epoch. Performance for the off-policy algorithms is measured once every 10,000 steps by running the deterministic policy (or, in the case of SAC, the mean policy) without action noise for ten trajectories, and reporting the average return over those test trajectories. 115 | 116 | **Network architectures.** The on-policy algorithms use networks of size (64, 32) with tanh units for both the policy and the value function. The off-policy algorithms use networks of size (256, 256) with relu units. 117 | 118 | **Batch size.** The on-policy algorithms collected 4000 steps of agent-environment interaction per batch update. The off-policy algorithms used minibatches of size 100 at each gradient descent step. 119 | 120 | All other hyperparameters are left at default settings for the Spinning Up implementations. See algorithm pages for details. 121 | 122 | Learning curves are smoothed by averaging over a window of 11 epochs. 123 | 124 | .. admonition:: You Should Know 125 | 126 | By comparison to the literature, the Spinning Up implementations of DDPG, TD3, and SAC are roughly at-parity with the best reported results for these algorithms. As a result, you can use the Spinning Up implementations of these algorithms for research purposes. 127 | 128 | The Spinning Up implementations of VPG, TRPO, and PPO are overall a bit weaker than the best reported results for these algorithms. This is due to the absence of some standard tricks (such as observation normalization and normalized value regression targets) from our implementations. For research comparisons, you should use the implementations of TRPO or PPO from `OpenAI Baselines`_. 129 | 130 | .. _`OpenAI Baselines`: https://github.com/openai/baselines 131 | 132 | 133 | PyTorch vs Tensorflow 134 | ===================== 135 | 136 | 137 | We provide graphs for head-to-head comparisons between the PyTorch and Tensorflow implementations of each algorithm at the following pages: 138 | 139 | * `VPG Head-to-Head`_ 140 | 141 | * `PPO Head-to-Head`_ 142 | 143 | * `DDPG Head-to-Head`_ 144 | 145 | * `TD3 Head-to-Head`_ 146 | 147 | * `SAC Head-to-Head`_ 148 | 149 | .. _`VPG Head-to-Head`: ../spinningup/bench_vpg.html 150 | .. _`PPO Head-to-Head`: ../spinningup/bench_ppo.html 151 | .. _`DDPG Head-to-Head`: ../spinningup/bench_ddpg.html 152 | .. _`TD3 Head-to-Head`: ../spinningup/bench_td3.html 153 | .. _`SAC Head-to-Head`: ../spinningup/bench_sac.html -------------------------------------------------------------------------------- /docs/spinningup/bench_ddpg.rst: -------------------------------------------------------------------------------- 1 | DDPG Head-to-Head 2 | ================= 3 | 4 | HalfCheetah 5 | ----------- 6 | 7 | .. figure:: ../images/plots/ddpg/ddpg_halfcheetah_performance.svg 8 | :align: center 9 | 10 | 11 | Hopper 12 | ------ 13 | 14 | .. figure:: ../images/plots/ddpg/ddpg_hopper_performance.svg 15 | :align: center 16 | 17 | 18 | Walker2d 19 | -------- 20 | 21 | .. figure:: ../images/plots/ddpg/ddpg_walker2d_performance.svg 22 | :align: center 23 | 24 | Swimmer 25 | ------- 26 | 27 | .. 
figure:: ../images/plots/ddpg/ddpg_swimmer_performance.svg 28 | :align: center 29 | 30 | 31 | Ant 32 | --- 33 | 34 | .. figure:: ../images/plots/ddpg/ddpg_ant_performance.svg 35 | :align: center -------------------------------------------------------------------------------- /docs/spinningup/bench_ppo.rst: -------------------------------------------------------------------------------- 1 | Proximal Policy Optimization Head-to-Head 2 | ========================================= 3 | 4 | HalfCheetah 5 | ----------- 6 | 7 | .. figure:: ../images/plots/ppo/ppo_halfcheetah_performance.svg 8 | :align: center 9 | 10 | 11 | Hopper 12 | ------ 13 | 14 | .. figure:: ../images/plots/ppo/ppo_hopper_performance.svg 15 | :align: center 16 | 17 | 18 | Walker2d 19 | -------- 20 | 21 | .. figure:: ../images/plots/ppo/ppo_walker2d_performance.svg 22 | :align: center 23 | 24 | Swimmer 25 | ------- 26 | 27 | .. figure:: ../images/plots/ppo/ppo_swimmer_performance.svg 28 | :align: center 29 | 30 | 31 | Ant 32 | --- 33 | 34 | .. figure:: ../images/plots/ppo/ppo_ant_performance.svg 35 | :align: center -------------------------------------------------------------------------------- /docs/spinningup/bench_sac.rst: -------------------------------------------------------------------------------- 1 | SAC Head-to-Head 2 | ================= 3 | 4 | HalfCheetah 5 | ----------- 6 | 7 | .. figure:: ../images/plots/sac/sac_halfcheetah_performance.svg 8 | :align: center 9 | 10 | 11 | Hopper 12 | ------ 13 | 14 | .. figure:: ../images/plots/sac/sac_hopper_performance.svg 15 | :align: center 16 | 17 | 18 | Walker2d 19 | -------- 20 | 21 | .. figure:: ../images/plots/sac/sac_walker2d_performance.svg 22 | :align: center 23 | 24 | Swimmer 25 | ------- 26 | 27 | .. figure:: ../images/plots/sac/sac_swimmer_performance.svg 28 | :align: center 29 | 30 | 31 | Ant 32 | --- 33 | 34 | .. figure:: ../images/plots/sac/sac_ant_performance.svg 35 | :align: center -------------------------------------------------------------------------------- /docs/spinningup/bench_td3.rst: -------------------------------------------------------------------------------- 1 | TD3 Head-to-Head 2 | ================= 3 | 4 | HalfCheetah 5 | ----------- 6 | 7 | .. figure:: ../images/plots/td3/td3_halfcheetah_performance.svg 8 | :align: center 9 | 10 | 11 | Hopper 12 | ------ 13 | 14 | .. figure:: ../images/plots/td3/td3_hopper_performance.svg 15 | :align: center 16 | 17 | 18 | Walker2d 19 | -------- 20 | 21 | .. figure:: ../images/plots/td3/td3_walker2d_performance.svg 22 | :align: center 23 | 24 | Swimmer 25 | ------- 26 | 27 | .. figure:: ../images/plots/td3/td3_swimmer_performance.svg 28 | :align: center 29 | 30 | 31 | Ant 32 | --- 33 | 34 | .. figure:: ../images/plots/td3/td3_ant_performance.svg 35 | :align: center -------------------------------------------------------------------------------- /docs/spinningup/bench_vpg.rst: -------------------------------------------------------------------------------- 1 | Vanilla Policy Gradients Head-to-Head 2 | ===================================== 3 | 4 | HalfCheetah 5 | ----------- 6 | 7 | .. figure:: ../images/plots/vpg/vpg_halfcheetah_performance.svg 8 | :align: center 9 | 10 | 11 | Hopper 12 | ------ 13 | 14 | .. figure:: ../images/plots/vpg/vpg_hopper_performance.svg 15 | :align: center 16 | 17 | 18 | Walker2d 19 | -------- 20 | 21 | .. figure:: ../images/plots/vpg/vpg_walker2d_performance.svg 22 | :align: center 23 | 24 | Swimmer 25 | ------- 26 | 27 | .. 
figure:: ../images/plots/vpg/vpg_swimmer_performance.svg 28 | :align: center 29 | 30 | 31 | Ant 32 | --- 33 | 34 | .. figure:: ../images/plots/vpg/vpg_ant_performance.svg 35 | :align: center -------------------------------------------------------------------------------- /docs/spinningup/exercise2_1_soln.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Solution to Exercise 2.1 3 | ======================== 4 | 5 | .. figure:: ../images/ex2-1_trpo_hopper.png 6 | :align: center 7 | 8 | Learning curves for TRPO in Hopper-v2 with different values of ``train_v_iters``, averaged over three random seeds. 9 | 10 | 11 | The difference is quite substantial: with a trained value function, the agent is able to quickly make progress. With an untrained value function, the agent gets stuck early on. -------------------------------------------------------------------------------- /docs/spinningup/exercise2_2_soln.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Solution to Exercise 2.2 3 | ======================== 4 | 5 | .. figure:: ../images/ex2-2_ddpg_bug.svg 6 | :align: center 7 | 8 | Learning curves for DDPG in HalfCheetah-v2 for bugged and non-bugged actor-critic implementations, averaged over three random seeds. 9 | 10 | 11 | .. admonition:: You Should Know 12 | 13 | This page will give the solution primarily in terms of a detailed analysis of the Tensorflow version of this exercise. However, the problem in the PyTorch version is basically the same and so is its solution. 14 | 15 | 16 | The Bug in the Code: Tensorflow Version 17 | ======================================= 18 | 19 | The only difference between the correct actor-critic code, 20 | 21 | .. code-block:: python 22 | :emphasize-lines: 11, 13 23 | 24 | """ 25 | Actor-Critic 26 | """ 27 | def mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu, 28 | output_activation=tf.tanh, action_space=None): 29 | act_dim = a.shape.as_list()[-1] 30 | act_limit = action_space.high[0] 31 | with tf.variable_scope('pi'): 32 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 33 | with tf.variable_scope('q'): 34 | q = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 35 | with tf.variable_scope('q', reuse=True): 36 | q_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 37 | return pi, q, q_pi 38 | 39 | and the bugged actor-critic code, 40 | 41 | .. code-block:: python 42 | :emphasize-lines: 11, 13 43 | 44 | """ 45 | Bugged Actor-Critic 46 | """ 47 | def bugged_mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu, 48 | output_activation=tf.tanh, action_space=None): 49 | act_dim = a.shape.as_list()[-1] 50 | act_limit = action_space.high[0] 51 | with tf.variable_scope('pi'): 52 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 53 | with tf.variable_scope('q'): 54 | q = mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None) 55 | with tf.variable_scope('q', reuse=True): 56 | q_pi = mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None) 57 | return pi, q, q_pi 58 | 59 | is the tensor shape for the Q-functions. The correct version squeezes ouputs so that they have shape ``[batch size]``, whereas the bugged version doesn't, resulting in Q-functions with shape ``[batch size, 1]``. 
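As a quick way to see the difference, you can inspect the static shapes of the two outputs directly. The snippet below is only an illustration of the shape bookkeeping (it is not part of the exercise code, and the placeholder sizes are made up):

.. code-block:: python

    import tensorflow as tf

    # Stand-ins for the two Q-function outputs, with an unknown batch dimension.
    bugged_q = tf.placeholder(tf.float32, shape=(None, 1))   # shape the bugged code produces
    correct_q = tf.squeeze(bugged_q, axis=1)                 # shape the correct code produces

    print(bugged_q.shape)    # (?, 1)  -- rank 2, will silently broadcast against rank-1 tensors
    print(correct_q.shape)   # (?,)    -- rank 1, matching r_ph and d_ph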
60 | 61 | 62 | The Bug in the Code: PyTorch Version 63 | ==================================== 64 | 65 | In the PyTorch version of the exercise, the difference is virtually the same. The correct actor-critic code computes a forward pass on the Q-function that squeezes its output: 66 | 67 | 68 | .. code-block:: python 69 | :emphasize-lines: 12 70 | 71 | """ 72 | Correct Q-Function 73 | """ 74 | class MLPQFunction(nn.Module): 75 | 76 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 77 | super().__init__() 78 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 79 | 80 | def forward(self, obs, act): 81 | q = self.q(torch.cat([obs, act], dim=-1)) 82 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 83 | 84 | 85 | while the bugged version does not: 86 | 87 | .. code-block:: python 88 | :emphasize-lines: 11 89 | 90 | """ 91 | Bugged Q-Function 92 | """ 93 | class BuggedMLPQFunction(nn.Module): 94 | 95 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 96 | super().__init__() 97 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 98 | 99 | def forward(self, obs, act): 100 | return self.q(torch.cat([obs, act], dim=-1)) 101 | 102 | How it Gums Up the Works: Tensorflow Version 103 | ============================================ 104 | 105 | Consider the excerpt from the part in the code that builds the DDPG computation graph: 106 | 107 | .. code-block:: python 108 | 109 | # Bellman backup for Q function 110 | backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ) 111 | 112 | # DDPG losses 113 | pi_loss = -tf.reduce_mean(q_pi) 114 | q_loss = tf.reduce_mean((q-backup)**2) 115 | 116 | This is where the tensor shape issue comes into play. It's important to know that ``r_ph`` and ``d_ph`` have shape ``[batch size]``. 117 | 118 | The line that produces the Bellman backup was written with the assumption that it would add together tensors with the same shape. However, this line can **also** add together tensors with different shapes, as long as they're broadcast-compatible. 119 | 120 | Tensors with shapes ``[batch size]`` and ``[batch size, 1]`` are broadcast compatible, but the behavior is not actually what you might expect! Check out this example: 121 | 122 | >>> import tensorflow as tf 123 | >>> import numpy as np 124 | >>> x = tf.constant(np.arange(5)) 125 | >>> y = tf.constant(np.arange(5).reshape(-1,1)) 126 | >>> z1 = x * y 127 | >>> z2 = x + y 128 | >>> z3 = x + z1 129 | >>> x.shape 130 | TensorShape([Dimension(5)]) 131 | >>> y.shape 132 | TensorShape([Dimension(5), Dimension(1)]) 133 | >>> z1.shape 134 | TensorShape([Dimension(5), Dimension(5)]) 135 | >>> z2.shape 136 | TensorShape([Dimension(5), Dimension(5)]) 137 | >>> sess = tf.InteractiveSession() 138 | >>> sess.run(z1) 139 | array([[ 0, 0, 0, 0, 0], 140 | [ 0, 1, 2, 3, 4], 141 | [ 0, 2, 4, 6, 8], 142 | [ 0, 3, 6, 9, 12], 143 | [ 0, 4, 8, 12, 16]]) 144 | >>> sess.run(z2) 145 | array([[0, 1, 2, 3, 4], 146 | [1, 2, 3, 4, 5], 147 | [2, 3, 4, 5, 6], 148 | [3, 4, 5, 6, 7], 149 | [4, 5, 6, 7, 8]]) 150 | >>> sess.run(z3) 151 | array([[ 0, 1, 2, 3, 4], 152 | [ 0, 2, 4, 6, 8], 153 | [ 0, 3, 6, 9, 12], 154 | [ 0, 4, 8, 12, 16], 155 | [ 0, 5, 10, 15, 20]]) 156 | 157 | Adding or multiplying a shape ``[5]`` tensor by a shape ``[5,1]`` tensor returns a shape ``[5,5]`` tensor! 158 | 159 | When you don't squeeze the Q-functions, ``q_pi_targ`` has shape ``[batch size, 1]``, and the backup---and in turn, the whole Q-loss---gets totally messed up. 
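To see exactly how, here is a minimal NumPy sketch of the bugged backup computation (NumPy follows the same broadcasting rules; the values are placeholders, only the shapes matter):

.. code-block:: python

    import numpy as np

    batch_size = 5
    gamma = 0.99
    r = np.arange(batch_size, dtype=np.float32)              # like r_ph, shape (5,)
    d = np.zeros(batch_size, dtype=np.float32)               # like d_ph, shape (5,)
    q_pi_targ = np.ones((batch_size, 1), dtype=np.float32)   # the bugged shape, (5, 1)

    backup = r + gamma * (1 - d) * q_pi_targ
    print(backup.shape)   # (5, 5) -- a matrix, instead of the intended (5,)

Tracing through where each bad broadcast comes from: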
160 | 161 | Broadcast error 1: ``(1 - d_ph) * q_pi_targ`` becomes a ``[batch size, batch size]`` tensor containing the outer product of the mask with the target network Q-values. 162 | 163 | Broadcast error 2: ``r_ph`` then gets treated as a row vector and added to each row of ``(1 - d_ph) * q_pi_targ`` separately. 164 | 165 | Broadcast error 3: ``q_loss`` depends on ``q - backup``, which involves another bad broadcast between ``q`` (shape ``[batch size, 1]``) and ``backup`` (shape ``[batch size, batch size]``). 166 | 167 | To put it mathematically: let :math:`q`, :math:`q'`, :math:`r`, :math:`d` denote vectors containing the q-values, target q-values, rewards, and dones for a given batch, where there are :math:`n` entries in the batch. The correct backup is 168 | 169 | .. math:: 170 | 171 | z_i = r_i + \gamma (1-d_i) q'_i, 172 | 173 | and the correct loss function is 174 | 175 | .. math:: 176 | 177 | \frac{1}{n} \sum_{i=1}^n (q_i - z_i)^2. 178 | 179 | But with these errors, what gets computed is a backup *matrix*, 180 | 181 | .. math:: 182 | 183 | z_{ij} = r_j + \gamma (1-d_j) q'_i, 184 | 185 | and a messed up loss function 186 | 187 | .. math:: 188 | 189 | \frac{1}{n^2} \sum_{i=1}^n \sum_{j=1}^n (q_j - z_{ij})^2. 190 | 191 | If you leave this to run in HalfCheetah long enough, you'll actually see some non-trivial learning process, because weird details specific to this environment partly cancel out the errors. But almost everywhere else, it fails completely. 192 | 193 | 194 | How it Gums Up the Works: PyTorch Version 195 | ========================================= 196 | 197 | Exactly the same broadcasting shenanigans as in the Tensorflow version. Check out `this note`_ in the PyTorch documentation about it. 198 | 199 | 200 | .. figure:: ../images/ex2-2_ddpg_bug_pytorch.png 201 | :align: center 202 | 203 | Learning curves for DDPG in HalfCheetah-v2 for bugged and non-bugged actor-critic implementations using PyTorch, averaged over three random seeds. 204 | 205 | 206 | 207 | .. _`this note`: https://pytorch.org/docs/stable/notes/broadcasting.html#backwards-compatibility -------------------------------------------------------------------------------- /docs/spinningup/exercises.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Exercises 3 | ========= 4 | 5 | 6 | .. contents:: Table of Contents 7 | :depth: 2 8 | 9 | Problem Set 1: Basics of Implementation 10 | --------------------------------------- 11 | 12 | .. admonition:: Exercise 1.1: Gaussian Log-Likelihood 13 | 14 | **Path to Exercise:** 15 | 16 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_1/exercise1_1.py`` 17 | 18 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_1/exercise1_1.py`` 19 | 20 | **Path to Solution:** 21 | 22 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_1_solutions/exercise1_1_soln.py`` 23 | 24 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_1_solutions/exercise1_1_soln.py`` 25 | 26 | 27 | **Instructions.** Write a function that takes in the means and log stds of a batch of diagonal Gaussian distributions, along with (previously-generated) samples from those distributions, and returns the log likelihoods of those samples. (In the Tensorflow version, you will write a function that creates computation graph operations to do this; in the PyTorch version, you will directly operate on given Tensors.) 28 | 29 | You may find it useful to review the formula given in `this section of the RL introduction`_. 
30 | 31 | Implement your solution in ``exercise1_1.py``, and run that file to automatically check your work. 32 | 33 | **Evaluation Criteria.** Your solution will be checked by comparing outputs against a known-good implementation, using a batch of random inputs. 34 | 35 | .. _`this section of the RL introduction`: ../spinningup/rl_intro.html#stochastic-policies 36 | 37 | 38 | .. admonition:: Exercise 1.2: Policy for PPO 39 | 40 | **Path to Exercise:** 41 | 42 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_1/exercise1_2.py`` 43 | 44 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_1/exercise1_2.py`` 45 | 46 | **Path to Solution:** 47 | 48 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_1_solutions/exercise1_2_soln.py`` 49 | 50 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_1_solutions/exercise1_2_soln.py`` 51 | 52 | **Instructions.** Implement an MLP diagonal Gaussian policy for PPO. 53 | 54 | Implement your solution in ``exercise1_2.py``, and run that file to automatically check your work. 55 | 56 | **Evaluation Criteria.** Your solution will be evaluated by running for 20 epochs in the InvertedPendulum-v2 Gym environment, and this should take in the ballpark of 3-5 minutes (depending on your machine, and other processes you are running in the background). The bar for success is reaching an average score of over 500 in the last 5 epochs, or getting to a score of 1000 (the maximum possible score) in the last 5 epochs. 57 | 58 | 59 | .. admonition:: Exercise 1.3: Computation Graph for TD3 60 | 61 | **Path to Exercise.** 62 | 63 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_1/exercise1_3.py`` 64 | 65 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_1/exercise1_3.py`` 66 | 67 | **Path to Solution.** 68 | 69 | * PyTorch version: ``spinup/algos/pytorch/td3/td3.py`` 70 | 71 | * Tensorflow version: ``spinup/algos/tf1/td3/td3.py`` 72 | 73 | **Instructions.** Implement the main mathematical logic for the TD3 algorithm. 74 | 75 | As starter code, you are given the entirety of the TD3 algorithm except for the main mathematical logic (essentially, the loss functions and intermediate calculations needed for them). Find "YOUR CODE HERE" to begin. 76 | 77 | You may find it useful to review the pseudocode in our `page on TD3`_. 78 | 79 | Implement your solution in ``exercise1_3.py``, and run that file to see the results of your work. There is no automatic checking for this exercise. 80 | 81 | **Evaluation Criteria.** Evaluate your code by running ``exercise1_3.py`` with HalfCheetah-v2, InvertedPendulum-v2, and one other Gym MuJoCo environment of your choosing (set via the ``--env`` flag). It is set up to use smaller neural networks (hidden sizes [128,128]) than typical for TD3, with a maximum episode length of 150, and to run for only 10 epochs. The goal is to see significant learning progress relatively quickly (in terms of wall clock time). Experiments will likely take on the order of ~10 minutes. 82 | 83 | Use the ``--use_soln`` flag to run Spinning Up's TD3 instead of your implementation. Anecdotally, within 10 epochs, the score in HalfCheetah should go over 300, and the score in InvertedPendulum should max out at 150. 84 | 85 | .. _`page on TD3`: ../algorithms/td3.html 86 | 87 | 88 | Problem Set 2: Algorithm Failure Modes 89 | -------------------------------------- 90 | 91 | .. admonition:: Exercise 2.1: Value Function Fitting in TRPO 92 | 93 | **Path to Exercise.** (Not applicable, there is no code for this one.) 
94 | 95 | **Path to Solution.** `Solution available here. <../spinningup/exercise2_1_soln.html>`_ 96 | 97 | Many factors can impact the performance of policy gradient algorithms, but few more drastically than the quality of the learned value function used for advantage estimation. 98 | 99 | In this exercise, you will compare results between runs of TRPO where you put lots of effort into fitting the value function (``train_v_iters=80``), versus where you put very little effort into fitting the value function (``train_v_iters=0``). 100 | 101 | **Instructions.** Run the following command: 102 | 103 | .. parsed-literal:: 104 | 105 | python -m spinup.run trpo --env Hopper-v2 --train_v_iters[v] 0 80 --exp_name ex2-1 --epochs 250 --steps_per_epoch 4000 --seed 0 10 20 --dt 106 | 107 | and plot the results. (These experiments might take ~10 minutes each, and this command runs six of them.) What do you find? 108 | 109 | .. admonition:: Exercise 2.2: Silent Bug in DDPG 110 | 111 | **Path to Exercise.** 112 | 113 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_2/exercise2_2.py`` 114 | 115 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_2/exercise2_2.py`` 116 | 117 | **Path to Solution.** `Solution available here. <../spinningup/exercise2_2_soln.html>`_ 118 | 119 | The hardest part of writing RL code is dealing with bugs, because failures are frequently silent. The code will appear to run correctly, but the agent's performance will degrade relative to a bug-free implementation---sometimes to the extent that it never learns anything. 120 | 121 | In this exercise, you will observe a bug in vivo and compare results against correct code. The bug is the same (conceptually, if not in exact implementation) for both the PyTorch and Tensorflow versions of this exercise. 122 | 123 | **Instructions.** Run ``exercise2_2.py``, which will launch DDPG experiments with and without a bug. The non-bugged version runs the default Spinning Up implementation of DDPG, using a default method for creating the actor and critic networks. The bugged version runs the same DDPG code, except uses a bugged method for creating the networks. 124 | 125 | There will be six experiments in all (three random seeds for each case), and each should take in the ballpark of 10 minutes. When they're finished, plot the results. What is the difference in performance with and without the bug? 126 | 127 | Without referencing the correct actor-critic code (which is to say---don't look in DDPG's ``core.py`` file), try to figure out what the bug is and explain how it breaks things. 128 | 129 | **Hint.** To figure out what's going wrong, think about how the DDPG code implements the DDPG computation graph. For the Tensorflow version, look at this excerpt: 130 | 131 | .. code-block:: python 132 | 133 | # Bellman backup for Q function 134 | backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ) 135 | 136 | # DDPG losses 137 | pi_loss = -tf.reduce_mean(q_pi) 138 | q_loss = tf.reduce_mean((q-backup)**2) 139 | 140 | How could a bug in the actor-critic code have an impact here? 141 | 142 | **Bonus.** Are there any choices of hyperparameters which would have hidden the effects of the bug? 143 | 144 | 145 | Challenges 146 | ---------- 147 | 148 | .. admonition:: Write Code from Scratch 149 | 150 | As we suggest in `the essay <../spinningup/spinningup.html#learn-by-doing>`_, try reimplementing various deep RL algorithms from scratch. 151 | 152 | .. 
admonition:: Requests for Research 153 | 154 | If you feel comfortable with writing deep learning and deep RL code, consider trying to make progress on any of OpenAI's standing requests for research: 155 | 156 | * `Requests for Research 1 `_ 157 | * `Requests for Research 2 `_ -------------------------------------------------------------------------------- /docs/spinningup/extra_pg_proof1.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Extra Material 3 | ============== 4 | 5 | Proof for Don't Let the Past Distract You 6 | ========================================= 7 | 8 | In this subsection, we will prove that actions should not be reinforced for rewards obtained in the past. 9 | 10 | Expand out :math:`R(\tau)` in the expression for the `simplest policy gradient`_ to obtain: 11 | 12 | .. math:: 13 | 14 | \nabla_{\theta} J(\pi_{\theta}) &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(\tau)} \\ 15 | &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \sum_{t'=0}^T R(s_{t'}, a_{t'}, s_{t'+1})} \\ 16 | &= \sum_{t=0}^{T} \sum_{t'=0}^T \underE{\tau \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(s_{t'}, a_{t'}, s_{t'+1})}, 17 | 18 | and consider the term 19 | 20 | .. math:: 21 | 22 | \underE{\tau \sim \pi_{\theta}}{f(t,t')} = \underE{\tau \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(s_{t'}, a_{t'}, s_{t'+1})}. 23 | 24 | We will show that for the case of :math:`t' < t` (the reward comes before the action being reinforced), this term is zero. This is a complete proof of the original claim, because after dropping terms with :math:`t' < t` from the expression, we are left with the reward-to-go form of the policy gradient, as desired: 25 | 26 | .. math:: 27 | 28 | \nabla_{\theta} J(\pi_{\theta}) = \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \sum_{t'=t}^T R(s_{t'}, a_{t'}, s_{t'+1})} 29 | 30 | **1. Using the Marginal Distribution.** To proceed, we have to break down the expectation in :math:`\underE{\tau \sim \pi_{\theta}}{f(t,t')}`. It's an expectation over trajectories, but the expression inside the expectation only deals with a few states and actions: :math:`s_t`, :math:`a_t`, :math:`s_{t'}`, :math:`a_{t'}`, and :math:`s_{t'+1}`. So in computing the expectation, we only need to worry about the `marginal distribution`_ over these random variables. 31 | 32 | We derive: 33 | 34 | .. math:: 35 | 36 | \underE{\tau \sim \pi_{\theta}}{f(t,t')} &= \int_{\tau} P(\tau|\pi_{\theta}) f(t,t') \\ 37 | &= \int_{s_t, a_t, s_{t'}, a_{t'}, s_{t'+1}} P(s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} | \pi_{\theta}) f(t,t') \\ 38 | &= \underE{s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{f(t,t')}. 39 | 40 | **2. Probability Chain Rule.** Joint distributions can be calculated in terms of conditional and marginal probabilities via `chain rule of probability`_: :math:`P(A,B) = P(B|A) P(A)`. Here, we use this rule to compute 41 | 42 | .. math:: 43 | 44 | P(s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} | \pi_{\theta}) = P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) P(s_{t'}, a_{t'}, s_{t'+1} | \pi_{\theta}) 45 | 46 | 47 | **3. Separating Expectations Over Multiple Random Variables.** If we have an expectation over two random variables :math:`A` and :math:`B`, we can split it into an inner and outer expectation, where the inner expectation treats the variable from the outer expectation as a constant. 
Our ability to make this split relies on probability chain rule. Mathematically: 48 | 49 | .. math:: 50 | 51 | \underE{A,B}{f(A,B)} &= \int_{A,B} P(A,B) f(A,B) \\ 52 | &= \int_{A} \int_B P(B|A) P(A) f(A,B) \\ 53 | &= \int_A P(A) \int_B P(B|A) f(A,B) \\ 54 | &= \int_A P(A) \underE{B}{f(A,B) \Big| A} \\ 55 | &= \underE{A}{\underE{B}{f(A,B) \Big| A} } 56 | 57 | An expectation over :math:`s_t, a_t, s_{t'}, a_{t'}, s_{t'+1}` can thus be expressed by 58 | 59 | .. math:: 60 | 61 | \underE{\tau \sim \pi_{\theta}}{f(t,t')} &= \underE{s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{f(t,t')} \\ 62 | &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{\underE{s_t, a_t \sim \pi_{\theta}}{f(t,t') \Big| s_{t'}, a_{t'}, s_{t'+1}}} 63 | 64 | **4. Constants Can Be Pulled Outside of Expectations.** If a term inside an expectation is constant with respect to the variable being expected over, it can be pulled outside of the expectation. To give an example, consider again an expectation over two random variables :math:`A` and :math:`B`, where this time, :math:`f(A,B) = h(A) g(B)`. Then, using the result from before: 65 | 66 | .. math:: 67 | 68 | \underE{A,B}{f(A,B)} &= \underE{A}{\underE{B}{f(A,B) \Big| A}} \\ 69 | &= \underE{A}{\underE{B}{h(A) g(B) \Big| A}}\\ 70 | &= \underE{A}{h(A) \underE{B}{g(B) \Big| A}}. 71 | 72 | The function in our expectation decomposes this way, allowing us to write: 73 | 74 | .. math:: 75 | 76 | \underE{\tau \sim \pi_{\theta}}{f(t,t')} &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{\underE{s_t, a_t \sim \pi_{\theta}}{f(t,t') \Big| s_{t'}, a_{t'}, s_{t'+1}}} \\ 77 | &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{\underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(s_{t'}, a_{t'}, s_{t'+1}) \Big| s_{t'}, a_{t'}, s_{t'+1}}} \\ 78 | &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{R(s_{t'}, a_{t'}, s_{t'+1}) \underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_{t'}, a_{t'}, s_{t'+1}}}. 79 | 80 | **5. Applying the EGLP Lemma.** The last step in our proof relies on the `EGLP lemma`_. At this point, we will only worry about the innermost expectation, 81 | 82 | .. math:: 83 | 84 | \underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_{t'}, a_{t'}, s_{t'+1}} = \int_{s_t, a_t} P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) \nabla_{\theta} \log \pi_{\theta}(a_t |s_t). 85 | 86 | We now have to make a distinction between two cases: :math:`t' < t`, the case where the reward happened before the action, and :math:`t' \geq t`, where it didn't. 87 | 88 | **Case One: Reward Before Action.** If :math:`t' < t`, then the conditional probabilities for actions at :math:`a_t` come from the policy: 89 | 90 | .. math:: 91 | 92 | P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) &= \pi_{\theta}(a_t | s_t) P(s_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}), 93 | 94 | the innermost expectation can be broken down farther into 95 | 96 | .. 
math:: 97 | 98 | \underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_{t'}, a_{t'}, s_{t'+1}} &= \int_{s_t, a_t} P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \\ 99 | &= \int_{s_t} P(s_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) \int_{a_t} \pi_{\theta}(a_t | s_t) \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \\ 100 | &= \underE{s_t \sim \pi_{\theta}}{ \underE{a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_t } \Big| s_{t'}, a_{t'}, s_{t'+1}}. 101 | 102 | The EGLP lemma says that 103 | 104 | .. math:: 105 | 106 | \underE{a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_t } = 0, 107 | 108 | allowing us to conclude that for :math:`t' < t`, :math:`\underE{\tau \sim \pi_{\theta}}{f(t,t')} = 0`. 109 | 110 | **Case Two: Reward After Action.** What about the :math:`t' \geq t` case, though? Why doesn't the same logic apply? In this case, the conditional probabilities for :math:`a_t` can't be broken down the same way, because you're conditioning **on the future.** Think about it like this: let's say that every day, in the morning, you make a choice between going for a jog and going to work early, and you have a 50-50 chance of each option. If you condition on a future where you went to work early, what are the odds that you went for a jog? Clearly, you didn't. But if you're conditioning on the past---before you made the decision---what are the odds that you will later go for a jog? Now it's back to 50-50. 111 | 112 | So in the case where :math:`t' \geq t`, the conditional distribution over actions :math:`a_t` is **not** :math:`\pi(a_t|s_t)`, and the EGLP lemma does not apply. 113 | 114 | .. _`simplest policy gradient`: ../spinningup/rl_intro3.html#deriving-the-simplest-policy-gradient 115 | .. _`marginal distribution`: https://en.wikipedia.org/wiki/Marginal_distribution 116 | .. _`chain rule of probability`: https://en.wikipedia.org/wiki/Chain_rule_(probability) 117 | .. _`EGLP lemma`: ../spinningup/rl_intro3.html#expected-grad-log-prob-lemma -------------------------------------------------------------------------------- /docs/spinningup/extra_pg_proof2.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Extra Material 3 | ============== 4 | 5 | Proof for Using Q-Function in Policy Gradient Formula 6 | ===================================================== 7 | 8 | In this section, we will show that 9 | 10 | .. math:: 11 | 12 | \nabla_{\theta} J(\pi_{\theta}) &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \Big( \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big) Q^{\pi_{\theta}}(s_t, a_t)}, 13 | 14 | for the finite-horizon undiscounted return setting. (An analagous result holds in the infinite-horizon discounted case using basically the same proof.) 15 | 16 | 17 | The proof of this claim depends on the `law of iterated expectations`_. First, let's rewrite the expression for the policy gradient, starting from the reward-to-go form (using the notation :math:`\hat{R}_t = \sum_{t'=t}^T R(s_t', a_t', s_{t'+1})` to help shorten things): 18 | 19 | .. 
math:: 20 | 21 | \nabla_{\theta} J(\pi_{\theta}) &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \hat{R}_t} \\ 22 | &= \sum_{t=0}^{T} \underE{\tau \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \hat{R}_t} 23 | 24 | Define :math:`\tau_{:t} = (s_0, a_0, ..., s_t, a_t)` as the trajectory up to time :math:`t`, and :math:`\tau_{t:}` as the remainder of the trajectory after that. By the law of iterated expectations, we can break up the preceding expression into: 25 | 26 | .. math:: 27 | 28 | \nabla_{\theta} J(\pi_{\theta}) &= \sum_{t=0}^{T} \underE{\tau_{:t} \sim \pi_{\theta}}{ \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \hat{R}_t \right| \tau_{:t}}} 29 | 30 | The grad-log-prob is constant with respect to the inner expectation (because it depends on :math:`s_t` and :math:`a_t`, which the inner expectation conditions on as fixed in :math:`\tau_{:t}`), so it can be pulled out, leaving: 31 | 32 | .. math:: 33 | 34 | \nabla_{\theta} J(\pi_{\theta}) &= \sum_{t=0}^{T} \underE{\tau_{:t} \sim \pi_{\theta}}{ \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \hat{R}_t \right| \tau_{:t}}} 35 | 36 | In Markov Decision Processes, the future only depends on the most recent state and action. As a result, the inner expectation---which expects over the future, conditioned on the entirety of the past (everything up to time :math:`t`)---is equal to the same expectation if it only conditioned on the last timestep (just :math:`(s_t,a_t)`): 37 | 38 | .. math:: 39 | 40 | \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \hat{R}_t \right| \tau_{:t}} = \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \hat{R}_t \right| s_t, a_t}, 41 | 42 | which is the *definition* of :math:`Q^{\pi_{\theta}}(s_t, a_t)`: the expected return, starting from state :math:`s_t` and action :math:`a_t`, when acting on-policy for the rest of the trajectory. 43 | 44 | The result follows immediately. 45 | 46 | .. _`law of iterated expectations`: https://en.wikipedia.org/wiki/Law_of_total_expectation 47 | -------------------------------------------------------------------------------- /docs/spinningup/rl_intro4.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | Limitations and Frontiers 3 | ========================= 4 | 5 | 6 | Reward Design 7 | ============= 8 | 9 | 10 | Sample Complexity 11 | ================= 12 | 13 | 14 | Long-Horizon Tasks 15 | ================== -------------------------------------------------------------------------------- /docs/user/algorithms.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Algorithms 3 | ========== 4 | 5 | .. contents:: Table of Contents 6 | 7 | What's Included 8 | =============== 9 | 10 | The following algorithms are implemented in the Spinning Up package: 11 | 12 | - `Vanilla Policy Gradient`_ (VPG) 13 | - `Trust Region Policy Optimization`_ (TRPO) 14 | - `Proximal Policy Optimization`_ (PPO) 15 | - `Deep Deterministic Policy Gradient`_ (DDPG) 16 | - `Twin Delayed DDPG`_ (TD3) 17 | - `Soft Actor-Critic`_ (SAC) 18 | 19 | They are all implemented with `MLP`_ (non-recurrent) actor-critics, making them suitable for fully-observed, non-image-based RL environments, e.g. the `Gym Mujoco`_ environments. 
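If it helps to ground this, each algorithm is exposed as an ordinary Python function that you can call from a script. The sketch below assumes a MuJoCo Gym environment is installed and uses the PyTorch PPO implementation; the environment name, network size, epoch counts, and logging directory are placeholders, and the exact keyword arguments vary a little from algorithm to algorithm, so check the individual algorithm pages before relying on them.

.. code-block:: python

    import gym
    from spinup import ppo_pytorch as ppo

    # Environment constructor, actor-critic size, and where to write results.
    env_fn = lambda : gym.make('HalfCheetah-v2')
    ac_kwargs = dict(hidden_sizes=[64, 64])
    logger_kwargs = dict(output_dir='data/ppo_halfcheetah', exp_name='ppo_halfcheetah')

    ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=4000, epochs=50,
        logger_kwargs=logger_kwargs)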
20 | 21 | Spinning Up has two implementations for each algorithm (except for TRPO): one that uses `PyTorch`_ as the neural network library, and one that uses `Tensorflow v1`_ as the neural network library. (TRPO is currently only available in Tensorflow.) 22 | 23 | .. _`Gym Mujoco`: https://gym.openai.com/envs/#mujoco 24 | .. _`Vanilla Policy Gradient`: ../algorithms/vpg.html 25 | .. _`Trust Region Policy Optimization`: ../algorithms/trpo.html 26 | .. _`Proximal Policy Optimization`: ../algorithms/ppo.html 27 | .. _`Deep Deterministic Policy Gradient`: ../algorithms/ddpg.html 28 | .. _`Twin Delayed DDPG`: ../algorithms/td3.html 29 | .. _`Soft Actor-Critic`: ../algorithms/sac.html 30 | .. _`MLP`: https://en.wikipedia.org/wiki/Multilayer_perceptron 31 | .. _`PyTorch`: https://pytorch.org/ 32 | .. _`Tensorflow v1`: https://www.tensorflow.org/versions/r1.15/api_docs 33 | 34 | 35 | Why These Algorithms? 36 | ===================== 37 | 38 | We chose the core deep RL algorithms in this package to reflect useful progressions of ideas from the recent history of the field, culminating in two algorithms in particular---PPO and SAC---which are close to state of the art on reliability and sample efficiency among policy-learning algorithms. They also expose some of the trade-offs that get made in designing and using algorithms in deep RL. 39 | 40 | The On-Policy Algorithms 41 | ------------------------ 42 | 43 | Vanilla Policy Gradient is the most basic, entry-level algorithm in the deep RL space because it completely predates the advent of deep RL altogether. The core elements of VPG go all the way back to the late 80s / early 90s. It started a trail of research which ultimately led to stronger algorithms such as TRPO and then PPO soon after. 44 | 45 | A key feature of this line of work is that all of these algorithms are *on-policy*: that is, they don't use old data, which makes them weaker on sample efficiency. But this is for a good reason: these algorithms directly optimize the objective you care about---policy performance---and it works out mathematically that you need on-policy data to calculate the updates. So, this family of algorithms trades off sample efficiency in favor of stability---but you can see the progression of techniques (from VPG to TRPO to PPO) working to make up the deficit on sample efficiency. 46 | 47 | 48 | The Off-Policy Algorithms 49 | ------------------------- 50 | 51 | DDPG is a similarly foundational algorithm to VPG, although much younger---the theory of deterministic policy gradients, which led to DDPG, wasn't published until 2014. DDPG is closely connected to Q-learning algorithms, and it concurrently learns a Q-function and a policy which are updated to improve each other. 52 | 53 | Algorithms like DDPG and Q-Learning are *off-policy*, so they are able to reuse old data very efficiently. They gain this benefit by exploiting Bellman's equations for optimality, which a Q-function can be trained to satisfy using *any* environment interaction data (as long as there's enough experience from the high-reward areas in the environment). 54 | 55 | But problematically, there are no guarantees that doing a good job of satisfying Bellman's equations leads to having great policy performance. *Empirically* one can get great performance---and when it happens, the sample efficiency is wonderful---but the absence of guarantees makes algorithms in this class potentially brittle and unstable. 
TD3 and SAC are descendants of DDPG which make use of a variety of insights to mitigate these issues. 56 | 57 | 58 | Code Format 59 | =========== 60 | 61 | All implementations in Spinning Up adhere to a standard template. They are split into two files: an algorithm file, which contains the core logic of the algorithm, and a core file, which contains various utilities needed to run the algorithm. 62 | 63 | The algorithm file always starts with a class definition for an experience buffer object, which is used to store information from agent-environment interactions. Next, there is a single function which runs the algorithm. The algorithm function follows a template that is roughly the same across the PyTorch and Tensorflow versions, but we'll break it down for each separately below. Finally, there's some support in each algorithm file for directly running the algorithm in Gym environments from the command line (though this is not the recommended way to run the algorithms---we'll describe how to do that on the `Running Experiments`_ page). 64 | 65 | .. _`Running Experiments`: ../user/running.html 66 | 67 | The Algorithm Function: PyTorch Version 68 | --------------------------------------- 69 | 70 | The algorithm function for a PyTorch implementation performs the following tasks in (roughly) this order: 71 | 72 | 1) Logger setup 73 | 74 | 2) Random seed setting 75 | 76 | 3) Environment instantiation 77 | 78 | 4) Constructing the actor-critic PyTorch module via the ``actor_critic`` function passed to the algorithm function as an argument 79 | 80 | 5) Instantiating the experience buffer 81 | 82 | 6) Setting up callable loss functions that also provide diagnostics specific to the algorithm 83 | 84 | 7) Making PyTorch optimizers 85 | 86 | 8) Setting up model saving through the logger 87 | 88 | 9) Setting up an update function that runs one epoch of optimization or one step of descent 89 | 90 | 10) Running the main loop of the algorithm: 91 | 92 | a) Run the agent in the environment 93 | 94 | b) Periodically update the parameters of the agent according to the main equations of the algorithm 95 | 96 | c) Log key performance metrics and save agent 97 | 98 | 99 | 100 | The Algorithm Function: Tensorflow Version 101 | ------------------------------------------ 102 | 103 | The algorithm function for a Tensorflow implementation performs the following tasks in (roughly) this order: 104 | 105 | 1) Logger setup 106 | 107 | 2) Random seed setting 108 | 109 | 3) Environment instantiation 110 | 111 | 4) Making placeholders for the computation graph 112 | 113 | 5) Building the actor-critic computation graph via the ``actor_critic`` function passed to the algorithm function as an argument 114 | 115 | 6) Instantiating the experience buffer 116 | 117 | 7) Building the computation graph for loss functions and diagnostics specific to the algorithm 118 | 119 | 8) Making training ops 120 | 121 | 9) Making the TF Session and initializing parameters 122 | 123 | 10) Setting up model saving through the logger 124 | 125 | 11) Defining functions needed for running the main loop of the algorithm (e.g. 
the core update function, get action function, and test agent function, depending on the algorithm) 126 | 127 | 12) Running the main loop of the algorithm: 128 | 129 | a) Run the agent in the environment 130 | 131 | b) Periodically update the parameters of the agent according to the main equations of the algorithm 132 | 133 | c) Log key performance metrics and save agent 134 | 135 | 136 | 137 | The Core File 138 | ------------- 139 | 140 | The core files don't adhere as closely as the algorithms files to a template, but do have some approximate structure: 141 | 142 | 1) **Tensorflow only:** Functions related to making and managing placeholders 143 | 144 | 2) Functions for building sections of computation graph relevant to the ``actor_critic`` method for a particular algorithm 145 | 146 | 3) Any other useful functions 147 | 148 | 4) Implementations for an MLP actor-critic compatible with the algorithm, where both the policy and the value function(s) are represented by simple MLPs 149 | 150 | 151 | -------------------------------------------------------------------------------- /docs/user/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | 6 | .. contents:: Table of Contents 7 | 8 | Spinning Up requires Python3, OpenAI Gym, and OpenMPI. 9 | 10 | Spinning Up is currently only supported on Linux and OSX. It may be possible to install on Windows, though this hasn't been extensively tested. [#]_ 11 | 12 | .. admonition:: You Should Know 13 | 14 | Many examples and benchmarks in Spinning Up refer to RL environments that use the `MuJoCo`_ physics engine. MuJoCo is a proprietary software that requires a license, which is free to trial and free for students, but otherwise is not free. As a result, installing it is optional, but because of its importance to the research community---it is the de facto standard for benchmarking deep RL algorithms in continuous control---it is preferred. 15 | 16 | Don't worry if you decide not to install MuJoCo, though. You can definitely get started in RL by running RL algorithms on the `Classic Control`_ and `Box2d`_ environments in Gym, which are totally free to use. 17 | 18 | .. [#] It looks like at least one person has figured out `a workaround for running on Windows`_. If you try another way and succeed, please let us know how you did it! 19 | 20 | .. _`Classic Control`: https://gym.openai.com/envs/#classic_control 21 | .. _`Box2d`: https://gym.openai.com/envs/#box2d 22 | .. _`MuJoCo`: http://www.mujoco.org/index.html 23 | .. _`a workaround for running on Windows`: https://github.com/openai/spinningup/issues/23 24 | 25 | Installing Python 26 | ================= 27 | 28 | We recommend installing Python through Anaconda. Anaconda is a library that includes Python and many useful packages for Python, as well as an environment manager called conda that makes package management simple. 29 | 30 | Follow `the installation instructions`_ for Anaconda here. Download and install Anaconda3 (at time of writing, `Anaconda3-5.3.0`_). Then create a conda Python 3.6 env for organizing packages used in Spinning Up: 31 | 32 | .. parsed-literal:: 33 | 34 | conda create -n spinningup python=3.6 35 | 36 | To use Python from the environment you just created, activate the environment with: 37 | 38 | .. parsed-literal:: 39 | 40 | conda activate spinningup 41 | 42 | .. 
admonition:: You Should Know 43 | 44 | If you're new to Python environments and package management, this stuff can quickly get confusing or overwhelming, and you'll probably hit some snags along the way. (Especially, you should expect problems like, "I just installed this thing, but it says it's not found when I try to use it!") You may want to read through some clean explanations about what package management is, why it's a good idea, and what commands you'll typically have to execute to correctly use it. 45 | 46 | `FreeCodeCamp`_ has a good explanation worth reading. There's a shorter description on `Towards Data Science`_ which is also helpful and informative. Finally, if you're an extremely patient person, you may want to read the (dry, but very informative) `documentation page from Conda`_. 47 | 48 | .. _`the installation instructions`: https://docs.continuum.io/anaconda/install/ 49 | .. _`Anaconda3-5.3.0`: https://repo.anaconda.com/archive/ 50 | .. _`FreeCodeCamp`: https://medium.freecodecamp.org/why-you-need-python-environments-and-how-to-manage-them-with-conda-85f155f4353c 51 | .. _`Towards Data Science`: https://towardsdatascience.com/environment-management-with-conda-python-2-3-b9961a8a5097 52 | .. _`documentation page from Conda`: https://conda.io/docs/user-guide/tasks/manage-environments.html 53 | .. _`this Github issue for Tensorflow`: https://github.com/tensorflow/tensorflow/issues/20444 54 | 55 | 56 | Installing OpenMPI 57 | ================== 58 | 59 | Ubuntu 60 | ------ 61 | 62 | .. parsed-literal:: 63 | 64 | sudo apt-get update && sudo apt-get install libopenmpi-dev 65 | 66 | 67 | Mac OS X 68 | -------- 69 | Installation of system packages on Mac requires Homebrew_. With Homebrew installed, run the following: 70 | 71 | .. parsed-literal:: 72 | 73 | brew install openmpi 74 | 75 | .. _Homebrew: https://brew.sh 76 | 77 | Installing Spinning Up 78 | ====================== 79 | 80 | .. parsed-literal:: 81 | 82 | git clone https://github.com/openai/spinningup.git 83 | cd spinningup 84 | pip install -e . 85 | 86 | .. admonition:: You Should Know 87 | 88 | Spinning Up defaults to installing everything in Gym **except** the MuJoCo environments. In case you run into any trouble with the Gym installation, check out the `Gym`_ github page for help. If you want the MuJoCo environments, see the optional installation section below. 89 | 90 | .. _`Gym`: https://github.com/openai/gym 91 | 92 | Check Your Install 93 | ================== 94 | 95 | To see if you've successfully installed Spinning Up, try running PPO in the LunarLander-v2 environment with 96 | 97 | .. parsed-literal:: 98 | 99 | python -m spinup.run ppo --hid "[32,32]" --env LunarLander-v2 --exp_name installtest --gamma 0.999 100 | 101 | This might run for around 10 minutes, and you can leave it going in the background while you continue reading through documentation. This won't train the agent to completion, but will run it for long enough that you can see *some* learning progress when the results come in. 102 | 103 | After it finishes training, watch a video of the trained policy with 104 | 105 | .. parsed-literal:: 106 | 107 | python -m spinup.run test_policy data/installtest/installtest_s0 108 | 109 | And plot the results with 110 | 111 | .. parsed-literal:: 112 | 113 | python -m spinup.run plot data/installtest/installtest_s0 114 | 115 | 116 | Installing MuJoCo (Optional) 117 | ============================ 118 | 119 | First, go to the `mujoco-py`_ github page. 
Follow the installation instructions in the README, which describe how to install the MuJoCo physics engine and the mujoco-py package (which allows the use of MuJoCo from Python). 120 | 121 | .. admonition:: You Should Know 122 | 123 | In order to use the MuJoCo simulator, you will need to get a `MuJoCo license`_. Free 30-day licenses are available to anyone, and free 1-year licenses are available to full-time students. 124 | 125 | Once you have installed MuJoCo, install the corresponding Gym environments with 126 | 127 | .. parsed-literal:: 128 | 129 | pip install gym[mujoco,robotics] 130 | 131 | And then check that things are working by running PPO in the Walker2d-v2 environment with 132 | 133 | .. parsed-literal:: 134 | 135 | python -m spinup.run ppo --hid "[32,32]" --env Walker2d-v2 --exp_name mujocotest 136 | 137 | 138 | .. _`mujoco-py`: https://github.com/openai/mujoco-py 139 | .. _`MuJoCo license`: https://www.roboti.us/license.html 140 | -------------------------------------------------------------------------------- /docs/user/plotting.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Plotting Results 3 | ================ 4 | 5 | Spinning Up ships with a simple plotting utility for interpreting results. Run it with: 6 | 7 | .. parsed-literal:: 8 | 9 | python -m spinup.run plot [path/to/output_directory ...] [--legend [LEGEND ...]] 10 | [--xaxis XAXIS] [--value [VALUE ...]] [--count] [--smooth S] 11 | [--select [SEL ...]] [--exclude [EXC ...]] 12 | 13 | 14 | **Positional Arguments:** 15 | 16 | .. option:: logdir 17 | 18 | *strings*. As many log directories (or prefixes to log directories, which the plotter will autocomplete internally) as you'd like to plot from. Logdirs will be searched recursively for experiment outputs. 19 | 20 | .. admonition:: You Should Know 21 | 22 | The internal autocompleting is really handy! Suppose you have run several experiments, with the aim of comparing performance between different algorithms, resulting in a log directory structure of: 23 | 24 | .. parsed-literal:: 25 | 26 | data/ 27 | bench_algo1/ 28 | bench_algo1-seed0/ 29 | bench_algo1-seed10/ 30 | bench_algo2/ 31 | bench_algo2-seed0/ 32 | bench_algo2-seed10/ 33 | 34 | You can easily produce a graph comparing algo1 and algo2 with: 35 | 36 | .. parsed-literal:: 37 | 38 | python spinup/utils/plot.py data/bench_algo 39 | 40 | relying on the autocomplete to find both ``data/bench_algo1`` and ``data/bench_algo2``. 41 | 42 | **Optional Arguments:** 43 | 44 | .. option:: -l, --legend=[LEGEND ...] 45 | 46 | *strings*. Optional way to specify legend for the plot. The plotter legend will automatically use the ``exp_name`` from the ``config.json`` file, unless you tell it otherwise through this flag. This only works if you provide a name for each directory that will get plotted. (Note: this may not be the same as the number of logdir args you provide! Recall that the plotter looks for autocompletes of the logdir args: there may be more than one match for a given logdir prefix, and you will need to provide a legend string for each one of those matches---unless you have removed some of them as candidates via selection or exclusion rules (below).) 47 | 48 | .. option:: -x, --xaxis=XAXIS, default='TotalEnvInteracts' 49 | 50 | *string*. Pick what column from data is used for the x-axis. 51 | 52 | .. option:: -y, --value=[VALUE ...], default='Performance' 53 | 54 | *strings*. Pick what columns from data to graph on the y-axis. 
Submitting multiple values will produce multiple graphs. Defaults to ``Performance``, which is not an actual output of any algorithm. Instead, ``Performance`` refers to either ``AverageEpRet``, the correct performance measure for the on-policy algorithms, or ``AverageTestEpRet``, the correct performance measure for the off-policy algorithms. The plotter will automatically figure out which of ``AverageEpRet`` or ``AverageTestEpRet`` to report for each separate logdir. 55 | 56 | .. option:: --count 57 | 58 | Optional flag. By default, the plotter shows y-values which are averaged across all results that share an ``exp_name``, which is typically a set of identical experiments that only vary in random seed. But if you'd like to see all of those curves separately, use the ``--count`` flag. 59 | 60 | .. option:: -s, --smooth=S, default=1 61 | 62 | *int*. Smooth data by averaging it over a fixed window. This parameter says how wide the averaging window will be. 63 | 64 | .. option:: --select=[SEL ...] 65 | 66 | *strings*. Optional selection rule: the plotter will only show curves from logdirs that contain all of these substrings. 67 | 68 | .. option:: --exclude=[EXC ...] 69 | 70 | *strings*. Optional exclusion rule: the plotter will only show curves from logdirs that do not contain these substrings. 71 | -------------------------------------------------------------------------------- /docs/utils/mpi.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | MPI Tools 3 | ========= 4 | 5 | .. contents:: Table of Contents 6 | 7 | Core MPI Utilities 8 | ================== 9 | 10 | .. automodule:: spinup.utils.mpi_tools 11 | :members: 12 | 13 | 14 | MPI + PyTorch Utilities 15 | ======================= 16 | 17 | ``spinup.utils.mpi_pytorch`` contains a few tools to make it easy to do data-parallel PyTorch optimization across MPI processes. The two main ingredients are syncing parameters and averaging gradients before they are used by the adaptive optimizer. There is also a hacky fix for a problem where the PyTorch instance in each separate process tries to get too many threads, and they start to clobber each other. 18 | 19 | The pattern for using these tools looks something like this: 20 | 21 | 1) At the beginning of the training script, call ``setup_pytorch_for_mpi()``. (Avoids clobbering problem.) 22 | 23 | 2) After you've constructed a PyTorch module, call ``sync_params(module)``. 24 | 25 | 3) Then, during gradient descent, call ``mpi_avg_grads`` after the backward pass, like so: 26 | 27 | .. code-block:: python 28 | 29 | optimizer.zero_grad() 30 | loss = compute_loss(module) 31 | loss.backward() 32 | mpi_avg_grads(module) # averages gradient buffers across MPI processes! 33 | optimizer.step() 34 | 35 | 36 | .. automodule:: spinup.utils.mpi_pytorch 37 | :members: 38 | 39 | MPI + Tensorflow Utilities 40 | ========================== 41 | 42 | ``spinup.utils.mpi_tf`` contains a few tools to make it easy to use the AdamOptimizer across many MPI processes. This is a bit hacky---if you're looking for something more sophisticated and general-purpose, consider `horovod`_. 43 | 44 | .. _`horovod`: https://github.com/uber/horovod 45 | 46 | .. 
automodule:: spinup.utils.mpi_tf 47 | :members: 48 | -------------------------------------------------------------------------------- /docs/utils/plotter.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Plotter 3 | ======= 4 | 5 | See the page on `plotting results`_ for documentation of the plotter. 6 | 7 | .. _`plotting results`: ../user/plotting.html -------------------------------------------------------------------------------- /docs/utils/run_utils.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Run Utils 3 | ========= 4 | 5 | .. contents:: Table of Contents 6 | 7 | ExperimentGrid 8 | ============== 9 | 10 | Spinning Up ships with a tool called ExperimentGrid for making hyperparameter ablations easier. This is based on (but simpler than) `the rllab tool`_ called VariantGenerator. 11 | 12 | .. _`the rllab tool`: https://github.com/rll/rllab/blob/master/rllab/misc/instrument.py#L173 13 | 14 | .. autoclass:: spinup.utils.run_utils.ExperimentGrid 15 | :members: 16 | 17 | 18 | Calling Experiments 19 | =================== 20 | 21 | .. autofunction:: spinup.utils.run_utils.call_experiment 22 | 23 | .. autofunction:: spinup.utils.run_utils.setup_logger_kwargs 24 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | **Status:** Maintenance (expect bug fixes and minor updates) 2 | 3 | Welcome to Spinning Up in Deep RL! 4 | ================================== 5 | 6 | This is an educational resource produced by OpenAI that makes it easier to learn about deep reinforcement learning (deep RL). 7 | 8 | For the unfamiliar: [reinforcement learning](https://en.wikipedia.org/wiki/Reinforcement_learning) (RL) is a machine learning approach for teaching agents how to solve tasks by trial and error. Deep RL refers to the combination of RL with [deep learning](http://ufldl.stanford.edu/tutorial/). 9 | 10 | This module contains a variety of helpful resources, including: 11 | 12 | - a short [introduction](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html) to RL terminology, kinds of algorithms, and basic theory, 13 | - an [essay](https://spinningup.openai.com/en/latest/spinningup/spinningup.html) about how to grow into an RL research role, 14 | - a [curated list](https://spinningup.openai.com/en/latest/spinningup/keypapers.html) of important papers organized by topic, 15 | - a well-documented [code repo](https://github.com/openai/spinningup) of short, standalone implementations of key algorithms, 16 | - and a few [exercises](https://spinningup.openai.com/en/latest/spinningup/exercises.html) to serve as warm-ups. 17 | 18 | Get started at [spinningup.openai.com](https://spinningup.openai.com)! 
19 | 20 | 21 | Citing Spinning Up 22 | ------------------ 23 | 24 | If you reference or use Spinning Up in your research, please cite: 25 | 26 | ``` 27 | @article{SpinningUp2018, 28 | author = {Achiam, Joshua}, 29 | title = {{Spinning Up in Deep Reinforcement Learning}}, 30 | year = {2018} 31 | } 32 | ``` -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | build: 2 | image: latest 3 | 4 | python: 5 | version: 3.6 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os.path import join, dirname, realpath 2 | from setuptools import setup 3 | import sys 4 | 5 | assert sys.version_info.major == 3 and sys.version_info.minor >= 6, \ 6 | "The Spinning Up repo is designed to work with Python 3.6 and greater." \ 7 | + "Please install it before proceeding." 8 | 9 | with open(join("spinup", "version.py")) as version_file: 10 | exec(version_file.read()) 11 | 12 | setup( 13 | name='spinup', 14 | py_modules=['spinup'], 15 | version=__version__,#'0.1', 16 | install_requires=[ 17 | 'cloudpickle==1.2.1', 18 | 'gym[atari,box2d,classic_control]~=0.15.3', 19 | 'ipython', 20 | 'joblib', 21 | 'matplotlib==3.1.1', 22 | 'mpi4py', 23 | 'numpy', 24 | 'pandas', 25 | 'pytest', 26 | 'psutil', 27 | 'scipy', 28 | 'seaborn==0.8.1', 29 | 'tensorflow>=1.8.0,<2.0', 30 | 'torch==1.3.1', 31 | 'tqdm' 32 | ], 33 | description="Teaching tools for introducing people to deep RL.", 34 | author="Joshua Achiam", 35 | ) 36 | -------------------------------------------------------------------------------- /spinup/__init__.py: -------------------------------------------------------------------------------- 1 | # Disable TF deprecation warnings. 2 | # Syntax from tf1 is not expected to be compatible with tf2. 
3 | import tensorflow as tf 4 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 5 | 6 | # Algorithms 7 | from spinup.algos.tf1.ddpg.ddpg import ddpg as ddpg_tf1 8 | from spinup.algos.tf1.ppo.ppo import ppo as ppo_tf1 9 | from spinup.algos.tf1.sac.sac import sac as sac_tf1 10 | from spinup.algos.tf1.td3.td3 import td3 as td3_tf1 11 | from spinup.algos.tf1.trpo.trpo import trpo as trpo_tf1 12 | from spinup.algos.tf1.vpg.vpg import vpg as vpg_tf1 13 | 14 | from spinup.algos.pytorch.ddpg.ddpg import ddpg as ddpg_pytorch 15 | from spinup.algos.pytorch.ppo.ppo import ppo as ppo_pytorch 16 | from spinup.algos.pytorch.sac.sac import sac as sac_pytorch 17 | from spinup.algos.pytorch.td3.td3 import td3 as td3_pytorch 18 | from spinup.algos.pytorch.trpo.trpo import trpo as trpo_pytorch 19 | from spinup.algos.pytorch.vpg.vpg import vpg as vpg_pytorch 20 | 21 | # Loggers 22 | from spinup.utils.logx import Logger, EpochLogger 23 | 24 | # Version 25 | from spinup.version import __version__ -------------------------------------------------------------------------------- /spinup/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/__init__.py -------------------------------------------------------------------------------- /spinup/algos/pytorch/ddpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def mlp(sizes, activation, output_activation=nn.Identity): 14 | layers = [] 15 | for j in range(len(sizes)-1): 16 | act = activation if j < len(sizes)-2 else output_activation 17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 18 | return nn.Sequential(*layers) 19 | 20 | def count_vars(module): 21 | return sum([np.prod(p.shape) for p in module.parameters()]) 22 | 23 | class MLPActor(nn.Module): 24 | 25 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 26 | super().__init__() 27 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 28 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 29 | self.act_limit = act_limit 30 | 31 | def forward(self, obs): 32 | # Return output from network scaled to action space limits. 33 | return self.act_limit * self.pi(obs) 34 | 35 | class MLPQFunction(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 40 | 41 | def forward(self, obs, act): 42 | q = self.q(torch.cat([obs, act], dim=-1)) 43 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 
44 | 45 | class MLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | 59 | def act(self, obs): 60 | with torch.no_grad(): 61 | return self.pi(obs).numpy() 62 | -------------------------------------------------------------------------------- /spinup/algos/pytorch/ppo/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.distributions.normal import Normal 8 | from torch.distributions.categorical import Categorical 9 | 10 | 11 | def combined_shape(length, shape=None): 12 | if shape is None: 13 | return (length,) 14 | return (length, shape) if np.isscalar(shape) else (length, *shape) 15 | 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | def count_vars(module): 26 | return sum([np.prod(p.shape) for p in module.parameters()]) 27 | 28 | 29 | def discount_cumsum(x, discount): 30 | """ 31 | magic from rllab for computing discounted cumulative sums of vectors. 32 | 33 | input: 34 | vector x, 35 | [x0, 36 | x1, 37 | x2] 38 | 39 | output: 40 | [x0 + discount * x1 + discount^2 * x2, 41 | x1 + discount * x2, 42 | x2] 43 | """ 44 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 45 | 46 | 47 | class Actor(nn.Module): 48 | 49 | def _distribution(self, obs): 50 | raise NotImplementedError 51 | 52 | def _log_prob_from_distribution(self, pi, act): 53 | raise NotImplementedError 54 | 55 | def forward(self, obs, act=None): 56 | # Produce action distributions for given observations, and 57 | # optionally compute the log likelihood of given actions under 58 | # those distributions. 
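        # (Concretely: the Categorical subclass below gets logp_a of shape
        # (batch,) directly from pi.log_prob, while the Gaussian subclass sums
        # per-dimension log-probs over the last axis, so both cases yield one
        # log-likelihood per observation.)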
59 | pi = self._distribution(obs) 60 | logp_a = None 61 | if act is not None: 62 | logp_a = self._log_prob_from_distribution(pi, act) 63 | return pi, logp_a 64 | 65 | 66 | class MLPCategoricalActor(Actor): 67 | 68 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 69 | super().__init__() 70 | self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 71 | 72 | def _distribution(self, obs): 73 | logits = self.logits_net(obs) 74 | return Categorical(logits=logits) 75 | 76 | def _log_prob_from_distribution(self, pi, act): 77 | return pi.log_prob(act) 78 | 79 | 80 | class MLPGaussianActor(Actor): 81 | 82 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 83 | super().__init__() 84 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 85 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 86 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 87 | 88 | def _distribution(self, obs): 89 | mu = self.mu_net(obs) 90 | std = torch.exp(self.log_std) 91 | return Normal(mu, std) 92 | 93 | def _log_prob_from_distribution(self, pi, act): 94 | return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution 95 | 96 | 97 | class MLPCritic(nn.Module): 98 | 99 | def __init__(self, obs_dim, hidden_sizes, activation): 100 | super().__init__() 101 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 102 | 103 | def forward(self, obs): 104 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 105 | 106 | 107 | 108 | class MLPActorCritic(nn.Module): 109 | 110 | 111 | def __init__(self, observation_space, action_space, 112 | hidden_sizes=(64,64), activation=nn.Tanh): 113 | super().__init__() 114 | 115 | obs_dim = observation_space.shape[0] 116 | 117 | # policy builder depends on action space 118 | if isinstance(action_space, Box): 119 | self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation) 120 | elif isinstance(action_space, Discrete): 121 | self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation) 122 | 123 | # build value function 124 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 125 | 126 | def step(self, obs): 127 | with torch.no_grad(): 128 | pi = self.pi._distribution(obs) 129 | a = pi.sample() 130 | logp_a = self.pi._log_prob_from_distribution(pi, a) 131 | v = self.v(obs) 132 | return a.numpy(), v.numpy(), logp_a.numpy() 133 | 134 | def act(self, obs): 135 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinup/algos/pytorch/sac/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.distributions.normal import Normal 8 | 9 | 10 | def combined_shape(length, shape=None): 11 | if shape is None: 12 | return (length,) 13 | return (length, shape) if np.isscalar(shape) else (length, *shape) 14 | 15 | def mlp(sizes, activation, output_activation=nn.Identity): 16 | layers = [] 17 | for j in range(len(sizes)-1): 18 | act = activation if j < len(sizes)-2 else output_activation 19 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 20 | return nn.Sequential(*layers) 21 | 22 | def count_vars(module): 23 | return sum([np.prod(p.shape) for p in module.parameters()]) 24 | 25 | 26 | LOG_STD_MAX = 2 27 | LOG_STD_MIN = -20 28 | 29 | class SquashedGaussianMLPActor(nn.Module): 
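    # Descriptive note: the network below outputs the mean and log-std of a
    # diagonal Gaussian; log-std is clamped to [LOG_STD_MIN, LOG_STD_MAX] =
    # [-20, 2] (std roughly in [2e-9, 7.4]) for numerical stability, the sample
    # is drawn with rsample() so gradients flow through the reparameterization
    # trick, and the result is squashed with tanh and rescaled by act_limit.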
30 | 31 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 32 | super().__init__() 33 | self.net = mlp([obs_dim] + list(hidden_sizes), activation, activation) 34 | self.mu_layer = nn.Linear(hidden_sizes[-1], act_dim) 35 | self.log_std_layer = nn.Linear(hidden_sizes[-1], act_dim) 36 | self.act_limit = act_limit 37 | 38 | def forward(self, obs, deterministic=False, with_logprob=True): 39 | net_out = self.net(obs) 40 | mu = self.mu_layer(net_out) 41 | log_std = self.log_std_layer(net_out) 42 | log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) 43 | std = torch.exp(log_std) 44 | 45 | # Pre-squash distribution and sample 46 | pi_distribution = Normal(mu, std) 47 | if deterministic: 48 | # Only used for evaluating policy at test time. 49 | pi_action = mu 50 | else: 51 | pi_action = pi_distribution.rsample() 52 | 53 | if with_logprob: 54 | # Compute logprob from Gaussian, and then apply correction for Tanh squashing. 55 | # NOTE: The correction formula is a little bit magic. To get an understanding 56 | # of where it comes from, check out the original SAC paper (arXiv 1801.01290) 57 | # and look in appendix C. This is a more numerically-stable equivalent to Eq 21. 58 | # Try deriving it yourself as a (very difficult) exercise. :) 59 | logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1) 60 | logp_pi -= (2*(np.log(2) - pi_action - F.softplus(-2*pi_action))).sum(axis=1) 61 | else: 62 | logp_pi = None 63 | 64 | pi_action = torch.tanh(pi_action) 65 | pi_action = self.act_limit * pi_action 66 | 67 | return pi_action, logp_pi 68 | 69 | 70 | class MLPQFunction(nn.Module): 71 | 72 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 73 | super().__init__() 74 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 75 | 76 | def forward(self, obs, act): 77 | q = self.q(torch.cat([obs, act], dim=-1)) 78 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 
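# Where the "magic" correction above comes from (illustrative sketch): for the
# change of variables a = tanh(u), each dimension's log-density picks up
# -log|da/du| = -log(1 - tanh(u)^2), and the identity
#     log(1 - tanh(u)^2) = 2*(log 2 - u - softplus(-2u))
# is just a rewriting that avoids forming 1 - tanh(u)^2 directly, which loses
# precision for large |u|. A quick numerical check:
def _tanh_logdet_check():
    u = torch.linspace(-5., 5., steps=11)
    naive = torch.log(1 - torch.tanh(u)**2)
    stable = 2 * (np.log(2) - u - F.softplus(-2*u))
    return torch.allclose(naive, stable, atol=1e-6)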
79 | 80 | class MLPActorCritic(nn.Module): 81 | 82 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 83 | activation=nn.ReLU): 84 | super().__init__() 85 | 86 | obs_dim = observation_space.shape[0] 87 | act_dim = action_space.shape[0] 88 | act_limit = action_space.high[0] 89 | 90 | # build policy and value functions 91 | self.pi = SquashedGaussianMLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 92 | self.q1 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 93 | self.q2 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 94 | 95 | def act(self, obs, deterministic=False): 96 | with torch.no_grad(): 97 | a, _ = self.pi(obs, deterministic, False) 98 | return a.numpy() 99 | -------------------------------------------------------------------------------- /spinup/algos/pytorch/td3/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def mlp(sizes, activation, output_activation=nn.Identity): 14 | layers = [] 15 | for j in range(len(sizes)-1): 16 | act = activation if j < len(sizes)-2 else output_activation 17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 18 | return nn.Sequential(*layers) 19 | 20 | def count_vars(module): 21 | return sum([np.prod(p.shape) for p in module.parameters()]) 22 | 23 | class MLPActor(nn.Module): 24 | 25 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 26 | super().__init__() 27 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 28 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 29 | self.act_limit = act_limit 30 | 31 | def forward(self, obs): 32 | # Return output from network scaled to action space limits. 33 | return self.act_limit * self.pi(obs) 34 | 35 | class MLPQFunction(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 40 | 41 | def forward(self, obs, act): 42 | q = self.q(torch.cat([obs, act], dim=-1)) 43 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 44 | 45 | class MLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q1 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | self.q2 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 59 | 60 | def act(self, obs): 61 | with torch.no_grad(): 62 | return self.pi(obs).numpy() 63 | -------------------------------------------------------------------------------- /spinup/algos/pytorch/trpo/trpo.py: -------------------------------------------------------------------------------- 1 | def trpo(*args, **kwargs): 2 | print('\n\nUnfortunately, TRPO has not yet been implemented in PyTorch '\ 3 | + 'for Spinning Up. 
TRPO will migrate some time in the future.\n\n') 4 | raise NotImplementedError -------------------------------------------------------------------------------- /spinup/algos/pytorch/vpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.distributions.normal import Normal 8 | from torch.distributions.categorical import Categorical 9 | 10 | 11 | def combined_shape(length, shape=None): 12 | if shape is None: 13 | return (length,) 14 | return (length, shape) if np.isscalar(shape) else (length, *shape) 15 | 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | def count_vars(module): 26 | return sum([np.prod(p.shape) for p in module.parameters()]) 27 | 28 | 29 | def discount_cumsum(x, discount): 30 | """ 31 | magic from rllab for computing discounted cumulative sums of vectors. 32 | 33 | input: 34 | vector x, 35 | [x0, 36 | x1, 37 | x2] 38 | 39 | output: 40 | [x0 + discount * x1 + discount^2 * x2, 41 | x1 + discount * x2, 42 | x2] 43 | """ 44 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 45 | 46 | 47 | class Actor(nn.Module): 48 | 49 | def _distribution(self, obs): 50 | raise NotImplementedError 51 | 52 | def _log_prob_from_distribution(self, pi, act): 53 | raise NotImplementedError 54 | 55 | def forward(self, obs, act=None): 56 | # Produce action distributions for given observations, and 57 | # optionally compute the log likelihood of given actions under 58 | # those distributions. 59 | pi = self._distribution(obs) 60 | logp_a = None 61 | if act is not None: 62 | logp_a = self._log_prob_from_distribution(pi, act) 63 | return pi, logp_a 64 | 65 | 66 | class MLPCategoricalActor(Actor): 67 | 68 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 69 | super().__init__() 70 | self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 71 | 72 | def _distribution(self, obs): 73 | logits = self.logits_net(obs) 74 | return Categorical(logits=logits) 75 | 76 | def _log_prob_from_distribution(self, pi, act): 77 | return pi.log_prob(act) 78 | 79 | 80 | class MLPGaussianActor(Actor): 81 | 82 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 83 | super().__init__() 84 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 85 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 86 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 87 | 88 | def _distribution(self, obs): 89 | mu = self.mu_net(obs) 90 | std = torch.exp(self.log_std) 91 | return Normal(mu, std) 92 | 93 | def _log_prob_from_distribution(self, pi, act): 94 | return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution 95 | 96 | 97 | class MLPCritic(nn.Module): 98 | 99 | def __init__(self, obs_dim, hidden_sizes, activation): 100 | super().__init__() 101 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 102 | 103 | def forward(self, obs): 104 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 
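# Note on the .sum(axis=-1) in MLPGaussianActor above (illustrative sketch):
# torch.distributions.Normal treats each action dimension as an independent
# Normal, so log_prob returns per-dimension values of shape (batch, act_dim);
# summing over the last axis gives the joint log-likelihood of the whole action
# vector, matching the (batch,) shape that Categorical.log_prob already returns.
def _gaussian_logp_shape_demo(batch=3, act_dim=2):
    pi = Normal(torch.zeros(batch, act_dim), torch.ones(batch, act_dim))
    act = pi.sample()
    per_dim = pi.log_prob(act)       # shape (batch, act_dim)
    joint = per_dim.sum(axis=-1)     # shape (batch,)
    return per_dim.shape, joint.shape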
105 | 106 | 107 | 108 | class MLPActorCritic(nn.Module): 109 | 110 | 111 | def __init__(self, observation_space, action_space, 112 | hidden_sizes=(64,64), activation=nn.Tanh): 113 | super().__init__() 114 | 115 | obs_dim = observation_space.shape[0] 116 | 117 | # policy builder depends on action space 118 | if isinstance(action_space, Box): 119 | self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation) 120 | elif isinstance(action_space, Discrete): 121 | self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation) 122 | 123 | # build value function 124 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 125 | 126 | def step(self, obs): 127 | with torch.no_grad(): 128 | pi = self.pi._distribution(obs) 129 | a = pi.sample() 130 | logp_a = self.pi._log_prob_from_distribution(pi, a) 131 | v = self.v(obs) 132 | return a.numpy(), v.numpy(), logp_a.numpy() 133 | 134 | def act(self, obs): 135 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinup/algos/tf1/ddpg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/ddpg/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/ddpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 7 | 8 | def placeholders(*args): 9 | return [placeholder(dim) for dim in args] 10 | 11 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 12 | for h in hidden_sizes[:-1]: 13 | x = tf.layers.dense(x, units=h, activation=activation) 14 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 15 | 16 | def get_vars(scope): 17 | return [x for x in tf.global_variables() if scope in x.name] 18 | 19 | def count_vars(scope): 20 | v = get_vars(scope) 21 | return sum([np.prod(var.shape.as_list()) for var in v]) 22 | 23 | """ 24 | Actor-Critics 25 | """ 26 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 27 | output_activation=tf.tanh, action_space=None): 28 | act_dim = a.shape.as_list()[-1] 29 | act_limit = action_space.high[0] 30 | with tf.variable_scope('pi'): 31 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 32 | with tf.variable_scope('q'): 33 | q = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 34 | with tf.variable_scope('q', reuse=True): 35 | q_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 36 | return pi, q, q_pi 37 | -------------------------------------------------------------------------------- /spinup/algos/tf1/ppo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/ppo/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/ppo/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 
| EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def placeholder(dim=None): 14 | return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 15 | 16 | def placeholders(*args): 17 | return [placeholder(dim) for dim in args] 18 | 19 | def placeholder_from_space(space): 20 | if isinstance(space, Box): 21 | return placeholder(space.shape) 22 | elif isinstance(space, Discrete): 23 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 24 | raise NotImplementedError 25 | 26 | def placeholders_from_spaces(*args): 27 | return [placeholder_from_space(space) for space in args] 28 | 29 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 30 | for h in hidden_sizes[:-1]: 31 | x = tf.layers.dense(x, units=h, activation=activation) 32 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 33 | 34 | def get_vars(scope=''): 35 | return [x for x in tf.trainable_variables() if scope in x.name] 36 | 37 | def count_vars(scope=''): 38 | v = get_vars(scope) 39 | return sum([np.prod(var.shape.as_list()) for var in v]) 40 | 41 | def gaussian_likelihood(x, mu, log_std): 42 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 43 | return tf.reduce_sum(pre_sum, axis=1) 44 | 45 | def discount_cumsum(x, discount): 46 | """ 47 | magic from rllab for computing discounted cumulative sums of vectors. 48 | 49 | input: 50 | vector x, 51 | [x0, 52 | x1, 53 | x2] 54 | 55 | output: 56 | [x0 + discount * x1 + discount^2 * x2, 57 | x1 + discount * x2, 58 | x2] 59 | """ 60 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 61 | 62 | 63 | """ 64 | Policies 65 | """ 66 | 67 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 68 | act_dim = action_space.n 69 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 70 | logp_all = tf.nn.log_softmax(logits) 71 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 72 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 73 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 74 | return pi, logp, logp_pi 75 | 76 | 77 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 78 | act_dim = a.shape.as_list()[-1] 79 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 80 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 81 | std = tf.exp(log_std) 82 | pi = mu + tf.random_normal(tf.shape(mu)) * std 83 | logp = gaussian_likelihood(a, mu, log_std) 84 | logp_pi = gaussian_likelihood(pi, mu, log_std) 85 | return pi, logp, logp_pi 86 | 87 | 88 | """ 89 | Actor-Critics 90 | """ 91 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 92 | output_activation=None, policy=None, action_space=None): 93 | 94 | # default policy builder depends on action space 95 | if policy is None and isinstance(action_space, Box): 96 | policy = mlp_gaussian_policy 97 | elif policy is None and isinstance(action_space, Discrete): 98 | policy = mlp_categorical_policy 99 | 100 | with tf.variable_scope('pi'): 101 | pi, logp, logp_pi = policy(x, a, hidden_sizes, activation, output_activation, action_space) 102 | with tf.variable_scope('v'): 103 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 104 | return pi, logp, logp_pi, v 105 | 
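# What the lfilter call in discount_cumsum above computes (illustrative
# reference implementation, not used by the algorithm): a right-to-left running
# sum in which each additional step of lookahead is discounted once more.
def _discount_cumsum_naive(x, discount):
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out
# e.g. _discount_cumsum_naive([1., 1., 1.], 0.5) gives [1.75, 1.5, 1.0],
# matching discount_cumsum(np.array([1., 1., 1.]), 0.5).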
-------------------------------------------------------------------------------- /spinup/algos/tf1/sac/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/sac/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/sac/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | def placeholder(dim=None): 7 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 8 | 9 | def placeholders(*args): 10 | return [placeholder(dim) for dim in args] 11 | 12 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 13 | for h in hidden_sizes[:-1]: 14 | x = tf.layers.dense(x, units=h, activation=activation) 15 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 16 | 17 | def get_vars(scope): 18 | return [x for x in tf.global_variables() if scope in x.name] 19 | 20 | def count_vars(scope): 21 | v = get_vars(scope) 22 | return sum([np.prod(var.shape.as_list()) for var in v]) 23 | 24 | def gaussian_likelihood(x, mu, log_std): 25 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 26 | return tf.reduce_sum(pre_sum, axis=1) 27 | 28 | 29 | """ 30 | Policies 31 | """ 32 | 33 | LOG_STD_MAX = 2 34 | LOG_STD_MIN = -20 35 | 36 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 37 | act_dim = a.shape.as_list()[-1] 38 | net = mlp(x, list(hidden_sizes), activation, activation) 39 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 40 | log_std = tf.layers.dense(net, act_dim, activation=None) 41 | log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX) 42 | 43 | std = tf.exp(log_std) 44 | pi = mu + tf.random_normal(tf.shape(mu)) * std 45 | logp_pi = gaussian_likelihood(pi, mu, log_std) 46 | return mu, pi, logp_pi 47 | 48 | def apply_squashing_func(mu, pi, logp_pi): 49 | # Adjustment to log prob 50 | # NOTE: This formula is a little bit magic. To get an understanding of where it 51 | # comes from, check out the original SAC paper (arXiv 1801.01290) and look in 52 | # appendix C. This is a more numerically-stable equivalent to Eq 21. 53 | # Try deriving it yourself as a (very difficult) exercise. :) 54 | logp_pi -= tf.reduce_sum(2*(np.log(2) - pi - tf.nn.softplus(-2*pi)), axis=1) 55 | 56 | # Squash those unbounded actions! 
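    # (tanh maps the unbounded Gaussian sample into (-1, 1); rescaling to the
    # environment's actual bounds happens in mlp_actor_critic below via
    # action_space.high[0].)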
57 | mu = tf.tanh(mu) 58 | pi = tf.tanh(pi) 59 | return mu, pi, logp_pi 60 | 61 | """ 62 | Actor-Critics 63 | """ 64 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 65 | output_activation=None, policy=mlp_gaussian_policy, action_space=None): 66 | # policy 67 | with tf.variable_scope('pi'): 68 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 69 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 70 | 71 | # make sure actions are in correct range 72 | action_scale = action_space.high[0] 73 | mu *= action_scale 74 | pi *= action_scale 75 | 76 | # vfs 77 | vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 78 | with tf.variable_scope('q1'): 79 | q1 = vf_mlp(tf.concat([x,a], axis=-1)) 80 | with tf.variable_scope('q2'): 81 | q2 = vf_mlp(tf.concat([x,a], axis=-1)) 82 | return mu, pi, logp_pi, q1, q2 83 | -------------------------------------------------------------------------------- /spinup/algos/tf1/td3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/td3/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/td3/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 7 | 8 | def placeholders(*args): 9 | return [placeholder(dim) for dim in args] 10 | 11 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 12 | for h in hidden_sizes[:-1]: 13 | x = tf.layers.dense(x, units=h, activation=activation) 14 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 15 | 16 | def get_vars(scope): 17 | return [x for x in tf.global_variables() if scope in x.name] 18 | 19 | def count_vars(scope): 20 | v = get_vars(scope) 21 | return sum([np.prod(var.shape.as_list()) for var in v]) 22 | 23 | """ 24 | Actor-Critics 25 | """ 26 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 27 | output_activation=tf.tanh, action_space=None): 28 | act_dim = a.shape.as_list()[-1] 29 | act_limit = action_space.high[0] 30 | with tf.variable_scope('pi'): 31 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 32 | with tf.variable_scope('q1'): 33 | q1 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 34 | with tf.variable_scope('q2'): 35 | q2 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 36 | with tf.variable_scope('q1', reuse=True): 37 | q1_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 38 | return pi, q1, q2, q1_pi 39 | -------------------------------------------------------------------------------- /spinup/algos/tf1/trpo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/trpo/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/trpo/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import 
tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 | EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def keys_as_sorted_list(dict): 14 | return sorted(list(dict.keys())) 15 | 16 | def values_as_sorted_list(dict): 17 | return [dict[k] for k in keys_as_sorted_list(dict)] 18 | 19 | def placeholder(dim=None): 20 | return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 21 | 22 | def placeholders(*args): 23 | return [placeholder(dim) for dim in args] 24 | 25 | def placeholder_from_space(space): 26 | if isinstance(space, Box): 27 | return placeholder(space.shape) 28 | elif isinstance(space, Discrete): 29 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 30 | raise NotImplementedError 31 | 32 | def placeholders_from_spaces(*args): 33 | return [placeholder_from_space(space) for space in args] 34 | 35 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 36 | for h in hidden_sizes[:-1]: 37 | x = tf.layers.dense(x, units=h, activation=activation) 38 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 39 | 40 | def get_vars(scope=''): 41 | return [x for x in tf.trainable_variables() if scope in x.name] 42 | 43 | def count_vars(scope=''): 44 | v = get_vars(scope) 45 | return sum([np.prod(var.shape.as_list()) for var in v]) 46 | 47 | def gaussian_likelihood(x, mu, log_std): 48 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 49 | return tf.reduce_sum(pre_sum, axis=1) 50 | 51 | def diagonal_gaussian_kl(mu0, log_std0, mu1, log_std1): 52 | """ 53 | tf symbol for mean KL divergence between two batches of diagonal gaussian distributions, 54 | where distributions are specified by means and log stds. 55 | (https://en.wikipedia.org/wiki/Kullback-Leibler_divergence#Multivariate_normal_distributions) 56 | """ 57 | var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1) 58 | pre_sum = 0.5*(((mu1- mu0)**2 + var0)/(var1 + EPS) - 1) + log_std1 - log_std0 59 | all_kls = tf.reduce_sum(pre_sum, axis=1) 60 | return tf.reduce_mean(all_kls) 61 | 62 | def categorical_kl(logp0, logp1): 63 | """ 64 | tf symbol for mean KL divergence between two batches of categorical probability distributions, 65 | where the distributions are input as log probs. 66 | """ 67 | all_kls = tf.reduce_sum(tf.exp(logp1) * (logp1 - logp0), axis=1) 68 | return tf.reduce_mean(all_kls) 69 | 70 | def flat_concat(xs): 71 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 72 | 73 | def flat_grad(f, params): 74 | return flat_concat(tf.gradients(xs=params, ys=f)) 75 | 76 | def hessian_vector_product(f, params): 77 | # for H = grad**2 f, compute Hx 78 | g = flat_grad(f, params) 79 | x = tf.placeholder(tf.float32, shape=g.shape) 80 | return x, flat_grad(tf.reduce_sum(g*x), params) 81 | 82 | def assign_params_from_flat(x, params): 83 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 84 | splits = tf.split(x, [flat_size(p) for p in params]) 85 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 86 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 87 | 88 | def discount_cumsum(x, discount): 89 | """ 90 | magic from rllab for computing discounted cumulative sums of vectors. 
91 | 92 | input: 93 | vector x, 94 | [x0, 95 | x1, 96 | x2] 97 | 98 | output: 99 | [x0 + discount * x1 + discount^2 * x2, 100 | x1 + discount * x2, 101 | x2] 102 | """ 103 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 104 | 105 | """ 106 | Policies 107 | """ 108 | 109 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 110 | act_dim = action_space.n 111 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 112 | logp_all = tf.nn.log_softmax(logits) 113 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 114 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 115 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 116 | 117 | old_logp_all = placeholder(act_dim) 118 | d_kl = categorical_kl(logp_all, old_logp_all) 119 | 120 | info = {'logp_all': logp_all} 121 | info_phs = {'logp_all': old_logp_all} 122 | 123 | return pi, logp, logp_pi, info, info_phs, d_kl 124 | 125 | 126 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 127 | act_dim = a.shape.as_list()[-1] 128 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 129 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 130 | std = tf.exp(log_std) 131 | pi = mu + tf.random_normal(tf.shape(mu)) * std 132 | logp = gaussian_likelihood(a, mu, log_std) 133 | logp_pi = gaussian_likelihood(pi, mu, log_std) 134 | 135 | old_mu_ph, old_log_std_ph = placeholders(act_dim, act_dim) 136 | d_kl = diagonal_gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) 137 | 138 | info = {'mu': mu, 'log_std': log_std} 139 | info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph} 140 | 141 | return pi, logp, logp_pi, info, info_phs, d_kl 142 | 143 | 144 | """ 145 | Actor-Critics 146 | """ 147 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 148 | output_activation=None, policy=None, action_space=None): 149 | 150 | # default policy builder depends on action space 151 | if policy is None and isinstance(action_space, Box): 152 | policy = mlp_gaussian_policy 153 | elif policy is None and isinstance(action_space, Discrete): 154 | policy = mlp_categorical_policy 155 | 156 | with tf.variable_scope('pi'): 157 | policy_outs = policy(x, a, hidden_sizes, activation, output_activation, action_space) 158 | pi, logp, logp_pi, info, info_phs, d_kl = policy_outs 159 | with tf.variable_scope('v'): 160 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 161 | return pi, logp, logp_pi, info, info_phs, d_kl, v 162 | -------------------------------------------------------------------------------- /spinup/algos/tf1/vpg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/vpg/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/vpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 | EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def placeholder(dim=None): 14 | return tf.placeholder(dtype=tf.float32, 
shape=combined_shape(None,dim)) 15 | 16 | def placeholders(*args): 17 | return [placeholder(dim) for dim in args] 18 | 19 | def placeholder_from_space(space): 20 | if isinstance(space, Box): 21 | return placeholder(space.shape) 22 | elif isinstance(space, Discrete): 23 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 24 | raise NotImplementedError 25 | 26 | def placeholders_from_spaces(*args): 27 | return [placeholder_from_space(space) for space in args] 28 | 29 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 30 | for h in hidden_sizes[:-1]: 31 | x = tf.layers.dense(x, units=h, activation=activation) 32 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 33 | 34 | def get_vars(scope=''): 35 | return [x for x in tf.trainable_variables() if scope in x.name] 36 | 37 | def count_vars(scope=''): 38 | v = get_vars(scope) 39 | return sum([np.prod(var.shape.as_list()) for var in v]) 40 | 41 | def gaussian_likelihood(x, mu, log_std): 42 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 43 | return tf.reduce_sum(pre_sum, axis=1) 44 | 45 | def discount_cumsum(x, discount): 46 | """ 47 | magic from rllab for computing discounted cumulative sums of vectors. 48 | 49 | input: 50 | vector x, 51 | [x0, 52 | x1, 53 | x2] 54 | 55 | output: 56 | [x0 + discount * x1 + discount^2 * x2, 57 | x1 + discount * x2, 58 | x2] 59 | """ 60 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 61 | 62 | 63 | """ 64 | Policies 65 | """ 66 | 67 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 68 | act_dim = action_space.n 69 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 70 | logp_all = tf.nn.log_softmax(logits) 71 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 72 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 73 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 74 | return pi, logp, logp_pi 75 | 76 | 77 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 78 | act_dim = a.shape.as_list()[-1] 79 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 80 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 81 | std = tf.exp(log_std) 82 | pi = mu + tf.random_normal(tf.shape(mu)) * std 83 | logp = gaussian_likelihood(a, mu, log_std) 84 | logp_pi = gaussian_likelihood(pi, mu, log_std) 85 | return pi, logp, logp_pi 86 | 87 | 88 | """ 89 | Actor-Critics 90 | """ 91 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 92 | output_activation=None, policy=None, action_space=None): 93 | 94 | # default policy builder depends on action space 95 | if policy is None and isinstance(action_space, Box): 96 | policy = mlp_gaussian_policy 97 | elif policy is None and isinstance(action_space, Discrete): 98 | policy = mlp_categorical_policy 99 | 100 | with tf.variable_scope('pi'): 101 | pi, logp, logp_pi = policy(x, a, hidden_sizes, activation, output_activation, action_space) 102 | with tf.variable_scope('v'): 103 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 104 | return pi, logp, logp_pi, v 105 | -------------------------------------------------------------------------------- /spinup/examples/pytorch/bench_ppo_cartpole.py: -------------------------------------------------------------------------------- 1 | from spinup.utils.run_utils import 
ExperimentGrid 2 | from spinup import ppo_pytorch 3 | import torch 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--cpu', type=int, default=4) 9 | parser.add_argument('--num_runs', type=int, default=3) 10 | args = parser.parse_args() 11 | 12 | eg = ExperimentGrid(name='ppo-pyt-bench') 13 | eg.add('env_name', 'CartPole-v0', '', True) 14 | eg.add('seed', [10*i for i in range(args.num_runs)]) 15 | eg.add('epochs', 10) 16 | eg.add('steps_per_epoch', 4000) 17 | eg.add('ac_kwargs:hidden_sizes', [(32,), (64,64)], 'hid') 18 | eg.add('ac_kwargs:activation', [torch.nn.Tanh, torch.nn.ReLU], '') 19 | eg.run(ppo_pytorch, num_cpu=args.cpu) -------------------------------------------------------------------------------- /spinup/examples/pytorch/pg_math/1_simple_pg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions.categorical import Categorical 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | from gym.spaces import Discrete, Box 8 | 9 | def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity): 10 | # Build a feedforward neural network. 11 | layers = [] 12 | for j in range(len(sizes)-1): 13 | act = activation if j < len(sizes)-2 else output_activation 14 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 15 | return nn.Sequential(*layers) 16 | 17 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 18 | epochs=50, batch_size=5000, render=False): 19 | 20 | # make environment, check spaces, get obs / act dims 21 | env = gym.make(env_name) 22 | assert isinstance(env.observation_space, Box), \ 23 | "This example only works for envs with continuous state spaces." 24 | assert isinstance(env.action_space, Discrete), \ 25 | "This example only works for envs with discrete action spaces." 26 | 27 | obs_dim = env.observation_space.shape[0] 28 | n_acts = env.action_space.n 29 | 30 | # make core of policy network 31 | logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts]) 32 | 33 | # make function to compute action distribution 34 | def get_policy(obs): 35 | logits = logits_net(obs) 36 | return Categorical(logits=logits) 37 | 38 | # make action selection function (outputs int actions, sampled from policy) 39 | def get_action(obs): 40 | return get_policy(obs).sample().item() 41 | 42 | # make loss function whose gradient, for the right data, is policy gradient 43 | def compute_loss(obs, act, weights): 44 | logp = get_policy(obs).log_prob(act) 45 | return -(logp * weights).mean() 46 | 47 | # make optimizer 48 | optimizer = Adam(logits_net.parameters(), lr=lr) 49 | 50 | # for training policy 51 | def train_one_epoch(): 52 | # make some empty lists for logging. 
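        # (Each list below grows by one entry per environment step; at the end
        # of the epoch they are converted to tensors and fed to compute_loss
        # above, whose gradient is the basic policy-gradient estimate: the
        # average over timesteps of R(tau) * grad log pi(a_t | s_t).)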
53 | batch_obs = [] # for observations 54 | batch_acts = [] # for actions 55 | batch_weights = [] # for R(tau) weighting in policy gradient 56 | batch_rets = [] # for measuring episode returns 57 | batch_lens = [] # for measuring episode lengths 58 | 59 | # reset episode-specific variables 60 | obs = env.reset() # first obs comes from starting distribution 61 | done = False # signal from environment that episode is over 62 | ep_rews = [] # list for rewards accrued throughout ep 63 | 64 | # render first episode of each epoch 65 | finished_rendering_this_epoch = False 66 | 67 | # collect experience by acting in the environment with current policy 68 | while True: 69 | 70 | # rendering 71 | if (not finished_rendering_this_epoch) and render: 72 | env.render() 73 | 74 | # save obs 75 | batch_obs.append(obs.copy()) 76 | 77 | # act in the environment 78 | act = get_action(torch.as_tensor(obs, dtype=torch.float32)) 79 | obs, rew, done, _ = env.step(act) 80 | 81 | # save action, reward 82 | batch_acts.append(act) 83 | ep_rews.append(rew) 84 | 85 | if done: 86 | # if episode is over, record info about episode 87 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 88 | batch_rets.append(ep_ret) 89 | batch_lens.append(ep_len) 90 | 91 | # the weight for each logprob(a|s) is R(tau) 92 | batch_weights += [ep_ret] * ep_len 93 | 94 | # reset episode-specific variables 95 | obs, done, ep_rews = env.reset(), False, [] 96 | 97 | # won't render again this epoch 98 | finished_rendering_this_epoch = True 99 | 100 | # end experience loop if we have enough of it 101 | if len(batch_obs) > batch_size: 102 | break 103 | 104 | # take a single policy gradient update step 105 | optimizer.zero_grad() 106 | batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32), 107 | act=torch.as_tensor(batch_acts, dtype=torch.int32), 108 | weights=torch.as_tensor(batch_weights, dtype=torch.float32) 109 | ) 110 | batch_loss.backward() 111 | optimizer.step() 112 | return batch_loss, batch_rets, batch_lens 113 | 114 | # training loop 115 | for i in range(epochs): 116 | batch_loss, batch_rets, batch_lens = train_one_epoch() 117 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 118 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 119 | 120 | if __name__ == '__main__': 121 | import argparse 122 | parser = argparse.ArgumentParser() 123 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 124 | parser.add_argument('--render', action='store_true') 125 | parser.add_argument('--lr', type=float, default=1e-2) 126 | args = parser.parse_args() 127 | print('\nUsing simplest formulation of policy gradient.\n') 128 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinup/examples/pytorch/pg_math/2_rtg_pg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions.categorical import Categorical 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | from gym.spaces import Discrete, Box 8 | 9 | def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity): 10 | # Build a feedforward neural network. 
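    # (e.g. mlp([4, 32, 2]) yields Linear(4, 32) -> Tanh -> Linear(32, 2) ->
    # Identity: hidden layers use `activation`, the final layer uses
    # `output_activation`.)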
11 | layers = [] 12 | for j in range(len(sizes)-1): 13 | act = activation if j < len(sizes)-2 else output_activation 14 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 15 | return nn.Sequential(*layers) 16 | 17 | def reward_to_go(rews): 18 | n = len(rews) 19 | rtgs = np.zeros_like(rews) 20 | for i in reversed(range(n)): 21 | rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) 22 | return rtgs 23 | 24 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 25 | epochs=50, batch_size=5000, render=False): 26 | 27 | # make environment, check spaces, get obs / act dims 28 | env = gym.make(env_name) 29 | assert isinstance(env.observation_space, Box), \ 30 | "This example only works for envs with continuous state spaces." 31 | assert isinstance(env.action_space, Discrete), \ 32 | "This example only works for envs with discrete action spaces." 33 | 34 | obs_dim = env.observation_space.shape[0] 35 | n_acts = env.action_space.n 36 | 37 | # make core of policy network 38 | logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts]) 39 | 40 | # make function to compute action distribution 41 | def get_policy(obs): 42 | logits = logits_net(obs) 43 | return Categorical(logits=logits) 44 | 45 | # make action selection function (outputs int actions, sampled from policy) 46 | def get_action(obs): 47 | return get_policy(obs).sample().item() 48 | 49 | # make loss function whose gradient, for the right data, is policy gradient 50 | def compute_loss(obs, act, weights): 51 | logp = get_policy(obs).log_prob(act) 52 | return -(logp * weights).mean() 53 | 54 | # make optimizer 55 | optimizer = Adam(logits_net.parameters(), lr=lr) 56 | 57 | # for training policy 58 | def train_one_epoch(): 59 | # make some empty lists for logging. 60 | batch_obs = [] # for observations 61 | batch_acts = [] # for actions 62 | batch_weights = [] # for reward-to-go weighting in policy gradient 63 | batch_rets = [] # for measuring episode returns 64 | batch_lens = [] # for measuring episode lengths 65 | 66 | # reset episode-specific variables 67 | obs = env.reset() # first obs comes from starting distribution 68 | done = False # signal from environment that episode is over 69 | ep_rews = [] # list for rewards accrued throughout ep 70 | 71 | # render first episode of each epoch 72 | finished_rendering_this_epoch = False 73 | 74 | # collect experience by acting in the environment with current policy 75 | while True: 76 | 77 | # rendering 78 | if (not finished_rendering_this_epoch) and render: 79 | env.render() 80 | 81 | # save obs 82 | batch_obs.append(obs.copy()) 83 | 84 | # act in the environment 85 | act = get_action(torch.as_tensor(obs, dtype=torch.float32)) 86 | obs, rew, done, _ = env.step(act) 87 | 88 | # save action, reward 89 | batch_acts.append(act) 90 | ep_rews.append(rew) 91 | 92 | if done: 93 | # if episode is over, record info about episode 94 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 95 | batch_rets.append(ep_ret) 96 | batch_lens.append(ep_len) 97 | 98 | # the weight for each logprob(a_t|s_t) is reward-to-go from t 99 | batch_weights += list(reward_to_go(ep_rews)) 100 | 101 | # reset episode-specific variables 102 | obs, done, ep_rews = env.reset(), False, [] 103 | 104 | # won't render again this epoch 105 | finished_rendering_this_epoch = True 106 | 107 | # end experience loop if we have enough of it 108 | if len(batch_obs) > batch_size: 109 | break 110 | 111 | # take a single policy gradient update step 112 | optimizer.zero_grad() 113 | batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, 
dtype=torch.float32), 114 | act=torch.as_tensor(batch_acts, dtype=torch.int32), 115 | weights=torch.as_tensor(batch_weights, dtype=torch.float32) 116 | ) 117 | batch_loss.backward() 118 | optimizer.step() 119 | return batch_loss, batch_rets, batch_lens 120 | 121 | # training loop 122 | for i in range(epochs): 123 | batch_loss, batch_rets, batch_lens = train_one_epoch() 124 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 125 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 126 | 127 | if __name__ == '__main__': 128 | import argparse 129 | parser = argparse.ArgumentParser() 130 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 131 | parser.add_argument('--render', action='store_true') 132 | parser.add_argument('--lr', type=float, default=1e-2) 133 | args = parser.parse_args() 134 | print('\nUsing reward-to-go formulation of policy gradient.\n') 135 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinup/examples/tf1/bench_ppo_cartpole.py: -------------------------------------------------------------------------------- 1 | from spinup.utils.run_utils import ExperimentGrid 2 | from spinup import ppo_tf1 3 | import tensorflow as tf 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--cpu', type=int, default=4) 9 | parser.add_argument('--num_runs', type=int, default=3) 10 | args = parser.parse_args() 11 | 12 | eg = ExperimentGrid(name='ppo-tf1-bench') 13 | eg.add('env_name', 'CartPole-v0', '', True) 14 | eg.add('seed', [10*i for i in range(args.num_runs)]) 15 | eg.add('epochs', 10) 16 | eg.add('steps_per_epoch', 4000) 17 | eg.add('ac_kwargs:hidden_sizes', [(32,), (64,64)], 'hid') 18 | eg.add('ac_kwargs:activation', [tf.tanh, tf.nn.relu], '') 19 | eg.run(ppo_tf1, num_cpu=args.cpu) -------------------------------------------------------------------------------- /spinup/examples/tf1/pg_math/1_simple_pg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | from gym.spaces import Discrete, Box 5 | 6 | def mlp(x, sizes, activation=tf.tanh, output_activation=None): 7 | # Build a feedforward neural network. 8 | for size in sizes[:-1]: 9 | x = tf.layers.dense(x, units=size, activation=activation) 10 | return tf.layers.dense(x, units=sizes[-1], activation=output_activation) 11 | 12 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 13 | epochs=50, batch_size=5000, render=False): 14 | 15 | # make environment, check spaces, get obs / act dims 16 | env = gym.make(env_name) 17 | assert isinstance(env.observation_space, Box), \ 18 | "This example only works for envs with continuous state spaces." 19 | assert isinstance(env.action_space, Discrete), \ 20 | "This example only works for envs with discrete action spaces." 
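    # (For the default CartPole-v0, observation_space is a Box of shape (4,)
    # and action_space is Discrete(2), so obs_dim=4 and n_acts=2 below.)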
21 | 22 | obs_dim = env.observation_space.shape[0] 23 | n_acts = env.action_space.n 24 | 25 | # make core of policy network 26 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32) 27 | logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts]) 28 | 29 | # make action selection op (outputs int actions, sampled from policy) 30 | actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1) 31 | 32 | # make loss function whose gradient, for the right data, is policy gradient 33 | weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32) 34 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32) 35 | action_masks = tf.one_hot(act_ph, n_acts) 36 | log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1) 37 | loss = -tf.reduce_mean(weights_ph * log_probs) 38 | 39 | # make train op 40 | train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) 41 | 42 | sess = tf.InteractiveSession() 43 | sess.run(tf.global_variables_initializer()) 44 | 45 | # for training policy 46 | def train_one_epoch(): 47 | # make some empty lists for logging. 48 | batch_obs = [] # for observations 49 | batch_acts = [] # for actions 50 | batch_weights = [] # for R(tau) weighting in policy gradient 51 | batch_rets = [] # for measuring episode returns 52 | batch_lens = [] # for measuring episode lengths 53 | 54 | # reset episode-specific variables 55 | obs = env.reset() # first obs comes from starting distribution 56 | done = False # signal from environment that episode is over 57 | ep_rews = [] # list for rewards accrued throughout ep 58 | 59 | # render first episode of each epoch 60 | finished_rendering_this_epoch = False 61 | 62 | # collect experience by acting in the environment with current policy 63 | while True: 64 | 65 | # rendering 66 | if (not finished_rendering_this_epoch) and render: 67 | env.render() 68 | 69 | # save obs 70 | batch_obs.append(obs.copy()) 71 | 72 | # act in the environment 73 | act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0] 74 | obs, rew, done, _ = env.step(act) 75 | 76 | # save action, reward 77 | batch_acts.append(act) 78 | ep_rews.append(rew) 79 | 80 | if done: 81 | # if episode is over, record info about episode 82 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 83 | batch_rets.append(ep_ret) 84 | batch_lens.append(ep_len) 85 | 86 | # the weight for each logprob(a|s) is R(tau) 87 | batch_weights += [ep_ret] * ep_len 88 | 89 | # reset episode-specific variables 90 | obs, done, ep_rews = env.reset(), False, [] 91 | 92 | # won't render again this epoch 93 | finished_rendering_this_epoch = True 94 | 95 | # end experience loop if we have enough of it 96 | if len(batch_obs) > batch_size: 97 | break 98 | 99 | # take a single policy gradient update step 100 | batch_loss, _ = sess.run([loss, train_op], 101 | feed_dict={ 102 | obs_ph: np.array(batch_obs), 103 | act_ph: np.array(batch_acts), 104 | weights_ph: np.array(batch_weights) 105 | }) 106 | return batch_loss, batch_rets, batch_lens 107 | 108 | # training loop 109 | for i in range(epochs): 110 | batch_loss, batch_rets, batch_lens = train_one_epoch() 111 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 112 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 113 | 114 | if __name__ == '__main__': 115 | import argparse 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 118 | parser.add_argument('--render', action='store_true') 119 | parser.add_argument('--lr', type=float, default=1e-2) 120 | args = 
parser.parse_args() 121 | print('\nUsing simplest formulation of policy gradient.\n') 122 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinup/examples/tf1/pg_math/2_rtg_pg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | from gym.spaces import Discrete, Box 5 | 6 | def mlp(x, sizes, activation=tf.tanh, output_activation=None): 7 | # Build a feedforward neural network. 8 | for size in sizes[:-1]: 9 | x = tf.layers.dense(x, units=size, activation=activation) 10 | return tf.layers.dense(x, units=sizes[-1], activation=output_activation) 11 | 12 | def reward_to_go(rews): 13 | n = len(rews) 14 | rtgs = np.zeros_like(rews) 15 | for i in reversed(range(n)): 16 | rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) 17 | return rtgs 18 | 19 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 20 | epochs=50, batch_size=5000, render=False): 21 | 22 | # make environment, check spaces, get obs / act dims 23 | env = gym.make(env_name) 24 | assert isinstance(env.observation_space, Box), \ 25 | "This example only works for envs with continuous state spaces." 26 | assert isinstance(env.action_space, Discrete), \ 27 | "This example only works for envs with discrete action spaces." 28 | 29 | obs_dim = env.observation_space.shape[0] 30 | n_acts = env.action_space.n 31 | 32 | # make core of policy network 33 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32) 34 | logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts]) 35 | 36 | # make action selection op (outputs int actions, sampled from policy) 37 | actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1) 38 | 39 | # make loss function whose gradient, for the right data, is policy gradient 40 | weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32) 41 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32) 42 | action_masks = tf.one_hot(act_ph, n_acts) 43 | log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1) 44 | loss = -tf.reduce_mean(weights_ph * log_probs) 45 | 46 | # make train op 47 | train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) 48 | 49 | sess = tf.InteractiveSession() 50 | sess.run(tf.global_variables_initializer()) 51 | 52 | # for training policy 53 | def train_one_epoch(): 54 | # make some empty lists for logging. 
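        # (Worked example of the weighting used below: reward_to_go above maps
        # rewards [1., 1., 1.] to [3., 2., 1.], so each logprob(a_t|s_t) is
        # weighted only by rewards earned from step t onward -- a lower-variance
        # alternative to weighting every step by the full return R(tau).)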
55 | batch_obs = [] # for observations 56 | batch_acts = [] # for actions 57 | batch_weights = [] # for reward-to-go weighting in policy gradient 58 | batch_rets = [] # for measuring episode returns 59 | batch_lens = [] # for measuring episode lengths 60 | 61 | # reset episode-specific variables 62 | obs = env.reset() # first obs comes from starting distribution 63 | done = False # signal from environment that episode is over 64 | ep_rews = [] # list for rewards accrued throughout ep 65 | 66 | # render first episode of each epoch 67 | finished_rendering_this_epoch = False 68 | 69 | # collect experience by acting in the environment with current policy 70 | while True: 71 | 72 | # rendering 73 | if (not finished_rendering_this_epoch) and render: 74 | env.render() 75 | 76 | # save obs 77 | batch_obs.append(obs.copy()) 78 | 79 | # act in the environment 80 | act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0] 81 | obs, rew, done, _ = env.step(act) 82 | 83 | # save action, reward 84 | batch_acts.append(act) 85 | ep_rews.append(rew) 86 | 87 | if done: 88 | # if episode is over, record info about episode 89 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 90 | batch_rets.append(ep_ret) 91 | batch_lens.append(ep_len) 92 | 93 | # the weight for each logprob(a_t|s_t) is reward-to-go from t 94 | batch_weights += list(reward_to_go(ep_rews)) 95 | 96 | # reset episode-specific variables 97 | obs, done, ep_rews = env.reset(), False, [] 98 | 99 | # won't render again this epoch 100 | finished_rendering_this_epoch = True 101 | 102 | # end experience loop if we have enough of it 103 | if len(batch_obs) > batch_size: 104 | break 105 | 106 | # take a single policy gradient update step 107 | batch_loss, _ = sess.run([loss, train_op], 108 | feed_dict={ 109 | obs_ph: np.array(batch_obs), 110 | act_ph: np.array(batch_acts), 111 | weights_ph: np.array(batch_weights) 112 | }) 113 | return batch_loss, batch_rets, batch_lens 114 | 115 | # training loop 116 | for i in range(epochs): 117 | batch_loss, batch_rets, batch_lens = train_one_epoch() 118 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 119 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 120 | 121 | if __name__ == '__main__': 122 | import argparse 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 125 | parser.add_argument('--render', action='store_true') 126 | parser.add_argument('--lr', type=float, default=1e-2) 127 | args = parser.parse_args() 128 | print('\nUsing reward-to-go formulation of policy gradient.\n') 129 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinup/examples/tf1/train_mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | from spinup.utils.logx import EpochLogger 5 | 6 | 7 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 8 | for h in hidden_sizes[:-1]: 9 | x = tf.layers.dense(x, units=h, activation=activation) 10 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 11 | 12 | 13 | # Simple script for training an MLP on MNIST. 
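# (It also doubles as a compact demo of the EpochLogger workflow used throughout
# the RL algorithms: logger.store() accumulates per-step diagnostics,
# log_tabular()/dump_tabular() write one row of results per epoch, and
# setup_tf_saver()/save_state() handle model saving.)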
14 | def train_mnist(steps_per_epoch=100, epochs=5, 15 | lr=1e-3, layers=2, hidden_size=64, 16 | logger_kwargs=dict(), save_freq=1): 17 | 18 | logger = EpochLogger(**logger_kwargs) 19 | logger.save_config(locals()) 20 | 21 | # Load and preprocess MNIST data 22 | (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() 23 | x_train = x_train.reshape(-1, 28*28) / 255.0 24 | 25 | # Define inputs & main outputs from computation graph 26 | x_ph = tf.placeholder(tf.float32, shape=(None, 28*28)) 27 | y_ph = tf.placeholder(tf.int32, shape=(None,)) 28 | logits = mlp(x_ph, hidden_sizes=[hidden_size]*layers + [10], activation=tf.nn.relu) 29 | predict = tf.argmax(logits, axis=1, output_type=tf.int32) 30 | 31 | # Define loss function, accuracy, and training op 32 | y = tf.one_hot(y_ph, 10) 33 | loss = tf.losses.softmax_cross_entropy(y, logits) 34 | acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32)) 35 | train_op = tf.train.AdamOptimizer().minimize(loss) 36 | 37 | # Prepare session 38 | sess = tf.Session() 39 | sess.run(tf.global_variables_initializer()) 40 | 41 | # Setup model saving 42 | logger.setup_tf_saver(sess, inputs={'x': x_ph}, 43 | outputs={'logits': logits, 'predict': predict}) 44 | 45 | start_time = time.time() 46 | 47 | # Run main training loop 48 | for epoch in range(epochs): 49 | for t in range(steps_per_epoch): 50 | idxs = np.random.randint(0, len(x_train), 32) 51 | feed_dict = {x_ph: x_train[idxs], 52 | y_ph: y_train[idxs]} 53 | outs = sess.run([loss, acc, train_op], feed_dict=feed_dict) 54 | logger.store(Loss=outs[0], Acc=outs[1]) 55 | 56 | # Save model 57 | if (epoch % save_freq == 0) or (epoch == epochs-1): 58 | logger.save_state(state_dict=dict(), itr=None) 59 | 60 | # Log info about epoch 61 | logger.log_tabular('Epoch', epoch) 62 | logger.log_tabular('Acc', with_min_and_max=True) 63 | logger.log_tabular('Loss', average_only=True) 64 | logger.log_tabular('TotalGradientSteps', (epoch+1)*steps_per_epoch) 65 | logger.log_tabular('Time', time.time()-start_time) 66 | logger.dump_tabular() 67 | 68 | if __name__ == '__main__': 69 | train_mnist() -------------------------------------------------------------------------------- /spinup/exercises/common.py: -------------------------------------------------------------------------------- 1 | def print_result(correct=False): 2 | print('\n'*5 + '='*50 + '\n'*3) 3 | if correct: 4 | print("Congratulations! Your answer is correct.") 5 | else: 6 | print("Your answer appears to be incorrect. Try again!") 7 | print('\n'*3 + '='*50) -------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_1/exercise1_1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | """ 5 | 6 | Exercise 1.1: Diagonal Gaussian Likelihood 7 | 8 | Write a function that takes in PyTorch Tensors for the means and 9 | log stds of a batch of diagonal Gaussian distributions, along with a 10 | PyTorch Tensor for (previously-generated) samples from those 11 | distributions, and returns a Tensor containing the log 12 | likelihoods of those samples. 
13 | 14 | """ 15 | 16 | def gaussian_likelihood(x, mu, log_std): 17 | """ 18 | Args: 19 | x: Tensor with shape [batch, dim] 20 | mu: Tensor with shape [batch, dim] 21 | log_std: Tensor with shape [batch, dim] or [dim] 22 | 23 | Returns: 24 | Tensor with shape [batch] 25 | """ 26 | ####################### 27 | # # 28 | # YOUR CODE HERE # 29 | # # 30 | ####################### 31 | return torch.zeros(1) 32 | 33 | 34 | if __name__ == '__main__': 35 | """ 36 | Run this file to verify your solution. 37 | """ 38 | from spinup.exercises.pytorch.problem_set_1_solutions import exercise1_1_soln 39 | from spinup.exercises.common import print_result 40 | 41 | batch_size = 32 42 | dim = 10 43 | 44 | x = torch.rand(batch_size, dim) 45 | mu = torch.rand(batch_size, dim) 46 | log_std = torch.rand(dim) 47 | 48 | your_gaussian_likelihood = gaussian_likelihood(x, mu, log_std) 49 | true_gaussian_likelihood = exercise1_1_soln.gaussian_likelihood(x, mu, log_std) 50 | 51 | your_result = your_gaussian_likelihood.detach().numpy() 52 | true_result = true_gaussian_likelihood.detach().numpy() 53 | 54 | correct = np.allclose(your_result, true_result) 55 | print_result(correct) -------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_1/exercise1_2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from spinup.exercises.pytorch.problem_set_1 import exercise1_1 5 | from spinup.exercises.pytorch.problem_set_1 import exercise1_2_auxiliary 6 | 7 | """ 8 | 9 | Exercise 1.2: PPO Gaussian Policy 10 | 11 | You will implement an MLP diagonal Gaussian policy for PPO by 12 | writing an MLP-builder, and a few other key functions. 13 | 14 | Log-likelihoods will be computed using your answer to Exercise 1.1, 15 | so make sure to complete that exercise before beginning this one. 16 | 17 | """ 18 | 19 | def mlp(sizes, activation, output_activation=nn.Identity): 20 | """ 21 | Build a multi-layer perceptron in PyTorch. 22 | 23 | Args: 24 | sizes: Tuple, list, or other iterable giving the number of units 25 | for each layer of the MLP. 26 | 27 | activation: Activation function for all layers except last. 28 | 29 | output_activation: Activation function for last layer. 30 | 31 | Returns: 32 | A PyTorch module that can be called to give the output of the MLP. 33 | (Use an nn.Sequential module.) 34 | 35 | """ 36 | ####################### 37 | # # 38 | # YOUR CODE HERE # 39 | # # 40 | ####################### 41 | pass 42 | 43 | class DiagonalGaussianDistribution: 44 | 45 | def __init__(self, mu, log_std): 46 | self.mu = mu 47 | self.log_std = log_std 48 | 49 | def sample(self): 50 | """ 51 | Returns: 52 | A PyTorch Tensor of samples from the diagonal Gaussian distribution with 53 | mean and log_std given by self.mu and self.log_std. 
54 | """ 55 | ####################### 56 | # # 57 | # YOUR CODE HERE # 58 | # # 59 | ####################### 60 | pass 61 | 62 | #================================(Given, ignore)==========================================# 63 | def log_prob(self, value): 64 | return exercise1_1.gaussian_likelihood(value, self.mu, self.log_std) 65 | 66 | def entropy(self): 67 | return 0.5 + 0.5 * np.log(2 * np.pi) + self.log_std.sum(axis=-1) 68 | #=========================================================================================# 69 | 70 | 71 | class MLPGaussianActor(nn.Module): 72 | 73 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 74 | super().__init__() 75 | """ 76 | Initialize an MLP Gaussian Actor by making a PyTorch module for computing the 77 | mean of the distribution given a batch of observations, and a log_std parameter. 78 | 79 | Make log_std a PyTorch Parameter with the same shape as the action vector, 80 | independent of observations, initialized to [-0.5, -0.5, ..., -0.5]. 81 | (Make sure it's trainable!) 82 | """ 83 | ####################### 84 | # # 85 | # YOUR CODE HERE # 86 | # # 87 | ####################### 88 | # self.log_std = 89 | # self.mu_net = 90 | pass 91 | 92 | #================================(Given, ignore)==========================================# 93 | def forward(self, obs, act=None): 94 | mu = self.mu_net(obs) 95 | pi = DiagonalGaussianDistribution(mu, self.log_std) 96 | logp_a = None 97 | if act is not None: 98 | logp_a = pi.log_prob(act) 99 | return pi, logp_a 100 | #=========================================================================================# 101 | 102 | 103 | 104 | if __name__ == '__main__': 105 | """ 106 | Run this file to verify your solution. 107 | """ 108 | 109 | from spinup import ppo_pytorch as ppo 110 | from spinup.exercises.common import print_result 111 | from functools import partial 112 | import gym 113 | import os 114 | import pandas as pd 115 | import psutil 116 | import time 117 | 118 | logdir = "/tmp/experiments/%i"%int(time.time()) 119 | 120 | ActorCritic = partial(exercise1_2_auxiliary.ExerciseActorCritic, actor=MLPGaussianActor) 121 | 122 | ppo(env_fn = lambda : gym.make('InvertedPendulum-v2'), 123 | actor_critic=ActorCritic, 124 | ac_kwargs=dict(hidden_sizes=(64,)), 125 | steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir)) 126 | 127 | # Get scores from last five epochs to evaluate success. 128 | data = pd.read_table(os.path.join(logdir,'progress.txt')) 129 | last_scores = data['AverageEpRet'][-5:] 130 | 131 | # Your implementation is probably correct if the agent has a score >500, 132 | # or if it reaches the top possible score of 1000, in the last five epochs. 133 | correct = np.mean(last_scores) > 500 or np.max(last_scores)==1e3 134 | print_result(correct) -------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_1/exercise1_2_auxiliary.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | """ 6 | 7 | Auxiliary code for Exercise 1.2. No part of the exercise requires you to 8 | look into or modify this file (and since it contains an mlp function, 9 | it has spoilers for the answer). Removed from the main file to avoid 10 | cluttering it up. 11 | 12 | In other words, nothing to see here, move along, these are not the 13 | droids you're looking for, and all that... 
14 | 15 | """ 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | class MLPCritic(nn.Module): 26 | 27 | def __init__(self, obs_dim, hidden_sizes, activation): 28 | super().__init__() 29 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 30 | 31 | def forward(self, obs): 32 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 33 | 34 | 35 | class ExerciseActorCritic(nn.Module): 36 | 37 | def __init__(self, observation_space, action_space, 38 | hidden_sizes=(64,64), activation=nn.Tanh, 39 | actor=None): 40 | super().__init__() 41 | obs_dim = observation_space.shape[0] 42 | self.pi = actor(obs_dim, action_space.shape[0], hidden_sizes, activation) 43 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 44 | 45 | def step(self, obs): 46 | with torch.no_grad(): 47 | pi, _ = self.pi(obs) 48 | a = pi.sample() 49 | logp_a = pi.log_prob(a) 50 | v = self.v(obs) 51 | return a.numpy(), v.numpy(), logp_a.numpy() 52 | 53 | def act(self, obs): 54 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_1_solutions/exercise1_1_soln.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | EPS=1e-8 5 | 6 | def gaussian_likelihood(x, mu, log_std): 7 | pre_sum = -0.5 * (((x-mu)/(torch.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 8 | return pre_sum.sum(axis=-1) -------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_1_solutions/exercise1_2_soln.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | EPS=1e-8 6 | 7 | def mlp(sizes, activation, output_activation=nn.Identity): 8 | layers = [] 9 | for j in range(len(sizes)-1): 10 | act = activation if j < len(sizes)-2 else output_activation 11 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 12 | return nn.Sequential(*layers) 13 | 14 | def gaussian_likelihood(x, mu, log_std): 15 | pre_sum = -0.5 * (((x-mu)/(torch.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 16 | return pre_sum.sum(axis=-1) 17 | 18 | 19 | class DiagonalGaussianDistribution: 20 | 21 | def __init__(self, mu, log_std): 22 | self.mu = mu 23 | self.log_std = log_std 24 | 25 | def sample(self): 26 | return self.mu + torch.exp(self.log_std) * torch.randn_like(self.mu) 27 | 28 | def log_prob(self, value): 29 | return gaussian_likelihood(value, self.mu, self.log_std) 30 | 31 | def entropy(self): 32 | return 0.5 + 0.5 * np.log(2 * np.pi) + self.log_std.sum(axis=-1) 33 | 34 | 35 | class MLPGaussianActor(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 40 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 41 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 42 | 43 | def forward(self, obs, act=None): 44 | mu = self.mu_net(obs) 45 | pi = DiagonalGaussianDistribution(mu, self.log_std) 46 | logp_a = None 47 | if act is not None: 48 | logp_a = pi.log_prob(act) 49 | return pi, logp_a 
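# Minimal usage sketch for the solution classes above (illustrative only: the
# observation/action dimensions and the random batch are made up, not part of
# the exercise or the repo).
import torch
import torch.nn as nn

actor = MLPGaussianActor(obs_dim=8, act_dim=2, hidden_sizes=(64, 64), activation=nn.Tanh)

obs = torch.randn(32, 8)   # fake batch of observations
pi, _ = actor(obs)         # DiagonalGaussianDistribution for the batch
act = pi.sample()          # shape [32, 2]
logp = pi.log_prob(act)    # shape [32]; log-likelihood summed over action dims
print(act.shape, logp.shape)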
-------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_2/exercise2_2.py: -------------------------------------------------------------------------------- 1 | from spinup.algos.pytorch.ddpg.core import mlp, MLPActorCritic 2 | from spinup.utils.run_utils import ExperimentGrid 3 | from spinup import ddpg_pytorch as ddpg 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | 8 | """ 9 | 10 | Exercise 2.2: Silent Bug in DDPG (PyTorch Version) 11 | 12 | In this exercise, you will run DDPG with a bugged actor-critic. Your goal is 13 | to determine whether there is any performance degradation, and if so, 14 | figure out what's going wrong. 15 | 16 | You do NOT need to write code for this exercise. 17 | 18 | """ 19 | 20 | """ 21 | Bugged Actor-Critic 22 | """ 23 | 24 | class BuggedMLPActor(nn.Module): 25 | 26 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 27 | super().__init__() 28 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 29 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 30 | self.act_limit = act_limit 31 | 32 | def forward(self, obs): 33 | # Return output from network scaled to action space limits. 34 | return self.act_limit * self.pi(obs) 35 | 36 | class BuggedMLPQFunction(nn.Module): 37 | 38 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 39 | super().__init__() 40 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 41 | 42 | def forward(self, obs, act): 43 | return self.q(torch.cat([obs, act], dim=-1)) 44 | 45 | class BuggedMLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = BuggedMLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q = BuggedMLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | 59 | def act(self, obs): 60 | with torch.no_grad(): 61 | return self.pi(obs).numpy() 62 | 63 | 64 | if __name__ == '__main__': 65 | import argparse 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 68 | parser.add_argument('--h', type=int, default=300) 69 | parser.add_argument('--l', type=int, default=1) 70 | parser.add_argument('--num_runs', '-n', type=int, default=3) 71 | parser.add_argument('--steps_per_epoch', '-s', type=int, default=5000) 72 | parser.add_argument('--total_steps', '-t', type=int, default=int(5e4)) 73 | args = parser.parse_args() 74 | 75 | def ddpg_with_actor_critic(bugged, **kwargs): 76 | from spinup.exercises.pytorch.problem_set_2.exercise2_2 import BuggedMLPActorCritic 77 | actor_critic = BuggedMLPActorCritic if bugged else MLPActorCritic 78 | return ddpg(actor_critic=actor_critic, 79 | ac_kwargs=dict(hidden_sizes=[args.h]*args.l), 80 | start_steps=5000, 81 | max_ep_len=150, 82 | batch_size=64, 83 | polyak=0.95, 84 | **kwargs) 85 | 86 | eg = ExperimentGrid(name='ex2-2_ddpg') 87 | eg.add('replay_size', int(args.total_steps)) 88 | eg.add('env_name', args.env, '', True) 89 | eg.add('seed', [10*i for i in range(args.num_runs)]) 90 | eg.add('epochs', int(args.total_steps / args.steps_per_epoch)) 91 | eg.add('steps_per_epoch', args.steps_per_epoch) 92 | eg.add('bugged', [False, True]) 93 | eg.run(ddpg_with_actor_critic, datestamp=True)
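# Illustrative aside (a generic PyTorch pitfall, not a claim about where this
# exercise's bug lives): when hunting for silent issues in DDPG-style code, it
# is worth checking tensor shapes inside the loss. Mixing [batch] and [batch, 1]
# tensors broadcasts without error, so an MSE-style loss quietly averages over a
# [batch, batch] matrix of pairwise differences instead of comparing element-wise.
import torch

q = torch.randn(4)           # e.g. Q-values with shape [batch]
backup = torch.randn(4, 1)   # e.g. a Bellman target with shape [batch, 1]
diff = q - backup            # silently broadcasts to shape [4, 4]
print(diff.shape)            # torch.Size([4, 4])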
-------------------------------------------------------------------------------- /spinup/exercises/tf1/problem_set_1/exercise1_1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | """ 5 | 6 | Exercise 1.1: Diagonal Gaussian Likelihood 7 | 8 | Write a function which takes in Tensorflow symbols for the means and 9 | log stds of a batch of diagonal Gaussian distributions, along with a 10 | Tensorflow placeholder for (previously-generated) samples from those 11 | distributions, and returns a Tensorflow symbol for computing the log 12 | likelihoods of those samples. 13 | 14 | """ 15 | 16 | def gaussian_likelihood(x, mu, log_std): 17 | """ 18 | Args: 19 | x: Tensor with shape [batch, dim] 20 | mu: Tensor with shape [batch, dim] 21 | log_std: Tensor with shape [batch, dim] or [dim] 22 | 23 | Returns: 24 | Tensor with shape [batch] 25 | """ 26 | ####################### 27 | # # 28 | # YOUR CODE HERE # 29 | # # 30 | ####################### 31 | return tf.constant(0) 32 | 33 | 34 | if __name__ == '__main__': 35 | """ 36 | Run this file to verify your solution. 37 | """ 38 | from spinup.exercises.tf1.problem_set_1_solutions import exercise1_1_soln 39 | from spinup.exercises.common import print_result 40 | 41 | sess = tf.Session() 42 | 43 | dim = 10 44 | x = tf.placeholder(tf.float32, shape=(None, dim)) 45 | mu = tf.placeholder(tf.float32, shape=(None, dim)) 46 | log_std = tf.placeholder(tf.float32, shape=(dim,)) 47 | 48 | your_gaussian_likelihood = gaussian_likelihood(x, mu, log_std) 49 | true_gaussian_likelihood = exercise1_1_soln.gaussian_likelihood(x, mu, log_std) 50 | 51 | batch_size = 32 52 | feed_dict = {x: np.random.rand(batch_size, dim), 53 | mu: np.random.rand(batch_size, dim), 54 | log_std: np.random.rand(dim)} 55 | 56 | your_result, true_result = sess.run([your_gaussian_likelihood, true_gaussian_likelihood], 57 | feed_dict=feed_dict) 58 | 59 | correct = np.allclose(your_result, true_result) 60 | print_result(correct) -------------------------------------------------------------------------------- /spinup/exercises/tf1/problem_set_1/exercise1_2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from spinup.exercises.tf1.problem_set_1 import exercise1_1 4 | 5 | """ 6 | 7 | Exercise 1.2: PPO Gaussian Policy 8 | 9 | Implement an MLP diagonal Gaussian policy for PPO. 10 | 11 | Log-likelihoods will be computed using your answer to Exercise 1.1, 12 | so make sure to complete that exercise before beginning this one. 13 | 14 | """ 15 | 16 | EPS = 1e-8 17 | 18 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 19 | """ 20 | Builds a multi-layer perceptron in Tensorflow. 21 | 22 | Args: 23 | x: Input tensor. 24 | 25 | hidden_sizes: Tuple, list, or other iterable giving the number of units 26 | for each hidden layer of the MLP. 27 | 28 | activation: Activation function for all layers except last. 29 | 30 | output_activation: Activation function for last layer. 31 | 32 | Returns: 33 | A TF symbol for the output of an MLP that takes x as an input. 34 | 35 | """ 36 | ####################### 37 | # # 38 | # YOUR CODE HERE # 39 | # # 40 | ####################### 41 | pass 42 | 43 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 44 | """ 45 | Builds symbols to sample actions and compute log-probs of actions. 
46 | 47 | Special instructions: Make log_std a tf variable with the same shape as 48 | the action vector, independent of x, initialized to [-0.5, -0.5, ..., -0.5]. 49 | 50 | Args: 51 | x: Input tensor of states. Shape [batch, obs_dim]. 52 | 53 | a: Input tensor of actions. Shape [batch, act_dim]. 54 | 55 | hidden_sizes: Sizes of hidden layers for action network MLP. 56 | 57 | activation: Activation function for all layers except last. 58 | 59 | output_activation: Activation function for last layer (action layer). 60 | 61 | action_space: A gym.spaces object describing the action space of the 62 | environment this agent will interact with. 63 | 64 | Returns: 65 | pi: A symbol for sampling stochastic actions from a Gaussian 66 | distribution. 67 | 68 | logp: A symbol for computing log-likelihoods of actions from a Gaussian 69 | distribution. 70 | 71 | logp_pi: A symbol for computing log-likelihoods of actions in pi from a 72 | Gaussian distribution. 73 | 74 | """ 75 | ####################### 76 | # # 77 | # YOUR CODE HERE # 78 | # # 79 | ####################### 80 | # mu = 81 | # log_std = 82 | # pi = 83 | 84 | logp = exercise1_1.gaussian_likelihood(a, mu, log_std) 85 | logp_pi = exercise1_1.gaussian_likelihood(pi, mu, log_std) 86 | return pi, logp, logp_pi 87 | 88 | 89 | if __name__ == '__main__': 90 | """ 91 | Run this file to verify your solution. 92 | """ 93 | 94 | from spinup import ppo_tf1 as ppo 95 | from spinup.exercises.common import print_result 96 | import gym 97 | import os 98 | import pandas as pd 99 | import psutil 100 | import time 101 | 102 | logdir = "/tmp/experiments/%i"%int(time.time()) 103 | ppo(env_fn = lambda : gym.make('InvertedPendulum-v2'), 104 | ac_kwargs=dict(policy=mlp_gaussian_policy, hidden_sizes=(64,)), 105 | steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir)) 106 | 107 | # Get scores from last five epochs to evaluate success. 108 | data = pd.read_table(os.path.join(logdir,'progress.txt')) 109 | last_scores = data['AverageEpRet'][-5:] 110 | 111 | # Your implementation is probably correct if the agent has a score >500, 112 | # or if it reaches the top possible score of 1000, in the last five epochs. 
113 | correct = np.mean(last_scores) > 500 or np.max(last_scores)==1e3 114 | print_result(correct) -------------------------------------------------------------------------------- /spinup/exercises/tf1/problem_set_1_solutions/exercise1_1_soln.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | EPS=1e-8 5 | 6 | def gaussian_likelihood(x, mu, log_std): 7 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 8 | return tf.reduce_sum(pre_sum, axis=1) -------------------------------------------------------------------------------- /spinup/exercises/tf1/problem_set_1_solutions/exercise1_2_soln.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | EPS = 1e-8 6 | 7 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 8 | for h in hidden_sizes[:-1]: 9 | x = tf.layers.dense(x, units=h, activation=activation) 10 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 11 | 12 | def gaussian_likelihood(x, mu, log_std): 13 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 14 | return tf.reduce_sum(pre_sum, axis=1) 15 | 16 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 17 | act_dim = a.shape.as_list()[-1] 18 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 19 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 20 | std = tf.exp(log_std) 21 | pi = mu + tf.random_normal(tf.shape(mu)) * std 22 | logp = gaussian_likelihood(a, mu, log_std) 23 | logp_pi = gaussian_likelihood(pi, mu, log_std) 24 | return pi, logp, logp_pi -------------------------------------------------------------------------------- /spinup/exercises/tf1/problem_set_2/exercise2_2.py: -------------------------------------------------------------------------------- 1 | from spinup.algos.tf1.ddpg.core import mlp, mlp_actor_critic 2 | from spinup.utils.run_utils import ExperimentGrid 3 | from spinup import ddpg_tf1 as ddpg 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | """ 8 | 9 | Exercise 2.2: Silent Bug in DDPG 10 | 11 | In this exercise, you will run DDPG with a bugged actor-critic. Your goal is 12 | to determine whether there is any performance degradation, and if so, 13 | figure out what's going wrong. 14 | 15 | You do NOT need to write code for this exercise.
16 | 17 | """ 18 | 19 | """ 20 | Bugged Actor-Critic 21 | """ 22 | def bugged_mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu, 23 | output_activation=tf.tanh, action_space=None): 24 | act_dim = a.shape.as_list()[-1] 25 | act_limit = action_space.high[0] 26 | with tf.variable_scope('pi'): 27 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 28 | with tf.variable_scope('q'): 29 | q = mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None) 30 | with tf.variable_scope('q', reuse=True): 31 | q_pi = mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None) 32 | return pi, q, q_pi 33 | 34 | 35 | if __name__ == '__main__': 36 | import argparse 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 39 | parser.add_argument('--h', type=int, default=300) 40 | parser.add_argument('--l', type=int, default=1) 41 | parser.add_argument('--num_runs', '-n', type=int, default=3) 42 | parser.add_argument('--steps_per_epoch', '-s', type=int, default=5000) 43 | parser.add_argument('--total_steps', '-t', type=int, default=int(5e4)) 44 | args = parser.parse_args() 45 | 46 | def ddpg_with_actor_critic(bugged, **kwargs): 47 | actor_critic = bugged_mlp_actor_critic if bugged else mlp_actor_critic 48 | return ddpg(actor_critic=actor_critic, 49 | ac_kwargs=dict(hidden_sizes=[args.h]*args.l), 50 | start_steps=5000, 51 | max_ep_len=150, 52 | batch_size=64, 53 | polyak=0.95, 54 | **kwargs) 55 | 56 | eg = ExperimentGrid(name='ex2-2_ddpg') 57 | eg.add('replay_size', int(args.total_steps)) 58 | eg.add('env_name', args.env, '', True) 59 | eg.add('seed', [10*i for i in range(args.num_runs)]) 60 | eg.add('epochs', int(args.total_steps / args.steps_per_epoch)) 61 | eg.add('steps_per_epoch', args.steps_per_epoch) 62 | eg.add('bugged', [False, True]) 63 | eg.run(ddpg_with_actor_critic, datestamp=True) -------------------------------------------------------------------------------- /spinup/user_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | 4 | # Default neural network backend for each algo 5 | # (Must be either 'tf1' or 'pytorch') 6 | DEFAULT_BACKEND = { 7 | 'vpg': 'pytorch', 8 | 'trpo': 'tf1', 9 | 'ppo': 'pytorch', 10 | 'ddpg': 'pytorch', 11 | 'td3': 'pytorch', 12 | 'sac': 'pytorch' 13 | } 14 | 15 | # Where experiment outputs are saved by default: 16 | DEFAULT_DATA_DIR = osp.join(osp.abspath(osp.dirname(osp.dirname(__file__))),'data') 17 | 18 | # Whether to automatically insert a date and time stamp into the names of 19 | # save directories: 20 | FORCE_DATESTAMP = False 21 | 22 | # Whether GridSearch provides automatically-generated default shorthands: 23 | DEFAULT_SHORTHAND = True 24 | 25 | # Tells the GridSearch how many seconds to pause for before launching 26 | # experiments. 
27 | WAIT_BEFORE_LAUNCH = 5 -------------------------------------------------------------------------------- /spinup/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/utils/__init__.py -------------------------------------------------------------------------------- /spinup/utils/mpi_pytorch.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import numpy as np 3 | import os 4 | import torch 5 | from mpi4py import MPI 6 | from spinup.utils.mpi_tools import broadcast, mpi_avg, num_procs, proc_id 7 | 8 | def setup_pytorch_for_mpi(): 9 | """ 10 | Avoid slowdowns caused by each separate process's PyTorch using 11 | more than its fair share of CPU resources. 12 | """ 13 | #print('Proc %d: Reporting original number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 14 | if torch.get_num_threads()==1: 15 | return 16 | fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1) 17 | torch.set_num_threads(fair_num_threads) 18 | #print('Proc %d: Reporting new number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 19 | 20 | def mpi_avg_grads(module): 21 | """ Average contents of gradient buffers across MPI processes. """ 22 | if num_procs()==1: 23 | return 24 | for p in module.parameters(): 25 | p_grad_numpy = p.grad.numpy() # numpy view of tensor data 26 | avg_p_grad = mpi_avg(p.grad) 27 | p_grad_numpy[:] = avg_p_grad[:] 28 | 29 | def sync_params(module): 30 | """ Sync all parameters of module across all MPI processes. """ 31 | if num_procs()==1: 32 | return 33 | for p in module.parameters(): 34 | p_numpy = p.data.numpy() 35 | broadcast(p_numpy) -------------------------------------------------------------------------------- /spinup/utils/mpi_tf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | from spinup.utils.mpi_tools import broadcast 5 | 6 | 7 | def flat_concat(xs): 8 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 9 | 10 | def assign_params_from_flat(x, params): 11 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 12 | splits = tf.split(x, [flat_size(p) for p in params]) 13 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 14 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 15 | 16 | def sync_params(params): 17 | get_params = flat_concat(params) 18 | def _broadcast(x): 19 | broadcast(x) 20 | return x 21 | synced_params = tf.py_func(_broadcast, [get_params], tf.float32) 22 | return assign_params_from_flat(synced_params, params) 23 | 24 | def sync_all_params(): 25 | """Sync all tf variables across MPI processes.""" 26 | return sync_params(tf.global_variables()) 27 | 28 | 29 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 30 | """ 31 | Adam optimizer that averages gradients across MPI processes. 32 | 33 | The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_. 34 | For documentation on method arguments, see the Tensorflow docs page for 35 | the base `AdamOptimizer`_. 36 | 37 | .. _`MpiAdamOptimizer`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_adam_optimizer.py 38 | .. 
_`AdamOptimizer`: https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 39 | """ 40 | 41 | def __init__(self, **kwargs): 42 | self.comm = MPI.COMM_WORLD 43 | tf.train.AdamOptimizer.__init__(self, **kwargs) 44 | 45 | def compute_gradients(self, loss, var_list, **kwargs): 46 | """ 47 | Same as normal compute_gradients, except average grads over processes. 48 | """ 49 | grads_and_vars = super().compute_gradients(loss, var_list, **kwargs) 50 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 51 | flat_grad = flat_concat([g for g, v in grads_and_vars]) 52 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 53 | sizes = [int(np.prod(s)) for s in shapes] 54 | 55 | num_tasks = self.comm.Get_size() 56 | buf = np.zeros(flat_grad.shape, np.float32) 57 | 58 | def _collect_grads(flat_grad): 59 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 60 | np.divide(buf, float(num_tasks), out=buf) 61 | return buf 62 | 63 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 64 | avg_flat_grad.set_shape(flat_grad.shape) 65 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 66 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 67 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 68 | 69 | return avg_grads_and_vars 70 | 71 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 72 | """ 73 | Same as normal apply_gradients, except sync params after update. 74 | """ 75 | opt = super().apply_gradients(grads_and_vars, global_step, name) 76 | with tf.control_dependencies([opt]): 77 | sync = sync_params([v for g,v in grads_and_vars]) 78 | return tf.group([opt, sync]) -------------------------------------------------------------------------------- /spinup/utils/mpi_tools.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import os, subprocess, sys 3 | import numpy as np 4 | 5 | 6 | def mpi_fork(n, bind_to_core=False): 7 | """ 8 | Re-launches the current script with workers linked by MPI. 9 | 10 | Also, terminates the original process that launched it. 11 | 12 | Taken almost without modification from the Baselines function of the 13 | `same name`_. 14 | 15 | .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py 16 | 17 | Args: 18 | n (int): Number of process to split into. 19 | 20 | bind_to_core (bool): Bind each MPI process to a core. 
21 | """ 22 | if n<=1: 23 | return 24 | if os.getenv("IN_MPI") is None: 25 | env = os.environ.copy() 26 | env.update( 27 | MKL_NUM_THREADS="1", 28 | OMP_NUM_THREADS="1", 29 | IN_MPI="1" 30 | ) 31 | args = ["mpirun", "-np", str(n)] 32 | if bind_to_core: 33 | args += ["-bind-to", "core"] 34 | args += [sys.executable] + sys.argv 35 | subprocess.check_call(args, env=env) 36 | sys.exit() 37 | 38 | 39 | def msg(m, string=''): 40 | print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m)) 41 | 42 | def proc_id(): 43 | """Get rank of calling process.""" 44 | return MPI.COMM_WORLD.Get_rank() 45 | 46 | def allreduce(*args, **kwargs): 47 | return MPI.COMM_WORLD.Allreduce(*args, **kwargs) 48 | 49 | def num_procs(): 50 | """Count active MPI processes.""" 51 | return MPI.COMM_WORLD.Get_size() 52 | 53 | def broadcast(x, root=0): 54 | MPI.COMM_WORLD.Bcast(x, root=root) 55 | 56 | def mpi_op(x, op): 57 | x, scalar = ([x], True) if np.isscalar(x) else (x, False) 58 | x = np.asarray(x, dtype=np.float32) 59 | buff = np.zeros_like(x, dtype=np.float32) 60 | allreduce(x, buff, op=op) 61 | return buff[0] if scalar else buff 62 | 63 | def mpi_sum(x): 64 | return mpi_op(x, MPI.SUM) 65 | 66 | def mpi_avg(x): 67 | """Average a scalar or vector over MPI processes.""" 68 | return mpi_sum(x) / num_procs() 69 | 70 | def mpi_statistics_scalar(x, with_min_and_max=False): 71 | """ 72 | Get mean/std and optional min/max of scalar x across MPI processes. 73 | 74 | Args: 75 | x: An array containing samples of the scalar to produce statistics 76 | for. 77 | 78 | with_min_and_max (bool): If true, return min and max of x in 79 | addition to mean and std. 80 | """ 81 | x = np.array(x, dtype=np.float32) 82 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 83 | mean = global_sum / global_n 84 | 85 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 86 | std = np.sqrt(global_sum_sq / global_n) # compute global std 87 | 88 | if with_min_and_max: 89 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 90 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 91 | return mean, std, global_min, global_max 92 | return mean, std -------------------------------------------------------------------------------- /spinup/utils/run_entrypoint.py: -------------------------------------------------------------------------------- 1 | import zlib 2 | import pickle 3 | import base64 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('encoded_thunk') 9 | args = parser.parse_args() 10 | thunk = pickle.loads(zlib.decompress(base64.b64decode(args.encoded_thunk))) 11 | thunk() -------------------------------------------------------------------------------- /spinup/utils/serialization_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def convert_json(obj): 4 | """ Convert obj to a version which can be serialized with JSON. 
""" 5 | if is_json_serializable(obj): 6 | return obj 7 | else: 8 | if isinstance(obj, dict): 9 | return {convert_json(k): convert_json(v) 10 | for k,v in obj.items()} 11 | 12 | elif isinstance(obj, tuple): 13 | return (convert_json(x) for x in obj) 14 | 15 | elif isinstance(obj, list): 16 | return [convert_json(x) for x in obj] 17 | 18 | elif hasattr(obj,'__name__') and not('lambda' in obj.__name__): 19 | return convert_json(obj.__name__) 20 | 21 | elif hasattr(obj,'__dict__') and obj.__dict__: 22 | obj_dict = {convert_json(k): convert_json(v) 23 | for k,v in obj.__dict__.items()} 24 | return {str(obj): obj_dict} 25 | 26 | return str(obj) 27 | 28 | def is_json_serializable(v): 29 | try: 30 | json.dumps(v) 31 | return True 32 | except: 33 | return False -------------------------------------------------------------------------------- /spinup/utils/test_policy.py: -------------------------------------------------------------------------------- 1 | import time 2 | import joblib 3 | import os 4 | import os.path as osp 5 | import tensorflow as tf 6 | import torch 7 | from spinup import EpochLogger 8 | from spinup.utils.logx import restore_tf_graph 9 | 10 | 11 | def load_policy_and_env(fpath, itr='last', deterministic=False): 12 | """ 13 | Load a policy from save, whether it's TF or PyTorch, along with RL env. 14 | 15 | Not exceptionally future-proof, but it will suffice for basic uses of the 16 | Spinning Up implementations. 17 | 18 | Checks to see if there's a tf1_save folder. If yes, assumes the model 19 | is tensorflow and loads it that way. Otherwise, loads as if there's a 20 | PyTorch save. 21 | """ 22 | 23 | # determine if tf save or pytorch save 24 | if any(['tf1_save' in x for x in os.listdir(fpath)]): 25 | backend = 'tf1' 26 | else: 27 | backend = 'pytorch' 28 | 29 | # handle which epoch to load from 30 | if itr=='last': 31 | # check filenames for epoch (AKA iteration) numbers, find maximum value 32 | 33 | if backend == 'tf1': 34 | saves = [int(x[8:]) for x in os.listdir(fpath) if 'tf1_save' in x and len(x)>8] 35 | 36 | elif backend == 'pytorch': 37 | pytsave_path = osp.join(fpath, 'pyt_save') 38 | # Each file in this folder has naming convention 'modelXX.pt', where 39 | # 'XX' is either an integer or empty string. Empty string case 40 | # corresponds to len(x)==8, hence that case is excluded. 41 | saves = [int(x.split('.')[0][5:]) for x in os.listdir(pytsave_path) if len(x)>8 and 'model' in x] 42 | 43 | itr = '%d'%max(saves) if len(saves) > 0 else '' 44 | 45 | else: 46 | assert isinstance(itr, int), \ 47 | "Bad value provided for itr (needs to be int or 'last')." 48 | itr = '%d'%itr 49 | 50 | # load the get_action function 51 | if backend == 'tf1': 52 | get_action = load_tf_policy(fpath, itr, deterministic) 53 | else: 54 | get_action = load_pytorch_policy(fpath, itr, deterministic) 55 | 56 | # try to load environment from save 57 | # (sometimes this will fail because the environment could not be pickled) 58 | try: 59 | state = joblib.load(osp.join(fpath, 'vars'+itr+'.pkl')) 60 | env = state['env'] 61 | except: 62 | env = None 63 | 64 | return env, get_action 65 | 66 | 67 | def load_tf_policy(fpath, itr, deterministic=False): 68 | """ Load a tensorflow policy saved with Spinning Up Logger.""" 69 | 70 | fname = osp.join(fpath, 'tf1_save'+itr) 71 | print('\n\nLoading from %s.\n\n'%fname) 72 | 73 | # load the things! 
74 | sess = tf.Session() 75 | model = restore_tf_graph(sess, fname) 76 | 77 | # get the correct op for executing actions 78 | if deterministic and 'mu' in model.keys(): 79 | # 'deterministic' is only a valid option for SAC policies 80 | print('Using deterministic action op.') 81 | action_op = model['mu'] 82 | else: 83 | print('Using default action op.') 84 | action_op = model['pi'] 85 | 86 | # make function for producing an action given a single state 87 | get_action = lambda x : sess.run(action_op, feed_dict={model['x']: x[None,:]})[0] 88 | 89 | return get_action 90 | 91 | 92 | def load_pytorch_policy(fpath, itr, deterministic=False): 93 | """ Load a pytorch policy saved with Spinning Up Logger.""" 94 | 95 | fname = osp.join(fpath, 'pyt_save', 'model'+itr+'.pt') 96 | print('\n\nLoading from %s.\n\n'%fname) 97 | 98 | model = torch.load(fname) 99 | 100 | # make function for producing an action given a single state 101 | def get_action(x): 102 | with torch.no_grad(): 103 | x = torch.as_tensor(x, dtype=torch.float32) 104 | action = model.act(x) 105 | return action 106 | 107 | return get_action 108 | 109 | 110 | def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True): 111 | 112 | assert env is not None, \ 113 | "Environment not found!\n\n It looks like the environment wasn't saved, " + \ 114 | "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \ 115 | "page on Experiment Outputs for how to handle this situation." 116 | 117 | logger = EpochLogger() 118 | o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 119 | while n < num_episodes: 120 | if render: 121 | env.render() 122 | time.sleep(1e-3) 123 | 124 | a = get_action(o) 125 | o, r, d, _ = env.step(a) 126 | ep_ret += r 127 | ep_len += 1 128 | 129 | if d or (ep_len == max_ep_len): 130 | logger.store(EpRet=ep_ret, EpLen=ep_len) 131 | print('Episode %d \t EpRet %.3f \t EpLen %d'%(n, ep_ret, ep_len)) 132 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 133 | n += 1 134 | 135 | logger.log_tabular('EpRet', with_min_and_max=True) 136 | logger.log_tabular('EpLen', average_only=True) 137 | logger.dump_tabular() 138 | 139 | 140 | if __name__ == '__main__': 141 | import argparse 142 | parser = argparse.ArgumentParser() 143 | parser.add_argument('fpath', type=str) 144 | parser.add_argument('--len', '-l', type=int, default=0) 145 | parser.add_argument('--episodes', '-n', type=int, default=100) 146 | parser.add_argument('--norender', '-nr', action='store_true') 147 | parser.add_argument('--itr', '-i', type=int, default=-1) 148 | parser.add_argument('--deterministic', '-d', action='store_true') 149 | args = parser.parse_args() 150 | env, get_action = load_policy_and_env(args.fpath, 151 | args.itr if args.itr >=0 else 'last', 152 | args.deterministic) 153 | run_policy(env, get_action, args.len, args.episodes, not(args.norender)) -------------------------------------------------------------------------------- /spinup/version.py: -------------------------------------------------------------------------------- 1 | version_info = (0, 2, 0) 2 | # format: 3 | # ('spinup_major', 'spinup_minor', 'spinup_patch') 4 | 5 | def get_version(): 6 | "Returns the version as a human-format string." 
7 | return '%d.%d.%d' % version_info 8 | 9 | __version__ = get_version() -------------------------------------------------------------------------------- /test/test_ppo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | from functools import partial 5 | 6 | import gym 7 | import tensorflow as tf 8 | 9 | from spinup import ppo_tf1 as ppo 10 | 11 | 12 | class TestPPO(unittest.TestCase): 13 | def test_cartpole(self): 14 | ''' Test training a small agent in a simple environment ''' 15 | env_fn = partial(gym.make, 'CartPole-v1') 16 | ac_kwargs = dict(hidden_sizes=(32,)) 17 | with tf.Graph().as_default(): 18 | ppo(env_fn, steps_per_epoch=100, epochs=10, ac_kwargs=ac_kwargs) 19 | # TODO: ensure policy has got better at the task 20 | 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /travis_setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | mkdir -p $HOME/.mujoco 6 | 7 | # Avoid using pyenv in travis, since it adds ~7 minutes to turnaround time 8 | if [ "$TRAVIS_OS_NAME" == "osx" ] 9 | then 10 | # https://github.com/travis-ci/travis-ci/issues/9640 11 | sudo softwareupdate --install "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.4" 12 | brew update 13 | brew install open-mpi 14 | brew install gcc 15 | brew link --overwrite gcc 16 | curl $MUJOCO_FOR_OSX | tar xz -C $HOME/.mujoco/ 17 | elif [ "$TRAVIS_OS_NAME" == "linux" ] 18 | then 19 | # Because this is flaky, try several times 20 | set +e 21 | COUNT=0 22 | while [ $COUNT -lt 5 ]; do 23 | sudo curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf 24 | if [ $? -eq 0 ];then 25 | break 26 | fi 27 | let COUNT=COUNT+1 28 | done 29 | if [ $COUNT -ge 5 ]; then 30 | echo "Failed to download patchelf" 31 | exit 1 32 | fi 33 | set -e 34 | 35 | sudo chmod +x /usr/local/bin/patchelf 36 | curl $MUJOCO_FOR_LINUX | tar xz -C $HOME/.mujoco/ 37 | 38 | sudo apt-get update 39 | sudo apt-get install -y openmpi-bin libopenmpi-dev libosmesa6-dev libglew-dev 40 | fi 41 | --------------------------------------------------------------------------------