├── .gitignore ├── .travis.yml ├── LICENSE ├── docs ├── Makefile ├── _static │ ├── css │ │ └── modify.css │ ├── openai-favicon2_32x32.ico │ ├── openai-favicon2_32x32.png │ └── openai_icon.ico ├── algorithms │ ├── ddpg.rst │ ├── ppo.rst │ ├── sac.rst │ ├── td3.rst │ ├── trpo.rst │ └── vpg.rst ├── conf.py ├── docs_requirements.txt ├── etc │ ├── acknowledgements.rst │ └── author.rst ├── images │ ├── alphago.jpg │ ├── bench │ │ ├── bench_ant.svg │ │ ├── bench_halfcheetah.svg │ │ ├── bench_hopper.svg │ │ ├── bench_swim.svg │ │ └── bench_walker.svg │ ├── ex2-1_trpo_hopper.png │ ├── ex2-2_ddpg_bug.svg │ ├── ex2-2_ddpg_bug_pytorch.png │ ├── knocked-over-stand-up.mp4 │ ├── knocked_down_standup.png │ ├── logo.png │ ├── ms_pacman.png │ ├── openai-favicon.png │ ├── openai-favicon2.png │ ├── openai-favicon2_32x32.png │ ├── plots │ │ ├── ddpg │ │ │ ├── ddpg_ant_performance.svg │ │ │ ├── ddpg_halfcheetah_performance.svg │ │ │ ├── ddpg_hopper_performance.svg │ │ │ ├── ddpg_swimmer_performance.svg │ │ │ └── ddpg_walker2d_performance.svg │ │ ├── ppo │ │ │ ├── ppo_ant_performance.svg │ │ │ ├── ppo_halfcheetah_performance.svg │ │ │ ├── ppo_hopper_performance.svg │ │ │ ├── ppo_swimmer_performance.svg │ │ │ └── ppo_walker2d_performance.svg │ │ ├── pyt │ │ │ ├── pytorch_ant_performance.svg │ │ │ ├── pytorch_halfcheetah_performance.svg │ │ │ ├── pytorch_hopper_performance.svg │ │ │ ├── pytorch_swimmer_performance.svg │ │ │ └── pytorch_walker2d_performance.svg │ │ ├── sac │ │ │ ├── sac_ant_performance.svg │ │ │ ├── sac_halfcheetah_performance.svg │ │ │ ├── sac_hopper_performance.svg │ │ │ ├── sac_swimmer_performance.svg │ │ │ └── sac_walker2d_performance.svg │ │ ├── td3 │ │ │ ├── td3_ant_performance.svg │ │ │ ├── td3_halfcheetah_performance.svg │ │ │ ├── td3_hopper_performance.svg │ │ │ ├── td3_swimmer_performance.svg │ │ │ └── td3_walker2d_performance.svg │ │ ├── tf1 │ │ │ ├── tensorflow_ant_performance.svg │ │ │ ├── tensorflow_halfcheetah_performance.svg │ │ │ ├── tensorflow_hopper_performance.svg │ │ │ ├── tensorflow_swimmer_performance.svg │ │ │ └── tensorflow_walker2d_performance.svg │ │ └── vpg │ │ │ ├── vpg_ant_performance.svg │ │ │ ├── vpg_halfcheetah_performance.svg │ │ │ ├── vpg_hopper_performance.svg │ │ │ ├── vpg_swimmer_performance.svg │ │ │ └── vpg_walker2d_performance.svg │ ├── recolored_logo.png │ ├── rl_algorithms.png │ ├── rl_algorithms.svg │ ├── rl_algorithms.xml │ ├── rl_algorithms_9_12.png │ ├── rl_algorithms_9_15.svg │ ├── rl_algorithms_9_15.xml │ ├── rl_diagram_transparent_bg.png │ ├── spinning-up-in-rl.png │ ├── spinning-up-logo.png │ ├── spinning-up-logo.svg │ └── spinning-up-logo2.png ├── index.rst ├── make.bat ├── spinningup │ ├── bench.rst │ ├── bench_ddpg.rst │ ├── bench_ppo.rst │ ├── bench_sac.rst │ ├── bench_td3.rst │ ├── bench_vpg.rst │ ├── exercise2_1_soln.rst │ ├── exercise2_2_soln.rst │ ├── exercises.rst │ ├── extra_pg_proof1.rst │ ├── extra_pg_proof2.rst │ ├── extra_tf_pg_implementation.rst │ ├── keypapers.rst │ ├── rl_intro.rst │ ├── rl_intro2.rst │ ├── rl_intro3.rst │ ├── rl_intro4.rst │ └── spinningup.rst ├── user │ ├── algorithms.rst │ ├── installation.rst │ ├── introduction.rst │ ├── plotting.rst │ ├── running.rst │ └── saving_and_loading.rst └── utils │ ├── logger.rst │ ├── mpi.rst │ ├── plotter.rst │ └── run_utils.rst ├── readme.md ├── readthedocs.yml ├── setup.py ├── spinup ├── __init__.py ├── algos │ ├── __init__.py │ ├── pytorch │ │ ├── ddpg │ │ │ ├── core.py │ │ │ └── ddpg.py │ │ ├── ppo │ │ │ ├── core.py │ │ │ └── ppo.py │ │ ├── sac │ │ │ ├── core.py │ │ │ └── sac.py 
│ │ ├── td3 │ │ │ ├── core.py │ │ │ └── td3.py │ │ ├── trpo │ │ │ └── trpo.py │ │ └── vpg │ │ │ ├── core.py │ │ │ └── vpg.py │ └── tf1 │ │ ├── ddpg │ │ ├── __init__.py │ │ ├── core.py │ │ └── ddpg.py │ │ ├── ppo │ │ ├── __init__.py │ │ ├── core.py │ │ └── ppo.py │ │ ├── sac │ │ ├── __init__.py │ │ ├── core.py │ │ └── sac.py │ │ ├── td3 │ │ ├── __init__.py │ │ ├── core.py │ │ └── td3.py │ │ ├── trpo │ │ ├── __init__.py │ │ ├── core.py │ │ └── trpo.py │ │ └── vpg │ │ ├── __init__.py │ │ ├── core.py │ │ └── vpg.py ├── examples │ ├── pytorch │ │ ├── bench_ppo_cartpole.py │ │ └── pg_math │ │ │ ├── 1_simple_pg.py │ │ │ └── 2_rtg_pg.py │ └── tf1 │ │ ├── bench_ppo_cartpole.py │ │ ├── pg_math │ │ ├── 1_simple_pg.py │ │ └── 2_rtg_pg.py │ │ └── train_mnist.py ├── exercises │ ├── common.py │ ├── pytorch │ │ ├── problem_set_1 │ │ │ ├── exercise1_1.py │ │ │ ├── exercise1_2.py │ │ │ ├── exercise1_2_auxiliary.py │ │ │ └── exercise1_3.py │ │ ├── problem_set_1_solutions │ │ │ ├── exercise1_1_soln.py │ │ │ └── exercise1_2_soln.py │ │ └── problem_set_2 │ │ │ └── exercise2_2.py │ └── tf1 │ │ ├── problem_set_1 │ │ ├── exercise1_1.py │ │ ├── exercise1_2.py │ │ └── exercise1_3.py │ │ ├── problem_set_1_solutions │ │ ├── exercise1_1_soln.py │ │ └── exercise1_2_soln.py │ │ └── problem_set_2 │ │ └── exercise2_2.py ├── run.py ├── user_config.py ├── utils │ ├── __init__.py │ ├── logx.py │ ├── mpi_pytorch.py │ ├── mpi_tf.py │ ├── mpi_tools.py │ ├── plot.py │ ├── run_entrypoint.py │ ├── run_utils.py │ ├── serialization_utils.py │ └── test_policy.py └── version.py ├── test └── test_ppo.py └── travis_setup.sh /.gitignore: -------------------------------------------------------------------------------- 1 | *.*~ 2 | __pycache__/ 3 | *.pkl 4 | data/ 5 | **/*.egg-info 6 | .python-version 7 | .idea/ 8 | .vscode/ 9 | .DS_Store 10 | _build/ 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | env: 2 | global: 3 | - LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/travis/.mujoco/mujoco200/bin 4 | 5 | matrix: 6 | include: 7 | - os: linux 8 | language: python 9 | python: "3.6" 10 | 11 | before_install: 12 | - ./travis_setup.sh 13 | 14 | script: 15 | - pip3 install --upgrade -e .[mujoco] 16 | - python3 -c "import mujoco_py" 17 | - python3 -c "import spinup" 18 | - python3 -m pytest 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2018 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = SpinningUp 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/css/modify.css: -------------------------------------------------------------------------------- 1 | :root { 2 | /* Colors */ 3 | --color--white: #fff; 4 | --color--lightwash: #f7fbfb; 5 | --color--mediumwash: #eff7f8; 6 | --color--darkwash: #e6f3f3; 7 | --color--warmgraylight: #eeedee; 8 | --color--warmgraydark: #a3acb0; 9 | --color--coolgray1: #c5c5d2; 10 | --color--coolgray2: #8e8ea0; 11 | --color--coolgray3: #6e6e80; 12 | --color--coolgray4: #404452; 13 | --color--black: #050505; 14 | --color--pink: #e6a2e4; 15 | --color--magenta: #dd5ce5; 16 | --color--red: #bd1c5f; 17 | --color--brightred: #ef4146; 18 | --color--orange: #e86c09; 19 | --color--golden: #f4ac36; 20 | --color--yellow: #ebe93d; 21 | --color--lightgreen: #68de7a; 22 | --color--darkgreen: #10a37f; 23 | --color--teal: #2ff3ce; 24 | --color--lightblue: #27b5ea; 25 | --color--mediumblue: #2e95d3; 26 | --color--darkblue: #5436da; 27 | --color--navyblue: #1d0d4c; 28 | --color--lightpurple: #6b40d8; 29 | --color--darkpurple: #412991; 30 | --color--lightgrayishpurple: #cdc3cf; 31 | --color--mediumgrayishpurple: #9c88a3; 32 | --color--darkgrayishpurple: #562f5f; 33 | } 34 | 35 | body { 36 | color: var(--color--darkgray) !important; 37 | fill: var(--color--darkgray) !important; 38 | } 39 | 40 | h1, h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend { 41 | /* font-weight: 500; 42 | font-family: Colfax, sans-serif !important; */ 43 | font-family: "Lato","proxima-nova","Helvetica Neue",Arial,sans-serif !important; 44 | } 45 | 46 | .wy-nav-top { 47 | background-color: var(--color--coolgray4) !important; 48 | } 49 | 50 | .rst-content .toc-backref { 51 | color: #404040 !important; 52 | } 53 | 54 | .footnote { 55 | padding-left: 0.75rem; 56 | background-color: var(--color--warmgraylight) !important; 57 | } 58 | 59 | .wy-nav-top a, .wy-nav-top a:visited { 60 | color: var(--color--white) !important; 61 | } 62 | 63 | .wy-menu-vertical header, .wy-menu-vertical p.caption { 64 | font-weight: 500 !important; 65 | letter-spacing: 1px; 66 | margin-top: 1.25rem; 67 | } 68 | 69 | .wy-side-nav-search { 70 | background-color: var(--color--warmgraylight) !important; 71 | } 72 | 73 | .wy-body-for-nav { 74 | background-color: var(--color--coolgray1) !important; 75 | } 76 | 77 | .wy-menu-vertical li span.toctree-expand { 78 | 
color: var(--color--coolgray2) !important; 79 | } 80 | 81 | .wy-nav-side { 82 | color: var(--color--coolgray1) !important; 83 | background-color: var(--color--coolgray4) !important; 84 | } 85 | 86 | .wy-side-nav-search input[type=text] { 87 | border-color: var(--color--warmgraydark) !important; 88 | } 89 | 90 | a { 91 | color: var(--color--mediumblue) !important; 92 | } 93 | 94 | a:visited { 95 | color: #9B59B6 !important; 96 | } 97 | 98 | .wy-menu-vertical a { 99 | color: var(--color--coolgray2) !important; 100 | } 101 | 102 | .wy-menu-vertical li.current a { 103 | border-right: none !important; 104 | color: var(--color--coolgray4) !important; 105 | } 106 | 107 | .wy-menu-vertical li.current { 108 | background-color: var(--color--warmgraylight) !important; 109 | } 110 | 111 | .wy-menu-vertical li.toctree-l2.current>a { 112 | background-color: var(--color--coolgray1) !important; 113 | } 114 | 115 | .wy-menu-vertical a:hover, .wy-menu-vertical li.current a:hover, .wy-menu-vertical li.toctree-l2.current>a:hover { 116 | color: var(--color--warmgraylight) !important; 117 | background-color: var(--color--coolgray3) !important; 118 | } 119 | 120 | .wy-alert-title, .rst-content .admonition-title { 121 | background-color: var(--color--mediumblue) !important; 122 | } 123 | 124 | .wy-alert, .rst-content .note, .rst-content .attention, .rst-content .caution, .rst-content .danger, .rst-content .error, .rst-content .hint, .rst-content .important, .rst-content .tip, .rst-content .warning, .rst-content .seealso, .rst-content .admonition-todo, .rst-content .admonition { 125 | background-color: var(--color--warmgraylight) !important; 126 | } 127 | 128 | .rst-content dl:not(.docutils) dt { 129 | border-color: var(--color--mediumblue) !important; 130 | background-color: var(--color--warmgraylight) !important; 131 | } 132 | 133 | /* .rst-content pre.literal-block, .rst-content div[class^='highlight'] { 134 | background-color: var(--color--warmgraylight) !important; 135 | } */ 136 | 137 | .wy-table-odd td, .wy-table-striped tr:nth-child(2n-1) td, .rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td { 138 | background-color: var(--color--warmgraylight) !important; 139 | } 140 | 141 | @media screen and (min-width: 1100px) { 142 | .wy-nav-content-wrap { 143 | background-color: var(--color--warmgraylight) !important; 144 | } 145 | } 146 | 147 | .wy-side-nav-search img { 148 | height: auto !important; 149 | width: 100% !important; 150 | padding: 0 !important; 151 | background-color: inherit !important; 152 | border-radius: 0 !important; 153 | margin: 0 !important 154 | } 155 | 156 | .wy-side-nav-search>a, .wy-side-nav-search .wy-dropdown>a { 157 | margin-bottom: 0 !important; 158 | } 159 | 160 | .wy-menu-vertical li.toctree-l1.current>a { 161 | border: none !important; 162 | } 163 | 164 | .wy-side-nav-search>div.version { 165 | color: var(--color--coolgray2) !important; 166 | } -------------------------------------------------------------------------------- /docs/_static/openai-favicon2_32x32.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/_static/openai-favicon2_32x32.ico -------------------------------------------------------------------------------- /docs/_static/openai-favicon2_32x32.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/_static/openai-favicon2_32x32.png -------------------------------------------------------------------------------- /docs/_static/openai_icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/_static/openai_icon.ico -------------------------------------------------------------------------------- /docs/algorithms/vpg.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Vanilla Policy Gradient 3 | ======================= 4 | 5 | .. contents:: Table of Contents 6 | 7 | 8 | Background 9 | ========== 10 | 11 | (Previously: `Introduction to RL, Part 3`_) 12 | 13 | .. _`Introduction to RL, Part 3`: ../spinningup/rl_intro3.html 14 | 15 | The key idea underlying policy gradients is to push up the probabilities of actions that lead to higher return, and push down the probabilities of actions that lead to lower return, until you arrive at the optimal policy. 16 | 17 | Quick Facts 18 | ----------- 19 | 20 | * VPG is an on-policy algorithm. 21 | * VPG can be used for environments with either discrete or continuous action spaces. 22 | * The Spinning Up implementation of VPG supports parallelization with MPI. 23 | 24 | Key Equations 25 | ------------- 26 | 27 | Let :math:`\pi_{\theta}` denote a policy with parameters :math:`\theta`, and :math:`J(\pi_{\theta})` denote the expected finite-horizon undiscounted return of the policy. The gradient of :math:`J(\pi_{\theta})` is 28 | 29 | .. math:: 30 | 31 | \nabla_{\theta} J(\pi_{\theta}) = \underE{\tau \sim \pi_{\theta}}{ 32 | \sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t|s_t) A^{\pi_{\theta}}(s_t,a_t) 33 | }, 34 | 35 | where :math:`\tau` is a trajectory and :math:`A^{\pi_{\theta}}` is the advantage function for the current policy. 36 | 37 | The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance: 38 | 39 | .. math:: 40 | 41 | \theta_{k+1} = \theta_k + \alpha \nabla_{\theta} J(\pi_{\theta_k}) 42 | 43 | Policy gradient implementations typically compute advantage function estimates based on the infinite-horizon discounted return, despite otherwise using the finite-horizon undiscounted policy gradient formula. 44 | 45 | Exploration vs. Exploitation 46 | ---------------------------- 47 | 48 | VPG trains a stochastic policy in an on-policy way. This means that it explores by sampling actions according to the latest version of its stochastic policy. The amount of randomness in action selection depends on both initial conditions and the training procedure. Over the course of training, the policy typically becomes progressively less random, as the update rule encourages it to exploit rewards that it has already found. This may cause the policy to get trapped in local optima. 49 | 50 | 51 | Pseudocode 52 | ---------- 53 | 54 | .. math:: 55 | :nowrap: 56 | 57 | \begin{algorithm}[H] 58 | \caption{Vanilla Policy Gradient Algorithm} 59 | \label{alg1} 60 | \begin{algorithmic}[1] 61 | \STATE Input: initial policy parameters $\theta_0$, initial value function parameters $\phi_0$ 62 | \FOR{$k = 0,1,2,...$} 63 | \STATE Collect set of trajectories ${\mathcal D}_k = \{\tau_i\}$ by running policy $\pi_k = \pi(\theta_k)$ in the environment. 64 | \STATE Compute rewards-to-go $\hat{R}_t$. 
65 | \STATE Compute advantage estimates, $\hat{A}_t$ (using any method of advantage estimation) based on the current value function $V_{\phi_k}$. 66 | \STATE Estimate policy gradient as 67 | \begin{equation*} 68 | \hat{g}_k = \frac{1}{|{\mathcal D}_k|} \sum_{\tau \in {\mathcal D}_k} \sum_{t=0}^T \left. \nabla_{\theta} \log\pi_{\theta}(a_t|s_t)\right|_{\theta_k} \hat{A}_t. 69 | \end{equation*} 70 | \STATE Compute policy update, either using standard gradient ascent, 71 | \begin{equation*} 72 | \theta_{k+1} = \theta_k + \alpha_k \hat{g}_k, 73 | \end{equation*} 74 | or via another gradient ascent algorithm like Adam. 75 | \STATE Fit value function by regression on mean-squared error: 76 | \begin{equation*} 77 | \phi_{k+1} = \arg \min_{\phi} \frac{1}{|{\mathcal D}_k| T} \sum_{\tau \in {\mathcal D}_k} \sum_{t=0}^T\left( V_{\phi} (s_t) - \hat{R}_t \right)^2, 78 | \end{equation*} 79 | typically via some gradient descent algorithm. 80 | \ENDFOR 81 | \end{algorithmic} 82 | \end{algorithm} 83 | 84 | 85 | Documentation 86 | ============= 87 | 88 | .. admonition:: You Should Know 89 | 90 | In what follows, we give documentation for the PyTorch and Tensorflow implementations of VPG in Spinning Up. They have nearly identical function calls and docstrings, except for details relating to model construction. However, we include both full docstrings for completeness. 91 | 92 | 93 | Documentation: PyTorch Version 94 | ------------------------------ 95 | 96 | .. autofunction:: spinup.vpg_pytorch 97 | 98 | Saved Model Contents: PyTorch Version 99 | ------------------------------------- 100 | 101 | The PyTorch saved model can be loaded with ``ac = torch.load('path/to/model.pt')``, yielding an actor-critic object (``ac``) that has the properties described in the docstring for ``vpg_pytorch``. 102 | 103 | You can get actions from this model with 104 | 105 | .. code-block:: python 106 | 107 | actions = ac.act(torch.as_tensor(obs, dtype=torch.float32)) 108 | 109 | 110 | Documentation: Tensorflow Version 111 | --------------------------------- 112 | 113 | .. autofunction:: spinup.vpg_tf1 114 | 115 | Saved Model Contents: Tensorflow Version 116 | ---------------------------------------- 117 | 118 | The computation graph saved by the logger includes: 119 | 120 | ======== ==================================================================== 121 | Key Value 122 | ======== ==================================================================== 123 | ``x`` Tensorflow placeholder for state input. 124 | ``pi`` Samples an action from the agent, conditioned on states in ``x``. 125 | ``v`` Gives value estimate for states in ``x``. 126 | ======== ==================================================================== 127 | 128 | This saved model can be accessed either by 129 | 130 | * running the trained policy with the `test_policy.py`_ tool, 131 | * or loading the whole saved graph into a program with `restore_tf_graph`_. 132 | 133 | .. _`test_policy.py`: ../user/saving_and_loading.html#loading-and-running-trained-policies 134 | .. _`restore_tf_graph`: ../utils/logger.html#spinup.utils.logx.restore_tf_graph 135 | 136 | References 137 | ========== 138 | 139 | Relevant Papers 140 | --------------- 141 | 142 | - `Policy Gradient Methods for Reinforcement Learning with Function Approximation`_, Sutton et al. 2000 143 | - `Optimizing Expectations: From Deep Reinforcement Learning to Stochastic Computation Graphs`_, Schulman 2016(a) 144 | - `Benchmarking Deep Reinforcement Learning for Continuous Control`_, Duan et al. 
2016 145 | - `High Dimensional Continuous Control Using Generalized Advantage Estimation`_, Schulman et al. 2016(b) 146 | 147 | .. _`Policy Gradient Methods for Reinforcement Learning with Function Approximation`: https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf 148 | .. _`Optimizing Expectations: From Deep Reinforcement Learning to Stochastic Computation Graphs`: http://joschu.net/docs/thesis.pdf 149 | .. _`Benchmarking Deep Reinforcement Learning for Continuous Control`: https://arxiv.org/abs/1604.06778 150 | .. _`High Dimensional Continuous Control Using Generalized Advantage Estimation`: https://arxiv.org/abs/1506.02438 151 | 152 | Why These Papers? 153 | ----------------- 154 | 155 | Sutton 2000 is included because it is a timeless classic of reinforcement learning theory, and contains references to the earlier work which led to modern policy gradients. Schulman 2016(a) is included because Chapter 2 contains a lucid introduction to the theory of policy gradient algorithms, including pseudocode. Duan 2016 is a clear, recent benchmark paper that shows how vanilla policy gradient in the deep RL setting (eg with neural network policies and Adam as the optimizer) compares with other deep RL algorithms. Schulman 2016(b) is included because our implementation of VPG makes use of Generalized Advantage Estimation for computing the policy gradient. 156 | 157 | 158 | Other Public Implementations 159 | ---------------------------- 160 | 161 | - rllab_ 162 | - `rllib (Ray)`_ 163 | 164 | .. _rllab: https://github.com/rll/rllab/blob/master/rllab/algos/vpg.py 165 | .. _`rllib (Ray)`: https://github.com/ray-project/ray/blob/master/python/ray/rllib/agents/pg 166 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Spinning Up documentation build configuration file, created by 5 | # sphinx-quickstart on Wed Aug 15 04:21:07 2018. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
19 | # 20 | import os 21 | import sys 22 | 23 | # Make sure spinup is accessible without going through setup.py 24 | dirname = os.path.dirname 25 | sys.path.insert(0, dirname(dirname(__file__))) 26 | 27 | # Mock mpi4py to get around having to install it on RTD server (which fails) 28 | # Also to mock PyTorch, because it is too large for the RTD server to download 29 | from unittest.mock import MagicMock 30 | 31 | class Mock(MagicMock): 32 | @classmethod 33 | def __getattr__(cls, name): 34 | return MagicMock() 35 | 36 | MOCK_MODULES = ['mpi4py', 37 | 'torch', 38 | 'torch.optim', 39 | 'torch.nn', 40 | 'torch.distributions', 41 | 'torch.distributions.normal', 42 | 'torch.distributions.categorical', 43 | 'torch.nn.functional', 44 | ] 45 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 46 | 47 | # Finish imports 48 | import spinup 49 | from recommonmark.parser import CommonMarkParser 50 | 51 | 52 | source_parsers = { 53 | '.md': CommonMarkParser, 54 | } 55 | 56 | 57 | # -- General configuration ------------------------------------------------ 58 | 59 | # If your documentation needs a minimal Sphinx version, state it here. 60 | # 61 | # needs_sphinx = '1.0' 62 | 63 | # Add any Sphinx extension module names here, as strings. They can be 64 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 65 | # ones. 66 | extensions = ['sphinx.ext.imgmath', 67 | 'sphinx.ext.viewcode', 68 | 'sphinx.ext.autodoc', 69 | 'sphinx.ext.napoleon'] 70 | 71 | #'sphinx.ext.mathjax', ?? 72 | 73 | # imgmath settings 74 | imgmath_image_format = 'svg' 75 | imgmath_font_size = 14 76 | 77 | # Add any paths that contain templates here, relative to this directory. 78 | templates_path = ['_templates'] 79 | 80 | # The suffix(es) of source filenames. 81 | # You can specify multiple suffix as a list of string: 82 | # 83 | source_suffix = ['.rst', '.md'] 84 | # source_suffix = '.rst' 85 | 86 | # The master toctree document. 87 | master_doc = 'index' 88 | 89 | # General information about the project. 90 | project = 'Spinning Up' 91 | copyright = '2018, OpenAI' 92 | author = 'Joshua Achiam' 93 | 94 | # The version info for the project you're documenting, acts as replacement for 95 | # |version| and |release|, also used in various other places throughout the 96 | # built documents. 97 | # 98 | # The short X.Y version. 99 | version = '' 100 | # The full version, including alpha/beta/rc tags. 101 | release = '' 102 | 103 | # The language for content autogenerated by Sphinx. Refer to documentation 104 | # for a list of supported languages. 105 | # 106 | # This is also used if you do content translation via gettext catalogs. 107 | # Usually you set "language" from the command line for these cases. 108 | language = None 109 | 110 | # List of patterns, relative to source directory, that match files and 111 | # directories to ignore when looking for source files. 112 | # This patterns also effect to html_static_path and html_extra_path 113 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 114 | 115 | # The name of the Pygments (syntax highlighting) style to use. 116 | pygments_style = 'default' #'sphinx' 117 | 118 | # If true, `todo` and `todoList` produce output, else they produce nothing. 119 | todo_include_todos = False 120 | 121 | 122 | # -- Options for HTML output ---------------------------------------------- 123 | 124 | # The theme to use for HTML and HTML Help pages. See the documentation for 125 | # a list of builtin themes. 
126 | # 127 | # html_theme = 'alabaster' 128 | html_theme = "sphinx_rtd_theme" 129 | 130 | # Theme options are theme-specific and customize the look and feel of a theme 131 | # further. For a list of options available for each theme, see the 132 | # documentation. 133 | # 134 | # html_theme_options = {} 135 | 136 | # Add any paths that contain custom static files (such as style sheets) here, 137 | # relative to this directory. They are copied after the builtin static files, 138 | # so a file named "default.css" will overwrite the builtin "default.css". 139 | html_static_path = ['_static'] 140 | 141 | html_logo = 'images/spinning-up-logo2.png' 142 | html_theme_options = { 143 | 'logo_only': True 144 | } 145 | #html_favicon = 'openai-favicon2_32x32.ico' 146 | html_favicon = 'openai_icon.ico' 147 | 148 | # -- Options for HTMLHelp output ------------------------------------------ 149 | 150 | # Output file base name for HTML help builder. 151 | htmlhelp_basename = 'SpinningUpdoc' 152 | 153 | # -- Options for LaTeX output --------------------------------------------- 154 | 155 | 156 | imgmath_latex_preamble = r''' 157 | \usepackage{algorithm} 158 | \usepackage{algorithmic} 159 | \usepackage{amsmath} 160 | \usepackage{cancel} 161 | 162 | \usepackage[verbose=true,letterpaper]{geometry} 163 | \geometry{ 164 | textheight=12in, 165 | textwidth=6.5in, 166 | top=1in, 167 | headheight=12pt, 168 | headsep=25pt, 169 | footskip=30pt 170 | } 171 | 172 | \newcommand{\E}{{\mathrm E}} 173 | 174 | \newcommand{\underE}[2]{\underset{\begin{subarray}{c}#1 \end{subarray}}{\E}\left[ #2 \right]} 175 | 176 | \newcommand{\Epi}[1]{\underset{\begin{subarray}{c}\tau \sim \pi \end{subarray}}{\E}\left[ #1 \right]} 177 | ''' 178 | 179 | latex_elements = { 180 | # The paper size ('letterpaper' or 'a4paper'). 181 | # 182 | # 'papersize': 'letterpaper', 183 | 184 | # The font size ('10pt', '11pt' or '12pt'). 185 | # 186 | # 'pointsize': '10pt', 187 | 188 | # Additional stuff for the LaTeX preamble. 189 | # 190 | 'preamble': r''' 191 | \usepackage{algorithm} 192 | \usepackage{algorithmic} 193 | \usepackage{amsmath} 194 | \usepackage{cancel} 195 | 196 | 197 | \newcommand{\E}{{\mathrm E}} 198 | 199 | \newcommand{\underE}[2]{\underset{\begin{subarray}{c}#1 \end{subarray}}{\E}\left[ #2 \right]} 200 | 201 | \newcommand{\Epi}[1]{\underset{\begin{subarray}{c}\tau \sim \pi \end{subarray}}{\E}\left[ #1 \right]} 202 | ''', 203 | 204 | # Latex figure (float) alignment 205 | # 206 | # 'figure_align': 'htbp', 207 | } 208 | 209 | # Grouping the document tree into LaTeX files. List of tuples 210 | # (source start file, target name, title, 211 | # author, documentclass [howto, manual, or own class]). 212 | latex_documents = [ 213 | (master_doc, 'SpinningUp.tex', 'Spinning Up Documentation', 214 | 'Joshua Achiam', 'manual'), 215 | ] 216 | 217 | 218 | # -- Options for manual page output --------------------------------------- 219 | 220 | # One entry per manual page. List of tuples 221 | # (source start file, name, description, authors, manual section). 222 | man_pages = [ 223 | (master_doc, 'spinningup', 'Spinning Up Documentation', 224 | [author], 1) 225 | ] 226 | 227 | 228 | # -- Options for Texinfo output ------------------------------------------- 229 | 230 | # Grouping the document tree into Texinfo files. 
List of tuples 231 | # (source start file, target name, title, author, 232 | # dir menu entry, description, category) 233 | texinfo_documents = [ 234 | (master_doc, 'SpinningUp', 'Spinning Up Documentation', 235 | author, 'SpinningUp', 'One line description of project.', 236 | 'Miscellaneous'), 237 | ] 238 | 239 | 240 | def setup(app): 241 | app.add_stylesheet('css/modify.css') -------------------------------------------------------------------------------- /docs/docs_requirements.txt: -------------------------------------------------------------------------------- 1 | cloudpickle~=1.2.1 2 | gym~=0.15.3 3 | ipython 4 | joblib 5 | matplotlib 6 | numpy 7 | pandas 8 | pytest 9 | psutil 10 | scipy 11 | seaborn==0.8.1 12 | sphinx==1.5.6 13 | sphinx-autobuild==0.7.1 14 | sphinx-rtd-theme==0.4.1 15 | tensorflow>=1.8.0,<2.0 16 | tqdm -------------------------------------------------------------------------------- /docs/etc/acknowledgements.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Acknowledgements 3 | ================ 4 | 5 | We gratefully acknowledge the contributions of the many people who helped get this project off of the ground, including people who beta tested the software, gave feedback on the material, improved dependencies of Spinning Up code in service of this release, or otherwise supported the project. Given the number of people who were involved at various points, this list of names may not be exhaustive. (If you think you should have been listed here, please do not hesitate to reach out.) 6 | 7 | In no particular order, thank you Alex Ray, Amanda Askell, Ben Garfinkel, Christy Dennison, Coline Devin, Daniel Zeigler, Dylan Hadfield-Menell, Ge Yang, Greg Khan, Jack Clark, Jonas Rothfuss, Larissa Schiavo, Leandro Castelao, Lilian Weng, Maddie Hall, Matthias Plappert, Miles Brundage, Peter Zokhov, and Pieter Abbeel. 8 | 9 | We are also grateful to Pieter Abbeel's group at Berkeley, and the Center for Human-Compatible AI, for giving feedback on presentations about Spinning Up. -------------------------------------------------------------------------------- /docs/etc/author.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | About the Author 3 | ================ 4 | 5 | Spinning Up in Deep RL was primarily developed by Josh Achiam, a research scientist on the OpenAI Safety Team and PhD student at UC Berkeley advised by Pieter Abbeel. Josh studies topics related to safety in deep reinforcement learning, and has previously published work on `safe exploration`_. 6 | 7 | .. 
_`safe exploration`: https://arxiv.org/abs/1705.10528 -------------------------------------------------------------------------------- /docs/images/alphago.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/alphago.jpg -------------------------------------------------------------------------------- /docs/images/ex2-1_trpo_hopper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/ex2-1_trpo_hopper.png -------------------------------------------------------------------------------- /docs/images/ex2-2_ddpg_bug_pytorch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/ex2-2_ddpg_bug_pytorch.png -------------------------------------------------------------------------------- /docs/images/knocked-over-stand-up.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/knocked-over-stand-up.mp4 -------------------------------------------------------------------------------- /docs/images/knocked_down_standup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/knocked_down_standup.png -------------------------------------------------------------------------------- /docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/logo.png -------------------------------------------------------------------------------- /docs/images/ms_pacman.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/ms_pacman.png -------------------------------------------------------------------------------- /docs/images/openai-favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/openai-favicon.png -------------------------------------------------------------------------------- /docs/images/openai-favicon2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/openai-favicon2.png -------------------------------------------------------------------------------- /docs/images/openai-favicon2_32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/openai-favicon2_32x32.png -------------------------------------------------------------------------------- /docs/images/recolored_logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/recolored_logo.png -------------------------------------------------------------------------------- /docs/images/rl_algorithms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/rl_algorithms.png -------------------------------------------------------------------------------- /docs/images/rl_algorithms.xml: -------------------------------------------------------------------------------- 1 | 7Z3bdps4FIafxpfpQuLoS+fYWaudpklXZzp3CpZtphh5sNIkffoRRtiAwCFGknGr9KJBCCH4vy1tbUlkZF8sn29StFp8JFMcj6A1fR7ZlyMIge0B9l+W8pKn+K6TJ8zTaMoz7RLuo5+YJ1o89TGa4nUlIyUkptGqmhiSJMEhraShNCVP1WwzElfvukJzLCTchygWU/+KpnRR1A66uxPvcTRf8Ft7xYkHFH6fp+Qx4fcbQXu2+clPL1FRFn/Q9QJNyVMpyb4a2RcpITT/bfl8gePs3RavLb/uuuXstt4pTminC3yMPM/HwA0hq2p4Bse8YvSleBl4yt4NPyQpXZA5SVB8tUs93zwwzooE7GhBlzH/NUYPOD7fvpMLEpOUnUpIkl22piilk0yuWtp1FGclWMUxB8RlxziZFleEMVqvo/DLIkryE/wykB+VLvoXU/rCj9EjJSxp9yAfCFnxq9Y0Jd9xUUumnbX52Z4pWMjyzkhCr9EyijPEv+J0ihLEk/mdAn7YVB5+jujf2SO+c/nRN16uqCAXdU0e05BLAvOkTJlSHi7xDSZLTNMXliHFMaLRjyrViBvHfJtve+ktidhdocUN2QXeOw42N+TAqpaRV4pfVubs1ZIArBXFhJ5jKhTFfik90i5pA3JHqG3LQK0D6oRJVKI6O/zGH7kV61z2PMk+FtYCjF259myhKF8j18BwfQpcC8L5Jwe6PQZHBJ2/rx8ofsSFY+XFlGtWsQDvv0dSnDhbb8SdsAzAWz3vTrLf5tn/G8/17DrFmGW5+1AUyuqXl5vnEoysakJPi4ji+xXadM5PzCGumtXWxbM62FgN6ak/fmhDmtlPKedshr0w7IQ6gHtYR3E0TzJ7ZTzjdB/gP3BK8fNefAt2xEbS4ew8lbxrl6ctyo611c58BbA9NMH+8LhN8GS8WJN4TtKILpZrA48SeByn3vDoZMdWxE7e8JyjNQPBtDzK4HGd8TFbHse4Z8MfSx/LOXOsoMYmdN0DnTOr3sFuG0n5zplroD6FMUegh+IO6HWl2IF2vaixMoo9Q/EpUKypLZZIMfCFZl0dxaCfh2FVMd6qBSpavXsLW51UBZri12Bc18LuGP9gxodeStlWWYb1m+7kWTVx8zIPlrpfv1uXutVnUgpB2e/yG7iwlHDRX0wHSBazX/dzEmKOh6qlbMP0f30tARiomK5kw/R1G6Yliin6foLfd7Cn57jZvy0JNT0PQcMJepLQNZji67az05fG1yVNYKR5ozSeLmnyIqQHsG9JHIVZjT+taLSMfrJak8REsZVEsYEYGPF0zp/Zx7duqQ5OINojHIaD44D6ZJfsYWTPOYmTEBOoiX32F1P2MLJnLP4kxITDiAk0iCnbMjUNPZQLpia411+wvWNF8fqg3ukW1yuI4/r9Vl1LiSEM2IFuoEzbsLNY8Wyk6SyNrmFn0G/56+8oja5hJ7+39GHn57MPGKVJlMzNWFPTWk2tY80i0CwdnC93t58Gh4wXBvhh1gWZKcLBbKjIOHXPeNxADGwgpj7HflB0ol9Y2Kzk0LLITtPOnsD2661X3Wnvuo7Dt6G+NXWwXwDdQKxrg5qaMGEDfPZ+9LpSHAgLoOEYKKPY7B0+DYrVhNTUUexZdRdDIcX2AJYASI3RFXJXEHCVIPDWIJ0H68LKnu+wB7BqQL2azkDVlD3h4cDfQE1nGHORDWrKtk3+8NIH+5sYUfZWF9m+Xv55moGN/X+NcJFbBIKOESxydIfmlbcGTc7aMCazxZG19J5aVejvhr0U0xroaA18T4iZ6GwPCjdIOkF/wMngiPk1YseeN642KjpDx8UgSDov919vDC96eGnazq+MF1WzmpN4tUD/4JQYapRQEwTWEalRtLbHL7uptbigLwQG5fqpDUsL5PupnYcBqhzHj+cfr41FamnHHZ0W6TR91aemZybGquWptt+dRQ9FdqvlnbS+AMcVxlNdXwGo7yc/aKGFqr7swgXGZrrYzJbC142mWICqc1mFqtjc57uzy89/GkTUIAIsnYyo6ngNIPIB0QIEVATE+6s7A4SiFkNnr+I0Reg0e17CcsXujpeMVzBWZCK3Zn1rRxNxWuAaRqcKVDleE8jubWX1gNYkq4eBRQUsOpvT4jta6nb336RoGuENHoYWmbRo6W2b5jc097asEi3zxnq6W8dTZCL3E9OEthpFA0oVO3GH0986qr6Q8uXSNoCoAUTrkEVVoPTy8tbMEssGRAcRvrr5pq9XhogDiXjjfJPOdSZ+k5te07O0N4UrUdavdWd4216Q3VRul13j4uaQvZbVtH6rSOu5ZURcdCx8daPzh5iB8AEPt1aUxA94dJhSbPxUwBtVHgkLUA9T2X99WKJU5Ve/rTIEldnh7i9x5tl3f+7Uvvof -------------------------------------------------------------------------------- /docs/images/rl_algorithms_9_12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/rl_algorithms_9_12.png -------------------------------------------------------------------------------- /docs/images/rl_algorithms_9_15.xml: 
-------------------------------------------------------------------------------- 1 | 7Z1bd6MqFMc/TR47S/Cax/Q6Z62ZM5121lzOG1WSeMZIjrHTdj79wQiJikltBWI6tA+NiAT9/zZsNmBH9tni8SpDy/lHEuFkBK3ocWSfjyAEtgfonyLlqUzxXadMmGVxxDJtE27j35glWiz1Po7wqpYxJyTJ42U9MSRpisO8loayjDzUs01JUv/WJZphIeE2RImY+i2O8jmvHXS3J97jeDZnX+3xE3co/DnLyH3Kvm8E7en6pzy9QLwsdqOrOYrIQyXJvhjZZxkheflp8XiGk+LZ8sdWXne54+ym3hlO804X+Bh5no+BG0Ja1fAEjlnF8if+MHBEnw07JFk+JzOSouRim3q6vmFcFAno0TxfJOxjgu5wcrp5JmckIRk9lZK0uGyVoyyfFHI10i7jpCjB4scMEJce4zTiV4QJWq3i8Ms8TssT7DJQHlUu+hfn+RM7Rvc5oUnbG/lAyJJdtcoz8hPzWlLtrPXP5gxnocg7JWl+iRZxUiD+FWcRShFLZt8UsMO28vBjnH8vbvGdy45+sHJFBZmoK3KfhUwSWCYVylTyMImvMFngPHuiGTKcoDz+VacaMeOYbfJtLr0mMf1WaDFDdoH3joHNDDmw6mWUlWKXVTl7tiQAG0VRoWc4F4qiHyq3tE1ag9wRatsyUOuAOqUSVaguDn+wW96JdSl7mWQfCmsBxq5ce7ZQlK+Ra2C4PgauBeH8owPdHoMDgs6e1y+U3GPuWHlJzjSrWYD33z3hJ05Wa3EnNAPwlo/bk/TTrPi79lxPLjOMaZabD7xQWr+y3DKXYGR1E3qYxzm+XaJ15/xAHeK6WW1cPKuDjTWQjvzx3S6kqf1Uck6n2AvDTqgDuId1lMSztLBXyjPO9gH+C2c5ftyLL2dHbCQdxs5Dxbt2Wdq86lhbu5mvAbaHJtgfHrcNnoIXa5LMSBbn88XKwKMEHsdpNjw62bEVsVM2PKdoRUEwLY8yeFxnfMiWxzHu2fDH0odyzhwraLAJXfeVzpnV7GA3jaR858w1UB/DmCPQQ3EH9LpS7EC7WdRYGcWeofgYKNbUFkukGPhCs66OYtDPw7DqGG/UAjWt3r2ErU6qAk3xazBuamF3jH9Q40NPlWzLIsPqRd/kWQ1xyzJfLXW/frcp9U6fSSkEVb/Lb+HCUsJFfzEdIFnMft3PUYg5HqqWsg3Tf/taAjBQMV3JhunrNkxLFFP0/QS/79WenuMWvz1IcKSPK7oGU3zddnZ00kh3ljtLExhp9kvjHUyasgjpAexrksRhUeNPyzxexL9prUlqothKothADIx4OufP7MNbt1QHJxANFA7DwXFAc7JL9jCy55zEUYgJ1MQ++4spexjZMxZ/FGLCYcQEWsSUbZmahh7KBVMT3Osv2N6xonh90Ox0+fUK4rh+v1XXUmIIw3GgW6A63LCTr3g20uyS5mDDzqDf8tc/QJqDDTtZZaQPOz+ffMAoS+N0ZsaamtZqah1r8kCzdHC+3Fx/GhwyXhjgu2kXZCKEg+lQkXGanvG4hRjYQkxzjv1V0Yl+YWGzkkPLIjtNO3sC22+2Xk2nves6Dt+G+tbUwX4BdAOxrg1qasKELfDZ+9HrSnEgLICGY6CMYrN3+DgoVhNSU0exZzVdDIUU2wNYAiA1RsflriHgKkHgpUE6DzaFlT3fYQ9g1YB6NZ2Bqil7wsOBf4CazjDmIlvUlG2b7OalD/bXMaLiqc6Lfb3s9TQDG/u/jXCRywNBhwgWObpD88pbgzZnbRiT2eLIWnpPrSr0d0UfimkNdLQGvifETHS2B9wNkk7QX3AyOGLeRuzY88b1RkVn6JgPgqTz8o1kScTbmuG9GOKNotO2s18ZOqomOCfJco7+wRkx1CihJgisA1KjaJmPX/VYGyFCX4gRynVZW5YdyHdZO48IVPmQH08/XhqL1NKOOzot0ml7wU9Dz0KMZfcb3byVFt3xEqy9D8BxhaFV10cAmlvLX7XmQlVfduYCYzNdbGZD4fNGw9ei6lxhoSpM9/nm5Pzz3wYRNYgASycjqjpeA4h8QLQAARUB8f7ixgChqMXQ2as4bcE6zZ6XsHKxu+Ml4xGMFZnItVnq2tFEnB0kDaNTBaocrwmk320V9YDWpKiHgUUFLDqbU/5KLXUb/a8yFMV4jYehRSYtWnrbtqkOzb0trcSOKWQ93S3fDSXdRG4npgntZhTuDpaG0d86ql6W8uXcNoCoAUTrkEVVoPT8/PrKACIZEB1E+Ormm75eGCL0zDfpXHLit7npDT0r21SYElX9dm4S37UtZO9U7qu2hVQeVNtSLp7Wc/eIuP5YeAFH53cyA+FdHm6jKInv8ugwpdj61oAXqjwS1qJ2Upk3WYNR+dnXrAxBZXq4/aecZfbtfz61L/4H -------------------------------------------------------------------------------- /docs/images/rl_diagram_transparent_bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/rl_diagram_transparent_bg.png -------------------------------------------------------------------------------- /docs/images/spinning-up-in-rl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/spinning-up-in-rl.png -------------------------------------------------------------------------------- /docs/images/spinning-up-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/spinning-up-logo.png -------------------------------------------------------------------------------- 
/docs/images/spinning-up-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | 13 | adsfdsfgArtboard 2 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/images/spinning-up-logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/docs/images/spinning-up-logo2.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Spinning Up documentation master file, created by 2 | sphinx-quickstart on Wed Aug 15 04:21:07 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Spinning Up in Deep RL! 7 | ================================== 8 | 9 | .. image:: images/spinning-up-in-rl.png 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | :caption: User Documentation 14 | 15 | user/introduction 16 | user/installation 17 | user/algorithms 18 | user/running 19 | user/saving_and_loading 20 | user/plotting 21 | 22 | .. toctree:: 23 | :maxdepth: 2 24 | :caption: Introduction to RL 25 | 26 | spinningup/rl_intro 27 | spinningup/rl_intro2 28 | spinningup/rl_intro3 29 | 30 | .. toctree:: 31 | :maxdepth: 2 32 | :caption: Resources 33 | 34 | spinningup/spinningup 35 | spinningup/keypapers 36 | spinningup/exercises 37 | spinningup/bench 38 | 39 | .. toctree:: 40 | :maxdepth: 2 41 | :caption: Algorithms Docs 42 | 43 | algorithms/vpg 44 | algorithms/trpo 45 | algorithms/ppo 46 | algorithms/ddpg 47 | algorithms/td3 48 | algorithms/sac 49 | 50 | .. toctree:: 51 | :maxdepth: 2 52 | :caption: Utilities Docs 53 | 54 | utils/logger 55 | utils/plotter 56 | utils/mpi 57 | utils/run_utils 58 | 59 | .. toctree:: 60 | :maxdepth: 2 61 | :caption: Etc. 62 | 63 | etc/acknowledgements 64 | etc/author 65 | 66 | Indices and tables 67 | ================== 68 | 69 | * :ref:`genindex` 70 | * :ref:`modindex` 71 | * :ref:`search` 72 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=SpinningUp 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/spinningup/bench.rst: -------------------------------------------------------------------------------- 1 | ========================================== 2 | Benchmarks for Spinning Up Implementations 3 | ========================================== 4 | 5 | .. contents:: Table of Contents 6 | 7 | We benchmarked the Spinning Up algorithm implementations in five environments from the MuJoCo_ Gym task suite: HalfCheetah, Hopper, Walker2d, Swimmer, and Ant. 8 | 9 | .. _MuJoCo: https://gym.openai.com/envs/#mujoco 10 | 11 | Performance in Each Environment 12 | =============================== 13 | 14 | HalfCheetah: PyTorch Versions 15 | ----------------------------- 16 | 17 | .. figure:: ../images/plots/pyt/pytorch_halfcheetah_performance.svg 18 | :align: center 19 | 20 | 3M timestep benchmark for HalfCheetah-v3 using **PyTorch** implementations. 21 | 22 | 23 | HalfCheetah: Tensorflow Versions 24 | -------------------------------- 25 | 26 | .. figure:: ../images/plots/tf1/tensorflow_halfcheetah_performance.svg 27 | :align: center 28 | 29 | 3M timestep benchmark for HalfCheetah-v3 using **Tensorflow** implementations. 30 | 31 | 32 | 33 | Hopper: PyTorch Versions 34 | ------------------------ 35 | 36 | .. figure:: ../images/plots/pyt/pytorch_hopper_performance.svg 37 | :align: center 38 | 39 | 3M timestep benchmark for Hopper-v3 using **PyTorch** implementations. 40 | 41 | 42 | Hopper: Tensorflow Versions 43 | --------------------------- 44 | 45 | .. figure:: ../images/plots/tf1/tensorflow_hopper_performance.svg 46 | :align: center 47 | 48 | 3M timestep benchmark for Hopper-v3 using **Tensorflow** implementations. 49 | 50 | 51 | 52 | 53 | Walker2d: PyTorch Versions 54 | -------------------------- 55 | 56 | .. figure:: ../images/plots/pyt/pytorch_walker2d_performance.svg 57 | :align: center 58 | 59 | 3M timestep benchmark for Walker2d-v3 using **PyTorch** implementations. 60 | 61 | 62 | Walker2d: Tensorflow Versions 63 | ----------------------------- 64 | 65 | .. figure:: ../images/plots/tf1/tensorflow_walker2d_performance.svg 66 | :align: center 67 | 68 | 3M timestep benchmark for Walker2d-v3 using **Tensorflow** implementations. 69 | 70 | 71 | 72 | Swimmer: PyTorch Versions 73 | ------------------------- 74 | 75 | .. figure:: ../images/plots/pyt/pytorch_swimmer_performance.svg 76 | :align: center 77 | 78 | 3M timestep benchmark for Swimmer-v3 using **PyTorch** implementations. 79 | 80 | 81 | Swimmer: Tensorflow Versions 82 | ---------------------------- 83 | 84 | .. figure:: ../images/plots/tf1/tensorflow_swimmer_performance.svg 85 | :align: center 86 | 87 | 3M timestep benchmark for Swimmer-v3 using **Tensorflow** implementations. 88 | 89 | 90 | 91 | Ant: PyTorch Versions 92 | ------------------------ 93 | 94 | .. figure:: ../images/plots/pyt/pytorch_ant_performance.svg 95 | :align: center 96 | 97 | 3M timestep benchmark for Ant-v3 using **PyTorch** implementations. 98 | 99 | 100 | Ant: Tensorflow Versions 101 | --------------------------- 102 | 103 | .. figure:: ../images/plots/tf1/tensorflow_ant_performance.svg 104 | :align: center 105 | 106 | 3M timestep benchmark for Ant-v3 using **Tensorflow** implementations. 
107 | 108 | 109 | Experiment Details 110 | ================== 111 | 112 | **Random seeds.** All experiments were run for 10 random seeds each. Graphs show the average (solid line) and std dev (shaded) of performance over random seed over the course of training. 113 | 114 | **Performance metric.** Performance for the on-policy algorithms is measured as the average trajectory return across the batch collected at each epoch. Performance for the off-policy algorithms is measured once every 10,000 steps by running the deterministic policy (or, in the case of SAC, the mean policy) without action noise for ten trajectories, and reporting the average return over those test trajectories. 115 | 116 | **Network architectures.** The on-policy algorithms use networks of size (64, 32) with tanh units for both the policy and the value function. The off-policy algorithms use networks of size (256, 256) with relu units. 117 | 118 | **Batch size.** The on-policy algorithms collected 4000 steps of agent-environment interaction per batch update. The off-policy algorithms used minibatches of size 100 at each gradient descent step. 119 | 120 | All other hyperparameters are left at default settings for the Spinning Up implementations. See algorithm pages for details. 121 | 122 | Learning curves are smoothed by averaging over a window of 11 epochs. 123 | 124 | .. admonition:: You Should Know 125 | 126 | By comparison to the literature, the Spinning Up implementations of DDPG, TD3, and SAC are roughly at-parity with the best reported results for these algorithms. As a result, you can use the Spinning Up implementations of these algorithms for research purposes. 127 | 128 | The Spinning Up implementations of VPG, TRPO, and PPO are overall a bit weaker than the best reported results for these algorithms. This is due to the absence of some standard tricks (such as observation normalization and normalized value regression targets) from our implementations. For research comparisons, you should use the implementations of TRPO or PPO from `OpenAI Baselines`_. 129 | 130 | .. _`OpenAI Baselines`: https://github.com/openai/baselines 131 | 132 | 133 | PyTorch vs Tensorflow 134 | ===================== 135 | 136 | 137 | We provide graphs for head-to-head comparisons between the PyTorch and Tensorflow implementations of each algorithm at the following pages: 138 | 139 | * `VPG Head-to-Head`_ 140 | 141 | * `PPO Head-to-Head`_ 142 | 143 | * `DDPG Head-to-Head`_ 144 | 145 | * `TD3 Head-to-Head`_ 146 | 147 | * `SAC Head-to-Head`_ 148 | 149 | .. _`VPG Head-to-Head`: ../spinningup/bench_vpg.html 150 | .. _`PPO Head-to-Head`: ../spinningup/bench_ppo.html 151 | .. _`DDPG Head-to-Head`: ../spinningup/bench_ddpg.html 152 | .. _`TD3 Head-to-Head`: ../spinningup/bench_td3.html 153 | .. _`SAC Head-to-Head`: ../spinningup/bench_sac.html -------------------------------------------------------------------------------- /docs/spinningup/bench_ddpg.rst: -------------------------------------------------------------------------------- 1 | DDPG Head-to-Head 2 | ================= 3 | 4 | HalfCheetah 5 | ----------- 6 | 7 | .. figure:: ../images/plots/ddpg/ddpg_halfcheetah_performance.svg 8 | :align: center 9 | 10 | 11 | Hopper 12 | ------ 13 | 14 | .. figure:: ../images/plots/ddpg/ddpg_hopper_performance.svg 15 | :align: center 16 | 17 | 18 | Walker2d 19 | -------- 20 | 21 | .. figure:: ../images/plots/ddpg/ddpg_walker2d_performance.svg 22 | :align: center 23 | 24 | Swimmer 25 | ------- 26 | 27 | .. 
figure:: ../images/plots/ddpg/ddpg_swimmer_performance.svg 28 | :align: center 29 | 30 | 31 | Ant 32 | --- 33 | 34 | .. figure:: ../images/plots/ddpg/ddpg_ant_performance.svg 35 | :align: center -------------------------------------------------------------------------------- /docs/spinningup/bench_ppo.rst: -------------------------------------------------------------------------------- 1 | Proximal Policy Optimization Head-to-Head 2 | ========================================= 3 | 4 | HalfCheetah 5 | ----------- 6 | 7 | .. figure:: ../images/plots/ppo/ppo_halfcheetah_performance.svg 8 | :align: center 9 | 10 | 11 | Hopper 12 | ------ 13 | 14 | .. figure:: ../images/plots/ppo/ppo_hopper_performance.svg 15 | :align: center 16 | 17 | 18 | Walker2d 19 | -------- 20 | 21 | .. figure:: ../images/plots/ppo/ppo_walker2d_performance.svg 22 | :align: center 23 | 24 | Swimmer 25 | ------- 26 | 27 | .. figure:: ../images/plots/ppo/ppo_swimmer_performance.svg 28 | :align: center 29 | 30 | 31 | Ant 32 | --- 33 | 34 | .. figure:: ../images/plots/ppo/ppo_ant_performance.svg 35 | :align: center -------------------------------------------------------------------------------- /docs/spinningup/bench_sac.rst: -------------------------------------------------------------------------------- 1 | SAC Head-to-Head 2 | ================= 3 | 4 | HalfCheetah 5 | ----------- 6 | 7 | .. figure:: ../images/plots/sac/sac_halfcheetah_performance.svg 8 | :align: center 9 | 10 | 11 | Hopper 12 | ------ 13 | 14 | .. figure:: ../images/plots/sac/sac_hopper_performance.svg 15 | :align: center 16 | 17 | 18 | Walker2d 19 | -------- 20 | 21 | .. figure:: ../images/plots/sac/sac_walker2d_performance.svg 22 | :align: center 23 | 24 | Swimmer 25 | ------- 26 | 27 | .. figure:: ../images/plots/sac/sac_swimmer_performance.svg 28 | :align: center 29 | 30 | 31 | Ant 32 | --- 33 | 34 | .. figure:: ../images/plots/sac/sac_ant_performance.svg 35 | :align: center -------------------------------------------------------------------------------- /docs/spinningup/bench_td3.rst: -------------------------------------------------------------------------------- 1 | TD3 Head-to-Head 2 | ================= 3 | 4 | HalfCheetah 5 | ----------- 6 | 7 | .. figure:: ../images/plots/td3/td3_halfcheetah_performance.svg 8 | :align: center 9 | 10 | 11 | Hopper 12 | ------ 13 | 14 | .. figure:: ../images/plots/td3/td3_hopper_performance.svg 15 | :align: center 16 | 17 | 18 | Walker2d 19 | -------- 20 | 21 | .. figure:: ../images/plots/td3/td3_walker2d_performance.svg 22 | :align: center 23 | 24 | Swimmer 25 | ------- 26 | 27 | .. figure:: ../images/plots/td3/td3_swimmer_performance.svg 28 | :align: center 29 | 30 | 31 | Ant 32 | --- 33 | 34 | .. figure:: ../images/plots/td3/td3_ant_performance.svg 35 | :align: center -------------------------------------------------------------------------------- /docs/spinningup/bench_vpg.rst: -------------------------------------------------------------------------------- 1 | Vanilla Policy Gradients Head-to-Head 2 | ===================================== 3 | 4 | HalfCheetah 5 | ----------- 6 | 7 | .. figure:: ../images/plots/vpg/vpg_halfcheetah_performance.svg 8 | :align: center 9 | 10 | 11 | Hopper 12 | ------ 13 | 14 | .. figure:: ../images/plots/vpg/vpg_hopper_performance.svg 15 | :align: center 16 | 17 | 18 | Walker2d 19 | -------- 20 | 21 | .. figure:: ../images/plots/vpg/vpg_walker2d_performance.svg 22 | :align: center 23 | 24 | Swimmer 25 | ------- 26 | 27 | .. 
figure:: ../images/plots/vpg/vpg_swimmer_performance.svg 28 | :align: center 29 | 30 | 31 | Ant 32 | --- 33 | 34 | .. figure:: ../images/plots/vpg/vpg_ant_performance.svg 35 | :align: center -------------------------------------------------------------------------------- /docs/spinningup/exercise2_1_soln.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Solution to Exercise 2.1 3 | ======================== 4 | 5 | .. figure:: ../images/ex2-1_trpo_hopper.png 6 | :align: center 7 | 8 | Learning curves for TRPO in Hopper-v2 with different values of ``train_v_iters``, averaged over three random seeds. 9 | 10 | 11 | The difference is quite substantial: with a trained value function, the agent is able to quickly make progress. With an untrained value function, the agent gets stuck early on. -------------------------------------------------------------------------------- /docs/spinningup/exercise2_2_soln.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Solution to Exercise 2.2 3 | ======================== 4 | 5 | .. figure:: ../images/ex2-2_ddpg_bug.svg 6 | :align: center 7 | 8 | Learning curves for DDPG in HalfCheetah-v2 for bugged and non-bugged actor-critic implementations, averaged over three random seeds. 9 | 10 | 11 | .. admonition:: You Should Know 12 | 13 | This page will give the solution primarily in terms of a detailed analysis of the Tensorflow version of this exercise. However, the problem in the PyTorch version is basically the same and so is its solution. 14 | 15 | 16 | The Bug in the Code: Tensorflow Version 17 | ======================================= 18 | 19 | The only difference between the correct actor-critic code, 20 | 21 | .. code-block:: python 22 | :emphasize-lines: 11, 13 23 | 24 | """ 25 | Actor-Critic 26 | """ 27 | def mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu, 28 | output_activation=tf.tanh, action_space=None): 29 | act_dim = a.shape.as_list()[-1] 30 | act_limit = action_space.high[0] 31 | with tf.variable_scope('pi'): 32 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 33 | with tf.variable_scope('q'): 34 | q = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 35 | with tf.variable_scope('q', reuse=True): 36 | q_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 37 | return pi, q, q_pi 38 | 39 | and the bugged actor-critic code, 40 | 41 | .. code-block:: python 42 | :emphasize-lines: 11, 13 43 | 44 | """ 45 | Bugged Actor-Critic 46 | """ 47 | def bugged_mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu, 48 | output_activation=tf.tanh, action_space=None): 49 | act_dim = a.shape.as_list()[-1] 50 | act_limit = action_space.high[0] 51 | with tf.variable_scope('pi'): 52 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 53 | with tf.variable_scope('q'): 54 | q = mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None) 55 | with tf.variable_scope('q', reuse=True): 56 | q_pi = mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None) 57 | return pi, q, q_pi 58 | 59 | is the tensor shape for the Q-functions. The correct version squeezes ouputs so that they have shape ``[batch size]``, whereas the bugged version doesn't, resulting in Q-functions with shape ``[batch size, 1]``. 
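As a quick way to see the difference, you can inspect the static shapes of the two outputs directly. The snippet below is only an illustration of the shape bookkeeping (it is not part of the exercise code, and the placeholder sizes are made up):

.. code-block:: python

    import tensorflow as tf

    # Stand-ins for the two Q-function outputs, with an unknown batch dimension.
    bugged_q = tf.placeholder(tf.float32, shape=(None, 1))   # shape the bugged code produces
    correct_q = tf.squeeze(bugged_q, axis=1)                 # shape the correct code produces

    print(bugged_q.shape)    # (?, 1)  -- rank 2, will silently broadcast against rank-1 tensors
    print(correct_q.shape)   # (?,)    -- rank 1, matching r_ph and d_ph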
60 | 61 | 62 | The Bug in the Code: PyTorch Version 63 | ==================================== 64 | 65 | In the PyTorch version of the exercise, the difference is virtually the same. The correct actor-critic code computes a forward pass on the Q-function that squeezes its output: 66 | 67 | 68 | .. code-block:: python 69 | :emphasize-lines: 12 70 | 71 | """ 72 | Correct Q-Function 73 | """ 74 | class MLPQFunction(nn.Module): 75 | 76 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 77 | super().__init__() 78 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 79 | 80 | def forward(self, obs, act): 81 | q = self.q(torch.cat([obs, act], dim=-1)) 82 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 83 | 84 | 85 | while the bugged version does not: 86 | 87 | .. code-block:: python 88 | :emphasize-lines: 11 89 | 90 | """ 91 | Bugged Q-Function 92 | """ 93 | class BuggedMLPQFunction(nn.Module): 94 | 95 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 96 | super().__init__() 97 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 98 | 99 | def forward(self, obs, act): 100 | return self.q(torch.cat([obs, act], dim=-1)) 101 | 102 | How it Gums Up the Works: Tensorflow Version 103 | ============================================ 104 | 105 | Consider the excerpt from the part in the code that builds the DDPG computation graph: 106 | 107 | .. code-block:: python 108 | 109 | # Bellman backup for Q function 110 | backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ) 111 | 112 | # DDPG losses 113 | pi_loss = -tf.reduce_mean(q_pi) 114 | q_loss = tf.reduce_mean((q-backup)**2) 115 | 116 | This is where the tensor shape issue comes into play. It's important to know that ``r_ph`` and ``d_ph`` have shape ``[batch size]``. 117 | 118 | The line that produces the Bellman backup was written with the assumption that it would add together tensors with the same shape. However, this line can **also** add together tensors with different shapes, as long as they're broadcast-compatible. 119 | 120 | Tensors with shapes ``[batch size]`` and ``[batch size, 1]`` are broadcast compatible, but the behavior is not actually what you might expect! Check out this example: 121 | 122 | >>> import tensorflow as tf 123 | >>> import numpy as np 124 | >>> x = tf.constant(np.arange(5)) 125 | >>> y = tf.constant(np.arange(5).reshape(-1,1)) 126 | >>> z1 = x * y 127 | >>> z2 = x + y 128 | >>> z3 = x + z1 129 | >>> x.shape 130 | TensorShape([Dimension(5)]) 131 | >>> y.shape 132 | TensorShape([Dimension(5), Dimension(1)]) 133 | >>> z1.shape 134 | TensorShape([Dimension(5), Dimension(5)]) 135 | >>> z2.shape 136 | TensorShape([Dimension(5), Dimension(5)]) 137 | >>> sess = tf.InteractiveSession() 138 | >>> sess.run(z1) 139 | array([[ 0, 0, 0, 0, 0], 140 | [ 0, 1, 2, 3, 4], 141 | [ 0, 2, 4, 6, 8], 142 | [ 0, 3, 6, 9, 12], 143 | [ 0, 4, 8, 12, 16]]) 144 | >>> sess.run(z2) 145 | array([[0, 1, 2, 3, 4], 146 | [1, 2, 3, 4, 5], 147 | [2, 3, 4, 5, 6], 148 | [3, 4, 5, 6, 7], 149 | [4, 5, 6, 7, 8]]) 150 | >>> sess.run(z3) 151 | array([[ 0, 1, 2, 3, 4], 152 | [ 0, 2, 4, 6, 8], 153 | [ 0, 3, 6, 9, 12], 154 | [ 0, 4, 8, 12, 16], 155 | [ 0, 5, 10, 15, 20]]) 156 | 157 | Adding or multiplying a shape ``[5]`` tensor by a shape ``[5,1]`` tensor returns a shape ``[5,5]`` tensor! 158 | 159 | When you don't squeeze the Q-functions, ``q_pi_targ`` has shape ``[batch size, 1]``, and the backup---and in turn, the whole Q-loss---gets totally messed up. 
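To see exactly how, here is a minimal NumPy sketch of the bugged backup computation (NumPy follows the same broadcasting rules; the values are placeholders, only the shapes matter):

.. code-block:: python

    import numpy as np

    batch_size = 5
    gamma = 0.99
    r = np.arange(batch_size, dtype=np.float32)              # like r_ph, shape (5,)
    d = np.zeros(batch_size, dtype=np.float32)               # like d_ph, shape (5,)
    q_pi_targ = np.ones((batch_size, 1), dtype=np.float32)   # the bugged shape, (5, 1)

    backup = r + gamma * (1 - d) * q_pi_targ
    print(backup.shape)   # (5, 5) -- a matrix, instead of the intended (5,)

Tracing through where each bad broadcast comes from: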
160 | 161 | Broadcast error 1: ``(1 - d_ph) * q_pi_targ`` becomes a ``[batch size, batch size]`` tensor containing the outer product of the mask with the target network Q-values. 162 | 163 | Broadcast error 2: ``r_ph`` then gets treated as a row vector and added to each row of ``(1 - d_ph) * q_pi_targ`` separately. 164 | 165 | Broadcast error 3: ``q_loss`` depends on ``q - backup``, which involves another bad broadcast between ``q`` (shape ``[batch size, 1]``) and ``backup`` (shape ``[batch size, batch size]``). 166 | 167 | To put it mathematically: let :math:`q`, :math:`q'`, :math:`r`, :math:`d` denote vectors containing the q-values, target q-values, rewards, and dones for a given batch, where there are :math:`n` entries in the batch. The correct backup is 168 | 169 | .. math:: 170 | 171 | z_i = r_i + \gamma (1-d_i) q'_i, 172 | 173 | and the correct loss function is 174 | 175 | .. math:: 176 | 177 | \frac{1}{n} \sum_{i=1}^n (q_i - z_i)^2. 178 | 179 | But with these errors, what gets computed is a backup *matrix*, 180 | 181 | .. math:: 182 | 183 | z_{ij} = r_j + \gamma (1-d_j) q'_i, 184 | 185 | and a messed up loss function 186 | 187 | .. math:: 188 | 189 | \frac{1}{n^2} \sum_{i=1}^n \sum_{j=1}^n (q_j - z_{ij})^2. 190 | 191 | If you leave this to run in HalfCheetah long enough, you'll actually see some non-trivial learning process, because weird details specific to this environment partly cancel out the errors. But almost everywhere else, it fails completely. 192 | 193 | 194 | How it Gums Up the Works: PyTorch Version 195 | ========================================= 196 | 197 | Exactly the same broadcasting shenanigans as in the Tensorflow version. Check out `this note`_ in the PyTorch documentation about it. 198 | 199 | 200 | .. figure:: ../images/ex2-2_ddpg_bug_pytorch.png 201 | :align: center 202 | 203 | Learning curves for DDPG in HalfCheetah-v2 for bugged and non-bugged actor-critic implementations using PyTorch, averaged over three random seeds. 204 | 205 | 206 | 207 | .. _`this note`: https://pytorch.org/docs/stable/notes/broadcasting.html#backwards-compatibility -------------------------------------------------------------------------------- /docs/spinningup/exercises.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Exercises 3 | ========= 4 | 5 | 6 | .. contents:: Table of Contents 7 | :depth: 2 8 | 9 | Problem Set 1: Basics of Implementation 10 | --------------------------------------- 11 | 12 | .. admonition:: Exercise 1.1: Gaussian Log-Likelihood 13 | 14 | **Path to Exercise:** 15 | 16 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_1/exercise1_1.py`` 17 | 18 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_1/exercise1_1.py`` 19 | 20 | **Path to Solution:** 21 | 22 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_1_solutions/exercise1_1_soln.py`` 23 | 24 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_1_solutions/exercise1_1_soln.py`` 25 | 26 | 27 | **Instructions.** Write a function that takes in the means and log stds of a batch of diagonal Gaussian distributions, along with (previously-generated) samples from those distributions, and returns the log likelihoods of those samples. (In the Tensorflow version, you will write a function that creates computation graph operations to do this; in the PyTorch version, you will directly operate on given Tensors.) 28 | 29 | You may find it useful to review the formula given in `this section of the RL introduction`_. 
30 | 31 | Implement your solution in ``exercise1_1.py``, and run that file to automatically check your work. 32 | 33 | **Evaluation Criteria.** Your solution will be checked by comparing outputs against a known-good implementation, using a batch of random inputs. 34 | 35 | .. _`this section of the RL introduction`: ../spinningup/rl_intro.html#stochastic-policies 36 | 37 | 38 | .. admonition:: Exercise 1.2: Policy for PPO 39 | 40 | **Path to Exercise:** 41 | 42 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_1/exercise1_2.py`` 43 | 44 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_1/exercise1_2.py`` 45 | 46 | **Path to Solution:** 47 | 48 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_1_solutions/exercise1_2_soln.py`` 49 | 50 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_1_solutions/exercise1_2_soln.py`` 51 | 52 | **Instructions.** Implement an MLP diagonal Gaussian policy for PPO. 53 | 54 | Implement your solution in ``exercise1_2.py``, and run that file to automatically check your work. 55 | 56 | **Evaluation Criteria.** Your solution will be evaluated by running for 20 epochs in the InvertedPendulum-v2 Gym environment, and this should take in the ballpark of 3-5 minutes (depending on your machine, and other processes you are running in the background). The bar for success is reaching an average score of over 500 in the last 5 epochs, or getting to a score of 1000 (the maximum possible score) in the last 5 epochs. 57 | 58 | 59 | .. admonition:: Exercise 1.3: Computation Graph for TD3 60 | 61 | **Path to Exercise.** 62 | 63 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_1/exercise1_3.py`` 64 | 65 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_1/exercise1_3.py`` 66 | 67 | **Path to Solution.** 68 | 69 | * PyTorch version: ``spinup/algos/pytorch/td3/td3.py`` 70 | 71 | * Tensorflow version: ``spinup/algos/tf1/td3/td3.py`` 72 | 73 | **Instructions.** Implement the main mathematical logic for the TD3 algorithm. 74 | 75 | As starter code, you are given the entirety of the TD3 algorithm except for the main mathematical logic (essentially, the loss functions and intermediate calculations needed for them). Find "YOUR CODE HERE" to begin. 76 | 77 | You may find it useful to review the pseudocode in our `page on TD3`_. 78 | 79 | Implement your solution in ``exercise1_3.py``, and run that file to see the results of your work. There is no automatic checking for this exercise. 80 | 81 | **Evaluation Criteria.** Evaluate your code by running ``exercise1_3.py`` with HalfCheetah-v2, InvertedPendulum-v2, and one other Gym MuJoCo environment of your choosing (set via the ``--env`` flag). It is set up to use smaller neural networks (hidden sizes [128,128]) than typical for TD3, with a maximum episode length of 150, and to run for only 10 epochs. The goal is to see significant learning progress relatively quickly (in terms of wall clock time). Experiments will likely take on the order of ~10 minutes. 82 | 83 | Use the ``--use_soln`` flag to run Spinning Up's TD3 instead of your implementation. Anecdotally, within 10 epochs, the score in HalfCheetah should go over 300, and the score in InvertedPendulum should max out at 150. 84 | 85 | .. _`page on TD3`: ../algorithms/td3.html 86 | 87 | 88 | Problem Set 2: Algorithm Failure Modes 89 | -------------------------------------- 90 | 91 | .. admonition:: Exercise 2.1: Value Function Fitting in TRPO 92 | 93 | **Path to Exercise.** (Not applicable, there is no code for this one.) 
94 | 95 | **Path to Solution.** `Solution available here. <../spinningup/exercise2_1_soln.html>`_ 96 | 97 | Many factors can impact the performance of policy gradient algorithms, but few more drastically than the quality of the learned value function used for advantage estimation. 98 | 99 | In this exercise, you will compare results between runs of TRPO where you put lots of effort into fitting the value function (``train_v_iters=80``), versus where you put very little effort into fitting the value function (``train_v_iters=0``). 100 | 101 | **Instructions.** Run the following command: 102 | 103 | .. parsed-literal:: 104 | 105 | python -m spinup.run trpo --env Hopper-v2 --train_v_iters[v] 0 80 --exp_name ex2-1 --epochs 250 --steps_per_epoch 4000 --seed 0 10 20 --dt 106 | 107 | and plot the results. (These experiments might take ~10 minutes each, and this command runs six of them.) What do you find? 108 | 109 | .. admonition:: Exercise 2.2: Silent Bug in DDPG 110 | 111 | **Path to Exercise.** 112 | 113 | * PyTorch version: ``spinup/exercises/pytorch/problem_set_2/exercise2_2.py`` 114 | 115 | * Tensorflow version: ``spinup/exercises/tf1/problem_set_2/exercise2_2.py`` 116 | 117 | **Path to Solution.** `Solution available here. <../spinningup/exercise2_2_soln.html>`_ 118 | 119 | The hardest part of writing RL code is dealing with bugs, because failures are frequently silent. The code will appear to run correctly, but the agent's performance will degrade relative to a bug-free implementation---sometimes to the extent that it never learns anything. 120 | 121 | In this exercise, you will observe a bug in vivo and compare results against correct code. The bug is the same (conceptually, if not in exact implementation) for both the PyTorch and Tensorflow versions of this exercise. 122 | 123 | **Instructions.** Run ``exercise2_2.py``, which will launch DDPG experiments with and without a bug. The non-bugged version runs the default Spinning Up implementation of DDPG, using a default method for creating the actor and critic networks. The bugged version runs the same DDPG code, except uses a bugged method for creating the networks. 124 | 125 | There will be six experiments in all (three random seeds for each case), and each should take in the ballpark of 10 minutes. When they're finished, plot the results. What is the difference in performance with and without the bug? 126 | 127 | Without referencing the correct actor-critic code (which is to say---don't look in DDPG's ``core.py`` file), try to figure out what the bug is and explain how it breaks things. 128 | 129 | **Hint.** To figure out what's going wrong, think about how the DDPG code implements the DDPG computation graph. For the Tensorflow version, look at this excerpt: 130 | 131 | .. code-block:: python 132 | 133 | # Bellman backup for Q function 134 | backup = tf.stop_gradient(r_ph + gamma*(1-d_ph)*q_pi_targ) 135 | 136 | # DDPG losses 137 | pi_loss = -tf.reduce_mean(q_pi) 138 | q_loss = tf.reduce_mean((q-backup)**2) 139 | 140 | How could a bug in the actor-critic code have an impact here? 141 | 142 | **Bonus.** Are there any choices of hyperparameters which would have hidden the effects of the bug? 143 | 144 | 145 | Challenges 146 | ---------- 147 | 148 | .. admonition:: Write Code from Scratch 149 | 150 | As we suggest in `the essay <../spinningup/spinningup.html#learn-by-doing>`_, try reimplementing various deep RL algorithms from scratch. 151 | 152 | .. 
admonition:: Requests for Research 153 | 154 | If you feel comfortable with writing deep learning and deep RL code, consider trying to make progress on any of OpenAI's standing requests for research: 155 | 156 | * `Requests for Research 1 `_ 157 | * `Requests for Research 2 `_ -------------------------------------------------------------------------------- /docs/spinningup/extra_pg_proof1.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Extra Material 3 | ============== 4 | 5 | Proof for Don't Let the Past Distract You 6 | ========================================= 7 | 8 | In this subsection, we will prove that actions should not be reinforced for rewards obtained in the past. 9 | 10 | Expand out :math:`R(\tau)` in the expression for the `simplest policy gradient`_ to obtain: 11 | 12 | .. math:: 13 | 14 | \nabla_{\theta} J(\pi_{\theta}) &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(\tau)} \\ 15 | &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \sum_{t'=0}^T R(s_{t'}, a_{t'}, s_{t'+1})} \\ 16 | &= \sum_{t=0}^{T} \sum_{t'=0}^T \underE{\tau \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(s_{t'}, a_{t'}, s_{t'+1})}, 17 | 18 | and consider the term 19 | 20 | .. math:: 21 | 22 | \underE{\tau \sim \pi_{\theta}}{f(t,t')} = \underE{\tau \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(s_{t'}, a_{t'}, s_{t'+1})}. 23 | 24 | We will show that for the case of :math:`t' < t` (the reward comes before the action being reinforced), this term is zero. This is a complete proof of the original claim, because after dropping terms with :math:`t' < t` from the expression, we are left with the reward-to-go form of the policy gradient, as desired: 25 | 26 | .. math:: 27 | 28 | \nabla_{\theta} J(\pi_{\theta}) = \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \sum_{t'=t}^T R(s_{t'}, a_{t'}, s_{t'+1})} 29 | 30 | **1. Using the Marginal Distribution.** To proceed, we have to break down the expectation in :math:`\underE{\tau \sim \pi_{\theta}}{f(t,t')}`. It's an expectation over trajectories, but the expression inside the expectation only deals with a few states and actions: :math:`s_t`, :math:`a_t`, :math:`s_{t'}`, :math:`a_{t'}`, and :math:`s_{t'+1}`. So in computing the expectation, we only need to worry about the `marginal distribution`_ over these random variables. 31 | 32 | We derive: 33 | 34 | .. math:: 35 | 36 | \underE{\tau \sim \pi_{\theta}}{f(t,t')} &= \int_{\tau} P(\tau|\pi_{\theta}) f(t,t') \\ 37 | &= \int_{s_t, a_t, s_{t'}, a_{t'}, s_{t'+1}} P(s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} | \pi_{\theta}) f(t,t') \\ 38 | &= \underE{s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{f(t,t')}. 39 | 40 | **2. Probability Chain Rule.** Joint distributions can be calculated in terms of conditional and marginal probabilities via `chain rule of probability`_: :math:`P(A,B) = P(B|A) P(A)`. Here, we use this rule to compute 41 | 42 | .. math:: 43 | 44 | P(s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} | \pi_{\theta}) = P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) P(s_{t'}, a_{t'}, s_{t'+1} | \pi_{\theta}) 45 | 46 | 47 | **3. Separating Expectations Over Multiple Random Variables.** If we have an expectation over two random variables :math:`A` and :math:`B`, we can split it into an inner and outer expectation, where the inner expectation treats the variable from the outer expectation as a constant. 
Our ability to make this split relies on probability chain rule. Mathematically: 48 | 49 | .. math:: 50 | 51 | \underE{A,B}{f(A,B)} &= \int_{A,B} P(A,B) f(A,B) \\ 52 | &= \int_{A} \int_B P(B|A) P(A) f(A,B) \\ 53 | &= \int_A P(A) \int_B P(B|A) f(A,B) \\ 54 | &= \int_A P(A) \underE{B}{f(A,B) \Big| A} \\ 55 | &= \underE{A}{\underE{B}{f(A,B) \Big| A} } 56 | 57 | An expectation over :math:`s_t, a_t, s_{t'}, a_{t'}, s_{t'+1}` can thus be expressed by 58 | 59 | .. math:: 60 | 61 | \underE{\tau \sim \pi_{\theta}}{f(t,t')} &= \underE{s_t, a_t, s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{f(t,t')} \\ 62 | &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{\underE{s_t, a_t \sim \pi_{\theta}}{f(t,t') \Big| s_{t'}, a_{t'}, s_{t'+1}}} 63 | 64 | **4. Constants Can Be Pulled Outside of Expectations.** If a term inside an expectation is constant with respect to the variable being expected over, it can be pulled outside of the expectation. To give an example, consider again an expectation over two random variables :math:`A` and :math:`B`, where this time, :math:`f(A,B) = h(A) g(B)`. Then, using the result from before: 65 | 66 | .. math:: 67 | 68 | \underE{A,B}{f(A,B)} &= \underE{A}{\underE{B}{f(A,B) \Big| A}} \\ 69 | &= \underE{A}{\underE{B}{h(A) g(B) \Big| A}}\\ 70 | &= \underE{A}{h(A) \underE{B}{g(B) \Big| A}}. 71 | 72 | The function in our expectation decomposes this way, allowing us to write: 73 | 74 | .. math:: 75 | 76 | \underE{\tau \sim \pi_{\theta}}{f(t,t')} &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{\underE{s_t, a_t \sim \pi_{\theta}}{f(t,t') \Big| s_{t'}, a_{t'}, s_{t'+1}}} \\ 77 | &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{\underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) R(s_{t'}, a_{t'}, s_{t'+1}) \Big| s_{t'}, a_{t'}, s_{t'+1}}} \\ 78 | &= \underE{s_{t'}, a_{t'}, s_{t'+1} \sim \pi_{\theta}}{R(s_{t'}, a_{t'}, s_{t'+1}) \underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_{t'}, a_{t'}, s_{t'+1}}}. 79 | 80 | **5. Applying the EGLP Lemma.** The last step in our proof relies on the `EGLP lemma`_. At this point, we will only worry about the innermost expectation, 81 | 82 | .. math:: 83 | 84 | \underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_{t'}, a_{t'}, s_{t'+1}} = \int_{s_t, a_t} P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) \nabla_{\theta} \log \pi_{\theta}(a_t |s_t). 85 | 86 | We now have to make a distinction between two cases: :math:`t' < t`, the case where the reward happened before the action, and :math:`t' \geq t`, where it didn't. 87 | 88 | **Case One: Reward Before Action.** If :math:`t' < t`, then the conditional probabilities for actions at :math:`a_t` come from the policy: 89 | 90 | .. math:: 91 | 92 | P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) &= \pi_{\theta}(a_t | s_t) P(s_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}), 93 | 94 | the innermost expectation can be broken down farther into 95 | 96 | .. 
math:: 97 | 98 | \underE{s_t, a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_{t'}, a_{t'}, s_{t'+1}} &= \int_{s_t, a_t} P(s_t, a_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \\ 99 | &= \int_{s_t} P(s_t | \pi_{\theta}, s_{t'}, a_{t'}, s_{t'+1}) \int_{a_t} \pi_{\theta}(a_t | s_t) \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \\ 100 | &= \underE{s_t \sim \pi_{\theta}}{ \underE{a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_t } \Big| s_{t'}, a_{t'}, s_{t'+1}}. 101 | 102 | The EGLP lemma says that 103 | 104 | .. math:: 105 | 106 | \underE{a_t \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big| s_t } = 0, 107 | 108 | allowing us to conclude that for :math:`t' < t`, :math:`\underE{\tau \sim \pi_{\theta}}{f(t,t')} = 0`. 109 | 110 | **Case Two: Reward After Action.** What about the :math:`t' \geq t` case, though? Why doesn't the same logic apply? In this case, the conditional probabilities for :math:`a_t` can't be broken down the same way, because you're conditioning **on the future.** Think about it like this: let's say that every day, in the morning, you make a choice between going for a jog and going to work early, and you have a 50-50 chance of each option. If you condition on a future where you went to work early, what are the odds that you went for a jog? Clearly, you didn't. But if you're conditioning on the past---before you made the decision---what are the odds that you will later go for a jog? Now it's back to 50-50. 111 | 112 | So in the case where :math:`t' \geq t`, the conditional distribution over actions :math:`a_t` is **not** :math:`\pi(a_t|s_t)`, and the EGLP lemma does not apply. 113 | 114 | .. _`simplest policy gradient`: ../spinningup/rl_intro3.html#deriving-the-simplest-policy-gradient 115 | .. _`marginal distribution`: https://en.wikipedia.org/wiki/Marginal_distribution 116 | .. _`chain rule of probability`: https://en.wikipedia.org/wiki/Chain_rule_(probability) 117 | .. _`EGLP lemma`: ../spinningup/rl_intro3.html#expected-grad-log-prob-lemma -------------------------------------------------------------------------------- /docs/spinningup/extra_pg_proof2.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Extra Material 3 | ============== 4 | 5 | Proof for Using Q-Function in Policy Gradient Formula 6 | ===================================================== 7 | 8 | In this section, we will show that 9 | 10 | .. math:: 11 | 12 | \nabla_{\theta} J(\pi_{\theta}) &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \Big( \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Big) Q^{\pi_{\theta}}(s_t, a_t)}, 13 | 14 | for the finite-horizon undiscounted return setting. (An analagous result holds in the infinite-horizon discounted case using basically the same proof.) 15 | 16 | 17 | The proof of this claim depends on the `law of iterated expectations`_. First, let's rewrite the expression for the policy gradient, starting from the reward-to-go form (using the notation :math:`\hat{R}_t = \sum_{t'=t}^T R(s_t', a_t', s_{t'+1})` to help shorten things): 18 | 19 | .. 
math:: 20 | 21 | \nabla_{\theta} J(\pi_{\theta}) &= \underE{\tau \sim \pi_{\theta}}{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \hat{R}_t} \\ 22 | &= \sum_{t=0}^{T} \underE{\tau \sim \pi_{\theta}}{\nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \hat{R}_t} 23 | 24 | Define :math:`\tau_{:t} = (s_0, a_0, ..., s_t, a_t)` as the trajectory up to time :math:`t`, and :math:`\tau_{t:}` as the remainder of the trajectory after that. By the law of iterated expectations, we can break up the preceding expression into: 25 | 26 | .. math:: 27 | 28 | \nabla_{\theta} J(\pi_{\theta}) &= \sum_{t=0}^{T} \underE{\tau_{:t} \sim \pi_{\theta}}{ \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \hat{R}_t \right| \tau_{:t}}} 29 | 30 | The grad-log-prob is constant with respect to the inner expectation (because it depends on :math:`s_t` and :math:`a_t`, which the inner expectation conditions on as fixed in :math:`\tau_{:t}`), so it can be pulled out, leaving: 31 | 32 | .. math:: 33 | 34 | \nabla_{\theta} J(\pi_{\theta}) &= \sum_{t=0}^{T} \underE{\tau_{:t} \sim \pi_{\theta}}{ \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \hat{R}_t \right| \tau_{:t}}} 35 | 36 | In Markov Decision Processes, the future only depends on the most recent state and action. As a result, the inner expectation---which expects over the future, conditioned on the entirety of the past (everything up to time :math:`t`)---is equal to the same expectation if it only conditioned on the last timestep (just :math:`(s_t,a_t)`): 37 | 38 | .. math:: 39 | 40 | \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \hat{R}_t \right| \tau_{:t}} = \underE{\tau_{t:} \sim \pi_{\theta}}{ \left. \hat{R}_t \right| s_t, a_t}, 41 | 42 | which is the *definition* of :math:`Q^{\pi_{\theta}}(s_t, a_t)`: the expected return, starting from state :math:`s_t` and action :math:`a_t`, when acting on-policy for the rest of the trajectory. 43 | 44 | The result follows immediately. 45 | 46 | .. _`law of iterated expectations`: https://en.wikipedia.org/wiki/Law_of_total_expectation 47 | -------------------------------------------------------------------------------- /docs/spinningup/rl_intro4.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | Limitations and Frontiers 3 | ========================= 4 | 5 | 6 | Reward Design 7 | ============= 8 | 9 | 10 | Sample Complexity 11 | ================= 12 | 13 | 14 | Long-Horizon Tasks 15 | ================== -------------------------------------------------------------------------------- /docs/user/algorithms.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Algorithms 3 | ========== 4 | 5 | .. contents:: Table of Contents 6 | 7 | What's Included 8 | =============== 9 | 10 | The following algorithms are implemented in the Spinning Up package: 11 | 12 | - `Vanilla Policy Gradient`_ (VPG) 13 | - `Trust Region Policy Optimization`_ (TRPO) 14 | - `Proximal Policy Optimization`_ (PPO) 15 | - `Deep Deterministic Policy Gradient`_ (DDPG) 16 | - `Twin Delayed DDPG`_ (TD3) 17 | - `Soft Actor-Critic`_ (SAC) 18 | 19 | They are all implemented with `MLP`_ (non-recurrent) actor-critics, making them suitable for fully-observed, non-image-based RL environments, e.g. the `Gym Mujoco`_ environments. 
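If it helps to ground this, each algorithm is exposed as an ordinary Python function that you can call from a script. The sketch below assumes a MuJoCo Gym environment is installed and uses the PyTorch PPO implementation; the environment name, network size, epoch counts, and logging directory are placeholders, and the exact keyword arguments vary a little from algorithm to algorithm, so check the individual algorithm pages before relying on them.

.. code-block:: python

    import gym
    from spinup import ppo_pytorch as ppo

    # Environment constructor, actor-critic size, and where to write results.
    env_fn = lambda : gym.make('HalfCheetah-v2')
    ac_kwargs = dict(hidden_sizes=[64, 64])
    logger_kwargs = dict(output_dir='data/ppo_halfcheetah', exp_name='ppo_halfcheetah')

    ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=4000, epochs=50,
        logger_kwargs=logger_kwargs)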
20 | 21 | Spinning Up has two implementations for each algorithm (except for TRPO): one that uses `PyTorch`_ as the neural network library, and one that uses `Tensorflow v1`_ as the neural network library. (TRPO is currently only available in Tensorflow.) 22 | 23 | .. _`Gym Mujoco`: https://gym.openai.com/envs/#mujoco 24 | .. _`Vanilla Policy Gradient`: ../algorithms/vpg.html 25 | .. _`Trust Region Policy Optimization`: ../algorithms/trpo.html 26 | .. _`Proximal Policy Optimization`: ../algorithms/ppo.html 27 | .. _`Deep Deterministic Policy Gradient`: ../algorithms/ddpg.html 28 | .. _`Twin Delayed DDPG`: ../algorithms/td3.html 29 | .. _`Soft Actor-Critic`: ../algorithms/sac.html 30 | .. _`MLP`: https://en.wikipedia.org/wiki/Multilayer_perceptron 31 | .. _`PyTorch`: https://pytorch.org/ 32 | .. _`Tensorflow v1`: https://www.tensorflow.org/versions/r1.15/api_docs 33 | 34 | 35 | Why These Algorithms? 36 | ===================== 37 | 38 | We chose the core deep RL algorithms in this package to reflect useful progressions of ideas from the recent history of the field, culminating in two algorithms in particular---PPO and SAC---which are close to state of the art on reliability and sample efficiency among policy-learning algorithms. They also expose some of the trade-offs that get made in designing and using algorithms in deep RL. 39 | 40 | The On-Policy Algorithms 41 | ------------------------ 42 | 43 | Vanilla Policy Gradient is the most basic, entry-level algorithm in the deep RL space because it completely predates the advent of deep RL altogether. The core elements of VPG go all the way back to the late 80s / early 90s. It started a trail of research which ultimately led to stronger algorithms such as TRPO and then PPO soon after. 44 | 45 | A key feature of this line of work is that all of these algorithms are *on-policy*: that is, they don't use old data, which makes them weaker on sample efficiency. But this is for a good reason: these algorithms directly optimize the objective you care about---policy performance---and it works out mathematically that you need on-policy data to calculate the updates. So, this family of algorithms trades off sample efficiency in favor of stability---but you can see the progression of techniques (from VPG to TRPO to PPO) working to make up the deficit on sample efficiency. 46 | 47 | 48 | The Off-Policy Algorithms 49 | ------------------------- 50 | 51 | DDPG is a similarly foundational algorithm to VPG, although much younger---the theory of deterministic policy gradients, which led to DDPG, wasn't published until 2014. DDPG is closely connected to Q-learning algorithms, and it concurrently learns a Q-function and a policy which are updated to improve each other. 52 | 53 | Algorithms like DDPG and Q-Learning are *off-policy*, so they are able to reuse old data very efficiently. They gain this benefit by exploiting Bellman's equations for optimality, which a Q-function can be trained to satisfy using *any* environment interaction data (as long as there's enough experience from the high-reward areas in the environment). 54 | 55 | But problematically, there are no guarantees that doing a good job of satisfying Bellman's equations leads to having great policy performance. *Empirically* one can get great performance---and when it happens, the sample efficiency is wonderful---but the absence of guarantees makes algorithms in this class potentially brittle and unstable. 
TD3 and SAC are descendants of DDPG which make use of a variety of insights to mitigate these issues. 56 | 57 | 58 | Code Format 59 | =========== 60 | 61 | All implementations in Spinning Up adhere to a standard template. They are split into two files: an algorithm file, which contains the core logic of the algorithm, and a core file, which contains various utilities needed to run the algorithm. 62 | 63 | The algorithm file always starts with a class definition for an experience buffer object, which is used to store information from agent-environment interactions. Next, there is a single function which runs the algorithm. The algorithm function follows a template that is roughly the same across the PyTorch and Tensorflow versions, but we'll break it down for each separately below. Finally, there's some support in each algorithm file for directly running the algorithm in Gym environments from the command line (though this is not the recommended way to run the algorithms---we'll describe how to do that on the `Running Experiments`_ page). 64 | 65 | .. _`Running Experiments`: ../user/running.html 66 | 67 | The Algorithm Function: PyTorch Version 68 | --------------------------------------- 69 | 70 | The algorithm function for a PyTorch implementation performs the following tasks in (roughly) this order: 71 | 72 | 1) Logger setup 73 | 74 | 2) Random seed setting 75 | 76 | 3) Environment instantiation 77 | 78 | 4) Constructing the actor-critic PyTorch module via the ``actor_critic`` function passed to the algorithm function as an argument 79 | 80 | 5) Instantiating the experience buffer 81 | 82 | 6) Setting up callable loss functions that also provide diagnostics specific to the algorithm 83 | 84 | 7) Making PyTorch optimizers 85 | 86 | 8) Setting up model saving through the logger 87 | 88 | 9) Setting up an update function that runs one epoch of optimization or one step of descent 89 | 90 | 10) Running the main loop of the algorithm: 91 | 92 | a) Run the agent in the environment 93 | 94 | b) Periodically update the parameters of the agent according to the main equations of the algorithm 95 | 96 | c) Log key performance metrics and save agent 97 | 98 | 99 | 100 | The Algorithm Function: Tensorflow Version 101 | ------------------------------------------ 102 | 103 | The algorithm function for a Tensorflow implementation performs the following tasks in (roughly) this order: 104 | 105 | 1) Logger setup 106 | 107 | 2) Random seed setting 108 | 109 | 3) Environment instantiation 110 | 111 | 4) Making placeholders for the computation graph 112 | 113 | 5) Building the actor-critic computation graph via the ``actor_critic`` function passed to the algorithm function as an argument 114 | 115 | 6) Instantiating the experience buffer 116 | 117 | 7) Building the computation graph for loss functions and diagnostics specific to the algorithm 118 | 119 | 8) Making training ops 120 | 121 | 9) Making the TF Session and initializing parameters 122 | 123 | 10) Setting up model saving through the logger 124 | 125 | 11) Defining functions needed for running the main loop of the algorithm (e.g. 
the core update function, get action function, and test agent function, depending on the algorithm) 126 | 127 | 12) Running the main loop of the algorithm: 128 | 129 | a) Run the agent in the environment 130 | 131 | b) Periodically update the parameters of the agent according to the main equations of the algorithm 132 | 133 | c) Log key performance metrics and save agent 134 | 135 | 136 | 137 | The Core File 138 | ------------- 139 | 140 | The core files don't adhere as closely as the algorithms files to a template, but do have some approximate structure: 141 | 142 | 1) **Tensorflow only:** Functions related to making and managing placeholders 143 | 144 | 2) Functions for building sections of computation graph relevant to the ``actor_critic`` method for a particular algorithm 145 | 146 | 3) Any other useful functions 147 | 148 | 4) Implementations for an MLP actor-critic compatible with the algorithm, where both the policy and the value function(s) are represented by simple MLPs 149 | 150 | 151 | -------------------------------------------------------------------------------- /docs/user/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | 6 | .. contents:: Table of Contents 7 | 8 | Spinning Up requires Python3, OpenAI Gym, and OpenMPI. 9 | 10 | Spinning Up is currently only supported on Linux and OSX. It may be possible to install on Windows, though this hasn't been extensively tested. [#]_ 11 | 12 | .. admonition:: You Should Know 13 | 14 | Many examples and benchmarks in Spinning Up refer to RL environments that use the `MuJoCo`_ physics engine. MuJoCo is a proprietary software that requires a license, which is free to trial and free for students, but otherwise is not free. As a result, installing it is optional, but because of its importance to the research community---it is the de facto standard for benchmarking deep RL algorithms in continuous control---it is preferred. 15 | 16 | Don't worry if you decide not to install MuJoCo, though. You can definitely get started in RL by running RL algorithms on the `Classic Control`_ and `Box2d`_ environments in Gym, which are totally free to use. 17 | 18 | .. [#] It looks like at least one person has figured out `a workaround for running on Windows`_. If you try another way and succeed, please let us know how you did it! 19 | 20 | .. _`Classic Control`: https://gym.openai.com/envs/#classic_control 21 | .. _`Box2d`: https://gym.openai.com/envs/#box2d 22 | .. _`MuJoCo`: http://www.mujoco.org/index.html 23 | .. _`a workaround for running on Windows`: https://github.com/openai/spinningup/issues/23 24 | 25 | Installing Python 26 | ================= 27 | 28 | We recommend installing Python through Anaconda. Anaconda is a library that includes Python and many useful packages for Python, as well as an environment manager called conda that makes package management simple. 29 | 30 | Follow `the installation instructions`_ for Anaconda here. Download and install Anaconda3 (at time of writing, `Anaconda3-5.3.0`_). Then create a conda Python 3.6 env for organizing packages used in Spinning Up: 31 | 32 | .. parsed-literal:: 33 | 34 | conda create -n spinningup python=3.6 35 | 36 | To use Python from the environment you just created, activate the environment with: 37 | 38 | .. parsed-literal:: 39 | 40 | conda activate spinningup 41 | 42 | .. 
admonition:: You Should Know 43 | 44 | If you're new to Python environments and package management, this stuff can quickly get confusing or overwhelming, and you'll probably hit some snags along the way. (Especially, you should expect problems like, "I just installed this thing, but it says it's not found when I try to use it!") You may want to read through some clean explanations about what package management is, why it's a good idea, and what commands you'll typically have to execute to correctly use it. 45 | 46 | `FreeCodeCamp`_ has a good explanation worth reading. There's a shorter description on `Towards Data Science`_ which is also helpful and informative. Finally, if you're an extremely patient person, you may want to read the (dry, but very informative) `documentation page from Conda`_. 47 | 48 | .. _`the installation instructions`: https://docs.continuum.io/anaconda/install/ 49 | .. _`Anaconda3-5.3.0`: https://repo.anaconda.com/archive/ 50 | .. _`FreeCodeCamp`: https://medium.freecodecamp.org/why-you-need-python-environments-and-how-to-manage-them-with-conda-85f155f4353c 51 | .. _`Towards Data Science`: https://towardsdatascience.com/environment-management-with-conda-python-2-3-b9961a8a5097 52 | .. _`documentation page from Conda`: https://conda.io/docs/user-guide/tasks/manage-environments.html 53 | .. _`this Github issue for Tensorflow`: https://github.com/tensorflow/tensorflow/issues/20444 54 | 55 | 56 | Installing OpenMPI 57 | ================== 58 | 59 | Ubuntu 60 | ------ 61 | 62 | .. parsed-literal:: 63 | 64 | sudo apt-get update && sudo apt-get install libopenmpi-dev 65 | 66 | 67 | Mac OS X 68 | -------- 69 | Installation of system packages on Mac requires Homebrew_. With Homebrew installed, run the following: 70 | 71 | .. parsed-literal:: 72 | 73 | brew install openmpi 74 | 75 | .. _Homebrew: https://brew.sh 76 | 77 | Installing Spinning Up 78 | ====================== 79 | 80 | .. parsed-literal:: 81 | 82 | git clone https://github.com/openai/spinningup.git 83 | cd spinningup 84 | pip install -e . 85 | 86 | .. admonition:: You Should Know 87 | 88 | Spinning Up defaults to installing everything in Gym **except** the MuJoCo environments. In case you run into any trouble with the Gym installation, check out the `Gym`_ github page for help. If you want the MuJoCo environments, see the optional installation section below. 89 | 90 | .. _`Gym`: https://github.com/openai/gym 91 | 92 | Check Your Install 93 | ================== 94 | 95 | To see if you've successfully installed Spinning Up, try running PPO in the LunarLander-v2 environment with 96 | 97 | .. parsed-literal:: 98 | 99 | python -m spinup.run ppo --hid "[32,32]" --env LunarLander-v2 --exp_name installtest --gamma 0.999 100 | 101 | This might run for around 10 minutes, and you can leave it going in the background while you continue reading through documentation. This won't train the agent to completion, but will run it for long enough that you can see *some* learning progress when the results come in. 102 | 103 | After it finishes training, watch a video of the trained policy with 104 | 105 | .. parsed-literal:: 106 | 107 | python -m spinup.run test_policy data/installtest/installtest_s0 108 | 109 | And plot the results with 110 | 111 | .. parsed-literal:: 112 | 113 | python -m spinup.run plot data/installtest/installtest_s0 114 | 115 | 116 | Installing MuJoCo (Optional) 117 | ============================ 118 | 119 | First, go to the `mujoco-py`_ github page. 
Follow the installation instructions in the README, which describe how to install the MuJoCo physics engine and the mujoco-py package (which allows the use of MuJoCo from Python). 120 | 121 | .. admonition:: You Should Know 122 | 123 | In order to use the MuJoCo simulator, you will need to get a `MuJoCo license`_. Free 30-day licenses are available to anyone, and free 1-year licenses are available to full-time students. 124 | 125 | Once you have installed MuJoCo, install the corresponding Gym environments with 126 | 127 | .. parsed-literal:: 128 | 129 | pip install gym[mujoco,robotics] 130 | 131 | And then check that things are working by running PPO in the Walker2d-v2 environment with 132 | 133 | .. parsed-literal:: 134 | 135 | python -m spinup.run ppo --hid "[32,32]" --env Walker2d-v2 --exp_name mujocotest 136 | 137 | 138 | .. _`mujoco-py`: https://github.com/openai/mujoco-py 139 | .. _`MuJoCo license`: https://www.roboti.us/license.html 140 | -------------------------------------------------------------------------------- /docs/user/plotting.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Plotting Results 3 | ================ 4 | 5 | Spinning Up ships with a simple plotting utility for interpreting results. Run it with: 6 | 7 | .. parsed-literal:: 8 | 9 | python -m spinup.run plot [path/to/output_directory ...] [--legend [LEGEND ...]] 10 | [--xaxis XAXIS] [--value [VALUE ...]] [--count] [--smooth S] 11 | [--select [SEL ...]] [--exclude [EXC ...]] 12 | 13 | 14 | **Positional Arguments:** 15 | 16 | .. option:: logdir 17 | 18 | *strings*. As many log directories (or prefixes to log directories, which the plotter will autocomplete internally) as you'd like to plot from. Logdirs will be searched recursively for experiment outputs. 19 | 20 | .. admonition:: You Should Know 21 | 22 | The internal autocompleting is really handy! Suppose you have run several experiments, with the aim of comparing performance between different algorithms, resulting in a log directory structure of: 23 | 24 | .. parsed-literal:: 25 | 26 | data/ 27 | bench_algo1/ 28 | bench_algo1-seed0/ 29 | bench_algo1-seed10/ 30 | bench_algo2/ 31 | bench_algo2-seed0/ 32 | bench_algo2-seed10/ 33 | 34 | You can easily produce a graph comparing algo1 and algo2 with: 35 | 36 | .. parsed-literal:: 37 | 38 | python spinup/utils/plot.py data/bench_algo 39 | 40 | relying on the autocomplete to find both ``data/bench_algo1`` and ``data/bench_algo2``. 41 | 42 | **Optional Arguments:** 43 | 44 | .. option:: -l, --legend=[LEGEND ...] 45 | 46 | *strings*. Optional way to specify legend for the plot. The plotter legend will automatically use the ``exp_name`` from the ``config.json`` file, unless you tell it otherwise through this flag. This only works if you provide a name for each directory that will get plotted. (Note: this may not be the same as the number of logdir args you provide! Recall that the plotter looks for autocompletes of the logdir args: there may be more than one match for a given logdir prefix, and you will need to provide a legend string for each one of those matches---unless you have removed some of them as candidates via selection or exclusion rules (below).) 47 | 48 | .. option:: -x, --xaxis=XAXIS, default='TotalEnvInteracts' 49 | 50 | *string*. Pick what column from data is used for the x-axis. 51 | 52 | .. option:: -y, --value=[VALUE ...], default='Performance' 53 | 54 | *strings*. Pick what columns from data to graph on the y-axis. 
Submitting multiple values will produce multiple graphs. Defaults to ``Performance``, which is not an actual output of any algorithm. Instead, ``Performance`` refers to either ``AverageEpRet``, the correct performance measure for the on-policy algorithms, or ``AverageTestEpRet``, the correct performance measure for the off-policy algorithms. The plotter will automatically figure out which of ``AverageEpRet`` or ``AverageTestEpRet`` to report for each separate logdir. 55 | 56 | .. option:: --count 57 | 58 | Optional flag. By default, the plotter shows y-values which are averaged across all results that share an ``exp_name``, which is typically a set of identical experiments that only vary in random seed. But if you'd like to see all of those curves separately, use the ``--count`` flag. 59 | 60 | .. option:: -s, --smooth=S, default=1 61 | 62 | *int*. Smooth data by averaging it over a fixed window. This parameter says how wide the averaging window will be. 63 | 64 | .. option:: --select=[SEL ...] 65 | 66 | *strings*. Optional selection rule: the plotter will only show curves from logdirs that contain all of these substrings. 67 | 68 | .. option:: --exclude=[EXC ...] 69 | 70 | *strings*. Optional exclusion rule: the plotter will only show curves from logdirs that do not contain these substrings. 71 | -------------------------------------------------------------------------------- /docs/utils/mpi.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | MPI Tools 3 | ========= 4 | 5 | .. contents:: Table of Contents 6 | 7 | Core MPI Utilities 8 | ================== 9 | 10 | .. automodule:: spinup.utils.mpi_tools 11 | :members: 12 | 13 | 14 | MPI + PyTorch Utilities 15 | ======================= 16 | 17 | ``spinup.utils.mpi_pytorch`` contains a few tools to make it easy to do data-parallel PyTorch optimization across MPI processes. The two main ingredients are syncing parameters and averaging gradients before they are used by the adaptive optimizer. There is also a hacky fix for a problem where the PyTorch instance in each separate process tries to get too many threads, and they start to clobber each other. 18 | 19 | The pattern for using these tools looks something like this: 20 | 21 | 1) At the beginning of the training script, call ``setup_pytorch_for_mpi()``. (Avoids clobbering problem.) 22 | 23 | 2) After you've constructed a PyTorch module, call ``sync_params(module)``. 24 | 25 | 3) Then, during gradient descent, call ``mpi_avg_grads`` after the backward pass, like so: 26 | 27 | .. code-block:: python 28 | 29 | optimizer.zero_grad() 30 | loss = compute_loss(module) 31 | loss.backward() 32 | mpi_avg_grads(module) # averages gradient buffers across MPI processes! 33 | optimizer.step() 34 | 35 | 36 | .. automodule:: spinup.utils.mpi_pytorch 37 | :members: 38 | 39 | MPI + Tensorflow Utilities 40 | ========================== 41 | 42 | ``spinup.utils.mpi_tf`` contains a few tools to make it easy to use the AdamOptimizer across many MPI processes. This is a bit hacky---if you're looking for something more sophisticated and general-purpose, consider `horovod`_. 43 | 44 | .. _`horovod`: https://github.com/uber/horovod 45 | 46 | .. 
automodule:: spinup.utils.mpi_tf 47 | :members: 48 | -------------------------------------------------------------------------------- /docs/utils/plotter.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Plotter 3 | ======= 4 | 5 | See the page on `plotting results`_ for documentation of the plotter. 6 | 7 | .. _`plotting results`: ../user/plotting.html -------------------------------------------------------------------------------- /docs/utils/run_utils.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Run Utils 3 | ========= 4 | 5 | .. contents:: Table of Contents 6 | 7 | ExperimentGrid 8 | ============== 9 | 10 | Spinning Up ships with a tool called ExperimentGrid for making hyperparameter ablations easier. This is based on (but simpler than) `the rllab tool`_ called VariantGenerator. 11 | 12 | .. _`the rllab tool`: https://github.com/rll/rllab/blob/master/rllab/misc/instrument.py#L173 13 | 14 | .. autoclass:: spinup.utils.run_utils.ExperimentGrid 15 | :members: 16 | 17 | 18 | Calling Experiments 19 | =================== 20 | 21 | .. autofunction:: spinup.utils.run_utils.call_experiment 22 | 23 | .. autofunction:: spinup.utils.run_utils.setup_logger_kwargs 24 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | **Status:** Maintenance (expect bug fixes and minor updates) 2 | 3 | Welcome to Spinning Up in Deep RL! 4 | ================================== 5 | 6 | This is an educational resource produced by OpenAI that makes it easier to learn about deep reinforcement learning (deep RL). 7 | 8 | For the unfamiliar: [reinforcement learning](https://en.wikipedia.org/wiki/Reinforcement_learning) (RL) is a machine learning approach for teaching agents how to solve tasks by trial and error. Deep RL refers to the combination of RL with [deep learning](http://ufldl.stanford.edu/tutorial/). 9 | 10 | This module contains a variety of helpful resources, including: 11 | 12 | - a short [introduction](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html) to RL terminology, kinds of algorithms, and basic theory, 13 | - an [essay](https://spinningup.openai.com/en/latest/spinningup/spinningup.html) about how to grow into an RL research role, 14 | - a [curated list](https://spinningup.openai.com/en/latest/spinningup/keypapers.html) of important papers organized by topic, 15 | - a well-documented [code repo](https://github.com/openai/spinningup) of short, standalone implementations of key algorithms, 16 | - and a few [exercises](https://spinningup.openai.com/en/latest/spinningup/exercises.html) to serve as warm-ups. 17 | 18 | Get started at [spinningup.openai.com](https://spinningup.openai.com)! 
19 | 20 | 21 | Citing Spinning Up 22 | ------------------ 23 | 24 | If you reference or use Spinning Up in your research, please cite: 25 | 26 | ``` 27 | @article{SpinningUp2018, 28 | author = {Achiam, Joshua}, 29 | title = {{Spinning Up in Deep Reinforcement Learning}}, 30 | year = {2018} 31 | } 32 | ``` -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | build: 2 | image: latest 3 | 4 | python: 5 | version: 3.6 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from os.path import join, dirname, realpath 2 | from setuptools import setup 3 | import sys 4 | 5 | assert sys.version_info.major == 3 and sys.version_info.minor >= 6, \ 6 | "The Spinning Up repo is designed to work with Python 3.6 and greater." \ 7 | + "Please install it before proceeding." 8 | 9 | with open(join("spinup", "version.py")) as version_file: 10 | exec(version_file.read()) 11 | 12 | setup( 13 | name='spinup', 14 | py_modules=['spinup'], 15 | version=__version__,#'0.1', 16 | install_requires=[ 17 | 'cloudpickle==1.2.1', 18 | 'gym[atari,box2d,classic_control]~=0.15.3', 19 | 'ipython', 20 | 'joblib', 21 | 'matplotlib==3.1.1', 22 | 'mpi4py', 23 | 'numpy', 24 | 'pandas', 25 | 'pytest', 26 | 'psutil', 27 | 'scipy', 28 | 'seaborn==0.8.1', 29 | 'tensorflow>=1.8.0,<2.0', 30 | 'torch==1.3.1', 31 | 'tqdm' 32 | ], 33 | description="Teaching tools for introducing people to deep RL.", 34 | author="Joshua Achiam", 35 | ) 36 | -------------------------------------------------------------------------------- /spinup/__init__.py: -------------------------------------------------------------------------------- 1 | # Disable TF deprecation warnings. 2 | # Syntax from tf1 is not expected to be compatible with tf2. 
3 | import tensorflow as tf 4 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 5 | 6 | # Algorithms 7 | from spinup.algos.tf1.ddpg.ddpg import ddpg as ddpg_tf1 8 | from spinup.algos.tf1.ppo.ppo import ppo as ppo_tf1 9 | from spinup.algos.tf1.sac.sac import sac as sac_tf1 10 | from spinup.algos.tf1.td3.td3 import td3 as td3_tf1 11 | from spinup.algos.tf1.trpo.trpo import trpo as trpo_tf1 12 | from spinup.algos.tf1.vpg.vpg import vpg as vpg_tf1 13 | 14 | from spinup.algos.pytorch.ddpg.ddpg import ddpg as ddpg_pytorch 15 | from spinup.algos.pytorch.ppo.ppo import ppo as ppo_pytorch 16 | from spinup.algos.pytorch.sac.sac import sac as sac_pytorch 17 | from spinup.algos.pytorch.td3.td3 import td3 as td3_pytorch 18 | from spinup.algos.pytorch.trpo.trpo import trpo as trpo_pytorch 19 | from spinup.algos.pytorch.vpg.vpg import vpg as vpg_pytorch 20 | 21 | # Loggers 22 | from spinup.utils.logx import Logger, EpochLogger 23 | 24 | # Version 25 | from spinup.version import __version__ -------------------------------------------------------------------------------- /spinup/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/__init__.py -------------------------------------------------------------------------------- /spinup/algos/pytorch/ddpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def mlp(sizes, activation, output_activation=nn.Identity): 14 | layers = [] 15 | for j in range(len(sizes)-1): 16 | act = activation if j < len(sizes)-2 else output_activation 17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 18 | return nn.Sequential(*layers) 19 | 20 | def count_vars(module): 21 | return sum([np.prod(p.shape) for p in module.parameters()]) 22 | 23 | class MLPActor(nn.Module): 24 | 25 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 26 | super().__init__() 27 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 28 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 29 | self.act_limit = act_limit 30 | 31 | def forward(self, obs): 32 | # Return output from network scaled to action space limits. 33 | return self.act_limit * self.pi(obs) 34 | 35 | class MLPQFunction(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 40 | 41 | def forward(self, obs, act): 42 | q = self.q(torch.cat([obs, act], dim=-1)) 43 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 
44 | 45 | class MLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | 59 | def act(self, obs): 60 | with torch.no_grad(): 61 | return self.pi(obs).numpy() 62 | -------------------------------------------------------------------------------- /spinup/algos/pytorch/ppo/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.distributions.normal import Normal 8 | from torch.distributions.categorical import Categorical 9 | 10 | 11 | def combined_shape(length, shape=None): 12 | if shape is None: 13 | return (length,) 14 | return (length, shape) if np.isscalar(shape) else (length, *shape) 15 | 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | def count_vars(module): 26 | return sum([np.prod(p.shape) for p in module.parameters()]) 27 | 28 | 29 | def discount_cumsum(x, discount): 30 | """ 31 | magic from rllab for computing discounted cumulative sums of vectors. 32 | 33 | input: 34 | vector x, 35 | [x0, 36 | x1, 37 | x2] 38 | 39 | output: 40 | [x0 + discount * x1 + discount^2 * x2, 41 | x1 + discount * x2, 42 | x2] 43 | """ 44 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 45 | 46 | 47 | class Actor(nn.Module): 48 | 49 | def _distribution(self, obs): 50 | raise NotImplementedError 51 | 52 | def _log_prob_from_distribution(self, pi, act): 53 | raise NotImplementedError 54 | 55 | def forward(self, obs, act=None): 56 | # Produce action distributions for given observations, and 57 | # optionally compute the log likelihood of given actions under 58 | # those distributions. 
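        # (Concretely: the Categorical subclass below gets logp_a of shape
        # (batch,) directly from pi.log_prob, while the Gaussian subclass sums
        # per-dimension log-probs over the last axis, so both cases yield one
        # log-likelihood per observation.)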
59 | pi = self._distribution(obs) 60 | logp_a = None 61 | if act is not None: 62 | logp_a = self._log_prob_from_distribution(pi, act) 63 | return pi, logp_a 64 | 65 | 66 | class MLPCategoricalActor(Actor): 67 | 68 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 69 | super().__init__() 70 | self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 71 | 72 | def _distribution(self, obs): 73 | logits = self.logits_net(obs) 74 | return Categorical(logits=logits) 75 | 76 | def _log_prob_from_distribution(self, pi, act): 77 | return pi.log_prob(act) 78 | 79 | 80 | class MLPGaussianActor(Actor): 81 | 82 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 83 | super().__init__() 84 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 85 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 86 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 87 | 88 | def _distribution(self, obs): 89 | mu = self.mu_net(obs) 90 | std = torch.exp(self.log_std) 91 | return Normal(mu, std) 92 | 93 | def _log_prob_from_distribution(self, pi, act): 94 | return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution 95 | 96 | 97 | class MLPCritic(nn.Module): 98 | 99 | def __init__(self, obs_dim, hidden_sizes, activation): 100 | super().__init__() 101 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 102 | 103 | def forward(self, obs): 104 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 105 | 106 | 107 | 108 | class MLPActorCritic(nn.Module): 109 | 110 | 111 | def __init__(self, observation_space, action_space, 112 | hidden_sizes=(64,64), activation=nn.Tanh): 113 | super().__init__() 114 | 115 | obs_dim = observation_space.shape[0] 116 | 117 | # policy builder depends on action space 118 | if isinstance(action_space, Box): 119 | self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation) 120 | elif isinstance(action_space, Discrete): 121 | self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation) 122 | 123 | # build value function 124 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 125 | 126 | def step(self, obs): 127 | with torch.no_grad(): 128 | pi = self.pi._distribution(obs) 129 | a = pi.sample() 130 | logp_a = self.pi._log_prob_from_distribution(pi, a) 131 | v = self.v(obs) 132 | return a.numpy(), v.numpy(), logp_a.numpy() 133 | 134 | def act(self, obs): 135 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinup/algos/pytorch/sac/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.distributions.normal import Normal 8 | 9 | 10 | def combined_shape(length, shape=None): 11 | if shape is None: 12 | return (length,) 13 | return (length, shape) if np.isscalar(shape) else (length, *shape) 14 | 15 | def mlp(sizes, activation, output_activation=nn.Identity): 16 | layers = [] 17 | for j in range(len(sizes)-1): 18 | act = activation if j < len(sizes)-2 else output_activation 19 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 20 | return nn.Sequential(*layers) 21 | 22 | def count_vars(module): 23 | return sum([np.prod(p.shape) for p in module.parameters()]) 24 | 25 | 26 | LOG_STD_MAX = 2 27 | LOG_STD_MIN = -20 28 | 29 | class SquashedGaussianMLPActor(nn.Module): 
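    # Descriptive note: the network below outputs the mean and log-std of a
    # diagonal Gaussian; log-std is clamped to [LOG_STD_MIN, LOG_STD_MAX] =
    # [-20, 2] (std roughly in [2e-9, 7.4]) for numerical stability, the sample
    # is drawn with rsample() so gradients flow through the reparameterization
    # trick, and the result is squashed with tanh and rescaled by act_limit.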
30 | 31 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 32 | super().__init__() 33 | self.net = mlp([obs_dim] + list(hidden_sizes), activation, activation) 34 | self.mu_layer = nn.Linear(hidden_sizes[-1], act_dim) 35 | self.log_std_layer = nn.Linear(hidden_sizes[-1], act_dim) 36 | self.act_limit = act_limit 37 | 38 | def forward(self, obs, deterministic=False, with_logprob=True): 39 | net_out = self.net(obs) 40 | mu = self.mu_layer(net_out) 41 | log_std = self.log_std_layer(net_out) 42 | log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX) 43 | std = torch.exp(log_std) 44 | 45 | # Pre-squash distribution and sample 46 | pi_distribution = Normal(mu, std) 47 | if deterministic: 48 | # Only used for evaluating policy at test time. 49 | pi_action = mu 50 | else: 51 | pi_action = pi_distribution.rsample() 52 | 53 | if with_logprob: 54 | # Compute logprob from Gaussian, and then apply correction for Tanh squashing. 55 | # NOTE: The correction formula is a little bit magic. To get an understanding 56 | # of where it comes from, check out the original SAC paper (arXiv 1801.01290) 57 | # and look in appendix C. This is a more numerically-stable equivalent to Eq 21. 58 | # Try deriving it yourself as a (very difficult) exercise. :) 59 | logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1) 60 | logp_pi -= (2*(np.log(2) - pi_action - F.softplus(-2*pi_action))).sum(axis=1) 61 | else: 62 | logp_pi = None 63 | 64 | pi_action = torch.tanh(pi_action) 65 | pi_action = self.act_limit * pi_action 66 | 67 | return pi_action, logp_pi 68 | 69 | 70 | class MLPQFunction(nn.Module): 71 | 72 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 73 | super().__init__() 74 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 75 | 76 | def forward(self, obs, act): 77 | q = self.q(torch.cat([obs, act], dim=-1)) 78 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 
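# Where the "magic" correction above comes from (illustrative sketch): for the
# change of variables a = tanh(u), each dimension's log-density picks up
# -log|da/du| = -log(1 - tanh(u)^2), and the identity
#     log(1 - tanh(u)^2) = 2*(log 2 - u - softplus(-2u))
# is just a rewriting that avoids forming 1 - tanh(u)^2 directly, which loses
# precision for large |u|. A quick numerical check:
def _tanh_logdet_check():
    u = torch.linspace(-5., 5., steps=11)
    naive = torch.log(1 - torch.tanh(u)**2)
    stable = 2 * (np.log(2) - u - F.softplus(-2*u))
    return torch.allclose(naive, stable, atol=1e-6)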
79 | 80 | class MLPActorCritic(nn.Module): 81 | 82 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 83 | activation=nn.ReLU): 84 | super().__init__() 85 | 86 | obs_dim = observation_space.shape[0] 87 | act_dim = action_space.shape[0] 88 | act_limit = action_space.high[0] 89 | 90 | # build policy and value functions 91 | self.pi = SquashedGaussianMLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 92 | self.q1 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 93 | self.q2 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 94 | 95 | def act(self, obs, deterministic=False): 96 | with torch.no_grad(): 97 | a, _ = self.pi(obs, deterministic, False) 98 | return a.numpy() 99 | -------------------------------------------------------------------------------- /spinup/algos/pytorch/td3/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def mlp(sizes, activation, output_activation=nn.Identity): 14 | layers = [] 15 | for j in range(len(sizes)-1): 16 | act = activation if j < len(sizes)-2 else output_activation 17 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 18 | return nn.Sequential(*layers) 19 | 20 | def count_vars(module): 21 | return sum([np.prod(p.shape) for p in module.parameters()]) 22 | 23 | class MLPActor(nn.Module): 24 | 25 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 26 | super().__init__() 27 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 28 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 29 | self.act_limit = act_limit 30 | 31 | def forward(self, obs): 32 | # Return output from network scaled to action space limits. 33 | return self.act_limit * self.pi(obs) 34 | 35 | class MLPQFunction(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 40 | 41 | def forward(self, obs, act): 42 | q = self.q(torch.cat([obs, act], dim=-1)) 43 | return torch.squeeze(q, -1) # Critical to ensure q has right shape. 44 | 45 | class MLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = MLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q1 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | self.q2 = MLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 59 | 60 | def act(self, obs): 61 | with torch.no_grad(): 62 | return self.pi(obs).numpy() 63 | -------------------------------------------------------------------------------- /spinup/algos/pytorch/trpo/trpo.py: -------------------------------------------------------------------------------- 1 | def trpo(*args, **kwargs): 2 | print('\n\nUnfortunately, TRPO has not yet been implemented in PyTorch '\ 3 | + 'for Spinning Up. 
TRPO will migrate some time in the future.\n\n') 4 | raise NotImplementedError -------------------------------------------------------------------------------- /spinup/algos/pytorch/vpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | from gym.spaces import Box, Discrete 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.distributions.normal import Normal 8 | from torch.distributions.categorical import Categorical 9 | 10 | 11 | def combined_shape(length, shape=None): 12 | if shape is None: 13 | return (length,) 14 | return (length, shape) if np.isscalar(shape) else (length, *shape) 15 | 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | def count_vars(module): 26 | return sum([np.prod(p.shape) for p in module.parameters()]) 27 | 28 | 29 | def discount_cumsum(x, discount): 30 | """ 31 | magic from rllab for computing discounted cumulative sums of vectors. 32 | 33 | input: 34 | vector x, 35 | [x0, 36 | x1, 37 | x2] 38 | 39 | output: 40 | [x0 + discount * x1 + discount^2 * x2, 41 | x1 + discount * x2, 42 | x2] 43 | """ 44 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 45 | 46 | 47 | class Actor(nn.Module): 48 | 49 | def _distribution(self, obs): 50 | raise NotImplementedError 51 | 52 | def _log_prob_from_distribution(self, pi, act): 53 | raise NotImplementedError 54 | 55 | def forward(self, obs, act=None): 56 | # Produce action distributions for given observations, and 57 | # optionally compute the log likelihood of given actions under 58 | # those distributions. 59 | pi = self._distribution(obs) 60 | logp_a = None 61 | if act is not None: 62 | logp_a = self._log_prob_from_distribution(pi, act) 63 | return pi, logp_a 64 | 65 | 66 | class MLPCategoricalActor(Actor): 67 | 68 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 69 | super().__init__() 70 | self.logits_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 71 | 72 | def _distribution(self, obs): 73 | logits = self.logits_net(obs) 74 | return Categorical(logits=logits) 75 | 76 | def _log_prob_from_distribution(self, pi, act): 77 | return pi.log_prob(act) 78 | 79 | 80 | class MLPGaussianActor(Actor): 81 | 82 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 83 | super().__init__() 84 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 85 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 86 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 87 | 88 | def _distribution(self, obs): 89 | mu = self.mu_net(obs) 90 | std = torch.exp(self.log_std) 91 | return Normal(mu, std) 92 | 93 | def _log_prob_from_distribution(self, pi, act): 94 | return pi.log_prob(act).sum(axis=-1) # Last axis sum needed for Torch Normal distribution 95 | 96 | 97 | class MLPCritic(nn.Module): 98 | 99 | def __init__(self, obs_dim, hidden_sizes, activation): 100 | super().__init__() 101 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 102 | 103 | def forward(self, obs): 104 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 
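# Note on the .sum(axis=-1) in MLPGaussianActor above (illustrative sketch):
# torch.distributions.Normal treats each action dimension as an independent
# Normal, so log_prob returns per-dimension values of shape (batch, act_dim);
# summing over the last axis gives the joint log-likelihood of the whole action
# vector, matching the (batch,) shape that Categorical.log_prob already returns.
def _gaussian_logp_shape_demo(batch=3, act_dim=2):
    pi = Normal(torch.zeros(batch, act_dim), torch.ones(batch, act_dim))
    act = pi.sample()
    per_dim = pi.log_prob(act)       # shape (batch, act_dim)
    joint = per_dim.sum(axis=-1)     # shape (batch,)
    return per_dim.shape, joint.shape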
105 | 106 | 107 | 108 | class MLPActorCritic(nn.Module): 109 | 110 | 111 | def __init__(self, observation_space, action_space, 112 | hidden_sizes=(64,64), activation=nn.Tanh): 113 | super().__init__() 114 | 115 | obs_dim = observation_space.shape[0] 116 | 117 | # policy builder depends on action space 118 | if isinstance(action_space, Box): 119 | self.pi = MLPGaussianActor(obs_dim, action_space.shape[0], hidden_sizes, activation) 120 | elif isinstance(action_space, Discrete): 121 | self.pi = MLPCategoricalActor(obs_dim, action_space.n, hidden_sizes, activation) 122 | 123 | # build value function 124 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 125 | 126 | def step(self, obs): 127 | with torch.no_grad(): 128 | pi = self.pi._distribution(obs) 129 | a = pi.sample() 130 | logp_a = self.pi._log_prob_from_distribution(pi, a) 131 | v = self.v(obs) 132 | return a.numpy(), v.numpy(), logp_a.numpy() 133 | 134 | def act(self, obs): 135 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinup/algos/tf1/ddpg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/ddpg/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/ddpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 7 | 8 | def placeholders(*args): 9 | return [placeholder(dim) for dim in args] 10 | 11 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 12 | for h in hidden_sizes[:-1]: 13 | x = tf.layers.dense(x, units=h, activation=activation) 14 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 15 | 16 | def get_vars(scope): 17 | return [x for x in tf.global_variables() if scope in x.name] 18 | 19 | def count_vars(scope): 20 | v = get_vars(scope) 21 | return sum([np.prod(var.shape.as_list()) for var in v]) 22 | 23 | """ 24 | Actor-Critics 25 | """ 26 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 27 | output_activation=tf.tanh, action_space=None): 28 | act_dim = a.shape.as_list()[-1] 29 | act_limit = action_space.high[0] 30 | with tf.variable_scope('pi'): 31 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 32 | with tf.variable_scope('q'): 33 | q = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 34 | with tf.variable_scope('q', reuse=True): 35 | q_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 36 | return pi, q, q_pi 37 | -------------------------------------------------------------------------------- /spinup/algos/tf1/ppo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/ppo/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/ppo/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 
| EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def placeholder(dim=None): 14 | return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 15 | 16 | def placeholders(*args): 17 | return [placeholder(dim) for dim in args] 18 | 19 | def placeholder_from_space(space): 20 | if isinstance(space, Box): 21 | return placeholder(space.shape) 22 | elif isinstance(space, Discrete): 23 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 24 | raise NotImplementedError 25 | 26 | def placeholders_from_spaces(*args): 27 | return [placeholder_from_space(space) for space in args] 28 | 29 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 30 | for h in hidden_sizes[:-1]: 31 | x = tf.layers.dense(x, units=h, activation=activation) 32 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 33 | 34 | def get_vars(scope=''): 35 | return [x for x in tf.trainable_variables() if scope in x.name] 36 | 37 | def count_vars(scope=''): 38 | v = get_vars(scope) 39 | return sum([np.prod(var.shape.as_list()) for var in v]) 40 | 41 | def gaussian_likelihood(x, mu, log_std): 42 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 43 | return tf.reduce_sum(pre_sum, axis=1) 44 | 45 | def discount_cumsum(x, discount): 46 | """ 47 | magic from rllab for computing discounted cumulative sums of vectors. 48 | 49 | input: 50 | vector x, 51 | [x0, 52 | x1, 53 | x2] 54 | 55 | output: 56 | [x0 + discount * x1 + discount^2 * x2, 57 | x1 + discount * x2, 58 | x2] 59 | """ 60 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 61 | 62 | 63 | """ 64 | Policies 65 | """ 66 | 67 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 68 | act_dim = action_space.n 69 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 70 | logp_all = tf.nn.log_softmax(logits) 71 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 72 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 73 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 74 | return pi, logp, logp_pi 75 | 76 | 77 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 78 | act_dim = a.shape.as_list()[-1] 79 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 80 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 81 | std = tf.exp(log_std) 82 | pi = mu + tf.random_normal(tf.shape(mu)) * std 83 | logp = gaussian_likelihood(a, mu, log_std) 84 | logp_pi = gaussian_likelihood(pi, mu, log_std) 85 | return pi, logp, logp_pi 86 | 87 | 88 | """ 89 | Actor-Critics 90 | """ 91 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 92 | output_activation=None, policy=None, action_space=None): 93 | 94 | # default policy builder depends on action space 95 | if policy is None and isinstance(action_space, Box): 96 | policy = mlp_gaussian_policy 97 | elif policy is None and isinstance(action_space, Discrete): 98 | policy = mlp_categorical_policy 99 | 100 | with tf.variable_scope('pi'): 101 | pi, logp, logp_pi = policy(x, a, hidden_sizes, activation, output_activation, action_space) 102 | with tf.variable_scope('v'): 103 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 104 | return pi, logp, logp_pi, v 105 | 
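# What the lfilter call in discount_cumsum above computes (illustrative
# reference implementation, not used by the algorithm): a right-to-left running
# sum in which each additional step of lookahead is discounted once more.
def _discount_cumsum_naive(x, discount):
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out
# e.g. _discount_cumsum_naive([1., 1., 1.], 0.5) gives [1.75, 1.5, 1.0],
# matching discount_cumsum(np.array([1., 1., 1.]), 0.5).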
-------------------------------------------------------------------------------- /spinup/algos/tf1/sac/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/sac/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/sac/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | EPS = 1e-8 5 | 6 | def placeholder(dim=None): 7 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 8 | 9 | def placeholders(*args): 10 | return [placeholder(dim) for dim in args] 11 | 12 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 13 | for h in hidden_sizes[:-1]: 14 | x = tf.layers.dense(x, units=h, activation=activation) 15 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 16 | 17 | def get_vars(scope): 18 | return [x for x in tf.global_variables() if scope in x.name] 19 | 20 | def count_vars(scope): 21 | v = get_vars(scope) 22 | return sum([np.prod(var.shape.as_list()) for var in v]) 23 | 24 | def gaussian_likelihood(x, mu, log_std): 25 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 26 | return tf.reduce_sum(pre_sum, axis=1) 27 | 28 | 29 | """ 30 | Policies 31 | """ 32 | 33 | LOG_STD_MAX = 2 34 | LOG_STD_MIN = -20 35 | 36 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation): 37 | act_dim = a.shape.as_list()[-1] 38 | net = mlp(x, list(hidden_sizes), activation, activation) 39 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 40 | log_std = tf.layers.dense(net, act_dim, activation=None) 41 | log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX) 42 | 43 | std = tf.exp(log_std) 44 | pi = mu + tf.random_normal(tf.shape(mu)) * std 45 | logp_pi = gaussian_likelihood(pi, mu, log_std) 46 | return mu, pi, logp_pi 47 | 48 | def apply_squashing_func(mu, pi, logp_pi): 49 | # Adjustment to log prob 50 | # NOTE: This formula is a little bit magic. To get an understanding of where it 51 | # comes from, check out the original SAC paper (arXiv 1801.01290) and look in 52 | # appendix C. This is a more numerically-stable equivalent to Eq 21. 53 | # Try deriving it yourself as a (very difficult) exercise. :) 54 | logp_pi -= tf.reduce_sum(2*(np.log(2) - pi - tf.nn.softplus(-2*pi)), axis=1) 55 | 56 | # Squash those unbounded actions! 
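    # (tanh maps the unbounded Gaussian sample into (-1, 1); rescaling to the
    # environment's actual bounds happens in mlp_actor_critic below via
    # action_space.high[0].)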
57 | mu = tf.tanh(mu) 58 | pi = tf.tanh(pi) 59 | return mu, pi, logp_pi 60 | 61 | """ 62 | Actor-Critics 63 | """ 64 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 65 | output_activation=None, policy=mlp_gaussian_policy, action_space=None): 66 | # policy 67 | with tf.variable_scope('pi'): 68 | mu, pi, logp_pi = policy(x, a, hidden_sizes, activation, output_activation) 69 | mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi) 70 | 71 | # make sure actions are in correct range 72 | action_scale = action_space.high[0] 73 | mu *= action_scale 74 | pi *= action_scale 75 | 76 | # vfs 77 | vf_mlp = lambda x : tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 78 | with tf.variable_scope('q1'): 79 | q1 = vf_mlp(tf.concat([x,a], axis=-1)) 80 | with tf.variable_scope('q2'): 81 | q2 = vf_mlp(tf.concat([x,a], axis=-1)) 82 | return mu, pi, logp_pi, q1, q2 83 | -------------------------------------------------------------------------------- /spinup/algos/tf1/td3/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/td3/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/td3/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def placeholder(dim=None): 6 | return tf.placeholder(dtype=tf.float32, shape=(None,dim) if dim else (None,)) 7 | 8 | def placeholders(*args): 9 | return [placeholder(dim) for dim in args] 10 | 11 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 12 | for h in hidden_sizes[:-1]: 13 | x = tf.layers.dense(x, units=h, activation=activation) 14 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 15 | 16 | def get_vars(scope): 17 | return [x for x in tf.global_variables() if scope in x.name] 18 | 19 | def count_vars(scope): 20 | v = get_vars(scope) 21 | return sum([np.prod(var.shape.as_list()) for var in v]) 22 | 23 | """ 24 | Actor-Critics 25 | """ 26 | def mlp_actor_critic(x, a, hidden_sizes=(256,256), activation=tf.nn.relu, 27 | output_activation=tf.tanh, action_space=None): 28 | act_dim = a.shape.as_list()[-1] 29 | act_limit = action_space.high[0] 30 | with tf.variable_scope('pi'): 31 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 32 | with tf.variable_scope('q1'): 33 | q1 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 34 | with tf.variable_scope('q2'): 35 | q2 = tf.squeeze(mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 36 | with tf.variable_scope('q1', reuse=True): 37 | q1_pi = tf.squeeze(mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None), axis=1) 38 | return pi, q1, q2, q1_pi 39 | -------------------------------------------------------------------------------- /spinup/algos/tf1/trpo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/trpo/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/trpo/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import 
tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 | EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def keys_as_sorted_list(dict): 14 | return sorted(list(dict.keys())) 15 | 16 | def values_as_sorted_list(dict): 17 | return [dict[k] for k in keys_as_sorted_list(dict)] 18 | 19 | def placeholder(dim=None): 20 | return tf.placeholder(dtype=tf.float32, shape=combined_shape(None,dim)) 21 | 22 | def placeholders(*args): 23 | return [placeholder(dim) for dim in args] 24 | 25 | def placeholder_from_space(space): 26 | if isinstance(space, Box): 27 | return placeholder(space.shape) 28 | elif isinstance(space, Discrete): 29 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 30 | raise NotImplementedError 31 | 32 | def placeholders_from_spaces(*args): 33 | return [placeholder_from_space(space) for space in args] 34 | 35 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 36 | for h in hidden_sizes[:-1]: 37 | x = tf.layers.dense(x, units=h, activation=activation) 38 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 39 | 40 | def get_vars(scope=''): 41 | return [x for x in tf.trainable_variables() if scope in x.name] 42 | 43 | def count_vars(scope=''): 44 | v = get_vars(scope) 45 | return sum([np.prod(var.shape.as_list()) for var in v]) 46 | 47 | def gaussian_likelihood(x, mu, log_std): 48 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 49 | return tf.reduce_sum(pre_sum, axis=1) 50 | 51 | def diagonal_gaussian_kl(mu0, log_std0, mu1, log_std1): 52 | """ 53 | tf symbol for mean KL divergence between two batches of diagonal gaussian distributions, 54 | where distributions are specified by means and log stds. 55 | (https://en.wikipedia.org/wiki/Kullback-Leibler_divergence#Multivariate_normal_distributions) 56 | """ 57 | var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1) 58 | pre_sum = 0.5*(((mu1- mu0)**2 + var0)/(var1 + EPS) - 1) + log_std1 - log_std0 59 | all_kls = tf.reduce_sum(pre_sum, axis=1) 60 | return tf.reduce_mean(all_kls) 61 | 62 | def categorical_kl(logp0, logp1): 63 | """ 64 | tf symbol for mean KL divergence between two batches of categorical probability distributions, 65 | where the distributions are input as log probs. 66 | """ 67 | all_kls = tf.reduce_sum(tf.exp(logp1) * (logp1 - logp0), axis=1) 68 | return tf.reduce_mean(all_kls) 69 | 70 | def flat_concat(xs): 71 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 72 | 73 | def flat_grad(f, params): 74 | return flat_concat(tf.gradients(xs=params, ys=f)) 75 | 76 | def hessian_vector_product(f, params): 77 | # for H = grad**2 f, compute Hx 78 | g = flat_grad(f, params) 79 | x = tf.placeholder(tf.float32, shape=g.shape) 80 | return x, flat_grad(tf.reduce_sum(g*x), params) 81 | 82 | def assign_params_from_flat(x, params): 83 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 84 | splits = tf.split(x, [flat_size(p) for p in params]) 85 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 86 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 87 | 88 | def discount_cumsum(x, discount): 89 | """ 90 | magic from rllab for computing discounted cumulative sums of vectors. 
91 | 92 | input: 93 | vector x, 94 | [x0, 95 | x1, 96 | x2] 97 | 98 | output: 99 | [x0 + discount * x1 + discount^2 * x2, 100 | x1 + discount * x2, 101 | x2] 102 | """ 103 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 104 | 105 | """ 106 | Policies 107 | """ 108 | 109 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 110 | act_dim = action_space.n 111 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 112 | logp_all = tf.nn.log_softmax(logits) 113 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 114 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 115 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 116 | 117 | old_logp_all = placeholder(act_dim) 118 | d_kl = categorical_kl(logp_all, old_logp_all) 119 | 120 | info = {'logp_all': logp_all} 121 | info_phs = {'logp_all': old_logp_all} 122 | 123 | return pi, logp, logp_pi, info, info_phs, d_kl 124 | 125 | 126 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 127 | act_dim = a.shape.as_list()[-1] 128 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 129 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 130 | std = tf.exp(log_std) 131 | pi = mu + tf.random_normal(tf.shape(mu)) * std 132 | logp = gaussian_likelihood(a, mu, log_std) 133 | logp_pi = gaussian_likelihood(pi, mu, log_std) 134 | 135 | old_mu_ph, old_log_std_ph = placeholders(act_dim, act_dim) 136 | d_kl = diagonal_gaussian_kl(mu, log_std, old_mu_ph, old_log_std_ph) 137 | 138 | info = {'mu': mu, 'log_std': log_std} 139 | info_phs = {'mu': old_mu_ph, 'log_std': old_log_std_ph} 140 | 141 | return pi, logp, logp_pi, info, info_phs, d_kl 142 | 143 | 144 | """ 145 | Actor-Critics 146 | """ 147 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 148 | output_activation=None, policy=None, action_space=None): 149 | 150 | # default policy builder depends on action space 151 | if policy is None and isinstance(action_space, Box): 152 | policy = mlp_gaussian_policy 153 | elif policy is None and isinstance(action_space, Discrete): 154 | policy = mlp_categorical_policy 155 | 156 | with tf.variable_scope('pi'): 157 | policy_outs = policy(x, a, hidden_sizes, activation, output_activation, action_space) 158 | pi, logp, logp_pi, info, info_phs, d_kl = policy_outs 159 | with tf.variable_scope('v'): 160 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 161 | return pi, logp, logp_pi, info, info_phs, d_kl, v 162 | -------------------------------------------------------------------------------- /spinup/algos/tf1/vpg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/algos/tf1/vpg/__init__.py -------------------------------------------------------------------------------- /spinup/algos/tf1/vpg/core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import scipy.signal 4 | from gym.spaces import Box, Discrete 5 | 6 | EPS = 1e-8 7 | 8 | def combined_shape(length, shape=None): 9 | if shape is None: 10 | return (length,) 11 | return (length, shape) if np.isscalar(shape) else (length, *shape) 12 | 13 | def placeholder(dim=None): 14 | return tf.placeholder(dtype=tf.float32, 
shape=combined_shape(None,dim)) 15 | 16 | def placeholders(*args): 17 | return [placeholder(dim) for dim in args] 18 | 19 | def placeholder_from_space(space): 20 | if isinstance(space, Box): 21 | return placeholder(space.shape) 22 | elif isinstance(space, Discrete): 23 | return tf.placeholder(dtype=tf.int32, shape=(None,)) 24 | raise NotImplementedError 25 | 26 | def placeholders_from_spaces(*args): 27 | return [placeholder_from_space(space) for space in args] 28 | 29 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 30 | for h in hidden_sizes[:-1]: 31 | x = tf.layers.dense(x, units=h, activation=activation) 32 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 33 | 34 | def get_vars(scope=''): 35 | return [x for x in tf.trainable_variables() if scope in x.name] 36 | 37 | def count_vars(scope=''): 38 | v = get_vars(scope) 39 | return sum([np.prod(var.shape.as_list()) for var in v]) 40 | 41 | def gaussian_likelihood(x, mu, log_std): 42 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 43 | return tf.reduce_sum(pre_sum, axis=1) 44 | 45 | def discount_cumsum(x, discount): 46 | """ 47 | magic from rllab for computing discounted cumulative sums of vectors. 48 | 49 | input: 50 | vector x, 51 | [x0, 52 | x1, 53 | x2] 54 | 55 | output: 56 | [x0 + discount * x1 + discount^2 * x2, 57 | x1 + discount * x2, 58 | x2] 59 | """ 60 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 61 | 62 | 63 | """ 64 | Policies 65 | """ 66 | 67 | def mlp_categorical_policy(x, a, hidden_sizes, activation, output_activation, action_space): 68 | act_dim = action_space.n 69 | logits = mlp(x, list(hidden_sizes)+[act_dim], activation, None) 70 | logp_all = tf.nn.log_softmax(logits) 71 | pi = tf.squeeze(tf.multinomial(logits,1), axis=1) 72 | logp = tf.reduce_sum(tf.one_hot(a, depth=act_dim) * logp_all, axis=1) 73 | logp_pi = tf.reduce_sum(tf.one_hot(pi, depth=act_dim) * logp_all, axis=1) 74 | return pi, logp, logp_pi 75 | 76 | 77 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 78 | act_dim = a.shape.as_list()[-1] 79 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 80 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 81 | std = tf.exp(log_std) 82 | pi = mu + tf.random_normal(tf.shape(mu)) * std 83 | logp = gaussian_likelihood(a, mu, log_std) 84 | logp_pi = gaussian_likelihood(pi, mu, log_std) 85 | return pi, logp, logp_pi 86 | 87 | 88 | """ 89 | Actor-Critics 90 | """ 91 | def mlp_actor_critic(x, a, hidden_sizes=(64,64), activation=tf.tanh, 92 | output_activation=None, policy=None, action_space=None): 93 | 94 | # default policy builder depends on action space 95 | if policy is None and isinstance(action_space, Box): 96 | policy = mlp_gaussian_policy 97 | elif policy is None and isinstance(action_space, Discrete): 98 | policy = mlp_categorical_policy 99 | 100 | with tf.variable_scope('pi'): 101 | pi, logp, logp_pi = policy(x, a, hidden_sizes, activation, output_activation, action_space) 102 | with tf.variable_scope('v'): 103 | v = tf.squeeze(mlp(x, list(hidden_sizes)+[1], activation, None), axis=1) 104 | return pi, logp, logp_pi, v 105 | -------------------------------------------------------------------------------- /spinup/examples/pytorch/bench_ppo_cartpole.py: -------------------------------------------------------------------------------- 1 | from spinup.utils.run_utils import 
ExperimentGrid 2 | from spinup import ppo_pytorch 3 | import torch 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--cpu', type=int, default=4) 9 | parser.add_argument('--num_runs', type=int, default=3) 10 | args = parser.parse_args() 11 | 12 | eg = ExperimentGrid(name='ppo-pyt-bench') 13 | eg.add('env_name', 'CartPole-v0', '', True) 14 | eg.add('seed', [10*i for i in range(args.num_runs)]) 15 | eg.add('epochs', 10) 16 | eg.add('steps_per_epoch', 4000) 17 | eg.add('ac_kwargs:hidden_sizes', [(32,), (64,64)], 'hid') 18 | eg.add('ac_kwargs:activation', [torch.nn.Tanh, torch.nn.ReLU], '') 19 | eg.run(ppo_pytorch, num_cpu=args.cpu) -------------------------------------------------------------------------------- /spinup/examples/pytorch/pg_math/1_simple_pg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions.categorical import Categorical 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | from gym.spaces import Discrete, Box 8 | 9 | def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity): 10 | # Build a feedforward neural network. 11 | layers = [] 12 | for j in range(len(sizes)-1): 13 | act = activation if j < len(sizes)-2 else output_activation 14 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 15 | return nn.Sequential(*layers) 16 | 17 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 18 | epochs=50, batch_size=5000, render=False): 19 | 20 | # make environment, check spaces, get obs / act dims 21 | env = gym.make(env_name) 22 | assert isinstance(env.observation_space, Box), \ 23 | "This example only works for envs with continuous state spaces." 24 | assert isinstance(env.action_space, Discrete), \ 25 | "This example only works for envs with discrete action spaces." 26 | 27 | obs_dim = env.observation_space.shape[0] 28 | n_acts = env.action_space.n 29 | 30 | # make core of policy network 31 | logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts]) 32 | 33 | # make function to compute action distribution 34 | def get_policy(obs): 35 | logits = logits_net(obs) 36 | return Categorical(logits=logits) 37 | 38 | # make action selection function (outputs int actions, sampled from policy) 39 | def get_action(obs): 40 | return get_policy(obs).sample().item() 41 | 42 | # make loss function whose gradient, for the right data, is policy gradient 43 | def compute_loss(obs, act, weights): 44 | logp = get_policy(obs).log_prob(act) 45 | return -(logp * weights).mean() 46 | 47 | # make optimizer 48 | optimizer = Adam(logits_net.parameters(), lr=lr) 49 | 50 | # for training policy 51 | def train_one_epoch(): 52 | # make some empty lists for logging. 
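        # (Each list below grows by one entry per environment step; at the end
        # of the epoch they are converted to tensors and fed to compute_loss
        # above, whose gradient is the basic policy-gradient estimate: the
        # average over timesteps of R(tau) * grad log pi(a_t | s_t).)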
53 | batch_obs = [] # for observations 54 | batch_acts = [] # for actions 55 | batch_weights = [] # for R(tau) weighting in policy gradient 56 | batch_rets = [] # for measuring episode returns 57 | batch_lens = [] # for measuring episode lengths 58 | 59 | # reset episode-specific variables 60 | obs = env.reset() # first obs comes from starting distribution 61 | done = False # signal from environment that episode is over 62 | ep_rews = [] # list for rewards accrued throughout ep 63 | 64 | # render first episode of each epoch 65 | finished_rendering_this_epoch = False 66 | 67 | # collect experience by acting in the environment with current policy 68 | while True: 69 | 70 | # rendering 71 | if (not finished_rendering_this_epoch) and render: 72 | env.render() 73 | 74 | # save obs 75 | batch_obs.append(obs.copy()) 76 | 77 | # act in the environment 78 | act = get_action(torch.as_tensor(obs, dtype=torch.float32)) 79 | obs, rew, done, _ = env.step(act) 80 | 81 | # save action, reward 82 | batch_acts.append(act) 83 | ep_rews.append(rew) 84 | 85 | if done: 86 | # if episode is over, record info about episode 87 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 88 | batch_rets.append(ep_ret) 89 | batch_lens.append(ep_len) 90 | 91 | # the weight for each logprob(a|s) is R(tau) 92 | batch_weights += [ep_ret] * ep_len 93 | 94 | # reset episode-specific variables 95 | obs, done, ep_rews = env.reset(), False, [] 96 | 97 | # won't render again this epoch 98 | finished_rendering_this_epoch = True 99 | 100 | # end experience loop if we have enough of it 101 | if len(batch_obs) > batch_size: 102 | break 103 | 104 | # take a single policy gradient update step 105 | optimizer.zero_grad() 106 | batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32), 107 | act=torch.as_tensor(batch_acts, dtype=torch.int32), 108 | weights=torch.as_tensor(batch_weights, dtype=torch.float32) 109 | ) 110 | batch_loss.backward() 111 | optimizer.step() 112 | return batch_loss, batch_rets, batch_lens 113 | 114 | # training loop 115 | for i in range(epochs): 116 | batch_loss, batch_rets, batch_lens = train_one_epoch() 117 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 118 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 119 | 120 | if __name__ == '__main__': 121 | import argparse 122 | parser = argparse.ArgumentParser() 123 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 124 | parser.add_argument('--render', action='store_true') 125 | parser.add_argument('--lr', type=float, default=1e-2) 126 | args = parser.parse_args() 127 | print('\nUsing simplest formulation of policy gradient.\n') 128 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinup/examples/pytorch/pg_math/2_rtg_pg.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions.categorical import Categorical 4 | from torch.optim import Adam 5 | import numpy as np 6 | import gym 7 | from gym.spaces import Discrete, Box 8 | 9 | def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity): 10 | # Build a feedforward neural network. 
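    # (e.g. mlp([4, 32, 2]) yields Linear(4, 32) -> Tanh -> Linear(32, 2) ->
    # Identity: hidden layers use `activation`, the final layer uses
    # `output_activation`.)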
11 | layers = [] 12 | for j in range(len(sizes)-1): 13 | act = activation if j < len(sizes)-2 else output_activation 14 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 15 | return nn.Sequential(*layers) 16 | 17 | def reward_to_go(rews): 18 | n = len(rews) 19 | rtgs = np.zeros_like(rews) 20 | for i in reversed(range(n)): 21 | rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) 22 | return rtgs 23 | 24 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 25 | epochs=50, batch_size=5000, render=False): 26 | 27 | # make environment, check spaces, get obs / act dims 28 | env = gym.make(env_name) 29 | assert isinstance(env.observation_space, Box), \ 30 | "This example only works for envs with continuous state spaces." 31 | assert isinstance(env.action_space, Discrete), \ 32 | "This example only works for envs with discrete action spaces." 33 | 34 | obs_dim = env.observation_space.shape[0] 35 | n_acts = env.action_space.n 36 | 37 | # make core of policy network 38 | logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts]) 39 | 40 | # make function to compute action distribution 41 | def get_policy(obs): 42 | logits = logits_net(obs) 43 | return Categorical(logits=logits) 44 | 45 | # make action selection function (outputs int actions, sampled from policy) 46 | def get_action(obs): 47 | return get_policy(obs).sample().item() 48 | 49 | # make loss function whose gradient, for the right data, is policy gradient 50 | def compute_loss(obs, act, weights): 51 | logp = get_policy(obs).log_prob(act) 52 | return -(logp * weights).mean() 53 | 54 | # make optimizer 55 | optimizer = Adam(logits_net.parameters(), lr=lr) 56 | 57 | # for training policy 58 | def train_one_epoch(): 59 | # make some empty lists for logging. 60 | batch_obs = [] # for observations 61 | batch_acts = [] # for actions 62 | batch_weights = [] # for reward-to-go weighting in policy gradient 63 | batch_rets = [] # for measuring episode returns 64 | batch_lens = [] # for measuring episode lengths 65 | 66 | # reset episode-specific variables 67 | obs = env.reset() # first obs comes from starting distribution 68 | done = False # signal from environment that episode is over 69 | ep_rews = [] # list for rewards accrued throughout ep 70 | 71 | # render first episode of each epoch 72 | finished_rendering_this_epoch = False 73 | 74 | # collect experience by acting in the environment with current policy 75 | while True: 76 | 77 | # rendering 78 | if (not finished_rendering_this_epoch) and render: 79 | env.render() 80 | 81 | # save obs 82 | batch_obs.append(obs.copy()) 83 | 84 | # act in the environment 85 | act = get_action(torch.as_tensor(obs, dtype=torch.float32)) 86 | obs, rew, done, _ = env.step(act) 87 | 88 | # save action, reward 89 | batch_acts.append(act) 90 | ep_rews.append(rew) 91 | 92 | if done: 93 | # if episode is over, record info about episode 94 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 95 | batch_rets.append(ep_ret) 96 | batch_lens.append(ep_len) 97 | 98 | # the weight for each logprob(a_t|s_t) is reward-to-go from t 99 | batch_weights += list(reward_to_go(ep_rews)) 100 | 101 | # reset episode-specific variables 102 | obs, done, ep_rews = env.reset(), False, [] 103 | 104 | # won't render again this epoch 105 | finished_rendering_this_epoch = True 106 | 107 | # end experience loop if we have enough of it 108 | if len(batch_obs) > batch_size: 109 | break 110 | 111 | # take a single policy gradient update step 112 | optimizer.zero_grad() 113 | batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, 
dtype=torch.float32), 114 | act=torch.as_tensor(batch_acts, dtype=torch.int32), 115 | weights=torch.as_tensor(batch_weights, dtype=torch.float32) 116 | ) 117 | batch_loss.backward() 118 | optimizer.step() 119 | return batch_loss, batch_rets, batch_lens 120 | 121 | # training loop 122 | for i in range(epochs): 123 | batch_loss, batch_rets, batch_lens = train_one_epoch() 124 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 125 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 126 | 127 | if __name__ == '__main__': 128 | import argparse 129 | parser = argparse.ArgumentParser() 130 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 131 | parser.add_argument('--render', action='store_true') 132 | parser.add_argument('--lr', type=float, default=1e-2) 133 | args = parser.parse_args() 134 | print('\nUsing reward-to-go formulation of policy gradient.\n') 135 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinup/examples/tf1/bench_ppo_cartpole.py: -------------------------------------------------------------------------------- 1 | from spinup.utils.run_utils import ExperimentGrid 2 | from spinup import ppo_tf1 3 | import tensorflow as tf 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--cpu', type=int, default=4) 9 | parser.add_argument('--num_runs', type=int, default=3) 10 | args = parser.parse_args() 11 | 12 | eg = ExperimentGrid(name='ppo-tf1-bench') 13 | eg.add('env_name', 'CartPole-v0', '', True) 14 | eg.add('seed', [10*i for i in range(args.num_runs)]) 15 | eg.add('epochs', 10) 16 | eg.add('steps_per_epoch', 4000) 17 | eg.add('ac_kwargs:hidden_sizes', [(32,), (64,64)], 'hid') 18 | eg.add('ac_kwargs:activation', [tf.tanh, tf.nn.relu], '') 19 | eg.run(ppo_tf1, num_cpu=args.cpu) -------------------------------------------------------------------------------- /spinup/examples/tf1/pg_math/1_simple_pg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | from gym.spaces import Discrete, Box 5 | 6 | def mlp(x, sizes, activation=tf.tanh, output_activation=None): 7 | # Build a feedforward neural network. 8 | for size in sizes[:-1]: 9 | x = tf.layers.dense(x, units=size, activation=activation) 10 | return tf.layers.dense(x, units=sizes[-1], activation=output_activation) 11 | 12 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 13 | epochs=50, batch_size=5000, render=False): 14 | 15 | # make environment, check spaces, get obs / act dims 16 | env = gym.make(env_name) 17 | assert isinstance(env.observation_space, Box), \ 18 | "This example only works for envs with continuous state spaces." 19 | assert isinstance(env.action_space, Discrete), \ 20 | "This example only works for envs with discrete action spaces." 
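    # (For the default CartPole-v0, observation_space is a Box of shape (4,)
    # and action_space is Discrete(2), so obs_dim=4 and n_acts=2 below.)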
21 | 22 | obs_dim = env.observation_space.shape[0] 23 | n_acts = env.action_space.n 24 | 25 | # make core of policy network 26 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32) 27 | logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts]) 28 | 29 | # make action selection op (outputs int actions, sampled from policy) 30 | actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1) 31 | 32 | # make loss function whose gradient, for the right data, is policy gradient 33 | weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32) 34 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32) 35 | action_masks = tf.one_hot(act_ph, n_acts) 36 | log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1) 37 | loss = -tf.reduce_mean(weights_ph * log_probs) 38 | 39 | # make train op 40 | train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) 41 | 42 | sess = tf.InteractiveSession() 43 | sess.run(tf.global_variables_initializer()) 44 | 45 | # for training policy 46 | def train_one_epoch(): 47 | # make some empty lists for logging. 48 | batch_obs = [] # for observations 49 | batch_acts = [] # for actions 50 | batch_weights = [] # for R(tau) weighting in policy gradient 51 | batch_rets = [] # for measuring episode returns 52 | batch_lens = [] # for measuring episode lengths 53 | 54 | # reset episode-specific variables 55 | obs = env.reset() # first obs comes from starting distribution 56 | done = False # signal from environment that episode is over 57 | ep_rews = [] # list for rewards accrued throughout ep 58 | 59 | # render first episode of each epoch 60 | finished_rendering_this_epoch = False 61 | 62 | # collect experience by acting in the environment with current policy 63 | while True: 64 | 65 | # rendering 66 | if (not finished_rendering_this_epoch) and render: 67 | env.render() 68 | 69 | # save obs 70 | batch_obs.append(obs.copy()) 71 | 72 | # act in the environment 73 | act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0] 74 | obs, rew, done, _ = env.step(act) 75 | 76 | # save action, reward 77 | batch_acts.append(act) 78 | ep_rews.append(rew) 79 | 80 | if done: 81 | # if episode is over, record info about episode 82 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 83 | batch_rets.append(ep_ret) 84 | batch_lens.append(ep_len) 85 | 86 | # the weight for each logprob(a|s) is R(tau) 87 | batch_weights += [ep_ret] * ep_len 88 | 89 | # reset episode-specific variables 90 | obs, done, ep_rews = env.reset(), False, [] 91 | 92 | # won't render again this epoch 93 | finished_rendering_this_epoch = True 94 | 95 | # end experience loop if we have enough of it 96 | if len(batch_obs) > batch_size: 97 | break 98 | 99 | # take a single policy gradient update step 100 | batch_loss, _ = sess.run([loss, train_op], 101 | feed_dict={ 102 | obs_ph: np.array(batch_obs), 103 | act_ph: np.array(batch_acts), 104 | weights_ph: np.array(batch_weights) 105 | }) 106 | return batch_loss, batch_rets, batch_lens 107 | 108 | # training loop 109 | for i in range(epochs): 110 | batch_loss, batch_rets, batch_lens = train_one_epoch() 111 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 112 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 113 | 114 | if __name__ == '__main__': 115 | import argparse 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 118 | parser.add_argument('--render', action='store_true') 119 | parser.add_argument('--lr', type=float, default=1e-2) 120 | args = 
parser.parse_args() 121 | print('\nUsing simplest formulation of policy gradient.\n') 122 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinup/examples/tf1/pg_math/2_rtg_pg.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | from gym.spaces import Discrete, Box 5 | 6 | def mlp(x, sizes, activation=tf.tanh, output_activation=None): 7 | # Build a feedforward neural network. 8 | for size in sizes[:-1]: 9 | x = tf.layers.dense(x, units=size, activation=activation) 10 | return tf.layers.dense(x, units=sizes[-1], activation=output_activation) 11 | 12 | def reward_to_go(rews): 13 | n = len(rews) 14 | rtgs = np.zeros_like(rews) 15 | for i in reversed(range(n)): 16 | rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) 17 | return rtgs 18 | 19 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 20 | epochs=50, batch_size=5000, render=False): 21 | 22 | # make environment, check spaces, get obs / act dims 23 | env = gym.make(env_name) 24 | assert isinstance(env.observation_space, Box), \ 25 | "This example only works for envs with continuous state spaces." 26 | assert isinstance(env.action_space, Discrete), \ 27 | "This example only works for envs with discrete action spaces." 28 | 29 | obs_dim = env.observation_space.shape[0] 30 | n_acts = env.action_space.n 31 | 32 | # make core of policy network 33 | obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32) 34 | logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts]) 35 | 36 | # make action selection op (outputs int actions, sampled from policy) 37 | actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1) 38 | 39 | # make loss function whose gradient, for the right data, is policy gradient 40 | weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32) 41 | act_ph = tf.placeholder(shape=(None,), dtype=tf.int32) 42 | action_masks = tf.one_hot(act_ph, n_acts) 43 | log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1) 44 | loss = -tf.reduce_mean(weights_ph * log_probs) 45 | 46 | # make train op 47 | train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss) 48 | 49 | sess = tf.InteractiveSession() 50 | sess.run(tf.global_variables_initializer()) 51 | 52 | # for training policy 53 | def train_one_epoch(): 54 | # make some empty lists for logging. 
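        # (Worked example of the weighting used below: reward_to_go above maps
        # rewards [1., 1., 1.] to [3., 2., 1.], so each logprob(a_t|s_t) is
        # weighted only by rewards earned from step t onward -- a lower-variance
        # alternative to weighting every step by the full return R(tau).)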
55 | batch_obs = [] # for observations 56 | batch_acts = [] # for actions 57 | batch_weights = [] # for reward-to-go weighting in policy gradient 58 | batch_rets = [] # for measuring episode returns 59 | batch_lens = [] # for measuring episode lengths 60 | 61 | # reset episode-specific variables 62 | obs = env.reset() # first obs comes from starting distribution 63 | done = False # signal from environment that episode is over 64 | ep_rews = [] # list for rewards accrued throughout ep 65 | 66 | # render first episode of each epoch 67 | finished_rendering_this_epoch = False 68 | 69 | # collect experience by acting in the environment with current policy 70 | while True: 71 | 72 | # rendering 73 | if (not finished_rendering_this_epoch) and render: 74 | env.render() 75 | 76 | # save obs 77 | batch_obs.append(obs.copy()) 78 | 79 | # act in the environment 80 | act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0] 81 | obs, rew, done, _ = env.step(act) 82 | 83 | # save action, reward 84 | batch_acts.append(act) 85 | ep_rews.append(rew) 86 | 87 | if done: 88 | # if episode is over, record info about episode 89 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 90 | batch_rets.append(ep_ret) 91 | batch_lens.append(ep_len) 92 | 93 | # the weight for each logprob(a_t|s_t) is reward-to-go from t 94 | batch_weights += list(reward_to_go(ep_rews)) 95 | 96 | # reset episode-specific variables 97 | obs, done, ep_rews = env.reset(), False, [] 98 | 99 | # won't render again this epoch 100 | finished_rendering_this_epoch = True 101 | 102 | # end experience loop if we have enough of it 103 | if len(batch_obs) > batch_size: 104 | break 105 | 106 | # take a single policy gradient update step 107 | batch_loss, _ = sess.run([loss, train_op], 108 | feed_dict={ 109 | obs_ph: np.array(batch_obs), 110 | act_ph: np.array(batch_acts), 111 | weights_ph: np.array(batch_weights) 112 | }) 113 | return batch_loss, batch_rets, batch_lens 114 | 115 | # training loop 116 | for i in range(epochs): 117 | batch_loss, batch_rets, batch_lens = train_one_epoch() 118 | print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'% 119 | (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens))) 120 | 121 | if __name__ == '__main__': 122 | import argparse 123 | parser = argparse.ArgumentParser() 124 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 125 | parser.add_argument('--render', action='store_true') 126 | parser.add_argument('--lr', type=float, default=1e-2) 127 | args = parser.parse_args() 128 | print('\nUsing reward-to-go formulation of policy gradient.\n') 129 | train(env_name=args.env_name, render=args.render, lr=args.lr) -------------------------------------------------------------------------------- /spinup/examples/tf1/train_mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | from spinup.utils.logx import EpochLogger 5 | 6 | 7 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 8 | for h in hidden_sizes[:-1]: 9 | x = tf.layers.dense(x, units=h, activation=activation) 10 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 11 | 12 | 13 | # Simple script for training an MLP on MNIST. 
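# (It also doubles as a compact demo of the EpochLogger workflow used throughout
# the RL algorithms: logger.store() accumulates per-step diagnostics,
# log_tabular()/dump_tabular() write one row of results per epoch, and
# setup_tf_saver()/save_state() handle model saving.)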
14 | def train_mnist(steps_per_epoch=100, epochs=5, 15 | lr=1e-3, layers=2, hidden_size=64, 16 | logger_kwargs=dict(), save_freq=1): 17 | 18 | logger = EpochLogger(**logger_kwargs) 19 | logger.save_config(locals()) 20 | 21 | # Load and preprocess MNIST data 22 | (x_train, y_train), _ = tf.keras.datasets.mnist.load_data() 23 | x_train = x_train.reshape(-1, 28*28) / 255.0 24 | 25 | # Define inputs & main outputs from computation graph 26 | x_ph = tf.placeholder(tf.float32, shape=(None, 28*28)) 27 | y_ph = tf.placeholder(tf.int32, shape=(None,)) 28 | logits = mlp(x_ph, hidden_sizes=[hidden_size]*layers + [10], activation=tf.nn.relu) 29 | predict = tf.argmax(logits, axis=1, output_type=tf.int32) 30 | 31 | # Define loss function, accuracy, and training op 32 | y = tf.one_hot(y_ph, 10) 33 | loss = tf.losses.softmax_cross_entropy(y, logits) 34 | acc = tf.reduce_mean(tf.cast(tf.equal(y_ph, predict), tf.float32)) 35 | train_op = tf.train.AdamOptimizer().minimize(loss) 36 | 37 | # Prepare session 38 | sess = tf.Session() 39 | sess.run(tf.global_variables_initializer()) 40 | 41 | # Setup model saving 42 | logger.setup_tf_saver(sess, inputs={'x': x_ph}, 43 | outputs={'logits': logits, 'predict': predict}) 44 | 45 | start_time = time.time() 46 | 47 | # Run main training loop 48 | for epoch in range(epochs): 49 | for t in range(steps_per_epoch): 50 | idxs = np.random.randint(0, len(x_train), 32) 51 | feed_dict = {x_ph: x_train[idxs], 52 | y_ph: y_train[idxs]} 53 | outs = sess.run([loss, acc, train_op], feed_dict=feed_dict) 54 | logger.store(Loss=outs[0], Acc=outs[1]) 55 | 56 | # Save model 57 | if (epoch % save_freq == 0) or (epoch == epochs-1): 58 | logger.save_state(state_dict=dict(), itr=None) 59 | 60 | # Log info about epoch 61 | logger.log_tabular('Epoch', epoch) 62 | logger.log_tabular('Acc', with_min_and_max=True) 63 | logger.log_tabular('Loss', average_only=True) 64 | logger.log_tabular('TotalGradientSteps', (epoch+1)*steps_per_epoch) 65 | logger.log_tabular('Time', time.time()-start_time) 66 | logger.dump_tabular() 67 | 68 | if __name__ == '__main__': 69 | train_mnist() -------------------------------------------------------------------------------- /spinup/exercises/common.py: -------------------------------------------------------------------------------- 1 | def print_result(correct=False): 2 | print('\n'*5 + '='*50 + '\n'*3) 3 | if correct: 4 | print("Congratulations! Your answer is correct.") 5 | else: 6 | print("Your answer appears to be incorrect. Try again!") 7 | print('\n'*3 + '='*50) -------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_1/exercise1_1.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | """ 5 | 6 | Exercise 1.1: Diagonal Gaussian Likelihood 7 | 8 | Write a function that takes in PyTorch Tensors for the means and 9 | log stds of a batch of diagonal Gaussian distributions, along with a 10 | PyTorch Tensor for (previously-generated) samples from those 11 | distributions, and returns a Tensor containing the log 12 | likelihoods of those samples. 
13 | 14 | """ 15 | 16 | def gaussian_likelihood(x, mu, log_std): 17 | """ 18 | Args: 19 | x: Tensor with shape [batch, dim] 20 | mu: Tensor with shape [batch, dim] 21 | log_std: Tensor with shape [batch, dim] or [dim] 22 | 23 | Returns: 24 | Tensor with shape [batch] 25 | """ 26 | ####################### 27 | # # 28 | # YOUR CODE HERE # 29 | # # 30 | ####################### 31 | return torch.zeros(1) 32 | 33 | 34 | if __name__ == '__main__': 35 | """ 36 | Run this file to verify your solution. 37 | """ 38 | from spinup.exercises.pytorch.problem_set_1_solutions import exercise1_1_soln 39 | from spinup.exercises.common import print_result 40 | 41 | batch_size = 32 42 | dim = 10 43 | 44 | x = torch.rand(batch_size, dim) 45 | mu = torch.rand(batch_size, dim) 46 | log_std = torch.rand(dim) 47 | 48 | your_gaussian_likelihood = gaussian_likelihood(x, mu, log_std) 49 | true_gaussian_likelihood = exercise1_1_soln.gaussian_likelihood(x, mu, log_std) 50 | 51 | your_result = your_gaussian_likelihood.detach().numpy() 52 | true_result = true_gaussian_likelihood.detach().numpy() 53 | 54 | correct = np.allclose(your_result, true_result) 55 | print_result(correct) -------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_1/exercise1_2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from spinup.exercises.pytorch.problem_set_1 import exercise1_1 5 | from spinup.exercises.pytorch.problem_set_1 import exercise1_2_auxiliary 6 | 7 | """ 8 | 9 | Exercise 1.2: PPO Gaussian Policy 10 | 11 | You will implement an MLP diagonal Gaussian policy for PPO by 12 | writing an MLP-builder, and a few other key functions. 13 | 14 | Log-likelihoods will be computed using your answer to Exercise 1.1, 15 | so make sure to complete that exercise before beginning this one. 16 | 17 | """ 18 | 19 | def mlp(sizes, activation, output_activation=nn.Identity): 20 | """ 21 | Build a multi-layer perceptron in PyTorch. 22 | 23 | Args: 24 | sizes: Tuple, list, or other iterable giving the number of units 25 | for each layer of the MLP. 26 | 27 | activation: Activation function for all layers except last. 28 | 29 | output_activation: Activation function for last layer. 30 | 31 | Returns: 32 | A PyTorch module that can be called to give the output of the MLP. 33 | (Use an nn.Sequential module.) 34 | 35 | """ 36 | ####################### 37 | # # 38 | # YOUR CODE HERE # 39 | # # 40 | ####################### 41 | pass 42 | 43 | class DiagonalGaussianDistribution: 44 | 45 | def __init__(self, mu, log_std): 46 | self.mu = mu 47 | self.log_std = log_std 48 | 49 | def sample(self): 50 | """ 51 | Returns: 52 | A PyTorch Tensor of samples from the diagonal Gaussian distribution with 53 | mean and log_std given by self.mu and self.log_std. 
54 | """ 55 | ####################### 56 | # # 57 | # YOUR CODE HERE # 58 | # # 59 | ####################### 60 | pass 61 | 62 | #================================(Given, ignore)==========================================# 63 | def log_prob(self, value): 64 | return exercise1_1.gaussian_likelihood(value, self.mu, self.log_std) 65 | 66 | def entropy(self): 67 | return 0.5 + 0.5 * np.log(2 * np.pi) + self.log_std.sum(axis=-1) 68 | #=========================================================================================# 69 | 70 | 71 | class MLPGaussianActor(nn.Module): 72 | 73 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 74 | super().__init__() 75 | """ 76 | Initialize an MLP Gaussian Actor by making a PyTorch module for computing the 77 | mean of the distribution given a batch of observations, and a log_std parameter. 78 | 79 | Make log_std a PyTorch Parameter with the same shape as the action vector, 80 | independent of observations, initialized to [-0.5, -0.5, ..., -0.5]. 81 | (Make sure it's trainable!) 82 | """ 83 | ####################### 84 | # # 85 | # YOUR CODE HERE # 86 | # # 87 | ####################### 88 | # self.log_std = 89 | # self.mu_net = 90 | pass 91 | 92 | #================================(Given, ignore)==========================================# 93 | def forward(self, obs, act=None): 94 | mu = self.mu_net(obs) 95 | pi = DiagonalGaussianDistribution(mu, self.log_std) 96 | logp_a = None 97 | if act is not None: 98 | logp_a = pi.log_prob(act) 99 | return pi, logp_a 100 | #=========================================================================================# 101 | 102 | 103 | 104 | if __name__ == '__main__': 105 | """ 106 | Run this file to verify your solution. 107 | """ 108 | 109 | from spinup import ppo_pytorch as ppo 110 | from spinup.exercises.common import print_result 111 | from functools import partial 112 | import gym 113 | import os 114 | import pandas as pd 115 | import psutil 116 | import time 117 | 118 | logdir = "/tmp/experiments/%i"%int(time.time()) 119 | 120 | ActorCritic = partial(exercise1_2_auxiliary.ExerciseActorCritic, actor=MLPGaussianActor) 121 | 122 | ppo(env_fn = lambda : gym.make('InvertedPendulum-v2'), 123 | actor_critic=ActorCritic, 124 | ac_kwargs=dict(hidden_sizes=(64,)), 125 | steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir)) 126 | 127 | # Get scores from last five epochs to evaluate success. 128 | data = pd.read_table(os.path.join(logdir,'progress.txt')) 129 | last_scores = data['AverageEpRet'][-5:] 130 | 131 | # Your implementation is probably correct if the agent has a score >500, 132 | # or if it reaches the top possible score of 1000, in the last five epochs. 133 | correct = np.mean(last_scores) > 500 or np.max(last_scores)==1e3 134 | print_result(correct) -------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_1/exercise1_2_auxiliary.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | """ 6 | 7 | Auxiliary code for Exercise 1.2. No part of the exercise requires you to 8 | look into or modify this file (and since it contains an mlp function, 9 | it has spoilers for the answer). Removed from the main file to avoid 10 | cluttering it up. 11 | 12 | In other words, nothing to see here, move along, these are not the 13 | droids you're looking for, and all that... 
14 | 15 | """ 16 | 17 | def mlp(sizes, activation, output_activation=nn.Identity): 18 | layers = [] 19 | for j in range(len(sizes)-1): 20 | act = activation if j < len(sizes)-2 else output_activation 21 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 22 | return nn.Sequential(*layers) 23 | 24 | 25 | class MLPCritic(nn.Module): 26 | 27 | def __init__(self, obs_dim, hidden_sizes, activation): 28 | super().__init__() 29 | self.v_net = mlp([obs_dim] + list(hidden_sizes) + [1], activation) 30 | 31 | def forward(self, obs): 32 | return torch.squeeze(self.v_net(obs), -1) # Critical to ensure v has right shape. 33 | 34 | 35 | class ExerciseActorCritic(nn.Module): 36 | 37 | def __init__(self, observation_space, action_space, 38 | hidden_sizes=(64,64), activation=nn.Tanh, 39 | actor=None): 40 | super().__init__() 41 | obs_dim = observation_space.shape[0] 42 | self.pi = actor(obs_dim, action_space.shape[0], hidden_sizes, activation) 43 | self.v = MLPCritic(obs_dim, hidden_sizes, activation) 44 | 45 | def step(self, obs): 46 | with torch.no_grad(): 47 | pi, _ = self.pi(obs) 48 | a = pi.sample() 49 | logp_a = pi.log_prob(a) 50 | v = self.v(obs) 51 | return a.numpy(), v.numpy(), logp_a.numpy() 52 | 53 | def act(self, obs): 54 | return self.step(obs)[0] -------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_1_solutions/exercise1_1_soln.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | EPS=1e-8 5 | 6 | def gaussian_likelihood(x, mu, log_std): 7 | pre_sum = -0.5 * (((x-mu)/(torch.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 8 | return pre_sum.sum(axis=-1) -------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_1_solutions/exercise1_2_soln.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | EPS=1e-8 6 | 7 | def mlp(sizes, activation, output_activation=nn.Identity): 8 | layers = [] 9 | for j in range(len(sizes)-1): 10 | act = activation if j < len(sizes)-2 else output_activation 11 | layers += [nn.Linear(sizes[j], sizes[j+1]), act()] 12 | return nn.Sequential(*layers) 13 | 14 | def gaussian_likelihood(x, mu, log_std): 15 | pre_sum = -0.5 * (((x-mu)/(torch.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 16 | return pre_sum.sum(axis=-1) 17 | 18 | 19 | class DiagonalGaussianDistribution: 20 | 21 | def __init__(self, mu, log_std): 22 | self.mu = mu 23 | self.log_std = log_std 24 | 25 | def sample(self): 26 | return self.mu + torch.exp(self.log_std) * torch.randn_like(self.mu) 27 | 28 | def log_prob(self, value): 29 | return gaussian_likelihood(value, self.mu, self.log_std) 30 | 31 | def entropy(self): 32 | return 0.5 + 0.5 * np.log(2 * np.pi) + self.log_std.sum(axis=-1) 33 | 34 | 35 | class MLPGaussianActor(nn.Module): 36 | 37 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 38 | super().__init__() 39 | log_std = -0.5 * np.ones(act_dim, dtype=np.float32) 40 | self.log_std = torch.nn.Parameter(torch.as_tensor(log_std)) 41 | self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation) 42 | 43 | def forward(self, obs, act=None): 44 | mu = self.mu_net(obs) 45 | pi = DiagonalGaussianDistribution(mu, self.log_std) 46 | logp_a = None 47 | if act is not None: 48 | logp_a = pi.log_prob(act) 49 | return pi, logp_a 
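# Minimal usage sketch for the solution classes above (illustrative only: the
# observation/action dimensions and the random batch are made up, not part of
# the exercise or the repo).
import torch
import torch.nn as nn

actor = MLPGaussianActor(obs_dim=8, act_dim=2, hidden_sizes=(64, 64), activation=nn.Tanh)

obs = torch.randn(32, 8)   # fake batch of observations
pi, _ = actor(obs)         # DiagonalGaussianDistribution for the batch
act = pi.sample()          # shape [32, 2]
logp = pi.log_prob(act)    # shape [32]; log-likelihood summed over action dims
print(act.shape, logp.shape)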
-------------------------------------------------------------------------------- /spinup/exercises/pytorch/problem_set_2/exercise2_2.py: -------------------------------------------------------------------------------- 1 | from spinup.algos.pytorch.ddpg.core import mlp, MLPActorCritic 2 | from spinup.utils.run_utils import ExperimentGrid 3 | from spinup import ddpg_pytorch as ddpg 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | 8 | """ 9 | 10 | Exercise 2.2: Silent Bug in DDPG (PyTorch Version) 11 | 12 | In this exercise, you will run DDPG with a bugged actor-critic. Your goal is 13 | to determine whether there is any performance degradation, and if so, 14 | figure out what's going wrong. 15 | 16 | You do NOT need to write code for this exercise. 17 | 18 | """ 19 | 20 | """ 21 | Bugged Actor-Critic 22 | """ 23 | 24 | class BuggedMLPActor(nn.Module): 25 | 26 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation, act_limit): 27 | super().__init__() 28 | pi_sizes = [obs_dim] + list(hidden_sizes) + [act_dim] 29 | self.pi = mlp(pi_sizes, activation, nn.Tanh) 30 | self.act_limit = act_limit 31 | 32 | def forward(self, obs): 33 | # Return output from network scaled to action space limits. 34 | return self.act_limit * self.pi(obs) 35 | 36 | class BuggedMLPQFunction(nn.Module): 37 | 38 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation): 39 | super().__init__() 40 | self.q = mlp([obs_dim + act_dim] + list(hidden_sizes) + [1], activation) 41 | 42 | def forward(self, obs, act): 43 | return self.q(torch.cat([obs, act], dim=-1)) 44 | 45 | class BuggedMLPActorCritic(nn.Module): 46 | 47 | def __init__(self, observation_space, action_space, hidden_sizes=(256,256), 48 | activation=nn.ReLU): 49 | super().__init__() 50 | 51 | obs_dim = observation_space.shape[0] 52 | act_dim = action_space.shape[0] 53 | act_limit = action_space.high[0] 54 | 55 | # build policy and value functions 56 | self.pi = BuggedMLPActor(obs_dim, act_dim, hidden_sizes, activation, act_limit) 57 | self.q = BuggedMLPQFunction(obs_dim, act_dim, hidden_sizes, activation) 58 | 59 | def act(self, obs): 60 | with torch.no_grad(): 61 | return self.pi(obs).numpy() 62 | 63 | 64 | if __name__ == '__main__': 65 | import argparse 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 68 | parser.add_argument('--h', type=int, default=300) 69 | parser.add_argument('--l', type=int, default=1) 70 | parser.add_argument('--num_runs', '-n', type=int, default=3) 71 | parser.add_argument('--steps_per_epoch', '-s', type=int, default=5000) 72 | parser.add_argument('--total_steps', '-t', type=int, default=int(5e4)) 73 | args = parser.parse_args() 74 | 75 | def ddpg_with_actor_critic(bugged, **kwargs): 76 | from spinup.exercises.pytorch.problem_set_2.exercise2_2 import BuggedMLPActorCritic 77 | actor_critic = BuggedMLPActorCritic if bugged else MLPActorCritic 78 | return ddpg(actor_critic=actor_critic, 79 | ac_kwargs=dict(hidden_sizes=[args.h]*args.l), 80 | start_steps=5000, 81 | max_ep_len=150, 82 | batch_size=64, 83 | polyak=0.95, 84 | **kwargs) 85 | 86 | eg = ExperimentGrid(name='ex2-2_ddpg') 87 | eg.add('replay_size', int(args.total_steps)) 88 | eg.add('env_name', args.env, '', True) 89 | eg.add('seed', [10*i for i in range(args.num_runs)]) 90 | eg.add('epochs', int(args.total_steps / args.steps_per_epoch)) 91 | eg.add('steps_per_epoch', args.steps_per_epoch) 92 | eg.add('bugged', [False, True]) 93 | eg.run(ddpg_with_actor_critic, datestamp=True)
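# Illustrative aside (a generic PyTorch pitfall, not a claim about where this
# exercise's bug lives): when hunting for silent issues in DDPG-style code, it
# is worth checking tensor shapes inside the loss. Mixing [batch] and [batch, 1]
# tensors broadcasts without error, so an MSE-style loss quietly averages over a
# [batch, batch] matrix of pairwise differences instead of comparing element-wise.
import torch

q = torch.randn(4)           # e.g. Q-values with shape [batch]
backup = torch.randn(4, 1)   # e.g. a Bellman target with shape [batch, 1]
diff = q - backup            # silently broadcasts to shape [4, 4]
print(diff.shape)            # torch.Size([4, 4])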
-------------------------------------------------------------------------------- /spinup/exercises/tf1/problem_set_1/exercise1_1.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | """ 5 | 6 | Exercise 1.1: Diagonal Gaussian Likelihood 7 | 8 | Write a function which takes in Tensorflow symbols for the means and 9 | log stds of a batch of diagonal Gaussian distributions, along with a 10 | Tensorflow placeholder for (previously-generated) samples from those 11 | distributions, and returns a Tensorflow symbol for computing the log 12 | likelihoods of those samples. 13 | 14 | """ 15 | 16 | def gaussian_likelihood(x, mu, log_std): 17 | """ 18 | Args: 19 | x: Tensor with shape [batch, dim] 20 | mu: Tensor with shape [batch, dim] 21 | log_std: Tensor with shape [batch, dim] or [dim] 22 | 23 | Returns: 24 | Tensor with shape [batch] 25 | """ 26 | ####################### 27 | # # 28 | # YOUR CODE HERE # 29 | # # 30 | ####################### 31 | return tf.constant(0) 32 | 33 | 34 | if __name__ == '__main__': 35 | """ 36 | Run this file to verify your solution. 37 | """ 38 | from spinup.exercises.tf1.problem_set_1_solutions import exercise1_1_soln 39 | from spinup.exercises.common import print_result 40 | 41 | sess = tf.Session() 42 | 43 | dim = 10 44 | x = tf.placeholder(tf.float32, shape=(None, dim)) 45 | mu = tf.placeholder(tf.float32, shape=(None, dim)) 46 | log_std = tf.placeholder(tf.float32, shape=(dim,)) 47 | 48 | your_gaussian_likelihood = gaussian_likelihood(x, mu, log_std) 49 | true_gaussian_likelihood = exercise1_1_soln.gaussian_likelihood(x, mu, log_std) 50 | 51 | batch_size = 32 52 | feed_dict = {x: np.random.rand(batch_size, dim), 53 | mu: np.random.rand(batch_size, dim), 54 | log_std: np.random.rand(dim)} 55 | 56 | your_result, true_result = sess.run([your_gaussian_likelihood, true_gaussian_likelihood], 57 | feed_dict=feed_dict) 58 | 59 | correct = np.allclose(your_result, true_result) 60 | print_result(correct) -------------------------------------------------------------------------------- /spinup/exercises/tf1/problem_set_1/exercise1_2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from spinup.exercises.tf1.problem_set_1 import exercise1_1 4 | 5 | """ 6 | 7 | Exercise 1.2: PPO Gaussian Policy 8 | 9 | Implement an MLP diagonal Gaussian policy for PPO. 10 | 11 | Log-likelihoods will be computed using your answer to Exercise 1.1, 12 | so make sure to complete that exercise before beginning this one. 13 | 14 | """ 15 | 16 | EPS = 1e-8 17 | 18 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 19 | """ 20 | Builds a multi-layer perceptron in Tensorflow. 21 | 22 | Args: 23 | x: Input tensor. 24 | 25 | hidden_sizes: Tuple, list, or other iterable giving the number of units 26 | for each hidden layer of the MLP. 27 | 28 | activation: Activation function for all layers except last. 29 | 30 | output_activation: Activation function for last layer. 31 | 32 | Returns: 33 | A TF symbol for the output of an MLP that takes x as an input. 34 | 35 | """ 36 | ####################### 37 | # # 38 | # YOUR CODE HERE # 39 | # # 40 | ####################### 41 | pass 42 | 43 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 44 | """ 45 | Builds symbols to sample actions and compute log-probs of actions. 
46 | 47 | Special instructions: Make log_std a tf variable with the same shape as 48 | the action vector, independent of x, initialized to [-0.5, -0.5, ..., -0.5]. 49 | 50 | Args: 51 | x: Input tensor of states. Shape [batch, obs_dim]. 52 | 53 | a: Input tensor of actions. Shape [batch, act_dim]. 54 | 55 | hidden_sizes: Sizes of hidden layers for action network MLP. 56 | 57 | activation: Activation function for all layers except last. 58 | 59 | output_activation: Activation function for last layer (action layer). 60 | 61 | action_space: A gym.spaces object describing the action space of the 62 | environment this agent will interact with. 63 | 64 | Returns: 65 | pi: A symbol for sampling stochastic actions from a Gaussian 66 | distribution. 67 | 68 | logp: A symbol for computing log-likelihoods of actions from a Gaussian 69 | distribution. 70 | 71 | logp_pi: A symbol for computing log-likelihoods of actions in pi from a 72 | Gaussian distribution. 73 | 74 | """ 75 | ####################### 76 | # # 77 | # YOUR CODE HERE # 78 | # # 79 | ####################### 80 | # mu = 81 | # log_std = 82 | # pi = 83 | 84 | logp = exercise1_1.gaussian_likelihood(a, mu, log_std) 85 | logp_pi = exercise1_1.gaussian_likelihood(pi, mu, log_std) 86 | return pi, logp, logp_pi 87 | 88 | 89 | if __name__ == '__main__': 90 | """ 91 | Run this file to verify your solution. 92 | """ 93 | 94 | from spinup import ppo_tf1 as ppo 95 | from spinup.exercises.common import print_result 96 | import gym 97 | import os 98 | import pandas as pd 99 | import psutil 100 | import time 101 | 102 | logdir = "/tmp/experiments/%i"%int(time.time()) 103 | ppo(env_fn = lambda : gym.make('InvertedPendulum-v2'), 104 | ac_kwargs=dict(policy=mlp_gaussian_policy, hidden_sizes=(64,)), 105 | steps_per_epoch=4000, epochs=20, logger_kwargs=dict(output_dir=logdir)) 106 | 107 | # Get scores from last five epochs to evaluate success. 108 | data = pd.read_table(os.path.join(logdir,'progress.txt')) 109 | last_scores = data['AverageEpRet'][-5:] 110 | 111 | # Your implementation is probably correct if the agent has a score >500, 112 | # or if it reaches the top possible score of 1000, in the last five epochs. 
113 | correct = np.mean(last_scores) > 500 or np.max(last_scores)==1e3 114 | print_result(correct) -------------------------------------------------------------------------------- /spinup/exercises/tf1/problem_set_1_solutions/exercise1_1_soln.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | EPS=1e-8 5 | 6 | def gaussian_likelihood(x, mu, log_std): 7 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 8 | return tf.reduce_sum(pre_sum, axis=1) -------------------------------------------------------------------------------- /spinup/exercises/tf1/problem_set_1_solutions/exercise1_2_soln.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | EPS = 1e-8 6 | 7 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 8 | for h in hidden_sizes[:-1]: 9 | x = tf.layers.dense(x, units=h, activation=activation) 10 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 11 | 12 | def gaussian_likelihood(x, mu, log_std): 13 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 14 | return tf.reduce_sum(pre_sum, axis=1) 15 | 16 | def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation, action_space): 17 | act_dim = a.shape.as_list()[-1] 18 | mu = mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 19 | log_std = tf.get_variable(name='log_std', initializer=-0.5*np.ones(act_dim, dtype=np.float32)) 20 | std = tf.exp(log_std) 21 | pi = mu + tf.random_normal(tf.shape(mu)) * std 22 | logp = gaussian_likelihood(a, mu, log_std) 23 | logp_pi = gaussian_likelihood(pi, mu, log_std) 24 | return pi, logp, logp_pi -------------------------------------------------------------------------------- /spinup/exercises/tf1/problem_set_2/exercise2_2.py: -------------------------------------------------------------------------------- 1 | from spinup.algos.tf1.ddpg.core import mlp, mlp_actor_critic 2 | from spinup.utils.run_utils import ExperimentGrid 3 | from spinup import ddpg_tf1 as ddpg 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | """ 8 | 9 | Exercise 2.2: Silent Bug in DDPG 10 | 11 | In this exercise, you will run DDPG with a bugged actor-critic. Your goal is 12 | to determine whether there is any performance degradation, and if so, 13 | figure out what's going wrong. 14 | 15 | You do NOT need to write code for this exercise.
16 | 17 | """ 18 | 19 | """ 20 | Bugged Actor-Critic 21 | """ 22 | def bugged_mlp_actor_critic(x, a, hidden_sizes=(400,300), activation=tf.nn.relu, 23 | output_activation=tf.tanh, action_space=None): 24 | act_dim = a.shape.as_list()[-1] 25 | act_limit = action_space.high[0] 26 | with tf.variable_scope('pi'): 27 | pi = act_limit * mlp(x, list(hidden_sizes)+[act_dim], activation, output_activation) 28 | with tf.variable_scope('q'): 29 | q = mlp(tf.concat([x,a], axis=-1), list(hidden_sizes)+[1], activation, None) 30 | with tf.variable_scope('q', reuse=True): 31 | q_pi = mlp(tf.concat([x,pi], axis=-1), list(hidden_sizes)+[1], activation, None) 32 | return pi, q, q_pi 33 | 34 | 35 | if __name__ == '__main__': 36 | import argparse 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 39 | parser.add_argument('--h', type=int, default=300) 40 | parser.add_argument('--l', type=int, default=1) 41 | parser.add_argument('--num_runs', '-n', type=int, default=3) 42 | parser.add_argument('--steps_per_epoch', '-s', type=int, default=5000) 43 | parser.add_argument('--total_steps', '-t', type=int, default=int(5e4)) 44 | args = parser.parse_args() 45 | 46 | def ddpg_with_actor_critic(bugged, **kwargs): 47 | actor_critic = bugged_mlp_actor_critic if bugged else mlp_actor_critic 48 | return ddpg(actor_critic=actor_critic, 49 | ac_kwargs=dict(hidden_sizes=[args.h]*args.l), 50 | start_steps=5000, 51 | max_ep_len=150, 52 | batch_size=64, 53 | polyak=0.95, 54 | **kwargs) 55 | 56 | eg = ExperimentGrid(name='ex2-2_ddpg') 57 | eg.add('replay_size', int(args.total_steps)) 58 | eg.add('env_name', args.env, '', True) 59 | eg.add('seed', [10*i for i in range(args.num_runs)]) 60 | eg.add('epochs', int(args.total_steps / args.steps_per_epoch)) 61 | eg.add('steps_per_epoch', args.steps_per_epoch) 62 | eg.add('bugged', [False, True]) 63 | eg.run(ddpg_with_actor_critic, datestamp=True) -------------------------------------------------------------------------------- /spinup/user_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | 4 | # Default neural network backend for each algo 5 | # (Must be either 'tf1' or 'pytorch') 6 | DEFAULT_BACKEND = { 7 | 'vpg': 'pytorch', 8 | 'trpo': 'tf1', 9 | 'ppo': 'pytorch', 10 | 'ddpg': 'pytorch', 11 | 'td3': 'pytorch', 12 | 'sac': 'pytorch' 13 | } 14 | 15 | # Where experiment outputs are saved by default: 16 | DEFAULT_DATA_DIR = osp.join(osp.abspath(osp.dirname(osp.dirname(__file__))),'data') 17 | 18 | # Whether to automatically insert a date and time stamp into the names of 19 | # save directories: 20 | FORCE_DATESTAMP = False 21 | 22 | # Whether GridSearch provides automatically-generated default shorthands: 23 | DEFAULT_SHORTHAND = True 24 | 25 | # Tells the GridSearch how many seconds to pause for before launching 26 | # experiments. 
27 | WAIT_BEFORE_LAUNCH = 5 -------------------------------------------------------------------------------- /spinup/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/spinningup/038665d62d569055401d91856abb287263096178/spinup/utils/__init__.py -------------------------------------------------------------------------------- /spinup/utils/mpi_pytorch.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import numpy as np 3 | import os 4 | import torch 5 | from mpi4py import MPI 6 | from spinup.utils.mpi_tools import broadcast, mpi_avg, num_procs, proc_id 7 | 8 | def setup_pytorch_for_mpi(): 9 | """ 10 | Avoid slowdowns caused by each separate process's PyTorch using 11 | more than its fair share of CPU resources. 12 | """ 13 | #print('Proc %d: Reporting original number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 14 | if torch.get_num_threads()==1: 15 | return 16 | fair_num_threads = max(int(torch.get_num_threads() / num_procs()), 1) 17 | torch.set_num_threads(fair_num_threads) 18 | #print('Proc %d: Reporting new number of Torch threads as %d.'%(proc_id(), torch.get_num_threads()), flush=True) 19 | 20 | def mpi_avg_grads(module): 21 | """ Average contents of gradient buffers across MPI processes. """ 22 | if num_procs()==1: 23 | return 24 | for p in module.parameters(): 25 | p_grad_numpy = p.grad.numpy() # numpy view of tensor data 26 | avg_p_grad = mpi_avg(p.grad) 27 | p_grad_numpy[:] = avg_p_grad[:] 28 | 29 | def sync_params(module): 30 | """ Sync all parameters of module across all MPI processes. """ 31 | if num_procs()==1: 32 | return 33 | for p in module.parameters(): 34 | p_numpy = p.data.numpy() 35 | broadcast(p_numpy) -------------------------------------------------------------------------------- /spinup/utils/mpi_tf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | from spinup.utils.mpi_tools import broadcast 5 | 6 | 7 | def flat_concat(xs): 8 | return tf.concat([tf.reshape(x,(-1,)) for x in xs], axis=0) 9 | 10 | def assign_params_from_flat(x, params): 11 | flat_size = lambda p : int(np.prod(p.shape.as_list())) # the 'int' is important for scalars 12 | splits = tf.split(x, [flat_size(p) for p in params]) 13 | new_params = [tf.reshape(p_new, p.shape) for p, p_new in zip(params, splits)] 14 | return tf.group([tf.assign(p, p_new) for p, p_new in zip(params, new_params)]) 15 | 16 | def sync_params(params): 17 | get_params = flat_concat(params) 18 | def _broadcast(x): 19 | broadcast(x) 20 | return x 21 | synced_params = tf.py_func(_broadcast, [get_params], tf.float32) 22 | return assign_params_from_flat(synced_params, params) 23 | 24 | def sync_all_params(): 25 | """Sync all tf variables across MPI processes.""" 26 | return sync_params(tf.global_variables()) 27 | 28 | 29 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 30 | """ 31 | Adam optimizer that averages gradients across MPI processes. 32 | 33 | The compute_gradients method is taken from Baselines `MpiAdamOptimizer`_. 34 | For documentation on method arguments, see the Tensorflow docs page for 35 | the base `AdamOptimizer`_. 36 | 37 | .. _`MpiAdamOptimizer`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_adam_optimizer.py 38 | .. 
_`AdamOptimizer`: https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer 39 | """ 40 | 41 | def __init__(self, **kwargs): 42 | self.comm = MPI.COMM_WORLD 43 | tf.train.AdamOptimizer.__init__(self, **kwargs) 44 | 45 | def compute_gradients(self, loss, var_list, **kwargs): 46 | """ 47 | Same as normal compute_gradients, except average grads over processes. 48 | """ 49 | grads_and_vars = super().compute_gradients(loss, var_list, **kwargs) 50 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 51 | flat_grad = flat_concat([g for g, v in grads_and_vars]) 52 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 53 | sizes = [int(np.prod(s)) for s in shapes] 54 | 55 | num_tasks = self.comm.Get_size() 56 | buf = np.zeros(flat_grad.shape, np.float32) 57 | 58 | def _collect_grads(flat_grad): 59 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 60 | np.divide(buf, float(num_tasks), out=buf) 61 | return buf 62 | 63 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 64 | avg_flat_grad.set_shape(flat_grad.shape) 65 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 66 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 67 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 68 | 69 | return avg_grads_and_vars 70 | 71 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 72 | """ 73 | Same as normal apply_gradients, except sync params after update. 74 | """ 75 | opt = super().apply_gradients(grads_and_vars, global_step, name) 76 | with tf.control_dependencies([opt]): 77 | sync = sync_params([v for g,v in grads_and_vars]) 78 | return tf.group([opt, sync]) -------------------------------------------------------------------------------- /spinup/utils/mpi_tools.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import os, subprocess, sys 3 | import numpy as np 4 | 5 | 6 | def mpi_fork(n, bind_to_core=False): 7 | """ 8 | Re-launches the current script with workers linked by MPI. 9 | 10 | Also, terminates the original process that launched it. 11 | 12 | Taken almost without modification from the Baselines function of the 13 | `same name`_. 14 | 15 | .. _`same name`: https://github.com/openai/baselines/blob/master/baselines/common/mpi_fork.py 16 | 17 | Args: 18 | n (int): Number of process to split into. 19 | 20 | bind_to_core (bool): Bind each MPI process to a core. 
21 | """ 22 | if n<=1: 23 | return 24 | if os.getenv("IN_MPI") is None: 25 | env = os.environ.copy() 26 | env.update( 27 | MKL_NUM_THREADS="1", 28 | OMP_NUM_THREADS="1", 29 | IN_MPI="1" 30 | ) 31 | args = ["mpirun", "-np", str(n)] 32 | if bind_to_core: 33 | args += ["-bind-to", "core"] 34 | args += [sys.executable] + sys.argv 35 | subprocess.check_call(args, env=env) 36 | sys.exit() 37 | 38 | 39 | def msg(m, string=''): 40 | print(('Message from %d: %s \t '%(MPI.COMM_WORLD.Get_rank(), string))+str(m)) 41 | 42 | def proc_id(): 43 | """Get rank of calling process.""" 44 | return MPI.COMM_WORLD.Get_rank() 45 | 46 | def allreduce(*args, **kwargs): 47 | return MPI.COMM_WORLD.Allreduce(*args, **kwargs) 48 | 49 | def num_procs(): 50 | """Count active MPI processes.""" 51 | return MPI.COMM_WORLD.Get_size() 52 | 53 | def broadcast(x, root=0): 54 | MPI.COMM_WORLD.Bcast(x, root=root) 55 | 56 | def mpi_op(x, op): 57 | x, scalar = ([x], True) if np.isscalar(x) else (x, False) 58 | x = np.asarray(x, dtype=np.float32) 59 | buff = np.zeros_like(x, dtype=np.float32) 60 | allreduce(x, buff, op=op) 61 | return buff[0] if scalar else buff 62 | 63 | def mpi_sum(x): 64 | return mpi_op(x, MPI.SUM) 65 | 66 | def mpi_avg(x): 67 | """Average a scalar or vector over MPI processes.""" 68 | return mpi_sum(x) / num_procs() 69 | 70 | def mpi_statistics_scalar(x, with_min_and_max=False): 71 | """ 72 | Get mean/std and optional min/max of scalar x across MPI processes. 73 | 74 | Args: 75 | x: An array containing samples of the scalar to produce statistics 76 | for. 77 | 78 | with_min_and_max (bool): If true, return min and max of x in 79 | addition to mean and std. 80 | """ 81 | x = np.array(x, dtype=np.float32) 82 | global_sum, global_n = mpi_sum([np.sum(x), len(x)]) 83 | mean = global_sum / global_n 84 | 85 | global_sum_sq = mpi_sum(np.sum((x - mean)**2)) 86 | std = np.sqrt(global_sum_sq / global_n) # compute global std 87 | 88 | if with_min_and_max: 89 | global_min = mpi_op(np.min(x) if len(x) > 0 else np.inf, op=MPI.MIN) 90 | global_max = mpi_op(np.max(x) if len(x) > 0 else -np.inf, op=MPI.MAX) 91 | return mean, std, global_min, global_max 92 | return mean, std -------------------------------------------------------------------------------- /spinup/utils/run_entrypoint.py: -------------------------------------------------------------------------------- 1 | import zlib 2 | import pickle 3 | import base64 4 | 5 | if __name__ == '__main__': 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('encoded_thunk') 9 | args = parser.parse_args() 10 | thunk = pickle.loads(zlib.decompress(base64.b64decode(args.encoded_thunk))) 11 | thunk() -------------------------------------------------------------------------------- /spinup/utils/serialization_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | def convert_json(obj): 4 | """ Convert obj to a version which can be serialized with JSON. 
""" 5 | if is_json_serializable(obj): 6 | return obj 7 | else: 8 | if isinstance(obj, dict): 9 | return {convert_json(k): convert_json(v) 10 | for k,v in obj.items()} 11 | 12 | elif isinstance(obj, tuple): 13 | return (convert_json(x) for x in obj) 14 | 15 | elif isinstance(obj, list): 16 | return [convert_json(x) for x in obj] 17 | 18 | elif hasattr(obj,'__name__') and not('lambda' in obj.__name__): 19 | return convert_json(obj.__name__) 20 | 21 | elif hasattr(obj,'__dict__') and obj.__dict__: 22 | obj_dict = {convert_json(k): convert_json(v) 23 | for k,v in obj.__dict__.items()} 24 | return {str(obj): obj_dict} 25 | 26 | return str(obj) 27 | 28 | def is_json_serializable(v): 29 | try: 30 | json.dumps(v) 31 | return True 32 | except: 33 | return False -------------------------------------------------------------------------------- /spinup/utils/test_policy.py: -------------------------------------------------------------------------------- 1 | import time 2 | import joblib 3 | import os 4 | import os.path as osp 5 | import tensorflow as tf 6 | import torch 7 | from spinup import EpochLogger 8 | from spinup.utils.logx import restore_tf_graph 9 | 10 | 11 | def load_policy_and_env(fpath, itr='last', deterministic=False): 12 | """ 13 | Load a policy from save, whether it's TF or PyTorch, along with RL env. 14 | 15 | Not exceptionally future-proof, but it will suffice for basic uses of the 16 | Spinning Up implementations. 17 | 18 | Checks to see if there's a tf1_save folder. If yes, assumes the model 19 | is tensorflow and loads it that way. Otherwise, loads as if there's a 20 | PyTorch save. 21 | """ 22 | 23 | # determine if tf save or pytorch save 24 | if any(['tf1_save' in x for x in os.listdir(fpath)]): 25 | backend = 'tf1' 26 | else: 27 | backend = 'pytorch' 28 | 29 | # handle which epoch to load from 30 | if itr=='last': 31 | # check filenames for epoch (AKA iteration) numbers, find maximum value 32 | 33 | if backend == 'tf1': 34 | saves = [int(x[8:]) for x in os.listdir(fpath) if 'tf1_save' in x and len(x)>8] 35 | 36 | elif backend == 'pytorch': 37 | pytsave_path = osp.join(fpath, 'pyt_save') 38 | # Each file in this folder has naming convention 'modelXX.pt', where 39 | # 'XX' is either an integer or empty string. Empty string case 40 | # corresponds to len(x)==8, hence that case is excluded. 41 | saves = [int(x.split('.')[0][5:]) for x in os.listdir(pytsave_path) if len(x)>8 and 'model' in x] 42 | 43 | itr = '%d'%max(saves) if len(saves) > 0 else '' 44 | 45 | else: 46 | assert isinstance(itr, int), \ 47 | "Bad value provided for itr (needs to be int or 'last')." 48 | itr = '%d'%itr 49 | 50 | # load the get_action function 51 | if backend == 'tf1': 52 | get_action = load_tf_policy(fpath, itr, deterministic) 53 | else: 54 | get_action = load_pytorch_policy(fpath, itr, deterministic) 55 | 56 | # try to load environment from save 57 | # (sometimes this will fail because the environment could not be pickled) 58 | try: 59 | state = joblib.load(osp.join(fpath, 'vars'+itr+'.pkl')) 60 | env = state['env'] 61 | except: 62 | env = None 63 | 64 | return env, get_action 65 | 66 | 67 | def load_tf_policy(fpath, itr, deterministic=False): 68 | """ Load a tensorflow policy saved with Spinning Up Logger.""" 69 | 70 | fname = osp.join(fpath, 'tf1_save'+itr) 71 | print('\n\nLoading from %s.\n\n'%fname) 72 | 73 | # load the things! 
74 | sess = tf.Session() 75 | model = restore_tf_graph(sess, fname) 76 | 77 | # get the correct op for executing actions 78 | if deterministic and 'mu' in model.keys(): 79 | # 'deterministic' is only a valid option for SAC policies 80 | print('Using deterministic action op.') 81 | action_op = model['mu'] 82 | else: 83 | print('Using default action op.') 84 | action_op = model['pi'] 85 | 86 | # make function for producing an action given a single state 87 | get_action = lambda x : sess.run(action_op, feed_dict={model['x']: x[None,:]})[0] 88 | 89 | return get_action 90 | 91 | 92 | def load_pytorch_policy(fpath, itr, deterministic=False): 93 | """ Load a pytorch policy saved with Spinning Up Logger.""" 94 | 95 | fname = osp.join(fpath, 'pyt_save', 'model'+itr+'.pt') 96 | print('\n\nLoading from %s.\n\n'%fname) 97 | 98 | model = torch.load(fname) 99 | 100 | # make function for producing an action given a single state 101 | def get_action(x): 102 | with torch.no_grad(): 103 | x = torch.as_tensor(x, dtype=torch.float32) 104 | action = model.act(x) 105 | return action 106 | 107 | return get_action 108 | 109 | 110 | def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True): 111 | 112 | assert env is not None, \ 113 | "Environment not found!\n\n It looks like the environment wasn't saved, " + \ 114 | "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \ 115 | "page on Experiment Outputs for how to handle this situation." 116 | 117 | logger = EpochLogger() 118 | o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 119 | while n < num_episodes: 120 | if render: 121 | env.render() 122 | time.sleep(1e-3) 123 | 124 | a = get_action(o) 125 | o, r, d, _ = env.step(a) 126 | ep_ret += r 127 | ep_len += 1 128 | 129 | if d or (ep_len == max_ep_len): 130 | logger.store(EpRet=ep_ret, EpLen=ep_len) 131 | print('Episode %d \t EpRet %.3f \t EpLen %d'%(n, ep_ret, ep_len)) 132 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 133 | n += 1 134 | 135 | logger.log_tabular('EpRet', with_min_and_max=True) 136 | logger.log_tabular('EpLen', average_only=True) 137 | logger.dump_tabular() 138 | 139 | 140 | if __name__ == '__main__': 141 | import argparse 142 | parser = argparse.ArgumentParser() 143 | parser.add_argument('fpath', type=str) 144 | parser.add_argument('--len', '-l', type=int, default=0) 145 | parser.add_argument('--episodes', '-n', type=int, default=100) 146 | parser.add_argument('--norender', '-nr', action='store_true') 147 | parser.add_argument('--itr', '-i', type=int, default=-1) 148 | parser.add_argument('--deterministic', '-d', action='store_true') 149 | args = parser.parse_args() 150 | env, get_action = load_policy_and_env(args.fpath, 151 | args.itr if args.itr >=0 else 'last', 152 | args.deterministic) 153 | run_policy(env, get_action, args.len, args.episodes, not(args.norender)) -------------------------------------------------------------------------------- /spinup/version.py: -------------------------------------------------------------------------------- 1 | version_info = (0, 2, 0) 2 | # format: 3 | # ('spinup_major', 'spinup_minor', 'spinup_patch') 4 | 5 | def get_version(): 6 | "Returns the version as a human-format string." 
7 | return '%d.%d.%d' % version_info 8 | 9 | __version__ = get_version() -------------------------------------------------------------------------------- /test/test_ppo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import unittest 4 | from functools import partial 5 | 6 | import gym 7 | import tensorflow as tf 8 | 9 | from spinup import ppo_tf1 as ppo 10 | 11 | 12 | class TestPPO(unittest.TestCase): 13 | def test_cartpole(self): 14 | ''' Test training a small agent in a simple environment ''' 15 | env_fn = partial(gym.make, 'CartPole-v1') 16 | ac_kwargs = dict(hidden_sizes=(32,)) 17 | with tf.Graph().as_default(): 18 | ppo(env_fn, steps_per_epoch=100, epochs=10, ac_kwargs=ac_kwargs) 19 | # TODO: ensure policy has got better at the task 20 | 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /travis_setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | mkdir -p $HOME/.mujoco 6 | 7 | # Avoid using pyenv in travis, since it adds ~7 minutes to turnaround time 8 | if [ "$TRAVIS_OS_NAME" == "osx" ] 9 | then 10 | # https://github.com/travis-ci/travis-ci/issues/9640 11 | sudo softwareupdate --install "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.4" 12 | brew update 13 | brew install open-mpi 14 | brew install gcc 15 | brew link --overwrite gcc 16 | curl $MUJOCO_FOR_OSX | tar xz -C $HOME/.mujoco/ 17 | elif [ "$TRAVIS_OS_NAME" == "linux" ] 18 | then 19 | # Because this is flaky, try several times 20 | set +e 21 | COUNT=0 22 | while [ $COUNT -lt 5 ]; do 23 | sudo curl -o /usr/local/bin/patchelf https://s3-us-west-2.amazonaws.com/openai-sci-artifacts/manual-builds/patchelf_0.9_amd64.elf 24 | if [ $? -eq 0 ];then 25 | break 26 | fi 27 | let COUNT=COUNT+1 28 | done 29 | if [ $COUNT -ge 5 ]; then 30 | echo "Failed to download patchelf" 31 | exit 1 32 | fi 33 | set -e 34 | 35 | sudo chmod +x /usr/local/bin/patchelf 36 | curl $MUJOCO_FOR_LINUX | tar xz -C $HOME/.mujoco/ 37 | 38 | sudo apt-get update 39 | sudo apt-get install -y openmpi-bin libopenmpi-dev libosmesa6-dev libglew-dev 40 | fi 41 | --------------------------------------------------------------------------------